Diffstat (limited to 'lib')
747 files changed, 60627 insertions, 28055 deletions
diff --git a/lib/Analysis/AliasAnalysis.cpp b/lib/Analysis/AliasAnalysis.cpp index 95c834b..3b6aab1 100644 --- a/lib/Analysis/AliasAnalysis.cpp +++ b/lib/Analysis/AliasAnalysis.cpp @@ -25,6 +25,9 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Pass.h" #include "llvm/BasicBlock.h" #include "llvm/Function.h" @@ -356,6 +359,86 @@ AliasAnalysis::getModRefInfo(const AtomicRMWInst *RMW, const Location &Loc) { return ModRef; } +namespace { + /// Only find pointer captures which happen before the given instruction. Uses + /// the dominator tree to determine whether one instruction is before another. + struct CapturesBefore : public CaptureTracker { + CapturesBefore(const Instruction *I, DominatorTree *DT) + : BeforeHere(I), DT(DT), Captured(false) {} + + void tooManyUses() { Captured = true; } + + bool shouldExplore(Use *U) { + Instruction *I = cast<Instruction>(U->getUser()); + BasicBlock *BB = I->getParent(); + if (BeforeHere != I && + (!DT->isReachableFromEntry(BB) || DT->dominates(BeforeHere, I))) + return false; + return true; + } + + bool captured(Use *U) { + Instruction *I = cast<Instruction>(U->getUser()); + BasicBlock *BB = I->getParent(); + if (BeforeHere != I && + (!DT->isReachableFromEntry(BB) || DT->dominates(BeforeHere, I))) + return false; + Captured = true; + return true; + } + + const Instruction *BeforeHere; + DominatorTree *DT; + + bool Captured; + }; +} + +// FIXME: this is really just shoring-up a deficiency in alias analysis. +// BasicAA isn't willing to spend linear time determining whether an alloca +// was captured before or after this particular call, while we are. However, +// with a smarter AA in place, this test is just wasting compile time. +AliasAnalysis::ModRefResult +AliasAnalysis::callCapturesBefore(const Instruction *I, + const AliasAnalysis::Location &MemLoc, + DominatorTree *DT) { + if (!DT || !TD) return AliasAnalysis::ModRef; + + const Value *Object = GetUnderlyingObject(MemLoc.Ptr, TD); + if (!isIdentifiedObject(Object) || isa<GlobalValue>(Object) || + isa<Constant>(Object)) + return AliasAnalysis::ModRef; + + ImmutableCallSite CS(I); + if (!CS.getInstruction() || CS.getInstruction() == Object) + return AliasAnalysis::ModRef; + + CapturesBefore CB(I, DT); + llvm::PointerMayBeCaptured(Object, &CB); + if (CB.Captured) + return AliasAnalysis::ModRef; + + unsigned ArgNo = 0; + for (ImmutableCallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end(); + CI != CE; ++CI, ++ArgNo) { + // Only look at the no-capture or byval pointer arguments. If this + // pointer were passed to arguments that were neither of these, then it + // couldn't be no-capture. + if (!(*CI)->getType()->isPointerTy() || + (!CS.doesNotCapture(ArgNo) && !CS.isByValArgument(ArgNo))) + continue; + + // If this is a no-capture pointer argument, see if we can tell that it + // is impossible to alias the pointer we're checking. If not, we have to + // assume that the call could touch the pointer, even though it doesn't + // escape. 
+ if (!isNoAlias(AliasAnalysis::Location(*CI), + AliasAnalysis::Location(Object))) { + return AliasAnalysis::ModRef; + } + } + return AliasAnalysis::NoModRef; +} // AliasAnalysis destructor: DO NOT move this to the header file for // AliasAnalysis or else clients of the AliasAnalysis class may not depend on diff --git a/lib/Analysis/AliasSetTracker.cpp b/lib/Analysis/AliasSetTracker.cpp index f80e2fb..92e8906 100644 --- a/lib/Analysis/AliasSetTracker.cpp +++ b/lib/Analysis/AliasSetTracker.cpp @@ -501,7 +501,7 @@ void AliasSetTracker::deleteValue(Value *PtrVal) { } // First, look up the PointerRec for this pointer. - PointerMapType::iterator I = PointerMap.find(PtrVal); + PointerMapType::iterator I = PointerMap.find_as(PtrVal); if (I == PointerMap.end()) return; // Noop // If we found one, remove the pointer from the alias set it is in. @@ -527,7 +527,7 @@ void AliasSetTracker::copyValue(Value *From, Value *To) { AA.copyValue(From, To); // First, look up the PointerRec for this pointer. - PointerMapType::iterator I = PointerMap.find(From); + PointerMapType::iterator I = PointerMap.find_as(From); if (I == PointerMap.end()) return; // Noop assert(I->second->hasAliasSet() && "Dead entry?"); @@ -536,7 +536,7 @@ void AliasSetTracker::copyValue(Value *From, Value *To) { if (Entry.hasAliasSet()) return; // Already in the tracker! // Add it to the alias set it aliases... - I = PointerMap.find(From); + I = PointerMap.find_as(From); AliasSet *AS = I->second->getAliasSet(*this); AS->addPointer(*this, Entry, I->second->getSize(), I->second->getTBAAInfo(), diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp index 20ecfd2..1d028c2 100644 --- a/lib/Analysis/BasicAliasAnalysis.cpp +++ b/lib/Analysis/BasicAliasAnalysis.cpp @@ -86,47 +86,10 @@ static bool isEscapeSource(const Value *V) { /// UnknownSize if unknown. static uint64_t getObjectSize(const Value *V, const TargetData &TD, bool RoundToAlign = false) { - Type *AccessTy; - unsigned Align; - if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) { - if (!GV->hasDefinitiveInitializer()) - return AliasAnalysis::UnknownSize; - AccessTy = GV->getType()->getElementType(); - Align = GV->getAlignment(); - } else if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) { - if (!AI->isArrayAllocation()) - AccessTy = AI->getType()->getElementType(); - else - return AliasAnalysis::UnknownSize; - Align = AI->getAlignment(); - } else if (const CallInst* CI = extractMallocCall(V)) { - if (!RoundToAlign && !isArrayMalloc(V, &TD)) - // The size is the argument to the malloc call. - if (const ConstantInt* C = dyn_cast<ConstantInt>(CI->getArgOperand(0))) - return C->getZExtValue(); - return AliasAnalysis::UnknownSize; - } else if (const Argument *A = dyn_cast<Argument>(V)) { - if (A->hasByValAttr()) { - AccessTy = cast<PointerType>(A->getType())->getElementType(); - Align = A->getParamAlignment(); - } else { - return AliasAnalysis::UnknownSize; - } - } else { - return AliasAnalysis::UnknownSize; - } - - if (!AccessTy->isSized()) - return AliasAnalysis::UnknownSize; - - uint64_t Size = TD.getTypeAllocSize(AccessTy); - // If there is an explicitly specified alignment, and we need to - // take alignment into account, round up the size. (If the alignment - // is implicit, getTypeAllocSize is sufficient.) 
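// Illustrative sketch, not part of the patch: the CaptureTracker callback
// protocol that CapturesBefore (above) plugs into. PointerMayBeCaptured()
// walks the uses of the pointer; shouldExplore() prunes uses that can only
// run after the query point, and captured() records any remaining capture.
// Object, I, and DT are assumed to be in scope as in callCapturesBefore().
CapturesBefore Tracker(I, DT);
llvm::PointerMayBeCaptured(Object, &Tracker);
if (Tracker.Captured) {
  // Some use reachable before I may let the pointer escape; the query
  // must stay conservative and answer ModRef.
}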
- if (RoundToAlign && Align) - Size = RoundUpToAlignment(Size, Align); - - return Size; + uint64_t Size; + if (getObjectSize(V, Size, &TD, RoundToAlign)) + return Size; + return AliasAnalysis::UnknownSize; } /// isObjectSmallerThan - Return true if we can prove that the object specified diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt index 2e3ec8b..96e68b4 100644 --- a/lib/Analysis/CMakeLists.txt +++ b/lib/Analysis/CMakeLists.txt @@ -12,9 +12,7 @@ add_llvm_library(LLVMAnalysis CaptureTracking.cpp CodeMetrics.cpp ConstantFolding.cpp - DIBuilder.cpp DbgInfoPrinter.cpp - DebugInfo.cpp DomPrinter.cpp DominanceFrontier.cpp IVUsers.cpp @@ -59,4 +57,6 @@ add_llvm_library(LLVMAnalysis ValueTracking.cpp ) +add_dependencies(LLVMAnalysis intrinsics_gen) + add_subdirectory(IPA) diff --git a/lib/Analysis/CaptureTracking.cpp b/lib/Analysis/CaptureTracking.cpp index dd33eeb..974b906 100644 --- a/lib/Analysis/CaptureTracking.cpp +++ b/lib/Analysis/CaptureTracking.cpp @@ -34,7 +34,7 @@ namespace { bool captured(Use *U) { if (isa<ReturnInst>(U->getUser()) && !ReturnCaptures) - return false; + return false; Captured = true; return true; diff --git a/lib/Analysis/CodeMetrics.cpp b/lib/Analysis/CodeMetrics.cpp index 316e7bc..acda34b 100644 --- a/lib/Analysis/CodeMetrics.cpp +++ b/lib/Analysis/CodeMetrics.cpp @@ -22,7 +22,11 @@ using namespace llvm; /// callIsSmall - If a call is likely to lower to a single target instruction, /// or is otherwise deemed small return true. /// TODO: Perhaps calls like memcpy, strcpy, etc? -bool llvm::callIsSmall(const Function *F) { +bool llvm::callIsSmall(ImmutableCallSite CS) { + if (isa<IntrinsicInst>(CS.getInstruction())) + return true; + + const Function *F = CS.getCalledFunction(); if (!F) return false; if (F->hasLocalLinkage()) return false; @@ -79,8 +83,24 @@ bool llvm::isInstructionFree(const Instruction *I, const TargetData *TD) { if (const CastInst *CI = dyn_cast<CastInst>(I)) { // Noop casts, including ptr <-> int, don't count. - if (CI->isLosslessCast() || isa<IntToPtrInst>(CI) || isa<PtrToIntInst>(CI)) + if (CI->isLosslessCast()) + return true; + + Value *Op = CI->getOperand(0); + // An inttoptr cast is free so long as the input is a legal integer type + // which doesn't contain values outside the range of a pointer. + if (isa<IntToPtrInst>(CI) && TD && + TD->isLegalInteger(Op->getType()->getScalarSizeInBits()) && + Op->getType()->getScalarSizeInBits() <= TD->getPointerSizeInBits()) return true; + + // A ptrtoint cast is free so long as the result is large enough to store + // the pointer, and a legal integer type. + if (isa<PtrToIntInst>(CI) && TD && + TD->isLegalInteger(Op->getType()->getScalarSizeInBits()) && + Op->getType()->getScalarSizeInBits() >= TD->getPointerSizeInBits()) + return true; + // trunc to a native type is free (assuming the target has compare and // shift-right of the same width). if (TD && isa<TruncInst>(CI) && @@ -126,7 +146,7 @@ void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB, isRecursive = true; } - if (!isa<IntrinsicInst>(II) && !callIsSmall(CS.getCalledFunction())) { + if (!callIsSmall(CS)) { // Each argument to a call takes on average one instruction to set up. 
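// Illustrative sketch, not part of the patch: BasicAA's getObjectSize()
// wrapper above now defers to the llvm::getObjectSize() entry point added
// in MemoryBuiltins.cpp later in this diff. A typical caller checks the
// boolean result and falls back to UnknownSize (V and TD in scope):
uint64_t Size;
if (!getObjectSize(V, Size, &TD, /*RoundToAlign=*/true))
  Size = AliasAnalysis::UnknownSize;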
NumInsts += CS.arg_size(); diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp index 7a0a4e1..7ced848 100644 --- a/lib/Analysis/ConstantFolding.cpp +++ b/lib/Analysis/ConstantFolding.cpp @@ -681,6 +681,7 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops, // This makes it easy to determine if the getelementptr is "inbounds". // Also, this helps GlobalOpt do SROA on GlobalVariables. Type *Ty = Ptr->getType(); + assert(Ty->isPointerTy() && "Forming regular GEP of non-pointer type"); SmallVector<Constant*, 32> NewIdxs; do { if (SequentialType *ATy = dyn_cast<SequentialType>(Ty)) { @@ -711,10 +712,17 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops, } Ty = ATy->getElementType(); } else if (StructType *STy = dyn_cast<StructType>(Ty)) { - // Determine which field of the struct the offset points into. The - // getZExtValue is at least as safe as the StructLayout API because we - // know the offset is within the struct at this point. + // If we end up with an offset that isn't valid for this struct type, we + // can't re-form this GEP in a regular form, so bail out. The pointer + // operand likely went through casts that are necessary to make the GEP + // sensible. const StructLayout &SL = *TD->getStructLayout(STy); + if (Offset.uge(SL.getSizeInBytes())) + break; + + // Determine which field of the struct the offset points into. The + // getZExtValue is fine as we've already ensured that the offset is + // within the range representable by the StructLayout API. unsigned ElIdx = SL.getElementContainingOffset(Offset.getZExtValue()); NewIdxs.push_back(ConstantInt::get(Type::getInt32Ty(Ty->getContext()), ElIdx)); @@ -772,14 +780,21 @@ Constant *llvm::ConstantFoldInstruction(Instruction *I, // all operands are constants. if (isa<UndefValue>(Incoming)) continue; - // If the incoming value is not a constant, or is a different constant to - // the one we saw previously, then give up. + // If the incoming value is not a constant, then give up. Constant *C = dyn_cast<Constant>(Incoming); - if (!C || (CommonValue && C != CommonValue)) + if (!C) + return 0; + // Fold the PHI's operands. + if (ConstantExpr *NewC = dyn_cast<ConstantExpr>(C)) + C = ConstantFoldConstantExpression(NewC, TD, TLI); + // If the incoming value is a different constant to + // the one we saw previously, then give up. + if (CommonValue && C != CommonValue) return 0; CommonValue = C; } + // If we reach here, all incoming values are the same constant or undef. return CommonValue ? CommonValue : UndefValue::get(PN->getType()); } @@ -787,12 +802,18 @@ Constant *llvm::ConstantFoldInstruction(Instruction *I, // Scan the operand list, checking to see if they are all constants, if so, // hand off to ConstantFoldInstOperands. SmallVector<Constant*, 8> Ops; - for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) - if (Constant *Op = dyn_cast<Constant>(*i)) - Ops.push_back(Op); - else + for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) { + Constant *Op = dyn_cast<Constant>(*i); + if (!Op) return 0; // All operands not constant! + // Fold the Instruction's operands. 
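// Sketch of the strengthened PHI rule in ConstantFoldInstruction above,
// written as a hypothetical standalone helper: undef incoming values are
// ignored, and every other incoming value must now fold (including
// through ConstantFoldConstantExpression) to one common constant.
static Constant *foldPHIToCommonConstant(PHINode *PN, const TargetData *TD,
                                         const TargetLibraryInfo *TLI) {
  Constant *CommonValue = 0;
  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
    Value *Incoming = PN->getIncomingValue(i);
    if (isa<UndefValue>(Incoming))
      continue;                         // undef is compatible with anything
    Constant *C = dyn_cast<Constant>(Incoming);
    if (!C)
      return 0;                         // a non-constant operand: give up
    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
      C = ConstantFoldConstantExpression(CE, TD, TLI);
    if (CommonValue && C != CommonValue)
      return 0;                         // two distinct constants: give up
    CommonValue = C;
  }
  // All incoming values were the same constant, or all were undef.
  return CommonValue ? CommonValue : UndefValue::get(PN->getType());
}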
+ if (ConstantExpr *NewCE = dyn_cast<ConstantExpr>(Op)) + Op = ConstantFoldConstantExpression(NewCE, TD, TLI); + + Ops.push_back(Op); + } + if (const CmpInst *CI = dyn_cast<CmpInst>(I)) return ConstantFoldCompareInstOperands(CI->getPredicate(), Ops[0], Ops[1], TD, TLI); diff --git a/lib/Analysis/DbgInfoPrinter.cpp b/lib/Analysis/DbgInfoPrinter.cpp index cd832ab..41cd34c 100644 --- a/lib/Analysis/DbgInfoPrinter.cpp +++ b/lib/Analysis/DbgInfoPrinter.cpp @@ -16,14 +16,14 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Pass.h" +#include "llvm/DebugInfo.h" #include "llvm/Function.h" #include "llvm/IntrinsicInst.h" #include "llvm/Metadata.h" #include "llvm/Module.h" -#include "llvm/Assembly/Writer.h" -#include "llvm/Analysis/DebugInfo.h" +#include "llvm/Pass.h" #include "llvm/Analysis/Passes.h" +#include "llvm/Assembly/Writer.h" #include "llvm/Support/CFG.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" diff --git a/lib/Analysis/IPA/CMakeLists.txt b/lib/Analysis/IPA/CMakeLists.txt index 8ffef29..34d6d1b 100644 --- a/lib/Analysis/IPA/CMakeLists.txt +++ b/lib/Analysis/IPA/CMakeLists.txt @@ -5,3 +5,5 @@ add_llvm_library(LLVMipa GlobalsModRef.cpp IPA.cpp ) + +add_dependencies(LLVMipa intrinsics_gen) diff --git a/lib/Analysis/IPA/CallGraphSCCPass.cpp b/lib/Analysis/IPA/CallGraphSCCPass.cpp index 963da75..449b7ee 100644 --- a/lib/Analysis/IPA/CallGraphSCCPass.cpp +++ b/lib/Analysis/IPA/CallGraphSCCPass.cpp @@ -246,7 +246,9 @@ bool CGPassManager::RefreshCallGraph(CallGraphSCC &CurSCC, for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { CallSite CS(cast<Value>(I)); - if (!CS || isa<IntrinsicInst>(I)) continue; + if (!CS) continue; + Function *Callee = CS.getCalledFunction(); + if (Callee && Callee->isIntrinsic()) continue; // If this call site already existed in the callgraph, just verify it // matches up to expectations and remove it from CallSites. diff --git a/lib/Analysis/IPA/GlobalsModRef.cpp b/lib/Analysis/IPA/GlobalsModRef.cpp index c1d8e3e..22f6e96 100644 --- a/lib/Analysis/IPA/GlobalsModRef.cpp +++ b/lib/Analysis/IPA/GlobalsModRef.cpp @@ -329,15 +329,8 @@ bool GlobalsModRef::AnalyzeIndirectGlobalMemory(GlobalValue *GV) { // Check the value being stored. Value *Ptr = GetUnderlyingObject(SI->getOperand(0)); - if (isMalloc(Ptr)) { - // Okay, easy case. - } else if (CallInst *CI = dyn_cast<CallInst>(Ptr)) { - Function *F = CI->getCalledFunction(); - if (!F || !F->isDeclaration()) return false; // Too hard to analyze. - if (F->getName() != "calloc") return false; // Not calloc. - } else { + if (!isAllocLikeFn(Ptr)) return false; // Too hard to analyze. - } // Analyze all uses of the allocation. If any of them are used in a // non-simple way (e.g. stored to another global) bail out. @@ -454,19 +447,18 @@ void GlobalsModRef::AnalyzeCallGraph(CallGraph &CG, Module &M) { for (inst_iterator II = inst_begin(SCC[i]->getFunction()), E = inst_end(SCC[i]->getFunction()); II != E && FunctionEffect != ModRef; ++II) - if (isa<LoadInst>(*II)) { + if (LoadInst *LI = dyn_cast<LoadInst>(&*II)) { FunctionEffect |= Ref; - if (cast<LoadInst>(*II).isVolatile()) + if (LI->isVolatile()) // Volatile loads may have side-effects, so mark them as writing // memory (for example, a flag inside the processor). 
FunctionEffect |= Mod; - } else if (isa<StoreInst>(*II)) { + } else if (StoreInst *SI = dyn_cast<StoreInst>(&*II)) { FunctionEffect |= Mod; - if (cast<StoreInst>(*II).isVolatile()) + if (SI->isVolatile()) // Treat volatile stores as reading memory somewhere. FunctionEffect |= Ref; - } else if (isMalloc(&cast<Instruction>(*II)) || - isFreeCall(&cast<Instruction>(*II))) { + } else if (isAllocationFn(&*II) || isFreeCall(&*II)) { FunctionEffect |= ModRef; } else if (IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(&*II)) { // The callgraph doesn't include intrinsic calls. diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp index b80966b..0a6682a 100644 --- a/lib/Analysis/IVUsers.cpp +++ b/lib/Analysis/IVUsers.cpp @@ -21,6 +21,7 @@ #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Target/TargetData.h" #include "llvm/Assembly/Writer.h" #include "llvm/ADT/STLExtras.h" @@ -120,6 +121,12 @@ bool IVUsers::AddUsersImpl(Instruction *I, if (!SE->isSCEVable(I->getType())) return false; // Void and FP expressions cannot be reduced. + // IVUsers is used by LSR which assumes that all SCEV expressions are safe to + // pass to SCEVExpander. Expressions are not safe to expand if they represent + // operations that are not safe to speculate, namely integer division. + if (!isa<PHINode>(I) && !isSafeToSpeculativelyExecute(I, TD)) + return false; + // LSR is not APInt clean, do not touch integers bigger than 64-bits. // Also avoid creating IVs of non-native types. For example, we don't want a // 64-bit IV in 32-bit code just because the loop has one 64-bit cast. diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp index 3e3d2ab..a6bf4a8 100644 --- a/lib/Analysis/InlineCost.cpp +++ b/lib/Analysis/InlineCost.cpp @@ -178,7 +178,7 @@ bool CallAnalyzer::lookupSROAArgAndCost( /// \brief Disable SROA for the candidate marked by this cost iterator. /// -/// This markes the candidate as no longer viable for SROA, and adds the cost +/// This marks the candidate as no longer viable for SROA, and adds the cost /// savings associated with it back into the inline cost measurement. void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) { // If we're no longer able to perform SROA we need to undo its cost savings @@ -398,10 +398,7 @@ bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) { if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) SROAArgValues[&I] = SROAArg; - // A ptrtoint cast is free so long as the result is large enough to store the - // pointer, and a legal integer type. - return TD && TD->isLegalInteger(IntegerSize) && - IntegerSize >= TD->getPointerSizeInBits(); + return isInstructionFree(&I, TD); } bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) { @@ -428,10 +425,7 @@ bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) { if (lookupSROAArgAndCost(Op, SROAArg, CostIt)) SROAArgValues[&I] = SROAArg; - // An inttoptr cast is free so long as the input is a legal integer type - // which doesn't contain values outside the range of a pointer. - return TD && TD->isLegalInteger(IntegerSize) && - IntegerSize <= TD->getPointerSizeInBits(); + return isInstructionFree(&I, TD); } bool CallAnalyzer::visitCastInst(CastInst &I) { @@ -445,24 +439,7 @@ bool CallAnalyzer::visitCastInst(CastInst &I) { // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere. disableSROA(I.getOperand(0)); - // No-op casts don't have any cost. 
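// Illustrative sketch, not part of the patch: why the new IVUsers guard
// above matters. If a loop computes
//   %q = udiv i64 %n, %d
// and LSR hands %q's SCEV to SCEVExpander, the expanded division could
// execute on a path where %d == 0. The guard keeps any instruction that
// is not a PHI and not provably safe to speculate out of the IV-user set:
if (!isa<PHINode>(I) && !isSafeToSpeculativelyExecute(I, TD))
  return false; // e.g. udiv/sdiv/urem/srem with a non-constant divisor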
- if (I.isLosslessCast()) - return true; - - // trunc to a native type is free (assuming the target has compare and - // shift-right of the same width). - if (TD && isa<TruncInst>(I) && - TD->isLegalInteger(TD->getTypeSizeInBits(I.getType()))) - return true; - - // Result of a cmp instruction is often extended (to be used by other - // cmp instructions, logical or return instructions). These are usually - // no-ops on most sane targets. - if (isa<CmpInst>(I.getOperand(0))) - return true; - - // Assume the rest of the casts require work. - return false; + return isInstructionFree(&I, TD); } bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) { @@ -636,21 +613,11 @@ bool CallAnalyzer::visitCallSite(CallSite CS) { default: return Base::visitCallSite(CS); - case Intrinsic::dbg_declare: - case Intrinsic::dbg_value: - case Intrinsic::invariant_start: - case Intrinsic::invariant_end: - case Intrinsic::lifetime_start: - case Intrinsic::lifetime_end: case Intrinsic::memset: case Intrinsic::memcpy: case Intrinsic::memmove: - case Intrinsic::objectsize: - case Intrinsic::ptr_annotation: - case Intrinsic::var_annotation: - // SROA can usually chew through these intrinsics and they have no cost - // so don't pay the price of analyzing them in detail. - return true; + // SROA can usually chew through these intrinsics, but they aren't free. + return false; } } @@ -662,7 +629,7 @@ bool CallAnalyzer::visitCallSite(CallSite CS) { return false; } - if (!callIsSmall(F)) { + if (!callIsSmall(CS)) { // We account for the average 1 instruction per call argument setup // here. Cost += CS.arg_size() * InlineConstants::InstrCost; @@ -706,6 +673,11 @@ bool CallAnalyzer::visitCallSite(CallSite CS) { } bool CallAnalyzer::visitInstruction(Instruction &I) { + // Some instructions are free. All of the free intrinsics can also be + // handled by SROA, etc. + if (isInstructionFree(&I, TD)) + return true; + // We found something we don't understand or can't handle. Mark any SROA-able // values in the operand list as no longer viable. 
for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI) diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index 16e7a72..16a9a04 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -47,7 +47,7 @@ struct Query { const DominatorTree *DT; Query(const TargetData *td, const TargetLibraryInfo *tli, - const DominatorTree *dt) : TD(td), TLI(tli), DT(dt) {}; + const DominatorTree *dt) : TD(td), TLI(tli), DT(dt) {} }; static Value *SimplifyAndInst(Value *, Value *, const Query &, unsigned); diff --git a/lib/Analysis/LazyValueInfo.cpp b/lib/Analysis/LazyValueInfo.cpp index 5ca2746..9140786 100644 --- a/lib/Analysis/LazyValueInfo.cpp +++ b/lib/Analysis/LazyValueInfo.cpp @@ -172,7 +172,7 @@ public: if (NewR.isEmptySet()) return markOverdefined(); - bool changed = Range == NewR; + bool changed = Range != NewR; Range = NewR; return changed; } @@ -457,8 +457,10 @@ void LazyValueInfoCache::eraseBlock(BasicBlock *BB) { void LazyValueInfoCache::solve() { while (!BlockValueStack.empty()) { std::pair<BasicBlock*, Value*> &e = BlockValueStack.top(); - if (solveBlockValue(e.second, e.first)) + if (solveBlockValue(e.second, e.first)) { + assert(BlockValueStack.top() == e); BlockValueStack.pop(); + } } } @@ -766,15 +768,10 @@ bool LazyValueInfoCache::solveBlockValueConstantRange(LVILatticeVal &BBLV, return true; } -/// getEdgeValue - This method attempts to infer more complex -bool LazyValueInfoCache::getEdgeValue(Value *Val, BasicBlock *BBFrom, - BasicBlock *BBTo, LVILatticeVal &Result) { - // If already a constant, there is nothing to compute. - if (Constant *VC = dyn_cast<Constant>(Val)) { - Result = LVILatticeVal::get(VC); - return true; - } - +/// \brief Compute the value of Val on the edge BBFrom -> BBTo. Returns false if +/// Val is not constrained on the edge. +static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom, + BasicBlock *BBTo, LVILatticeVal &Result) { // TODO: Handle more complex conditionals. If (v == 0 || v2 < 1) is false, we // know that v != 0. if (BranchInst *BI = dyn_cast<BranchInst>(BBFrom->getTerminator())) { @@ -818,7 +815,7 @@ bool LazyValueInfoCache::getEdgeValue(Value *Val, BasicBlock *BBFrom, ConstantInt *CI = dyn_cast<ConstantInt>(ICI->getOperand(1)); if (CI && (ICI->getOperand(0) == Val || NegOffset)) { // Calculate the range of values that would satisfy the comparison. - ConstantRange CmpRange(CI->getValue(), CI->getValue()+1); + ConstantRange CmpRange(CI->getValue()); ConstantRange TrueValues = ConstantRange::makeICmpRegion(ICI->getPredicate(), CmpRange); @@ -827,25 +824,8 @@ bool LazyValueInfoCache::getEdgeValue(Value *Val, BasicBlock *BBFrom, // If we're interested in the false dest, invert the condition. if (!isTrueDest) TrueValues = TrueValues.inverse(); - - // Figure out the possible values of the query BEFORE this branch. - if (!hasBlockValue(Val, BBFrom)) { - BlockValueStack.push(std::make_pair(BBFrom, Val)); - return false; - } - - LVILatticeVal InBlock = getBlockValue(Val, BBFrom); - if (!InBlock.isConstantRange()) { - Result = LVILatticeVal::getRange(TrueValues); - return true; - } - - // Find all potential values that satisfy both the input and output - // conditions. 
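// Sketch of the one-character LazyValueInfo fix above: markConstantRange()
// must report whether the lattice value actually changed, and the old
// `Range == NewR` reported the opposite, so a caller could mistake a
// narrowing update for a fixed point. With hypothetical 32-bit ranges:
ConstantRange Range(APInt(32, 0), APInt(32, 10)); // current:  [0, 10)
ConstantRange NewR (APInt(32, 0), APInt(32, 5));  // narrowed: [0, 5)
bool changed = Range != NewR; // true: propagation must continue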
- ConstantRange PossibleValues = - TrueValues.intersectWith(InBlock.getConstantRange()); - - Result = LVILatticeVal::getRange(PossibleValues); + + Result = LVILatticeVal::getRange(TrueValues); return true; } } @@ -855,40 +835,71 @@ bool LazyValueInfoCache::getEdgeValue(Value *Val, BasicBlock *BBFrom, // If the edge was formed by a switch on the value, then we may know exactly // what it is. if (SwitchInst *SI = dyn_cast<SwitchInst>(BBFrom->getTerminator())) { - if (SI->getCondition() == Val) { - // We don't know anything in the default case. - if (SI->getDefaultDest() == BBTo) { - Result.markOverdefined(); - return true; - } - - // We only know something if there is exactly one value that goes from - // BBFrom to BBTo. - unsigned NumEdges = 0; - ConstantInt *EdgeVal = 0; - for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); - i != e; ++i) { - if (i.getCaseSuccessor() != BBTo) continue; - if (NumEdges++) break; - EdgeVal = i.getCaseValue(); - } - assert(EdgeVal && "Missing successor?"); - if (NumEdges == 1) { - Result = LVILatticeVal::get(EdgeVal); - return true; - } + if (SI->getCondition() != Val) + return false; + + bool DefaultCase = SI->getDefaultDest() == BBTo; + unsigned BitWidth = Val->getType()->getIntegerBitWidth(); + ConstantRange EdgesVals(BitWidth, DefaultCase/*isFullSet*/); + + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); + i != e; ++i) { + ConstantRange EdgeVal(i.getCaseValue()->getValue()); + if (DefaultCase) + EdgesVals = EdgesVals.difference(EdgeVal); + else if (i.getCaseSuccessor() == BBTo) + EdgesVals = EdgesVals.unionWith(EdgeVal); } - } - - // Otherwise see if the value is known in the block. - if (hasBlockValue(Val, BBFrom)) { - Result = getBlockValue(Val, BBFrom); + Result = LVILatticeVal::getRange(EdgesVals); return true; } - BlockValueStack.push(std::make_pair(BBFrom, Val)); return false; } +/// \brief Compute the value of Val on the edge BBFrom -> BBTo, or the value at +/// the basic block if the edge does not constraint Val. +bool LazyValueInfoCache::getEdgeValue(Value *Val, BasicBlock *BBFrom, + BasicBlock *BBTo, LVILatticeVal &Result) { + // If already a constant, there is nothing to compute. + if (Constant *VC = dyn_cast<Constant>(Val)) { + Result = LVILatticeVal::get(VC); + return true; + } + + if (getEdgeValueLocal(Val, BBFrom, BBTo, Result)) { + if (!Result.isConstantRange() || + Result.getConstantRange().getSingleElement()) + return true; + + // FIXME: this check should be moved to the beginning of the function when + // LVI better supports recursive values. Even for the single value case, we + // can intersect to detect dead code (an empty range). + if (!hasBlockValue(Val, BBFrom)) { + BlockValueStack.push(std::make_pair(BBFrom, Val)); + return false; + } + + // Try to intersect ranges of the BB and the constraint on the edge. 
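// Illustrative sketch, not part of the patch: the new switch handling
// above folds all edges into one ConstantRange. For a hypothetical
//   switch i32 %v: case 1 -> BBTo, case 3 -> BBTo, default -> elsewhere
// the non-default edge starts from the empty set and unions each case:
ConstantRange EdgesVals(32, /*isFullSet=*/false);
EdgesVals = EdgesVals.unionWith(ConstantRange(APInt(32, 1)));
EdgesVals = EdgesVals.unionWith(ConstantRange(APInt(32, 3)));
// The default edge instead starts from the full set and calls
// difference() with every case value.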
+ LVILatticeVal InBlock = getBlockValue(Val, BBFrom); + if (!InBlock.isConstantRange()) + return true; + + ConstantRange Range = + Result.getConstantRange().intersectWith(InBlock.getConstantRange()); + Result = LVILatticeVal::getRange(Range); + return true; + } + + if (!hasBlockValue(Val, BBFrom)) { + BlockValueStack.push(std::make_pair(BBFrom, Val)); + return false; + } + + // if we couldn't compute the value on the edge, use the value from the BB + Result = getBlockValue(Val, BBFrom); + return true; +} + LVILatticeVal LazyValueInfoCache::getValueInBlock(Value *V, BasicBlock *BB) { DEBUG(dbgs() << "LVI Getting block end value " << *V << " at '" << BB->getName() << "'\n"); diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp index f7a60a1..20c33a3 100644 --- a/lib/Analysis/LoopInfo.cpp +++ b/lib/Analysis/LoopInfo.cpp @@ -18,6 +18,7 @@ #include "llvm/Constants.h" #include "llvm/Instructions.h" #include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/LoopInfoImpl.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Assembly/Writer.h" @@ -29,6 +30,10 @@ #include <algorithm> using namespace llvm; +// Explicitly instantiate methods in LoopInfoImpl.h for IR-level Loops. +template class llvm::LoopBase<BasicBlock, Loop>; +template class llvm::LoopInfoBase<BasicBlock, Loop>; + // Always verify loopinfo if expensive checking is enabled. #ifdef XDEBUG static bool VerifyLoopInfo = true; @@ -507,7 +512,7 @@ Loop *UnloopUpdater::getNearestLoop(BasicBlock *BB, Loop *BBLoop) { // bool LoopInfo::runOnFunction(Function &) { releaseMemory(); - LI.Calculate(getAnalysis<DominatorTree>().getBase()); // Update + LI.Analyze(getAnalysis<DominatorTree>().getBase()); return false; } @@ -589,9 +594,6 @@ void LoopInfo::verifyAnalysis() const { } // Verify that blocks are mapped to valid loops. - // - // FIXME: With an up-to-date DFS (see LoopIterator.h) and DominatorTree, we - // could also verify that the blocks are still in the correct loops. for (DenseMap<BasicBlock*, Loop*>::const_iterator I = LI.BBMap.begin(), E = LI.BBMap.end(); I != E; ++I) { assert(Loops.count(I->second) && "orphaned loop"); diff --git a/lib/Analysis/LoopPass.cpp b/lib/Analysis/LoopPass.cpp index aba700a..1540112 100644 --- a/lib/Analysis/LoopPass.cpp +++ b/lib/Analysis/LoopPass.cpp @@ -162,7 +162,7 @@ void LPPassManager::deleteSimpleAnalysisValue(Value *V, Loop *L) { // Recurse through all subloops and all loops into LQ. static void addLoopIntoQueue(Loop *L, std::deque<Loop *> &LQ) { LQ.push_back(L); - for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) + for (Loop::reverse_iterator I = L->rbegin(), E = L->rend(); I != E; ++I) addLoopIntoQueue(*I, LQ); } @@ -183,8 +183,12 @@ bool LPPassManager::runOnFunction(Function &F) { // Collect inherited analysis from Module level pass manager. populateInheritedAnalysis(TPM->activeStack); - // Populate Loop Queue - for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) + // Populate the loop queue in reverse program order. There is no clear need to + // process sibling loops in either forward or reverse order. There may be some + // advantage in deleting uses in a later loop before optimizing the + // definitions in an earlier loop. If we find a clear reason to process in + // forward order, then a forward variant of LoopPassManager should be created. 
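// Sketch of the edge/block intersection performed above: if the edge
// constrains %v to [0, 10) while the value at entry to BBFrom is already
// known to lie in [5, 100), the refined result is their intersection.
// Hypothetical 32-bit values:
ConstantRange EdgeCR (APInt(32, 0), APInt(32, 10));
ConstantRange BlockCR(APInt(32, 5), APInt(32, 100));
ConstantRange Refined = EdgeCR.intersectWith(BlockCR); // [5, 10)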
+ for (LoopInfo::reverse_iterator I = LI->rbegin(), E = LI->rend(); I != E; ++I) addLoopIntoQueue(*I, LQ); if (LQ.empty()) // No loops, skip calling finalizers diff --git a/lib/Analysis/MemDepPrinter.cpp b/lib/Analysis/MemDepPrinter.cpp index 22414b3..8578a63 100644 --- a/lib/Analysis/MemDepPrinter.cpp +++ b/lib/Analysis/MemDepPrinter.cpp @@ -32,7 +32,7 @@ namespace { Unknown }; - static const char* DepTypeStr[]; + static const char *const DepTypeStr[]; typedef PointerIntPair<const Instruction *, 2, DepType> InstTypePair; typedef std::pair<InstTypePair, const BasicBlock *> Dep; @@ -88,7 +88,7 @@ FunctionPass *llvm::createMemDepPrinter() { return new MemDepPrinter(); } -const char* MemDepPrinter::DepTypeStr[] +const char *const MemDepPrinter::DepTypeStr[] = {"Clobber", "Def", "NonFuncLocal", "Unknown"}; bool MemDepPrinter::runOnFunction(Function &F) { diff --git a/lib/Analysis/MemoryBuiltins.cpp b/lib/Analysis/MemoryBuiltins.cpp index b145650..8d99ec3 100644 --- a/lib/Analysis/MemoryBuiltins.cpp +++ b/lib/Analysis/MemoryBuiltins.cpp @@ -12,80 +12,168 @@ // //===----------------------------------------------------------------------===// +#define DEBUG_TYPE "memory-builtins" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/MemoryBuiltins.h" -#include "llvm/Constants.h" +#include "llvm/GlobalVariable.h" #include "llvm/Instructions.h" +#include "llvm/Intrinsics.h" +#include "llvm/Metadata.h" #include "llvm/Module.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; -//===----------------------------------------------------------------------===// -// malloc Call Utility Functions. -// +enum AllocType { + MallocLike = 1<<0, // allocates + CallocLike = 1<<1, // allocates + bzero + ReallocLike = 1<<2, // reallocates + StrDupLike = 1<<3, + AllocLike = MallocLike | CallocLike | StrDupLike, + AnyAlloc = MallocLike | CallocLike | ReallocLike | StrDupLike +}; + +struct AllocFnsTy { + const char *Name; + AllocType AllocTy; + unsigned char NumParams; + // First and Second size parameters (or -1 if unused) + signed char FstParam, SndParam; +}; + +// FIXME: certain users need more information. E.g., SimplifyLibCalls needs to +// know which functions are nounwind, noalias, nocapture parameters, etc. +static const AllocFnsTy AllocationFnData[] = { + {"malloc", MallocLike, 1, 0, -1}, + {"valloc", MallocLike, 1, 0, -1}, + {"_Znwj", MallocLike, 1, 0, -1}, // new(unsigned int) + {"_ZnwjRKSt9nothrow_t", MallocLike, 2, 0, -1}, // new(unsigned int, nothrow) + {"_Znwm", MallocLike, 1, 0, -1}, // new(unsigned long) + {"_ZnwmRKSt9nothrow_t", MallocLike, 2, 0, -1}, // new(unsigned long, nothrow) + {"_Znaj", MallocLike, 1, 0, -1}, // new[](unsigned int) + {"_ZnajRKSt9nothrow_t", MallocLike, 2, 0, -1}, // new[](unsigned int, nothrow) + {"_Znam", MallocLike, 1, 0, -1}, // new[](unsigned long) + {"_ZnamRKSt9nothrow_t", MallocLike, 2, 0, -1}, // new[](unsigned long, nothrow) + {"posix_memalign", MallocLike, 3, 2, -1}, + {"calloc", CallocLike, 2, 0, 1}, + {"realloc", ReallocLike, 2, 1, -1}, + {"reallocf", ReallocLike, 2, 1, -1}, + {"strdup", StrDupLike, 1, -1, -1}, + {"strndup", StrDupLike, 2, -1, -1} +}; + -/// isMalloc - Returns true if the value is either a malloc call or a -/// bitcast of the result of a malloc call. 
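// Sketch of how a row of the AllocationFnData table above is read: the
// calloc entry {"calloc", CallocLike, 2, 0, 1} means the prototype takes
// two parameters and the allocation size is parameter 0 * parameter 1;
// -1 marks an unused size parameter. A hypothetical extra allocator
// (not part of the patch) would be described the same way:
static const AllocFnsTy MyAllocFnData =
  {"my_alloc", MallocLike, 1, 0, -1}; // one parameter, size is parameter 0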
-bool llvm::isMalloc(const Value *I) { - return extractMallocCall(I) || extractMallocCallFromBitCast(I); +static Function *getCalledFunction(const Value *V, bool LookThroughBitCast) { + if (LookThroughBitCast) + V = V->stripPointerCasts(); + + CallSite CS(const_cast<Value*>(V)); + if (!CS.getInstruction()) + return 0; + + Function *Callee = CS.getCalledFunction(); + if (!Callee || !Callee->isDeclaration()) + return 0; + return Callee; } -static bool isMallocCall(const CallInst *CI) { - if (!CI) - return false; +/// \brief Returns the allocation data for the given value if it is a call to a +/// known allocation function, and NULL otherwise. +static const AllocFnsTy *getAllocationData(const Value *V, AllocType AllocTy, + bool LookThroughBitCast = false) { + Function *Callee = getCalledFunction(V, LookThroughBitCast); + if (!Callee) + return 0; - Function *Callee = CI->getCalledFunction(); - if (Callee == 0 || !Callee->isDeclaration()) - return false; - if (Callee->getName() != "malloc" && - Callee->getName() != "_Znwj" && // operator new(unsigned int) - Callee->getName() != "_Znwm" && // operator new(unsigned long) - Callee->getName() != "_Znaj" && // operator new[](unsigned int) - Callee->getName() != "_Znam") // operator new[](unsigned long) - return false; + unsigned i = 0; + bool found = false; + for ( ; i < array_lengthof(AllocationFnData); ++i) { + if (Callee->getName() == AllocationFnData[i].Name) { + found = true; + break; + } + } + if (!found) + return 0; - // Check malloc prototype. - // FIXME: workaround for PR5130, this will be obsolete when a nobuiltin - // attribute will exist. + const AllocFnsTy *FnData = &AllocationFnData[i]; + if ((FnData->AllocTy & AllocTy) == 0) + return 0; + + // Check function prototype. + // FIXME: Check the nobuiltin metadata?? (PR5130) + int FstParam = FnData->FstParam; + int SndParam = FnData->SndParam; FunctionType *FTy = Callee->getFunctionType(); - return FTy->getReturnType() == Type::getInt8PtrTy(FTy->getContext()) && - FTy->getNumParams() == 1 && - (FTy->getParamType(0)->isIntegerTy(32) || - FTy->getParamType(0)->isIntegerTy(64)); + + if (FTy->getReturnType() == Type::getInt8PtrTy(FTy->getContext()) && + FTy->getNumParams() == FnData->NumParams && + (FstParam < 0 || + (FTy->getParamType(FstParam)->isIntegerTy(32) || + FTy->getParamType(FstParam)->isIntegerTy(64))) && + (SndParam < 0 || + FTy->getParamType(SndParam)->isIntegerTy(32) || + FTy->getParamType(SndParam)->isIntegerTy(64))) + return FnData; + return 0; } -/// extractMallocCall - Returns the corresponding CallInst if the instruction -/// is a malloc call. Since CallInst::CreateMalloc() only creates calls, we -/// ignore InvokeInst here. -const CallInst *llvm::extractMallocCall(const Value *I) { - const CallInst *CI = dyn_cast<CallInst>(I); - return (isMallocCall(CI)) ? CI : NULL; +static bool hasNoAliasAttr(const Value *V, bool LookThroughBitCast) { + ImmutableCallSite CS(LookThroughBitCast ? V->stripPointerCasts() : V); + return CS && CS.hasFnAttr(Attribute::NoAlias); } -CallInst *llvm::extractMallocCall(Value *I) { - CallInst *CI = dyn_cast<CallInst>(I); - return (isMallocCall(CI)) ? CI : NULL; + +/// \brief Tests if a value is a call or invoke to a library function that +/// allocates or reallocates memory (either malloc, calloc, realloc, or strdup +/// like). 
+bool llvm::isAllocationFn(const Value *V, bool LookThroughBitCast) { + return getAllocationData(V, AnyAlloc, LookThroughBitCast); } -static bool isBitCastOfMallocCall(const BitCastInst *BCI) { - if (!BCI) - return false; - - return isMallocCall(dyn_cast<CallInst>(BCI->getOperand(0))); +/// \brief Tests if a value is a call or invoke to a function that returns a +/// NoAlias pointer (including malloc/calloc/realloc/strdup-like functions). +bool llvm::isNoAliasFn(const Value *V, bool LookThroughBitCast) { + // it's safe to consider realloc as noalias since accessing the original + // pointer is undefined behavior + return isAllocationFn(V, LookThroughBitCast) || + hasNoAliasAttr(V, LookThroughBitCast); } -/// extractMallocCallFromBitCast - Returns the corresponding CallInst if the -/// instruction is a bitcast of the result of a malloc call. -CallInst *llvm::extractMallocCallFromBitCast(Value *I) { - BitCastInst *BCI = dyn_cast<BitCastInst>(I); - return (isBitCastOfMallocCall(BCI)) ? cast<CallInst>(BCI->getOperand(0)) - : NULL; +/// \brief Tests if a value is a call or invoke to a library function that +/// allocates uninitialized memory (such as malloc). +bool llvm::isMallocLikeFn(const Value *V, bool LookThroughBitCast) { + return getAllocationData(V, MallocLike, LookThroughBitCast); } -const CallInst *llvm::extractMallocCallFromBitCast(const Value *I) { - const BitCastInst *BCI = dyn_cast<BitCastInst>(I); - return (isBitCastOfMallocCall(BCI)) ? cast<CallInst>(BCI->getOperand(0)) - : NULL; +/// \brief Tests if a value is a call or invoke to a library function that +/// allocates zero-filled memory (such as calloc). +bool llvm::isCallocLikeFn(const Value *V, bool LookThroughBitCast) { + return getAllocationData(V, CallocLike, LookThroughBitCast); +} + +/// \brief Tests if a value is a call or invoke to a library function that +/// allocates memory (either malloc, calloc, or strdup like). +bool llvm::isAllocLikeFn(const Value *V, bool LookThroughBitCast) { + return getAllocationData(V, AllocLike, LookThroughBitCast); +} + +/// \brief Tests if a value is a call or invoke to a library function that +/// reallocates memory (such as realloc). +bool llvm::isReallocLikeFn(const Value *V, bool LookThroughBitCast) { + return getAllocationData(V, ReallocLike, LookThroughBitCast); +} + +/// extractMallocCall - Returns the corresponding CallInst if the instruction +/// is a malloc call. Since CallInst::CreateMalloc() only creates calls, we +/// ignore InvokeInst here. +const CallInst *llvm::extractMallocCall(const Value *I) { + return isMallocLikeFn(I) ? dyn_cast<CallInst>(I) : 0; } static Value *computeArraySize(const CallInst *CI, const TargetData *TD, @@ -134,7 +222,7 @@ const CallInst *llvm::isArrayMalloc(const Value *I, const TargetData *TD) { /// 1: PointerType is the bitcast's result type. /// >1: Unique PointerType cannot be determined, return NULL. PointerType *llvm::getMallocType(const CallInst *CI) { - assert(isMalloc(CI) && "getMallocType and not malloc call"); + assert(isMallocLikeFn(CI) && "getMallocType and not malloc call"); PointerType *MallocType = NULL; unsigned NumOfBitCastUses = 0; @@ -176,13 +264,17 @@ Type *llvm::getMallocAllocatedType(const CallInst *CI) { /// determined. 
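// Illustrative sketch, not part of the patch: classifying a value with
// the new predicates defined above, via a hypothetical helper:
static const char *classifyAllocCall(const Value *V) {
  if (isMallocLikeFn(V))  return "malloc-like: uninitialized memory";
  if (isCallocLikeFn(V))  return "calloc-like: zero-filled memory";
  if (isReallocLikeFn(V)) return "realloc-like: resizes an allocation";
  if (isAllocationFn(V))  return "other allocator (e.g. strdup-like)";
  return "not a recognized allocation call";
}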
Value *llvm::getMallocArraySize(CallInst *CI, const TargetData *TD, bool LookThroughSExt) { - assert(isMalloc(CI) && "getMallocArraySize and not malloc call"); + assert(isMallocLikeFn(CI) && "getMallocArraySize and not malloc call"); return computeArraySize(CI, TD, LookThroughSExt); } -//===----------------------------------------------------------------------===// -// free Call Utility Functions. -// + +/// extractCallocCall - Returns the corresponding CallInst if the instruction +/// is a calloc call. +const CallInst *llvm::extractCallocCall(const Value *I) { + return isCallocLikeFn(I) ? cast<CallInst>(I) : 0; +} + /// isFreeCall - Returns non-null if the value is a call to the builtin free() const CallInst *llvm::isFreeCall(const Value *I) { @@ -211,3 +303,417 @@ const CallInst *llvm::isFreeCall(const Value *I) { return CI; } + + + +//===----------------------------------------------------------------------===// +// Utility functions to compute size of objects. +// + + +/// \brief Compute the size of the object pointed by Ptr. Returns true and the +/// object size in Size if successful, and false otherwise. +/// If RoundToAlign is true, then Size is rounded up to the aligment of allocas, +/// byval arguments, and global variables. +bool llvm::getObjectSize(const Value *Ptr, uint64_t &Size, const TargetData *TD, + bool RoundToAlign) { + if (!TD) + return false; + + ObjectSizeOffsetVisitor Visitor(TD, Ptr->getContext(), RoundToAlign); + SizeOffsetType Data = Visitor.compute(const_cast<Value*>(Ptr)); + if (!Visitor.bothKnown(Data)) + return false; + + APInt ObjSize = Data.first, Offset = Data.second; + // check for overflow + if (Offset.slt(0) || ObjSize.ult(Offset)) + Size = 0; + else + Size = (ObjSize - Offset).getZExtValue(); + return true; +} + + +STATISTIC(ObjectVisitorArgument, + "Number of arguments with unsolved size and offset"); +STATISTIC(ObjectVisitorLoad, + "Number of load instructions with unsolved size and offset"); + + +APInt ObjectSizeOffsetVisitor::align(APInt Size, uint64_t Align) { + if (RoundToAlign && Align) + return APInt(IntTyBits, RoundUpToAlignment(Size.getZExtValue(), Align)); + return Size; +} + +ObjectSizeOffsetVisitor::ObjectSizeOffsetVisitor(const TargetData *TD, + LLVMContext &Context, + bool RoundToAlign) +: TD(TD), RoundToAlign(RoundToAlign) { + IntegerType *IntTy = TD->getIntPtrType(Context); + IntTyBits = IntTy->getBitWidth(); + Zero = APInt::getNullValue(IntTyBits); +} + +SizeOffsetType ObjectSizeOffsetVisitor::compute(Value *V) { + V = V->stripPointerCasts(); + + if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) + return visitGEPOperator(*GEP); + if (Instruction *I = dyn_cast<Instruction>(V)) + return visit(*I); + if (Argument *A = dyn_cast<Argument>(V)) + return visitArgument(*A); + if (ConstantPointerNull *P = dyn_cast<ConstantPointerNull>(V)) + return visitConstantPointerNull(*P); + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) + return visitGlobalVariable(*GV); + if (UndefValue *UV = dyn_cast<UndefValue>(V)) + return visitUndefValue(*UV); + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) + if (CE->getOpcode() == Instruction::IntToPtr) + return unknown(); // clueless + + DEBUG(dbgs() << "ObjectSizeOffsetVisitor::compute() unhandled value: " << *V + << '\n'); + return unknown(); +} + +SizeOffsetType ObjectSizeOffsetVisitor::visitAllocaInst(AllocaInst &I) { + if (!I.getAllocatedType()->isSized()) + return unknown(); + + APInt Size(IntTyBits, TD->getTypeAllocSize(I.getAllocatedType())); + if (!I.isArrayAllocation()) + return 
std::make_pair(align(Size, I.getAlignment()), Zero); + + Value *ArraySize = I.getArraySize(); + if (const ConstantInt *C = dyn_cast<ConstantInt>(ArraySize)) { + Size *= C->getValue().zextOrSelf(IntTyBits); + return std::make_pair(align(Size, I.getAlignment()), Zero); + } + return unknown(); +} + +SizeOffsetType ObjectSizeOffsetVisitor::visitArgument(Argument &A) { + // no interprocedural analysis is done at the moment + if (!A.hasByValAttr()) { + ++ObjectVisitorArgument; + return unknown(); + } + PointerType *PT = cast<PointerType>(A.getType()); + APInt Size(IntTyBits, TD->getTypeAllocSize(PT->getElementType())); + return std::make_pair(align(Size, A.getParamAlignment()), Zero); +} + +SizeOffsetType ObjectSizeOffsetVisitor::visitCallSite(CallSite CS) { + const AllocFnsTy *FnData = getAllocationData(CS.getInstruction(), AnyAlloc); + if (!FnData) + return unknown(); + + // handle strdup-like functions separately + if (FnData->AllocTy == StrDupLike) { + // TODO + return unknown(); + } + + ConstantInt *Arg = dyn_cast<ConstantInt>(CS.getArgument(FnData->FstParam)); + if (!Arg) + return unknown(); + + APInt Size = Arg->getValue().zextOrSelf(IntTyBits); + // size determined by just 1 parameter + if (FnData->SndParam < 0) + return std::make_pair(Size, Zero); + + Arg = dyn_cast<ConstantInt>(CS.getArgument(FnData->SndParam)); + if (!Arg) + return unknown(); + + Size *= Arg->getValue().zextOrSelf(IntTyBits); + return std::make_pair(Size, Zero); + + // TODO: handle more standard functions (+ wchar cousins): + // - strdup / strndup + // - strcpy / strncpy + // - strcat / strncat + // - memcpy / memmove + // - strcat / strncat + // - memset +} + +SizeOffsetType +ObjectSizeOffsetVisitor::visitConstantPointerNull(ConstantPointerNull&) { + return std::make_pair(Zero, Zero); +} + +SizeOffsetType +ObjectSizeOffsetVisitor::visitExtractElementInst(ExtractElementInst&) { + return unknown(); +} + +SizeOffsetType +ObjectSizeOffsetVisitor::visitExtractValueInst(ExtractValueInst&) { + // Easy cases were already folded by previous passes. + return unknown(); +} + +SizeOffsetType ObjectSizeOffsetVisitor::visitGEPOperator(GEPOperator &GEP) { + SizeOffsetType PtrData = compute(GEP.getPointerOperand()); + if (!bothKnown(PtrData) || !GEP.hasAllConstantIndices()) + return unknown(); + + SmallVector<Value*, 8> Ops(GEP.idx_begin(), GEP.idx_end()); + APInt Offset(IntTyBits,TD->getIndexedOffset(GEP.getPointerOperandType(),Ops)); + return std::make_pair(PtrData.first, PtrData.second + Offset); +} + +SizeOffsetType ObjectSizeOffsetVisitor::visitGlobalVariable(GlobalVariable &GV){ + if (!GV.hasDefinitiveInitializer()) + return unknown(); + + APInt Size(IntTyBits, TD->getTypeAllocSize(GV.getType()->getElementType())); + return std::make_pair(align(Size, GV.getAlignment()), Zero); +} + +SizeOffsetType ObjectSizeOffsetVisitor::visitIntToPtrInst(IntToPtrInst&) { + // clueless + return unknown(); +} + +SizeOffsetType ObjectSizeOffsetVisitor::visitLoadInst(LoadInst&) { + ++ObjectVisitorLoad; + return unknown(); +} + +SizeOffsetType ObjectSizeOffsetVisitor::visitPHINode(PHINode&) { + // too complex to analyze statically. 
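// Illustrative sketch, not part of the patch: querying the visitor
// directly, as llvm::getObjectSize() does above. compute() yields a
// (size, offset) pair of APInts and bothKnown() rejects unknown halves.
// Ptr and TD are assumed to be in scope:
ObjectSizeOffsetVisitor Visitor(TD, Ptr->getContext(), /*RoundToAlign=*/false);
SizeOffsetType Data = Visitor.compute(const_cast<Value*>(Ptr));
if (Visitor.bothKnown(Data)) {
  APInt ObjSize = Data.first, Offset = Data.second;
  // e.g. a GEP 4 bytes into a 16-byte alloca yields (16, 4)
}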
+ return unknown(); +} + +SizeOffsetType ObjectSizeOffsetVisitor::visitSelectInst(SelectInst &I) { + SizeOffsetType TrueSide = compute(I.getTrueValue()); + SizeOffsetType FalseSide = compute(I.getFalseValue()); + if (bothKnown(TrueSide) && bothKnown(FalseSide) && TrueSide == FalseSide) + return TrueSide; + return unknown(); +} + +SizeOffsetType ObjectSizeOffsetVisitor::visitUndefValue(UndefValue&) { + return std::make_pair(Zero, Zero); +} + +SizeOffsetType ObjectSizeOffsetVisitor::visitInstruction(Instruction &I) { + DEBUG(dbgs() << "ObjectSizeOffsetVisitor unknown instruction:" << I << '\n'); + return unknown(); +} + + +ObjectSizeOffsetEvaluator::ObjectSizeOffsetEvaluator(const TargetData *TD, + LLVMContext &Context) +: TD(TD), Context(Context), Builder(Context, TargetFolder(TD)), +Visitor(TD, Context) { + IntTy = TD->getIntPtrType(Context); + Zero = ConstantInt::get(IntTy, 0); +} + +SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute(Value *V) { + SizeOffsetEvalType Result = compute_(V); + + if (!bothKnown(Result)) { + // erase everything that was computed in this iteration from the cache, so + // that no dangling references are left behind. We could be a bit smarter if + // we kept a dependency graph. It's probably not worth the complexity. + for (PtrSetTy::iterator I=SeenVals.begin(), E=SeenVals.end(); I != E; ++I) { + CacheMapTy::iterator CacheIt = CacheMap.find(*I); + // non-computable results can be safely cached + if (CacheIt != CacheMap.end() && anyKnown(CacheIt->second)) + CacheMap.erase(CacheIt); + } + } + + SeenVals.clear(); + return Result; +} + +SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute_(Value *V) { + SizeOffsetType Const = Visitor.compute(V); + if (Visitor.bothKnown(Const)) + return std::make_pair(ConstantInt::get(Context, Const.first), + ConstantInt::get(Context, Const.second)); + + V = V->stripPointerCasts(); + + // check cache + CacheMapTy::iterator CacheIt = CacheMap.find(V); + if (CacheIt != CacheMap.end()) + return CacheIt->second; + + // always generate code immediately before the instruction being + // processed, so that the generated code dominates the same BBs + Instruction *PrevInsertPoint = Builder.GetInsertPoint(); + if (Instruction *I = dyn_cast<Instruction>(V)) + Builder.SetInsertPoint(I); + + // record the pointers that were handled in this run, so that they can be + // cleaned later if something fails + SeenVals.insert(V); + + // now compute the size and offset + SizeOffsetEvalType Result; + if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) { + Result = visitGEPOperator(*GEP); + } else if (Instruction *I = dyn_cast<Instruction>(V)) { + Result = visit(*I); + } else if (isa<Argument>(V) || + (isa<ConstantExpr>(V) && + cast<ConstantExpr>(V)->getOpcode() == Instruction::IntToPtr) || + isa<GlobalVariable>(V)) { + // ignore values where we cannot do more than what ObjectSizeVisitor can + Result = unknown(); + } else { + DEBUG(dbgs() << "ObjectSizeOffsetEvaluator::compute() unhandled value: " + << *V << '\n'); + Result = unknown(); + } + + if (PrevInsertPoint) + Builder.SetInsertPoint(PrevInsertPoint); + + // Don't reuse CacheIt since it may be invalid at this point. 
+ CacheMap[V] = Result; + return Result; +} + +SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitAllocaInst(AllocaInst &I) { + if (!I.getAllocatedType()->isSized()) + return unknown(); + + // must be a VLA + assert(I.isArrayAllocation()); + Value *ArraySize = I.getArraySize(); + Value *Size = ConstantInt::get(ArraySize->getType(), + TD->getTypeAllocSize(I.getAllocatedType())); + Size = Builder.CreateMul(Size, ArraySize); + return std::make_pair(Size, Zero); +} + +SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitCallSite(CallSite CS) { + const AllocFnsTy *FnData = getAllocationData(CS.getInstruction(), AnyAlloc); + if (!FnData) + return unknown(); + + // handle strdup-like functions separately + if (FnData->AllocTy == StrDupLike) { + // TODO + return unknown(); + } + + Value *FirstArg = CS.getArgument(FnData->FstParam); + FirstArg = Builder.CreateZExt(FirstArg, IntTy); + if (FnData->SndParam < 0) + return std::make_pair(FirstArg, Zero); + + Value *SecondArg = CS.getArgument(FnData->SndParam); + SecondArg = Builder.CreateZExt(SecondArg, IntTy); + Value *Size = Builder.CreateMul(FirstArg, SecondArg); + return std::make_pair(Size, Zero); + + // TODO: handle more standard functions (+ wchar cousins): + // - strdup / strndup + // - strcpy / strncpy + // - strcat / strncat + // - memcpy / memmove + // - strcat / strncat + // - memset +} + +SizeOffsetEvalType +ObjectSizeOffsetEvaluator::visitExtractElementInst(ExtractElementInst&) { + return unknown(); +} + +SizeOffsetEvalType +ObjectSizeOffsetEvaluator::visitExtractValueInst(ExtractValueInst&) { + return unknown(); +} + +SizeOffsetEvalType +ObjectSizeOffsetEvaluator::visitGEPOperator(GEPOperator &GEP) { + SizeOffsetEvalType PtrData = compute_(GEP.getPointerOperand()); + if (!bothKnown(PtrData)) + return unknown(); + + Value *Offset = EmitGEPOffset(&Builder, *TD, &GEP, /*NoAssumptions=*/true); + Offset = Builder.CreateAdd(PtrData.second, Offset); + return std::make_pair(PtrData.first, Offset); +} + +SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitIntToPtrInst(IntToPtrInst&) { + // clueless + return unknown(); +} + +SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitLoadInst(LoadInst&) { + return unknown(); +} + +SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitPHINode(PHINode &PHI) { + // create 2 PHIs: one for size and another for offset + PHINode *SizePHI = Builder.CreatePHI(IntTy, PHI.getNumIncomingValues()); + PHINode *OffsetPHI = Builder.CreatePHI(IntTy, PHI.getNumIncomingValues()); + + // insert right away in the cache to handle recursive PHIs + CacheMap[&PHI] = std::make_pair(SizePHI, OffsetPHI); + + // compute offset/size for each PHI incoming pointer + for (unsigned i = 0, e = PHI.getNumIncomingValues(); i != e; ++i) { + Builder.SetInsertPoint(PHI.getIncomingBlock(i)->getFirstInsertionPt()); + SizeOffsetEvalType EdgeData = compute_(PHI.getIncomingValue(i)); + + if (!bothKnown(EdgeData)) { + OffsetPHI->replaceAllUsesWith(UndefValue::get(IntTy)); + OffsetPHI->eraseFromParent(); + SizePHI->replaceAllUsesWith(UndefValue::get(IntTy)); + SizePHI->eraseFromParent(); + return unknown(); + } + SizePHI->addIncoming(EdgeData.first, PHI.getIncomingBlock(i)); + OffsetPHI->addIncoming(EdgeData.second, PHI.getIncomingBlock(i)); + } + + Value *Size = SizePHI, *Offset = OffsetPHI, *Tmp; + if ((Tmp = SizePHI->hasConstantValue())) { + Size = Tmp; + SizePHI->replaceAllUsesWith(Size); + SizePHI->eraseFromParent(); + } + if ((Tmp = OffsetPHI->hasConstantValue())) { + Offset = Tmp; + OffsetPHI->replaceAllUsesWith(Offset); + OffsetPHI->eraseFromParent(); 
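// Illustrative sketch, not part of the patch: the evaluator emits IR that
// computes an object's size and offset at run time when they are not
// compile-time constants (a VLA alloca, for instance, becomes a mul of
// the element size by the array length). Assumes Ptr, TD, and Context:
ObjectSizeOffsetEvaluator Eval(TD, Context);
SizeOffsetEvalType SizeOffset = Eval.compute(Ptr);
if (Eval.bothKnown(SizeOffset)) {
  Value *SizeV = SizeOffset.first;   // may be a freshly inserted instruction
  Value *OffsetV = SizeOffset.second;
}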
+ } + return std::make_pair(Size, Offset); +} + +SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitSelectInst(SelectInst &I) { + SizeOffsetEvalType TrueSide = compute_(I.getTrueValue()); + SizeOffsetEvalType FalseSide = compute_(I.getFalseValue()); + + if (!bothKnown(TrueSide) || !bothKnown(FalseSide)) + return unknown(); + if (TrueSide == FalseSide) + return TrueSide; + + Value *Size = Builder.CreateSelect(I.getCondition(), TrueSide.first, + FalseSide.first); + Value *Offset = Builder.CreateSelect(I.getCondition(), TrueSide.second, + FalseSide.second); + return std::make_pair(Size, Offset); +} + +SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitInstruction(Instruction &I) { + DEBUG(dbgs() << "ObjectSizeOffsetEvaluator unknown instruction:" << I <<'\n'); + return unknown(); +} diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp index 3a544f3..7fb154d 100644 --- a/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -16,13 +16,11 @@ #define DEBUG_TYPE "memdep" #include "llvm/Analysis/MemoryDependenceAnalysis.h" -#include "llvm/Analysis/ValueTracking.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/Function.h" #include "llvm/LLVMContext.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemoryBuiltins.h" @@ -339,86 +337,6 @@ getLoadLoadClobberFullWidthSize(const Value *MemLocBase, int64_t MemLocOffs, } } -namespace { - /// Only find pointer captures which happen before the given instruction. Uses - /// the dominator tree to determine whether one instruction is before another. - struct CapturesBefore : public CaptureTracker { - CapturesBefore(const Instruction *I, DominatorTree *DT) - : BeforeHere(I), DT(DT), Captured(false) {} - - void tooManyUses() { Captured = true; } - - bool shouldExplore(Use *U) { - Instruction *I = cast<Instruction>(U->getUser()); - BasicBlock *BB = I->getParent(); - if (BeforeHere != I && - (!DT->isReachableFromEntry(BB) || DT->dominates(BeforeHere, I))) - return false; - return true; - } - - bool captured(Use *U) { - Instruction *I = cast<Instruction>(U->getUser()); - BasicBlock *BB = I->getParent(); - if (BeforeHere != I && - (!DT->isReachableFromEntry(BB) || DT->dominates(BeforeHere, I))) - return false; - Captured = true; - return true; - } - - const Instruction *BeforeHere; - DominatorTree *DT; - - bool Captured; - }; -} - -AliasAnalysis::ModRefResult -MemoryDependenceAnalysis::getModRefInfo(const Instruction *Inst, - const AliasAnalysis::Location &MemLoc) { - AliasAnalysis::ModRefResult MR = AA->getModRefInfo(Inst, MemLoc); - if (MR != AliasAnalysis::ModRef) return MR; - - // FIXME: this is really just shoring-up a deficiency in alias analysis. - // BasicAA isn't willing to spend linear time determining whether an alloca - // was captured before or after this particular call, while we are. However, - // with a smarter AA in place, this test is just wasting compile time. 
- if (!DT) return AliasAnalysis::ModRef; - const Value *Object = GetUnderlyingObject(MemLoc.Ptr, TD); - if (!isIdentifiedObject(Object) || isa<GlobalValue>(Object)) - return AliasAnalysis::ModRef; - ImmutableCallSite CS(Inst); - if (!CS.getInstruction()) return AliasAnalysis::ModRef; - - CapturesBefore CB(Inst, DT); - llvm::PointerMayBeCaptured(Object, &CB); - - if (isa<Constant>(Object) || CS.getInstruction() == Object || CB.Captured) - return AliasAnalysis::ModRef; - - unsigned ArgNo = 0; - for (ImmutableCallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end(); - CI != CE; ++CI, ++ArgNo) { - // Only look at the no-capture or byval pointer arguments. If this - // pointer were passed to arguments that were neither of these, then it - // couldn't be no-capture. - if (!(*CI)->getType()->isPointerTy() || - (!CS.doesNotCapture(ArgNo) && !CS.isByValArgument(ArgNo))) - continue; - - // If this is a no-capture pointer argument, see if we can tell that it - // is impossible to alias the pointer we're checking. If not, we have to - // assume that the call could touch the pointer, even though it doesn't - // escape. - if (!AA->isNoAlias(AliasAnalysis::Location(*CI), - AliasAnalysis::Location(Object))) { - return AliasAnalysis::ModRef; - } - } - return AliasAnalysis::NoModRef; -} - /// getPointerDependencyFrom - Return the instruction on which a memory /// location depends. If isLoad is true, this routine ignores may-aliases with /// read-only operations. If isLoad is false, this routine ignores may-aliases @@ -556,8 +474,7 @@ getPointerDependencyFrom(const AliasAnalysis::Location &MemLoc, bool isLoad, // a subsequent bitcast of the malloc call result. There can be stores to // the malloced memory between the malloc call and its bitcast uses, and we // need to continue scanning until the malloc call. - if (isa<AllocaInst>(Inst) || - (isa<CallInst>(Inst) && extractMallocCall(Inst))) { + if (isa<AllocaInst>(Inst) || isNoAliasFn(Inst)) { const Value *AccessPtr = GetUnderlyingObject(MemLoc.Ptr, TD); if (AccessPtr == Inst || AA->isMustAlias(Inst, AccessPtr)) @@ -566,7 +483,11 @@ getPointerDependencyFrom(const AliasAnalysis::Location &MemLoc, bool isLoad, } // See if this instruction (e.g. a call or vaarg) mod/ref's the pointer. - switch (getModRefInfo(Inst, MemLoc)) { + AliasAnalysis::ModRefResult MR = AA->getModRefInfo(Inst, MemLoc); + // If necessary, perform additional analysis. + if (MR == AliasAnalysis::ModRef) + MR = AA->callCapturesBefore(Inst, MemLoc, DT); + switch (MR) { case AliasAnalysis::NoModRef: // If the call has no effect on the queried pointer, just ignore it. continue; @@ -984,7 +905,7 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer, if (!Pair.second) { if (CacheInfo->Size < Loc.Size) { // The query's Size is greater than the cached one. Throw out the - // cached data and procede with the query at the greater size. + // cached data and proceed with the query at the greater size. 
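The callCapturesBefore query that replaces this code (and its layered use in getPointerDependencyFrom above, where it runs only after getModRefInfo has already answered ModRef) rests on a simple fact about escapes, illustrated in plain C++; opaque() and use() are assumed external functions:

```cpp
// Only use() ever learns &x, so calls before that point cannot touch x.
extern void opaque();
extern void use(int *p);

int example() {
  int x = 1;
  opaque();  // &x has not escaped yet: provably NoModRef on x
  x = 2;
  use(&x);   // first capture of &x
  opaque();  // from here on, calls must conservatively be ModRef on x
  return x;
}
```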
CacheInfo->Pair = BBSkipFirstBlockPair(); CacheInfo->Size = Loc.Size; for (NonLocalDepInfo::iterator DI = CacheInfo->NonLocalDeps.begin(), diff --git a/lib/Analysis/ModuleDebugInfoPrinter.cpp b/lib/Analysis/ModuleDebugInfoPrinter.cpp index e7e999c..f8c7514 100644 --- a/lib/Analysis/ModuleDebugInfoPrinter.cpp +++ b/lib/Analysis/ModuleDebugInfoPrinter.cpp @@ -16,10 +16,10 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/Passes.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/Assembly/Writer.h" -#include "llvm/Pass.h" +#include "llvm/DebugInfo.h" #include "llvm/Function.h" +#include "llvm/Pass.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/ADT/Statistic.h" diff --git a/lib/Analysis/PathNumbering.cpp b/lib/Analysis/PathNumbering.cpp index 80c5222..d4ad726 100644 --- a/lib/Analysis/PathNumbering.cpp +++ b/lib/Analysis/PathNumbering.cpp @@ -31,11 +31,11 @@ #include "llvm/Instructions.h" #include "llvm/Module.h" #include "llvm/Pass.h" +#include "llvm/TypeBuilder.h" #include "llvm/Support/CFG.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/TypeBuilder.h" #include "llvm/Support/raw_ostream.h" #include <queue> diff --git a/lib/Analysis/ProfileInfoLoader.cpp b/lib/Analysis/ProfileInfoLoader.cpp index eaa38da..5c7c97c 100644 --- a/lib/Analysis/ProfileInfoLoader.cpp +++ b/lib/Analysis/ProfileInfoLoader.cpp @@ -83,10 +83,8 @@ const unsigned ProfileInfoLoader::Uncounted = ~0U; // program if the file is invalid or broken. // ProfileInfoLoader::ProfileInfoLoader(const char *ToolName, - const std::string &Filename, - Module &TheModule) : - Filename(Filename), - M(TheModule), Warned(false) { + const std::string &Filename) + : Filename(Filename) { FILE *F = fopen(Filename.c_str(), "rb"); if (F == 0) { errs() << ToolName << ": Error opening '" << Filename << "': "; diff --git a/lib/Analysis/ProfileInfoLoaderPass.cpp b/lib/Analysis/ProfileInfoLoaderPass.cpp index c4da807..5ecf052 100644 --- a/lib/Analysis/ProfileInfoLoaderPass.cpp +++ b/lib/Analysis/ProfileInfoLoaderPass.cpp @@ -152,7 +152,7 @@ void LoaderPass::readEdge(ProfileInfo::Edge e, } bool LoaderPass::runOnModule(Module &M) { - ProfileInfoLoader PIL("profile-loader", Filename, M); + ProfileInfoLoader PIL("profile-loader", Filename); EdgeInformation.clear(); std::vector<unsigned> Counters = PIL.getRawEdgeCounts(); diff --git a/lib/Analysis/RegionInfo.cpp b/lib/Analysis/RegionInfo.cpp index b507b1e..5f4458b 100644 --- a/lib/Analysis/RegionInfo.cpp +++ b/lib/Analysis/RegionInfo.cpp @@ -47,7 +47,7 @@ static cl::opt<enum Region::PrintStyle> printStyle("print-region-style", cl::values( clEnumValN(Region::PrintNone, "none", "print no details"), clEnumValN(Region::PrintBB, "bb", - "print regions in detail with block_iterator"), + "print regions in detail with block_node_iterator"), clEnumValN(Region::PrintRN, "rn", "print regions in detail with element_iterator"), clEnumValEnd)); @@ -246,22 +246,38 @@ void Region::verifyRegionNest() const { verifyRegion(); } -Region::block_iterator Region::block_begin() { +Region::block_node_iterator Region::block_node_begin() { return GraphTraits<FlatIt<Region*> >::nodes_begin(this); } -Region::block_iterator Region::block_end() { +Region::block_node_iterator Region::block_node_end() { return GraphTraits<FlatIt<Region*> >::nodes_end(this); } -Region::const_block_iterator Region::block_begin() const { 
+Region::const_block_node_iterator Region::block_node_begin() const { return GraphTraits<FlatIt<const Region*> >::nodes_begin(this); } -Region::const_block_iterator Region::block_end() const { +Region::const_block_node_iterator Region::block_node_end() const { return GraphTraits<FlatIt<const Region*> >::nodes_end(this); } +Region::block_iterator Region::block_begin() { + return block_node_begin(); +} + +Region::block_iterator Region::block_end() { + return block_node_end(); +} + +Region::const_block_iterator Region::block_begin() const { + return block_node_begin(); +} + +Region::const_block_iterator Region::block_end() const { + return block_node_end(); +} + Region::element_iterator Region::element_begin() { return GraphTraits<Region*>::nodes_begin(this); } @@ -425,7 +441,9 @@ void Region::print(raw_ostream &OS, bool print_tree, unsigned level, OS.indent(level*2 + 2); if (Style == PrintBB) { - for (const_block_iterator I = block_begin(), E = block_end(); I!=E; ++I) + for (const_block_node_iterator I = block_node_begin(), + E = block_node_end(); + I != E; ++I) OS << **I << ", "; // TODO: remove the last "," } else if (Style == PrintRN) { for (const_element_iterator I = element_begin(), E = element_end(); I!=E; ++I) diff --git a/lib/Analysis/RegionPass.cpp b/lib/Analysis/RegionPass.cpp index 3a3529b..c97b5eb 100644 --- a/lib/Analysis/RegionPass.cpp +++ b/lib/Analysis/RegionPass.cpp @@ -195,7 +195,8 @@ public: virtual bool runOnRegion(Region *R, RGPassManager &RGM) { Out << Banner; - for (Region::block_iterator I = R->block_begin(), E = R->block_end(); + for (Region::block_node_iterator I = R->block_node_begin(), + E = R->block_node_end(); I != E; ++I) (*I)->getEntry()->print(Out); diff --git a/lib/Analysis/RegionPrinter.cpp b/lib/Analysis/RegionPrinter.cpp index a1730b0..8b23cc7 100644 --- a/lib/Analysis/RegionPrinter.cpp +++ b/lib/Analysis/RegionPrinter.cpp @@ -122,13 +122,11 @@ struct DOTGraphTraits<RegionInfo*> : public DOTGraphTraits<RegionNode*> { RegionInfo *RI = R->getRegionInfo(); for (Region::const_block_iterator BI = R->block_begin(), - BE = R->block_end(); BI != BE; ++BI) { - BasicBlock *BB = (*BI)->getNodeAs<BasicBlock>(); - if (RI->getRegionFor(BB) == R) + BE = R->block_end(); BI != BE; ++BI) + if (RI->getRegionFor(*BI) == R) O.indent(2 * (depth + 1)) << "Node" - << static_cast<const void*>(RI->getTopLevelRegion()->getBBNode(BB)) + << static_cast<const void*>(RI->getTopLevelRegion()->getBBNode(*BI)) << ";\n"; - } O.indent(2 * depth) << "}\n"; } diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 205227c..f0f3b1c 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -826,8 +826,7 @@ const SCEV *ScalarEvolution::getTruncateExpr(const SCEV *Op, // Fold if the operand is constant. if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op)) return getConstant( - cast<ConstantInt>(ConstantExpr::getTrunc(SC->getValue(), - getEffectiveSCEVType(Ty)))); + cast<ConstantInt>(ConstantExpr::getTrunc(SC->getValue(), Ty))); // trunc(trunc(x)) --> trunc(x) if (const SCEVTruncateExpr *ST = dyn_cast<SCEVTruncateExpr>(Op)) @@ -879,13 +878,6 @@ const SCEV *ScalarEvolution::getTruncateExpr(const SCEV *Op, return getAddRecExpr(Operands, AddRec->getLoop(), SCEV::FlagAnyWrap); } - // As a special case, fold trunc(undef) to undef. We don't want to - // know too much about SCEVUnknowns, but this special case is handy - // and harmless. 
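With the RegionInfo split above, block_node_iterator keeps the old RegionNode-based traversal while block_iterator now yields BasicBlock* directly, which is what lets the RegionPrinter hunk below drop its getNodeAs<BasicBlock>() unwrapping. A minimal sketch against the 3.1-era header (countBlocksIn is a made-up helper):

```cpp
#include "llvm/Analysis/RegionInfo.h"

// Count the basic blocks in a region using the new block_iterator.
unsigned countBlocksIn(llvm::Region *R) {
  unsigned N = 0;
  for (llvm::Region::block_iterator BI = R->block_begin(),
                                    BE = R->block_end();
       BI != BE; ++BI) {
    llvm::BasicBlock *BB = *BI; // yields BasicBlock* directly now
    (void)BB;
    ++N;
  }
  return N;
}
```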
- if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Op)) - if (isa<UndefValue>(U->getValue())) - return getSCEV(UndefValue::get(Ty)); - // The cast wasn't folded; create an explicit cast node. We can reuse // the existing insert position since if we get here, we won't have // made any changes which would invalidate it. @@ -906,8 +898,7 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op, // Fold if the operand is constant. if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op)) return getConstant( - cast<ConstantInt>(ConstantExpr::getZExt(SC->getValue(), - getEffectiveSCEVType(Ty)))); + cast<ConstantInt>(ConstantExpr::getZExt(SC->getValue(), Ty))); // zext(zext(x)) --> zext(x) if (const SCEVZeroExtendExpr *SZ = dyn_cast<SCEVZeroExtendExpr>(Op)) @@ -976,12 +967,15 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *WideTy = IntegerType::get(getContext(), BitWidth * 2); // Check whether Start+Step*MaxBECount has no unsigned overflow. const SCEV *ZMul = getMulExpr(CastedMaxBECount, Step); - const SCEV *Add = getAddExpr(Start, ZMul); + const SCEV *ZAdd = getZeroExtendExpr(getAddExpr(Start, ZMul), WideTy); + const SCEV *WideStart = getZeroExtendExpr(Start, WideTy); + const SCEV *WideMaxBECount = + getZeroExtendExpr(CastedMaxBECount, WideTy); const SCEV *OperandExtendedAdd = - getAddExpr(getZeroExtendExpr(Start, WideTy), - getMulExpr(getZeroExtendExpr(CastedMaxBECount, WideTy), + getAddExpr(WideStart, + getMulExpr(WideMaxBECount, getZeroExtendExpr(Step, WideTy))); - if (getZeroExtendExpr(Add, WideTy) == OperandExtendedAdd) { + if (ZAdd == OperandExtendedAdd) { // Cache knowledge of AR NUW, which is propagated to this AddRec. const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNUW); // Return the expression with the addrec on the outside. @@ -991,13 +985,11 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op, } // Similar to above, only this time treat the step value as signed. // This covers loops that count down. - const SCEV *SMul = getMulExpr(CastedMaxBECount, Step); - Add = getAddExpr(Start, SMul); OperandExtendedAdd = - getAddExpr(getZeroExtendExpr(Start, WideTy), - getMulExpr(getZeroExtendExpr(CastedMaxBECount, WideTy), + getAddExpr(WideStart, + getMulExpr(WideMaxBECount, getSignExtendExpr(Step, WideTy))); - if (getZeroExtendExpr(Add, WideTy) == OperandExtendedAdd) { + if (ZAdd == OperandExtendedAdd) { // Cache knowledge of AR NW, which is propagated to this AddRec. // Negative step causes unsigned wrap, but it still can't self-wrap. const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNW); @@ -1164,8 +1156,7 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op, // Fold if the operand is constant. if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op)) return getConstant( - cast<ConstantInt>(ConstantExpr::getSExt(SC->getValue(), - getEffectiveSCEVType(Ty)))); + cast<ConstantInt>(ConstantExpr::getSExt(SC->getValue(), Ty))); // sext(sext(x)) --> sext(x) if (const SCEVSignExtendExpr *SS = dyn_cast<SCEVSignExtendExpr>(Op)) @@ -1242,12 +1233,15 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op, Type *WideTy = IntegerType::get(getContext(), BitWidth * 2); // Check whether Start+Step*MaxBECount has no signed overflow. 
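The zero-extend rewrite above (and the sign-extend twin that follows) proves no-unsigned-wrap by recomputing Start + Step*MaxBECount in a type twice as wide and comparing it against the narrow result extended afterwards; the two agree exactly when nothing wrapped. The same test in plain integer arithmetic, as a minimal sketch with uint8_t standing in for the AddRec's type:

```cpp
#include <cstdint>

// uint8_t plays the AddRec's type, uint16_t the double-width WideTy. The
// real code trims MaxBECount first so the wide product itself cannot wrap.
bool wrapsU8(uint8_t Start, uint8_t Step, uint8_t N) {
  uint8_t Narrow = (uint8_t)(Start + Step * N);         // narrow math, may wrap
  uint16_t ZAdd = Narrow;                               // zext(Start + Step*N)
  uint16_t Wide = (uint16_t)Start + (uint16_t)Step * N; // zext first, then add
  return ZAdd != Wide; // equal iff the narrow computation did not wrap
}

int main() {
  // 10 + 5*2 = 20 fits; 200 + 30*2 = 260 wraps to 4 in 8 bits.
  return (!wrapsU8(10, 5, 2) && wrapsU8(200, 30, 2)) ? 0 : 1;
}
```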
const SCEV *SMul = getMulExpr(CastedMaxBECount, Step); - const SCEV *Add = getAddExpr(Start, SMul); + const SCEV *SAdd = getSignExtendExpr(getAddExpr(Start, SMul), WideTy); + const SCEV *WideStart = getSignExtendExpr(Start, WideTy); + const SCEV *WideMaxBECount = + getZeroExtendExpr(CastedMaxBECount, WideTy); const SCEV *OperandExtendedAdd = - getAddExpr(getSignExtendExpr(Start, WideTy), - getMulExpr(getZeroExtendExpr(CastedMaxBECount, WideTy), + getAddExpr(WideStart, + getMulExpr(WideMaxBECount, getSignExtendExpr(Step, WideTy))); - if (getSignExtendExpr(Add, WideTy) == OperandExtendedAdd) { + if (SAdd == OperandExtendedAdd) { // Cache knowledge of AR NSW, which is propagated to this AddRec. const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW); // Return the expression with the addrec on the outside. @@ -1257,13 +1251,11 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op, } // Similar to above, only this time treat the step value as unsigned. // This covers loops that count up with an unsigned step. - const SCEV *UMul = getMulExpr(CastedMaxBECount, Step); - Add = getAddExpr(Start, UMul); OperandExtendedAdd = - getAddExpr(getSignExtendExpr(Start, WideTy), - getMulExpr(getZeroExtendExpr(CastedMaxBECount, WideTy), + getAddExpr(WideStart, + getMulExpr(WideMaxBECount, getZeroExtendExpr(Step, WideTy))); - if (getSignExtendExpr(Add, WideTy) == OperandExtendedAdd) { + if (SAdd == OperandExtendedAdd) { // Cache knowledge of AR NSW, which is propagated to this AddRec. const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW); // Return the expression with the addrec on the outside. @@ -1345,13 +1337,6 @@ const SCEV *ScalarEvolution::getAnyExtendExpr(const SCEV *Op, return getAddRecExpr(Ops, AR->getLoop(), SCEV::FlagNW); } - // As a special case, fold anyext(undef) to undef. We don't want to - // know too much about SCEVUnknowns, but this special case is handy - // and harmless. - if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Op)) - if (isa<UndefValue>(U->getValue())) - return getSCEV(UndefValue::get(Ty)); - // If the expression is obviously signed, use the sext cast value. if (isa<SCEVSMaxExpr>(Op)) return SExt; @@ -1839,7 +1824,7 @@ static uint64_t umul_ov(uint64_t i, uint64_t j, bool &Overflow) { /// Compute the result of "n choose k", the binomial coefficient. If an /// intermediate computation overflows, Overflow will be set and the return will -/// be garbage. Overflow is not cleared on absense of overflow. +/// be garbage. Overflow is not cleared on absence of overflow. static uint64_t Choose(uint64_t n, uint64_t k, bool &Overflow) { // We use the multiplicative formula: // n(n-1)(n-2)...(n-(k-1)) / k(k-1)(k-2)...1 . @@ -2038,63 +2023,67 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops, for (unsigned OtherIdx = Idx+1; OtherIdx < Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]); ++OtherIdx) { - if (AddRecLoop == cast<SCEVAddRecExpr>(Ops[OtherIdx])->getLoop()) { - // {A1,+,A2,+,...,+,An}<L> * {B1,+,B2,+,...,+,Bn}<L> - // = {x=1 in [ sum y=x..2x [ sum z=max(y-x, y-n)..min(x,n) [ - // choose(x, 2x)*choose(2x-y, x-z)*A_{y-z}*B_z - // ]]],+,...up to x=2n}. - // Note that the arguments to choose() are always integers with values - // known at compile time, never SCEV objects. - // - // The implementation avoids pointless extra computations when the two - // addrec's are of different length (mathematically, it's equivalent to - // an infinite stream of zeros on the right). 
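Choose(), whose comment is touched up above, feeds the addrec-multiplication rule that follows. The multiplicative formula can be evaluated so that every intermediate value is an exact integer, with overflow merely latched, matching the documented "not cleared on absence of overflow" contract. A standalone sketch, not the LLVM implementation:

```cpp
#include <cstdint>

// The running value after step i equals C(n-k+i, i), which is always an
// exact integer, so the division below never truncates.
static uint64_t choose(uint64_t n, uint64_t k, bool &Overflow) {
  if (k > n) return 0;
  if (k > n - k) k = n - k; // C(n, k) == C(n, n - k)
  uint64_t r = 1;
  for (uint64_t i = 1; i <= k; ++i) {
    uint64_t f = n - k + i;
    uint64_t t = r * f;
    if (r != 0 && t / r != f)
      Overflow = true; // latched, never cleared, as documented above
    r = t / i;
  }
  return r;
}
```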
- bool OpsModified = false; - for (; OtherIdx != Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]); - ++OtherIdx) - if (const SCEVAddRecExpr *OtherAddRec = - dyn_cast<SCEVAddRecExpr>(Ops[OtherIdx])) - if (OtherAddRec->getLoop() == AddRecLoop) { - bool Overflow = false; - Type *Ty = AddRec->getType(); - bool LargerThan64Bits = getTypeSizeInBits(Ty) > 64; - SmallVector<const SCEV*, 7> AddRecOps; - for (int x = 0, xe = AddRec->getNumOperands() + - OtherAddRec->getNumOperands() - 1; - x != xe && !Overflow; ++x) { - const SCEV *Term = getConstant(Ty, 0); - for (int y = x, ye = 2*x+1; y != ye && !Overflow; ++y) { - uint64_t Coeff1 = Choose(x, 2*x - y, Overflow); - for (int z = std::max(y-x, y-(int)AddRec->getNumOperands()+1), - ze = std::min(x+1, (int)OtherAddRec->getNumOperands()); - z < ze && !Overflow; ++z) { - uint64_t Coeff2 = Choose(2*x - y, x-z, Overflow); - uint64_t Coeff; - if (LargerThan64Bits) - Coeff = umul_ov(Coeff1, Coeff2, Overflow); - else - Coeff = Coeff1*Coeff2; - const SCEV *CoeffTerm = getConstant(Ty, Coeff); - const SCEV *Term1 = AddRec->getOperand(y-z); - const SCEV *Term2 = OtherAddRec->getOperand(z); - Term = getAddExpr(Term, getMulExpr(CoeffTerm, Term1,Term2)); - } - } - AddRecOps.push_back(Term); - } - if (!Overflow) { - const SCEV *NewAddRec = getAddRecExpr(AddRecOps, - AddRec->getLoop(), - SCEV::FlagAnyWrap); - if (Ops.size() == 2) return NewAddRec; - Ops[Idx] = AddRec = cast<SCEVAddRecExpr>(NewAddRec); - Ops.erase(Ops.begin() + OtherIdx); --OtherIdx; - OpsModified = true; - } + if (AddRecLoop != cast<SCEVAddRecExpr>(Ops[OtherIdx])->getLoop()) + continue; + + // {A1,+,A2,+,...,+,An}<L> * {B1,+,B2,+,...,+,Bn}<L> + // = {x=1 in [ sum y=x..2x [ sum z=max(y-x, y-n)..min(x,n) [ + // choose(x, 2x)*choose(2x-y, x-z)*A_{y-z}*B_z + // ]]],+,...up to x=2n}. + // Note that the arguments to choose() are always integers with values + // known at compile time, never SCEV objects. + // + // The implementation avoids pointless extra computations when the two + // addrec's are of different length (mathematically, it's equivalent to + // an infinite stream of zeros on the right). 
+ bool OpsModified = false; + for (; OtherIdx != Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]); + ++OtherIdx) { + const SCEVAddRecExpr *OtherAddRec = + dyn_cast<SCEVAddRecExpr>(Ops[OtherIdx]); + if (!OtherAddRec || OtherAddRec->getLoop() != AddRecLoop) + continue; + + bool Overflow = false; + Type *Ty = AddRec->getType(); + bool LargerThan64Bits = getTypeSizeInBits(Ty) > 64; + SmallVector<const SCEV*, 7> AddRecOps; + for (int x = 0, xe = AddRec->getNumOperands() + + OtherAddRec->getNumOperands() - 1; x != xe && !Overflow; ++x) { + const SCEV *Term = getConstant(Ty, 0); + for (int y = x, ye = 2*x+1; y != ye && !Overflow; ++y) { + uint64_t Coeff1 = Choose(x, 2*x - y, Overflow); + for (int z = std::max(y-x, y-(int)AddRec->getNumOperands()+1), + ze = std::min(x+1, (int)OtherAddRec->getNumOperands()); + z < ze && !Overflow; ++z) { + uint64_t Coeff2 = Choose(2*x - y, x-z, Overflow); + uint64_t Coeff; + if (LargerThan64Bits) + Coeff = umul_ov(Coeff1, Coeff2, Overflow); + else + Coeff = Coeff1*Coeff2; + const SCEV *CoeffTerm = getConstant(Ty, Coeff); + const SCEV *Term1 = AddRec->getOperand(y-z); + const SCEV *Term2 = OtherAddRec->getOperand(z); + Term = getAddExpr(Term, getMulExpr(CoeffTerm, Term1,Term2)); } - if (OpsModified) - return getMulExpr(Ops); + } + AddRecOps.push_back(Term); + } + if (!Overflow) { + const SCEV *NewAddRec = getAddRecExpr(AddRecOps, AddRec->getLoop(), + SCEV::FlagAnyWrap); + if (Ops.size() == 2) return NewAddRec; + Ops[Idx] = NewAddRec; + Ops.erase(Ops.begin() + OtherIdx); --OtherIdx; + OpsModified = true; + AddRec = dyn_cast<SCEVAddRecExpr>(NewAddRec); + if (!AddRec) + break; + } } + if (OpsModified) + return getMulExpr(Ops); } // Otherwise couldn't fold anything into this recurrence. Move onto the @@ -2723,7 +2712,7 @@ const SCEV *ScalarEvolution::getCouldNotCompute() { const SCEV *ScalarEvolution::getSCEV(Value *V) { assert(isSCEVable(V->getType()) && "Value is not SCEVable!"); - ValueExprMapType::const_iterator I = ValueExprMap.find(V); + ValueExprMapType::const_iterator I = ValueExprMap.find_as(V); if (I != ValueExprMap.end()) return I->second; const SCEV *S = createSCEV(V); @@ -2960,7 +2949,7 @@ ScalarEvolution::ForgetSymbolicName(Instruction *PN, const SCEV *SymName) { if (!Visited.insert(I)) continue; ValueExprMapType::iterator It = - ValueExprMap.find(static_cast<Value *>(I)); + ValueExprMap.find_as(static_cast<Value *>(I)); if (It != ValueExprMap.end()) { const SCEV *Old = It->second; @@ -3017,7 +3006,7 @@ const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) { if (BEValueV && StartValueV) { // While we are analyzing this PHI node, handle its value symbolically. 
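The find_as calls introduced above (and in forgetLoop/forgetValue below) use DenseMap's alternate-key lookup: the map stays keyed by SCEVCallbackVH, but queries pass a raw Value* and never construct a value handle. The pattern, reduced to a self-contained example where OwnedKey is a hypothetical stand-in for the expensive key type:

```cpp
#include "llvm/ADT/DenseMap.h"
#include <cstring>

struct OwnedKey { const char *Str; };

namespace llvm {
template <> struct DenseMapInfo<OwnedKey> {
  static OwnedKey getEmptyKey() { OwnedKey K = { (const char *)-1 }; return K; }
  static OwnedKey getTombstoneKey() { OwnedKey K = { (const char *)-2 }; return K; }
  static bool isSentinel(const OwnedKey &K) {
    return K.Str == (const char *)-1 || K.Str == (const char *)-2;
  }
  // Hash/compare for the stored key type...
  static unsigned getHashValue(const OwnedKey &K) { return getHashValue(K.Str); }
  static bool isEqual(const OwnedKey &A, const OwnedKey &B) {
    if (isSentinel(A) || isSentinel(B)) return A.Str == B.Str;
    return std::strcmp(A.Str, B.Str) == 0;
  }
  // ...plus the overloads that make find_as(const char *) legal.
  static unsigned getHashValue(const char *S) {
    unsigned H = 5381;
    while (*S) H = 33 * H + (unsigned char)*S++;
    return H;
  }
  static bool isEqual(const char *A, const OwnedKey &B) {
    return !isSentinel(B) && std::strcmp(A, B.Str) == 0;
  }
};
}

int main() {
  llvm::DenseMap<OwnedKey, int> M;
  OwnedKey K = { "exit-count" };
  M[K] = 7;
  // No OwnedKey is materialized for the query, just as ValueExprMap.find_as(V)
  // avoids constructing a SCEVCallbackVH per lookup.
  return M.find_as("exit-count") != M.end() ? 0 : 1;
}
```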
const SCEV *SymbolicName = getUnknown(PN); - assert(ValueExprMap.find(PN) == ValueExprMap.end() && + assert(ValueExprMap.find_as(PN) == ValueExprMap.end() && "PHI node already processed?"); ValueExprMap.insert(std::make_pair(SCEVCallbackVH(PN, this), SymbolicName)); @@ -4081,7 +4070,7 @@ ScalarEvolution::getBackedgeTakenInfo(const Loop *L) { if (!Visited.insert(I)) continue; ValueExprMapType::iterator It = - ValueExprMap.find(static_cast<Value *>(I)); + ValueExprMap.find_as(static_cast<Value *>(I)); if (It != ValueExprMap.end()) { const SCEV *Old = It->second; @@ -4132,7 +4121,8 @@ void ScalarEvolution::forgetLoop(const Loop *L) { Instruction *I = Worklist.pop_back_val(); if (!Visited.insert(I)) continue; - ValueExprMapType::iterator It = ValueExprMap.find(static_cast<Value *>(I)); + ValueExprMapType::iterator It = + ValueExprMap.find_as(static_cast<Value *>(I)); if (It != ValueExprMap.end()) { forgetMemoizedResults(It->second); ValueExprMap.erase(It); @@ -4165,7 +4155,8 @@ void ScalarEvolution::forgetValue(Value *V) { I = Worklist.pop_back_val(); if (!Visited.insert(I)) continue; - ValueExprMapType::iterator It = ValueExprMap.find(static_cast<Value *>(I)); + ValueExprMapType::iterator It = + ValueExprMap.find_as(static_cast<Value *>(I)); if (It != ValueExprMap.end()) { forgetMemoizedResults(It->second); ValueExprMap.erase(It); @@ -5481,7 +5472,7 @@ ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L) { // to 0, it must be counting down to equal 0. Consequently, N = Start / -Step. // We have not yet seen any such cases. const SCEVConstant *StepC = dyn_cast<SCEVConstant>(Step); - if (StepC == 0) + if (StepC == 0 || StepC->getValue()->equalsInt(0)) return getCouldNotCompute(); // For positive steps (counting up until unsigned overflow): @@ -5602,9 +5593,14 @@ static bool HasSameValue(const SCEV *A, const SCEV *B) { /// predicate Pred. Return true iff any changes were made. /// bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred, - const SCEV *&LHS, const SCEV *&RHS) { + const SCEV *&LHS, const SCEV *&RHS, + unsigned Depth) { bool Changed = false; + // If we hit the max recursion limit bail out. + if (Depth >= 3) + return false; + // Canonicalize a constant to the right side. if (const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(LHS)) { // Check for both operands constant. @@ -5642,6 +5638,16 @@ bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred, default: llvm_unreachable("Unexpected ICmpInst::Predicate value!"); case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_NE: + // Fold ((-1) * %a) + %b == 0 (equivalent to %b-%a == 0) into %a == %b. + if (!RA) + if (const SCEVAddExpr *AE = dyn_cast<SCEVAddExpr>(LHS)) + if (const SCEVMulExpr *ME = dyn_cast<SCEVMulExpr>(AE->getOperand(0))) + if (AE->getNumOperands() == 2 && ME->getNumOperands() == 2 && + ME->getOperand(0)->isAllOnesValue()) { + RHS = AE->getOperand(1); + LHS = ME->getOperand(1); + Changed = true; + } break; case ICmpInst::ICMP_UGE: if ((RA - 1).isMinValue()) { @@ -5843,6 +5849,11 @@ bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred, // TODO: More simplifications are possible here. + // Recursively simplify until we either hit a recursion limit or nothing + // changes. + if (Changed) + return SimplifyICmpOperands(Pred, LHS, RHS, Depth+1); + return Changed; trivially_true: @@ -6040,12 +6051,34 @@ ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L, return false; } +/// RAII wrapper to prevent recursive application of isImpliedCond. 
+/// ScalarEvolution's PendingLoopPredicates set must be empty unless we are +/// currently evaluating isImpliedCond. +struct MarkPendingLoopPredicate { + Value *Cond; + DenseSet<Value*> &LoopPreds; + bool Pending; + + MarkPendingLoopPredicate(Value *C, DenseSet<Value*> &LP) + : Cond(C), LoopPreds(LP) { + Pending = !LoopPreds.insert(Cond).second; + } + ~MarkPendingLoopPredicate() { + if (!Pending) + LoopPreds.erase(Cond); + } +}; + /// isImpliedCond - Test whether the condition described by Pred, LHS, /// and RHS is true whenever the given Cond value evaluates to true. bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, Value *FoundCondValue, bool Inverse) { + MarkPendingLoopPredicate Mark(FoundCondValue, PendingLoopPredicates); + if (Mark.Pending) + return false; + // Recursively handle And and Or conditions. if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FoundCondValue)) { if (BO->getOpcode() == Instruction::And) { @@ -6572,6 +6605,8 @@ void ScalarEvolution::releaseMemory() { I->second.clear(); } + assert(PendingLoopPredicates.empty() && "isImpliedCond garbage"); + BackedgeTakenCounts.clear(); ConstantEvolutionLoopExitValue.clear(); ValuesAtScopes.clear(); @@ -6859,44 +6894,27 @@ bool ScalarEvolution::properlyDominates(const SCEV *S, const BasicBlock *BB) { return getBlockDisposition(S, BB) == ProperlyDominatesBlock; } -bool ScalarEvolution::hasOperand(const SCEV *S, const SCEV *Op) const { - switch (S->getSCEVType()) { - case scConstant: - return false; - case scTruncate: - case scZeroExtend: - case scSignExtend: { - const SCEVCastExpr *Cast = cast<SCEVCastExpr>(S); - const SCEV *CastOp = Cast->getOperand(); - return Op == CastOp || hasOperand(CastOp, Op); - } - case scAddRecExpr: - case scAddExpr: - case scMulExpr: - case scUMaxExpr: - case scSMaxExpr: { - const SCEVNAryExpr *NAry = cast<SCEVNAryExpr>(S); - for (SCEVNAryExpr::op_iterator I = NAry->op_begin(), E = NAry->op_end(); - I != E; ++I) { - const SCEV *NAryOp = *I; - if (NAryOp == Op || hasOperand(NAryOp, Op)) - return true; - } - return false; - } - case scUDivExpr: { - const SCEVUDivExpr *UDiv = cast<SCEVUDivExpr>(S); - const SCEV *LHS = UDiv->getLHS(), *RHS = UDiv->getRHS(); - return LHS == Op || hasOperand(LHS, Op) || - RHS == Op || hasOperand(RHS, Op); - } - case scUnknown: - return false; - case scCouldNotCompute: - llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); - default: - llvm_unreachable("Unknown SCEV kind!"); +namespace { +// Search for a SCEV expression node within an expression tree. +// Implements SCEVTraversal::Visitor. +struct SCEVSearch { + const SCEV *Node; + bool IsFound; + + SCEVSearch(const SCEV *N): Node(N), IsFound(false) {} + + bool follow(const SCEV *S) { + IsFound |= (S == Node); + return !IsFound; } + bool isDone() const { return IsFound; } +}; +} + +bool ScalarEvolution::hasOperand(const SCEV *S, const SCEV *Op) const { + SCEVSearch Search(Op); + visitAll(S, Search); + return Search.IsFound; } void ScalarEvolution::forgetMemoizedResults(const SCEV *S) { diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp index 69507be..62710c5 100644 --- a/lib/Analysis/ScalarEvolutionExpander.cpp +++ b/lib/Analysis/ScalarEvolutionExpander.cpp @@ -37,7 +37,7 @@ Value *SCEVExpander::ReuseOrCreateCast(Value *V, Type *Ty, // We use this precondition to produce a cast that will dominate all its // uses. 
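Both SCEVSearch above and SCEVFindUnsafe later in this commit implement the same small traversal protocol: follow() decides whether to descend into a node's operands and isDone() lets the walk stop early. Reduced to a toy worklist over a hypothetical Node type:

```cpp
#include <vector>

// Node is a made-up tree; llvm's SCEVTraversal walks SCEV operands the same
// way, with a worklist instead of recursion.
struct Node { int Tag; std::vector<const Node *> Kids; };

template <typename Visitor>
void visitAllNodes(const Node *Root, Visitor &V) {
  std::vector<const Node *> Work(1, Root);
  while (!Work.empty() && !V.isDone()) {
    const Node *N = Work.back();
    Work.pop_back();
    if (!V.follow(N))
      continue; // visitor declined to descend
    for (size_t i = 0; i != N->Kids.size(); ++i)
      Work.push_back(N->Kids[i]);
  }
}

// Analogue of SCEVSearch: find a particular node anywhere in the tree.
struct FindTag {
  int Tag;
  bool Found;
  explicit FindTag(int T) : Tag(T), Found(false) {}
  bool follow(const Node *N) {
    Found |= (N->Tag == Tag);
    return !Found; // no need to descend once found
  }
  bool isDone() const { return Found; }
};
```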
In particular, this is crucial for the case where the builder's // insertion point *is* the point where we were asked to put the cast. - // Since we don't know the the builder's insertion point is actually + // Since we don't know the builder's insertion point is actually // where the uses will be added (only that it dominates it), we are // not allowed to move it. BasicBlock::iterator BIP = Builder.GetInsertPoint(); @@ -955,7 +955,8 @@ bool SCEVExpander::hoistIVInc(Instruction *IncV, Instruction *InsertPos) { // InsertPos must itself dominate IncV so that IncV's new position satisfies // its existing users. - if (!SE.DT->dominates(InsertPos->getParent(), IncV->getParent())) + if (isa<PHINode>(InsertPos) + || !SE.DT->dominates(InsertPos->getParent(), IncV->getParent())) return false; // Check that the chain of IV operands leading back to Phi can be hoisted. @@ -1699,3 +1700,44 @@ unsigned SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT, } return NumElim; } + +namespace { +// Search for a SCEV subexpression that is not safe to expand. Any expression +// that may expand to a !isSafeToSpeculativelyExecute value is unsafe, namely +// UDiv expressions. We don't know if the UDiv is derived from an IR divide +// instruction, but the important thing is that we prove the denominator is +// nonzero before expansion. +// +// IVUsers already checks that IV-derived expressions are safe. So this check is +// only needed when the expression includes some subexpression that is not IV +// derived. +// +// Currently, we only allow division by a nonzero constant here. If this is +// inadequate, we could easily allow division by SCEVUnknown by using +// ValueTracking to check isKnownNonZero(). +struct SCEVFindUnsafe { + bool IsUnsafe; + + SCEVFindUnsafe(): IsUnsafe(false) {} + + bool follow(const SCEV *S) { + const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S); + if (!D) + return true; + const SCEVConstant *SC = dyn_cast<SCEVConstant>(D->getRHS()); + if (SC && !SC->getValue()->isZero()) + return true; + IsUnsafe = true; + return false; + } + bool isDone() const { return IsUnsafe; } +}; +} + +namespace llvm { +bool isSafeToExpand(const SCEV *S) { + SCEVFindUnsafe Search; + visitAll(S, Search); + return !Search.IsUnsafe; +} +} diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp index a430f62..cea34e1 100644 --- a/lib/Analysis/ValueTracking.cpp +++ b/lib/Analysis/ValueTracking.cpp @@ -564,7 +564,7 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, Depth+1); // If it's known zero, our sign bit is also zero. if (LHSKnownZero.isNegative()) - KnownZero |= LHSKnownZero; + KnownZero.setBit(BitWidth - 1); } break; @@ -694,7 +694,7 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, // taking conservative care to avoid excessive recursion. if (Depth < MaxDepth - 1 && !KnownZero && !KnownOne) { // Skip if every incoming value references to ourself. 
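The rationale for SCEVFindUnsafe above, in miniature: a udiv whose divisor is not a known-nonzero constant must not be expanded at an arbitrary point, because expansion can hoist it past the guard that made it safe. A plain C++ stand-in:

```cpp
// The SCEV being expanded corresponds to the division inside the branch.
unsigned guarded(unsigned a, unsigned b) {
  unsigned q = 0;
  if (b != 0)
    q = a / b; // safe: dominated by the b != 0 test
  return q;
}
// An expansion point above the test would execute a / b unconditionally and
// divide by zero whenever b == 0 -- exactly what isSafeToExpand rules out by
// only admitting udivs with a nonzero constant divisor.
```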
- if (P->hasConstantValue() == P) + if (dyn_cast_or_null<UndefValue>(P->hasConstantValue())) break; KnownZero = APInt::getAllOnesValue(BitWidth); @@ -1796,6 +1796,37 @@ llvm::GetUnderlyingObject(Value *V, const TargetData *TD, unsigned MaxLookup) { return V; } +void +llvm::GetUnderlyingObjects(Value *V, + SmallVectorImpl<Value *> &Objects, + const TargetData *TD, + unsigned MaxLookup) { + SmallPtrSet<Value *, 4> Visited; + SmallVector<Value *, 4> Worklist; + Worklist.push_back(V); + do { + Value *P = Worklist.pop_back_val(); + P = GetUnderlyingObject(P, TD, MaxLookup); + + if (!Visited.insert(P)) + continue; + + if (SelectInst *SI = dyn_cast<SelectInst>(P)) { + Worklist.push_back(SI->getTrueValue()); + Worklist.push_back(SI->getFalseValue()); + continue; + } + + if (PHINode *PN = dyn_cast<PHINode>(P)) { + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + Worklist.push_back(PN->getIncomingValue(i)); + continue; + } + + Objects.push_back(P); + } while (!Worklist.empty()); +} + /// onlyUsedByLifetimeMarkers - Return true if the only users of this pointer /// are lifetime markers. /// diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp index 8818168..670c1bb 100644 --- a/lib/AsmParser/LLLexer.cpp +++ b/lib/AsmParser/LLLexer.cpp @@ -474,6 +474,9 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(extern_weak); KEYWORD(external); KEYWORD(thread_local); + KEYWORD(localdynamic); + KEYWORD(initialexec); + KEYWORD(localexec); KEYWORD(zeroinitializer); KEYWORD(undef); KEYWORD(null); @@ -673,11 +676,12 @@ lltok::Kind LLLexer::LexIdentifier() { /// HexFP80Constant 0xK[0-9A-Fa-f]+ /// HexFP128Constant 0xL[0-9A-Fa-f]+ /// HexPPC128Constant 0xM[0-9A-Fa-f]+ +/// HexHalfConstant 0xH[0-9A-Fa-f]+ lltok::Kind LLLexer::Lex0x() { CurPtr = TokStart + 2; char Kind; - if (CurPtr[0] >= 'K' && CurPtr[0] <= 'M') { + if ((CurPtr[0] >= 'K' && CurPtr[0] <= 'M') || CurPtr[0] == 'H') { Kind = *CurPtr++; } else { Kind = 'J'; @@ -718,6 +722,9 @@ lltok::Kind LLLexer::Lex0x() { HexToIntPair(TokStart+3, CurPtr, Pair); APFloatVal = APFloat(APInt(128, Pair)); return lltok::APFloat; + case 'H': + APFloatVal = APFloat(APInt(16,HexIntToVal(TokStart+3, CurPtr))); + return lltok::APFloat; } } diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp index 068be3d..095b7c5 100644 --- a/lib/AsmParser/LLParser.cpp +++ b/lib/AsmParser/LLParser.cpp @@ -645,12 +645,13 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc, unsigned Linkage, bool HasLinkage, unsigned Visibility) { unsigned AddrSpace; - bool ThreadLocal, IsConstant, UnnamedAddr; + bool IsConstant, UnnamedAddr; + GlobalVariable::ThreadLocalMode TLM; LocTy UnnamedAddrLoc; LocTy TyLoc; Type *Ty = 0; - if (ParseOptionalToken(lltok::kw_thread_local, ThreadLocal) || + if (ParseOptionalThreadLocal(TLM) || ParseOptionalAddrSpace(AddrSpace) || ParseOptionalToken(lltok::kw_unnamed_addr, UnnamedAddr, &UnnamedAddrLoc) || @@ -691,7 +692,8 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc, if (GV == 0) { GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage, 0, - Name, 0, false, AddrSpace); + Name, 0, GlobalVariable::NotThreadLocal, + AddrSpace); } else { if (GV->getType()->getElementType() != Ty) return Error(TyLoc, @@ -710,7 +712,7 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc, GV->setConstant(IsConstant); GV->setLinkage((GlobalValue::LinkageTypes)Linkage); GV->setVisibility((GlobalValue::VisibilityTypes)Visibility); - GV->setThreadLocal(ThreadLocal); + 
GV->setThreadLocalMode(TLM); GV->setUnnamedAddr(UnnamedAddr); // Parse attributes on the global. @@ -858,6 +860,46 @@ bool LLParser::ParseUInt32(unsigned &Val) { return false; } +/// ParseTLSModel +/// := 'localdynamic' +/// := 'initialexec' +/// := 'localexec' +bool LLParser::ParseTLSModel(GlobalVariable::ThreadLocalMode &TLM) { + switch (Lex.getKind()) { + default: + return TokError("expected localdynamic, initialexec or localexec"); + case lltok::kw_localdynamic: + TLM = GlobalVariable::LocalDynamicTLSModel; + break; + case lltok::kw_initialexec: + TLM = GlobalVariable::InitialExecTLSModel; + break; + case lltok::kw_localexec: + TLM = GlobalVariable::LocalExecTLSModel; + break; + } + + Lex.Lex(); + return false; +} + +/// ParseOptionalThreadLocal +/// := /*empty*/ +/// := 'thread_local' +/// := 'thread_local' '(' tlsmodel ')' +bool LLParser::ParseOptionalThreadLocal(GlobalVariable::ThreadLocalMode &TLM) { + TLM = GlobalVariable::NotThreadLocal; + if (!EatIfPresent(lltok::kw_thread_local)) + return false; + + TLM = GlobalVariable::GeneralDynamicTLSModel; + if (Lex.getKind() == lltok::lparen) { + Lex.Lex(); + return ParseTLSModel(TLM) || + ParseToken(lltok::rparen, "expected ')' after thread local model"); + } + return false; +} /// ParseOptionalAddrSpace /// := /*empty*/ @@ -2692,7 +2734,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { if (FuncAttrs != Attribute::None) Attrs.push_back(AttributeWithIndex::get(~0, FuncAttrs)); - AttrListPtr PAL = AttrListPtr::get(Attrs.begin(), Attrs.end()); + AttrListPtr PAL = AttrListPtr::get(Attrs); if (PAL.paramHasAttr(1, Attribute::StructRet) && !RetType->isVoidTy()) return Error(RetTypeLoc, "functions with 'sret' argument must return void"); @@ -3239,7 +3281,7 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) { Attrs.push_back(AttributeWithIndex::get(~0, FnAttrs)); // Finish off the Attributes and check them - AttrListPtr PAL = AttrListPtr::get(Attrs.begin(), Attrs.end()); + AttrListPtr PAL = AttrListPtr::get(Attrs); InvokeInst *II = InvokeInst::Create(Callee, NormalBB, UnwindBB, Args); II->setCallingConv(CC); @@ -3635,7 +3677,7 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS, Attrs.push_back(AttributeWithIndex::get(~0, FnAttrs)); // Finish off the Attributes and check them - AttrListPtr PAL = AttrListPtr::get(Attrs.begin(), Attrs.end()); + AttrListPtr PAL = AttrListPtr::get(Attrs); CallInst *CI = CallInst::Create(Callee, Args); CI->setTailCall(isTail); diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h index dda8808..257c726 100644 --- a/lib/AsmParser/LLParser.h +++ b/lib/AsmParser/LLParser.h @@ -171,6 +171,9 @@ namespace llvm { Loc = Lex.getLoc(); return ParseUInt32(Val); } + + bool ParseTLSModel(GlobalVariable::ThreadLocalMode &TLM); + bool ParseOptionalThreadLocal(GlobalVariable::ThreadLocalMode &TLM); bool ParseOptionalAddrSpace(unsigned &AddrSpace); bool ParseOptionalAttrs(Attributes &Attrs, unsigned AttrKind); bool ParseOptionalLinkage(unsigned &Linkage, bool &HasLinkage); diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h index adf5d4f..0461e7b 100644 --- a/lib/AsmParser/LLToken.h +++ b/lib/AsmParser/LLToken.h @@ -44,13 +44,14 @@ namespace lltok { kw_unnamed_addr, kw_extern_weak, kw_external, kw_thread_local, + kw_localdynamic, kw_initialexec, kw_localexec, kw_zeroinitializer, kw_undef, kw_null, kw_to, kw_tail, kw_target, kw_triple, - kw_unwind, + kw_unwind, kw_deplibs, kw_datalayout, kw_volatile, diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp 
b/lib/Bitcode/Reader/BitcodeReader.cpp index e399040..4ffee38 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -28,6 +28,10 @@ #include "llvm/OperandTraits.h" using namespace llvm; +enum { + SWITCH_INST_MAGIC = 0x4B5 // May 2012 => 1205 => Hex +}; + void BitcodeReader::materializeForwardReferencedFunctions() { while (!BlockAddrFwdRefs.empty()) { Function *F = BlockAddrFwdRefs.begin()->first; @@ -57,7 +61,7 @@ void BitcodeReader::FreeState() { /// ConvertToString - Convert a string from a record into an std::string, return /// true on failure. template<typename StrTy> -static bool ConvertToString(SmallVector<uint64_t, 64> &Record, unsigned Idx, +static bool ConvertToString(ArrayRef<uint64_t> Record, unsigned Idx, StrTy &Result) { if (Idx > Record.size()) return true; @@ -98,6 +102,17 @@ static GlobalValue::VisibilityTypes GetDecodedVisibility(unsigned Val) { } } +static GlobalVariable::ThreadLocalMode GetDecodedThreadLocalMode(unsigned Val) { + switch (Val) { + case 0: return GlobalVariable::NotThreadLocal; + default: // Map unknown non-zero value to general dynamic. + case 1: return GlobalVariable::GeneralDynamicTLSModel; + case 2: return GlobalVariable::LocalDynamicTLSModel; + case 3: return GlobalVariable::InitialExecTLSModel; + case 4: return GlobalVariable::LocalExecTLSModel; + } +} + static int GetDecodedCastOpcode(unsigned Val) { switch (Val) { default: return -1; @@ -458,61 +473,19 @@ bool BitcodeReader::ParseAttributeBlock() { if (Record.size() & 1) return Error("Invalid ENTRY record"); - // FIXME : Remove this autoupgrade code in LLVM 3.0. - // If Function attributes are using index 0 then transfer them - // to index ~0. Index 0 is used for return value attributes but used to be - // used for function attributes. - Attributes RetAttribute; - Attributes FnAttribute; for (unsigned i = 0, e = Record.size(); i != e; i += 2) { - // FIXME: remove in LLVM 3.0 - // The alignment is stored as a 16-bit raw value from bits 31--16. - // We shift the bits above 31 down by 11 bits. - - unsigned Alignment = (Record[i+1] & (0xffffull << 16)) >> 16; - if (Alignment && !isPowerOf2_32(Alignment)) - return Error("Alignment is not a power of two."); - - Attributes ReconstitutedAttr(Record[i+1] & 0xffff); - if (Alignment) - ReconstitutedAttr |= Attribute::constructAlignmentFromInt(Alignment); - ReconstitutedAttr |= - Attributes((Record[i+1] & (0xffffull << 32)) >> 11); - + Attributes ReconstitutedAttr = + Attribute::decodeLLVMAttributesForBitcode(Record[i+1]); Record[i+1] = ReconstitutedAttr.Raw(); - if (Record[i] == 0) - RetAttribute = ReconstitutedAttr; - else if (Record[i] == ~0U) - FnAttribute = ReconstitutedAttr; - } - - Attributes OldRetAttrs = (Attribute::NoUnwind|Attribute::NoReturn| - Attribute::ReadOnly|Attribute::ReadNone); - - if (FnAttribute == Attribute::None && RetAttribute != Attribute::None && - (RetAttribute & OldRetAttrs)) { - if (FnAttribute == Attribute::None) { // add a slot so they get added. 
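GetDecodedThreadLocalMode below is the bitcode half of the TLS-model support threaded through the LLParser earlier in this commit: textual IR such as "@x = thread_local(initialexec) global i32 0" now round-trips as an enum rather than a single bool. On the C++ API side the mode rides on the GlobalVariable constructor, as in this minimal sketch against the 3.1-era headers (they moved under llvm/IR/ in later releases):

```cpp
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Constants.h"

int main() {
  llvm::LLVMContext Ctx;
  llvm::Module M("tls-demo", Ctx);
  llvm::Type *I32 = llvm::Type::getInt32Ty(Ctx);
  // Equivalent of "@x = thread_local(initialexec) global i32 0".
  new llvm::GlobalVariable(M, I32, /*isConstant=*/false,
                           llvm::GlobalValue::ExternalLinkage,
                           llvm::ConstantInt::get(I32, 0), "x",
                           /*InsertBefore=*/0,
                           llvm::GlobalVariable::InitialExecTLSModel);
  return 0; // the module owns and deletes the global
}
```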
- Record.push_back(~0U); - Record.push_back(0); - } - - FnAttribute |= RetAttribute & OldRetAttrs; - RetAttribute &= ~OldRetAttrs; } for (unsigned i = 0, e = Record.size(); i != e; i += 2) { - if (Record[i] == 0) { - if (RetAttribute != Attribute::None) - Attrs.push_back(AttributeWithIndex::get(0, RetAttribute)); - } else if (Record[i] == ~0U) { - if (FnAttribute != Attribute::None) - Attrs.push_back(AttributeWithIndex::get(~0U, FnAttribute)); - } else if (Attributes(Record[i+1]) != Attribute::None) + if (Attributes(Record[i+1]) != Attribute::None) Attrs.push_back(AttributeWithIndex::get(Record[i], Attributes(Record[i+1]))); } - MAttributes.push_back(AttrListPtr::get(Attrs.begin(), Attrs.end())); + MAttributes.push_back(AttrListPtr::get(Attrs)); Attrs.clear(); break; } @@ -621,7 +594,7 @@ bool BitcodeReader::ParseTypeTableBody() { break; } case bitc::TYPE_CODE_FUNCTION_OLD: { - // FIXME: attrid is dead, remove it in LLVM 3.0 + // FIXME: attrid is dead, remove it in LLVM 4.0 // FUNCTION: [vararg, attrid, retty, paramty x N] if (Record.size() < 3) return Error("Invalid FUNCTION type record"); @@ -851,11 +824,7 @@ bool BitcodeReader::ParseMetadata() { break; case bitc::METADATA_NAME: { // Read named of the named metadata. - unsigned NameLength = Record.size(); - SmallString<8> Name; - Name.resize(NameLength); - for (unsigned i = 0; i != NameLength; ++i) - Name[i] = Record[i]; + SmallString<8> Name(Record.begin(), Record.end()); Record.clear(); Code = Stream.ReadCode(); @@ -899,26 +868,18 @@ bool BitcodeReader::ParseMetadata() { break; } case bitc::METADATA_STRING: { - unsigned MDStringLength = Record.size(); - SmallString<8> String; - String.resize(MDStringLength); - for (unsigned i = 0; i != MDStringLength; ++i) - String[i] = Record[i]; - Value *V = MDString::get(Context, - StringRef(String.data(), String.size())); + SmallString<8> String(Record.begin(), Record.end()); + Value *V = MDString::get(Context, String); MDValueList.AssignValue(V, NextMDValueNo++); break; } case bitc::METADATA_KIND: { - unsigned RecordLength = Record.size(); - if (Record.empty() || RecordLength < 2) + if (Record.size() < 2) return Error("Invalid METADATA_KIND record"); - SmallString<8> Name; - Name.resize(RecordLength-1); + unsigned Kind = Record[0]; - for (unsigned i = 1; i != RecordLength; ++i) - Name[i-1] = Record[i]; - + SmallString<8> Name(Record.begin()+1, Record.end()); + unsigned NewKind = TheModule->getMDKindID(Name.str()); if (!MDKindMap.insert(std::make_pair(Kind, NewKind)).second) return Error("Conflicting METADATA_KIND records"); @@ -977,6 +938,14 @@ bool BitcodeReader::ResolveGlobalAndAliasInits() { return false; } +static APInt ReadWideAPInt(ArrayRef<uint64_t> Vals, unsigned TypeBits) { + SmallVector<uint64_t, 8> Words(Vals.size()); + std::transform(Vals.begin(), Vals.end(), Words.begin(), + DecodeSignRotatedValue); + + return APInt(TypeBits, Words); +} + bool BitcodeReader::ParseConstants() { if (Stream.EnterSubBlock(bitc::CONSTANTS_BLOCK_ID)) return Error("Malformed block record"); @@ -1032,14 +1001,10 @@ bool BitcodeReader::ParseConstants() { if (!CurTy->isIntegerTy() || Record.empty()) return Error("Invalid WIDE_INTEGER record"); - unsigned NumWords = Record.size(); - SmallVector<uint64_t, 8> Words; - Words.resize(NumWords); - for (unsigned i = 0; i != NumWords; ++i) - Words[i] = DecodeSignRotatedValue(Record[i]); - V = ConstantInt::get(Context, - APInt(cast<IntegerType>(CurTy)->getBitWidth(), - Words)); + APInt VInt = ReadWideAPInt(Record, + cast<IntegerType>(CurTy)->getBitWidth()); + V = 
ConstantInt::get(Context, VInt); + break; } case bitc::CST_CODE_FLOAT: { // FLOAT: [fpval] @@ -1098,10 +1063,7 @@ bool BitcodeReader::ParseConstants() { if (Record.empty()) return Error("Invalid CST_STRING record"); - unsigned Size = Record.size(); - SmallString<16> Elts; - for (unsigned i = 0; i != Size; ++i) - Elts.push_back(Record[i]); + SmallString<16> Elts(Record.begin(), Record.end()); V = ConstantDataArray::getString(Context, Elts, BitCode == bitc::CST_CODE_CSTRING); break; @@ -1138,23 +1100,16 @@ bool BitcodeReader::ParseConstants() { else V = ConstantDataArray::get(Context, Elts); } else if (EltTy->isFloatTy()) { - SmallVector<float, 16> Elts; - for (unsigned i = 0; i != Size; ++i) { - union { uint32_t I; float F; }; - I = Record[i]; - Elts.push_back(F); - } + SmallVector<float, 16> Elts(Size); + std::transform(Record.begin(), Record.end(), Elts.begin(), BitsToFloat); if (isa<VectorType>(CurTy)) V = ConstantDataVector::get(Context, Elts); else V = ConstantDataArray::get(Context, Elts); } else if (EltTy->isDoubleTy()) { - SmallVector<double, 16> Elts; - for (unsigned i = 0; i != Size; ++i) { - union { uint64_t I; double F; }; - I = Record[i]; - Elts.push_back(F); - } + SmallVector<double, 16> Elts(Size); + std::transform(Record.begin(), Record.end(), Elts.begin(), + BitsToDouble); if (isa<VectorType>(CurTy)) V = ConstantDataVector::get(Context, Elts); else @@ -1600,9 +1555,10 @@ bool BitcodeReader::ParseModule(bool Resume) { GlobalValue::VisibilityTypes Visibility = GlobalValue::DefaultVisibility; if (Record.size() > 6) Visibility = GetDecodedVisibility(Record[6]); - bool isThreadLocal = false; + + GlobalVariable::ThreadLocalMode TLM = GlobalVariable::NotThreadLocal; if (Record.size() > 7) - isThreadLocal = Record[7]; + TLM = GetDecodedThreadLocalMode(Record[7]); bool UnnamedAddr = false; if (Record.size() > 8) @@ -1610,12 +1566,11 @@ bool BitcodeReader::ParseModule(bool Resume) { GlobalVariable *NewGV = new GlobalVariable(*TheModule, Ty, isConstant, Linkage, 0, "", 0, - isThreadLocal, AddressSpace); + TLM, AddressSpace); NewGV->setAlignment(Alignment); if (!Section.empty()) NewGV->setSection(Section); NewGV->setVisibility(Visibility); - NewGV->setThreadLocal(isThreadLocal); NewGV->setUnnamedAddr(UnnamedAddr); ValueList.push_back(NewGV); @@ -1732,7 +1687,7 @@ bool BitcodeReader::ParseBitcodeInto(Module *M) { // have to read and ignore these final 4 bytes :-( if (Stream.GetAbbrevIDWidth() == 2 && Code == 2 && Stream.Read(6) == 2 && Stream.Read(24) == 0xa0a0a && - Stream.AtEndOfStream()) + Stream.AtEndOfStream()) return false; return Error("Invalid record at top-level"); @@ -2271,6 +2226,65 @@ bool BitcodeReader::ParseFunctionBody(Function *F) { break; } case bitc::FUNC_CODE_INST_SWITCH: { // SWITCH: [opty, op0, op1, ...] + // Check magic + if ((Record[0] >> 16) == SWITCH_INST_MAGIC) { + // New SwitchInst format with case ranges. 
+ + Type *OpTy = getTypeByID(Record[1]); + unsigned ValueBitWidth = cast<IntegerType>(OpTy)->getBitWidth(); + + Value *Cond = getFnValueByID(Record[2], OpTy); + BasicBlock *Default = getBasicBlock(Record[3]); + if (OpTy == 0 || Cond == 0 || Default == 0) + return Error("Invalid SWITCH record"); + + unsigned NumCases = Record[4]; + + SwitchInst *SI = SwitchInst::Create(Cond, Default, NumCases); + InstructionList.push_back(SI); + + unsigned CurIdx = 5; + for (unsigned i = 0; i != NumCases; ++i) { + IntegersSubsetToBB CaseBuilder; + unsigned NumItems = Record[CurIdx++]; + for (unsigned ci = 0; ci != NumItems; ++ci) { + bool isSingleNumber = Record[CurIdx++]; + + APInt Low; + unsigned ActiveWords = 1; + if (ValueBitWidth > 64) + ActiveWords = Record[CurIdx++]; + Low = ReadWideAPInt(makeArrayRef(&Record[CurIdx], ActiveWords), + ValueBitWidth); + CurIdx += ActiveWords; + + if (!isSingleNumber) { + ActiveWords = 1; + if (ValueBitWidth > 64) + ActiveWords = Record[CurIdx++]; + APInt High = + ReadWideAPInt(makeArrayRef(&Record[CurIdx], ActiveWords), + ValueBitWidth); + + CaseBuilder.add(IntItem::fromType(OpTy, Low), + IntItem::fromType(OpTy, High)); + CurIdx += ActiveWords; + } else + CaseBuilder.add(IntItem::fromType(OpTy, Low)); + } + BasicBlock *DestBB = getBasicBlock(Record[CurIdx++]); + IntegersSubset Case = CaseBuilder.getCase(); + SI->addCase(Case, DestBB); + } + uint16_t Hash = SI->hash(); + if (Hash != (Record[0] & 0xFFFF)) + return Error("Invalid SWITCH record"); + I = SI; + break; + } + + // Old SwitchInst format without case ranges. + if (Record.size() < 3 || (Record.size() & 1) == 0) return Error("Invalid SWITCH record"); Type *OpTy = getTypeByID(Record[0]); diff --git a/lib/Bitcode/Reader/CMakeLists.txt b/lib/Bitcode/Reader/CMakeLists.txt index 693d431..dfe7e10 100644 --- a/lib/Bitcode/Reader/CMakeLists.txt +++ b/lib/Bitcode/Reader/CMakeLists.txt @@ -2,3 +2,5 @@ add_llvm_library(LLVMBitReader BitReader.cpp BitcodeReader.cpp ) + +add_dependencies(LLVMBitReader intrinsics_gen) diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index b25d2e9..5b1725f 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -62,7 +62,10 @@ enum { FUNCTION_INST_CAST_ABBREV, FUNCTION_INST_RET_VOID_ABBREV, FUNCTION_INST_RET_VAL_ABBREV, - FUNCTION_INST_UNREACHABLE_ABBREV + FUNCTION_INST_UNREACHABLE_ABBREV, + + // SwitchInst Magic + SWITCH_INST_MAGIC = 0x4B5 // May 2012 => 1205 => Hex }; static unsigned GetEncodedCastOpcode(unsigned Opcode) { @@ -174,18 +177,7 @@ static void WriteAttributeTable(const ValueEnumerator &VE, for (unsigned i = 0, e = A.getNumSlots(); i != e; ++i) { const AttributeWithIndex &PAWI = A.getSlot(i); Record.push_back(PAWI.Index); - - // FIXME: remove in LLVM 3.0 - // Store the alignment in the bitcode as a 16-bit raw value instead of a - // 5-bit log2 encoded value. Shift the bits above the alignment up by - // 11 bits. 
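For reference, the record shape the new-format switch reader above walks, reconstructed from the code (the field names are descriptive, not from a spec):

```cpp
#include <cstdint>

// Record[0] = (SWITCH_INST_MAGIC << 16) | hash(switch)  ; 0x4B5 marks the new form
// Record[1] = condition type ID
// Record[2] = condition value ID
// Record[3] = default destination block ID
// Record[4] = NumCases, then per case:
//   NumItems, then per item:
//     IsSingleNumber, [ActiveWords when the type is wider than 64 bits],
//     Low word(s) [, High word(s) unless IsSingleNumber]
//   and finally the case's successor block ID.

// How the reader distinguishes and validates the new form:
bool isNewSwitchRecord(uint64_t R0, uint16_t InstHash) {
  const uint64_t SWITCH_INST_MAGIC = 0x4B5; // May 2012 => 1205 => Hex
  return (R0 >> 16) == SWITCH_INST_MAGIC && (R0 & 0xFFFF) == InstHash;
}
```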
- uint64_t FauxAttr = PAWI.Attrs.Raw() & 0xffff; - if (PAWI.Attrs & Attribute::Alignment) - FauxAttr |= (1ull<<16)<< - (((PAWI.Attrs & Attribute::Alignment).Raw()-1) >> 16); - FauxAttr |= (PAWI.Attrs.Raw() & (0x3FFull << 21)) << 11; - - Record.push_back(FauxAttr); + Record.push_back(Attribute::encodeLLVMAttributesForBitcode(PAWI.Attrs)); } Stream.EmitRecord(bitc::PARAMATTR_CODE_ENTRY, Record); @@ -387,6 +379,17 @@ static unsigned getEncodedVisibility(const GlobalValue *GV) { llvm_unreachable("Invalid visibility"); } +static unsigned getEncodedThreadLocalMode(const GlobalVariable *GV) { + switch (GV->getThreadLocalMode()) { + case GlobalVariable::NotThreadLocal: return 0; + case GlobalVariable::GeneralDynamicTLSModel: return 1; + case GlobalVariable::LocalDynamicTLSModel: return 2; + case GlobalVariable::InitialExecTLSModel: return 3; + case GlobalVariable::LocalExecTLSModel: return 4; + } + llvm_unreachable("Invalid TLS model"); +} + // Emit top-level description of module, including target triple, inline asm, // descriptors for global variables, and function prototype info. static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE, @@ -495,7 +498,7 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE, GV->getVisibility() != GlobalValue::DefaultVisibility || GV->hasUnnamedAddr()) { Vals.push_back(getEncodedVisibility(GV)); - Vals.push_back(GV->isThreadLocal()); + Vals.push_back(getEncodedThreadLocalMode(GV)); Vals.push_back(GV->hasUnnamedAddr()); } else { AbbrevToUse = SimpleGVarAbbrev; @@ -719,6 +722,41 @@ static void WriteModuleMetadataStore(const Module *M, BitstreamWriter &Stream) { Stream.ExitBlock(); } +static void EmitAPInt(SmallVectorImpl<uint64_t> &Vals, + unsigned &Code, unsigned &AbbrevToUse, const APInt &Val, + bool EmitSizeForWideNumbers = false + ) { + if (Val.getBitWidth() <= 64) { + uint64_t V = Val.getSExtValue(); + if ((int64_t)V >= 0) + Vals.push_back(V << 1); + else + Vals.push_back((-V << 1) | 1); + Code = bitc::CST_CODE_INTEGER; + AbbrevToUse = CONSTANTS_INTEGER_ABBREV; + } else { + // Wide integers, > 64 bits in size. + // We have an arbitrary precision integer value to write whose + // bit width is > 64. However, in canonical unsigned integer + // format it is likely that the high bits are going to be zero. + // So, we only write the number of active words. + unsigned NWords = Val.getActiveWords(); + + if (EmitSizeForWideNumbers) + Vals.push_back(NWords); + + const uint64_t *RawWords = Val.getRawData(); + for (unsigned i = 0; i != NWords; ++i) { + int64_t V = RawWords[i]; + if (V >= 0) + Vals.push_back(V << 1); + else + Vals.push_back((-V << 1) | 1); + } + Code = bitc::CST_CODE_WIDE_INTEGER; + } +} + static void WriteConstants(unsigned FirstVal, unsigned LastVal, const ValueEnumerator &VE, BitstreamWriter &Stream, bool isGlobal) { @@ -801,30 +839,7 @@ static void WriteConstants(unsigned FirstVal, unsigned LastVal, } else if (isa<UndefValue>(C)) { Code = bitc::CST_CODE_UNDEF; } else if (const ConstantInt *IV = dyn_cast<ConstantInt>(C)) { - if (IV->getBitWidth() <= 64) { - uint64_t V = IV->getSExtValue(); - if ((int64_t)V >= 0) - Record.push_back(V << 1); - else - Record.push_back((-V << 1) | 1); - Code = bitc::CST_CODE_INTEGER; - AbbrevToUse = CONSTANTS_INTEGER_ABBREV; - } else { // Wide integers, > 64 bits in size. - // We have an arbitrary precision integer value to write whose - // bit width is > 64. However, in canonical unsigned integer - // format it is likely that the high bits are going to be zero. 
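EmitAPInt, defined below, keeps the existing per-word sign rotation: a non-negative word is stored as V << 1, a negative one as (-V << 1) | 1, so small magnitudes stay small and VBR-encode compactly. A round-trip sketch of the scheme, mirroring DecodeSignRotatedValue on the reader side:

```cpp
#include <cstdint>

uint64_t encodeSignRotated(int64_t V) {
  // For INT64_MIN the real writer relies on two's-complement wrap to emit
  // the reserved value 1 ("-0").
  return V >= 0 ? (uint64_t)V << 1 : ((uint64_t)-V << 1) | 1;
}

int64_t decodeSignRotated(uint64_t V) {
  if ((V & 1) == 0)
    return (int64_t)(V >> 1);
  if (V != 1)
    return -(int64_t)(V >> 1);
  return (int64_t)(1ULL << 63); // "-0" encodes the minimum signed value
}

int main() {
  for (int64_t v = -1000; v <= 1000; ++v)
    if (decodeSignRotated(encodeSignRotated(v)) != v)
      return 1;
  return 0;
}
```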
- // So, we only write the number of active words. - unsigned NWords = IV->getValue().getActiveWords(); - const uint64_t *RawWords = IV->getValue().getRawData(); - for (unsigned i = 0; i != NWords; ++i) { - int64_t V = RawWords[i]; - if (V >= 0) - Record.push_back(V << 1); - else - Record.push_back((-V << 1) | 1); - } - Code = bitc::CST_CODE_WIDE_INTEGER; - } + EmitAPInt(Record, Code, AbbrevToUse, IV->getValue()); } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) { Code = bitc::CST_CODE_FLOAT; Type *Ty = CFP->getType(); @@ -1137,16 +1152,63 @@ static void WriteInstruction(const Instruction &I, unsigned InstID, break; case Instruction::Switch: { + // Redefine Vals, since here we need to use 64 bit values + // explicitly to store large APInt numbers. + SmallVector<uint64_t, 128> Vals64; + Code = bitc::FUNC_CODE_INST_SWITCH; SwitchInst &SI = cast<SwitchInst>(I); - Vals.push_back(VE.getTypeID(SI.getCondition()->getType())); - Vals.push_back(VE.getValueID(SI.getCondition())); - Vals.push_back(VE.getValueID(SI.getDefaultDest())); + + uint32_t SwitchRecordHeader = SI.hash() | (SWITCH_INST_MAGIC << 16); + Vals64.push_back(SwitchRecordHeader); + + Vals64.push_back(VE.getTypeID(SI.getCondition()->getType())); + Vals64.push_back(VE.getValueID(SI.getCondition())); + Vals64.push_back(VE.getValueID(SI.getDefaultDest())); + Vals64.push_back(SI.getNumCases()); for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end(); i != e; ++i) { - Vals.push_back(VE.getValueID(i.getCaseValue())); - Vals.push_back(VE.getValueID(i.getCaseSuccessor())); + IntegersSubset& CaseRanges = i.getCaseValueEx(); + unsigned Code, Abbrev; // will unused. + + if (CaseRanges.isSingleNumber()) { + Vals64.push_back(1/*NumItems = 1*/); + Vals64.push_back(true/*IsSingleNumber = true*/); + EmitAPInt(Vals64, Code, Abbrev, CaseRanges.getSingleNumber(0), true); + } else { + + Vals64.push_back(CaseRanges.getNumItems()); + + if (CaseRanges.isSingleNumbersOnly()) { + for (unsigned ri = 0, rn = CaseRanges.getNumItems(); + ri != rn; ++ri) { + + Vals64.push_back(true/*IsSingleNumber = true*/); + + EmitAPInt(Vals64, Code, Abbrev, + CaseRanges.getSingleNumber(ri), true); + } + } else + for (unsigned ri = 0, rn = CaseRanges.getNumItems(); + ri != rn; ++ri) { + IntegersSubset::Range r = CaseRanges.getItem(ri); + bool IsSingleNumber = CaseRanges.isSingleNumber(ri); + + Vals64.push_back(IsSingleNumber); + + EmitAPInt(Vals64, Code, Abbrev, r.getLow(), true); + if (!IsSingleNumber) + EmitAPInt(Vals64, Code, Abbrev, r.getHigh(), true); + } + } + Vals64.push_back(VE.getValueID(i.getCaseSuccessor())); } + + Stream.EmitRecord(Code, Vals64, AbbrevToUse); + + // Also do expected action - clear external Vals collection: + Vals.clear(); + return; } break; case Instruction::IndirectBr: diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp index 822a564..205480a 100644 --- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -16,10 +16,10 @@ #define DEBUG_TYPE "post-RA-sched" #include "AggressiveAntiDepBreaker.h" -#include "RegisterClassInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetInstrInfo.h" @@ -157,8 +157,8 @@ void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { // In a return block, examine the function live-out 
regs. for (MachineRegisterInfo::liveout_iterator I = MRI.liveout_begin(), E = MRI.liveout_end(); I != E; ++I) { - for (const uint16_t *Alias = TRI->getOverlaps(*I); - unsigned Reg = *Alias; ++Alias) { + for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) { + unsigned Reg = *AI; State->UnionGroups(Reg, 0); KillIndices[Reg] = BB->size(); DefIndices[Reg] = ~0u; @@ -173,8 +173,8 @@ void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { SE = BB->succ_end(); SI != SE; ++SI) for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(), E = (*SI)->livein_end(); I != E; ++I) { - for (const uint16_t *Alias = TRI->getOverlaps(*I); - unsigned Reg = *Alias; ++Alias) { + for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) { + unsigned Reg = *AI; State->UnionGroups(Reg, 0); KillIndices[Reg] = BB->size(); DefIndices[Reg] = ~0u; @@ -189,8 +189,8 @@ void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { for (const uint16_t *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) { unsigned Reg = *I; if (!IsReturnBlock && !Pristine.test(Reg)) continue; - for (const uint16_t *Alias = TRI->getOverlaps(Reg); - unsigned AliasReg = *Alias; ++Alias) { + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) { + unsigned AliasReg = *AI; State->UnionGroups(AliasReg, 0); KillIndices[AliasReg] = BB->size(); DefIndices[AliasReg] = ~0u; @@ -265,10 +265,8 @@ void AggressiveAntiDepBreaker::GetPassthruRegs(MachineInstr *MI, IsImplicitDefUse(MI, MO)) { const unsigned Reg = MO.getReg(); PassthruRegs.insert(Reg); - for (const uint16_t *Subreg = TRI->getSubRegisters(Reg); - *Subreg; ++Subreg) { - PassthruRegs.insert(*Subreg); - } + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + PassthruRegs.insert(*SubRegs); } } } @@ -333,9 +331,8 @@ void AggressiveAntiDepBreaker::HandleLastUse(unsigned Reg, unsigned KillIdx, DEBUG(dbgs() << "->g" << State->GetGroup(Reg) << tag); } // Repeat for subregisters. - for (const uint16_t *Subreg = TRI->getSubRegisters(Reg); - *Subreg; ++Subreg) { - unsigned SubregReg = *Subreg; + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { + unsigned SubregReg = *SubRegs; if (!State->IsLive(SubregReg)) { KillIndices[SubregReg] = KillIdx; DefIndices[SubregReg] = ~0u; @@ -392,8 +389,8 @@ void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI, // Any aliased that are live at this point are completely or // partially defined here, so group those aliases with Reg. - for (const uint16_t *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { - unsigned AliasReg = *Alias; + for (MCRegAliasIterator AI(Reg, TRI, false); AI.isValid(); ++AI) { + unsigned AliasReg = *AI; if (State->IsLive(AliasReg)) { State->UnionGroups(Reg, AliasReg); DEBUG(dbgs() << "->g" << State->GetGroup(Reg) << "(via " << @@ -404,7 +401,7 @@ void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI, // Note register reference... const TargetRegisterClass *RC = NULL; if (i < MI->getDesc().getNumOperands()) - RC = TII->getRegClass(MI->getDesc(), i, TRI); + RC = TII->getRegClass(MI->getDesc(), i, TRI, MF); AggressiveAntiDepState::RegisterReference RR = { &MO, RC }; RegRefs.insert(std::make_pair(Reg, RR)); } @@ -423,9 +420,8 @@ void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI, continue; // Update def for Reg and aliases. 
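The register-walk rewrites in this file recur throughout the commit: sentinel-terminated uint16_t lists from getOverlaps / getAliasSet / getSubRegisters give way to MCRegAliasIterator, MCSubRegIterator, and MCSuperRegIterator, all sharing an isValid() / operator++ / operator* protocol. A toy model of the pattern, with a stand-in iterator over a 0-terminated list instead of the real MCRegisterInfo tables:

    #include <cstdint>
    #include <cstdio>

    // Stand-in for the MC register iterator protocol; register 0
    // terminates the list, as in the old uint16_t arrays.
    class ToyRegIterator {
      const uint16_t *Pos;
    public:
      explicit ToyRegIterator(const uint16_t *List) : Pos(List) {}
      bool isValid() const { return *Pos != 0; }
      void operator++() { ++Pos; }
      unsigned operator*() const { return *Pos; }
    };

    int main() {
      static const uint16_t Overlaps[] = {5, 6, 7, 0};
      // Old style: for (const uint16_t *A = Overlaps; unsigned R = *A; ++A)
      for (ToyRegIterator AI(Overlaps); AI.isValid(); ++AI)
        std::printf("overlapping reg %u\n", *AI);
      return 0;
    }

The real iterators also take an IncludeSelf flag, which is why several hunks can drop the separate insertion of Reg itself before the loop.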
- for (const uint16_t *Alias = TRI->getOverlaps(Reg); - unsigned AliasReg = *Alias; ++Alias) - DefIndices[AliasReg] = Count; + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + DefIndices[*AI] = Count; } } @@ -479,7 +475,7 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr *MI, // Note register reference... const TargetRegisterClass *RC = NULL; if (i < MI->getDesc().getNumOperands()) - RC = TII->getRegClass(MI->getDesc(), i, TRI); + RC = TII->getRegClass(MI->getDesc(), i, TRI, MF); AggressiveAntiDepState::RegisterReference RR = { &MO, RC }; RegRefs.insert(std::make_pair(Reg, RR)); } @@ -678,9 +674,8 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters( goto next_super_reg; } else { bool found = false; - for (const uint16_t *Alias = TRI->getAliasSet(NewReg); - *Alias; ++Alias) { - unsigned AliasReg = *Alias; + for (MCRegAliasIterator AI(NewReg, TRI, false); AI.isValid(); ++AI) { + unsigned AliasReg = *AI; if (State->IsLive(AliasReg) || (KillIndices[Reg] > DefIndices[AliasReg])) { DEBUG(dbgs() << "(alias " << TRI->getName(AliasReg) << " live)"); diff --git a/lib/CodeGen/AllocationOrder.cpp b/lib/CodeGen/AllocationOrder.cpp index 87f6431..32ad34a 100644 --- a/lib/CodeGen/AllocationOrder.cpp +++ b/lib/CodeGen/AllocationOrder.cpp @@ -15,9 +15,9 @@ //===----------------------------------------------------------------------===// #include "AllocationOrder.h" -#include "RegisterClassInfo.h" #include "VirtRegMap.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterClassInfo.h" using namespace llvm; diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp index 00874d4..447f398 100644 --- a/lib/CodeGen/Analysis.cpp +++ b/lib/CodeGen/Analysis.cpp @@ -203,6 +203,63 @@ ISD::CondCode llvm::getICmpCondCode(ICmpInst::Predicate Pred) { } } + +/// getNoopInput - If V is a noop (i.e., lowers to no machine code), look +/// through it (and any transitive noop operands to it) and return its input +/// value. This is used to determine if a tail call can be formed. +/// +static const Value *getNoopInput(const Value *V, const TargetLowering &TLI) { + // If V is not an instruction, it can't be looked through. + const Instruction *I = dyn_cast<Instruction>(V); + if (I == 0 || !I->hasOneUse() || I->getNumOperands() == 0) return V; + + Value *Op = I->getOperand(0); + + // Look through truly no-op truncates. + if (isa<TruncInst>(I) && + TLI.isTruncateFree(I->getOperand(0)->getType(), I->getType())) + return getNoopInput(I->getOperand(0), TLI); + + // Look through truly no-op bitcasts. + if (isa<BitCastInst>(I)) { + // No type change at all. + if (Op->getType() == I->getType()) + return getNoopInput(Op, TLI); + + // Pointer to pointer cast. + if (Op->getType()->isPointerTy() && I->getType()->isPointerTy()) + return getNoopInput(Op, TLI); + + if (isa<VectorType>(Op->getType()) && isa<VectorType>(I->getType()) && + TLI.isTypeLegal(EVT::getEVT(Op->getType())) && + TLI.isTypeLegal(EVT::getEVT(I->getType()))) + return getNoopInput(Op, TLI); + } + + // Look through inttoptr. + if (isa<IntToPtrInst>(I) && !isa<VectorType>(I->getType())) { + // Make sure this isn't a truncating or extending cast. We could support + // this eventually, but don't bother for now. + if (TLI.getPointerTy().getSizeInBits() == + cast<IntegerType>(Op->getType())->getBitWidth()) + return getNoopInput(Op, TLI); + } + + // Look through ptrtoint. + if (isa<PtrToIntInst>(I) && !isa<VectorType>(I->getType())) { + // Make sure this isn't a truncating or extending cast. 
We could support + // this eventually, but don't bother for now. + if (TLI.getPointerTy().getSizeInBits() == + cast<IntegerType>(I->getType())->getBitWidth()) + return getNoopInput(Op, TLI); + } + + + // Otherwise it's not something we can look through. + return V; +} + + /// Test if the given instruction is in a position to be optimized /// with a tail-call. This roughly means that it's in a block with /// a return and there's nothing that needs to be scheduled @@ -226,7 +283,8 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS, Attributes CalleeRetAttr, // been fully understood. if (!Ret && (!TLI.getTargetMachine().Options.GuaranteedTailCallOpt || - !isa<UnreachableInst>(Term))) return false; + !isa<UnreachableInst>(Term))) + return false; // If I will have a chain, make sure no other instruction that will have a // chain interposes between I and the return. @@ -264,28 +322,28 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS, Attributes CalleeRetAttr, return false; // Otherwise, make sure the unmodified return value of I is the return value. - for (const Instruction *U = dyn_cast<Instruction>(Ret->getOperand(0)); ; - U = dyn_cast<Instruction>(U->getOperand(0))) { - if (!U) - return false; - if (!U->hasOneUse()) + // We handle two cases: multiple return values + scalars. + Value *RetVal = Ret->getOperand(0); + if (!isa<InsertValueInst>(RetVal) || !isa<StructType>(RetVal->getType())) + // Handle scalars first. + return getNoopInput(Ret->getOperand(0), TLI) == I; + + // If this is an aggregate return, look through the insert/extract values and + // see if each is transparent. + for (unsigned i = 0, e =cast<StructType>(RetVal->getType())->getNumElements(); + i != e; ++i) { + const Value *InScalar = FindInsertedValue(RetVal, i); + if (InScalar == 0) return false; + InScalar = getNoopInput(InScalar, TLI); + + // If the scalar value being inserted is an extractvalue of the right index + // from the call, then everything is good. + const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(InScalar); + if (EVI == 0 || EVI->getOperand(0) != I || EVI->getNumIndices() != 1 || + EVI->getIndices()[0] != i) return false; - if (U == I) - break; - // Check for a truly no-op truncate. - if (isa<TruncInst>(U) && - TLI.isTruncateFree(U->getOperand(0)->getType(), U->getType())) - continue; - // Check for a truly no-op bitcast. - if (isa<BitCastInst>(U) && - (U->getOperand(0)->getType() == U->getType() || - (U->getOperand(0)->getType()->isPointerTy() && - U->getType()->isPointerTy()))) - continue; - // Otherwise it's not a true no-op. 
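The hand-rolled walk being deleted in this hunk is what getNoopInput above generalizes: step through transitive single-use operations that lower to no machine code and compare the underlying value against the call. A toy model of the recursion over stub nodes, not real LLVM IR:

    // Stub node: one operand, a use count, and a flag for whether the
    // node lowers to no machine code (free truncate, pointer bitcast,
    // same-width inttoptr/ptrtoint).
    struct Node {
      Node *Operand;
      unsigned Uses;
      bool IsNoop;
    };

    // Walk through transitive single-use no-ops and return the input.
    static const Node *getNoopInput(const Node *V) {
      if (!V->Operand || V->Uses != 1 || !V->IsNoop)
        return V;
      return getNoopInput(V->Operand);
    }

    int main() {
      Node Call  = {nullptr, 1, false};
      Node Cast1 = {&Call,   1, true};
      Node Cast2 = {&Cast1,  1, true};
      return getNoopInput(&Cast2) == &Call ? 0 : 1;  // tail call still viable
    }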
- return false; } - + return true; } diff --git a/lib/CodeGen/AsmPrinter/ARMException.cpp b/lib/CodeGen/AsmPrinter/ARMException.cpp index b60fda8..bf5d8c4 100644 --- a/lib/CodeGen/AsmPrinter/ARMException.cpp +++ b/lib/CodeGen/AsmPrinter/ARMException.cpp @@ -44,9 +44,7 @@ EnableARMEHABIDescriptors("arm-enable-ehabi-descriptors", cl::Hidden, ARMException::ARMException(AsmPrinter *A) - : DwarfException(A), - shouldEmitTable(false), shouldEmitMoves(false), shouldEmitTableModule(false) - {} + : DwarfException(A) {} ARMException::~ARMException() {} diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index f6cde98..b7fc663 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -16,6 +16,7 @@ #if !defined(ANDROID_TARGET_BUILD) || defined(ANDROID_ENGINEERING_BUILD) # include "DwarfDebug.h" # include "DwarfException.h" +# include "llvm/DebugInfo.h" #endif // !ANDROID_TARGET_BUILD || ANDROID_ENGINEERING_BUILD #include "llvm/Module.h" #include "llvm/CodeGen/GCMetadataPrinter.h" @@ -26,7 +27,6 @@ #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" @@ -484,10 +484,8 @@ void AsmPrinter::EmitFunctionHeader() { void AsmPrinter::EmitFunctionEntryLabel() { // The function label could have already been emitted if two symbols end up // conflicting due to asm renaming. Detect this and emit an error. - if (CurrentFnSym->isUndefined()) { - OutStreamer.ForceCodeRegion(); + if (CurrentFnSym->isUndefined()) return OutStreamer.EmitLabel(CurrentFnSym); - } report_fatal_error("'" + Twine(CurrentFnSym->getName()) + "' label emitted multiple times to assembly file"); @@ -624,7 +622,7 @@ bool AsmPrinter::needsSEHMoves() { } bool AsmPrinter::needsRelocationsForDwarfStringPool() const { - return MAI->doesDwarfUseRelocationsForStringPool(); + return MAI->doesDwarfUseRelocationsAcrossSections(); } void AsmPrinter::emitPrologLabel(const MachineInstr &MI) { @@ -813,8 +811,8 @@ void AsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const { const TargetRegisterInfo *TRI = TM.getRegisterInfo(); int Reg = TRI->getDwarfRegNum(MLoc.getReg(), false); - for (const uint16_t *SR = TRI->getSuperRegisters(MLoc.getReg()); - *SR && Reg < 0; ++SR) { + for (MCSuperRegIterator SR(MLoc.getReg(), TRI); SR.isValid() && Reg < 0; + ++SR) { Reg = TRI->getDwarfRegNum(*SR, false); // FIXME: Get the bit range this register uses of the superregister // so that we can produce a DW_OP_bit_piece @@ -1102,15 +1100,6 @@ void AsmPrinter::EmitJumpTableInfo() { EmitAlignment(Log2_32(MJTI->getEntryAlignment(*TM.getTargetData()))); - // If we know the form of the jump table, go ahead and tag it as such. 
- if (!JTInDiffSection) { - if (MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32) { - OutStreamer.EmitJumpTable32Region(); - } else { - OutStreamer.EmitDataRegion(); - } - } - for (unsigned JTI = 0, e = JT.size(); JTI != e; ++JTI) { const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; @@ -1416,13 +1405,14 @@ void AsmPrinter::EmitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset, unsigned Size) const { - // Emit Label+Offset - const MCExpr *Plus = - MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Label, OutContext), - MCConstantExpr::Create(Offset, OutContext), - OutContext); + // Emit Label+Offset (or just Label if Offset is zero) + const MCExpr *Expr = MCSymbolRefExpr::Create(Label, OutContext); + if (Offset) + Expr = MCBinaryExpr::CreateAdd(Expr, + MCConstantExpr::Create(Offset, OutContext), + OutContext); - OutStreamer.EmitValue(Plus, 4, 0/*AddrSpace*/); + OutStreamer.EmitValue(Expr, Size, 0/*AddrSpace*/); } diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp index e9e9335..711375b 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp @@ -329,11 +329,11 @@ void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const { OpNo += InlineAsm::getNumOperandRegisters(OpFlags) + 1; } - // We may have a location metadata attached to the end of the - // instruction, and at no point should see metadata at any - // other point while processing. It's an error if so. + // We may have a location metadata attached to the end of the + // instruction, and at no point should see metadata at any + // other point while processing. It's an error if so. if (OpNo >= MI->getNumOperands() || - MI->getOperand(OpNo).isMetadata()) { + MI->getOperand(OpNo).isMetadata()) { Error = true; } else { unsigned OpFlags = MI->getOperand(OpNo).getImm(); @@ -413,9 +413,28 @@ void AsmPrinter::PrintSpecial(const MachineInstr *MI, raw_ostream &OS, /// instruction, using the specified assembler variant. Targets should /// override this to format as appropriate. bool AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) { - // Target doesn't support this yet! + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O) { + // Does this asm operand have a single letter operand modifier? + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. + + const MachineOperand &MO = MI->getOperand(OpNo); + switch (ExtraCode[0]) { + default: + return true; // Unknown modifier. + case 'c': // Substitute immediate value without immediate syntax + if (MO.getType() != MachineOperand::MO_Immediate) + return true; + O << MO.getImm(); + return false; + case 'n': // Negate the immediate constant. 
+ if (MO.getType() != MachineOperand::MO_Immediate) + return true; + O << -MO.getImm(); + return false; + } + } return true; } diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index cc5b642..d231665 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -17,9 +17,9 @@ #include "DwarfCompileUnit.h" #include "DwarfDebug.h" #include "llvm/Constants.h" +#include "llvm/DIBuilder.h" #include "llvm/GlobalVariable.h" #include "llvm/Instructions.h" -#include "llvm/Analysis/DIBuilder.h" #include "llvm/Support/Debug.h" #include "llvm/Target/Mangler.h" #include "llvm/Target/TargetData.h" @@ -33,7 +33,7 @@ using namespace llvm; /// CompileUnit - Compile unit constructor. CompileUnit::CompileUnit(unsigned I, unsigned L, DIE *D, AsmPrinter *A, - DwarfDebug *DW) + DwarfDebug *DW) : ID(I), Language(L), CUDie(D), Asm(A), DD(DW), IndexTyDie(0) { DIEIntegerOne = new (DIEValueAllocator) DIEInteger(1); } @@ -198,7 +198,7 @@ void CompileUnit::addSourceLine(DIE *Die, DIObjCProperty Ty) { return; DIFile File = Ty.getFile(); unsigned FileID = DD->GetOrCreateSourceID(File.getFilename(), - File.getDirectory()); + File.getDirectory()); assert(FileID && "Invalid file id"); addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID); addUInt(Die, dwarf::DW_AT_decl_line, 0, Line); @@ -308,7 +308,8 @@ void CompileUnit::addComplexAddress(DbgVariable *&DV, DIE *Die, addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst); addUInt(Block, 0, dwarf::DW_FORM_udata, DV->getAddrElement(++i)); } else if (Element == DIBuilder::OpDeref) { - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref); + if (!Location.isReg()) + addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref); } else llvm_unreachable("unknown DIBuilder Opcode"); } @@ -418,27 +419,12 @@ void CompileUnit::addBlockByrefAddress(DbgVariable *&DV, DIE *Die, // Decode the original location, and use that as the start of the byref // variable's location. - const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo(); - unsigned Reg = RI->getDwarfRegNum(Location.getReg(), false); DIEBlock *Block = new (DIEValueAllocator) DIEBlock(); - if (Location.isReg()) { - if (Reg < 32) - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_reg0 + Reg); - else { - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_regx); - addUInt(Block, 0, dwarf::DW_FORM_udata, Reg); - } - } else { - if (Reg < 32) - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + Reg); - else { - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_bregx); - addUInt(Block, 0, dwarf::DW_FORM_udata, Reg); - } - - addUInt(Block, 0, dwarf::DW_FORM_sdata, Location.getOffset()); - } + if (Location.isReg()) + addRegisterOp(Block, Location.getReg()); + else + addRegisterOffset(Block, Location.getReg(), Location.getOffset()); // If we started with a pointer to the __Block_byref... struct, then // the first thing we need to do is dereference the pointer (DW_OP_deref). @@ -646,8 +632,7 @@ DIE *CompileUnit::getOrCreateTypeDIE(const MDNode *TyNode) { } /// addType - Add a new type attribute to the specified entity. 
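A little above, addBlockByrefAddress stops open-coding its DWARF expression bytes in favor of the shared addRegisterOp / addRegisterOffset helpers. The encoding rule those helpers implement, sketched standalone with real DWARF opcode values but a plain byte vector standing in for DIEBlock, and ULEB128/SLEB128 emission reduced to single bytes:

    #include <cstdint>
    #include <vector>

    enum : uint8_t {
      DW_OP_reg0  = 0x50, DW_OP_regx  = 0x90,
      DW_OP_breg0 = 0x70, DW_OP_bregx = 0x92
    };

    // Registers 0-31 get compact one-byte opcodes; larger numbers use
    // the forms that carry the register number as an operand.
    static void addRegisterOp(std::vector<uint8_t> &Expr, unsigned Reg) {
      if (Reg < 32) {
        Expr.push_back(DW_OP_reg0 + Reg);
      } else {
        Expr.push_back(DW_OP_regx);
        Expr.push_back((uint8_t)Reg);        // ULEB128 in real emission
      }
    }

    static void addRegisterOffset(std::vector<uint8_t> &Expr, unsigned Reg,
                                  int8_t Offset) {
      if (Reg < 32) {
        Expr.push_back(DW_OP_breg0 + Reg);
      } else {
        Expr.push_back(DW_OP_bregx);
        Expr.push_back((uint8_t)Reg);        // ULEB128 in real emission
      }
      Expr.push_back((uint8_t)Offset);       // SLEB128 in real emission
    }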
-void CompileUnit::addType(DIE *Entity, DIType Ty, - unsigned Attribute) { +void CompileUnit::addType(DIE *Entity, DIType Ty, unsigned Attribute) { if (!Ty.Verify()) return; @@ -776,6 +761,11 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) { Buffer.addChild(ElemDie); } } + DIType DTy = CTy.getTypeDerivedFrom(); + if (DTy.Verify()) { + addType(&Buffer, DTy); + addUInt(&Buffer, dwarf::DW_AT_enum_class, dwarf::DW_FORM_flag, 1); + } } break; case dwarf::DW_TAG_subroutine_type: { @@ -801,9 +791,9 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) { // Add prototype flag if we're dealing with a C language and the // function has been prototyped. if (isPrototyped && - (Language == dwarf::DW_LANG_C89 || - Language == dwarf::DW_LANG_C99 || - Language == dwarf::DW_LANG_ObjC)) + (Language == dwarf::DW_LANG_C89 || + Language == dwarf::DW_LANG_C99 || + Language == dwarf::DW_LANG_ObjC)) addUInt(&Buffer, dwarf::DW_AT_prototyped, dwarf::DW_FORM_flag, 1); } break; @@ -846,19 +836,19 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) { addUInt(ElemDie, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1); addSourceLine(ElemDie, DV); } else if (Element.isDerivedType()) { - DIDerivedType DDTy(Element); - if (DDTy.getTag() == dwarf::DW_TAG_friend) { - ElemDie = new DIE(dwarf::DW_TAG_friend); - addType(ElemDie, DDTy.getTypeDerivedFrom(), dwarf::DW_AT_friend); - } else - ElemDie = createMemberDIE(DIDerivedType(Element)); + DIDerivedType DDTy(Element); + if (DDTy.getTag() == dwarf::DW_TAG_friend) { + ElemDie = new DIE(dwarf::DW_TAG_friend); + addType(ElemDie, DDTy.getTypeDerivedFrom(), dwarf::DW_AT_friend); + } else + ElemDie = createMemberDIE(DIDerivedType(Element)); } else if (Element.isObjCProperty()) { DIObjCProperty Property(Element); ElemDie = new DIE(Property.getTag()); StringRef PropertyName = Property.getObjCPropertyName(); addString(ElemDie, dwarf::DW_AT_APPLE_property_name, PropertyName); - addType(ElemDie, Property.getType()); - addSourceLine(ElemDie, Property); + addType(ElemDie, Property.getType()); + addSourceLine(ElemDie, Property); StringRef GetterName = Property.getObjCPropertyGetterName(); if (!GetterName.empty()) addString(ElemDie, dwarf::DW_AT_APPLE_property_getter, GetterName); @@ -925,19 +915,21 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) { if (!Name.empty()) addString(&Buffer, dwarf::DW_AT_name, Name); - if (Tag == dwarf::DW_TAG_enumeration_type || Tag == dwarf::DW_TAG_class_type - || Tag == dwarf::DW_TAG_structure_type || Tag == dwarf::DW_TAG_union_type) - { + if (Tag == dwarf::DW_TAG_enumeration_type || + Tag == dwarf::DW_TAG_class_type || + Tag == dwarf::DW_TAG_structure_type || + Tag == dwarf::DW_TAG_union_type) { // Add size if non-zero (derived types might be zero-sized.) + // TODO: Do we care about size for enum forward declarations? if (Size) addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size); - else { + else if (!CTy.isForwardDecl()) // Add zero size if it is not a forward declaration. - if (CTy.isForwardDecl()) - addUInt(&Buffer, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1); - else - addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, 0); - } + addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, 0); + + // If we're a forward decl, say so. + if (CTy.isForwardDecl()) + addUInt(&Buffer, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1); // Add source line info if available. 
if (!CTy.isForwardDecl()) @@ -968,7 +960,7 @@ CompileUnit::getOrCreateTemplateTypeParameterDIE(DITemplateTypeParameter TP) { /// getOrCreateTemplateValueParameterDIE - Find existing DIE or create new DIE /// for the given DITemplateValueParameter. DIE * -CompileUnit::getOrCreateTemplateValueParameterDIE(DITemplateValueParameter TPV) { +CompileUnit::getOrCreateTemplateValueParameterDIE(DITemplateValueParameter TPV){ DIE *ParamDIE = getDIE(TPV); if (ParamDIE) return ParamDIE; @@ -1015,17 +1007,17 @@ DIE *CompileUnit::getOrCreateSubprogramDIE(DISubprogram SP) { if (SPDie) return SPDie; + SPDie = new DIE(dwarf::DW_TAG_subprogram); + + // DW_TAG_inlined_subroutine may refer to this DIE. + insertDIE(SP, SPDie); + DISubprogram SPDecl = SP.getFunctionDeclaration(); DIE *DeclDie = NULL; if (SPDecl.isSubprogram()) { DeclDie = getOrCreateSubprogramDIE(SPDecl); } - SPDie = new DIE(dwarf::DW_TAG_subprogram); - - // DW_TAG_inlined_subroutine may refer to this DIE. - insertDIE(SP, SPDie); - // Add to context owner. addToContextOwner(SPDie, SP.getContext()); @@ -1240,7 +1232,8 @@ void CompileUnit::createGlobalVariableDIE(const MDNode *N) { } /// constructSubrangeDIE - Construct subrange DIE from DISubrange. -void CompileUnit::constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy){ +void CompileUnit::constructSubrangeDIE(DIE &Buffer, DISubrange SR, + DIE *IndexTy) { DIE *DW_Subrange = new DIE(dwarf::DW_TAG_subrange_type); addDIEEntry(DW_Subrange, dwarf::DW_AT_type, dwarf::DW_FORM_ref4, IndexTy); uint64_t L = SR.getLo(); diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h index 45e407e..b4ff9e8 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h @@ -15,7 +15,7 @@ #define CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H #include "DIE.h" -#include "llvm/Analysis/DebugInfo.h" +#include "llvm/DebugInfo.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/OwningPtr.h" diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index cb78878..649684a 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -17,9 +17,10 @@ #include "DwarfAccelTable.h" #include "DwarfCompileUnit.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" +#include "llvm/DIBuilder.h" #include "llvm/Module.h" #include "llvm/Instructions.h" -#include "llvm/ADT/Triple.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/MC/MCAsmInfo.h" @@ -32,11 +33,10 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetOptions.h" -#include "llvm/Analysis/DebugInfo.h" -#include "llvm/Analysis/DIBuilder.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Triple.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -117,7 +117,6 @@ DIType DbgVariable::getType() const { if (getName() == DT.getName()) return (DT.getTypeDerivedFrom()); } - return Ty; } return Ty; } @@ -127,6 +126,7 @@ DIType DbgVariable::getType() const { DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M) : Asm(A), MMI(Asm->MMI), FirstCU(0), AbbreviationsSet(InitAbbreviationsSetSize), + SourceIdMap(DIEValueAllocator), StringPool(DIEValueAllocator), PrevLabel(NULL) { NextStringPoolNumber = 0; @@ -566,7 +566,7 @@ CompileUnit 
*DwarfDebug::constructCompileUnit(const MDNode *N) { NewCU->addUInt(Die, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, 0); // DW_AT_stmt_list is a offset of line number information for this // compile unit in debug_line section. - if (Asm->MAI->doesDwarfRequireRelocationForSectionOffset()) + if (Asm->MAI->doesDwarfUseRelocationsAcrossSections()) NewCU->addLabel(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, Asm->GetTempSymbol("section_line")); else @@ -1310,8 +1310,9 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { MOE = MI->operands_end(); MOI != MOE; ++MOI) { if (!MOI->isReg() || !MOI->isDef() || !MOI->getReg()) continue; - for (const uint16_t *AI = TRI->getOverlaps(MOI->getReg()); - unsigned Reg = *AI; ++AI) { + for (MCRegAliasIterator AI(MOI->getReg(), TRI, true); + AI.isValid(); ++AI) { + unsigned Reg = *AI; const MDNode *Var = LiveUserVar[Reg]; if (!Var) continue; @@ -1381,7 +1382,7 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { MF->getFunction()->getContext()); recordSourceLine(FnStartDL.getLine(), FnStartDL.getCol(), FnStartDL.getScope(MF->getFunction()->getContext()), - 0); + DWARF2_LINE_DEFAULT_IS_STMT ? DWARF2_FLAG_IS_STMT : 0); } } @@ -1421,6 +1422,12 @@ void DwarfDebug::endFunction(const MachineFunction *MF) { DIVariable DV(Variables.getElement(i)); if (!DV || !DV.Verify() || !ProcessedVars.insert(DV)) continue; + // Check that DbgVariable for DV wasn't created earlier, when + // findAbstractVariable() was called for inlined instance of DV. + LLVMContext &Ctx = DV->getContext(); + DIVariable CleanDV = cleanseInlinedVariable(DV, Ctx); + if (AbstractVariables.lookup(CleanDV)) + continue; if (LexicalScope *Scope = LScopes.findAbstractScope(DV.getContext())) addScopeVariable(Scope, new DbgVariable(DV, NULL)); } @@ -1623,7 +1630,7 @@ void DwarfDebug::emitDIE(DIE *Die) { // DW_AT_range Value encodes offset in debug_range section. 
DIEInteger *V = cast<DIEInteger>(Values[i]); - if (Asm->MAI->doesDwarfUseLabelOffsetForRanges()) { + if (Asm->MAI->doesDwarfUseRelocationsAcrossSections()) { Asm->EmitLabelPlusOffset(DwarfDebugRangeSectionSym, V->getValue(), 4); @@ -1636,10 +1643,14 @@ void DwarfDebug::emitDIE(DIE *Die) { break; } case dwarf::DW_AT_location: { - if (DIELabel *L = dyn_cast<DIELabel>(Values[i])) - Asm->EmitLabelDifference(L->getValue(), DwarfDebugLocSectionSym, 4); - else + if (DIELabel *L = dyn_cast<DIELabel>(Values[i])) { + if (Asm->MAI->doesDwarfUseRelocationsAcrossSections()) + Asm->EmitLabelReference(L->getValue(), 4); + else + Asm->EmitLabelDifference(L->getValue(), DwarfDebugLocSectionSym, 4); + } else { Values[i]->EmitValue(Asm, Form); + } break; } case dwarf::DW_AT_accessibility: { @@ -2049,9 +2060,11 @@ void DwarfDebug::emitDebugLoc() { if (Element == DIBuilder::OpPlus) { Asm->EmitInt8(dwarf::DW_OP_plus_uconst); Asm->EmitULEB128(DV.getAddrElement(++i)); - } else if (Element == DIBuilder::OpDeref) - Asm->EmitInt8(dwarf::DW_OP_deref); - else llvm_unreachable("unknown Opcode found in complex address"); + } else if (Element == DIBuilder::OpDeref) { + if (!Entry.Loc.isReg()) + Asm->EmitInt8(dwarf::DW_OP_deref); + } else + llvm_unreachable("unknown Opcode found in complex address"); } } } diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h index 83f30f5..d1d6512 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -14,11 +14,11 @@ #ifndef CODEGEN_ASMPRINTER_DWARFDEBUG_H__ #define CODEGEN_ASMPRINTER_DWARFDEBUG_H__ +#include "DIE.h" +#include "llvm/DebugInfo.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/LexicalScopes.h" #include "llvm/MC/MachineLocation.h" -#include "llvm/Analysis/DebugInfo.h" -#include "DIE.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/SmallPtrSet.h" @@ -188,6 +188,9 @@ class DwarfDebug { /// MMI - Collected machine module information. MachineModuleInfo *MMI; + /// DIEValueAllocator - All DIEValues are allocated through this allocator. + BumpPtrAllocator DIEValueAllocator; + //===--------------------------------------------------------------------===// // Attributes used to construct specific Dwarf sections. // @@ -210,11 +213,11 @@ class DwarfDebug { /// SourceIdMap - Source id map, i.e. pair of source filename and directory, /// separated by a zero byte, mapped to a unique id. - StringMap<unsigned> SourceIdMap; + StringMap<unsigned, BumpPtrAllocator&> SourceIdMap; /// StringPool - A String->Symbol mapping of strings used by indirect /// references. - StringMap<std::pair<MCSymbol*, unsigned> > StringPool; + StringMap<std::pair<MCSymbol*, unsigned>, BumpPtrAllocator&> StringPool; unsigned NextStringPoolNumber; /// SectionMap - Provides a unique id per text section. @@ -232,7 +235,7 @@ class DwarfDebug { /// ScopeVariables - Collection of dbg variables of a scope. DenseMap<LexicalScope *, SmallVector<DbgVariable *, 8> > ScopeVariables; - /// AbstractVariables - Collection on abstract variables. + /// AbstractVariables - Collection of abstract variables. DenseMap<const MDNode *, DbgVariable *> AbstractVariables; /// DotDebugLocEntries - Collection of DotDebugLocEntry. @@ -292,9 +295,6 @@ class DwarfDebug { std::vector<FunctionDebugFrameInfo> DebugFrames; - // DIEValueAllocator - All DIEValues are allocated through this allocator. 
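This DwarfDebug.h hunk moves DIEValueAllocator above the maps for a concrete reason: SourceIdMap and StringPool now take a BumpPtrAllocator& and C++ constructs members in declaration order, so the allocator must exist before the maps are built. A reduced illustration with stand-in types in place of BumpPtrAllocator and StringMap:

    #include <map>
    #include <string>

    struct Arena {};   // stand-in for BumpPtrAllocator

    // Stand-in for StringMap<V, BumpPtrAllocator&>: holds a reference to
    // the arena it allocates from.
    template <typename V> struct ArenaStringMap {
      explicit ArenaStringMap(Arena &A) : A(A) {}
      Arena &A;
      std::map<std::string, V> Impl;
    };

    struct DwarfDebugStub {
      Arena DIEValueAllocator;                // must be declared first
      ArenaStringMap<unsigned> SourceIdMap;   // references the arena above
      DwarfDebugStub() : SourceIdMap(DIEValueAllocator) {}
    };

    int main() { DwarfDebugStub D; (void)D; return 0; }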
- BumpPtrAllocator DIEValueAllocator; - // Section Symbols: these are assembler temporary labels that are emitted at // the beginning of each supported dwarf section. These are used to form // section offsets and are created by EmitSectionLabels. @@ -333,9 +333,6 @@ private: /// of the function. DIE *constructInlinedScopeDIE(CompileUnit *TheCU, LexicalScope *Scope); - /// constructVariableDIE - Construct a DIE for the given DbgVariable. - DIE *constructVariableDIE(DbgVariable *DV, LexicalScope *S); - /// constructScopeDIE - Construct a DIE for this scope. DIE *constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope); @@ -517,9 +514,6 @@ public: /// in the SourceIds map. unsigned GetOrCreateSourceID(StringRef DirName, StringRef FullName); - /// createSubprogramDIE - Create new DIE using SP. - DIE *createSubprogramDIE(DISubprogram SP); - /// getStringPool - returns the entry into the start of the pool. MCSymbol *getStringPool(); diff --git a/lib/CodeGen/AsmPrinter/DwarfException.h b/lib/CodeGen/AsmPrinter/DwarfException.h index b5f86ab..75f6056 100644 --- a/lib/CodeGen/AsmPrinter/DwarfException.h +++ b/lib/CodeGen/AsmPrinter/DwarfException.h @@ -175,17 +175,6 @@ public: }; class ARMException : public DwarfException { - /// shouldEmitTable - Per-function flag to indicate if EH tables should - /// be emitted. - bool shouldEmitTable; - - /// shouldEmitMoves - Per-function flag to indicate if frame moves info - /// should be emitted. - bool shouldEmitMoves; - - /// shouldEmitTableModule - Per-module flag to indicate if EH tables - /// should be emitted. - bool shouldEmitTableModule; public: //===--------------------------------------------------------------------===// // Main entry points. diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index ef1d2ba..fb65bb7 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -137,9 +137,8 @@ bool BranchFolder::OptimizeImpDefsBlock(MachineBasicBlock *MBB) { break; unsigned Reg = I->getOperand(0).getReg(); ImpDefRegs.insert(Reg); - for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg); - unsigned SubReg = *SubRegs; ++SubRegs) - ImpDefRegs.insert(SubReg); + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + ImpDefRegs.insert(*SubRegs); ++I; } if (ImpDefRegs.empty()) @@ -188,7 +187,7 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF, // Use a RegScavenger to help update liveness when required. MachineRegisterInfo &MRI = MF.getRegInfo(); - if (MRI.tracksLiveness() && TRI->requiresRegisterScavenging(MF)) + if (MRI.tracksLiveness() && TRI->trackLivenessAfterRegAlloc(MF)) RS = new RegScavenger(); else MRI.invalidateLiveness(); @@ -819,10 +818,8 @@ bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB, } bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { - - if (!EnableTailMerge) return false; - bool MadeChange = false; + if (!EnableTailMerge) return MadeChange; // First find blocks with no successors. MergePotentials.clear(); @@ -839,6 +836,7 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { if (MergePotentials.size() == TailMergeThreshold) for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i) TriedMerging.insert(MergePotentials[i].getBlock()); + // See if we can do any tail merging on those. 
if (MergePotentials.size() >= 2) MadeChange |= TryTailMergeBlocks(NULL, NULL); @@ -864,88 +862,97 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end(); I != E; ++I) { - if (I->pred_size() >= 2) { - SmallPtrSet<MachineBasicBlock *, 8> UniquePreds; - MachineBasicBlock *IBB = I; - MachineBasicBlock *PredBB = prior(I); - MergePotentials.clear(); - for (MachineBasicBlock::pred_iterator P = I->pred_begin(), - E2 = I->pred_end(); - P != E2 && MergePotentials.size() < TailMergeThreshold; ++P) { - MachineBasicBlock *PBB = *P; - if (TriedMerging.count(PBB)) - continue; - // Skip blocks that loop to themselves, can't tail merge these. - if (PBB == IBB) - continue; - // Visit each predecessor only once. - if (!UniquePreds.insert(PBB)) - continue; - // Skip blocks which may jump to a landing pad. Can't tail merge these. - if (PBB->getLandingPadSuccessor()) - continue; - MachineBasicBlock *TBB = 0, *FBB = 0; - SmallVector<MachineOperand, 4> Cond; - if (!TII->AnalyzeBranch(*PBB, TBB, FBB, Cond, true)) { - // Failing case: IBB is the target of a cbr, and - // we cannot reverse the branch. - SmallVector<MachineOperand, 4> NewCond(Cond); - if (!Cond.empty() && TBB == IBB) { - if (TII->ReverseBranchCondition(NewCond)) + if (I->pred_size() < 2) continue; + SmallPtrSet<MachineBasicBlock *, 8> UniquePreds; + MachineBasicBlock *IBB = I; + MachineBasicBlock *PredBB = prior(I); + MergePotentials.clear(); + for (MachineBasicBlock::pred_iterator P = I->pred_begin(), + E2 = I->pred_end(); + P != E2 && MergePotentials.size() < TailMergeThreshold; ++P) { + MachineBasicBlock *PBB = *P; + if (TriedMerging.count(PBB)) + continue; + + // Skip blocks that loop to themselves, can't tail merge these. + if (PBB == IBB) + continue; + + // Visit each predecessor only once. + if (!UniquePreds.insert(PBB)) + continue; + + // Skip blocks which may jump to a landing pad. Can't tail merge these. + if (PBB->getLandingPadSuccessor()) + continue; + + MachineBasicBlock *TBB = 0, *FBB = 0; + SmallVector<MachineOperand, 4> Cond; + if (!TII->AnalyzeBranch(*PBB, TBB, FBB, Cond, true)) { + // Failing case: IBB is the target of a cbr, and we cannot reverse the + // branch. + SmallVector<MachineOperand, 4> NewCond(Cond); + if (!Cond.empty() && TBB == IBB) { + if (TII->ReverseBranchCondition(NewCond)) + continue; + // This is the QBB case described above + if (!FBB) + FBB = llvm::next(MachineFunction::iterator(PBB)); + } + + // Failing case: the only way IBB can be reached from PBB is via + // exception handling. Happens for landing pads. Would be nice to have + // a bit in the edge so we didn't have to do all this. + if (IBB->isLandingPad()) { + MachineFunction::iterator IP = PBB; IP++; + MachineBasicBlock *PredNextBB = NULL; + if (IP != MF.end()) + PredNextBB = IP; + if (TBB == NULL) { + if (IBB != PredNextBB) // fallthrough + continue; + } else if (FBB) { + if (TBB != IBB && FBB != IBB) // cbr then ubr + continue; + } else if (Cond.empty()) { + if (TBB != IBB) // ubr + continue; + } else { + if (TBB != IBB && IBB != PredNextBB) // cbr continue; - // This is the QBB case described above - if (!FBB) - FBB = llvm::next(MachineFunction::iterator(PBB)); - } - // Failing case: the only way IBB can be reached from PBB is via - // exception handling. Happens for landing pads. Would be nice - // to have a bit in the edge so we didn't have to do all this. 
- if (IBB->isLandingPad()) { - MachineFunction::iterator IP = PBB; IP++; - MachineBasicBlock *PredNextBB = NULL; - if (IP != MF.end()) - PredNextBB = IP; - if (TBB == NULL) { - if (IBB != PredNextBB) // fallthrough - continue; - } else if (FBB) { - if (TBB != IBB && FBB != IBB) // cbr then ubr - continue; - } else if (Cond.empty()) { - if (TBB != IBB) // ubr - continue; - } else { - if (TBB != IBB && IBB != PredNextBB) // cbr - continue; - } - } - // Remove the unconditional branch at the end, if any. - if (TBB && (Cond.empty() || FBB)) { - DebugLoc dl; // FIXME: this is nowhere - TII->RemoveBranch(*PBB); - if (!Cond.empty()) - // reinsert conditional branch only, for now - TII->InsertBranch(*PBB, (TBB == IBB) ? FBB : TBB, 0, NewCond, dl); } - MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(PBB), *P)); } + + // Remove the unconditional branch at the end, if any. + if (TBB && (Cond.empty() || FBB)) { + DebugLoc dl; // FIXME: this is nowhere + TII->RemoveBranch(*PBB); + if (!Cond.empty()) + // reinsert conditional branch only, for now + TII->InsertBranch(*PBB, (TBB == IBB) ? FBB : TBB, 0, NewCond, dl); + } + + MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(PBB), *P)); } - // If this is a large problem, avoid visiting the same basic blocks - // multiple times. - if (MergePotentials.size() == TailMergeThreshold) - for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i) - TriedMerging.insert(MergePotentials[i].getBlock()); - if (MergePotentials.size() >= 2) - MadeChange |= TryTailMergeBlocks(IBB, PredBB); - // Reinsert an unconditional branch if needed. - // The 1 below can occur as a result of removing blocks in - // TryTailMergeBlocks. - PredBB = prior(I); // this may have been changed in TryTailMergeBlocks - if (MergePotentials.size() == 1 && - MergePotentials.begin()->getBlock() != PredBB) - FixTail(MergePotentials.begin()->getBlock(), IBB, TII); } + + // If this is a large problem, avoid visiting the same basic blocks multiple + // times. + if (MergePotentials.size() == TailMergeThreshold) + for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i) + TriedMerging.insert(MergePotentials[i].getBlock()); + + if (MergePotentials.size() >= 2) + MadeChange |= TryTailMergeBlocks(IBB, PredBB); + + // Reinsert an unconditional branch if needed. The 1 below can occur as a + // result of removing blocks in TryTailMergeBlocks. + PredBB = prior(I); // this may have been changed in TryTailMergeBlocks + if (MergePotentials.size() == 1 && + MergePotentials.begin()->getBlock() != PredBB) + FixTail(MergePotentials.begin()->getBlock(), IBB, TII); } + return MadeChange; } @@ -1459,7 +1466,7 @@ static MachineBasicBlock *findFalseBlock(MachineBasicBlock *BB, } /// findHoistingInsertPosAndDeps - Find the location to move common instructions -/// in successors to. The location is ususally just before the terminator, +/// in successors to. The location is usually just before the terminator, /// however if the terminator is a conditional branch and its previous /// instruction is the flag setting instruction, the previous instruction is /// the preferred location. 
This function also gathers uses and defs of the @@ -1483,9 +1490,8 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB, if (!Reg) continue; if (MO.isUse()) { - Uses.insert(Reg); - for (const uint16_t *AS = TRI->getAliasSet(Reg); *AS; ++AS) - Uses.insert(*AS); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + Uses.insert(*AI); } else if (!MO.isDead()) // Don't try to hoist code in the rare case the terminator defines a // register that is later used. @@ -1545,18 +1551,16 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB, if (!Reg) continue; if (MO.isUse()) { - Uses.insert(Reg); - for (const uint16_t *AS = TRI->getAliasSet(Reg); *AS; ++AS) - Uses.insert(*AS); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + Uses.insert(*AI); } else { if (Uses.count(Reg)) { Uses.erase(Reg); - for (const uint16_t *SR = TRI->getSubRegisters(Reg); *SR; ++SR) - Uses.erase(*SR); // Use getSubRegisters to be conservative + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + Uses.erase(*SubRegs); // Use sub-registers to be conservative } - Defs.insert(Reg); - for (const uint16_t *AS = TRI->getAliasSet(Reg); *AS; ++AS) - Defs.insert(*AS); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + Defs.insert(*AI); } } @@ -1683,8 +1687,8 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { unsigned Reg = MO.getReg(); if (!Reg || !LocalDefsSet.count(Reg)) continue; - for (const uint16_t *OR = TRI->getOverlaps(Reg); *OR; ++OR) - LocalDefsSet.erase(*OR); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + LocalDefsSet.erase(*AI); } // Track local defs so we can update liveins. @@ -1696,8 +1700,8 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { if (!Reg) continue; LocalDefs.push_back(Reg); - for (const uint16_t *OR = TRI->getOverlaps(Reg); *OR; ++OR) - LocalDefsSet.insert(*OR); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + LocalDefsSet.insert(*AI); } HasDups = true; diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt index 21729cd..d240389 100644 --- a/lib/CodeGen/CMakeLists.txt +++ b/lib/CodeGen/CMakeLists.txt @@ -11,6 +11,7 @@ add_llvm_library(LLVMCodeGen DeadMachineInstructionElim.cpp DFAPacketizer.cpp DwarfEHPrepare.cpp + EarlyIfConversion.cpp EdgeBundles.cpp ExecutionDepsFix.cpp ExpandISelPseudos.cpp @@ -30,6 +31,7 @@ add_llvm_library(LLVMCodeGen LiveInterval.cpp LiveIntervalAnalysis.cpp LiveIntervalUnion.cpp + LiveRegMatrix.cpp LiveStackAnalysis.cpp LiveVariables.cpp LiveRangeCalc.cpp @@ -77,8 +79,8 @@ add_llvm_library(LLVMCodeGen RegAllocPBQP.cpp RegisterClassInfo.cpp RegisterCoalescer.cpp + RegisterPressure.cpp RegisterScavenging.cpp - RenderMachineFunction.cpp ScheduleDAG.cpp ScheduleDAGInstrs.cpp ScheduleDAGPrinter.cpp @@ -103,5 +105,7 @@ add_llvm_library(LLVMCodeGen VirtRegMap.cpp ) +add_dependencies(LLVMCodeGen intrinsics_gen) + add_subdirectory(SelectionDAG) add_subdirectory(AsmPrinter) diff --git a/lib/CodeGen/CalcSpillWeights.cpp b/lib/CodeGen/CalcSpillWeights.cpp index ea16a25..939af3f 100644 --- a/lib/CodeGen/CalcSpillWeights.cpp +++ b/lib/CodeGen/CalcSpillWeights.cpp @@ -39,18 +39,20 @@ void CalculateSpillWeights::getAnalysisUsage(AnalysisUsage &au) const { MachineFunctionPass::getAnalysisUsage(au); } -bool CalculateSpillWeights::runOnMachineFunction(MachineFunction &fn) { +bool CalculateSpillWeights::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "********** Compute 
Spill Weights **********\n" << "********** Function: " - << fn.getFunction()->getName() << '\n'); - - LiveIntervals &lis = getAnalysis<LiveIntervals>(); - VirtRegAuxInfo vrai(fn, lis, getAnalysis<MachineLoopInfo>()); - for (LiveIntervals::iterator I = lis.begin(), E = lis.end(); I != E; ++I) { - LiveInterval &li = *I->second; - if (TargetRegisterInfo::isVirtualRegister(li.reg)) - vrai.CalculateWeightAndHint(li); + << MF.getFunction()->getName() << '\n'); + + LiveIntervals &LIS = getAnalysis<LiveIntervals>(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + VirtRegAuxInfo VRAI(MF, LIS, getAnalysis<MachineLoopInfo>()); + for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) { + unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + if (MRI.reg_nodbg_empty(Reg)) + continue; + VRAI.CalculateWeightAndHint(LIS.getInterval(Reg)); } return false; } @@ -86,6 +88,27 @@ static unsigned copyHint(const MachineInstr *mi, unsigned reg, return tri.getMatchingSuperReg(hreg, sub, rc); } +// Check if all values in LI are rematerializable +static bool isRematerializable(const LiveInterval &LI, + const LiveIntervals &LIS, + const TargetInstrInfo &TII) { + for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end(); + I != E; ++I) { + const VNInfo *VNI = *I; + if (VNI->isUnused()) + continue; + if (VNI->isPHIDef()) + return false; + + MachineInstr *MI = LIS.getInstructionFromIndex(VNI->def); + assert(MI && "Dead valno in interval"); + + if (!TII.isTriviallyReMaterializable(MI, LIS.getAliasAnalysis())) + return false; + } + return true; +} + void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) { MachineRegisterInfo &mri = MF.getRegInfo(); const TargetRegisterInfo &tri = *MF.getTarget().getRegisterInfo(); @@ -171,17 +194,11 @@ void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) { } // If all of the definitions of the interval are re-materializable, - // it is a preferred candidate for spilling. If none of the defs are - // loads, then it's potentially very cheap to re-materialize. + // it is a preferred candidate for spilling. // FIXME: this gets much more complicated once we support non-trivial // re-materialization. - bool isLoad = false; - if (LIS.isReMaterializable(li, 0, isLoad)) { - if (isLoad) - totalWeight *= 0.9F; - else - totalWeight *= 0.5F; - } + if (isRematerializable(li, LIS, *MF.getTarget().getInstrInfo())) + totalWeight *= 0.5F; li.weight = normalizeSpillWeight(totalWeight, li.getSize()); } diff --git a/lib/CodeGen/CallingConvLower.cpp b/lib/CodeGen/CallingConvLower.cpp index 2b7dfdb..0b747fd 100644 --- a/lib/CodeGen/CallingConvLower.cpp +++ b/lib/CodeGen/CallingConvLower.cpp @@ -49,8 +49,7 @@ void CCState::HandleByVal(unsigned ValNo, MVT ValVT, Size = MinSize; if (MinAlign > (int)Align) Align = MinAlign; - if (MF.getFrameInfo()->getMaxAlignment() < Align) - MF.getFrameInfo()->setMaxAlignment(Align); + MF.getFrameInfo()->ensureMaxAlignment(Align); TM.getTargetLowering()->HandleByVal(this, Size); unsigned Offset = AllocateStack(Size, Align); addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); @@ -58,9 +57,8 @@ void CCState::HandleByVal(unsigned ValNo, MVT ValVT, /// MarkAllocated - Mark a register and all of its aliases as allocated. 
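The new isRematerializable walk above is self-contained enough to restate: every live value number must be defined by an instruction the target can trivially recompute, and PHI-defined values are rejected outright since no single instruction produces them. A mirror of that structure over stub value-number records:

    #include <vector>

    // Stub VNInfo: whether the value is unused, PHI-defined, and whether
    // its defining instruction is trivially rematerializable.
    struct VNInfoStub {
      bool Unused;
      bool PHIDef;
      bool TriviallyRemat;
    };

    static bool isRematerializable(const std::vector<VNInfoStub> &VNIs) {
      for (const VNInfoStub &VNI : VNIs) {
        if (VNI.Unused)
          continue;                  // dead value numbers don't matter
        if (VNI.PHIDef || !VNI.TriviallyRemat)
          return false;
      }
      return true;
    }

If the check passes, the spill weight is simply halved; the old distinction that gave loads a milder 0.9 discount disappears along with LiveIntervals::isReMaterializable.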
void CCState::MarkAllocated(unsigned Reg) { - for (const uint16_t *Alias = TRI.getOverlaps(Reg); - unsigned Reg = *Alias; ++Alias) - UsedRegs[Reg/32] |= 1 << (Reg&31); + for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI) + UsedRegs[*AI/32] |= 1 << (*AI&31); } /// AnalyzeFormalArguments - Analyze an array of argument values, diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp index a81bb5c..fb2c2e8 100644 --- a/lib/CodeGen/CodeGen.cpp +++ b/lib/CodeGen/CodeGen.cpp @@ -23,6 +23,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeCalculateSpillWeightsPass(Registry); initializeCodePlacementOptPass(Registry); initializeDeadMachineInstructionElimPass(Registry); + initializeEarlyIfConverterPass(Registry); initializeExpandPostRAPass(Registry); initializeExpandISelPseudosPass(Registry); initializeFinalizeMachineBundlesPass(Registry); @@ -53,7 +54,6 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeProcessImplicitDefsPass(Registry); initializePEIPass(Registry); initializeRegisterCoalescerPass(Registry); - initializeRenderMachineFunctionPass(Registry); initializeSlotIndexesPass(Registry); initializeStackProtectorPass(Registry); initializeStackSlotColoringPass(Registry); @@ -65,7 +65,9 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeUnreachableBlockElimPass(Registry); initializeUnreachableMachineBlockElimPass(Registry); initializeVirtRegMapPass(Registry); + initializeVirtRegRewriterPass(Registry); initializeLowerIntrinsicsPass(Registry); + initializeMachineFunctionPrinterPassPass(Registry); } void LLVMInitializeCodeGen(LLVMPassRegistryRef R) { diff --git a/lib/CodeGen/CodePlacementOpt.cpp b/lib/CodeGen/CodePlacementOpt.cpp index c13c05e..99233df 100644 --- a/lib/CodeGen/CodePlacementOpt.cpp +++ b/lib/CodeGen/CodePlacementOpt.cpp @@ -201,7 +201,7 @@ bool CodePlacementOpt::EliminateUnconditionalJumpsToTop(MachineFunction &MF, // fallthrough edge. if (!Prior->isSuccessor(End)) goto next_pred; - // Otherwise we can stop scanning and procede to move the blocks. + // Otherwise we can stop scanning and proceed to move the blocks. break; } // If we hit a switch or something complicated, don't move anything diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp index bad5010..a9de1c7 100644 --- a/lib/CodeGen/CriticalAntiDepBreaker.cpp +++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp @@ -62,17 +62,11 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { // In a return block, examine the function live-out regs. for (MachineRegisterInfo::liveout_iterator I = MRI.liveout_begin(), E = MRI.liveout_end(); I != E; ++I) { - unsigned Reg = *I; - Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); - KillIndices[Reg] = BBSize; - DefIndices[Reg] = ~0u; - - // Repeat, for all aliases. 
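MarkAllocated's bookkeeping, now applied to every alias through the iterator, packs one bit per register into 32-bit words: word index Reg/32, bit index Reg&31. The packing as a standalone sketch:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    static void markAllocated(std::vector<uint32_t> &Used, unsigned Reg) {
      Used[Reg / 32] |= 1u << (Reg & 31);
    }

    static bool isAllocated(const std::vector<uint32_t> &Used, unsigned Reg) {
      return (Used[Reg / 32] >> (Reg & 31)) & 1u;
    }

    int main() {
      std::vector<uint32_t> Used(8, 0);   // room for 256 registers
      markAllocated(Used, 37);            // word 1, bit 5
      assert(isAllocated(Used, 37) && !isAllocated(Used, 36));
      return 0;
    }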
- for (const uint16_t *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { - unsigned AliasReg = *Alias; - Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1); - KillIndices[AliasReg] = BBSize; - DefIndices[AliasReg] = ~0u; + for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) { + unsigned Reg = *AI; + Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); + KillIndices[Reg] = BBSize; + DefIndices[Reg] = ~0u; } } } @@ -84,17 +78,11 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { SE = BB->succ_end(); SI != SE; ++SI) for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(), E = (*SI)->livein_end(); I != E; ++I) { - unsigned Reg = *I; - Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); - KillIndices[Reg] = BBSize; - DefIndices[Reg] = ~0u; - - // Repeat, for all aliases. - for (const uint16_t *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { - unsigned AliasReg = *Alias; - Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1); - KillIndices[AliasReg] = BBSize; - DefIndices[AliasReg] = ~0u; + for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) { + unsigned Reg = *AI; + Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); + KillIndices[Reg] = BBSize; + DefIndices[Reg] = ~0u; } } @@ -104,18 +92,12 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { const MachineFrameInfo *MFI = MF.getFrameInfo(); BitVector Pristine = MFI->getPristineRegs(BB); for (const uint16_t *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) { - unsigned Reg = *I; - if (!IsReturnBlock && !Pristine.test(Reg)) continue; - Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); - KillIndices[Reg] = BBSize; - DefIndices[Reg] = ~0u; - - // Repeat, for all aliases. - for (const uint16_t *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { - unsigned AliasReg = *Alias; - Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1); - KillIndices[AliasReg] = BBSize; - DefIndices[AliasReg] = ~0u; + if (!IsReturnBlock && !Pristine.test(*I)) continue; + for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) { + unsigned Reg = *AI; + Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); + KillIndices[Reg] = BBSize; + DefIndices[Reg] = ~0u; } } } @@ -208,7 +190,7 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr *MI) { const TargetRegisterClass *NewRC = 0; if (i < MI->getDesc().getNumOperands()) - NewRC = TII->getRegClass(MI->getDesc(), i, TRI); + NewRC = TII->getRegClass(MI->getDesc(), i, TRI, MF); // For now, only allow the register to be changed if its register // class is consistent across all uses. @@ -218,11 +200,11 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr *MI) { Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); // Now check for aliases. - for (const uint16_t *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { + for (MCRegAliasIterator AI(Reg, TRI, false); AI.isValid(); ++AI) { // If an alias of the reg is used during the live range, give up. // Note that this allows us to skip checking if AntiDepReg // overlaps with any of the aliases, among other things. 
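Throughout CriticalAntiDepBreaker, reinterpret_cast<TargetRegisterClass *>(-1) serves as a sentinel meaning the register's class is inconsistent across uses, so renaming is off the table. The merge rule behind all those assignments, sketched with a stand-in class type:

    struct RCStub {};   // stand-in for TargetRegisterClass

    static RCStub *const ConflictRC = reinterpret_cast<RCStub *>(-1);

    // nullptr = not seen yet; ConflictRC = classes disagree across uses.
    static RCStub *mergeRegClass(RCStub *Seen, RCStub *New) {
      if (!Seen)
        return New;          // first reference decides the class
      if (Seen == New)
        return Seen;         // still consistent
      return ConflictRC;     // mixed classes: never rename this register
    }

    int main() {
      RCStub GPR, FPR;
      RCStub *C = mergeRegClass(nullptr, &GPR);
      C = mergeRegClass(C, &FPR);
      return C == ConflictRC ? 0 : 1;
    }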
- unsigned AliasReg = *Alias; + unsigned AliasReg = *AI; if (Classes[AliasReg]) { Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1); Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); @@ -236,9 +218,8 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr *MI) { if (MO.isUse() && Special) { if (!KeepRegs.test(Reg)) { KeepRegs.set(Reg); - for (const uint16_t *Subreg = TRI->getSubRegisters(Reg); - *Subreg; ++Subreg) - KeepRegs.set(*Subreg); + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + KeepRegs.set(*SubRegs); } } } @@ -247,7 +228,7 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr *MI) { void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI, unsigned Count) { // Update liveness. - // Proceding upwards, registers that are defed but not used in this + // Proceeding upwards, registers that are defed but not used in this // instruction are now dead. if (!TII->isPredicated(MI)) { @@ -282,9 +263,8 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI, Classes[Reg] = 0; RegRefs.erase(Reg); // Repeat, for all subregs. - for (const uint16_t *Subreg = TRI->getSubRegisters(Reg); - *Subreg; ++Subreg) { - unsigned SubregReg = *Subreg; + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { + unsigned SubregReg = *SubRegs; DefIndices[SubregReg] = Count; KillIndices[SubregReg] = ~0u; KeepRegs.reset(SubregReg); @@ -292,11 +272,8 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI, RegRefs.erase(SubregReg); } // Conservatively mark super-registers as unusable. - for (const uint16_t *Super = TRI->getSuperRegisters(Reg); - *Super; ++Super) { - unsigned SuperReg = *Super; - Classes[SuperReg] = reinterpret_cast<TargetRegisterClass *>(-1); - } + for (MCSuperRegIterator SR(Reg, TRI); SR.isValid(); ++SR) + Classes[*SR] = reinterpret_cast<TargetRegisterClass *>(-1); } } for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { @@ -308,7 +285,7 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI, const TargetRegisterClass *NewRC = 0; if (i < MI->getDesc().getNumOperands()) - NewRC = TII->getRegClass(MI->getDesc(), i, TRI); + NewRC = TII->getRegClass(MI->getDesc(), i, TRI, MF); // For now, only allow the register to be changed if its register // class is consistent across all uses. @@ -328,8 +305,8 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI, "Kill and Def maps aren't consistent for Reg!"); } // Repeat, for all aliases. 
- for (const uint16_t *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { - unsigned AliasReg = *Alias; + for (MCRegAliasIterator AI(Reg, TRI, false); AI.isValid(); ++AI) { + unsigned AliasReg = *AI; if (KillIndices[AliasReg] == ~0u) { KillIndices[AliasReg] = Count; DefIndices[AliasReg] = ~0u; diff --git a/lib/CodeGen/CriticalAntiDepBreaker.h b/lib/CodeGen/CriticalAntiDepBreaker.h index 7746259..ad95c48 100644 --- a/lib/CodeGen/CriticalAntiDepBreaker.h +++ b/lib/CodeGen/CriticalAntiDepBreaker.h @@ -17,11 +17,11 @@ #define LLVM_CODEGEN_CRITICALANTIDEPBREAKER_H #include "AntiDepBreaker.h" -#include "RegisterClassInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/ADT/BitVector.h" #include <map> diff --git a/lib/CodeGen/DFAPacketizer.cpp b/lib/CodeGen/DFAPacketizer.cpp index 5ff641c..ff2f113 100644 --- a/lib/CodeGen/DFAPacketizer.cpp +++ b/lib/CodeGen/DFAPacketizer.cpp @@ -23,10 +23,10 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/CodeGen/DFAPacketizer.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBundle.h" -#include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/MC/MCInstrItineraries.h" using namespace llvm; @@ -100,22 +100,23 @@ void DFAPacketizer::reserveResources(llvm::MachineInstr *MI) { reserveResources(&MID); } -namespace { +namespace llvm { // DefaultVLIWScheduler - This class extends ScheduleDAGInstrs and overrides // Schedule method to build the dependence graph. class DefaultVLIWScheduler : public ScheduleDAGInstrs { public: DefaultVLIWScheduler(MachineFunction &MF, MachineLoopInfo &MLI, - MachineDominatorTree &MDT, bool IsPostRA); + MachineDominatorTree &MDT, bool IsPostRA); // Schedule - Actual scheduling work. void schedule(); }; -} // end anonymous namespace +} DefaultVLIWScheduler::DefaultVLIWScheduler( MachineFunction &MF, MachineLoopInfo &MLI, MachineDominatorTree &MDT, bool IsPostRA) : ScheduleDAGInstrs(MF, MLI, MDT, IsPostRA) { + CanHandleTerminators = true; } void DefaultVLIWScheduler::schedule() { @@ -129,49 +130,25 @@ VLIWPacketizerList::VLIWPacketizerList( bool IsPostRA) : TM(MF.getTarget()), MF(MF) { TII = TM.getInstrInfo(); ResourceTracker = TII->CreateTargetScheduleState(&TM, 0); - SchedulerImpl = new DefaultVLIWScheduler(MF, MLI, MDT, IsPostRA); + VLIWScheduler = new DefaultVLIWScheduler(MF, MLI, MDT, IsPostRA); } // VLIWPacketizerList Dtor VLIWPacketizerList::~VLIWPacketizerList() { - delete SchedulerImpl; - delete ResourceTracker; -} - -// ignorePseudoInstruction - ignore pseudo instructions. -bool VLIWPacketizerList::ignorePseudoInstruction(MachineInstr *MI, - MachineBasicBlock *MBB) { - if (MI->isDebugValue()) - return true; - - if (TII->isSchedulingBoundary(MI, MBB, MF)) - return true; - - return false; -} - -// isSoloInstruction - return true if instruction I must end previous -// packet. -bool VLIWPacketizerList::isSoloInstruction(MachineInstr *I) { - if (I->isInlineAsm()) - return true; - - return false; -} + if (VLIWScheduler) + delete VLIWScheduler; -// addToPacket - Add I to the current packet and reserve resource. 
-void VLIWPacketizerList::addToPacket(MachineInstr *MI) { - CurrentPacketMIs.push_back(MI); - ResourceTracker->reserveResources(MI); + if (ResourceTracker) + delete ResourceTracker; } // endPacket - End the current packet, bundle packet instructions and reset // DFA state. void VLIWPacketizerList::endPacket(MachineBasicBlock *MBB, - MachineInstr *I) { + MachineInstr *MI) { if (CurrentPacketMIs.size() > 1) { MachineInstr *MIFirst = CurrentPacketMIs.front(); - finalizeBundle(*MBB, MIFirst, I); + finalizeBundle(*MBB, MIFirst, MI); } CurrentPacketMIs.clear(); ResourceTracker->clearResources(); @@ -181,31 +158,35 @@ void VLIWPacketizerList::endPacket(MachineBasicBlock *MBB, void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB, MachineBasicBlock::iterator BeginItr, MachineBasicBlock::iterator EndItr) { - assert(MBB->end() == EndItr && "Bad EndIndex"); - - SchedulerImpl->enterRegion(MBB, BeginItr, EndItr, MBB->size()); - - // Build the DAG without reordering instructions. - SchedulerImpl->schedule(); - - // Remember scheduling units. - SUnits = SchedulerImpl->SUnits; + assert(VLIWScheduler && "VLIW Scheduler is not initialized!"); + VLIWScheduler->startBlock(MBB); + VLIWScheduler->enterRegion(MBB, BeginItr, EndItr, MBB->size()); + VLIWScheduler->schedule(); + + // Generate MI -> SU map. + MIToSUnit.clear(); + for (unsigned i = 0, e = VLIWScheduler->SUnits.size(); i != e; ++i) { + SUnit *SU = &VLIWScheduler->SUnits[i]; + MIToSUnit[SU->getInstr()] = SU; + } // The main packetizer loop. for (; BeginItr != EndItr; ++BeginItr) { MachineInstr *MI = BeginItr; - // Ignore pseudo instructions. - if (ignorePseudoInstruction(MI, MBB)) - continue; + this->initPacketizerState(); // End the current packet if needed. - if (isSoloInstruction(MI)) { + if (this->isSoloInstruction(MI)) { endPacket(MBB, MI); continue; } - SUnit *SUI = SchedulerImpl->getSUnit(MI); + // Ignore pseudo instructions. + if (this->ignorePseudoInstruction(MI, MBB)) + continue; + + SUnit *SUI = MIToSUnit[MI]; assert(SUI && "Missing SUnit Info!"); // Ask DFA if machine resource is available for MI. @@ -215,13 +196,13 @@ void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB, for (std::vector<MachineInstr*>::iterator VI = CurrentPacketMIs.begin(), VE = CurrentPacketMIs.end(); VI != VE; ++VI) { MachineInstr *MJ = *VI; - SUnit *SUJ = SchedulerImpl->getSUnit(MJ); + SUnit *SUJ = MIToSUnit[MJ]; assert(SUJ && "Missing SUnit Info!"); // Is it legal to packetize SUI and SUJ together. - if (!isLegalToPacketizeTogether(SUI, SUJ)) { + if (!this->isLegalToPacketizeTogether(SUI, SUJ)) { // Allow packetization if dependency can be pruned. - if (!isLegalToPruneDependencies(SUI, SUJ)) { + if (!this->isLegalToPruneDependencies(SUI, SUJ)) { // End the packet if dependency cannot be pruned. endPacket(MBB, MI); break; @@ -234,11 +215,11 @@ void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB, } // Add MI to the current packet. - addToPacket(MI); + BeginItr = this->addToPacket(MI); } // For all instructions in BB. // End any packet left behind. 
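// Sketch, not from this patch: the rewritten PacketizeMIs loop below drives
// packetization purely through the virtual hooks, so a minimal target
// packetizer only overrides the queries. The class name and the dependence
// test are illustrative; the base constructor arguments are assumed from
// their use in this file.
//
//   class MyTargetPacketizer : public VLIWPacketizerList {
//   public:
//     MyTargetPacketizer(MachineFunction &MF, MachineLoopInfo &MLI,
//                        MachineDominatorTree &MDT, bool IsPostRA)
//       : VLIWPacketizerList(MF, MLI, MDT, IsPostRA) {}
//
//     // Inline asm always terminates the packet being built.
//     virtual bool isSoloInstruction(MachineInstr *MI) {
//       return MI->isInlineAsm();
//     }
//
//     // Debug values and scheduling boundaries take no packet slot.
//     virtual bool ignorePseudoInstruction(MachineInstr *MI,
//                                          MachineBasicBlock *MBB) {
//       return MI->isDebugValue() || TII->isSchedulingBoundary(MI, MBB, MF);
//     }
//
//     // Legal to bundle unless the DAG has a dependence edge SUJ -> SUI.
//     virtual bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
//       return !SUJ->isSucc(SUI);
//     }
//   };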
endPacket(MBB, EndItr); - - SchedulerImpl->exitRegion(); + VLIWScheduler->exitRegion(); + VLIWScheduler->finishBlock(); } diff --git a/lib/CodeGen/DeadMachineInstructionElim.cpp b/lib/CodeGen/DeadMachineInstructionElim.cpp index aa10d1d..b4394e8 100644 --- a/lib/CodeGen/DeadMachineInstructionElim.cpp +++ b/lib/CodeGen/DeadMachineInstructionElim.cpp @@ -171,9 +171,8 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) { // Check the subreg set, not the alias set, because a def // of a super-register may still be partially live after // this def. - for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg); - *SubRegs; ++SubRegs) - LivePhysRegs.reset(*SubRegs); + for (MCSubRegIterator SR(Reg, TRI); SR.isValid(); ++SR) + LivePhysRegs.reset(*SR); } } else if (MO.isRegMask()) { // Register mask of preserved registers. All clobbers are dead. @@ -187,10 +186,8 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) { if (MO.isReg() && MO.isUse()) { unsigned Reg = MO.getReg(); if (TargetRegisterInfo::isPhysicalRegister(Reg)) { - LivePhysRegs.set(Reg); - for (const uint16_t *AliasSet = TRI->getAliasSet(Reg); - *AliasSet; ++AliasSet) - LivePhysRegs.set(*AliasSet); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + LivePhysRegs.set(*AI); } } } diff --git a/lib/CodeGen/DwarfEHPrepare.cpp b/lib/CodeGen/DwarfEHPrepare.cpp index 944dd4f..7095624 100644 --- a/lib/CodeGen/DwarfEHPrepare.cpp +++ b/lib/CodeGen/DwarfEHPrepare.cpp @@ -39,7 +39,7 @@ namespace { Constant *RewindFunction; bool InsertUnwindResumeCalls(Function &Fn); - Instruction *GetExceptionObject(ResumeInst *RI); + Value *GetExceptionObject(ResumeInst *RI); public: static char ID; // Pass identification, replacement for typeid. @@ -68,9 +68,9 @@ FunctionPass *llvm::createDwarfEHPass(const TargetMachine *tm) { /// GetExceptionObject - Return the exception object from the value passed into /// the 'resume' instruction (typically an aggregate). Clean up any dead /// instructions, including the 'resume' instruction. -Instruction *DwarfEHPrepare::GetExceptionObject(ResumeInst *RI) { +Value *DwarfEHPrepare::GetExceptionObject(ResumeInst *RI) { Value *V = RI->getOperand(0); - Instruction *ExnObj = 0; + Value *ExnObj = 0; InsertValueInst *SelIVI = dyn_cast<InsertValueInst>(V); LoadInst *SelLoad = 0; InsertValueInst *ExcIVI = 0; @@ -81,7 +81,7 @@ Instruction *DwarfEHPrepare::GetExceptionObject(ResumeInst *RI) { ExcIVI = dyn_cast<InsertValueInst>(SelIVI->getOperand(0)); if (ExcIVI && isa<UndefValue>(ExcIVI->getOperand(0)) && ExcIVI->getNumIndices() == 1 && *ExcIVI->idx_begin() == 0) { - ExnObj = cast<Instruction>(ExcIVI->getOperand(1)); + ExnObj = ExcIVI->getOperand(1); SelLoad = dyn_cast<LoadInst>(SelIVI->getOperand(1)); EraseIVIs = true; } @@ -139,7 +139,7 @@ bool DwarfEHPrepare::InsertUnwindResumeCalls(Function &Fn) { // _Unwind_Resume to the end of the single resume block. ResumeInst *RI = Resumes.front(); BasicBlock *UnwindBB = RI->getParent(); - Instruction *ExnObj = GetExceptionObject(RI); + Value *ExnObj = GetExceptionObject(RI); // Call the _Unwind_Resume function. 
CallInst *CI = CallInst::Create(RewindFunction, ExnObj, "", UnwindBB);
@@ -162,7 +162,7 @@ bool DwarfEHPrepare::InsertUnwindResumeCalls(Function &Fn) {
 BasicBlock *Parent = RI->getParent();
 BranchInst::Create(UnwindBB, Parent);
- Instruction *ExnObj = GetExceptionObject(RI);
+ Value *ExnObj = GetExceptionObject(RI);
 PN->addIncoming(ExnObj, Parent);
 ++NumResumesLowered;
diff --git a/lib/CodeGen/EarlyIfConversion.cpp b/lib/CodeGen/EarlyIfConversion.cpp
new file mode 100644
index 0000000..9840a40
--- /dev/null
+++ b/lib/CodeGen/EarlyIfConversion.cpp
@@ -0,0 +1,618 @@
+//===-- EarlyIfConversion.cpp - If-conversion on SSA form machine code ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Early if-conversion is for out-of-order CPUs that don't have a lot of
+// predicable instructions. The goal is to eliminate conditional branches that
+// may mispredict.
+//
+// Instructions from both sides of the branch are executed speculatively, and a
+// cmov instruction selects the result.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "early-ifcvt"
+#include "llvm/Function.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SparseSet.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+// Absolute maximum number of instructions allowed per speculated block.
+// This bypasses all other heuristics, so it should be set fairly high.
+static cl::opt<unsigned>
+BlockInstrLimit("early-ifcvt-limit", cl::init(30), cl::Hidden,
+ cl::desc("Maximum number of instructions per speculated block."));
+
+// Stress testing mode - disable heuristics.
+static cl::opt<bool> Stress("stress-early-ifcvt", cl::Hidden,
+ cl::desc("Turn all knobs to 11"));
+
+typedef SmallSetVector<MachineBasicBlock*, 8> BlockSetVector;
+
+//===----------------------------------------------------------------------===//
+// SSAIfConv
+//===----------------------------------------------------------------------===//
+//
+// The SSAIfConv class performs if-conversion on SSA form machine code after
+// determining if it is possible. The class contains no heuristics; external
+// code should be used to determine when if-conversion is a good idea.
+//
+// SSAIfConv can convert both triangles and diamonds:
+//
+// Triangle: Head Diamond: Head
+// | \ / \_
+// | \ / |
+// | [TF]BB FBB TBB
+// | / \ /
+// | / \ /
+// Tail Tail
+//
+// Instructions in the conditional blocks TBB and/or FBB are spliced into the
+// Head block, and phis in the Tail block are converted to select instructions.
+//
+namespace {
+class SSAIfConv {
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+
+public:
+ /// The block containing the conditional branch.
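// For a concrete picture of the diamond above, this is roughly the source
// pattern the pass targets (illustrative C++ only, not part of the patch):
//
//   int pick(int a, int b, bool c) {
//     int x;
//     if (c)         // Head: compare and conditional branch
//       x = a + 1;   // TBB
//     else
//       x = b - 1;   // FBB
//     return x;      // Tail: phi(x) becomes a select, then a cmov
//   }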
+ MachineBasicBlock *Head;
+
+ /// The block containing phis after the if-then-else.
+ MachineBasicBlock *Tail;
+
+ /// The 'true' conditional block as determined by AnalyzeBranch.
+ MachineBasicBlock *TBB;
+
+ /// The 'false' conditional block as determined by AnalyzeBranch.
+ MachineBasicBlock *FBB;
+
+ /// isTriangle - When there is no 'else' block, either TBB or FBB will be
+ /// equal to Tail.
+ bool isTriangle() const { return TBB == Tail || FBB == Tail; }
+
+ /// Information about each phi in the Tail block.
+ struct PHIInfo {
+ MachineInstr *PHI;
+ unsigned TReg, FReg;
+ // Latencies from Cond+Branch, TReg, and FReg to DstReg.
+ int CondCycles, TCycles, FCycles;
+
+ PHIInfo(MachineInstr *phi)
+ : PHI(phi), TReg(0), FReg(0), CondCycles(0), TCycles(0), FCycles(0) {}
+ };
+
+ SmallVector<PHIInfo, 8> PHIs;
+
+private:
+ /// The branch condition determined by AnalyzeBranch.
+ SmallVector<MachineOperand, 4> Cond;
+
+ /// Instructions in Head that define values used by the conditional blocks.
+ /// The hoisted instructions must be inserted after these instructions.
+ SmallPtrSet<MachineInstr*, 8> InsertAfter;
+
+ /// Register units clobbered by the conditional blocks.
+ BitVector ClobberedRegUnits;
+
+ // Scratch pad for findInsertionPoint.
+ SparseSet<unsigned> LiveRegUnits;
+
+ /// Insertion point in Head for speculatively executed instructions from TBB
+ /// and FBB.
+ MachineBasicBlock::iterator InsertionPoint;
+
+ /// Return true if all non-terminator instructions in MBB can be safely
+ /// speculated.
+ bool canSpeculateInstrs(MachineBasicBlock *MBB);
+
+ /// Find a valid insertion point in Head.
+ bool findInsertionPoint();
+
+public:
+ /// runOnMachineFunction - Initialize per-function data structures.
+ void runOnMachineFunction(MachineFunction &MF) {
+ TII = MF.getTarget().getInstrInfo();
+ TRI = MF.getTarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ LiveRegUnits.clear();
+ LiveRegUnits.setUniverse(TRI->getNumRegUnits());
+ ClobberedRegUnits.clear();
+ ClobberedRegUnits.resize(TRI->getNumRegUnits());
+ }
+
+ /// canConvertIf - If the sub-CFG headed by MBB can be if-converted,
+ /// initialize the internal state, and return true.
+ bool canConvertIf(MachineBasicBlock *MBB);
+
+ /// convertIf - If-convert the last block passed to canConvertIf(), assuming
+ /// it is possible. Add any erased blocks to RemovedBlocks.
+ void convertIf(SmallVectorImpl<MachineBasicBlock*> &RemovedBlocks);
+};
+} // end anonymous namespace
+
+
+/// canSpeculateInstrs - Returns true if all the instructions in MBB can safely
+/// be speculated. The terminators are not considered.
+///
+/// If instructions use any values that are defined in the head basic block,
+/// the defining instructions are added to InsertAfter.
+///
+/// Any clobbered regunits are added to ClobberedRegUnits.
+///
+bool SSAIfConv::canSpeculateInstrs(MachineBasicBlock *MBB) {
+ // Reject any live-in physregs. It's probably CPSR/EFLAGS, and very hard to
+ // get right.
+ if (!MBB->livein_empty()) {
+ DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has live-ins.\n");
+ return false;
+ }
+
+ unsigned InstrCount = 0;
+
+ // Check all instructions, except the terminators. It is assumed that
+ // terminators never have side effects or define any used register values.
+ for (MachineBasicBlock::iterator I = MBB->begin(),
+ E = MBB->getFirstTerminator(); I != E; ++I) {
+ if (I->isDebugValue())
+ continue;
+
+ if (++InstrCount > BlockInstrLimit && !Stress) {
+ DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has more than "
+ << BlockInstrLimit << " instructions.\n");
+ return false;
+ }
+
+ // There shouldn't normally be any phis in a single-predecessor block.
+ if (I->isPHI()) {
+ DEBUG(dbgs() << "Can't hoist: " << *I);
+ return false;
+ }
+
+ // Don't speculate loads. Note that it may be possible and desirable to
+ // speculate GOT or constant pool loads that are guaranteed not to trap,
+ // but we don't support that for now.
+ if (I->mayLoad()) {
+ DEBUG(dbgs() << "Won't speculate load: " << *I);
+ return false;
+ }
+
+ // We never speculate stores, so an AA pointer isn't necessary.
+ bool DontMoveAcrossStore = true;
+ if (!I->isSafeToMove(TII, 0, DontMoveAcrossStore)) {
+ DEBUG(dbgs() << "Can't speculate: " << *I);
+ return false;
+ }
+
+ // Check for any dependencies on Head instructions.
+ for (MIOperands MO(I); MO.isValid(); ++MO) {
+ if (MO->isRegMask()) {
+ DEBUG(dbgs() << "Won't speculate regmask: " << *I);
+ return false;
+ }
+ if (!MO->isReg())
+ continue;
+ unsigned Reg = MO->getReg();
+
+ // Remember clobbered regunits.
+ if (MO->isDef() && TargetRegisterInfo::isPhysicalRegister(Reg))
+ for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
+ ClobberedRegUnits.set(*Units);
+
+ if (!MO->readsReg() || !TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ MachineInstr *DefMI = MRI->getVRegDef(Reg);
+ if (!DefMI || DefMI->getParent() != Head)
+ continue;
+ if (InsertAfter.insert(DefMI))
+ DEBUG(dbgs() << "BB#" << MBB->getNumber() << " depends on " << *DefMI);
+ if (DefMI->isTerminator()) {
+ DEBUG(dbgs() << "Can't insert instructions below terminator.\n");
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+
+/// Find an insertion point in Head for the speculated instructions. The
+/// insertion point must be:
+///
+/// 1. Before any terminators.
+/// 2. After any instructions in InsertAfter.
+/// 3. Not have any clobbered regunits live.
+///
+/// This function sets InsertionPoint and returns true when successful; it
+/// returns false if no valid insertion point could be found.
+///
+bool SSAIfConv::findInsertionPoint() {
+ // Keep track of live regunits before the current position.
+ // Only track RegUnits that are also in ClobberedRegUnits.
+ LiveRegUnits.clear();
+ SmallVector<unsigned, 8> Reads;
+ MachineBasicBlock::iterator FirstTerm = Head->getFirstTerminator();
+ MachineBasicBlock::iterator I = Head->end();
+ MachineBasicBlock::iterator B = Head->begin();
+ while (I != B) {
+ --I;
+ // Some of the conditional code depends on I.
+ if (InsertAfter.count(I)) {
+ DEBUG(dbgs() << "Can't insert code after " << *I);
+ return false;
+ }
+
+ // Update live regunits.
+ for (MIOperands MO(I); MO.isValid(); ++MO) {
+ // We're ignoring regmask operands. That is conservatively correct.
+ if (!MO->isReg())
+ continue;
+ unsigned Reg = MO->getReg();
+ if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+ // I clobbers Reg, so it isn't live before I.
+ if (MO->isDef())
+ for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
+ LiveRegUnits.erase(*Units);
+ // Unless I reads Reg.
+ if (MO->readsReg())
+ Reads.push_back(Reg);
+ }
+ // Anything read by I is live before I.
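// The loop below drains Reads into LiveRegUnits; the whole scan is ordinary
// backward liveness restricted to the clobbered units. Stripped of the LLVM
// types, the invariant looks like this self-contained sketch (Instr and its
// fields are hypothetical):
//
//   #include <set>
//   #include <vector>
//   struct Instr { std::vector<unsigned> defs, uses; };
//
//   // Walk the block bottom-up, maintaining the set of registers live
//   // before the current position: defs kill liveness, uses create it.
//   std::set<unsigned> liveBefore(const std::vector<Instr> &Block) {
//     std::set<unsigned> Live;
//     for (auto I = Block.rbegin(), E = Block.rend(); I != E; ++I) {
//       for (unsigned R : I->defs) Live.erase(R);  // clobbered above I
//       for (unsigned R : I->uses) Live.insert(R); // read, so live above I
//     }
//     return Live;
//   }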
+ while (!Reads.empty()) + for (MCRegUnitIterator Units(Reads.pop_back_val(), TRI); Units.isValid(); + ++Units) + if (ClobberedRegUnits.test(*Units)) + LiveRegUnits.insert(*Units); + + // We can't insert before a terminator. + if (I != FirstTerm && I->isTerminator()) + continue; + + // Some of the clobbered registers are live before I, not a valid insertion + // point. + if (!LiveRegUnits.empty()) { + DEBUG({ + dbgs() << "Would clobber"; + for (SparseSet<unsigned>::const_iterator + i = LiveRegUnits.begin(), e = LiveRegUnits.end(); i != e; ++i) + dbgs() << ' ' << PrintRegUnit(*i, TRI); + dbgs() << " live before " << *I; + }); + continue; + } + + // This is a valid insertion point. + InsertionPoint = I; + DEBUG(dbgs() << "Can insert before " << *I); + return true; + } + DEBUG(dbgs() << "No legal insertion point found.\n"); + return false; +} + + + +/// canConvertIf - analyze the sub-cfg rooted in MBB, and return true if it is +/// a potential candidate for if-conversion. Fill out the internal state. +/// +bool SSAIfConv::canConvertIf(MachineBasicBlock *MBB) { + Head = MBB; + TBB = FBB = Tail = 0; + + if (Head->succ_size() != 2) + return false; + MachineBasicBlock *Succ0 = Head->succ_begin()[0]; + MachineBasicBlock *Succ1 = Head->succ_begin()[1]; + + // Canonicalize so Succ0 has MBB as its single predecessor. + if (Succ0->pred_size() != 1) + std::swap(Succ0, Succ1); + + if (Succ0->pred_size() != 1 || Succ0->succ_size() != 1) + return false; + + // We could support additional Tail predecessors by updating phis instead of + // eliminating them. Let's see an example where it matters first. + Tail = Succ0->succ_begin()[0]; + if (Tail->pred_size() != 2) + return false; + + // This is not a triangle. + if (Tail != Succ1) { + // Check for a diamond. We won't deal with any critical edges. + if (Succ1->pred_size() != 1 || Succ1->succ_size() != 1 || + Succ1->succ_begin()[0] != Tail) + return false; + DEBUG(dbgs() << "\nDiamond: BB#" << Head->getNumber() + << " -> BB#" << Succ0->getNumber() + << "/BB#" << Succ1->getNumber() + << " -> BB#" << Tail->getNumber() << '\n'); + + // Live-in physregs are tricky to get right when speculating code. + if (!Tail->livein_empty()) { + DEBUG(dbgs() << "Tail has live-ins.\n"); + return false; + } + } else { + DEBUG(dbgs() << "\nTriangle: BB#" << Head->getNumber() + << " -> BB#" << Succ0->getNumber() + << " -> BB#" << Tail->getNumber() << '\n'); + } + + // This is a triangle or a diamond. + // If Tail doesn't have any phis, there must be side effects. + if (Tail->empty() || !Tail->front().isPHI()) { + DEBUG(dbgs() << "No phis in tail.\n"); + return false; + } + + // The branch we're looking to eliminate must be analyzable. + Cond.clear(); + if (TII->AnalyzeBranch(*Head, TBB, FBB, Cond)) { + DEBUG(dbgs() << "Branch not analyzable.\n"); + return false; + } + + // This is weird, probably some sort of degenerate CFG. + if (!TBB) { + DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch.\n"); + return false; + } + + // AnalyzeBranch doesn't set FBB on a fall-through branch. + // Make sure it is always set. + FBB = TBB == Succ0 ? Succ1 : Succ0; + + // Any phis in the tail block must be convertible to selects. + PHIs.clear(); + MachineBasicBlock *TPred = TBB == Tail ? Head : TBB; + MachineBasicBlock *FPred = FBB == Tail ? Head : FBB; + for (MachineBasicBlock::iterator I = Tail->begin(), E = Tail->end(); + I != E && I->isPHI(); ++I) { + PHIs.push_back(&*I); + PHIInfo &PI = PHIs.back(); + // Find PHI operands corresponding to TPred and FPred. 
+ for (unsigned i = 1; i != PI.PHI->getNumOperands(); i += 2) { + if (PI.PHI->getOperand(i+1).getMBB() == TPred) + PI.TReg = PI.PHI->getOperand(i).getReg(); + if (PI.PHI->getOperand(i+1).getMBB() == FPred) + PI.FReg = PI.PHI->getOperand(i).getReg(); + } + assert(TargetRegisterInfo::isVirtualRegister(PI.TReg) && "Bad PHI"); + assert(TargetRegisterInfo::isVirtualRegister(PI.FReg) && "Bad PHI"); + + // Get target information. + if (!TII->canInsertSelect(*Head, Cond, PI.TReg, PI.FReg, + PI.CondCycles, PI.TCycles, PI.FCycles)) { + DEBUG(dbgs() << "Can't convert: " << *PI.PHI); + return false; + } + } + + // Check that the conditional instructions can be speculated. + InsertAfter.clear(); + ClobberedRegUnits.reset(); + if (TBB != Tail && !canSpeculateInstrs(TBB)) + return false; + if (FBB != Tail && !canSpeculateInstrs(FBB)) + return false; + + // Try to find a valid insertion point for the speculated instructions in the + // head basic block. + if (!findInsertionPoint()) + return false; + + return true; +} + + +/// convertIf - Execute the if conversion after canConvertIf has determined the +/// feasibility. +/// +/// Any basic blocks erased will be added to RemovedBlocks. +/// +void SSAIfConv::convertIf(SmallVectorImpl<MachineBasicBlock*> &RemovedBlocks) { + assert(Head && Tail && TBB && FBB && "Call canConvertIf first."); + + // Move all instructions into Head, except for the terminators. + if (TBB != Tail) + Head->splice(InsertionPoint, TBB, TBB->begin(), TBB->getFirstTerminator()); + if (FBB != Tail) + Head->splice(InsertionPoint, FBB, FBB->begin(), FBB->getFirstTerminator()); + + MachineBasicBlock::iterator FirstTerm = Head->getFirstTerminator(); + assert(FirstTerm != Head->end() && "No terminators"); + DebugLoc HeadDL = FirstTerm->getDebugLoc(); + + // Convert all PHIs to select instructions inserted before FirstTerm. + for (unsigned i = 0, e = PHIs.size(); i != e; ++i) { + PHIInfo &PI = PHIs[i]; + DEBUG(dbgs() << "If-converting " << *PI.PHI); + assert(PI.PHI->getNumOperands() == 5 && "Unexpected PHI operands."); + unsigned DstReg = PI.PHI->getOperand(0).getReg(); + TII->insertSelect(*Head, FirstTerm, HeadDL, DstReg, Cond, PI.TReg, PI.FReg); + DEBUG(dbgs() << " --> " << *llvm::prior(FirstTerm)); + PI.PHI->eraseFromParent(); + PI.PHI = 0; + } + + // Fix up the CFG, temporarily leave Head without any successors. + Head->removeSuccessor(TBB); + Head->removeSuccessor(FBB); + if (TBB != Tail) + TBB->removeSuccessor(Tail); + if (FBB != Tail) + FBB->removeSuccessor(Tail); + + // Fix up Head's terminators. + // It should become a single branch or a fallthrough. + TII->RemoveBranch(*Head); + + // Erase the now empty conditional blocks. It is likely that Head can fall + // through to Tail, and we can join the two blocks. + if (TBB != Tail) { + RemovedBlocks.push_back(TBB); + TBB->eraseFromParent(); + } + if (FBB != Tail) { + RemovedBlocks.push_back(FBB); + FBB->eraseFromParent(); + } + + assert(Head->succ_empty() && "Additional head successors?"); + if (Head->isLayoutSuccessor(Tail)) { + // Splice Tail onto the end of Head. + DEBUG(dbgs() << "Joining tail BB#" << Tail->getNumber() + << " into head BB#" << Head->getNumber() << '\n'); + Head->splice(Head->end(), Tail, + Tail->begin(), Tail->end()); + Head->transferSuccessorsAndUpdatePHIs(Tail); + RemovedBlocks.push_back(Tail); + Tail->eraseFromParent(); + } else { + // We need a branch to Tail, let code placement work it out later. 
+ DEBUG(dbgs() << "Converting to unconditional branch.\n"); + SmallVector<MachineOperand, 0> EmptyCond; + TII->InsertBranch(*Head, Tail, 0, EmptyCond, HeadDL); + Head->addSuccessor(Tail); + } + DEBUG(dbgs() << *Head); +} + + +//===----------------------------------------------------------------------===// +// EarlyIfConverter Pass +//===----------------------------------------------------------------------===// + +namespace { +class EarlyIfConverter : public MachineFunctionPass { + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + MachineRegisterInfo *MRI; + MachineDominatorTree *DomTree; + MachineLoopInfo *Loops; + SSAIfConv IfConv; + +public: + static char ID; + EarlyIfConverter() : MachineFunctionPass(ID) {} + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnMachineFunction(MachineFunction &MF); + +private: + bool tryConvertIf(MachineBasicBlock*); + void updateDomTree(ArrayRef<MachineBasicBlock*> Removed); + void updateLoops(ArrayRef<MachineBasicBlock*> Removed); +}; +} // end anonymous namespace + +char EarlyIfConverter::ID = 0; +char &llvm::EarlyIfConverterID = EarlyIfConverter::ID; + +INITIALIZE_PASS_BEGIN(EarlyIfConverter, + "early-ifcvt", "Early If Converter", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(EarlyIfConverter, + "early-ifcvt", "Early If Converter", false, false) + +void EarlyIfConverter::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<MachineBranchProbabilityInfo>(); + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +/// Update the dominator tree after if-conversion erased some blocks. +void EarlyIfConverter::updateDomTree(ArrayRef<MachineBasicBlock*> Removed) { + // convertIf can remove TBB, FBB, and Tail can be merged into Head. + // TBB and FBB should not dominate any blocks. + // Tail children should be transferred to Head. + MachineDomTreeNode *HeadNode = DomTree->getNode(IfConv.Head); + for (unsigned i = 0, e = Removed.size(); i != e; ++i) { + MachineDomTreeNode *Node = DomTree->getNode(Removed[i]); + assert(Node != HeadNode && "Cannot erase the head node"); + while (Node->getNumChildren()) { + assert(Node->getBlock() == IfConv.Tail && "Unexpected children"); + DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode); + } + DomTree->eraseNode(Removed[i]); + } +} + +/// Update LoopInfo after if-conversion. +void EarlyIfConverter::updateLoops(ArrayRef<MachineBasicBlock*> Removed) { + if (!Loops) + return; + // If-conversion doesn't change loop structure, and it doesn't mess with back + // edges, so updating LoopInfo is simply removing the dead blocks. + for (unsigned i = 0, e = Removed.size(); i != e; ++i) + Loops->removeBlock(Removed[i]); +} + +/// Attempt repeated if-conversion on MBB, return true if successful. +/// +bool EarlyIfConverter::tryConvertIf(MachineBasicBlock *MBB) { + bool Changed = false; + while (IfConv.canConvertIf(MBB)) { + // If-convert MBB and update analyses. 
+ SmallVector<MachineBasicBlock*, 4> RemovedBlocks;
+ IfConv.convertIf(RemovedBlocks);
+ Changed = true;
+ updateDomTree(RemovedBlocks);
+ updateLoops(RemovedBlocks);
+ }
+ return Changed;
+}
+
+bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG(dbgs() << "********** EARLY IF-CONVERSION **********\n"
+ << "********** Function: "
+ << ((Value*)MF.getFunction())->getName() << '\n');
+ TII = MF.getTarget().getInstrInfo();
+ TRI = MF.getTarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ DomTree = &getAnalysis<MachineDominatorTree>();
+ Loops = getAnalysisIfAvailable<MachineLoopInfo>();
+
+ bool Changed = false;
+ IfConv.runOnMachineFunction(MF);
+
+ // Visit blocks in dominator tree post-order. The post-order enables nested
+ // if-conversion in a single pass. The tryConvertIf() function may erase
+ // blocks, but only blocks dominated by the head block. This makes it safe to
+ // update the dominator tree while the post-order iterator is still active.
+ for (po_iterator<MachineDominatorTree*>
+ I = po_begin(DomTree), E = po_end(DomTree); I != E; ++I)
+ if (tryConvertIf(I->getBlock()))
+ Changed = true;
+
+ MF.verify(this, "After early if-conversion");
+ return Changed;
+}
diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp
index a48c540..fee8e47 100644
--- a/lib/CodeGen/ExecutionDepsFix.cpp
+++ b/lib/CodeGen/ExecutionDepsFix.cpp
@@ -59,7 +59,7 @@ struct DomainValue {
 // Pointer to the next DomainValue in a chain. When two DomainValues are
 // merged, Victim.Next is set to point to Victor, so old DomainValue
- // references can be updated by folowing the chain.
+ // references can be updated by following the chain.
 DomainValue *Next;
 // Twiddleable instructions using or defining these registers.
@@ -666,7 +666,8 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
 // or -1.
 AliasMap.resize(TRI->getNumRegs(), -1);
 for (unsigned i = 0, e = RC->getNumRegs(); i != e; ++i)
- for (const uint16_t *AI = TRI->getOverlaps(RC->getRegister(i)); *AI; ++AI)
+ for (MCRegAliasIterator AI(RC->getRegister(i), TRI, true);
+ AI.isValid(); ++AI)
 AliasMap[*AI] = i;
 }
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index 75ae5b9..4214ba1 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp
@@ -18,6 +18,7 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/MC/MCInstrItineraries.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
@@ -155,7 +156,9 @@ namespace {
 const TargetRegisterInfo *TRI;
 const InstrItineraryData *InstrItins;
 const MachineBranchProbabilityInfo *MBPI;
+ MachineRegisterInfo *MRI;
+ bool PreRegAlloc;
 bool MadeChange;
 int FnNum;
 public:
@@ -263,14 +266,20 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
 TII = MF.getTarget().getInstrInfo();
 TRI = MF.getTarget().getRegisterInfo();
 MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+ MRI = &MF.getRegInfo();
 InstrItins = MF.getTarget().getInstrItineraryData();
 if (!TII) return false;
- // Tail merge tends to expose more if-conversion opportunities.
- BranchFolder BF(true, false);
- bool BFChange = BF.OptimizeFunction(MF, TII,
+ PreRegAlloc = MRI->isSSA();
+
+ bool BFChange = false;
+ if (!PreRegAlloc) {
+ // Tail merge tends to expose more if-conversion opportunities.
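// The gate above is the whole mechanism: the pass now infers its position in
// the pipeline from whether the function is still in SSA form, instead of
// from a constructor flag. A sketch of the pattern (runTailMerge() is a
// placeholder for the BranchFolder invocation below, not a real API):
//
//   bool PreRegAlloc = MRI->isSSA(); // regalloc rewrites vregs, ending SSA
//   if (!PreRegAlloc)
//     runTailMerge(); // tail merging only happens in the post-RA position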
+ BranchFolder BF(true, false);
+ BFChange = BF.OptimizeFunction(MF, TII,
 MF.getTarget().getRegisterInfo(),
 getAnalysisIfAvailable<MachineModuleInfo>());
+ }
 DEBUG(dbgs() << "\nIfcvt: function (" << ++FnNum << ") \'"
 << MF.getFunction()->getName() << "\'");
@@ -621,7 +630,7 @@ void IfConverter::ScanInstructions(BBInfo &BBI) {
 if (BBI.IsDone)
 return;
- bool AlreadyPredicated = BBI.Predicate.size() > 0;
+ bool AlreadyPredicated = !BBI.Predicate.empty();
 // First analyze the end of BB branches.
 BBI.TrueBB = BBI.FalseBB = NULL;
 BBI.BrCond.clear();
@@ -786,8 +795,8 @@ IfConverter::BBInfo &IfConverter::AnalyzeBlock(MachineBasicBlock *BB,
 unsigned Dups = 0;
 unsigned Dups2 = 0;
- bool TNeedSub = TrueBBI.Predicate.size() > 0;
- bool FNeedSub = FalseBBI.Predicate.size() > 0;
+ bool TNeedSub = !TrueBBI.Predicate.empty();
+ bool FNeedSub = !FalseBBI.Predicate.empty();
 bool Enqueued = false;
 BranchProbability Prediction = MBPI->getEdgeProbability(BB, TrueBBI.BB);
@@ -962,9 +971,8 @@ static void InitPredRedefs(MachineBasicBlock *BB, SmallSet<unsigned,4> &Redefs,
 E = BB->livein_end(); I != E; ++I) {
 unsigned Reg = *I;
 Redefs.insert(Reg);
- for (const uint16_t *Subreg = TRI->getSubRegisters(Reg);
- *Subreg; ++Subreg)
- Redefs.insert(*Subreg);
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ Redefs.insert(*SubRegs);
 }
 }
@@ -983,8 +991,8 @@ static void UpdatePredRedefs(MachineInstr *MI, SmallSet<unsigned,4> &Redefs,
 Defs.push_back(Reg);
 else if (MO.isKill()) {
 Redefs.erase(Reg);
- for (const uint16_t *SR = TRI->getSubRegisters(Reg); *SR; ++SR)
- Redefs.erase(*SR);
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ Redefs.erase(*SubRegs);
 }
 }
 for (unsigned i = 0, e = Defs.size(); i != e; ++i) {
@@ -993,11 +1001,12 @@ static void UpdatePredRedefs(MachineInstr *MI, SmallSet<unsigned,4> &Redefs,
 if (AddImpUse)
 // Treat predicated update as read + write.
 MI->addOperand(MachineOperand::CreateReg(Reg, false/*IsDef*/,
- true/*IsImp*/,false/*IsKill*/));
+ true/*IsImp*/,false/*IsKill*/,
+ false/*IsDead*/,true/*IsUndef*/));
 } else {
 Redefs.insert(Reg);
- for (const uint16_t *SR = TRI->getSubRegisters(Reg); *SR; ++SR)
- Redefs.insert(*SR);
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ Redefs.insert(*SubRegs);
 }
 }
 }
@@ -1335,8 +1344,8 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
 // These are defined before ctrl flow reaches the 'false' instructions.
 // They cannot be modified by the 'true' instructions.
ExtUses.insert(Reg); - for (const uint16_t *SR = TRI->getSubRegisters(Reg); *SR; ++SR) - ExtUses.insert(*SR); + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + ExtUses.insert(*SubRegs); } } @@ -1344,8 +1353,8 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind, unsigned Reg = Defs[i]; if (!ExtUses.count(Reg)) { RedefsByFalse.insert(Reg); - for (const uint16_t *SR = TRI->getSubRegisters(Reg); *SR; ++SR) - RedefsByFalse.insert(*SR); + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + RedefsByFalse.insert(*SubRegs); } } } diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp index d5ea666..07e37af 100644 --- a/lib/CodeGen/InlineSpiller.cpp +++ b/lib/CodeGen/InlineSpiller.cpp @@ -52,7 +52,6 @@ static cl::opt<bool> DisableHoisting("disable-spill-hoist", cl::Hidden, namespace { class InlineSpiller : public Spiller { - MachineFunctionPass &Pass; MachineFunction &MF; LiveIntervals &LIS; LiveStacks &LSS; @@ -137,8 +136,7 @@ public: InlineSpiller(MachineFunctionPass &pass, MachineFunction &mf, VirtRegMap &vrm) - : Pass(pass), - MF(mf), + : MF(mf), LIS(pass.getAnalysis<LiveIntervals>()), LSS(pass.getAnalysis<LiveStacks>()), AA(&pass.getAnalysis<AliasAnalysis>()), @@ -578,11 +576,11 @@ MachineInstr *InlineSpiller::traceSiblingValue(unsigned UseReg, VNInfo *UseVNI, if (unsigned SrcReg = isFullCopyOf(MI, Reg)) { if (isSibling(SrcReg)) { LiveInterval &SrcLI = LIS.getInterval(SrcReg); - LiveRange *SrcLR = SrcLI.getLiveRangeContaining(VNI->def.getRegSlot(true)); - assert(SrcLR && "Copy from non-existing value"); + LiveRangeQuery SrcQ(SrcLI, VNI->def); + assert(SrcQ.valueIn() && "Copy from non-existing value"); // Check if this COPY kills its source. - SVI->second.KillsSource = (SrcLR->end == VNI->def); - VNInfo *SrcVNI = SrcLR->valno; + SVI->second.KillsSource = SrcQ.isKill(); + VNInfo *SrcVNI = SrcQ.valueIn(); DEBUG(dbgs() << "copy of " << PrintReg(SrcReg) << ':' << SrcVNI->id << '@' << SrcVNI->def << " kill=" << unsigned(SVI->second.KillsSource) << '\n'); @@ -1083,6 +1081,10 @@ void InlineSpiller::insertReload(LiveInterval &NewLI, MRI.getRegClass(NewLI.reg), &TRI); --MI; // Point to load instruction. SlotIndex LoadIdx = LIS.InsertMachineInstrInMaps(MI).getRegSlot(); + // Some (out-of-tree) targets have EC reload instructions. 
+ if (MachineOperand *MO = MI->findRegisterDefOperand(NewLI.reg)) + if (MO->isEarlyClobber()) + LoadIdx = LoadIdx.getRegSlot(true); DEBUG(dbgs() << "\treload: " << LoadIdx << '\t' << *MI); VNInfo *LoadVNI = NewLI.getNextValue(LoadIdx, LIS.getVNInfoAllocator()); NewLI.addRange(LiveRange(LoadIdx, Idx, LoadVNI)); @@ -1275,8 +1277,8 @@ void InlineSpiller::spill(LiveRangeEdit &edit) { DEBUG(dbgs() << "Inline spilling " << MRI.getRegClass(edit.getReg())->getName() - << ':' << edit.getParent() << "\nFrom original " - << LIS.getInterval(Original) << '\n'); + << ':' << PrintReg(edit.getReg()) << ' ' << edit.getParent() + << "\nFrom original " << LIS.getInterval(Original) << '\n'); assert(edit.getParent().isSpillable() && "Attempting to spill already spilled value."); assert(DeadDefs.empty() && "Previous spill didn't remove dead defs"); diff --git a/lib/CodeGen/InterferenceCache.cpp b/lib/CodeGen/InterferenceCache.cpp index 8368b58..1541bf0 100644 --- a/lib/CodeGen/InterferenceCache.cpp +++ b/lib/CodeGen/InterferenceCache.cpp @@ -39,7 +39,7 @@ InterferenceCache::Entry *InterferenceCache::get(unsigned PhysReg) { unsigned E = PhysRegEntries[PhysReg]; if (E < CacheEntries && Entries[E].getPhysReg() == PhysReg) { if (!Entries[E].valid(LIUArray, TRI)) - Entries[E].revalidate(); + Entries[E].revalidate(LIUArray, TRI); return &Entries[E]; } // No valid entry exists, pick the next round-robin entry. @@ -61,13 +61,15 @@ InterferenceCache::Entry *InterferenceCache::get(unsigned PhysReg) { } /// revalidate - LIU contents have changed, update tags. -void InterferenceCache::Entry::revalidate() { +void InterferenceCache::Entry::revalidate(LiveIntervalUnion *LIUArray, + const TargetRegisterInfo *TRI) { // Invalidate all block entries. ++Tag; // Invalidate all iterators. PrevPos = SlotIndex(); - for (unsigned i = 0, e = Aliases.size(); i != e; ++i) - Aliases[i].second = Aliases[i].first->getTag(); + unsigned i = 0; + for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units, ++i) + RegUnits[i].VirtTag = LIUArray[*Units].getTag(); } void InterferenceCache::Entry::reset(unsigned physReg, @@ -79,28 +81,23 @@ void InterferenceCache::Entry::reset(unsigned physReg, ++Tag; PhysReg = physReg; Blocks.resize(MF->getNumBlockIDs()); - Aliases.clear(); - for (const uint16_t *AS = TRI->getOverlaps(PhysReg); *AS; ++AS) { - LiveIntervalUnion *LIU = LIUArray + *AS; - Aliases.push_back(std::make_pair(LIU, LIU->getTag())); - } // Reset iterators. PrevPos = SlotIndex(); - unsigned e = Aliases.size(); - Iters.resize(e); - for (unsigned i = 0; i != e; ++i) - Iters[i].setMap(Aliases[i].first->getMap()); + RegUnits.clear(); + for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { + RegUnits.push_back(LIUArray[*Units]); + RegUnits.back().Fixed = &LIS->getRegUnit(*Units); + } } bool InterferenceCache::Entry::valid(LiveIntervalUnion *LIUArray, const TargetRegisterInfo *TRI) { - unsigned i = 0, e = Aliases.size(); - for (const uint16_t *AS = TRI->getOverlaps(PhysReg); *AS; ++AS, ++i) { - LiveIntervalUnion *LIU = LIUArray + *AS; - if (i == e || Aliases[i].first != LIU) + unsigned i = 0, e = RegUnits.size(); + for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units, ++i) { + if (i == e) return false; - if (LIU->changedSince(Aliases[i].second)) + if (LIUArray[*Units].changedSince(RegUnits[i].VirtTag)) return false; } return i == e; @@ -112,12 +109,20 @@ void InterferenceCache::Entry::update(unsigned MBBNum) { // Use advanceTo only when possible. 
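// The comment above names a deliberate pattern: a cached iterator into a
// sorted sequence is advanced cheaply when queries move forward, and only a
// backwards query pays for a fresh binary search. A generic sketch of the
// same idea (Cursor is hypothetical, not an LLVM type):
//
//   #include <algorithm>
//   #include <vector>
//   struct Cursor {
//     const std::vector<int> *Seq;         // sorted sequence
//     std::vector<int>::const_iterator It; // cached position
//     int PrevPos = -1;                    // last query, -1 = none yet
//
//     // Position It at the first element >= Pos.
//     void moveTo(int Pos) {
//       if (PrevPos < 0 || Pos < PrevPos)  // cold start or moving backward
//         It = std::lower_bound(Seq->begin(), Seq->end(), Pos); // find()
//       else                               // moving forward: reuse It
//         It = std::lower_bound(It, Seq->end(), Pos);  // advanceTo()
//       PrevPos = Pos;
//     }
//   };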
if (PrevPos != Start) { - if (!PrevPos.isValid() || Start < PrevPos) - for (unsigned i = 0, e = Iters.size(); i != e; ++i) - Iters[i].find(Start); - else - for (unsigned i = 0, e = Iters.size(); i != e; ++i) - Iters[i].advanceTo(Start); + if (!PrevPos.isValid() || Start < PrevPos) { + for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) { + RegUnitInfo &RUI = RegUnits[i]; + RUI.VirtI.find(Start); + RUI.FixedI = RUI.Fixed->find(Start); + } + } else { + for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) { + RegUnitInfo &RUI = RegUnits[i]; + RUI.VirtI.advanceTo(Start); + if (RUI.FixedI != RUI.Fixed->end()) + RUI.FixedI = RUI.Fixed->advanceTo(RUI.FixedI, Start); + } + } PrevPos = Start; } @@ -129,9 +134,9 @@ void InterferenceCache::Entry::update(unsigned MBBNum) { BI->Tag = Tag; BI->First = BI->Last = SlotIndex(); - // Check for first interference. - for (unsigned i = 0, e = Iters.size(); i != e; ++i) { - Iter &I = Iters[i]; + // Check for first interference from virtregs. + for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) { + LiveIntervalUnion::SegmentIter &I = RegUnits[i].VirtI; if (!I.valid()) continue; SlotIndex StartI = I.start(); @@ -141,6 +146,19 @@ void InterferenceCache::Entry::update(unsigned MBBNum) { BI->First = StartI; } + // Same thing for fixed interference. + for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) { + LiveInterval::const_iterator I = RegUnits[i].FixedI; + LiveInterval::const_iterator E = RegUnits[i].Fixed->end(); + if (I == E) + continue; + SlotIndex StartI = I->start; + if (StartI >= Stop) + continue; + if (!BI->First.isValid() || StartI < BI->First) + BI->First = StartI; + } + // Also check for register mask interference. RegMaskSlots = LIS->getRegMaskSlotsInBlock(MBBNum); RegMaskBits = LIS->getRegMaskBitsInBlock(MBBNum); @@ -168,8 +186,8 @@ void InterferenceCache::Entry::update(unsigned MBBNum) { } // Check for last interference in block. - for (unsigned i = 0, e = Iters.size(); i != e; ++i) { - Iter &I = Iters[i]; + for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) { + LiveIntervalUnion::SegmentIter &I = RegUnits[i].VirtI; if (!I.valid() || I.start() >= Stop) continue; I.advanceTo(Stop); @@ -183,6 +201,23 @@ void InterferenceCache::Entry::update(unsigned MBBNum) { ++I; } + // Fixed interference. + for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) { + LiveInterval::iterator &I = RegUnits[i].FixedI; + LiveInterval *LI = RegUnits[i].Fixed; + if (I == LI->end() || I->start >= Stop) + continue; + I = LI->advanceTo(I, Stop); + bool Backup = I == LI->end() || I->start >= Stop; + if (Backup) + --I; + SlotIndex StopI = I->end; + if (!BI->Last.isValid() || StopI > BI->Last) + BI->Last = StopI; + if (Backup) + ++I; + } + // Also check for register mask interference. SlotIndex Limit = BI->Last.isValid() ? BI->Last : Start; for (unsigned i = RegMaskSlots.size(); diff --git a/lib/CodeGen/InterferenceCache.h b/lib/CodeGen/InterferenceCache.h index 485a325..3c928a5 100644 --- a/lib/CodeGen/InterferenceCache.h +++ b/lib/CodeGen/InterferenceCache.h @@ -7,7 +7,8 @@ // //===----------------------------------------------------------------------===// // -// InterferenceCache remembers per-block interference in LiveIntervalUnions. +// InterferenceCache remembers per-block interference from LiveIntervalUnions, +// fixed RegUnit interference, and register masks. // //===----------------------------------------------------------------------===// @@ -59,14 +60,31 @@ class InterferenceCache { /// PrevPos - The previous position the iterators were moved to. 
SlotIndex PrevPos;
- /// AliasTags - A LiveIntervalUnion pointer and tag for each alias of
- /// PhysReg.
- SmallVector<std::pair<LiveIntervalUnion*, unsigned>, 8> Aliases;
+ /// RegUnitInfo - Information tracked about each RegUnit in PhysReg.
+ /// When PrevPos is set, the iterators are valid as if advanceTo(PrevPos)
+ /// had just been called.
+ struct RegUnitInfo {
+ /// Iterator pointing into the LiveIntervalUnion containing virtual
+ /// register interference.
+ LiveIntervalUnion::SegmentIter VirtI;
- typedef LiveIntervalUnion::SegmentIter Iter;
+ /// Tag of the LIU last time we looked.
+ unsigned VirtTag;
- /// Iters - an iterator for each alias
- SmallVector<Iter, 8> Iters;
+ /// Fixed interference in RegUnit.
+ LiveInterval *Fixed;
+
+ /// Iterator pointing into the fixed RegUnit interference.
+ LiveInterval::iterator FixedI;
+
+ RegUnitInfo(LiveIntervalUnion &LIU) : VirtTag(LIU.getTag()), Fixed(0) {
+ VirtI.setMap(LIU.getMap());
+ }
+ };
+
+ /// Info for each RegUnit in PhysReg. It is very rare for a PhysReg to have
+ /// more than 4 RegUnits.
+ SmallVector<RegUnitInfo, 4> RegUnits;
 /// Blocks - Interference for each block in the function.
 SmallVector<BlockInterference, 8> Blocks;
@@ -91,7 +109,7 @@ class InterferenceCache {
 bool hasRefs() const { return RefCount > 0; }
- void revalidate();
+ void revalidate(LiveIntervalUnion *LIUArray, const TargetRegisterInfo *TRI);
 /// valid - Return true if this is a valid entry for physReg.
 bool valid(LiveIntervalUnion *LIUArray, const TargetRegisterInfo *TRI);
diff --git a/lib/CodeGen/IntrinsicLowering.cpp b/lib/CodeGen/IntrinsicLowering.cpp
index a9ca42f..8d2282a 100644
--- a/lib/CodeGen/IntrinsicLowering.cpp
+++ b/lib/CodeGen/IntrinsicLowering.cpp
@@ -11,17 +11,17 @@
 //
 //===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/IntrinsicLowering.h"
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
+#include "llvm/IRBuilder.h"
 #include "llvm/Module.h"
 #include "llvm/Type.h"
-#include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/CallSite.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/IRBuilder.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetData.h"
-#include "llvm/ADT/SmallVector.h"
 using namespace llvm;
 template <class ArgIt>
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp
index a1f479a..cac0c83 100644
--- a/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/lib/CodeGen/LLVMTargetMachine.cpp
@@ -13,6 +13,7 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/PassManager.h"
+#include "llvm/Assembly/PrintModulePass.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
@@ -78,40 +79,15 @@ LLVMTargetMachine::LLVMTargetMachine(const Target &T, StringRef Triple,
 "and that InitializeAllTargetMCs() is being invoked!");
 }
-/// Turn exception handling constructs into something the code generators can
-/// handle.
-static void addPassesToHandleExceptions(TargetMachine *TM,
- PassManagerBase &PM) {
- switch (TM->getMCAsmInfo()->getExceptionHandlingType()) {
- case ExceptionHandling::SjLj:
- // SjLj piggy-backs on dwarf for this bit. The cleanups done apply to both
- // Dwarf EH prepare needs to be run after SjLj prepare. Otherwise,
- // catch info can get misplaced when a selector ends up more than one block
- // removed from the parent invoke(s).
This could happen when a landing
- // pad is shared by multiple invokes and is also a target of a normal
- // edge from elsewhere.
- PM.add(createSjLjEHPreparePass(TM->getTargetLowering()));
- // FALLTHROUGH
- case ExceptionHandling::DwarfCFI:
- case ExceptionHandling::ARM:
- case ExceptionHandling::Win64:
- PM.add(createDwarfEHPass(TM));
- break;
- case ExceptionHandling::None:
- PM.add(createLowerInvokePass(TM->getTargetLowering()));
-
- // The lower invoke pass may create unreachable code. Remove it.
- PM.add(createUnreachableBlockEliminationPass());
- break;
- }
-}
-
 /// addPassesToX helper drives creation and initialization of TargetPassConfig.
 static MCContext *addPassesToGenerateCode(LLVMTargetMachine *TM,
 PassManagerBase &PM,
- bool DisableVerify) {
+ bool DisableVerify,
+ AnalysisID StartAfter,
+ AnalysisID StopAfter) {
 // Targets may override createPassConfig to provide a target-specific subclass.
 TargetPassConfig *PassConfig = TM->createPassConfig(PM);
+ PassConfig->setStartStopPasses(StartAfter, StopAfter);
 // Set PassConfig options provided by TargetMachine.
 PassConfig->setDisableVerify(DisableVerify);
@@ -120,7 +96,7 @@ static MCContext *addPassesToGenerateCode(LLVMTargetMachine *TM,
 PassConfig->addIRPasses();
- addPassesToHandleExceptions(TM, PM);
+ PassConfig->addPassesToHandleExceptions();
 PassConfig->addISelPrepare();
@@ -155,16 +131,30 @@ static MCContext *addPassesToGenerateCode(LLVMTargetMachine *TM,
 bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
 formatted_raw_ostream &Out,
 CodeGenFileType FileType,
- bool DisableVerify) {
+ bool DisableVerify,
+ AnalysisID StartAfter,
+ AnalysisID StopAfter) {
 // Add common CodeGen passes.
- MCContext *Context = addPassesToGenerateCode(this, PM, DisableVerify);
+ MCContext *Context = addPassesToGenerateCode(this, PM, DisableVerify,
+ StartAfter, StopAfter);
 if (!Context)
 return true;
+ if (StopAfter) {
+ // FIXME: The intent is that this should eventually write out a YAML file,
+ // containing the LLVM IR, the machine-level IR (when stopping after a
+ // machine-level pass), and whatever other information is needed to
+ // deserialize the code and resume compilation. For now, just write the
+ // LLVM IR.
+ PM.add(createPrintModulePass(&Out));
+ return false;
+ }
+
 if (hasMCSaveTempLabels())
 Context->setAllowTemporaryLabels(false);
 const MCAsmInfo &MAI = *getMCAsmInfo();
+ const MCRegisterInfo &MRI = *getRegisterInfo();
 const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>();
 OwningPtr<MCStreamer> AsmStreamer;
@@ -180,7 +170,8 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
 MCAsmBackend *MAB = 0;
 if (ShowMCEncoding) {
 const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>();
- MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), STI, *Context);
+ MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), MRI, STI,
+ *Context);
 MAB = getTarget().createMCAsmBackend(getTargetTriple());
 }
@@ -198,8 +189,8 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
 case CGFT_ObjectFile: {
 // Create the code emitter for the target if it exists. If not, .o file
 // emission fails.
- MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), STI, - *Context); + MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), MRI, + STI, *Context); MCAsmBackend *MAB = getTarget().createMCAsmBackend(getTargetTriple()); if (MCE == 0 || MAB == 0) return true; @@ -242,7 +233,7 @@ bool LLVMTargetMachine::addPassesToEmitMachineCode(PassManagerBase &PM, JITCodeEmitter &JCE, bool DisableVerify) { // Add common CodeGen passes. - MCContext *Context = addPassesToGenerateCode(this, PM, DisableVerify); + MCContext *Context = addPassesToGenerateCode(this, PM, DisableVerify, 0, 0); if (!Context) return true; @@ -262,7 +253,7 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM, raw_ostream &Out, bool DisableVerify) { // Add common CodeGen passes. - Ctx = addPassesToGenerateCode(this, PM, DisableVerify); + Ctx = addPassesToGenerateCode(this, PM, DisableVerify, 0, 0); if (!Ctx) return true; @@ -271,9 +262,10 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM, // Create the code emitter for the target if it exists. If not, .o file // emission fails. + const MCRegisterInfo &MRI = *getRegisterInfo(); const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>(); - MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(*getInstrInfo(),STI, - *Ctx); + MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), MRI, + STI, *Ctx); MCAsmBackend *MAB = getTarget().createMCAsmBackend(getTargetTriple()); if (MCE == 0 || MAB == 0) return true; diff --git a/lib/CodeGen/LexicalScopes.cpp b/lib/CodeGen/LexicalScopes.cpp index f1abcbb..6b6b9d0 100644 --- a/lib/CodeGen/LexicalScopes.cpp +++ b/lib/CodeGen/LexicalScopes.cpp @@ -16,8 +16,8 @@ #define DEBUG_TYPE "lexicalscopes" #include "llvm/CodeGen/LexicalScopes.h" +#include "llvm/DebugInfo.h" #include "llvm/Function.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/Support/Debug.h" diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp index 2187833..d631726 100644 --- a/lib/CodeGen/LiveDebugVariables.cpp +++ b/lib/CodeGen/LiveDebugVariables.cpp @@ -23,9 +23,9 @@ #include "LiveDebugVariables.h" #include "VirtRegMap.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" #include "llvm/Metadata.h" #include "llvm/Value.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/ADT/IntervalMap.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LexicalScopes.h" @@ -243,7 +243,7 @@ public: /// computeIntervals - Compute the live intervals of all locations after /// collecting all their def points. - void computeIntervals(MachineRegisterInfo &MRI, + void computeIntervals(MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, LiveIntervals &LIS, MachineDominatorTree &MDT, UserValueScopes &UVS); @@ -618,6 +618,7 @@ UserValue::addDefsFromCopies(LiveInterval *LI, unsigned LocNo, void UserValue::computeIntervals(MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, LiveIntervals &LIS, MachineDominatorTree &MDT, UserValueScopes &UVS) { @@ -634,15 +635,32 @@ UserValue::computeIntervals(MachineRegisterInfo &MRI, unsigned LocNo = Defs[i].second; const MachineOperand &Loc = locations[LocNo]; + if (!Loc.isReg()) { + extendDef(Idx, LocNo, 0, 0, 0, LIS, MDT, UVS); + continue; + } + // Register locations are constrained to where the register value is live. 
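// The hunk below splits the old single code path in two: virtual registers
// keep using their own LiveInterval, while a physical register is now
// approximated by the live range of its first register unit. Regunits are
// the aliasing-free atoms shared by overlapping registers, so dereferencing
// a freshly built iterator is the idiom for "first unit of Reg" (sketch,
// names as used in the hunk):
//
//   unsigned Unit = *MCRegUnitIterator(Loc.getReg(), &TRI); // first unit
//   LiveInterval *LI = &LIS.getRegUnit(Unit);               // its live range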
- if (Loc.isReg() && LIS.hasInterval(Loc.getReg())) { - LiveInterval *LI = &LIS.getInterval(Loc.getReg()); - const VNInfo *VNI = LI->getVNInfoAt(Idx); + if (TargetRegisterInfo::isVirtualRegister(Loc.getReg())) { + LiveInterval *LI = 0; + const VNInfo *VNI = 0; + if (LIS.hasInterval(Loc.getReg())) { + LI = &LIS.getInterval(Loc.getReg()); + VNI = LI->getVNInfoAt(Idx); + } SmallVector<SlotIndex, 16> Kills; extendDef(Idx, LocNo, LI, VNI, &Kills, LIS, MDT, UVS); - addDefsFromCopies(LI, LocNo, Kills, Defs, MRI, LIS); - } else - extendDef(Idx, LocNo, 0, 0, 0, LIS, MDT, UVS); + if (LI) + addDefsFromCopies(LI, LocNo, Kills, Defs, MRI, LIS); + continue; + } + + // For physregs, use the live range of the first regunit as a guide. + unsigned Unit = *MCRegUnitIterator(Loc.getReg(), &TRI); + LiveInterval *LI = &LIS.getRegUnit(Unit); + const VNInfo *VNI = LI->getVNInfoAt(Idx); + // Don't track copies from physregs, it is too expensive. + extendDef(Idx, LocNo, LI, VNI, 0, LIS, MDT, UVS); } // Finally, erase all the undefs. @@ -656,7 +674,7 @@ UserValue::computeIntervals(MachineRegisterInfo &MRI, void LDVImpl::computeIntervals() { for (unsigned i = 0, e = userValues.size(); i != e; ++i) { UserValueScopes UVS(userValues[i]->getDebugLoc(), LS); - userValues[i]->computeIntervals(MF->getRegInfo(), *LIS, *MDT, UVS); + userValues[i]->computeIntervals(MF->getRegInfo(), *TRI, *LIS, *MDT, UVS); userValues[i]->mapVirtRegs(this); } } @@ -721,7 +739,8 @@ renameRegister(unsigned OldReg, unsigned NewReg, unsigned SubIdx) { if (TargetRegisterInfo::isVirtualRegister(NewReg)) mapVirtReg(NewReg, UV); - virtRegToEqClass.erase(OldReg); + if (OldReg != NewReg) + virtRegToEqClass.erase(OldReg); do { UV->renameRegister(OldReg, NewReg, SubIdx, TRI); diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp index ac18843..01077db 100644 --- a/lib/CodeGen/LiveInterval.cpp +++ b/lib/CodeGen/LiveInterval.cpp @@ -48,6 +48,26 @@ LiveInterval::iterator LiveInterval::find(SlotIndex Pos) { return I; } +VNInfo *LiveInterval::createDeadDef(SlotIndex Def, + VNInfo::Allocator &VNInfoAllocator) { + assert(!Def.isDead() && "Cannot define a value at the dead slot"); + iterator I = find(Def); + if (I == end()) { + VNInfo *VNI = getNextValue(Def, VNInfoAllocator); + ranges.push_back(LiveRange(Def, Def.getDeadSlot(), VNI)); + return VNI; + } + if (SlotIndex::isSameInstr(Def, I->start)) { + assert(I->start == Def && "Cannot insert def, already live"); + assert(I->valno->def == Def && "Inconsistent existing value def"); + return I->valno; + } + assert(SlotIndex::isEarlierInstr(Def, I->start) && "Already live at def"); + VNInfo *VNI = getNextValue(Def, VNInfoAllocator); + ranges.insert(I, LiveRange(Def, Def.getDeadSlot(), VNI)); + return VNI; +} + /// killedInRange - Return true if the interval has kills in [Start,End). bool LiveInterval::killedInRange(SlotIndex Start, SlotIndex End) const { Ranges::const_iterator r = @@ -176,16 +196,16 @@ void LiveInterval::extendIntervalEndTo(Ranges::iterator I, SlotIndex NewEnd) { // If NewEnd was in the middle of an interval, make sure to get its endpoint. I->end = std::max(NewEnd, prior(MergeTo)->end); - // Erase any dead ranges. - ranges.erase(llvm::next(I), MergeTo); - // If the newly formed range now touches the range after it and if they have // the same value number, merge the two ranges into one range. 
- Ranges::iterator Next = llvm::next(I);
- if (Next != ranges.end() && Next->start <= I->end && Next->valno == ValNo) {
- I->end = Next->end;
- ranges.erase(Next);
+ if (MergeTo != ranges.end() && MergeTo->start <= I->end &&
+ MergeTo->valno == ValNo) {
+ I->end = MergeTo->end;
+ ++MergeTo;
 }
+
+ // Erase any dead ranges.
+ ranges.erase(llvm::next(I), MergeTo);
 }
@@ -353,18 +373,6 @@ void LiveInterval::removeValNo(VNInfo *ValNo) {
 markValNoForDeletion(ValNo);
 }
-/// findDefinedVNInfo - Find the VNInfo defined by the specified
-/// index (register interval).
-VNInfo *LiveInterval::findDefinedVNInfoForRegInt(SlotIndex Idx) const {
- for (LiveInterval::const_vni_iterator i = vni_begin(), e = vni_end();
- i != e; ++i) {
- if ((*i)->def == Idx)
- return *i;
- }
-
- return 0;
-}
-
 /// join - Join two live intervals (this, and other) together. This applies
 /// mappings to the value numbers in the LHS/RHS intervals as specified. If
 /// the intervals are not joinable, this aborts.
@@ -373,6 +381,8 @@ void LiveInterval::join(LiveInterval &Other,
 const int *RHSValNoAssignments,
 SmallVector<VNInfo*, 16> &NewVNInfo,
 MachineRegisterInfo *MRI) {
+ verify();
+
 // Determine if any of our live range values are mapped. This is uncommon, so
 // we want to avoid the interval scan if not.
 bool MustMapCurValNos = false;
@@ -440,16 +450,148 @@ void LiveInterval::join(LiveInterval &Other,
 valnos.resize(NumNewVals); // shrinkify
 // Okay, now insert the RHS live ranges into the LHS.
- iterator InsertPos = begin();
 unsigned RangeNo = 0;
 for (iterator I = Other.begin(), E = Other.end(); I != E; ++I, ++RangeNo) {
 // Map the valno in the other live range to the current live range.
 I->valno = NewVNInfo[OtherAssignments[RangeNo]];
 assert(I->valno && "Adding a dead range?");
- InsertPos = addRangeFrom(*I, InsertPos);
+ }
+ mergeIntervalRanges(Other);
+
+ verify();
+}
+
+/// \brief Helper function for merging in another LiveInterval's ranges.
+///
+/// This is a helper routine implementing an efficient merge of another
+/// LiveInterval's ranges into the current interval.
+///
+/// \param LHSValNo If non-NULL, set as the new value number for every range
+/// from RHS which is merged into the LHS.
+/// \param RHSValNo If non-NULL, then only ranges in RHS whose original value
+/// number matches this value number will be merged into LHS.
+void LiveInterval::mergeIntervalRanges(const LiveInterval &RHS,
+ VNInfo *LHSValNo,
+ const VNInfo *RHSValNo) {
+ if (RHS.empty())
+ return;
+
+ // Ensure we're starting with a valid range. Note that we don't verify RHS
+ // because it may have had its value numbers adjusted in preparation for
+ // merging.
+ verify();
+
+ // The strategy for merging these efficiently is as follows:
+ //
+ // 1) Find the beginning of the impacted ranges in the LHS.
+ // 2) Create a new, merged sub-sequence of ranges merging from the position in
+ // #1 until either LHS or RHS is exhausted. Any part of LHS between RHS
+ // entries being merged will be copied into this new range.
+ // 3) Replace the relevant section in LHS with these newly merged ranges.
+ // 4) Append any remaining ranges from RHS if LHS is exhausted in #2.
+ //
+ // We don't follow the typical in-place merge strategy for sorted ranges of
+ // appending the new ranges to the back and then using std::inplace_merge
+ // because one step of the merge can both mutate the original elements and
+ // remove elements from the original. Essentially, because the merge includes
+ // collapsing overlapping ranges, a more complex approach is required.
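// Stripped of value numbers, that strategy reads as the following
// self-contained editorial sketch (half-open [start,end) ranges; step 3 is
// simplified to rebuild the whole tail rather than only the impacted middle):
//
//   #include <algorithm>
//   #include <utility>
//   #include <vector>
//   typedef std::pair<int, int> Range; // half-open [start, end)
//
//   void mergeRanges(std::vector<Range> &LHS, const std::vector<Range> &RHS) {
//     if (RHS.empty()) return;
//     // 1) Binary-search for the first LHS range that can interact with
//     //    RHS, backing up one so the preceding range can be extended.
//     std::vector<Range>::iterator LI =
//         std::upper_bound(LHS.begin(), LHS.end(), RHS.front());
//     if (LI != LHS.begin()) --LI;
//     std::vector<Range>::iterator ReplaceBegin = LI, LE = LHS.end();
//     std::vector<Range>::const_iterator RI = RHS.begin();
//     // 2) Merge both sequences into NewRanges, coalescing ranges that
//     //    touch or overlap.
//     std::vector<Range> NewRanges;
//     auto push = [&NewRanges](const Range &R) {
//       if (!NewRanges.empty() && R.first <= NewRanges.back().second)
//         NewRanges.back().second = std::max(NewRanges.back().second,
//                                            R.second);
//       else
//         NewRanges.push_back(R);
//     };
//     while (LI != LE && RI != RHS.end())
//       push(*RI < *LI ? *RI++ : *LI++); // earliest start goes first
//     // 4) Append whatever survives of either side.
//     while (LI != LE) push(*LI++);
//     while (RI != RHS.end()) push(*RI++);
//     // 3) Splice the merged subsequence over the region it replaces.
//     LHS.erase(ReplaceBegin, LHS.end());
//     LHS.insert(LHS.end(), NewRanges.begin(), NewRanges.end());
//   }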
+ + // We do an initial binary search to optimize for a common pattern: a large + // LHS, and a very small RHS. + const_iterator RI = RHS.begin(), RE = RHS.end(); + iterator LE = end(), LI = std::upper_bound(begin(), LE, *RI); + + // Merge into NewRanges until one of the ranges is exhausted. + SmallVector<LiveRange, 4> NewRanges; + + // Keep track of where to begin the replacement. + iterator ReplaceI = LI; + + // If there are preceding ranges in the LHS, put the last one into NewRanges + // so we can optionally extend it. Adjust the replacement point accordingly. + if (LI != begin()) { + ReplaceI = llvm::prior(LI); + NewRanges.push_back(*ReplaceI); + } + + // Now loop over the mergable portions of both LHS and RHS, merging into + // NewRanges. + while (LI != LE && RI != RE) { + // Skip incoming ranges with the wrong value. + if (RHSValNo && RI->valno != RHSValNo) { + ++RI; + continue; + } + + // Select the first range. We pick the earliest start point, and then the + // largest range. + LiveRange R = *LI; + if (*RI < R) { + R = *RI; + ++RI; + if (LHSValNo) + R.valno = LHSValNo; + } else { + ++LI; + } + + if (NewRanges.empty()) { + NewRanges.push_back(R); + continue; + } + + LiveRange &LastR = NewRanges.back(); + if (R.valno == LastR.valno) { + // Try to merge this range into the last one. + if (R.start <= LastR.end) { + LastR.end = std::max(LastR.end, R.end); + continue; + } + } else { + // We can't merge ranges across a value number. + assert(R.start >= LastR.end && + "Cannot overlap two LiveRanges with differing ValID's"); + } + + // If all else fails, just append the range. + NewRanges.push_back(R); + } + assert(RI == RE || LI == LE); + + // Check for being able to merge into the trailing sequence of ranges on the LHS. + if (!NewRanges.empty()) + for (; LI != LE && (LI->valno == NewRanges.back().valno && + LI->start <= NewRanges.back().end); + ++LI) + NewRanges.back().end = std::max(NewRanges.back().end, LI->end); + + // Replace the ranges in the LHS with the newly merged ones. It would be + // really nice if there were a move-supporting 'replace' directly in + // SmallVector, but as there is not, we pay the price of copies to avoid + // wasted memory allocations. + SmallVectorImpl<LiveRange>::iterator NRI = NewRanges.begin(), + NRE = NewRanges.end(); + for (; ReplaceI != LI && NRI != NRE; ++ReplaceI, ++NRI) + *ReplaceI = *NRI; + if (NRI == NRE) + ranges.erase(ReplaceI, LI); + else + ranges.insert(LI, NRI, NRE); + + // And finally insert any trailing end of RHS (if we have one). + for (; RI != RE; ++RI) { + LiveRange R = *RI; + if (LHSValNo) + R.valno = LHSValNo; + if (!ranges.empty() && + ranges.back().valno == R.valno && R.start <= ranges.back().end) + ranges.back().end = std::max(ranges.back().end, R.end); + else + ranges.push_back(R); } - ComputeJoinedWeight(Other); + // Ensure we finished with a valid new sequence of ranges. + verify(); } /// MergeRangesInAsValue - Merge all of the intervals in RHS into this live @@ -458,38 +600,20 @@ void LiveInterval::join(LiveInterval &Other, /// the overlapping LiveRanges have the specified value number. void LiveInterval::MergeRangesInAsValue(const LiveInterval &RHS, VNInfo *LHSValNo) { - // TODO: Make this more efficient. - iterator InsertPos = begin(); - for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I) { - // Map the valno in the other live range to the current live range. 
-    LiveRange Tmp = *I;
-    Tmp.valno = LHSValNo;
-    InsertPos = addRangeFrom(Tmp, InsertPos);
-  }
+  mergeIntervalRanges(RHS, LHSValNo);
 }
 
-
 /// MergeValueInAsValue - Merge all of the live ranges of a specific val#
 /// in RHS into this live interval as the specified value number.
 /// The LiveRanges in RHS are allowed to overlap with LiveRanges in the
 /// current interval; it will replace the value numbers of the overlapped
 /// live ranges with the specified value number.
-void LiveInterval::MergeValueInAsValue(
-                                    const LiveInterval &RHS,
-                                    const VNInfo *RHSValNo, VNInfo *LHSValNo) {
-  // TODO: Make this more efficient.
-  iterator InsertPos = begin();
-  for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I) {
-    if (I->valno != RHSValNo)
-      continue;
-    // Map the valno in the other live range to the current live range.
-    LiveRange Tmp = *I;
-    Tmp.valno = LHSValNo;
-    InsertPos = addRangeFrom(Tmp, InsertPos);
-  }
+void LiveInterval::MergeValueInAsValue(const LiveInterval &RHS,
+                                       const VNInfo *RHSValNo,
+                                       VNInfo *LHSValNo) {
+  mergeIntervalRanges(RHS, LHSValNo, RHSValNo);
 }
 
-
 /// MergeValueNumberInto - This method is called when two value numbers
 /// are found to be equivalent.  This eliminates V1, replacing all
 /// LiveRanges with the V1 value number with the V2 value number.  This can
@@ -569,6 +693,8 @@ void LiveInterval::Copy(const LiveInterval &RHS,
     const LiveRange &LR = RHS.ranges[i];
     addRange(LiveRange(LR.start, LR.end, getValNumInfo(LR.valno->id)));
   }
+
+  verify();
 }
 
 unsigned LiveInterval::getSize() const {
@@ -578,29 +704,6 @@ unsigned LiveInterval::getSize() const {
   return Sum;
 }
 
-/// ComputeJoinedWeight - Set the weight of a live interval Joined
-/// after Other has been merged into it.
-void LiveInterval::ComputeJoinedWeight(const LiveInterval &Other) {
-  // If either of these intervals was spilled, the weight is the
-  // weight of the non-spilled interval.  This can only happen with
-  // iterative coalescers.
- - if (Other.weight != HUGE_VALF) { - weight += Other.weight; - } - else if (weight == HUGE_VALF && - !TargetRegisterInfo::isPhysicalRegister(reg)) { - // Remove this assert if you have an iterative coalescer - assert(0 && "Joining to spilled interval"); - weight = Other.weight; - } - else { - // Otherwise the weight stays the same - // Remove this assert if you have an iterative coalescer - assert(0 && "Joining from spilled interval"); - } -} - raw_ostream& llvm::operator<<(raw_ostream& os, const LiveRange &LR) { return os << '[' << LR.start << ',' << LR.end << ':' << LR.valno->id << ")"; } @@ -609,15 +712,10 @@ void LiveRange::dump() const { dbgs() << *this << "\n"; } -void LiveInterval::print(raw_ostream &OS, const TargetRegisterInfo *TRI) const { - OS << PrintReg(reg, TRI); - if (weight != 0) - OS << ',' << weight; - +void LiveInterval::print(raw_ostream &OS) const { if (empty()) - OS << " EMPTY"; + OS << "EMPTY"; else { - OS << " = "; for (LiveInterval::Ranges::const_iterator I = ranges.begin(), E = ranges.end(); I != E; ++I) { OS << *I; @@ -651,6 +749,23 @@ void LiveInterval::dump() const { dbgs() << *this << "\n"; } +#ifndef NDEBUG +void LiveInterval::verify() const { + for (const_iterator I = begin(), E = end(); I != E; ++I) { + assert(I->start.isValid()); + assert(I->end.isValid()); + assert(I->start < I->end); + assert(I->valno != 0); + assert(I->valno == valnos[I->valno->id]); + if (llvm::next(I) != E) { + assert(I->end <= llvm::next(I)->start); + if (I->end == llvm::next(I)->start) + assert(I->valno != llvm::next(I)->valno); + } + } +} +#endif + void LiveRange::print(raw_ostream &os) const { os << *this; @@ -718,7 +833,10 @@ void ConnectedVNInfoEqClasses::Distribute(LiveInterval *LIV[], SlotIndex Idx = LIS.getInstructionIndex(MI); Idx = Idx.getRegSlot(MO.isUse()); const VNInfo *VNI = LI.getVNInfoAt(Idx); - assert(VNI && "Interval not live at use."); + // FIXME: We should be able to assert(VNI) here, but the coalescer leaves + // dangling defs around. + if (!VNI) + continue; MO.setReg(LIV[getEqClass(VNI)]->reg); } diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp index 934cc12..819707f 100644 --- a/lib/CodeGen/LiveIntervalAnalysis.cpp +++ b/lib/CodeGen/LiveIntervalAnalysis.cpp @@ -20,30 +20,24 @@ #include "llvm/Value.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/Statistic.h" #include "llvm/ADT/STLExtras.h" +#include "LiveRangeCalc.h" #include <algorithm> #include <limits> #include <cmath> using namespace llvm; -// Hidden options for help debugging. 
-static cl::opt<bool> DisableReMat("disable-rematerialization", - cl::init(false), cl::Hidden); - -STATISTIC(numIntervals , "Number of original intervals"); - char LiveIntervals::ID = 0; INITIALIZE_PASS_BEGIN(LiveIntervals, "liveintervals", "Live Interval Analysis", false, false) @@ -61,23 +55,35 @@ void LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<LiveVariables>(); AU.addPreserved<LiveVariables>(); AU.addPreservedID(MachineLoopInfoID); + AU.addRequiredTransitiveID(MachineDominatorsID); AU.addPreservedID(MachineDominatorsID); AU.addPreserved<SlotIndexes>(); AU.addRequiredTransitive<SlotIndexes>(); MachineFunctionPass::getAnalysisUsage(AU); } +LiveIntervals::LiveIntervals() : MachineFunctionPass(ID), + DomTree(0), LRCalc(0) { + initializeLiveIntervalsPass(*PassRegistry::getPassRegistry()); +} + +LiveIntervals::~LiveIntervals() { + delete LRCalc; +} + void LiveIntervals::releaseMemory() { // Free the live intervals themselves. - for (DenseMap<unsigned, LiveInterval*>::iterator I = r2iMap_.begin(), - E = r2iMap_.end(); I != E; ++I) - delete I->second; - - r2iMap_.clear(); + for (unsigned i = 0, e = VirtRegIntervals.size(); i != e; ++i) + delete VirtRegIntervals[TargetRegisterInfo::index2VirtReg(i)]; + VirtRegIntervals.clear(); RegMaskSlots.clear(); RegMaskBits.clear(); RegMaskBlocks.clear(); + for (unsigned i = 0, e = RegUnitIntervals.size(); i != e; ++i) + delete RegUnitIntervals[i]; + RegUnitIntervals.clear(); + // Release VNInfo memory regions, VNInfo objects don't need to be dtor'd. VNInfoAllocator.Reset(); } @@ -85,20 +91,22 @@ void LiveIntervals::releaseMemory() { /// runOnMachineFunction - Register allocate the whole function /// bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) { - mf_ = &fn; - mri_ = &mf_->getRegInfo(); - tm_ = &fn.getTarget(); - tri_ = tm_->getRegisterInfo(); - tii_ = tm_->getInstrInfo(); - aa_ = &getAnalysis<AliasAnalysis>(); - lv_ = &getAnalysis<LiveVariables>(); - indexes_ = &getAnalysis<SlotIndexes>(); - allocatableRegs_ = tri_->getAllocatableSet(fn); - reservedRegs_ = tri_->getReservedRegs(fn); + MF = &fn; + MRI = &MF->getRegInfo(); + TM = &fn.getTarget(); + TRI = TM->getRegisterInfo(); + TII = TM->getInstrInfo(); + AA = &getAnalysis<AliasAnalysis>(); + LV = &getAnalysis<LiveVariables>(); + Indexes = &getAnalysis<SlotIndexes>(); + DomTree = &getAnalysis<MachineDominatorTree>(); + if (!LRCalc) + LRCalc = new LiveRangeCalc(); + AllocatableRegs = TRI->getAllocatableSet(fn); + ReservedRegs = TRI->getReservedRegs(fn); computeIntervals(); - - numIntervals += getNumIntervals(); + computeLiveInRegUnits(); DEBUG(dump()); return true; @@ -108,27 +116,24 @@ bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) { void LiveIntervals::print(raw_ostream &OS, const Module* ) const { OS << "********** INTERVALS **********\n"; - // Dump the physregs. - for (unsigned Reg = 1, RegE = tri_->getNumRegs(); Reg != RegE; ++Reg) - if (const LiveInterval *LI = r2iMap_.lookup(Reg)) { - LI->print(OS, tri_); - OS << '\n'; - } + // Dump the regunits. + for (unsigned i = 0, e = RegUnitIntervals.size(); i != e; ++i) + if (LiveInterval *LI = RegUnitIntervals[i]) + OS << PrintRegUnit(i, TRI) << " = " << *LI << '\n'; // Dump the virtregs. 
- for (unsigned Reg = 0, RegE = mri_->getNumVirtRegs(); Reg != RegE; ++Reg) - if (const LiveInterval *LI = - r2iMap_.lookup(TargetRegisterInfo::index2VirtReg(Reg))) { - LI->print(OS, tri_); - OS << '\n'; - } + for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { + unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + if (hasInterval(Reg)) + OS << PrintReg(Reg) << " = " << getInterval(Reg) << '\n'; + } printInstrs(OS); } void LiveIntervals::printInstrs(raw_ostream &OS) const { OS << "********** MACHINEINSTRS **********\n"; - mf_->print(OS, indexes_); + MF->print(OS, Indexes); } void LiveIntervals::dumpInstrs() const { @@ -176,13 +181,13 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, MachineOperand& MO, unsigned MOIdx, LiveInterval &interval) { - DEBUG(dbgs() << "\t\tregister: " << PrintReg(interval.reg, tri_)); + DEBUG(dbgs() << "\t\tregister: " << PrintReg(interval.reg, TRI)); // Virtual registers may be defined multiple times (due to phi // elimination and 2-addr elimination). Much of what we do only has to be // done once for the vreg. We use an empty interval to detect the first // time we see a vreg. - LiveVariables::VarInfo& vi = lv_->getVarInfo(interval.reg); + LiveVariables::VarInfo& vi = LV->getVarInfo(interval.reg); if (interval.empty()) { // Get the Idx of the defining instructions. SlotIndex defIndex = MIIdx.getRegSlot(MO.isEarlyClobber()); @@ -226,11 +231,11 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, DEBUG(dbgs() << " +" << NewLR); interval.addRange(NewLR); - bool PHIJoin = lv_->isPHIJoin(interval.reg); + bool PHIJoin = LV->isPHIJoin(interval.reg); if (PHIJoin) { - // A phi join register is killed at the end of the MBB and revived as a new - // valno in the killing blocks. + // A phi join register is killed at the end of the MBB and revived as a + // new valno in the killing blocks. assert(vi.AliveBlocks.empty() && "Phi join can't pass through blocks"); DEBUG(dbgs() << " phi-join"); ValNo->setHasPHIKill(true); @@ -240,8 +245,9 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, // live interval. for (SparseBitVector<>::iterator I = vi.AliveBlocks.begin(), E = vi.AliveBlocks.end(); I != E; ++I) { - MachineBasicBlock *aliveBlock = mf_->getBlockNumbered(*I); - LiveRange LR(getMBBStartIdx(aliveBlock), getMBBEndIdx(aliveBlock), ValNo); + MachineBasicBlock *aliveBlock = MF->getBlockNumbered(*I); + LiveRange LR(getMBBStartIdx(aliveBlock), getMBBEndIdx(aliveBlock), + ValNo); interval.addRange(LR); DEBUG(dbgs() << " +" << LR); } @@ -319,11 +325,8 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, interval.addRange(LiveRange(RedefIndex, RedefIndex.getDeadSlot(), OldValNo)); - DEBUG({ - dbgs() << " RESULT: "; - interval.print(dbgs(), tri_); - }); - } else if (lv_->isPHIJoin(interval.reg)) { + DEBUG(dbgs() << " RESULT: " << interval); + } else if (LV->isPHIJoin(interval.reg)) { // In the case of PHI elimination, each variable definition is only // live until the end of the block. We've already taken care of the // rest of the live range. 
@@ -347,101 +350,6 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, DEBUG(dbgs() << '\n'); } -static bool isRegLiveIntoSuccessor(const MachineBasicBlock *MBB, unsigned Reg) { - for (MachineBasicBlock::const_succ_iterator SI = MBB->succ_begin(), - SE = MBB->succ_end(); - SI != SE; ++SI) { - const MachineBasicBlock* succ = *SI; - if (succ->isLiveIn(Reg)) - return true; - } - return false; -} - -void LiveIntervals::handlePhysicalRegisterDef(MachineBasicBlock *MBB, - MachineBasicBlock::iterator mi, - SlotIndex MIIdx, - MachineOperand& MO, - LiveInterval &interval) { - DEBUG(dbgs() << "\t\tregister: " << PrintReg(interval.reg, tri_)); - - SlotIndex baseIndex = MIIdx; - SlotIndex start = baseIndex.getRegSlot(MO.isEarlyClobber()); - SlotIndex end = start; - - // If it is not used after definition, it is considered dead at - // the instruction defining it. Hence its interval is: - // [defSlot(def), defSlot(def)+1) - // For earlyclobbers, the defSlot was pushed back one; the extra - // advance below compensates. - if (MO.isDead()) { - DEBUG(dbgs() << " dead"); - end = start.getDeadSlot(); - goto exit; - } - - // If it is not dead on definition, it must be killed by a - // subsequent instruction. Hence its interval is: - // [defSlot(def), useSlot(kill)+1) - baseIndex = baseIndex.getNextIndex(); - while (++mi != MBB->end()) { - - if (mi->isDebugValue()) - continue; - if (getInstructionFromIndex(baseIndex) == 0) - baseIndex = indexes_->getNextNonNullIndex(baseIndex); - - if (mi->killsRegister(interval.reg, tri_)) { - DEBUG(dbgs() << " killed"); - end = baseIndex.getRegSlot(); - goto exit; - } else { - int DefIdx = mi->findRegisterDefOperandIdx(interval.reg,false,false,tri_); - if (DefIdx != -1) { - if (mi->isRegTiedToUseOperand(DefIdx)) { - // Two-address instruction. - end = baseIndex.getRegSlot(mi->getOperand(DefIdx).isEarlyClobber()); - } else { - // Another instruction redefines the register before it is ever read. - // Then the register is essentially dead at the instruction that - // defines it. Hence its interval is: - // [defSlot(def), defSlot(def)+1) - DEBUG(dbgs() << " dead"); - end = start.getDeadSlot(); - } - goto exit; - } - } - - baseIndex = baseIndex.getNextIndex(); - } - - // If we get here the register *should* be live out. - assert(!isAllocatable(interval.reg) && "Physregs shouldn't be live out!"); - - // FIXME: We need saner rules for reserved regs. - if (isReserved(interval.reg)) { - end = start.getDeadSlot(); - } else { - // Unreserved, unallocable registers like EFLAGS can be live across basic - // block boundaries. - assert(isRegLiveIntoSuccessor(MBB, interval.reg) && - "Unreserved reg not live-out?"); - end = getMBBEndIdx(MBB); - } -exit: - assert(start < end && "did not find end of interval?"); - - // Already exists? Extend old live interval. 
- VNInfo *ValNo = interval.getVNInfoAt(start); - bool Extend = ValNo != 0; - if (!Extend) - ValNo = interval.getNextValue(start, VNInfoAllocator); - LiveRange LR(start, end, ValNo); - interval.addRange(LR); - DEBUG(dbgs() << " +" << LR << '\n'); -} - void LiveIntervals::handleRegisterDef(MachineBasicBlock *MBB, MachineBasicBlock::iterator MI, SlotIndex MIIdx, @@ -450,93 +358,6 @@ void LiveIntervals::handleRegisterDef(MachineBasicBlock *MBB, if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) handleVirtualRegisterDef(MBB, MI, MIIdx, MO, MOIdx, getOrCreateInterval(MO.getReg())); - else - handlePhysicalRegisterDef(MBB, MI, MIIdx, MO, - getOrCreateInterval(MO.getReg())); -} - -void LiveIntervals::handleLiveInRegister(MachineBasicBlock *MBB, - SlotIndex MIIdx, - LiveInterval &interval) { - assert(TargetRegisterInfo::isPhysicalRegister(interval.reg) && - "Only physical registers can be live in."); - assert((!isAllocatable(interval.reg) || MBB->getParent()->begin() || - MBB->isLandingPad()) && - "Allocatable live-ins only valid for entry blocks and landing pads."); - - DEBUG(dbgs() << "\t\tlivein register: " << PrintReg(interval.reg, tri_)); - - // Look for kills, if it reaches a def before it's killed, then it shouldn't - // be considered a livein. - MachineBasicBlock::iterator mi = MBB->begin(); - MachineBasicBlock::iterator E = MBB->end(); - // Skip over DBG_VALUE at the start of the MBB. - if (mi != E && mi->isDebugValue()) { - while (++mi != E && mi->isDebugValue()) - ; - if (mi == E) - // MBB is empty except for DBG_VALUE's. - return; - } - - SlotIndex baseIndex = MIIdx; - SlotIndex start = baseIndex; - if (getInstructionFromIndex(baseIndex) == 0) - baseIndex = indexes_->getNextNonNullIndex(baseIndex); - - SlotIndex end = baseIndex; - bool SeenDefUse = false; - - while (mi != E) { - if (mi->killsRegister(interval.reg, tri_)) { - DEBUG(dbgs() << " killed"); - end = baseIndex.getRegSlot(); - SeenDefUse = true; - break; - } else if (mi->modifiesRegister(interval.reg, tri_)) { - // Another instruction redefines the register before it is ever read. - // Then the register is essentially dead at the instruction that defines - // it. Hence its interval is: - // [defSlot(def), defSlot(def)+1) - DEBUG(dbgs() << " dead"); - end = start.getDeadSlot(); - SeenDefUse = true; - break; - } - - while (++mi != E && mi->isDebugValue()) - // Skip over DBG_VALUE. - ; - if (mi != E) - baseIndex = indexes_->getNextNonNullIndex(baseIndex); - } - - // Live-in register might not be used at all. - if (!SeenDefUse) { - if (isAllocatable(interval.reg) || - !isRegLiveIntoSuccessor(MBB, interval.reg)) { - // Allocatable registers are never live through. - // Non-allocatable registers that aren't live into any successors also - // aren't live through. - DEBUG(dbgs() << " dead"); - return; - } else { - // If we get here the register is non-allocatable and live into some - // successor. We'll conservatively assume it's live-through. 
- DEBUG(dbgs() << " live through"); - end = getMBBEndIdx(MBB); - } - } - - SlotIndex defIdx = getMBBStartIdx(MBB); - assert(getInstructionFromIndex(defIdx) == 0 && - "PHI def index points at actual instruction."); - VNInfo *vni = interval.getNextValue(defIdx, VNInfoAllocator); - vni->setIsPHIDef(true); - LiveRange LR(start, end, vni); - - interval.addRange(LR); - DEBUG(dbgs() << " +" << LR << '\n'); } /// computeIntervals - computes the live intervals for virtual @@ -546,12 +367,12 @@ void LiveIntervals::handleLiveInRegister(MachineBasicBlock *MBB, void LiveIntervals::computeIntervals() { DEBUG(dbgs() << "********** COMPUTING LIVE INTERVALS **********\n" << "********** Function: " - << ((Value*)mf_->getFunction())->getName() << '\n'); + << ((Value*)MF->getFunction())->getName() << '\n'); - RegMaskBlocks.resize(mf_->getNumBlockIDs()); + RegMaskBlocks.resize(MF->getNumBlockIDs()); SmallVector<unsigned, 8> UndefUses; - for (MachineFunction::iterator MBBI = mf_->begin(), E = mf_->end(); + for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); MBBI != E; ++MBBI) { MachineBasicBlock *MBB = MBBI; RegMaskBlocks[MBB->getNumber()].first = RegMaskSlots.size(); @@ -564,22 +385,16 @@ void LiveIntervals::computeIntervals() { DEBUG(dbgs() << "BB#" << MBB->getNumber() << ":\t\t# derived from " << MBB->getName() << "\n"); - // Create intervals for live-ins to this BB first. - for (MachineBasicBlock::livein_iterator LI = MBB->livein_begin(), - LE = MBB->livein_end(); LI != LE; ++LI) { - handleLiveInRegister(MBB, MIIndex, getOrCreateInterval(*LI)); - } - // Skip over empty initial indices. if (getInstructionFromIndex(MIIndex) == 0) - MIIndex = indexes_->getNextNonNullIndex(MIIndex); + MIIndex = Indexes->getNextNonNullIndex(MIIndex); for (MachineBasicBlock::iterator MI = MBB->begin(), miEnd = MBB->end(); MI != miEnd; ++MI) { DEBUG(dbgs() << MIIndex << "\t" << *MI); if (MI->isDebugValue()) continue; - assert(indexes_->getInstructionFromIndex(MIIndex) == MI && + assert(Indexes->getInstructionFromIndex(MIIndex) == MI && "Lost SlotIndex synchronization"); // Handle defs. @@ -593,7 +408,7 @@ void LiveIntervals::computeIntervals() { continue; } - if (!MO.isReg() || !MO.getReg()) + if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg())) continue; // handle register defs - build intervals @@ -604,7 +419,7 @@ void LiveIntervals::computeIntervals() { } // Move to the next instr slot. - MIIndex = indexes_->getNextNonNullIndex(MIIndex); + MIIndex = Indexes->getNextNonNullIndex(MIIndex); } // Compute the number of register mask instructions in this block. @@ -626,14 +441,104 @@ LiveInterval* LiveIntervals::createInterval(unsigned reg) { return new LiveInterval(reg, Weight); } -/// dupInterval - Duplicate a live interval. The caller is responsible for -/// managing the allocated memory. -LiveInterval* LiveIntervals::dupInterval(LiveInterval *li) { - LiveInterval *NewLI = createInterval(li->reg); - NewLI->Copy(*li, mri_, getVNInfoAllocator()); - return NewLI; + +//===----------------------------------------------------------------------===// +// Register Unit Liveness +//===----------------------------------------------------------------------===// +// +// Fixed interference typically comes from ABI boundaries: Function arguments +// and return values are passed in fixed registers, and so are exception +// pointers entering landing pads. Certain instructions require values to be +// present in specific registers. That is also represented through fixed +// interference. 
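+//
+// As an illustrative sketch (not tied to any particular target): a call that
+// returns its result in a fixed physical register gives that register's units
+// a live range at the call site, so the allocator must keep any virtual
+// register whose live range overlaps the call out of registers sharing those
+// units.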
+// + +/// computeRegUnitInterval - Compute the live interval of a register unit, based +/// on the uses and defs of aliasing registers. The interval should be empty, +/// or contain only dead phi-defs from ABI blocks. +void LiveIntervals::computeRegUnitInterval(LiveInterval *LI) { + unsigned Unit = LI->reg; + + assert(LRCalc && "LRCalc not initialized."); + LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator()); + + // The physregs aliasing Unit are the roots and their super-registers. + // Create all values as dead defs before extending to uses. Note that roots + // may share super-registers. That's OK because createDeadDefs() is + // idempotent. It is very rare for a register unit to have multiple roots, so + // uniquing super-registers is probably not worthwhile. + for (MCRegUnitRootIterator Roots(Unit, TRI); Roots.isValid(); ++Roots) { + unsigned Root = *Roots; + if (!MRI->reg_empty(Root)) + LRCalc->createDeadDefs(LI, Root); + for (MCSuperRegIterator Supers(Root, TRI); Supers.isValid(); ++Supers) { + if (!MRI->reg_empty(*Supers)) + LRCalc->createDeadDefs(LI, *Supers); + } + } + + // Now extend LI to reach all uses. + // Ignore uses of reserved registers. We only track defs of those. + for (MCRegUnitRootIterator Roots(Unit, TRI); Roots.isValid(); ++Roots) { + unsigned Root = *Roots; + if (!isReserved(Root) && !MRI->reg_empty(Root)) + LRCalc->extendToUses(LI, Root); + for (MCSuperRegIterator Supers(Root, TRI); Supers.isValid(); ++Supers) { + unsigned Reg = *Supers; + if (!isReserved(Reg) && !MRI->reg_empty(Reg)) + LRCalc->extendToUses(LI, Reg); + } + } +} + + +/// computeLiveInRegUnits - Precompute the live ranges of any register units +/// that are live-in to an ABI block somewhere. Register values can appear +/// without a corresponding def when entering the entry block or a landing pad. +/// +void LiveIntervals::computeLiveInRegUnits() { + RegUnitIntervals.resize(TRI->getNumRegUnits()); + DEBUG(dbgs() << "Computing live-in reg-units in ABI blocks.\n"); + + // Keep track of the intervals allocated. + SmallVector<LiveInterval*, 8> NewIntvs; + + // Check all basic blocks for live-ins. + for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end(); + MFI != MFE; ++MFI) { + const MachineBasicBlock *MBB = MFI; + + // We only care about ABI blocks: Entry + landing pads. + if ((MFI != MF->begin() && !MBB->isLandingPad()) || MBB->livein_empty()) + continue; + + // Create phi-defs at Begin for all live-in registers. + SlotIndex Begin = Indexes->getMBBStartIdx(MBB); + DEBUG(dbgs() << Begin << "\tBB#" << MBB->getNumber()); + for (MachineBasicBlock::livein_iterator LII = MBB->livein_begin(), + LIE = MBB->livein_end(); LII != LIE; ++LII) { + for (MCRegUnitIterator Units(*LII, TRI); Units.isValid(); ++Units) { + unsigned Unit = *Units; + LiveInterval *Intv = RegUnitIntervals[Unit]; + if (!Intv) { + Intv = RegUnitIntervals[Unit] = new LiveInterval(Unit, HUGE_VALF); + NewIntvs.push_back(Intv); + } + VNInfo *VNI = Intv->createDeadDef(Begin, getVNInfoAllocator()); + (void)VNI; + DEBUG(dbgs() << ' ' << PrintRegUnit(Unit, TRI) << '#' << VNI->id); + } + } + DEBUG(dbgs() << '\n'); + } + DEBUG(dbgs() << "Created " << NewIntvs.size() << " new intervals.\n"); + + // Compute the 'normal' part of the intervals. + for (unsigned i = 0, e = NewIntvs.size(); i != e; ++i) + computeRegUnitInterval(NewIntvs[i]); } + /// shrinkToUses - After removing some uses of a register, shrink its live /// range to just the remaining uses. 
/// range to just the remaining uses. This method does not compute reaching
/// defs for new uses, and it doesn't remove dead defs.
@@ -649,14 +554,13 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
   SmallPtrSet<MachineBasicBlock*, 16> LiveOut;
 
   // Visit all instructions reading li->reg.
-  for (MachineRegisterInfo::reg_iterator I = mri_->reg_begin(li->reg);
+  for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(li->reg);
        MachineInstr *UseMI = I.skipInstruction();) {
     if (UseMI->isDebugValue() || !UseMI->readsVirtualRegister(li->reg))
       continue;
     SlotIndex Idx = getInstructionIndex(UseMI).getRegSlot();
-    // Note: This intentionally picks up the wrong VNI in case of an EC redef.
-    // See below.
-    VNInfo *VNI = li->getVNInfoBefore(Idx);
+    LiveRangeQuery LRQ(*li, Idx);
+    VNInfo *VNI = LRQ.valueIn();
     if (!VNI) {
       // This shouldn't happen: readsVirtualRegister returns true, but there is
       // no live value. It is likely caused by a target getting <undef> flags
@@ -667,13 +571,10 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
       continue;
     }
     // Special case: An early-clobber tied operand reads and writes the
-    // register one slot early. The getVNInfoBefore call above would have
-    // picked up the value defined by UseMI. Adjust the kill slot and value.
-    if (SlotIndex::isSameInstr(VNI->def, Idx)) {
-      Idx = VNI->def;
-      VNI = li->getVNInfoBefore(Idx);
-      assert(VNI && "Early-clobber tied value not available");
-    }
+    // register one slot early.
+    if (VNInfo *DefVNI = LRQ.valueDefined())
+      Idx = DefVNI->def;
+
     WorkList.push_back(std::make_pair(Idx, VNI));
   }
 
@@ -755,7 +656,7 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
     // This is a dead def. Make sure the instruction knows.
     MachineInstr *MI = getInstructionFromIndex(VNI->def);
     assert(MI && "No instruction defining live value");
-    MI->addRegisterDead(li->reg, tri_);
+    MI->addRegisterDead(li->reg, TRI);
     if (dead && MI->allDefsAreDead()) {
       DEBUG(dbgs() << "All defs dead: " << VNI->def << '\t' << *MI);
       dead->push_back(MI);
@@ -775,13 +676,11 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
 //
 void LiveIntervals::addKillFlags() {
-  for (iterator I = begin(), E = end(); I != E; ++I) {
-    unsigned Reg = I->first;
-    if (TargetRegisterInfo::isPhysicalRegister(Reg))
-      continue;
-    if (mri_->reg_nodbg_empty(Reg))
+  for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+    if (MRI->reg_nodbg_empty(Reg))
       continue;
-    LiveInterval *LI = I->second;
+    LiveInterval *LI = &getInterval(Reg);
 
     // Every instruction that kills Reg corresponds to a live range end point.
     for (LiveInterval::iterator RI = LI->begin(), RE = LI->end(); RI != RE;
@@ -797,101 +696,6 @@ void LiveIntervals::addKillFlags() {
   }
 }
 
-/// getReMatImplicitUse - If the remat definition MI has one (for now, we only
-/// allow one) virtual register operand, then its uses are implicitly using
-/// the register. Returns the virtual register.
-unsigned LiveIntervals::getReMatImplicitUse(const LiveInterval &li,
-                                            MachineInstr *MI) const {
-  unsigned RegOp = 0;
-  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    MachineOperand &MO = MI->getOperand(i);
-    if (!MO.isReg() || !MO.isUse())
-      continue;
-    unsigned Reg = MO.getReg();
-    if (Reg == 0 || Reg == li.reg)
-      continue;
-
-    if (TargetRegisterInfo::isPhysicalRegister(Reg) && !isAllocatable(Reg))
-      continue;
-    RegOp = MO.getReg();
-    break; // Found vreg operand - leave the loop.
- } - return RegOp; -} - -/// isValNoAvailableAt - Return true if the val# of the specified interval -/// which reaches the given instruction also reaches the specified use index. -bool LiveIntervals::isValNoAvailableAt(const LiveInterval &li, MachineInstr *MI, - SlotIndex UseIdx) const { - VNInfo *UValNo = li.getVNInfoAt(UseIdx); - return UValNo && UValNo == li.getVNInfoAt(getInstructionIndex(MI)); -} - -/// isReMaterializable - Returns true if the definition MI of the specified -/// val# of the specified interval is re-materializable. -bool -LiveIntervals::isReMaterializable(const LiveInterval &li, - const VNInfo *ValNo, MachineInstr *MI, - const SmallVectorImpl<LiveInterval*> *SpillIs, - bool &isLoad) { - if (DisableReMat) - return false; - - if (!tii_->isTriviallyReMaterializable(MI, aa_)) - return false; - - // Target-specific code can mark an instruction as being rematerializable - // if it has one virtual reg use, though it had better be something like - // a PIC base register which is likely to be live everywhere. - unsigned ImpUse = getReMatImplicitUse(li, MI); - if (ImpUse) { - const LiveInterval &ImpLi = getInterval(ImpUse); - for (MachineRegisterInfo::use_nodbg_iterator - ri = mri_->use_nodbg_begin(li.reg), re = mri_->use_nodbg_end(); - ri != re; ++ri) { - MachineInstr *UseMI = &*ri; - SlotIndex UseIdx = getInstructionIndex(UseMI); - if (li.getVNInfoAt(UseIdx) != ValNo) - continue; - if (!isValNoAvailableAt(ImpLi, MI, UseIdx)) - return false; - } - - // If a register operand of the re-materialized instruction is going to - // be spilled next, then it's not legal to re-materialize this instruction. - if (SpillIs) - for (unsigned i = 0, e = SpillIs->size(); i != e; ++i) - if (ImpUse == (*SpillIs)[i]->reg) - return false; - } - return true; -} - -/// isReMaterializable - Returns true if every definition of MI of every -/// val# of the specified interval is re-materializable. -bool -LiveIntervals::isReMaterializable(const LiveInterval &li, - const SmallVectorImpl<LiveInterval*> *SpillIs, - bool &isLoad) { - isLoad = false; - for (LiveInterval::const_vni_iterator i = li.vni_begin(), e = li.vni_end(); - i != e; ++i) { - const VNInfo *VNI = *i; - if (VNI->isUnused()) - continue; // Dead val#. - // Is the def for the val# rematerializable? - MachineInstr *ReMatDefMI = getInstructionFromIndex(VNI->def); - if (!ReMatDefMI) - return false; - bool DefIsLoad = false; - if (!ReMatDefMI || - !isReMaterializable(li, VNI, ReMatDefMI, SpillIs, DefIsLoad)) - return false; - isLoad |= DefIsLoad; - } - return true; -} - MachineBasicBlock* LiveIntervals::intervalIsInOneMBB(const LiveInterval &LI) const { // A local live range must be fully contained inside the block, meaning it is @@ -911,8 +715,8 @@ LiveIntervals::intervalIsInOneMBB(const LiveInterval &LI) const { // getMBBFromIndex doesn't need to search the MBB table when both indexes // belong to proper instructions. - MachineBasicBlock *MBB1 = indexes_->getMBBFromIndex(Start); - MachineBasicBlock *MBB2 = indexes_->getMBBFromIndex(Stop); + MachineBasicBlock *MBB1 = Indexes->getMBBFromIndex(Start); + MachineBasicBlock *MBB2 = Indexes->getMBBFromIndex(Stop); return MBB1 == MBB2 ? MBB1 : NULL; } @@ -990,7 +794,7 @@ bool LiveIntervals::checkRegMaskInterference(LiveInterval &LI, if (!Found) { // This is the first overlap. Initialize UsableRegs to all ones. UsableRegs.clear(); - UsableRegs.resize(tri_->getNumRegs(), true); + UsableRegs.resize(TRI->getNumRegs(), true); Found = true; } // Remove usable registers clobbered by this mask. 
@@ -1101,6 +905,9 @@ public: BundleRanges BR = createBundleRanges(Entering, Internal, Exiting); + Entering.clear(); + Internal.clear(); + Exiting.clear(); collectRanges(MI, Entering, Internal, Exiting, hasRegMaskOp, OldIdx); assert(!hasRegMaskOp && "Can't have RegMask operand in bundle."); @@ -1176,78 +983,44 @@ private: // TODO: Currently we're skipping uses that are reserved or have no // interval, but we're not updating their kills. This should be // fixed. - if (!LIS.hasInterval(Reg) || - (TargetRegisterInfo::isPhysicalRegister(Reg) && LIS.isReserved(Reg))) + if (TargetRegisterInfo::isPhysicalRegister(Reg) && LIS.isReserved(Reg)) continue; - LiveInterval* LI = &LIS.getInterval(Reg); - - if (MO.readsReg()) { - LiveRange* LR = LI->getLiveRangeContaining(OldIdx); - if (LR != 0) - Entering.insert(std::make_pair(LI, LR)); - } - if (MO.isDef()) { - if (MO.isEarlyClobber()) { - LiveRange* LR = LI->getLiveRangeContaining(OldIdx.getRegSlot(true)); - assert(LR != 0 && "No EC range?"); - if (LR->end > OldIdx.getDeadSlot()) - Exiting.insert(std::make_pair(LI, LR)); - else - Internal.insert(std::make_pair(LI, LR)); - } else if (MO.isDead()) { - LiveRange* LR = LI->getLiveRangeContaining(OldIdx.getRegSlot()); - assert(LR != 0 && "No dead-def range?"); - Internal.insert(std::make_pair(LI, LR)); - } else { - LiveRange* LR = LI->getLiveRangeContaining(OldIdx.getDeadSlot()); - assert(LR && LR->end > OldIdx.getDeadSlot() && - "Non-dead-def should have live range exiting."); - Exiting.insert(std::make_pair(LI, LR)); - } + // Collect ranges for register units. These live ranges are computed on + // demand, so just skip any that haven't been computed yet. + if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units) + if (LiveInterval *LI = LIS.getCachedRegUnit(*Units)) + collectRanges(MO, LI, Entering, Internal, Exiting, OldIdx); + } else { + // Collect ranges for individual virtual registers. + collectRanges(MO, &LIS.getInterval(Reg), + Entering, Internal, Exiting, OldIdx); } } } - // Collect IntRangePairs for all operands of MI that may need fixing. - void collectRangesInBundle(MachineInstr* MI, RangeSet& Entering, - RangeSet& Exiting, SlotIndex MIStartIdx, - SlotIndex MIEndIdx) { - for (MachineInstr::mop_iterator MOI = MI->operands_begin(), - MOE = MI->operands_end(); - MOI != MOE; ++MOI) { - const MachineOperand& MO = *MOI; - assert(!MO.isRegMask() && "Can't have RegMasks in bundles."); - if (!MO.isReg() || MO.getReg() == 0) - continue; - - unsigned Reg = MO.getReg(); - - // TODO: Currently we're skipping uses that are reserved or have no - // interval, but we're not updating their kills. This should be - // fixed. 
- if (!LIS.hasInterval(Reg) || - (TargetRegisterInfo::isPhysicalRegister(Reg) && LIS.isReserved(Reg))) - continue; - - LiveInterval* LI = &LIS.getInterval(Reg); - - if (MO.readsReg()) { - LiveRange* LR = LI->getLiveRangeContaining(MIStartIdx); - if (LR != 0) - Entering.insert(std::make_pair(LI, LR)); - } - if (MO.isDef()) { - assert(!MO.isEarlyClobber() && "Early clobbers not allowed in bundles."); - assert(!MO.isDead() && "Dead-defs not allowed in bundles."); - LiveRange* LR = LI->getLiveRangeContaining(MIEndIdx.getDeadSlot()); - assert(LR != 0 && "Internal ranges not allowed in bundles."); + void collectRanges(const MachineOperand &MO, LiveInterval *LI, + RangeSet &Entering, RangeSet &Internal, RangeSet &Exiting, + SlotIndex OldIdx) { + if (MO.readsReg()) { + LiveRange* LR = LI->getLiveRangeContaining(OldIdx); + if (LR != 0) + Entering.insert(std::make_pair(LI, LR)); + } + if (MO.isDef()) { + LiveRange* LR = LI->getLiveRangeContaining(OldIdx.getRegSlot()); + assert(LR != 0 && "No live range for def?"); + if (LR->end > OldIdx.getDeadSlot()) Exiting.insert(std::make_pair(LI, LR)); - } + else + Internal.insert(std::make_pair(LI, LR)); } } - BundleRanges createBundleRanges(RangeSet& Entering, RangeSet& Internal, RangeSet& Exiting) { + BundleRanges createBundleRanges(RangeSet& Entering, + RangeSet& Internal, + RangeSet& Exiting) { BundleRanges BR; for (RangeSet::iterator EI = Entering.begin(), EE = Entering.end(); @@ -1284,7 +1057,8 @@ private: return; // Bail out if we don't have kill flags on the old register. MachineInstr* NewKillMI = LIS.getInstructionFromIndex(newKillIdx); assert(OldKillMI->killsRegister(reg) && "Old 'kill' instr isn't a kill."); - assert(!NewKillMI->killsRegister(reg) && "New kill instr is already a kill."); + assert(!NewKillMI->killsRegister(reg) && + "New kill instr is already a kill."); OldKillMI->clearRegisterKills(reg, &TRI); NewKillMI->addRegisterKilled(reg, &TRI); } @@ -1523,22 +1297,23 @@ private: }; void LiveIntervals::handleMove(MachineInstr* MI) { - SlotIndex OldIndex = indexes_->getInstructionIndex(MI); - indexes_->removeMachineInstrFromMaps(MI); + SlotIndex OldIndex = Indexes->getInstructionIndex(MI); + Indexes->removeMachineInstrFromMaps(MI); SlotIndex NewIndex = MI->isInsideBundle() ? 
- indexes_->getInstructionIndex(MI) : - indexes_->insertMachineInstrInMaps(MI); + Indexes->getInstructionIndex(MI) : + Indexes->insertMachineInstrInMaps(MI); assert(getMBBStartIdx(MI->getParent()) <= OldIndex && OldIndex < getMBBEndIdx(MI->getParent()) && "Cannot handle moves across basic block boundaries."); assert(!MI->isBundled() && "Can't handle bundled instructions yet."); - HMEditor HME(*this, *mri_, *tri_, NewIndex); + HMEditor HME(*this, *MRI, *TRI, NewIndex); HME.moveAllRangesFrom(MI, OldIndex); } -void LiveIntervals::handleMoveIntoBundle(MachineInstr* MI, MachineInstr* BundleStart) { - SlotIndex NewIndex = indexes_->getInstructionIndex(BundleStart); - HMEditor HME(*this, *mri_, *tri_, NewIndex); +void LiveIntervals::handleMoveIntoBundle(MachineInstr* MI, + MachineInstr* BundleStart) { + SlotIndex NewIndex = Indexes->getInstructionIndex(BundleStart); + HMEditor HME(*this, *MRI, *TRI, NewIndex); HME.moveAllRangesInto(MI, BundleStart); } diff --git a/lib/CodeGen/LiveIntervalUnion.cpp b/lib/CodeGen/LiveIntervalUnion.cpp index 60a6880..dadd02b 100644 --- a/lib/CodeGen/LiveIntervalUnion.cpp +++ b/lib/CodeGen/LiveIntervalUnion.cpp @@ -81,7 +81,6 @@ void LiveIntervalUnion::extract(LiveInterval &VirtReg) { void LiveIntervalUnion::print(raw_ostream &OS, const TargetRegisterInfo *TRI) const { - OS << "LIU " << PrintReg(RepReg, TRI); if (empty()) { OS << " empty\n"; return; @@ -209,3 +208,26 @@ bool LiveIntervalUnion::Query::checkLoopInterference(MachineLoopRange *Loop) { VRI = VirtReg->advanceTo(VRI, Overlaps.start()); } } + +void LiveIntervalUnion::Array::init(LiveIntervalUnion::Allocator &Alloc, + unsigned NSize) { + // Reuse existing allocation. + if (NSize == Size) + return; + clear(); + Size = NSize; + LIUs = static_cast<LiveIntervalUnion*>( + malloc(sizeof(LiveIntervalUnion)*NSize)); + for (unsigned i = 0; i != Size; ++i) + new(LIUs + i) LiveIntervalUnion(Alloc); +} + +void LiveIntervalUnion::Array::clear() { + if (!LIUs) + return; + for (unsigned i = 0; i != Size; ++i) + LIUs[i].~LiveIntervalUnion(); + free(LIUs); + Size = 0; + LIUs = 0; +} diff --git a/lib/CodeGen/LiveIntervalUnion.h b/lib/CodeGen/LiveIntervalUnion.h index dbf5ac1..cd4e690 100644 --- a/lib/CodeGen/LiveIntervalUnion.h +++ b/lib/CodeGen/LiveIntervalUnion.h @@ -60,13 +60,11 @@ public: class Query; private: - const unsigned RepReg; // representative register number unsigned Tag; // unique tag for current contents. LiveSegments Segments; // union of virtual reg segments public: - LiveIntervalUnion(unsigned r, Allocator &a) : RepReg(r), Tag(0), Segments(a) - {} + explicit LiveIntervalUnion(Allocator &a) : Tag(0), Segments(a) {} // Iterate over all segments in the union of live virtual registers ordered // by their starting position. @@ -183,6 +181,28 @@ public: Query(const Query&); // DO NOT IMPLEMENT void operator=(const Query&); // DO NOT IMPLEMENT }; + + // Array of LiveIntervalUnions. + class Array { + unsigned Size; + LiveIntervalUnion *LIUs; + public: + Array() : Size(0), LIUs(0) {} + ~Array() { clear(); } + + // Initialize the array to have Size entries. + // Reuse an existing allocation if the size matches. 
+ void init(LiveIntervalUnion::Allocator&, unsigned Size); + + unsigned size() const { return Size; } + + void clear(); + + LiveIntervalUnion& operator[](unsigned idx) { + assert(idx < Size && "idx out of bounds"); + return LIUs[idx]; + } + }; }; } // end namespace llvm diff --git a/lib/CodeGen/LiveRangeCalc.cpp b/lib/CodeGen/LiveRangeCalc.cpp index d8ab791..9384075 100644 --- a/lib/CodeGen/LiveRangeCalc.cpp +++ b/lib/CodeGen/LiveRangeCalc.cpp @@ -14,10 +14,19 @@ #define DEBUG_TYPE "regalloc" #include "LiveRangeCalc.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" using namespace llvm; -void LiveRangeCalc::reset(const MachineFunction *MF) { +void LiveRangeCalc::reset(const MachineFunction *MF, + SlotIndexes *SI, + MachineDominatorTree *MDT, + VNInfo::Allocator *VNIA) { + MRI = &MF->getRegInfo(); + Indexes = SI; + DomTree = MDT; + Alloc = VNIA; + unsigned N = MF->getNumBlockIDs(); Seen.clear(); Seen.resize(N); @@ -26,8 +35,73 @@ void LiveRangeCalc::reset(const MachineFunction *MF) { } +void LiveRangeCalc::createDeadDefs(LiveInterval *LI, unsigned Reg) { + assert(MRI && Indexes && "call reset() first"); + + // Visit all def operands. If the same instruction has multiple defs of Reg, + // LI->createDeadDef() will deduplicate. + for (MachineRegisterInfo::def_iterator + I = MRI->def_begin(Reg), E = MRI->def_end(); I != E; ++I) { + const MachineInstr *MI = &*I; + // Find the corresponding slot index. + SlotIndex Idx; + if (MI->isPHI()) + // PHI defs begin at the basic block start index. + Idx = Indexes->getMBBStartIdx(MI->getParent()); + else + // Instructions are either normal 'r', or early clobber 'e'. + Idx = Indexes->getInstructionIndex(MI) + .getRegSlot(I.getOperand().isEarlyClobber()); + + // Create the def in LI. This may find an existing def. + VNInfo *VNI = LI->createDeadDef(Idx, *Alloc); + VNI->setIsPHIDef(MI->isPHI()); + } +} + + +void LiveRangeCalc::extendToUses(LiveInterval *LI, unsigned Reg) { + assert(MRI && Indexes && "call reset() first"); + + // Visit all operands that read Reg. This may include partial defs. + for (MachineRegisterInfo::reg_nodbg_iterator I = MRI->reg_nodbg_begin(Reg), + E = MRI->reg_nodbg_end(); I != E; ++I) { + const MachineOperand &MO = I.getOperand(); + if (!MO.readsReg()) + continue; + // MI is reading Reg. We may have visited MI before if it happens to be + // reading Reg multiple times. That is OK, extend() is idempotent. + const MachineInstr *MI = &*I; + + // Find the SlotIndex being read. + SlotIndex Idx; + if (MI->isPHI()) { + assert(!MO.isDef() && "Cannot handle PHI def of partial register."); + // PHI operands are paired: (Reg, PredMBB). + // Extend the live range to be live-out from PredMBB. + Idx = Indexes->getMBBEndIdx(MI->getOperand(I.getOperandNo()+1).getMBB()); + } else { + // This is a normal instruction. + Idx = Indexes->getInstructionIndex(MI).getRegSlot(); + // Check for early-clobber redefs. + unsigned DefIdx; + if (MO.isDef()) { + if (MO.isEarlyClobber()) + Idx = Idx.getRegSlot(true); + } else if (MI->isRegTiedToDefOperand(I.getOperandNo(), &DefIdx)) { + // FIXME: This would be a lot easier if tied early-clobber uses also + // had an early-clobber flag. + if (MI->getOperand(DefIdx).isEarlyClobber()) + Idx = Idx.getRegSlot(true); + } + } + extend(LI, Idx, Reg); + } +} + + // Transfer information from the LiveIn vector to the live ranges. 
-void LiveRangeCalc::updateLiveIns(VNInfo *OverrideVNI, SlotIndexes *Indexes) { +void LiveRangeCalc::updateLiveIns(VNInfo *OverrideVNI) { for (SmallVectorImpl<LiveInBlock>::iterator I = LiveIn.begin(), E = LiveIn.end(); I != E; ++I) { if (!I->DomNode) @@ -56,9 +130,7 @@ void LiveRangeCalc::updateLiveIns(VNInfo *OverrideVNI, SlotIndexes *Indexes) { void LiveRangeCalc::extend(LiveInterval *LI, SlotIndex Kill, - SlotIndexes *Indexes, - MachineDominatorTree *DomTree, - VNInfo::Allocator *Alloc) { + unsigned PhysReg) { assert(LI && "Missing live range"); assert(Kill.isValid() && "Invalid SlotIndex"); assert(Indexes && "Missing SlotIndexes"); @@ -75,34 +147,31 @@ void LiveRangeCalc::extend(LiveInterval *LI, // multiple values, and we may need to create even more phi-defs to preserve // VNInfo SSA form. Perform a search for all predecessor blocks where we // know the dominating VNInfo. - VNInfo *VNI = findReachingDefs(LI, KillMBB, Kill, Indexes, DomTree); + VNInfo *VNI = findReachingDefs(LI, KillMBB, Kill, PhysReg); // When there were multiple different values, we may need new PHIs. if (!VNI) - updateSSA(Indexes, DomTree, Alloc); + updateSSA(); - updateLiveIns(VNI, Indexes); + updateLiveIns(VNI); } // This function is called by a client after using the low-level API to add // live-out and live-in blocks. The unique value optimization is not // available, SplitEditor::transferValues handles that case directly anyway. -void LiveRangeCalc::calculateValues(SlotIndexes *Indexes, - MachineDominatorTree *DomTree, - VNInfo::Allocator *Alloc) { +void LiveRangeCalc::calculateValues() { assert(Indexes && "Missing SlotIndexes"); assert(DomTree && "Missing dominator tree"); - updateSSA(Indexes, DomTree, Alloc); - updateLiveIns(0, Indexes); + updateSSA(); + updateLiveIns(0); } VNInfo *LiveRangeCalc::findReachingDefs(LiveInterval *LI, MachineBasicBlock *KillMBB, SlotIndex Kill, - SlotIndexes *Indexes, - MachineDominatorTree *DomTree) { + unsigned PhysReg) { // Blocks where LI should be live-in. SmallVector<MachineBasicBlock*, 16> WorkList(1, KillMBB); @@ -113,7 +182,22 @@ VNInfo *LiveRangeCalc::findReachingDefs(LiveInterval *LI, // Using Seen as a visited set, perform a BFS for all reaching defs. for (unsigned i = 0; i != WorkList.size(); ++i) { MachineBasicBlock *MBB = WorkList[i]; - assert(!MBB->pred_empty() && "Value live-in to entry block?"); + +#ifndef NDEBUG + if (MBB->pred_empty()) { + MBB->getParent()->verify(); + llvm_unreachable("Use not jointly dominated by defs."); + } + + if (TargetRegisterInfo::isPhysicalRegister(PhysReg) && + !MBB->isLiveIn(PhysReg)) { + MBB->getParent()->verify(); + errs() << "The register needs to be live in to BB#" << MBB->getNumber() + << ", but is missing from the live-in list.\n"; + llvm_unreachable("Invalid global physical register"); + } +#endif + for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(), PE = MBB->pred_end(); PI != PE; ++PI) { MachineBasicBlock *Pred = *PI; @@ -168,9 +252,7 @@ VNInfo *LiveRangeCalc::findReachingDefs(LiveInterval *LI, // This is essentially the same iterative algorithm that SSAUpdater uses, // except we already have a dominator tree, so we don't have to recompute it. 
-void LiveRangeCalc::updateSSA(SlotIndexes *Indexes, - MachineDominatorTree *DomTree, - VNInfo::Allocator *Alloc) { +void LiveRangeCalc::updateSSA() { assert(Indexes && "Missing SlotIndexes"); assert(DomTree && "Missing dominator tree"); diff --git a/lib/CodeGen/LiveRangeCalc.h b/lib/CodeGen/LiveRangeCalc.h index b8c8585..909829b 100644 --- a/lib/CodeGen/LiveRangeCalc.h +++ b/lib/CodeGen/LiveRangeCalc.h @@ -34,6 +34,11 @@ template <class NodeT> class DomTreeNodeBase; typedef DomTreeNodeBase<MachineBasicBlock> MachineDomTreeNode; class LiveRangeCalc { + const MachineRegisterInfo *MRI; + SlotIndexes *Indexes; + MachineDominatorTree *DomTree; + VNInfo::Allocator *Alloc; + /// Seen - Bit vector of active entries in LiveOut, also used as a visited /// set by findReachingDefs. One entry per basic block, indexed by block /// number. This is kept as a separate bit vector because it can be cleared @@ -100,26 +105,27 @@ class LiveRangeCalc { /// to be live-in are added to LiveIn. If a unique reaching def is found, /// its value is returned, if Kill is jointly dominated by multiple values, /// NULL is returned. + /// + /// PhysReg, when set, is used to verify live-in lists on basic blocks. VNInfo *findReachingDefs(LiveInterval *LI, MachineBasicBlock *KillMBB, SlotIndex Kill, - SlotIndexes *Indexes, - MachineDominatorTree *DomTree); + unsigned PhysReg); /// updateSSA - Compute the values that will be live in to all requested /// blocks in LiveIn. Create PHI-def values as required to preserve SSA form. /// /// Every live-in block must be jointly dominated by the added live-out /// blocks. No values are read from the live ranges. - void updateSSA(SlotIndexes *Indexes, - MachineDominatorTree *DomTree, - VNInfo::Allocator *Alloc); + void updateSSA(); /// updateLiveIns - Add liveness as specified in the LiveIn vector, using VNI /// as a wildcard value for LiveIn entries without a value. - void updateLiveIns(VNInfo *VNI, SlotIndexes*); + void updateLiveIns(VNInfo *VNI); public: + LiveRangeCalc() : MRI(0), Indexes(0), DomTree(0), Alloc(0) {} + //===--------------------------------------------------------------------===// // High-level interface. //===--------------------------------------------------------------------===// @@ -132,14 +138,14 @@ public: /// that may overlap a previously computed live range, and before the first /// live range in a function. If live ranges are not known to be /// non-overlapping, call reset before each. - void reset(const MachineFunction *MF); + void reset(const MachineFunction *MF, + SlotIndexes*, + MachineDominatorTree*, + VNInfo::Allocator*); /// calculate - Calculate the live range of a virtual register from its defs /// and uses. LI must be empty with no values. - void calculate(LiveInterval *LI, - MachineRegisterInfo *MRI, - SlotIndexes *Indexes, - VNInfo::Allocator *Alloc); + void calculate(LiveInterval *LI); //===--------------------------------------------------------------------===// // Mid-level interface. @@ -154,21 +160,30 @@ public: /// Kill is not dominated by a single existing value, PHI-defs are inserted /// as required to preserve SSA form. If Kill is known to be dominated by a /// single existing value, Alloc may be null. - void extend(LiveInterval *LI, - SlotIndex Kill, - SlotIndexes *Indexes, - MachineDominatorTree *DomTree, - VNInfo::Allocator *Alloc); + /// + /// PhysReg, when set, is used to verify live-in lists on basic blocks. 
+  void extend(LiveInterval *LI, SlotIndex Kill, unsigned PhysReg = 0);
+
+  /// createDeadDefs - Create a dead def in LI for every def operand of Reg.
+  /// Each instruction defining Reg gets a new VNInfo with a corresponding
+  /// minimal live range.
+  void createDeadDefs(LiveInterval *LI, unsigned Reg);
 
-  /// extendToUses - Extend the live range of LI to reach all uses.
+  /// createDeadDefs - Create a dead def in LI for every def of LI->reg.
+  void createDeadDefs(LiveInterval *LI) {
+    createDeadDefs(LI, LI->reg);
+  }
+
+  /// extendToUses - Extend the live range of LI to reach all uses of Reg.
   ///
   /// All uses must be jointly dominated by existing liveness. PHI-defs are
   /// inserted as needed to preserve SSA form.
-  void extendToUses(LiveInterval *LI,
-                    MachineRegisterInfo *MRI,
-                    SlotIndexes *Indexes,
-                    MachineDominatorTree *DomTree,
-                    VNInfo::Allocator *Alloc);
+  void extendToUses(LiveInterval *LI, unsigned Reg);
+
+  /// extendToUses - Extend the live range of LI to reach all uses of LI->reg.
+  void extendToUses(LiveInterval *LI) {
+    extendToUses(LI, LI->reg);
+  }
 
 //===--------------------------------------------------------------------===//
 // Low-level interface.
@@ -216,9 +231,7 @@ public:
   ///
   /// Every predecessor of a live-in block must have been given a value with
   /// setLiveOutValue; the value may be null for live-through blocks.
-  void calculateValues(SlotIndexes *Indexes,
-                       MachineDominatorTree *DomTree,
-                       VNInfo::Allocator *Alloc);
+  void calculateValues();
 };
 
 } // end namespace llvm
diff --git a/lib/CodeGen/LiveRangeEdit.cpp b/lib/CodeGen/LiveRangeEdit.cpp
index 695f536..896fdbf 100644
--- a/lib/CodeGen/LiveRangeEdit.cpp
+++ b/lib/CodeGen/LiveRangeEdit.cpp
@@ -38,7 +38,7 @@ LiveInterval &LiveRangeEdit::createFrom(unsigned OldReg) {
     VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg));
   }
   LiveInterval &LI = LIS.getOrCreateInterval(VReg);
-  newRegs_.push_back(&LI);
+  NewRegs.push_back(&LI);
   return LI;
 }
 
@@ -46,16 +46,16 @@ bool LiveRangeEdit::checkRematerializable(VNInfo *VNI,
                                           const MachineInstr *DefMI,
                                           AliasAnalysis *aa) {
   assert(DefMI && "Missing instruction");
-  scannedRemattable_ = true;
+  ScannedRemattable = true;
   if (!TII.isTriviallyReMaterializable(DefMI, aa))
     return false;
-  remattable_.insert(VNI);
+  Remattable.insert(VNI);
   return true;
 }
 
 void LiveRangeEdit::scanRemattable(AliasAnalysis *aa) {
-  for (LiveInterval::vni_iterator I = parent_.vni_begin(),
-       E = parent_.vni_end(); I != E; ++I) {
+  for (LiveInterval::vni_iterator I = getParent().vni_begin(),
+       E = getParent().vni_end(); I != E; ++I) {
     VNInfo *VNI = *I;
     if (VNI->isUnused())
       continue;
@@ -64,13 +64,13 @@ void LiveRangeEdit::scanRemattable(AliasAnalysis *aa) {
       continue;
     checkRematerializable(VNI, DefMI, aa);
   }
-  scannedRemattable_ = true;
+  ScannedRemattable = true;
 }
 
 bool LiveRangeEdit::anyRematerializable(AliasAnalysis *aa) {
-  if (!scannedRemattable_)
+  if (!ScannedRemattable)
     scanRemattable(aa);
-  return !remattable_.empty();
+  return !Remattable.empty();
 }
 
 /// allUsesAvailableAt - Return true if all registers used by OrigMI at
@@ -82,12 +82,16 @@ bool LiveRangeEdit::allUsesAvailableAt(const MachineInstr *OrigMI,
   UseIdx = UseIdx.getRegSlot(true);
   for (unsigned i = 0, e = OrigMI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = OrigMI->getOperand(i);
-    if (!MO.isReg() || !MO.getReg() || MO.isDef())
-      continue;
-    // Reserved registers are OK.
- if (MO.isUndef() || !LIS.hasInterval(MO.getReg())) + if (!MO.isReg() || !MO.getReg() || !MO.readsReg()) continue; + // We can't remat physreg uses, unless it is a constant. + if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) { + if (MRI.isConstantPhysReg(MO.getReg(), VRM->getMachineFunction())) + continue; + return false; + } + LiveInterval &li = LIS.getInterval(MO.getReg()); const VNInfo *OVNI = li.getVNInfoAt(OrigIdx); if (!OVNI) @@ -101,10 +105,10 @@ bool LiveRangeEdit::allUsesAvailableAt(const MachineInstr *OrigMI, bool LiveRangeEdit::canRematerializeAt(Remat &RM, SlotIndex UseIdx, bool cheapAsAMove) { - assert(scannedRemattable_ && "Call anyRematerializable first"); + assert(ScannedRemattable && "Call anyRematerializable first"); // Use scanRemattable info. - if (!remattable_.count(RM.ParentVNI)) + if (!Remattable.count(RM.ParentVNI)) return false; // No defining instruction provided. @@ -136,13 +140,13 @@ SlotIndex LiveRangeEdit::rematerializeAt(MachineBasicBlock &MBB, bool Late) { assert(RM.OrigMI && "Invalid remat"); TII.reMaterialize(MBB, MI, DestReg, 0, RM.OrigMI, tri); - rematted_.insert(RM.ParentVNI); + Rematted.insert(RM.ParentVNI); return LIS.getSlotIndexes()->insertMachineInstrInMaps(--MI, Late) .getRegSlot(); } void LiveRangeEdit::eraseVirtReg(unsigned Reg) { - if (delegate_ && delegate_->LRE_CanEraseVirtReg(Reg)) + if (TheDelegate && TheDelegate->LRE_CanEraseVirtReg(Reg)) LIS.removeInterval(Reg); } @@ -173,6 +177,19 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI, if (!DefMI || !UseMI) return false; + // Since we're moving the DefMI load, make sure we're not extending any live + // ranges. + if (!allUsesAvailableAt(DefMI, + LIS.getInstructionIndex(DefMI), + LIS.getInstructionIndex(UseMI))) + return false; + + // We also need to make sure it is safe to move the load. + // Assume there are stores between DefMI and UseMI. + bool SawStore = true; + if (!DefMI->isSafeToMove(&TII, 0, SawStore)) + return false; + DEBUG(dbgs() << "Try to fold single def: " << *DefMI << " into single use: " << *UseMI); @@ -220,6 +237,9 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, DEBUG(dbgs() << "Deleting dead def " << Idx << '\t' << *MI); + // Collect virtual registers to be erased after MI is gone. + SmallVector<unsigned, 8> RegsToErase; + // Check for live intervals that may shrink for (MachineInstr::mop_iterator MOI = MI->operands_begin(), MOE = MI->operands_end(); MOI != MOE; ++MOI) { @@ -242,22 +262,30 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, // Remove defined value. if (MOI->isDef()) { if (VNInfo *VNI = LI.getVNInfoAt(Idx)) { - if (delegate_) - delegate_->LRE_WillShrinkVirtReg(LI.reg); + if (TheDelegate) + TheDelegate->LRE_WillShrinkVirtReg(LI.reg); LI.removeValNo(VNI); - if (LI.empty()) { - ToShrink.remove(&LI); - eraseVirtReg(Reg); - } + if (LI.empty()) + RegsToErase.push_back(Reg); } } } - if (delegate_) - delegate_->LRE_WillEraseInstruction(MI); + if (TheDelegate) + TheDelegate->LRE_WillEraseInstruction(MI); LIS.RemoveMachineInstrFromMaps(MI); MI->eraseFromParent(); ++NumDCEDeleted; + + // Erase any virtregs that are now empty and unused. There may be <undef> + // uses around. Keep the empty live range in that case. 
+ for (unsigned i = 0, e = RegsToErase.size(); i != e; ++i) { + unsigned Reg = RegsToErase[i]; + if (LIS.hasInterval(Reg) && MRI.reg_nodbg_empty(Reg)) { + ToShrink.remove(&LIS.getInterval(Reg)); + eraseVirtReg(Reg); + } + } } if (ToShrink.empty()) @@ -268,8 +296,8 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, ToShrink.pop_back(); if (foldAsLoad(LI, Dead)) continue; - if (delegate_) - delegate_->LRE_WillShrinkVirtReg(LI->reg); + if (TheDelegate) + TheDelegate->LRE_WillShrinkVirtReg(LI->reg); if (!LIS.shrinkToUses(LI, &Dead)) continue; @@ -304,10 +332,14 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, // interval must contain all the split products, and LI doesn't. if (IsOriginal) VRM->setIsSplitFromReg(Dups.back()->reg, 0); - if (delegate_) - delegate_->LRE_DidCloneVirtReg(Dups.back()->reg, LI->reg); + if (TheDelegate) + TheDelegate->LRE_DidCloneVirtReg(Dups.back()->reg, LI->reg); } ConEQ.Distribute(&Dups[0], MRI); + DEBUG({ + for (unsigned i = 0; i != NumComp; ++i) + dbgs() << '\t' << *Dups[i] << '\n'; + }); } } diff --git a/lib/CodeGen/LiveRegMatrix.cpp b/lib/CodeGen/LiveRegMatrix.cpp new file mode 100644 index 0000000..cdb1776 --- /dev/null +++ b/lib/CodeGen/LiveRegMatrix.cpp @@ -0,0 +1,152 @@ +//===-- LiveRegMatrix.cpp - Track register interference -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the LiveRegMatrix analysis pass. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "regalloc" +#include "LiveRegMatrix.h" +#include "VirtRegMap.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +STATISTIC(NumAssigned , "Number of registers assigned"); +STATISTIC(NumUnassigned , "Number of registers unassigned"); + +char LiveRegMatrix::ID = 0; +INITIALIZE_PASS_BEGIN(LiveRegMatrix, "liveregmatrix", + "Live Register Matrix", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(VirtRegMap) +INITIALIZE_PASS_END(LiveRegMatrix, "liveregmatrix", + "Live Register Matrix", false, false) + +LiveRegMatrix::LiveRegMatrix() : MachineFunctionPass(ID), + UserTag(0), RegMaskTag(0), RegMaskVirtReg(0) {} + +void LiveRegMatrix::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequiredTransitive<LiveIntervals>(); + AU.addRequiredTransitive<VirtRegMap>(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +bool LiveRegMatrix::runOnMachineFunction(MachineFunction &MF) { + TRI = MF.getTarget().getRegisterInfo(); + MRI = &MF.getRegInfo(); + LIS = &getAnalysis<LiveIntervals>(); + VRM = &getAnalysis<VirtRegMap>(); + + unsigned NumRegUnits = TRI->getNumRegUnits(); + if (NumRegUnits != Matrix.size()) + Queries.reset(new LiveIntervalUnion::Query[NumRegUnits]); + Matrix.init(LIUAlloc, NumRegUnits); + + // Make sure no stale queries get reused. 
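The "no stale queries" comment above is backed by the UserTag scheme: invalidateVirtRegs() just bumps a counter, and every cached query is stamped with the counter value it was computed under. Invalidation is O(1) because no cache entries are touched; stale entries are detected lazily on the next lookup. A standalone sketch of tag-based cache invalidation (toy Matrix/CachedQuery names, not the LLVM classes):

#include <cstdio>

// Cached answer, stamped with the tag current when it was computed.
struct CachedQuery {
  unsigned Tag;
  bool Interference;
};

struct Matrix {
  unsigned UserTag; // bumped whenever the underlying data changes
  CachedQuery Cache;

  Matrix() : UserTag(0) { Cache.Tag = ~0u; }

  void invalidate() { ++UserTag; } // O(1): no cache entries are touched

  bool check() {
    if (Cache.Tag != UserTag) {   // stale or never computed: recompute
      Cache.Interference = false; // stand-in for the real computation
      Cache.Tag = UserTag;
      std::printf("recomputed\n");
    }
    return Cache.Interference;
  }
};

int main() {
  Matrix M;
  M.check();      // recomputes
  M.check();      // cache hit
  M.invalidate(); // live ranges changed
  M.check();      // recomputes again
}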
+ invalidateVirtRegs(); + return false; +} + +void LiveRegMatrix::releaseMemory() { + for (unsigned i = 0, e = Matrix.size(); i != e; ++i) { + Matrix[i].clear(); + Queries[i].clear(); + } +} + +void LiveRegMatrix::assign(LiveInterval &VirtReg, unsigned PhysReg) { + DEBUG(dbgs() << "assigning " << PrintReg(VirtReg.reg, TRI) + << " to " << PrintReg(PhysReg, TRI) << ':'); + assert(!VRM->hasPhys(VirtReg.reg) && "Duplicate VirtReg assignment"); + VRM->assignVirt2Phys(VirtReg.reg, PhysReg); + MRI->setPhysRegUsed(PhysReg); + for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { + DEBUG(dbgs() << ' ' << PrintRegUnit(*Units, TRI)); + Matrix[*Units].unify(VirtReg); + } + ++NumAssigned; + DEBUG(dbgs() << '\n'); +} + +void LiveRegMatrix::unassign(LiveInterval &VirtReg) { + unsigned PhysReg = VRM->getPhys(VirtReg.reg); + DEBUG(dbgs() << "unassigning " << PrintReg(VirtReg.reg, TRI) + << " from " << PrintReg(PhysReg, TRI) << ':'); + VRM->clearVirt(VirtReg.reg); + for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { + DEBUG(dbgs() << ' ' << PrintRegUnit(*Units, TRI)); + Matrix[*Units].extract(VirtReg); + } + ++NumUnassigned; + DEBUG(dbgs() << '\n'); +} + +bool LiveRegMatrix::checkRegMaskInterference(LiveInterval &VirtReg, + unsigned PhysReg) { + // Check if the cached information is valid. + // The same BitVector can be reused for all PhysRegs. + // We could cache multiple VirtRegs if it becomes necessary. + if (RegMaskVirtReg != VirtReg.reg || RegMaskTag != UserTag) { + RegMaskVirtReg = VirtReg.reg; + RegMaskTag = UserTag; + RegMaskUsable.clear(); + LIS->checkRegMaskInterference(VirtReg, RegMaskUsable); + } + + // The BitVector is indexed by PhysReg, not register unit. + // Regmask interference is more fine grained than regunits. + // For example, a Win64 call can clobber %ymm8 yet preserve %xmm8. + return !RegMaskUsable.empty() && (!PhysReg || !RegMaskUsable.test(PhysReg)); +} + +bool LiveRegMatrix::checkRegUnitInterference(LiveInterval &VirtReg, + unsigned PhysReg) { + if (VirtReg.empty()) + return false; + for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) + if (VirtReg.overlaps(LIS->getRegUnit(*Units))) + return true; + return false; +} + +LiveIntervalUnion::Query &LiveRegMatrix::query(LiveInterval &VirtReg, + unsigned RegUnit) { + LiveIntervalUnion::Query &Q = Queries[RegUnit]; + Q.init(UserTag, &VirtReg, &Matrix[RegUnit]); + return Q; +} + +LiveRegMatrix::InterferenceKind +LiveRegMatrix::checkInterference(LiveInterval &VirtReg, unsigned PhysReg) { + if (VirtReg.empty()) + return IK_Free; + + // Regmask interference is the fastest check. + if (checkRegMaskInterference(VirtReg, PhysReg)) + return IK_RegMask; + + // Check for fixed interference. + if (checkRegUnitInterference(VirtReg, PhysReg)) + return IK_RegUnit; + + // Check the matrix for virtual register interference. + for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) + if (query(VirtReg, *Units).checkInterference()) + return IK_VirtReg; + + return IK_Free; +} diff --git a/lib/CodeGen/LiveRegMatrix.h b/lib/CodeGen/LiveRegMatrix.h new file mode 100644 index 0000000..b3e2d7f --- /dev/null +++ b/lib/CodeGen/LiveRegMatrix.h @@ -0,0 +1,148 @@ +//===-- LiveRegMatrix.h - Track register interference ---------*- C++ -*---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+//
+// The LiveRegMatrix analysis pass keeps track of virtual register interference
+// along two dimensions: Slot indexes and register units. The matrix is used by
+// register allocators to ensure that no interfering virtual registers get
+// assigned to overlapping physical registers.
+//
+// Register units are defined in MCRegisterInfo.h; they represent the smallest
+// unit of interference when dealing with overlapping physical registers. The
+// LiveRegMatrix is represented as a LiveIntervalUnion per register unit. When
+// a virtual register is assigned to a physical register, the live range for
+// the virtual register is inserted into the LiveIntervalUnion for each regunit
+// in the physreg.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_LIVEREGMATRIX_H
+#define LLVM_CODEGEN_LIVEREGMATRIX_H
+
+#include "LiveIntervalUnion.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+namespace llvm {
+
+class LiveInterval;
+class LiveIntervalAnalysis;
+class MachineRegisterInfo;
+class TargetRegisterInfo;
+class VirtRegMap;
+
+class LiveRegMatrix : public MachineFunctionPass {
+ const TargetRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+ LiveIntervals *LIS;
+ VirtRegMap *VRM;
+
+ // UserTag changes whenever virtual registers have been modified.
+ unsigned UserTag;
+
+ // The matrix is represented as a LiveIntervalUnion per register unit.
+ LiveIntervalUnion::Allocator LIUAlloc;
+ LiveIntervalUnion::Array Matrix;
+
+ // Cached queries per register unit.
+ OwningArrayPtr<LiveIntervalUnion::Query> Queries;
+
+ // Cached register mask interference info.
+ unsigned RegMaskTag;
+ unsigned RegMaskVirtReg;
+ BitVector RegMaskUsable;
+
+ // MachineFunctionPass boilerplate.
+ virtual void getAnalysisUsage(AnalysisUsage&) const;
+ virtual bool runOnMachineFunction(MachineFunction&);
+ virtual void releaseMemory();
+public:
+ static char ID;
+ LiveRegMatrix();
+
+ //===--------------------------------------------------------------------===//
+ // High-level interface.
+ //===--------------------------------------------------------------------===//
+ //
+ // Check for interference before assigning virtual registers to physical
+ // registers.
+ //
+
+ /// Invalidate cached interference queries after modifying virtual register
+ /// live ranges. Interference checks may return stale information unless
+ /// caches are invalidated.
+ void invalidateVirtRegs() { ++UserTag; }
+
+ enum InterferenceKind {
+ /// No interference, go ahead and assign.
+ IK_Free = 0,
+
+ /// Virtual register interference. There are interfering virtual registers
+ /// assigned to PhysReg or its aliases. This interference could be resolved
+ /// by unassigning those other virtual registers.
+ IK_VirtReg,
+
+ /// Register unit interference. A fixed live range is in the way, typically
+ /// argument registers for a call. This can't be resolved by unassigning
+ /// other virtual registers.
+ IK_RegUnit,
+
+ /// RegMask interference. The live range is crossing an instruction with a
+ /// regmask operand that doesn't preserve PhysReg. This typically means
+ /// VirtReg is live across a call, and PhysReg isn't call-preserved.
+ IK_RegMask
+ };
+
+ /// Check for interference before assigning VirtReg to PhysReg.
+ /// If this function returns IK_Free, it is legal to assign(VirtReg, PhysReg). 
+ /// When there is more than one kind of interference, the InterferenceKind
+ /// with the highest enum value is returned.
+ InterferenceKind checkInterference(LiveInterval &VirtReg, unsigned PhysReg);
+
+ /// Assign VirtReg to PhysReg.
+ /// This will mark VirtReg's live range as occupied in the LiveRegMatrix and
+ /// update VirtRegMap. The live range is expected to be available in PhysReg.
+ void assign(LiveInterval &VirtReg, unsigned PhysReg);
+
+ /// Unassign VirtReg from its PhysReg.
+ /// Assuming that VirtReg was previously assigned to a PhysReg, this undoes
+ /// the assignment and updates VirtRegMap accordingly.
+ void unassign(LiveInterval &VirtReg);
+
+ //===--------------------------------------------------------------------===//
+ // Low-level interface.
+ //===--------------------------------------------------------------------===//
+ //
+ // Provide access to the underlying LiveIntervalUnions.
+ //
+
+ /// Check for regmask interference only.
+ /// Return true if VirtReg crosses a regmask operand that clobbers PhysReg.
+ /// If PhysReg is null, check if VirtReg crosses any regmask operands.
+ bool checkRegMaskInterference(LiveInterval &VirtReg, unsigned PhysReg = 0);
+
+ /// Check for regunit interference only.
+ /// Return true if VirtReg overlaps a fixed assignment of one of PhysReg's
+ /// register units.
+ bool checkRegUnitInterference(LiveInterval &VirtReg, unsigned PhysReg);
+
+ /// Query a line of the assigned virtual register matrix directly.
+ /// Use MCRegUnitIterator to enumerate all regunits in the desired PhysReg.
+ /// This returns a reference to an internal Query data structure that is only
+ /// valid until the next query() call.
+ LiveIntervalUnion::Query &query(LiveInterval &VirtReg, unsigned RegUnit);
+
+ /// Directly access the live interval unions per regunit.
+ /// This returns an array indexed by the regunit number. 
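Taken together, the interface above implies a simple allocator protocol: call checkInterference() and only assign() on IK_Free; IK_VirtReg can be resolved by evicting (unassign) the interfering virtual registers, while IK_RegUnit and IK_RegMask force a different physreg. A standalone sketch of that decision loop, with a toy checkInterference and made-up register numbers, not the real pass:

#include <cstdio>

enum InterferenceKind { IK_Free = 0, IK_VirtReg, IK_RegUnit, IK_RegMask };

// Toy stand-in: physreg 7 is blocked by a fixed range, physreg 3 by
// another virtual register, physreg 12 is free.
static InterferenceKind checkInterference(unsigned PhysReg) {
  if (PhysReg == 7) return IK_RegUnit;
  if (PhysReg == 3) return IK_VirtReg;
  return IK_Free;
}

int main() {
  const unsigned Order[] = {7, 3, 12};
  for (unsigned i = 0; i != 3; ++i) {
    switch (checkInterference(Order[i])) {
    case IK_Free:    // legal to assign(VirtReg, PhysReg)
      std::printf("assign to %u\n", Order[i]);
      return 0;
    case IK_VirtReg: // resolvable: unassign() the interfering vregs
      std::printf("%u busy, eviction possible\n", Order[i]);
      break;
    default:         // IK_RegUnit / IK_RegMask: pick another physreg
      std::printf("%u blocked\n", Order[i]);
      break;
    }
  }
  return 1;
}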
+ LiveIntervalUnion *getLiveUnions() { return &Matrix[0]; } +}; + +} // end namespace llvm + +#endif // LLVM_CODEGEN_LIVEREGMATRIX_H diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp index 5a0d97d..348ed3a 100644 --- a/lib/CodeGen/LiveVariables.cpp +++ b/lib/CodeGen/LiveVariables.cpp @@ -192,8 +192,8 @@ MachineInstr *LiveVariables::FindLastPartialDef(unsigned Reg, unsigned LastDefReg = 0; unsigned LastDefDist = 0; MachineInstr *LastDef = NULL; - for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg); - unsigned SubReg = *SubRegs; ++SubRegs) { + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { + unsigned SubReg = *SubRegs; MachineInstr *Def = PhysRegDef[SubReg]; if (!Def) continue; @@ -216,9 +216,8 @@ MachineInstr *LiveVariables::FindLastPartialDef(unsigned Reg, unsigned DefReg = MO.getReg(); if (TRI->isSubRegister(Reg, DefReg)) { PartDefRegs.insert(DefReg); - for (const uint16_t *SubRegs = TRI->getSubRegisters(DefReg); - unsigned SubReg = *SubRegs; ++SubRegs) - PartDefRegs.insert(SubReg); + for (MCSubRegIterator SubRegs(DefReg, TRI); SubRegs.isValid(); ++SubRegs) + PartDefRegs.insert(*SubRegs); } } return LastDef; @@ -247,8 +246,8 @@ void LiveVariables::HandlePhysRegUse(unsigned Reg, MachineInstr *MI) { true/*IsImp*/)); PhysRegDef[Reg] = LastPartialDef; SmallSet<unsigned, 8> Processed; - for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg); - unsigned SubReg = *SubRegs; ++SubRegs) { + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { + unsigned SubReg = *SubRegs; if (Processed.count(SubReg)) continue; if (PartDefRegs.count(SubReg)) @@ -259,7 +258,7 @@ void LiveVariables::HandlePhysRegUse(unsigned Reg, MachineInstr *MI) { false/*IsDef*/, true/*IsImp*/)); PhysRegDef[SubReg] = LastPartialDef; - for (const uint16_t *SS = TRI->getSubRegisters(SubReg); *SS; ++SS) + for (MCSubRegIterator SS(SubReg, TRI); SS.isValid(); ++SS) Processed.insert(*SS); } } @@ -271,9 +270,8 @@ void LiveVariables::HandlePhysRegUse(unsigned Reg, MachineInstr *MI) { // Remember this use. PhysRegUse[Reg] = MI; - for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg); - unsigned SubReg = *SubRegs; ++SubRegs) - PhysRegUse[SubReg] = MI; + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + PhysRegUse[*SubRegs] = MI; } /// FindLastRefOrPartRef - Return the last reference or partial reference of @@ -287,8 +285,8 @@ MachineInstr *LiveVariables::FindLastRefOrPartRef(unsigned Reg) { MachineInstr *LastRefOrPartRef = LastUse ? LastUse : LastDef; unsigned LastRefOrPartRefDist = DistanceMap[LastRefOrPartRef]; unsigned LastPartDefDist = 0; - for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg); - unsigned SubReg = *SubRegs; ++SubRegs) { + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { + unsigned SubReg = *SubRegs; MachineInstr *Def = PhysRegDef[SubReg]; if (Def && Def != LastDef) { // There was a def of this sub-register in between. This is a partial @@ -336,8 +334,8 @@ bool LiveVariables::HandlePhysRegKill(unsigned Reg, MachineInstr *MI) { MachineInstr *LastPartDef = 0; unsigned LastPartDefDist = 0; SmallSet<unsigned, 8> PartUses; - for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg); - unsigned SubReg = *SubRegs; ++SubRegs) { + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { + unsigned SubReg = *SubRegs; MachineInstr *Def = PhysRegDef[SubReg]; if (Def && Def != LastDef) { // There was a def of this sub-register in between. 
This is a partial @@ -351,7 +349,7 @@ bool LiveVariables::HandlePhysRegKill(unsigned Reg, MachineInstr *MI) { } if (MachineInstr *Use = PhysRegUse[SubReg]) { PartUses.insert(SubReg); - for (const uint16_t *SS = TRI->getSubRegisters(SubReg); *SS; ++SS) + for (MCSubRegIterator SS(SubReg, TRI); SS.isValid(); ++SS) PartUses.insert(*SS); unsigned Dist = DistanceMap[Use]; if (Dist > LastRefOrPartRefDist) { @@ -367,8 +365,8 @@ bool LiveVariables::HandlePhysRegKill(unsigned Reg, MachineInstr *MI) { // EAX<dead> = op AL<imp-def> // That is, EAX def is dead but AL def extends pass it. PhysRegDef[Reg]->addRegisterDead(Reg, TRI, true); - for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg); - unsigned SubReg = *SubRegs; ++SubRegs) { + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { + unsigned SubReg = *SubRegs; if (!PartUses.count(SubReg)) continue; bool NeedDef = true; @@ -388,11 +386,10 @@ bool LiveVariables::HandlePhysRegKill(unsigned Reg, MachineInstr *MI) { else { LastRefOrPartRef->addRegisterKilled(SubReg, TRI, true); PhysRegUse[SubReg] = LastRefOrPartRef; - for (const uint16_t *SSRegs = TRI->getSubRegisters(SubReg); - unsigned SSReg = *SSRegs; ++SSRegs) - PhysRegUse[SSReg] = LastRefOrPartRef; + for (MCSubRegIterator SS(SubReg, TRI); SS.isValid(); ++SS) + PhysRegUse[*SS] = LastRefOrPartRef; } - for (const uint16_t *SS = TRI->getSubRegisters(SubReg); *SS; ++SS) + for (MCSubRegIterator SS(SubReg, TRI); SS.isValid(); ++SS) PartUses.erase(*SS); } } else if (LastRefOrPartRef == PhysRegDef[Reg] && LastRefOrPartRef != MI) { @@ -434,7 +431,7 @@ void LiveVariables::HandleRegMask(const MachineOperand &MO) { // Kill the largest clobbered super-register. // This avoids needless implicit operands. unsigned Super = Reg; - for (const uint16_t *SR = TRI->getSuperRegisters(Reg); *SR; ++SR) + for (MCSuperRegIterator SR(Reg, TRI); SR.isValid(); ++SR) if ((PhysRegDef[*SR] || PhysRegUse[*SR]) && MO.clobbersPhysReg(*SR)) Super = *SR; HandlePhysRegKill(Super, 0); @@ -447,11 +444,11 @@ void LiveVariables::HandlePhysRegDef(unsigned Reg, MachineInstr *MI, SmallSet<unsigned, 32> Live; if (PhysRegDef[Reg] || PhysRegUse[Reg]) { Live.insert(Reg); - for (const uint16_t *SS = TRI->getSubRegisters(Reg); *SS; ++SS) - Live.insert(*SS); + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + Live.insert(*SubRegs); } else { - for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg); - unsigned SubReg = *SubRegs; ++SubRegs) { + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { + unsigned SubReg = *SubRegs; // If a register isn't itself defined, but all parts that make up of it // are defined, then consider it also defined. // e.g. @@ -462,7 +459,7 @@ void LiveVariables::HandlePhysRegDef(unsigned Reg, MachineInstr *MI, continue; if (PhysRegDef[SubReg] || PhysRegUse[SubReg]) { Live.insert(SubReg); - for (const uint16_t *SS = TRI->getSubRegisters(SubReg); *SS; ++SS) + for (MCSubRegIterator SS(SubReg, TRI); SS.isValid(); ++SS) Live.insert(*SS); } } @@ -472,8 +469,8 @@ void LiveVariables::HandlePhysRegDef(unsigned Reg, MachineInstr *MI, // is referenced. HandlePhysRegKill(Reg, MI); // Only some of the sub-registers are used. - for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg); - unsigned SubReg = *SubRegs; ++SubRegs) { + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { + unsigned SubReg = *SubRegs; if (!Live.count(SubReg)) // Skip if this sub-register isn't defined. 
continue; @@ -491,8 +488,8 @@ void LiveVariables::UpdatePhysRegDefs(MachineInstr *MI, Defs.pop_back(); PhysRegDef[Reg] = MI; PhysRegUse[Reg] = NULL; - for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg); - unsigned SubReg = *SubRegs; ++SubRegs) { + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { + unsigned SubReg = *SubRegs; PhysRegDef[SubReg] = MI; PhysRegUse[SubReg] = NULL; } @@ -576,7 +573,8 @@ bool LiveVariables::runOnMachineFunction(MachineFunction &mf) { unsigned MOReg = MO.getReg(); if (MO.isUse()) { MO.setIsKill(false); - UseRegs.push_back(MOReg); + if (MO.readsReg()) + UseRegs.push_back(MOReg); } else /*MO.isDef()*/ { MO.setIsDead(false); DefRegs.push_back(MOReg); @@ -732,8 +730,9 @@ void LiveVariables::analyzePHINodes(const MachineFunction& Fn) { for (MachineBasicBlock::const_iterator BBI = I->begin(), BBE = I->end(); BBI != BBE && BBI->isPHI(); ++BBI) for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2) - PHIVarInfo[BBI->getOperand(i + 1).getMBB()->getNumber()] - .push_back(BBI->getOperand(i).getReg()); + if (BBI->getOperand(i).readsReg()) + PHIVarInfo[BBI->getOperand(i + 1).getMBB()->getNumber()] + .push_back(BBI->getOperand(i).getReg()); } bool LiveVariables::VarInfo::isLiveIn(const MachineBasicBlock &MBB, diff --git a/lib/CodeGen/LocalStackSlotAllocation.cpp b/lib/CodeGen/LocalStackSlotAllocation.cpp index 238bf52..fbc9e20 100644 --- a/lib/CodeGen/LocalStackSlotAllocation.cpp +++ b/lib/CodeGen/LocalStackSlotAllocation.cpp @@ -314,7 +314,8 @@ bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) { // No previously defined register was in range, so create a // new one. int64_t InstrOffset = TRI->getFrameIndexInstrOffset(MI, idx); - const TargetRegisterClass *RC = TRI->getPointerRegClass(); + const MachineFunction *MF = MI->getParent()->getParent(); + const TargetRegisterClass *RC = TRI->getPointerRegClass(*MF); BaseReg = Fn.getRegInfo().createVirtualRegister(RC); DEBUG(dbgs() << " Materializing base register " << BaseReg << diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index 1abb8f2..ecc1e95 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -271,11 +271,9 @@ void MachineBasicBlock::print(raw_ostream &OS, SlotIndexes *Indexes) const { } if (isLandingPad()) { OS << Comma << "EH LANDING PAD"; Comma = ", "; } if (hasAddressTaken()) { OS << Comma << "ADDRESS TAKEN"; Comma = ", "; } - if (Alignment) { + if (Alignment) OS << Comma << "Align " << Alignment << " (" << (1u << Alignment) << " bytes)"; - Comma = ", "; - } OS << '\n'; @@ -596,6 +594,11 @@ bool MachineBasicBlock::canFallThrough() { MachineBasicBlock * MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { + // Splitting the critical edge to a landing pad block is non-trivial. Don't do + // it in this generic function. + if (Succ->isLandingPad()) + return NULL; + MachineFunction *MF = getParent(); DebugLoc dl; // FIXME: this is nowhere @@ -670,7 +673,7 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { // Inherit live-ins from the successor for (MachineBasicBlock::livein_iterator I = Succ->livein_begin(), - E = Succ->livein_end(); I != E; ++I) + E = Succ->livein_end(); I != E; ++I) NMBB->addLiveIn(*I); // Update LiveVariables. 
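Most of the churn in the LiveVariables hunks above is mechanical: zero-terminated uint16_t sub/super-register tables become MCSubRegIterator-style objects with an explicit isValid() test, so callers never see the sentinel and sub-, super-, and alias-set walks all share one loop shape. A standalone sketch contrasting the two loop styles (toy iterator and table, not the MC classes):

#include <cstdio>

// Old style: a 0-terminated array of register numbers.
static const unsigned short SubRegs[] = {10, 11, 12, 0};

// New style: wrap the same table in an iterator with an explicit
// validity test.
class SubRegIterator {
  const unsigned short *Ptr;
public:
  explicit SubRegIterator(const unsigned short *P) : Ptr(P) {}
  bool isValid() const { return *Ptr != 0; }
  void operator++() { ++Ptr; }
  unsigned operator*() const { return *Ptr; }
};

int main() {
  // Old loop: the sentinel test is fused into the for-condition.
  for (const unsigned short *SR = SubRegs; unsigned Reg = *SR; ++SR)
    std::printf("old: %u\n", Reg);

  // New loop: same traversal, uniform shape for all register sets.
  for (SubRegIterator It(SubRegs); It.isValid(); ++It)
    std::printf("new: %u\n", *It);
}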
diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp index 5ba6851..5a15f92 100644 --- a/lib/CodeGen/MachineBlockPlacement.cpp +++ b/lib/CodeGen/MachineBlockPlacement.cpp @@ -11,7 +11,7 @@ // structure and branch probability estimates. // // The pass strives to preserve the structure of the CFG (that is, retain -// a topological ordering of basic blocks) in the absense of a *strong* signal +// a topological ordering of basic blocks) in the absence of a *strong* signal // to the contrary from probabilities. However, within the CFG structure, it // attempts to choose an ordering which favors placing more likely sequences of // blocks adjacent to each other. @@ -63,17 +63,13 @@ namespace { /// /// This is the datastructure representing a chain of consecutive blocks that /// are profitable to layout together in order to maximize fallthrough -/// probabilities. We also can use a block chain to represent a sequence of -/// basic blocks which have some external (correctness) requirement for -/// sequential layout. +/// probabilities and code locality. We also can use a block chain to represent +/// a sequence of basic blocks which have some external (correctness) +/// requirement for sequential layout. /// -/// Eventually, the block chains will form a directed graph over the function. -/// We provide an SCC-supporting-iterator in order to quicky build and walk the -/// SCCs of block chains within a function. -/// -/// The block chains also have support for calculating and caching probability -/// information related to the chain itself versus other chains. This is used -/// for ranking during the final layout of block chains. +/// Chains can be built around a single basic block and can be merged to grow +/// them. They participate in a block-to-chain mapping, which is updated +/// automatically as chains are merged together. class BlockChain { /// \brief The sequence of blocks belonging to this chain. /// @@ -179,10 +175,11 @@ class MachineBlockPlacement : public MachineFunctionPass { /// \brief Allocator and owner of BlockChain structures. /// - /// We build BlockChains lazily by merging together high probability BB - /// sequences acording to the "Algo2" in the paper mentioned at the top of - /// the file. To reduce malloc traffic, we allocate them using this slab-like - /// allocator, and destroy them after the pass completes. + /// We build BlockChains lazily while processing the loop structure of + /// a function. To reduce malloc traffic, we allocate them using this + /// slab-like allocator, and destroy them after the pass completes. An + /// important guarantee is that this allocator produces stable pointers to + /// the chains. SpecificBumpPtrAllocator<BlockChain> ChainAllocator; /// \brief Function wide BasicBlock to BlockChain mapping. @@ -329,7 +326,7 @@ MachineBasicBlock *MachineBlockPlacement::selectBestSuccessor( // the MBPI analysis, we manually compute probabilities using the edge // weights. This is suboptimal as it means that the somewhat subtle // definition of edge weight semantics is encoded here as well. We should - // improve the MBPI interface to effeciently support query patterns such as + // improve the MBPI interface to efficiently support query patterns such as // this. uint32_t BestWeight = 0; uint32_t WeightScale = 0; @@ -1053,7 +1050,7 @@ namespace { /// /// A separate pass to compute interesting statistics for evaluating block /// placement. 
This is separate from the actual placement pass so that they can -/// be computed in the absense of any placement transformations or when using +/// be computed in the absence of any placement transformations or when using /// alternative placement strategies. class MachineBlockPlacementStats : public MachineFunctionPass { /// \brief A handle to the branch probability pass. diff --git a/lib/CodeGen/MachineCSE.cpp b/lib/CodeGen/MachineCSE.cpp index a63688e..9cfe9ab 100644 --- a/lib/CodeGen/MachineCSE.cpp +++ b/lib/CodeGen/MachineCSE.cpp @@ -84,7 +84,7 @@ namespace { bool PerformTrivialCoalescing(MachineInstr *MI, MachineBasicBlock *MBB); bool isPhysDefTriviallyDead(unsigned Reg, MachineBasicBlock::const_iterator I, - MachineBasicBlock::const_iterator E) const ; + MachineBasicBlock::const_iterator E) const; bool hasLivePhysRegDefUses(const MachineInstr *MI, const MachineBasicBlock *MBB, SmallSet<unsigned,8> &PhysRefs, @@ -100,8 +100,7 @@ namespace { void ExitScope(MachineBasicBlock *MBB); bool ProcessBlock(MachineBasicBlock *MBB); void ExitScopeIfDone(MachineDomTreeNode *Node, - DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren, - DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> &ParentMap); + DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren); bool PerformCSE(MachineDomTreeNode *Node); }; } // end anonymous namespace @@ -216,11 +215,10 @@ bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI, if (MO.isDef() && (MO.isDead() || isPhysDefTriviallyDead(Reg, I, MBB->end()))) continue; - PhysRefs.insert(Reg); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + PhysRefs.insert(*AI); if (MO.isDef()) PhysDefs.push_back(Reg); - for (const uint16_t *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) - PhysRefs.insert(*Alias); } return !PhysRefs.empty(); @@ -437,7 +435,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { // used, then it's not safe to replace it with a common subexpression. // It's also not safe if the instruction uses physical registers. bool CrossMBBPhysDef = false; - SmallSet<unsigned,8> PhysRefs; + SmallSet<unsigned, 8> PhysRefs; SmallVector<unsigned, 2> PhysDefs; if (FoundCSE && hasLivePhysRegDefUses(MI, MBB, PhysRefs, PhysDefs)) { FoundCSE = false; @@ -480,6 +478,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { "Do not CSE physical register defs!"); if (!isProfitableToCSE(NewReg, OldReg, CSMI, MI)) { + DEBUG(dbgs() << "*** Not profitable, avoid CSE!\n"); DoCSE = false; break; } @@ -488,6 +487,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { // within the register class of the new instruction. const TargetRegisterClass *OldRC = MRI->getRegClass(OldReg); if (!MRI->constrainRegClass(NewReg, OldRC)) { + DEBUG(dbgs() << "*** Not the same register class, avoid CSE!\n"); DoCSE = false; break; } @@ -522,7 +522,6 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { ++NumCommutes; Changed = true; } else { - DEBUG(dbgs() << "*** Not profitable, avoid CSE!\n"); VNT.insert(MI, CurrVN++); Exps.push_back(MI); } @@ -537,8 +536,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { /// up the dominator tree to destroy ancestors which are now done. 
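The MachineCSE hunk that follows drops the side ParentMap in favor of MachineDomTreeNode::getIDom(): the dominator tree already carries parent links, so a separate child-to-parent map is redundant state. A standalone sketch of closing scopes by walking those parent links (toy Node type, not MachineDomTreeNode):

#include <cstdio>

struct Node {
  Node *IDom;        // immediate dominator, null at the root
  unsigned OpenKids; // children not yet fully processed
  Node *getIDom() const { return IDom; }
};

// Once a node's subtree is done, walk up and close every ancestor whose
// last open child this was. No side table is needed; the tree itself
// supplies the parent links.
static void exitScopeIfDone(Node *N) {
  if (N->OpenKids)
    return;
  std::printf("closing scope\n");
  while (Node *Parent = N->getIDom()) {
    if (--Parent->OpenKids != 0)
      break;
    std::printf("closing ancestor scope\n");
    N = Parent;
  }
}

int main() {
  Node Root = {0, 1};
  Node Leaf = {&Root, 0};
  exitScopeIfDone(&Leaf); // closes Leaf's scope, then Root's
}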
void MachineCSE::ExitScopeIfDone(MachineDomTreeNode *Node, - DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren, - DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> &ParentMap) { + DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren) { if (OpenChildren[Node]) return; @@ -546,7 +544,7 @@ MachineCSE::ExitScopeIfDone(MachineDomTreeNode *Node, ExitScope(Node->getBlock()); // Now traverse upwards to pop ancestors whose offsprings are all done. - while (MachineDomTreeNode *Parent = ParentMap[Node]) { + while (MachineDomTreeNode *Parent = Node->getIDom()) { unsigned Left = --OpenChildren[Parent]; if (Left != 0) break; @@ -558,7 +556,6 @@ MachineCSE::ExitScopeIfDone(MachineDomTreeNode *Node, bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) { SmallVector<MachineDomTreeNode*, 32> Scopes; SmallVector<MachineDomTreeNode*, 8> WorkList; - DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> ParentMap; DenseMap<MachineDomTreeNode*, unsigned> OpenChildren; CurrVN = 0; @@ -573,7 +570,6 @@ bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) { OpenChildren[Node] = NumChildren; for (unsigned i = 0; i != NumChildren; ++i) { MachineDomTreeNode *Child = Children[i]; - ParentMap[Child] = Node; WorkList.push_back(Child); } } while (!WorkList.empty()); @@ -586,7 +582,7 @@ bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) { EnterScope(MBB); Changed |= ProcessBlock(MBB); // If it's a leaf node, it's done. Traverse upwards to pop ancestors. - ExitScopeIfDone(Node, OpenChildren, ParentMap); + ExitScopeIfDone(Node, OpenChildren); } return Changed; diff --git a/lib/CodeGen/MachineCopyPropagation.cpp b/lib/CodeGen/MachineCopyPropagation.cpp index 9730eaa..bac3aa2 100644 --- a/lib/CodeGen/MachineCopyPropagation.cpp +++ b/lib/CodeGen/MachineCopyPropagation.cpp @@ -62,28 +62,16 @@ void MachineCopyPropagation::SourceNoLongerAvailable(unsigned Reg, SourceMap &SrcMap, DenseMap<unsigned, MachineInstr*> &AvailCopyMap) { - SourceMap::iterator SI = SrcMap.find(Reg); - if (SI != SrcMap.end()) { - const DestList& Defs = SI->second; - for (DestList::const_iterator I = Defs.begin(), E = Defs.end(); - I != E; ++I) { - unsigned MappedDef = *I; - // Source of copy is no longer available for propagation. - if (AvailCopyMap.erase(MappedDef)) { - for (const uint16_t *SR = TRI->getSubRegisters(MappedDef); *SR; ++SR) - AvailCopyMap.erase(*SR); - } - } - } - for (const uint16_t *AS = TRI->getAliasSet(Reg); *AS; ++AS) { - SI = SrcMap.find(*AS); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) { + SourceMap::iterator SI = SrcMap.find(*AI); if (SI != SrcMap.end()) { const DestList& Defs = SI->second; for (DestList::const_iterator I = Defs.begin(), E = Defs.end(); I != E; ++I) { unsigned MappedDef = *I; + // Source of copy is no longer available for propagation. if (AvailCopyMap.erase(MappedDef)) { - for (const uint16_t *SR = TRI->getSubRegisters(MappedDef); *SR; ++SR) + for (MCSubRegIterator SR(MappedDef, TRI); SR.isValid(); ++SR) AvailCopyMap.erase(*SR); } } @@ -188,11 +176,8 @@ bool MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) { } // If Src is defined by a previous copy, it cannot be eliminated. 
- CI = CopyMap.find(Src); - if (CI != CopyMap.end()) - MaybeDeadCopies.remove(CI->second); - for (const uint16_t *AS = TRI->getAliasSet(Src); *AS; ++AS) { - CI = CopyMap.find(*AS); + for (MCRegAliasIterator AI(Src, TRI, true); AI.isValid(); ++AI) { + CI = CopyMap.find(*AI); if (CI != CopyMap.end()) MaybeDeadCopies.remove(CI->second); } @@ -211,13 +196,13 @@ bool MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) { // Remember Def is defined by the copy. // ... Make sure to clear the def maps of aliases first. - for (const uint16_t *AS = TRI->getAliasSet(Def); *AS; ++AS) { - CopyMap.erase(*AS); - AvailCopyMap.erase(*AS); + for (MCRegAliasIterator AI(Def, TRI, false); AI.isValid(); ++AI) { + CopyMap.erase(*AI); + AvailCopyMap.erase(*AI); } CopyMap[Def] = MI; AvailCopyMap[Def] = MI; - for (const uint16_t *SR = TRI->getSubRegisters(Def); *SR; ++SR) { + for (MCSubRegIterator SR(Def, TRI); SR.isValid(); ++SR) { CopyMap[*SR] = MI; AvailCopyMap[*SR] = MI; } @@ -256,11 +241,8 @@ bool MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) { // If 'Reg' is defined by a copy, the copy is no longer a candidate // for elimination. - DenseMap<unsigned, MachineInstr*>::iterator CI = CopyMap.find(Reg); - if (CI != CopyMap.end()) - MaybeDeadCopies.remove(CI->second); - for (const uint16_t *AS = TRI->getAliasSet(Reg); *AS; ++AS) { - CI = CopyMap.find(*AS); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) { + DenseMap<unsigned, MachineInstr*>::iterator CI = CopyMap.find(*AI); if (CI != CopyMap.end()) MaybeDeadCopies.remove(CI->second); } @@ -296,11 +278,9 @@ bool MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) { unsigned Reg = Defs[i]; // No longer defined by a copy. - CopyMap.erase(Reg); - AvailCopyMap.erase(Reg); - for (const uint16_t *AS = TRI->getAliasSet(Reg); *AS; ++AS) { - CopyMap.erase(*AS); - AvailCopyMap.erase(*AS); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) { + CopyMap.erase(*AI); + AvailCopyMap.erase(*AI); } // If 'Reg' is previously source of a copy, it is no longer available for diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp index d8c2f6a..d4aede8 100644 --- a/lib/CodeGen/MachineFunction.cpp +++ b/lib/CodeGen/MachineFunction.cpp @@ -14,6 +14,7 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/DebugInfo.h" #include "llvm/Function.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -26,7 +27,6 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLowering.h" @@ -60,7 +60,7 @@ MachineFunction::MachineFunction(const Function *F, const TargetMachine &TM, MFInfo = 0; FrameInfo = new (Allocator) MachineFrameInfo(*TM.getFrameLowering()); if (Fn->hasFnAttr(Attribute::StackAlignment)) - FrameInfo->setMaxAlignment(Attribute::getStackAlignmentFromAttrs( + FrameInfo->ensureMaxAlignment(Attribute::getStackAlignmentFromAttrs( Fn->getAttributes().getFnAttributes())); ConstantPool = new (Allocator) MachineConstantPool(TM.getTargetData()); Alignment = TM.getTargetLowering()->getMinFunctionAlignment(); @@ -84,9 +84,13 @@ MachineFunction::~MachineFunction() { MFInfo->~MachineFunctionInfo(); Allocator.Deallocate(MFInfo); } - FrameInfo->~MachineFrameInfo(); 
Allocator.Deallocate(FrameInfo); - ConstantPool->~MachineConstantPool(); Allocator.Deallocate(ConstantPool); - + + FrameInfo->~MachineFrameInfo(); + Allocator.Deallocate(FrameInfo); + + ConstantPool->~MachineConstantPool(); + Allocator.Deallocate(ConstantPool); + if (JumpTableInfo) { JumpTableInfo->~MachineJumpTableInfo(); Allocator.Deallocate(JumpTableInfo); @@ -98,7 +102,7 @@ MachineFunction::~MachineFunction() { MachineJumpTableInfo *MachineFunction:: getOrCreateJumpTableInfo(unsigned EntryKind) { if (JumpTableInfo) return JumpTableInfo; - + JumpTableInfo = new (Allocator) MachineJumpTableInfo((MachineJumpTableInfo::JTEntryKind)EntryKind); return JumpTableInfo; @@ -116,12 +120,12 @@ void MachineFunction::RenumberBlocks(MachineBasicBlock *MBB) { MBBI = begin(); else MBBI = MBB; - + // Figure out the block number this should have. unsigned BlockNo = 0; if (MBBI != begin()) BlockNo = prior(MBBI)->getNumber()+1; - + for (; MBBI != E; ++MBBI, ++BlockNo) { if (MBBI->getNumber() != (int)BlockNo) { // Remove use of the old number. @@ -130,7 +134,7 @@ void MachineFunction::RenumberBlocks(MachineBasicBlock *MBB) { "MBB number mismatch!"); MBBNumbering[MBBI->getNumber()] = 0; } - + // If BlockNo is already taken, set that block's number to -1. if (MBBNumbering[BlockNo]) MBBNumbering[BlockNo]->setNumber(-1); @@ -138,7 +142,7 @@ void MachineFunction::RenumberBlocks(MachineBasicBlock *MBB) { MBBNumbering[BlockNo] = MBBI; MBBI->setNumber(BlockNo); } - } + } // Okay, all the blocks are renumbered. If we have compactified the block // numbering, shrink MBBNumbering now. @@ -295,16 +299,16 @@ void MachineFunction::print(raw_ostream &OS, SlotIndexes *Indexes) const { // Print Frame Information FrameInfo->print(*this, OS); - + // Print JumpTable Information if (JumpTableInfo) JumpTableInfo->print(OS); // Print Constant Pool ConstantPool->print(OS); - + const TargetRegisterInfo *TRI = getTarget().getRegisterInfo(); - + if (RegInfo && !RegInfo->livein_empty()) { OS << "Function Live Ins: "; for (MachineRegisterInfo::livein_iterator @@ -324,7 +328,7 @@ void MachineFunction::print(raw_ostream &OS, SlotIndexes *Indexes) const { OS << ' ' << PrintReg(*I, TRI); OS << '\n'; } - + for (const_iterator BB = begin(), E = end(); BB != E; ++BB) { OS << '\n'; BB->print(OS, Indexes); @@ -411,10 +415,9 @@ unsigned MachineFunction::addLiveIn(unsigned PReg, MCSymbol *MachineFunction::getJTISymbol(unsigned JTI, MCContext &Ctx, bool isLinkerPrivate) const { assert(JumpTableInfo && "No jump tables"); - assert(JTI < JumpTableInfo->getJumpTables().size() && "Invalid JTI!"); const MCAsmInfo &MAI = *getTarget().getMCAsmInfo(); - + const char *Prefix = isLinkerPrivate ? MAI.getLinkerPrivateGlobalPrefix() : MAI.getPrivateGlobalPrefix(); SmallString<60> Name; @@ -691,7 +694,7 @@ static bool CanShareConstantPoolEntry(const Constant *A, const Constant *B, else if (B->getType() != IntTy) B = ConstantFoldInstOperands(Instruction::BitCast, IntTy, const_cast<Constant*>(B), TD); - + return A == B; } @@ -714,7 +717,7 @@ unsigned MachineConstantPool::getConstantPoolIndex(const Constant *C, Constants[i].Alignment = Alignment; return i; } - + Constants.push_back(MachineConstantPoolEntry(C, Alignment)); return Constants.size()-1; } @@ -723,7 +726,7 @@ unsigned MachineConstantPool::getConstantPoolIndex(MachineConstantPoolValue *V, unsigned Alignment) { assert(Alignment && "Alignment must be specified!"); if (Alignment > PoolAlignment) PoolAlignment = Alignment; - + // Check to see if we already have this constant. 
// // FIXME, this could be made much more efficient for large constant pools. diff --git a/lib/CodeGen/MachineFunctionPrinterPass.cpp b/lib/CodeGen/MachineFunctionPrinterPass.cpp index 2aaa798..0102ac7 100644 --- a/lib/CodeGen/MachineFunctionPrinterPass.cpp +++ b/lib/CodeGen/MachineFunctionPrinterPass.cpp @@ -14,7 +14,9 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/SlotIndexes.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Debug.h" using namespace llvm; @@ -28,6 +30,7 @@ struct MachineFunctionPrinterPass : public MachineFunctionPass { raw_ostream &OS; const std::string Banner; + MachineFunctionPrinterPass() : MachineFunctionPass(ID), OS(dbgs()) { } MachineFunctionPrinterPass(raw_ostream &os, const std::string &banner) : MachineFunctionPass(ID), OS(os), Banner(banner) {} @@ -40,7 +43,7 @@ struct MachineFunctionPrinterPass : public MachineFunctionPass { bool runOnMachineFunction(MachineFunction &MF) { OS << "# " << Banner << ":\n"; - MF.print(OS); + MF.print(OS, getAnalysisIfAvailable<SlotIndexes>()); return false; } }; @@ -48,6 +51,10 @@ struct MachineFunctionPrinterPass : public MachineFunctionPass { char MachineFunctionPrinterPass::ID = 0; } +char &MachineFunctionPrinterPassID = MachineFunctionPrinterPass::ID; +INITIALIZE_PASS(MachineFunctionPrinterPass, "print-machineinstrs", + "Machine Function Printer", false, false) + namespace llvm { /// Returns a newly-created MachineFunction Printer pass. The /// default banner is empty. diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp index e553a04..8dada05 100644 --- a/lib/CodeGen/MachineInstr.cpp +++ b/lib/CodeGen/MachineInstr.cpp @@ -13,6 +13,7 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" #include "llvm/Function.h" #include "llvm/InlineAsm.h" #include "llvm/LLVMContext.h" @@ -33,7 +34,6 @@ #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/LeakDetector.h" @@ -186,7 +186,8 @@ void MachineOperand::ChangeToRegister(unsigned Reg, bool isDef, bool isImp, } /// isIdenticalTo - Return true if this operand is identical to the specified -/// operand. +/// operand. Note that this should stay in sync with the hash_value overload +/// below. bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const { if (getType() != Other.getType() || getTargetFlags() != Other.getTargetFlags()) @@ -227,6 +228,46 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const { llvm_unreachable("Invalid machine operand type"); } +// Note: this must stay exactly in sync with isIdenticalTo above. 
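The sync note above (and its twin added to isIdenticalTo) pins down a classic invariant: an equality predicate and its hash function must consume exactly the same fields, otherwise equal keys can hash to different buckets and hash-table lookups silently fail. A standalone sketch of the pairing, using std::hash and a boost-style combine step rather than LLVM's hash_combine:

#include <cassert>
#include <cstddef>
#include <functional>

struct Operand {
  int Kind;
  unsigned Reg;
  // Equality consumes exactly {Kind, Reg}...
  bool operator==(const Operand &O) const {
    return Kind == O.Kind && Reg == O.Reg;
  }
};

// ...so the hash must consume exactly {Kind, Reg} as well. Adding or
// dropping a field on one side but not the other breaks lookups.
static std::size_t hash_value(const Operand &O) {
  std::size_t H = std::hash<int>()(O.Kind);
  H ^= std::hash<unsigned>()(O.Reg) + 0x9e3779b9u + (H << 6) + (H >> 2);
  return H;
}

int main() {
  Operand A = {1, 5}, B = {1, 5};
  assert(A == B && hash_value(A) == hash_value(B));
}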
+hash_code llvm::hash_value(const MachineOperand &MO) { + switch (MO.getType()) { + case MachineOperand::MO_Register: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getReg(), + MO.getSubReg(), MO.isDef()); + case MachineOperand::MO_Immediate: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getImm()); + case MachineOperand::MO_CImmediate: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getCImm()); + case MachineOperand::MO_FPImmediate: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getFPImm()); + case MachineOperand::MO_MachineBasicBlock: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getMBB()); + case MachineOperand::MO_FrameIndex: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIndex()); + case MachineOperand::MO_ConstantPoolIndex: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIndex(), + MO.getOffset()); + case MachineOperand::MO_JumpTableIndex: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIndex()); + case MachineOperand::MO_ExternalSymbol: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getOffset(), + MO.getSymbolName()); + case MachineOperand::MO_GlobalAddress: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getGlobal(), + MO.getOffset()); + case MachineOperand::MO_BlockAddress: + return hash_combine(MO.getType(), MO.getTargetFlags(), + MO.getBlockAddress()); + case MachineOperand::MO_RegisterMask: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getRegMask()); + case MachineOperand::MO_Metadata: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getMetadata()); + case MachineOperand::MO_MCSymbol: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getMCSymbol()); + } + llvm_unreachable("Invalid machine operand type"); +} + /// print - Print the specified machine operand. /// void MachineOperand::print(raw_ostream &OS, const TargetMachine *TM) const { @@ -255,12 +296,16 @@ void MachineOperand::print(raw_ostream &OS, const TargetMachine *TM) const { OS << "imp-"; OS << "def"; NeedComma = true; + // <def,read-undef> only makes sense when getSubReg() is set. + // Don't clutter the output otherwise. + if (isUndef() && getSubReg()) + OS << ",read-undef"; } else if (isImplicit()) { OS << "imp-use"; NeedComma = true; } - if (isKill() || isDead() || isUndef() || isInternalRead()) { + if (isKill() || isDead() || (isUndef() && isUse()) || isInternalRead()) { if (NeedComma) OS << ','; NeedComma = false; if (isKill()) { @@ -271,7 +316,7 @@ void MachineOperand::print(raw_ostream &OS, const TargetMachine *TM) const { OS << "dead"; NeedComma = true; } - if (isUndef()) { + if (isUndef() && isUse()) { if (NeedComma) OS << ','; OS << "undef"; NeedComma = true; @@ -656,7 +701,9 @@ void MachineInstr::addOperand(const MachineOperand &Op) { // OpNo now points as the desired insertion point. Unless this is a variadic // instruction, only implicit regs are allowed beyond MCID->getNumOperands(). - assert((isImpReg || MCID->isVariadic() || OpNo < MCID->getNumOperands()) && + // RegMask operands go between the explicit and implicit operands. + assert((isImpReg || Op.isRegMask() || MCID->isVariadic() || + OpNo < MCID->getNumOperands()) && "Trying to add an operand to a machine instr that is already done!"); // All operands from OpNo have been removed from RegInfo. 
If the Operands @@ -868,7 +915,8 @@ void MachineInstr::eraseFromParent() { MBB->erase(MI); } } - getParent()->erase(this); + // Erase the individual instruction, which may itself be inside a bundle. + getParent()->erase_instr(this); } @@ -938,9 +986,13 @@ const TargetRegisterClass* MachineInstr::getRegClassConstraint(unsigned OpIdx, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) const { + assert(getParent() && "Can't have an MBB reference here!"); + assert(getParent()->getParent() && "Can't have an MF reference here!"); + const MachineFunction &MF = *getParent()->getParent(); + // Most opcodes have fixed constraints in their MCInstrDesc. if (!isInlineAsm()) - return TII->getRegClass(getDesc(), OpIdx, TRI); + return TII->getRegClass(getDesc(), OpIdx, TRI, MF); if (!getOperand(OpIdx).isReg()) return NULL; @@ -962,7 +1014,7 @@ MachineInstr::getRegClassConstraint(unsigned OpIdx, // Assume that all registers in a memory operand are pointers. if (InlineAsm::getKind(Flag) == InlineAsm::Kind_Mem) - return TRI->getPointerRegClass(); + return TRI->getPointerRegClass(MF); return NULL; } @@ -1530,12 +1582,14 @@ void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM) const { const MachineRegisterInfo &MRI = MF->getRegInfo(); if (MRI.use_empty(Reg) && !MRI.isLiveOut(Reg)) { bool HasAliasLive = false; - for (const uint16_t *Alias = TM->getRegisterInfo()->getAliasSet(Reg); - unsigned AliasReg = *Alias; ++Alias) + for (MCRegAliasIterator AI(Reg, TM->getRegisterInfo(), true); + AI.isValid(); ++AI) { + unsigned AliasReg = *AI; if (!MRI.use_empty(AliasReg) || MRI.isLiveOut(AliasReg)) { HasAliasLive = true; break; } + } if (!HasAliasLive) { OmittedAnyCallClobbers = true; continue; @@ -1667,7 +1721,8 @@ bool MachineInstr::addRegisterKilled(unsigned IncomingReg, const TargetRegisterInfo *RegInfo, bool AddIfNotFound) { bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(IncomingReg); - bool hasAliases = isPhysReg && RegInfo->getAliasSet(IncomingReg); + bool hasAliases = isPhysReg && + MCRegAliasIterator(IncomingReg, RegInfo, false).isValid(); bool Found = false; SmallVector<unsigned,4> DeadOps; for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { @@ -1739,7 +1794,8 @@ bool MachineInstr::addRegisterDead(unsigned IncomingReg, const TargetRegisterInfo *RegInfo, bool AddIfNotFound) { bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(IncomingReg); - bool hasAliases = isPhysReg && RegInfo->getAliasSet(IncomingReg); + bool hasAliases = isPhysReg && + MCRegAliasIterator(IncomingReg, RegInfo, false).isValid(); bool Found = false; SmallVector<unsigned,4> DeadOps; for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { @@ -1758,9 +1814,7 @@ bool MachineInstr::addRegisterDead(unsigned IncomingReg, // There exists a super-register that's marked dead. if (RegInfo->isSuperRegister(IncomingReg, Reg)) return true; - if (RegInfo->getSubRegisters(IncomingReg) && - RegInfo->getSuperRegisters(Reg) && - RegInfo->isSubRegister(IncomingReg, Reg)) + if (RegInfo->isSubRegister(IncomingReg, Reg)) DeadOps.push_back(i); } } @@ -1841,52 +1895,16 @@ void MachineInstr::setPhysRegsDeadExcept(ArrayRef<unsigned> UsedRegs, unsigned MachineInstrExpressionTrait::getHashValue(const MachineInstr* const &MI) { // Build up a buffer of hash code components. - // - // FIXME: This is a total hack. 
We should have a hash_value overload for - // MachineOperand, but currently that doesn't work because there are many - // different ideas of "equality" and thus different sets of information that - // contribute to the hash code. This one happens to want to take a specific - // subset. And it's still not clear that this routine uses the *correct* - // subset of information when computing the hash code. The goal is to use the - // same inputs for the hash code here that MachineInstr::isIdenticalTo uses to - // test for equality when passed the 'IgnoreVRegDefs' filter flag. It would - // be very useful to factor the selection of relevant inputs out of the two - // functions and into a common routine, but it's not clear how that can be - // done. SmallVector<size_t, 8> HashComponents; HashComponents.reserve(MI->getNumOperands() + 1); HashComponents.push_back(MI->getOpcode()); for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); - switch (MO.getType()) { - default: break; - case MachineOperand::MO_Register: - if (MO.isDef() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) - continue; // Skip virtual register defs. - HashComponents.push_back(hash_combine(MO.getType(), MO.getReg())); - break; - case MachineOperand::MO_Immediate: - HashComponents.push_back(hash_combine(MO.getType(), MO.getImm())); - break; - case MachineOperand::MO_FrameIndex: - case MachineOperand::MO_ConstantPoolIndex: - case MachineOperand::MO_JumpTableIndex: - HashComponents.push_back(hash_combine(MO.getType(), MO.getIndex())); - break; - case MachineOperand::MO_MachineBasicBlock: - HashComponents.push_back(hash_combine(MO.getType(), MO.getMBB())); - break; - case MachineOperand::MO_GlobalAddress: - HashComponents.push_back(hash_combine(MO.getType(), MO.getGlobal())); - break; - case MachineOperand::MO_BlockAddress: - HashComponents.push_back(hash_combine(MO.getType(), - MO.getBlockAddress())); - break; - case MachineOperand::MO_MCSymbol: - HashComponents.push_back(hash_combine(MO.getType(), MO.getMCSymbol())); - break; - } + if (MO.isReg() && MO.isDef() && + TargetRegisterInfo::isVirtualRegister(MO.getReg())) + continue; // Skip virtual register defs. + + HashComponents.push_back(hash_value(MO)); } return hash_combine_range(HashComponents.begin(), HashComponents.end()); } diff --git a/lib/CodeGen/MachineInstrBundle.cpp b/lib/CodeGen/MachineInstrBundle.cpp index 73489a7..b7de7bf 100644 --- a/lib/CodeGen/MachineInstrBundle.cpp +++ b/lib/CodeGen/MachineInstrBundle.cpp @@ -169,8 +169,8 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, } if (!MO.isDead()) { - for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg); - unsigned SubReg = *SubRegs; ++SubRegs) { + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { + unsigned SubReg = *SubRegs; if (LocalDefSet.insert(SubReg)) LocalDefs.push_back(SubReg); } diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp index 8c562cc..efec481 100644 --- a/lib/CodeGen/MachineLICM.cpp +++ b/lib/CodeGen/MachineLICM.cpp @@ -445,8 +445,8 @@ void MachineLICM::ProcessMI(MachineInstr *MI, } if (MO.isImplicit()) { - for (const uint16_t *AS = TRI->getOverlaps(Reg); *AS; ++AS) - PhysRegClobbers.set(*AS); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + PhysRegClobbers.set(*AI); if (!MO.isDead()) // Non-dead implicit def? This cannot be hoisted. 
RuledOut = true; @@ -465,7 +465,7 @@ void MachineLICM::ProcessMI(MachineInstr *MI, // If we have already seen another instruction that defines the same // register, then this is not safe. Two defs is indicated by setting a // PhysRegClobbers bit. - for (const uint16_t *AS = TRI->getOverlaps(Reg); *AS; ++AS) { + for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS) { if (PhysRegDefs.test(*AS)) PhysRegClobbers.set(*AS); if (PhysRegClobbers.test(*AS)) @@ -517,8 +517,8 @@ void MachineLICM::HoistRegionPostRA() { for (MachineBasicBlock::livein_iterator I = BB->livein_begin(), E = BB->livein_end(); I != E; ++I) { unsigned Reg = *I; - for (const uint16_t *AS = TRI->getOverlaps(Reg); *AS; ++AS) - PhysRegDefs.set(*AS); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + PhysRegDefs.set(*AI); } SpeculationState = SpeculateUnknown; @@ -540,8 +540,8 @@ void MachineLICM::HoistRegionPostRA() { unsigned Reg = MO.getReg(); if (!Reg) continue; - for (const uint16_t *AS = TRI->getOverlaps(Reg); *AS; ++AS) - TermRegs.set(*AS); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + TermRegs.set(*AI); } } @@ -1260,11 +1260,11 @@ MachineInstr *MachineLICM::ExtractHoistableLoad(MachineInstr *MI) { if (NewOpc == 0) return 0; const MCInstrDesc &MID = TII->get(NewOpc); if (MID.getNumDefs() != 1) return 0; - const TargetRegisterClass *RC = TII->getRegClass(MID, LoadRegIndex, TRI); + MachineFunction &MF = *MI->getParent()->getParent(); + const TargetRegisterClass *RC = TII->getRegClass(MID, LoadRegIndex, TRI, MF); // Ok, we're unfolding. Create a temporary register and do the unfold. unsigned Reg = MRI->createVirtualRegister(RC); - MachineFunction &MF = *MI->getParent()->getParent(); SmallVector<MachineInstr *, 2> NewMIs; bool Success = TII->unfoldMemoryOperand(MF, MI, Reg, diff --git a/lib/CodeGen/MachineLoopInfo.cpp b/lib/CodeGen/MachineLoopInfo.cpp index 189cb2b..9f3829e 100644 --- a/lib/CodeGen/MachineLoopInfo.cpp +++ b/lib/CodeGen/MachineLoopInfo.cpp @@ -9,7 +9,7 @@ // // This file defines the MachineLoopInfo class that is used to identify natural // loops and determine the loop depth of various nodes of the CFG. Note that -// the loops identified may actually be several natural loops that share the +// the loops identified may actually be several natural loops that share the // same header node... not just a single natural loop. // //===----------------------------------------------------------------------===// @@ -17,17 +17,13 @@ #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/Analysis/LoopInfoImpl.h" #include "llvm/Support/Debug.h" using namespace llvm; -namespace llvm { -#define MLB class LoopBase<MachineBasicBlock, MachineLoop> -TEMPLATE_INSTANTIATION(MLB); -#undef MLB -#define MLIB class LoopInfoBase<MachineBasicBlock, MachineLoop> -TEMPLATE_INSTANTIATION(MLIB); -#undef MLIB -} +// Explicitly instantiate methods in LoopInfoImpl.h for MI-level Loops. 
+template class llvm::LoopBase<MachineBasicBlock, MachineLoop>; +template class llvm::LoopInfoBase<MachineBasicBlock, MachineLoop>; char MachineLoopInfo::ID = 0; INITIALIZE_PASS_BEGIN(MachineLoopInfo, "machine-loops", @@ -40,7 +36,7 @@ char &llvm::MachineLoopInfoID = MachineLoopInfo::ID; bool MachineLoopInfo::runOnMachineFunction(MachineFunction &) { releaseMemory(); - LI.Calculate(getAnalysis<MachineDominatorTree>().getBase()); // Update + LI.Analyze(getAnalysis<MachineDominatorTree>().getBase()); return false; } diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp index 7ea1517..82e1235 100644 --- a/lib/CodeGen/MachineRegisterInfo.cpp +++ b/lib/CodeGen/MachineRegisterInfo.cpp @@ -162,9 +162,22 @@ void MachineRegisterInfo::replaceRegWith(unsigned FromReg, unsigned ToReg) { MachineInstr *MachineRegisterInfo::getVRegDef(unsigned Reg) const { // Since we are in SSA form, we can use the first definition. def_iterator I = def_begin(Reg); + assert((I.atEnd() || llvm::next(I) == def_end()) && + "getVRegDef assumes a single definition or no definition"); return !I.atEnd() ? &*I : 0; } +/// getUniqueVRegDef - Return the unique machine instr that defines the +/// specified virtual register or null if none is found. If there are +/// multiple definitions or no definition, return null. +MachineInstr *MachineRegisterInfo::getUniqueVRegDef(unsigned Reg) const { + if (def_empty(Reg)) return 0; + def_iterator I = def_begin(Reg); + if (llvm::next(I) != def_end()) + return 0; + return &*I; +} + bool MachineRegisterInfo::hasOneUse(unsigned RegNo) const { use_iterator UI = use_begin(RegNo); if (UI == use_end()) @@ -268,15 +281,15 @@ bool MachineRegisterInfo::isConstantPhysReg(unsigned PhysReg, assert(TargetRegisterInfo::isPhysicalRegister(PhysReg)); // Check if any overlapping register is modified. - for (const uint16_t *R = TRI->getOverlaps(PhysReg); *R; ++R) - if (!def_empty(*R)) + for (MCRegAliasIterator AI(PhysReg, TRI, true); AI.isValid(); ++AI) + if (!def_empty(*AI)) return false; // Check if any overlapping register is allocatable so it may be used later. if (AllocatableRegs.empty()) AllocatableRegs = TRI->getAllocatableSet(MF); - for (const uint16_t *R = TRI->getOverlaps(PhysReg); *R; ++R) - if (AllocatableRegs.test(*R)) + for (MCRegAliasIterator AI(PhysReg, TRI, true); AI.isValid(); ++AI) + if (AllocatableRegs.test(*AI)) return false; return true; } diff --git a/lib/CodeGen/MachineSSAUpdater.cpp b/lib/CodeGen/MachineSSAUpdater.cpp index 070a557..acb1ee6 100644 --- a/lib/CodeGen/MachineSSAUpdater.cpp +++ b/lib/CodeGen/MachineSSAUpdater.cpp @@ -241,30 +241,6 @@ void MachineSSAUpdater::ReplaceRegWith(unsigned OldReg, unsigned NewReg) { I->second = NewReg; } -/// MachinePHIiter - Iterator for PHI operands. This is used for the -/// PHI_iterator in the SSAUpdaterImpl template. 
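In the MachineLoopInfo hunk above, the TEMPLATE_INSTANTIATION macro gives way to plain explicit instantiation of the method bodies that LoopInfoImpl.h defines out of line. A minimal standalone sketch of that pattern (hypothetical Calc template, everything in one file for brevity):

// Header part: class template with a method declared but not defined inline.
template <typename T> struct Calc {
  T twice(T x);
};

// "Impl" header part: out-of-line definitions, included only where the
// template is explicitly instantiated.
template <typename T> T Calc<T>::twice(T x) { return x + x; }

// Exactly one .cpp does this; every other translation unit links against
// the emitted Calc<int>::twice without ever seeing the definition.
template struct Calc<int>;

int main() { return Calc<int>().twice(2) - 4; }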
-namespace { - class MachinePHIiter { - private: - MachineInstr *PHI; - unsigned idx; - - public: - explicit MachinePHIiter(MachineInstr *P) // begin iterator - : PHI(P), idx(1) {} - MachinePHIiter(MachineInstr *P, bool) // end iterator - : PHI(P), idx(PHI->getNumOperands()) {} - - MachinePHIiter &operator++() { idx += 2; return *this; } - bool operator==(const MachinePHIiter& x) const { return idx == x.idx; } - bool operator!=(const MachinePHIiter& x) const { return !operator==(x); } - unsigned getIncomingValue() { return PHI->getOperand(idx).getReg(); } - MachineBasicBlock *getIncomingBlock() { - return PHI->getOperand(idx+1).getMBB(); - } - }; -} - /// SSAUpdaterTraits<MachineSSAUpdater> - Traits for the SSAUpdaterImpl /// template, specialized for MachineSSAUpdater. namespace llvm { @@ -279,7 +255,26 @@ public: static BlkSucc_iterator BlkSucc_begin(BlkT *BB) { return BB->succ_begin(); } static BlkSucc_iterator BlkSucc_end(BlkT *BB) { return BB->succ_end(); } - typedef MachinePHIiter PHI_iterator; + /// Iterator for PHI operands. + class PHI_iterator { + private: + MachineInstr *PHI; + unsigned idx; + + public: + explicit PHI_iterator(MachineInstr *P) // begin iterator + : PHI(P), idx(1) {} + PHI_iterator(MachineInstr *P, bool) // end iterator + : PHI(P), idx(PHI->getNumOperands()) {} + + PHI_iterator &operator++() { idx += 2; return *this; } + bool operator==(const PHI_iterator& x) const { return idx == x.idx; } + bool operator!=(const PHI_iterator& x) const { return !operator==(x); } + unsigned getIncomingValue() { return PHI->getOperand(idx).getReg(); } + MachineBasicBlock *getIncomingBlock() { + return PHI->getOperand(idx+1).getMBB(); + } + }; static inline PHI_iterator PHI_begin(PhiT *PHI) { return PHI_iterator(PHI); } static inline PHI_iterator PHI_end(PhiT *PHI) { return PHI_iterator(PHI, true); diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index 1d3241b..a1dc948 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -17,9 +17,13 @@ #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/ScheduleDAGInstrs.h" -#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/ScheduleHazardRecognizer.h" #include "llvm/Target/TargetInstrInfo.h" +#include "llvm/MC/MCInstrItineraries.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -50,6 +54,15 @@ static bool ViewMISchedDAGs = false; // Machine Instruction Scheduling Pass and Registry //===----------------------------------------------------------------------===// +MachineSchedContext::MachineSchedContext(): + MF(0), MLI(0), MDT(0), PassConfig(0), AA(0), LIS(0) { + RegClassInfo = new RegisterClassInfo(); +} + +MachineSchedContext::~MachineSchedContext() { + delete RegClassInfo; +} + namespace { /// MachineScheduler runs after coalescing and before register allocation. class MachineScheduler : public MachineSchedContext, @@ -122,6 +135,29 @@ DefaultSchedRegistry("default", "Use the target's default scheduler choice.", /// default scheduler if the target does not set a default. static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C); + +/// Decrement this iterator until reaching the top or a non-debug instr. 
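+/// (Used when the bottom-of-region boundary moves, so that CurrentBottom
+/// never comes to rest on a DBG_VALUE; nextIfDebug below plays the same role
+/// for CurrentTop.)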
+static MachineBasicBlock::iterator
+priorNonDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Beg) {
+  assert(I != Beg && "reached the top of the region, cannot decrement");
+  while (--I != Beg) {
+    if (!I->isDebugValue())
+      break;
+  }
+  return I;
+}
+
+/// If this iterator is a debug value, increment until reaching the End or a
+/// non-debug instruction.
+static MachineBasicBlock::iterator
+nextIfDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator End) {
+  for(; I != End; ++I) {
+    if (!I->isDebugValue())
+      break;
+  }
+  return I;
+}
+
 /// Top-level MachineScheduler pass driver.
 ///
 /// Visit blocks in function order. Divide each block into scheduling regions
@@ -139,6 +175,8 @@ static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C);
 /// design would be to split blocks at scheduling boundaries, but LLVM has a
 /// general bias against block splitting purely for implementation simplicity.
 bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
+  DEBUG(dbgs() << "Before MISsched:\n"; mf.print(dbgs()));
+
   // Initialize the context of the pass.
   MF = &mf;
   MLI = &getAnalysis<MachineLoopInfo>();
@@ -149,6 +187,8 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
   LIS = &getAnalysis<LiveIntervals>();
   const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
 
+  RegClassInfo->runOnMachineFunction(*MF);
+
   // Select the scheduler, or set the default.
   MachineSchedRegistry::ScheduleDAGCtor Ctor = MachineSchedOpt;
   if (Ctor == useDefaultMachineSched) {
@@ -163,13 +203,16 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
   OwningPtr<ScheduleDAGInstrs> Scheduler(Ctor(this));
 
   // Visit all machine basic blocks.
+  //
+  // TODO: Visit blocks in global postorder or postorder within the bottom-up
+  // loop tree. Then we can optionally compute global RegPressure.
   for (MachineFunction::iterator MBB = MF->begin(), MBBEnd = MF->end();
        MBB != MBBEnd; ++MBB) {
 
     Scheduler->startBlock(MBB);
 
     // Break the block into scheduling regions [I, RegionEnd), and schedule each
-    // region as soon as it is discovered. RegionEnd points the the scheduling
+    // region as soon as it is discovered. RegionEnd points to the scheduling
     // boundary at the bottom of the region. The DAG does not include RegionEnd,
     // but the region does (i.e. the next RegionEnd is above the previous
     // RegionBegin). If the current block has no terminator then RegionEnd ==
@@ -181,6 +224,7 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
     unsigned RemainingCount = MBB->size();
     for(MachineBasicBlock::iterator RegionEnd = MBB->end();
         RegionEnd != MBB->begin(); RegionEnd = Scheduler->begin()) {
+      // Avoid decrementing RegionEnd for blocks with no terminator.
       if (RegionEnd != MBB->end()
           || TII->isSchedulingBoundary(llvm::prior(RegionEnd), MBB, *MF)) {
@@ -207,7 +251,8 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
         Scheduler->exitRegion();
         continue;
       }
-      DEBUG(dbgs() << "MachineScheduling " << MF->getFunction()->getName()
+      DEBUG(dbgs() << "********** MI Scheduling **********\n");
+      DEBUG(dbgs() << MF->getFunction()->getName()
             << ":BB#" << MBB->getNumber() << "\n  From: " << *I << "    To: ";
             if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
             else dbgs() << "End";
@@ -260,6 +305,9 @@ public:
   /// be scheduled at the bottom.
   virtual SUnit *pickNode(bool &IsTopNode) = 0;
 
+  /// Notify MachineSchedStrategy that ScheduleDAGMI has scheduled a node.
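+  /// (Called once the DAG has moved the instruction and updated its pressure
+  /// trackers; a strategy can use this hook to advance its cycle and hazard
+  /// state, as ConvergingScheduler::schedNode does below.)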
+  virtual void schedNode(SUnit *SU, bool IsTopNode) = 0;
+
   /// When all predecessor dependencies have been resolved, free this node for
   /// top-down scheduling.
   virtual void releaseTopNode(SUnit *SU) = 0;
@@ -279,22 +327,45 @@ namespace {
 /// machine instructions while updating LiveIntervals.
 class ScheduleDAGMI : public ScheduleDAGInstrs {
   AliasAnalysis *AA;
+  RegisterClassInfo *RegClassInfo;
   MachineSchedStrategy *SchedImpl;
 
+  MachineBasicBlock::iterator LiveRegionEnd;
+
+  /// Register pressure in this region computed by buildSchedGraph.
+  IntervalPressure RegPressure;
+  RegPressureTracker RPTracker;
+
+  /// List of pressure sets that exceed the target's pressure limit before
+  /// scheduling, listed in increasing set ID order. Each pressure set is paired
+  /// with its max pressure in the currently scheduled regions.
+  std::vector<PressureElement> RegionCriticalPSets;
+
   /// The top of the unscheduled zone.
   MachineBasicBlock::iterator CurrentTop;
+  IntervalPressure TopPressure;
+  RegPressureTracker TopRPTracker;
 
   /// The bottom of the unscheduled zone.
   MachineBasicBlock::iterator CurrentBottom;
+  IntervalPressure BotPressure;
+  RegPressureTracker BotRPTracker;
 
+#ifndef NDEBUG
   /// The number of instructions scheduled so far. Used to cut off the
   /// scheduler at the point determined by misched-cutoff.
   unsigned NumInstrsScheduled;
+#endif
 
 public:
   ScheduleDAGMI(MachineSchedContext *C, MachineSchedStrategy *S):
     ScheduleDAGInstrs(*C->MF, *C->MLI, *C->MDT, /*IsPostRA=*/false, C->LIS),
-    AA(C->AA), SchedImpl(S), CurrentTop(), CurrentBottom(),
-    NumInstrsScheduled(0) {}
+    AA(C->AA), RegClassInfo(C->RegClassInfo), SchedImpl(S),
+    RPTracker(RegPressure), CurrentTop(), TopRPTracker(TopPressure),
+    CurrentBottom(), BotRPTracker(BotPressure) {
+#ifndef NDEBUG
+    NumInstrsScheduled = 0;
+#endif
+  }
 
   ~ScheduleDAGMI() {
     delete SchedImpl;
@@ -303,22 +374,68 @@ public:
   MachineBasicBlock::iterator top() const { return CurrentTop; }
   MachineBasicBlock::iterator bottom() const { return CurrentBottom; }
 
-  /// Implement ScheduleDAGInstrs interface.
+  /// Implement the ScheduleDAGInstrs interface for handling the next scheduling
+  /// region. This covers all instructions in a block, while schedule() may only
+  /// cover a subset.
+  void enterRegion(MachineBasicBlock *bb,
+                   MachineBasicBlock::iterator begin,
+                   MachineBasicBlock::iterator end,
+                   unsigned endcount);
+
+  /// Implement the ScheduleDAGInstrs interface for scheduling a sequence of
+  /// reorderable instructions.
   void schedule();
 
+  /// Get current register pressure for the top scheduled instructions.
+  const IntervalPressure &getTopPressure() const { return TopPressure; }
+  const RegPressureTracker &getTopRPTracker() const { return TopRPTracker; }
+
+  /// Get current register pressure for the bottom scheduled instructions.
+  const IntervalPressure &getBotPressure() const { return BotPressure; }
+  const RegPressureTracker &getBotRPTracker() const { return BotRPTracker; }
+
+  /// Get register pressure for the entire scheduling region before scheduling.
+  const IntervalPressure &getRegPressure() const { return RegPressure; }
+
+  const std::vector<PressureElement> &getRegionCriticalPSets() const {
+    return RegionCriticalPSets;
+  }
+
+  /// getIssueWidth - Return the max instructions per scheduling group.
+  unsigned getIssueWidth() const {
+    return (InstrItins && InstrItins->SchedModel)
+      ? InstrItins->SchedModel->IssueWidth : 1;
+  }
+
+  /// getNumMicroOps - Return the number of issue slots required for this MI.
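+  /// (For example, with an issue width of 2, two single-uop instructions can
+  /// share a cycle, while a 3-uop instruction overflows it and ends the
+  /// group; illustrative numbers, both quantities are target-defined.)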
+  unsigned getNumMicroOps(MachineInstr *MI) const {
+    if (!InstrItins) return 1;
+    int UOps = InstrItins->getNumMicroOps(MI->getDesc().getSchedClass());
+    return (UOps >= 0) ? UOps : TII->getNumMicroOps(InstrItins, MI);
+  }
+
 protected:
+  void initRegPressure();
+  void updateScheduledPressure(std::vector<unsigned> NewMaxPressure);
+
   void moveInstruction(MachineInstr *MI, MachineBasicBlock::iterator InsertPos);
   bool checkSchedLimit();
+  void releaseRoots();
+
   void releaseSucc(SUnit *SU, SDep *SuccEdge);
   void releaseSuccessors(SUnit *SU);
   void releasePred(SUnit *SU, SDep *PredEdge);
   void releasePredecessors(SUnit *SU);
+
+  void placeDebugValues();
 };
 } // namespace
 
 /// ReleaseSucc - Decrement the NumPredsLeft count of a successor. When
 /// NumPredsLeft reaches zero, release the successor node.
+///
+/// FIXME: Adjust SuccSU height based on MinLatency.
 void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) {
   SUnit *SuccSU = SuccEdge->getSUnit();
@@ -345,6 +462,8 @@ void ScheduleDAGMI::releaseSuccessors(SUnit *SU) {
 
 /// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. When
 /// NumSuccsLeft reaches zero, release the predecessor node.
+///
+/// FIXME: Adjust PredSU height based on MinLatency.
 void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) {
   SUnit *PredSU = PredEdge->getSUnit();
@@ -371,12 +490,17 @@ void ScheduleDAGMI::releasePredecessors(SUnit *SU) {
 
 void ScheduleDAGMI::moveInstruction(MachineInstr *MI,
                                     MachineBasicBlock::iterator InsertPos) {
-  // Fix RegionBegin if the first instruction moves down.
+  // Advance RegionBegin if the first instruction moves down.
   if (&*RegionBegin == MI)
-    RegionBegin = llvm::next(RegionBegin);
+    ++RegionBegin;
+
+  // Update the instruction stream.
   BB->splice(InsertPos, BB, MI);
+
+  // Update LiveIntervals
   LIS->handleMove(MI);
-  // Fix RegionBegin if another instruction moves above the first instruction.
+
+  // Recede RegionBegin if an instruction moves above the first.
   if (RegionBegin == InsertPos)
     RegionBegin = MI;
 }
@@ -392,12 +516,114 @@ bool ScheduleDAGMI::checkSchedLimit() {
   return true;
 }
 
+/// enterRegion - Called back from MachineScheduler::runOnMachineFunction after
+/// crossing a scheduling boundary. [begin, end) includes all instructions in
+/// the region, including the boundary itself and single-instruction regions
+/// that don't get scheduled.
+void ScheduleDAGMI::enterRegion(MachineBasicBlock *bb,
+                                MachineBasicBlock::iterator begin,
+                                MachineBasicBlock::iterator end,
+                                unsigned endcount)
+{
+  ScheduleDAGInstrs::enterRegion(bb, begin, end, endcount);
+
+  // For convenience remember the end of the liveness region.
+  LiveRegionEnd =
+    (RegionEnd == bb->end()) ? RegionEnd : llvm::next(RegionEnd);
+}
+
+// Set up the register pressure trackers for the top and bottom scheduled
+// regions.
+void ScheduleDAGMI::initRegPressure() {
+  TopRPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin);
+  BotRPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd);
+
+  // Close the RPTracker to finalize live ins.
+  RPTracker.closeRegion();
+
+  DEBUG(RPTracker.getPressure().dump(TRI));
+
+  // Initialize the live ins and live outs.
+  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
+  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
+
+  // Close one end of the tracker so we can call
+  // getMaxUpward/DownwardPressureDelta before advancing across any
+  // instructions. This converts currently live regs into live ins/outs.
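+  // (A register live across the whole region is therefore counted as a
+  // live-in by the top tracker and as a live-out by the bottom tracker.)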
+  TopRPTracker.closeTop();
+  BotRPTracker.closeBottom();
+
+  // Account for liveness generated by the region boundary.
+  if (LiveRegionEnd != RegionEnd)
+    BotRPTracker.recede();
+
+  assert(BotRPTracker.getPos() == RegionEnd && "Can't find the region bottom");
+
+  // Cache the list of excess pressure sets in this region. This will also track
+  // the max pressure in the scheduled code for these sets.
+  RegionCriticalPSets.clear();
+  std::vector<unsigned> RegionPressure = RPTracker.getPressure().MaxSetPressure;
+  for (unsigned i = 0, e = RegionPressure.size(); i < e; ++i) {
+    unsigned Limit = TRI->getRegPressureSetLimit(i);
+    if (RegionPressure[i] > Limit)
+      RegionCriticalPSets.push_back(PressureElement(i, 0));
+  }
+  DEBUG(dbgs() << "Excess PSets: ";
+        for (unsigned i = 0, e = RegionCriticalPSets.size(); i != e; ++i)
+          dbgs() << TRI->getRegPressureSetName(
+            RegionCriticalPSets[i].PSetID) << " ";
+        dbgs() << "\n");
+}
+
+// FIXME: When the pressure tracker deals in pressure differences then we won't
+// iterate over all RegionCriticalPSets[i].
+void ScheduleDAGMI::
+updateScheduledPressure(std::vector<unsigned> NewMaxPressure) {
+  for (unsigned i = 0, e = RegionCriticalPSets.size(); i < e; ++i) {
+    unsigned ID = RegionCriticalPSets[i].PSetID;
+    int &MaxUnits = RegionCriticalPSets[i].UnitIncrease;
+    if ((int)NewMaxPressure[ID] > MaxUnits)
+      MaxUnits = NewMaxPressure[ID];
+  }
+}
+
+// Release all DAG roots for scheduling.
+void ScheduleDAGMI::releaseRoots() {
+  SmallVector<SUnit*, 16> BotRoots;
+
+  for (std::vector<SUnit>::iterator
+         I = SUnits.begin(), E = SUnits.end(); I != E; ++I) {
+    // A SUnit is ready to top schedule if it has no predecessors.
+    if (I->Preds.empty())
+      SchedImpl->releaseTopNode(&(*I));
+    // A SUnit is ready to bottom schedule if it has no successors.
+    if (I->Succs.empty())
+      BotRoots.push_back(&(*I));
+  }
+  // Release bottom roots in reverse order so the higher priority nodes appear
+  // first. This is more natural and slightly more efficient.
+  for (SmallVectorImpl<SUnit*>::const_reverse_iterator
+         I = BotRoots.rbegin(), E = BotRoots.rend(); I != E; ++I)
+    SchedImpl->releaseBottomNode(*I);
+}
+
 /// schedule - Called back from MachineScheduler::runOnMachineFunction
-/// after setting up the current scheduling region.
+/// after setting up the current scheduling region. [RegionBegin, RegionEnd)
+/// only includes instructions that have DAG nodes, not scheduling boundaries.
 void ScheduleDAGMI::schedule() {
-  buildSchedGraph(AA);
+  // Initialize the register pressure tracker used by buildSchedGraph.
+  RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd);
+
+  // Account for liveness generated by the region boundary.
+  if (LiveRegionEnd != RegionEnd)
+    RPTracker.recede();
+
+  // Build the DAG, and compute current register pressure.
+  buildSchedGraph(AA, &RPTracker);
+
+  // Initialize top/bottom trackers after computing region pressure.
+  initRegPressure();
 
-  DEBUG(dbgs() << "********** MI Scheduling **********\n");
   DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
           SUnits[su].dumpAll(this));
@@ -410,22 +636,12 @@ void ScheduleDAGMI::schedule() {
   releasePredecessors(&ExitSU);
 
   // Release all DAG roots for scheduling.
-  for (std::vector<SUnit>::iterator I = SUnits.begin(), E = SUnits.end();
-       I != E; ++I) {
-    // A SUnit is ready to top schedule if it has no predecessors.
-    if (I->Preds.empty())
-      SchedImpl->releaseTopNode(&(*I));
-    // A SUnit is ready to bottom schedule if it has no successors.
-    if (I->Succs.empty())
-      SchedImpl->releaseBottomNode(&(*I));
-  }
+  releaseRoots();
 
-  CurrentTop = RegionBegin;
+  CurrentTop = nextIfDebug(RegionBegin, RegionEnd);
   CurrentBottom = RegionEnd;
   bool IsTopNode = false;
   while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) {
-    DEBUG(dbgs() << "*** " << (IsTopNode ? "Top" : "Bottom")
-          << " Scheduling Instruction:\n"; SU->dump(this));
     if (!checkSchedLimit())
       break;
@@ -435,28 +651,69 @@ void ScheduleDAGMI::schedule() {
     if (IsTopNode) {
       assert(SU->isTopReady() && "node still has unscheduled dependencies");
       if (&*CurrentTop == MI)
-        ++CurrentTop;
-      else
+        CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom);
+      else {
         moveInstruction(MI, CurrentTop);
+        TopRPTracker.setPos(MI);
+      }
+
+      // Update top scheduled pressure.
+      TopRPTracker.advance();
+      assert(TopRPTracker.getPos() == CurrentTop && "out of sync");
+      updateScheduledPressure(TopRPTracker.getPressure().MaxSetPressure);
+
       // Release dependent instructions for scheduling.
       releaseSuccessors(SU);
     }
     else {
       assert(SU->isBottomReady() && "node still has unscheduled dependencies");
-      if (&*llvm::prior(CurrentBottom) == MI)
-        --CurrentBottom;
+      MachineBasicBlock::iterator priorII =
+        priorNonDebug(CurrentBottom, CurrentTop);
+      if (&*priorII == MI)
+        CurrentBottom = priorII;
       else {
-        if (&*CurrentTop == MI)
-          CurrentTop = llvm::next(CurrentTop);
+        if (&*CurrentTop == MI) {
+          CurrentTop = nextIfDebug(++CurrentTop, priorII);
+          TopRPTracker.setPos(CurrentTop);
+        }
         moveInstruction(MI, CurrentBottom);
         CurrentBottom = MI;
       }
+      // Update bottom scheduled pressure.
+      BotRPTracker.recede();
+      assert(BotRPTracker.getPos() == CurrentBottom && "out of sync");
+      updateScheduledPressure(BotRPTracker.getPressure().MaxSetPressure);
+
       // Release dependent instructions for scheduling.
       releasePredecessors(SU);
     }
     SU->isScheduled = true;
+    SchedImpl->schedNode(SU, IsTopNode);
   }
   assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
+
+  placeDebugValues();
+}
+
+/// Reinsert any remaining debug_values, just like the PostRA scheduler.
+void ScheduleDAGMI::placeDebugValues() {
+  // If first instruction was a DBG_VALUE then put it back.
+  if (FirstDbgValue) {
+    BB->splice(RegionBegin, BB, FirstDbgValue);
+    RegionBegin = FirstDbgValue;
+  }
+
+  for (std::vector<std::pair<MachineInstr *, MachineInstr *> >::iterator
+         DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) {
+    std::pair<MachineInstr *, MachineInstr *> P = *prior(DI);
+    MachineInstr *DbgValue = P.first;
+    MachineBasicBlock::iterator OrigPrevMI = P.second;
+    BB->splice(++OrigPrevMI, BB, DbgValue);
+    if (OrigPrevMI == llvm::prior(RegionEnd))
+      RegionEnd = DbgValue;
+  }
+  DbgValues.clear();
+  FirstDbgValue = NULL;
 }
 
 //===----------------------------------------------------------------------===//
@@ -464,56 +721,603 @@ void ScheduleDAGMI::schedule() {
 //===----------------------------------------------------------------------===//
 
 namespace {
+/// ReadyQueue encapsulates a vector of "ready" SUnits with basic convenience
+/// methods for pushing and removing nodes. ReadyQueues are uniquely identified
+/// by an ID. SUnit::NodeQueueId is a mask of the ReadyQueues the SUnit is in.
+class ReadyQueue {
+  unsigned ID;
+  std::string Name;
+  std::vector<SUnit*> Queue;
+
+public:
+  ReadyQueue(unsigned id, const Twine &name): ID(id), Name(name.str()) {}
+
+  unsigned getID() const { return ID; }
+
+  StringRef getName() const { return Name; }
+
+  // SU is in this queue if its NodeQueueId is a superset of this ID.
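+  // (For example, with TopQID == 1 and BotQID == 2 the pending queues get IDs
+  // 4 and 8, shifted left by LogMaxQID, so NodeQueueId == 9 marks an SUnit
+  // that sits in TopQ.A and BotQ.P at the same time.)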
+  bool isInQueue(SUnit *SU) const { return (SU->NodeQueueId & ID); }
+
+  bool empty() const { return Queue.empty(); }
+
+  unsigned size() const { return Queue.size(); }
+
+  typedef std::vector<SUnit*>::iterator iterator;
+
+  iterator begin() { return Queue.begin(); }
+
+  iterator end() { return Queue.end(); }
+
+  iterator find(SUnit *SU) {
+    return std::find(Queue.begin(), Queue.end(), SU);
+  }
+
+  void push(SUnit *SU) {
+    Queue.push_back(SU);
+    SU->NodeQueueId |= ID;
+  }
+
+  void remove(iterator I) {
+    (*I)->NodeQueueId &= ~ID;
+    *I = Queue.back();
+    Queue.pop_back();
+  }
+
+  void dump() {
+    dbgs() << Name << ": ";
+    for (unsigned i = 0, e = Queue.size(); i < e; ++i)
+      dbgs() << Queue[i]->NodeNum << " ";
+    dbgs() << "\n";
+  }
+};
+
 /// ConvergingScheduler shrinks the unscheduled zone using heuristics to balance
 /// the schedule.
 class ConvergingScheduler : public MachineSchedStrategy {
+
+  /// Store the state used by ConvergingScheduler heuristics, required for the
+  /// lifetime of one invocation of pickNode().
+  struct SchedCandidate {
+    // The best SUnit candidate.
+    SUnit *SU;
+
+    // Register pressure values for the best candidate.
+    RegPressureDelta RPDelta;
+
+    SchedCandidate(): SU(NULL) {}
+  };
+  /// Represent the type of SchedCandidate found within a single queue.
+  enum CandResult {
+    NoCand, NodeOrder, SingleExcess, SingleCritical, SingleMax, MultiPressure };
+
+  /// Each scheduling boundary is associated with ready queues. It tracks the
+  /// current cycle in whichever direction it has moved, and maintains the state
+  /// of "hazards" and other interlocks at the current cycle.
+  struct SchedBoundary {
+    ScheduleDAGMI *DAG;
+
+    ReadyQueue Available;
+    ReadyQueue Pending;
+    bool CheckPending;
+
+    ScheduleHazardRecognizer *HazardRec;
+
+    unsigned CurrCycle;
+    unsigned IssueCount;
+
+    /// MinReadyCycle - Cycle of the soonest available instruction.
+    unsigned MinReadyCycle;
+
+    // Remember the greatest min operand latency.
+    unsigned MaxMinLatency;
+
+    /// Pending queues extend the ready queues with the same ID and the
+    /// PendingFlag set.
+    SchedBoundary(unsigned ID, const Twine &Name):
+      DAG(0), Available(ID, Name+".A"),
+      Pending(ID << ConvergingScheduler::LogMaxQID, Name+".P"),
+      CheckPending(false), HazardRec(0), CurrCycle(0), IssueCount(0),
+      MinReadyCycle(UINT_MAX), MaxMinLatency(0) {}
+
+    ~SchedBoundary() { delete HazardRec; }
+
+    bool isTop() const {
+      return Available.getID() == ConvergingScheduler::TopQID;
+    }
+
+    bool checkHazard(SUnit *SU);
+
+    void releaseNode(SUnit *SU, unsigned ReadyCycle);
+
+    void bumpCycle();
+
+    void bumpNode(SUnit *SU);
+
+    void releasePending();
+
+    void removeReady(SUnit *SU);
+
+    SUnit *pickOnlyChoice();
+  };
+
   ScheduleDAGMI *DAG;
+  const TargetRegisterInfo *TRI;
 
-  unsigned NumTopReady;
-  unsigned NumBottomReady;
+  // State of the top and bottom scheduled instruction boundaries.
+  SchedBoundary Top;
+  SchedBoundary Bot;
 
 public:
-  virtual void initialize(ScheduleDAGMI *dag) {
-    DAG = dag;
+  /// SUnit::NodeQueueId: 0 (none), 1 (top), 2 (bot), 3 (both)
+  enum {
+    TopQID = 1,
+    BotQID = 2,
+    LogMaxQID = 2
+  };
+
+  ConvergingScheduler():
+    DAG(0), TRI(0), Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {}
+
+  virtual void initialize(ScheduleDAGMI *dag);
+
+  virtual SUnit *pickNode(bool &IsTopNode);
+
+  virtual void schedNode(SUnit *SU, bool IsTopNode);
+
+  virtual void releaseTopNode(SUnit *SU);
+
+  virtual void releaseBottomNode(SUnit *SU);
+
+protected:
+  SUnit *pickNodeBidrectional(bool &IsTopNode);
 
-    assert((!ForceTopDown || !ForceBottomUp) &&
-           "-misched-topdown incompatible with -misched-bottomup");
+  CandResult pickNodeFromQueue(ReadyQueue &Q,
+                               const RegPressureTracker &RPTracker,
+                               SchedCandidate &Candidate);
+#ifndef NDEBUG
+  void traceCandidate(const char *Label, const ReadyQueue &Q, SUnit *SU,
+                      PressureElement P = PressureElement());
+#endif
+};
+} // namespace
+
+void ConvergingScheduler::initialize(ScheduleDAGMI *dag) {
+  DAG = dag;
+  TRI = DAG->TRI;
+  Top.DAG = dag;
+  Bot.DAG = dag;
+
+  // Initialize the HazardRecognizers.
+  const TargetMachine &TM = DAG->MF.getTarget();
+  const InstrItineraryData *Itin = TM.getInstrItineraryData();
+  Top.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+  Bot.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+
+  assert((!ForceTopDown || !ForceBottomUp) &&
+         "-misched-topdown incompatible with -misched-bottomup");
+}
+
+void ConvergingScheduler::releaseTopNode(SUnit *SU) {
+  if (SU->isScheduled)
+    return;
+
+  for (SUnit::succ_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    unsigned PredReadyCycle = I->getSUnit()->TopReadyCycle;
+    unsigned Latency =
+      DAG->computeOperandLatency(I->getSUnit(), SU, *I, /*FindMin=*/true);
+#ifndef NDEBUG
+    Top.MaxMinLatency = std::max(Latency, Top.MaxMinLatency);
+#endif
+    if (SU->TopReadyCycle < PredReadyCycle + Latency)
+      SU->TopReadyCycle = PredReadyCycle + Latency;
   }
+  Top.releaseNode(SU, SU->TopReadyCycle);
+}
 
-  virtual SUnit *pickNode(bool &IsTopNode) {
-    if (DAG->top() == DAG->bottom())
-      return NULL;
+void ConvergingScheduler::releaseBottomNode(SUnit *SU) {
+  if (SU->isScheduled)
+    return;
 
-    // As an initial placeholder heuristic, schedule in the direction that has
-    // the fewest choices.
-    SUnit *SU;
-    if (ForceTopDown || (!ForceBottomUp && NumTopReady <= NumBottomReady)) {
-      SU = DAG->getSUnit(DAG->top());
-      IsTopNode = true;
+  assert(SU->getInstr() && "Scheduled SUnit must have instr");
+
+  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle;
+    unsigned Latency =
+      DAG->computeOperandLatency(SU, I->getSUnit(), *I, /*FindMin=*/true);
+#ifndef NDEBUG
+    Bot.MaxMinLatency = std::max(Latency, Bot.MaxMinLatency);
+#endif
+    if (SU->BotReadyCycle < SuccReadyCycle + Latency)
+      SU->BotReadyCycle = SuccReadyCycle + Latency;
+  }
+  Bot.releaseNode(SU, SU->BotReadyCycle);
+}
+
+/// Does this SU have a hazard within the current instruction group.
+///
+/// The scheduler supports two modes of hazard recognition. The first is the
+/// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that
+/// supports highly complicated in-order reservation tables
+/// (ScoreboardHazardRecognizer) and arbitrary target-specific logic.
+///
+/// The second is a streamlined mechanism that checks for hazards based on
+/// simple counters that the scheduler itself maintains. It explicitly checks
+/// for instruction dispatch limitations, including the number of micro-ops that
+/// can dispatch per cycle.
+///
+/// TODO: Also check whether the SU must start a new group.
+bool ConvergingScheduler::SchedBoundary::checkHazard(SUnit *SU) {
+  if (HazardRec->isEnabled())
+    return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard;
+
+  if (IssueCount + DAG->getNumMicroOps(SU->getInstr()) > DAG->getIssueWidth())
+    return true;
+
+  return false;
+}
+
+void ConvergingScheduler::SchedBoundary::releaseNode(SUnit *SU,
+                                                     unsigned ReadyCycle) {
+  if (ReadyCycle < MinReadyCycle)
+    MinReadyCycle = ReadyCycle;
+
+  // Check for interlocks first. For the purpose of other heuristics, an
+  // instruction that cannot issue appears as if it's not in the ReadyQueue.
+  if (ReadyCycle > CurrCycle || checkHazard(SU))
+    Pending.push(SU);
+  else
+    Available.push(SU);
+}
+
+/// Move the boundary of scheduled code by one cycle.
+void ConvergingScheduler::SchedBoundary::bumpCycle() {
+  unsigned Width = DAG->getIssueWidth();
+  IssueCount = (IssueCount <= Width) ? 0 : IssueCount - Width;
+
+  assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized");
+  unsigned NextCycle = std::max(CurrCycle + 1, MinReadyCycle);
+
+  if (!HazardRec->isEnabled()) {
+    // Bypass HazardRec virtual calls.
+    CurrCycle = NextCycle;
+  }
+  else {
+    // Bypass getHazardType calls in case of long latency.
+    for (; CurrCycle != NextCycle; ++CurrCycle) {
+      if (isTop())
+        HazardRec->AdvanceCycle();
+      else
+        HazardRec->RecedeCycle();
+    }
+  }
+  CheckPending = true;
+
+  DEBUG(dbgs() << "*** " << Available.getName() << " cycle "
+        << CurrCycle << '\n');
+}
+
+/// Move the boundary of scheduled code by one SUnit.
+void ConvergingScheduler::SchedBoundary::bumpNode(SUnit *SU) {
+  // Update the reservation table.
+  if (HazardRec->isEnabled()) {
+    if (!isTop() && SU->isCall) {
+      // Calls are scheduled with their preceding instructions. For bottom-up
+      // scheduling, clear the pipeline state before emitting.
+      HazardRec->Reset();
+    }
+    HazardRec->EmitInstruction(SU);
+  }
+  // Check the instruction group dispatch limit.
+  // TODO: Check if this SU must end a dispatch group.
+  IssueCount += DAG->getNumMicroOps(SU->getInstr());
+  if (IssueCount >= DAG->getIssueWidth()) {
+    DEBUG(dbgs() << "*** Max instrs at cycle " << CurrCycle << '\n');
+    bumpCycle();
+  }
+}
+
+/// Release pending ready nodes into the available queue. This makes them
+/// visible to heuristics.
+void ConvergingScheduler::SchedBoundary::releasePending() {
+  // If the available queue is empty, it is safe to reset MinReadyCycle.
+  if (Available.empty())
+    MinReadyCycle = UINT_MAX;
+
+  // Check to see if any of the pending instructions are ready to issue. If
+  // so, add them to the available queue.
+  for (unsigned i = 0, e = Pending.size(); i != e; ++i) {
+    SUnit *SU = *(Pending.begin()+i);
+    unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
+
+    if (ReadyCycle < MinReadyCycle)
+      MinReadyCycle = ReadyCycle;
+
+    if (ReadyCycle > CurrCycle)
+      continue;
+
+    if (checkHazard(SU))
+      continue;
+
+    Available.push(SU);
+    Pending.remove(Pending.begin()+i);
+    --i; --e;
+  }
+  CheckPending = false;
+}
+
+/// Remove SU from the ready set for this boundary.
+void ConvergingScheduler::SchedBoundary::removeReady(SUnit *SU) {
+  if (Available.isInQueue(SU))
+    Available.remove(Available.find(SU));
+  else {
+    assert(Pending.isInQueue(SU) && "bad ready count");
+    Pending.remove(Pending.find(SU));
+  }
+}
+
+/// If this queue only has one ready candidate, return it. As a side effect,
+/// advance the cycle until at least one node is ready. If multiple instructions
+/// are ready, return NULL.
+SUnit *ConvergingScheduler::SchedBoundary::pickOnlyChoice() {
+  if (CheckPending)
+    releasePending();
+
+  for (unsigned i = 0; Available.empty(); ++i) {
+    assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) &&
+           "permanent hazard"); (void)i;
+    bumpCycle();
+    releasePending();
+  }
+  if (Available.size() == 1)
+    return *Available.begin();
+  return NULL;
+}
+
+#ifndef NDEBUG
+void ConvergingScheduler::traceCandidate(const char *Label, const ReadyQueue &Q,
+                                         SUnit *SU, PressureElement P) {
+  dbgs() << Label << " " << Q.getName() << " ";
+  if (P.isValid())
+    dbgs() << TRI->getRegPressureSetName(P.PSetID) << ":" << P.UnitIncrease
+           << " ";
+  else
+    dbgs() << " ";
+  SU->dump(DAG);
+}
+#endif
+
+/// pickNodeFromQueue helper that returns true if the LHS reg pressure effect is
+/// more desirable than RHS from a scheduling standpoint.
+static bool compareRPDelta(const RegPressureDelta &LHS,
+                           const RegPressureDelta &RHS) {
+  // Compare each component of pressure in decreasing order of importance
+  // without checking if any are valid. Invalid PressureElements are assumed to
+  // have UnitIncrease==0, so are neutral.
+
+  // Avoid exceeding the target's limit.
+  if (LHS.Excess.UnitIncrease != RHS.Excess.UnitIncrease)
+    return LHS.Excess.UnitIncrease < RHS.Excess.UnitIncrease;
+
+  // Avoid increasing the max critical pressure in the scheduled region.
+  if (LHS.CriticalMax.UnitIncrease != RHS.CriticalMax.UnitIncrease)
+    return LHS.CriticalMax.UnitIncrease < RHS.CriticalMax.UnitIncrease;
+
+  // Avoid increasing the max pressure of the entire region.
+  if (LHS.CurrentMax.UnitIncrease != RHS.CurrentMax.UnitIncrease)
+    return LHS.CurrentMax.UnitIncrease < RHS.CurrentMax.UnitIncrease;
+
+  return false;
+}
+
+/// Pick the best candidate from the top queue.
+///
+/// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during
+/// DAG building. To adjust for the current scheduling location we need to
+/// maintain the number of vreg uses remaining to be top-scheduled.
+ConvergingScheduler::CandResult ConvergingScheduler::
+pickNodeFromQueue(ReadyQueue &Q, const RegPressureTracker &RPTracker,
+                  SchedCandidate &Candidate) {
+  DEBUG(Q.dump());
+
+  // getMaxPressureDelta temporarily modifies the tracker.
+  RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
+
+  // BestSU remains NULL if no top candidates beat the best existing candidate.
+  CandResult FoundCandidate = NoCand;
+  for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {
+    RegPressureDelta RPDelta;
+    TempTracker.getMaxPressureDelta((*I)->getInstr(), RPDelta,
+                                    DAG->getRegionCriticalPSets(),
+                                    DAG->getRegPressure().MaxSetPressure);
+
+    // Initialize the candidate if needed.
+ if (!Candidate.SU) { + Candidate.SU = *I; + Candidate.RPDelta = RPDelta; + FoundCandidate = NodeOrder; + continue; + } + // Avoid exceeding the target's limit. + if (RPDelta.Excess.UnitIncrease < Candidate.RPDelta.Excess.UnitIncrease) { + DEBUG(traceCandidate("ECAND", Q, *I, RPDelta.Excess)); + Candidate.SU = *I; + Candidate.RPDelta = RPDelta; + FoundCandidate = SingleExcess; + continue; + } + if (RPDelta.Excess.UnitIncrease > Candidate.RPDelta.Excess.UnitIncrease) + continue; + if (FoundCandidate == SingleExcess) + FoundCandidate = MultiPressure; + + // Avoid increasing the max critical pressure in the scheduled region. + if (RPDelta.CriticalMax.UnitIncrease + < Candidate.RPDelta.CriticalMax.UnitIncrease) { + DEBUG(traceCandidate("PCAND", Q, *I, RPDelta.CriticalMax)); + Candidate.SU = *I; + Candidate.RPDelta = RPDelta; + FoundCandidate = SingleCritical; + continue; + } + if (RPDelta.CriticalMax.UnitIncrease + > Candidate.RPDelta.CriticalMax.UnitIncrease) + continue; + if (FoundCandidate == SingleCritical) + FoundCandidate = MultiPressure; + + // Avoid increasing the max pressure of the entire region. + if (RPDelta.CurrentMax.UnitIncrease + < Candidate.RPDelta.CurrentMax.UnitIncrease) { + DEBUG(traceCandidate("MCAND", Q, *I, RPDelta.CurrentMax)); + Candidate.SU = *I; + Candidate.RPDelta = RPDelta; + FoundCandidate = SingleMax; + continue; } - if (SU->isBottomReady()) { - assert(NumBottomReady > 0 && "bad ready count"); - --NumBottomReady; + if (RPDelta.CurrentMax.UnitIncrease + > Candidate.RPDelta.CurrentMax.UnitIncrease) + continue; + if (FoundCandidate == SingleMax) + FoundCandidate = MultiPressure; + + // Fall through to original instruction order. + // Only consider node order if Candidate was chosen from this Q. + if (FoundCandidate == NoCand) + continue; + + if ((Q.getID() == TopQID && (*I)->NodeNum < Candidate.SU->NodeNum) + || (Q.getID() == BotQID && (*I)->NodeNum > Candidate.SU->NodeNum)) { + DEBUG(traceCandidate("NCAND", Q, *I)); + Candidate.SU = *I; + Candidate.RPDelta = RPDelta; + FoundCandidate = NodeOrder; } + } + return FoundCandidate; +} + +/// Pick the best candidate node from either the top or bottom queue. +SUnit *ConvergingScheduler::pickNodeBidrectional(bool &IsTopNode) { + // Schedule as far as possible in the direction of no choice. This is most + // efficient, but also provides the best heuristics for CriticalPSets. + if (SUnit *SU = Bot.pickOnlyChoice()) { + IsTopNode = false; return SU; } + if (SUnit *SU = Top.pickOnlyChoice()) { + IsTopNode = true; + return SU; + } + SchedCandidate BotCand; + // Prefer bottom scheduling when heuristics are silent. + CandResult BotResult = pickNodeFromQueue(Bot.Available, + DAG->getBotRPTracker(), BotCand); + assert(BotResult != NoCand && "failed to find the first candidate"); + + // If either Q has a single candidate that provides the least increase in + // Excess pressure, we can immediately schedule from that Q. + // + // RegionCriticalPSets summarizes the pressure within the scheduled region and + // affects picking from either Q. If scheduling in one direction must + // increase pressure for one of the excess PSets, then schedule in that + // direction first to provide more freedom in the other direction. + if (BotResult == SingleExcess || BotResult == SingleCritical) { + IsTopNode = false; + return BotCand.SU; + } + // Check if the top Q has a better candidate. 
+  SchedCandidate TopCand;
+  CandResult TopResult = pickNodeFromQueue(Top.Available,
+                                           DAG->getTopRPTracker(), TopCand);
+  assert(TopResult != NoCand && "failed to find the first candidate");
+
+  if (TopResult == SingleExcess || TopResult == SingleCritical) {
+    IsTopNode = true;
+    return TopCand.SU;
+  }
+  // If either Q has a single candidate that minimizes pressure above the
+  // original region's pressure pick it.
+  if (BotResult == SingleMax) {
+    IsTopNode = false;
+    return BotCand.SU;
+  }
+  if (TopResult == SingleMax) {
+    IsTopNode = true;
+    return TopCand.SU;
+  }
+  // Check for a salient pressure difference and pick the best from either side.
+  if (compareRPDelta(TopCand.RPDelta, BotCand.RPDelta)) {
+    IsTopNode = true;
+    return TopCand.SU;
+  }
+  // Otherwise prefer the bottom candidate in node order.
+  IsTopNode = false;
+  return BotCand.SU;
+}
 
-  virtual void releaseTopNode(SUnit *SU) {
-    ++NumTopReady;
+/// Pick the best node to balance the schedule. Implements MachineSchedStrategy.
+SUnit *ConvergingScheduler::pickNode(bool &IsTopNode) {
+  if (DAG->top() == DAG->bottom()) {
+    assert(Top.Available.empty() && Top.Pending.empty() &&
+           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
+    return NULL;
   }
-  virtual void releaseBottomNode(SUnit *SU) {
-    ++NumBottomReady;
+  SUnit *SU;
+  if (ForceTopDown) {
+    SU = Top.pickOnlyChoice();
+    if (!SU) {
+      SchedCandidate TopCand;
+      CandResult TopResult =
+        pickNodeFromQueue(Top.Available, DAG->getTopRPTracker(), TopCand);
+      assert(TopResult != NoCand && "failed to find the first candidate");
+      (void)TopResult;
+      SU = TopCand.SU;
+    }
+    IsTopNode = true;
   }
-};
-} // namespace
+  else if (ForceBottomUp) {
+    SU = Bot.pickOnlyChoice();
+    if (!SU) {
+      SchedCandidate BotCand;
+      CandResult BotResult =
+        pickNodeFromQueue(Bot.Available, DAG->getBotRPTracker(), BotCand);
+      assert(BotResult != NoCand && "failed to find the first candidate");
+      (void)BotResult;
+      SU = BotCand.SU;
+    }
+    IsTopNode = false;
+  }
+  else {
+    SU = pickNodeBidrectional(IsTopNode);
+  }
+  if (SU->isTopReady())
+    Top.removeReady(SU);
+  if (SU->isBottomReady())
+    Bot.removeReady(SU);
+
+  DEBUG(dbgs() << "*** " << (IsTopNode ? "Top" : "Bottom")
+        << " Scheduling Instruction in cycle "
+        << (IsTopNode ? Top.CurrCycle : Bot.CurrCycle) << '\n';
+        SU->dump(DAG));
+  return SU;
+}
+
+/// Update the scheduler's state after scheduling a node. This is the same node
+/// that was just returned by pickNode(). However, ScheduleDAGMI needs to update
+/// its state based on the current cycle before MachineSchedStrategy does.
+void ConvergingScheduler::schedNode(SUnit *SU, bool IsTopNode) {
+  if (IsTopNode) {
+    SU->TopReadyCycle = Top.CurrCycle;
+    Top.bumpNode(SU);
+  }
+  else {
+    SU->BotReadyCycle = Bot.CurrCycle;
+    Bot.bumpNode(SU);
+  }
+}
 
 /// Create the standard converging machine scheduler. This will be used as the
 /// default scheduler if the target does not set a default.
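The five callbacks above are the entire MachineSchedStrategy surface, so a
policy can be swapped in without touching ScheduleDAGMI. A minimal sketch
against that interface and the registry shown earlier in this file
(FIFOScheduler, createFIFOSched, and "fifo-example" are illustrative names,
not part of this patch; the usual CodeGen headers plus <deque> are assumed):

  #include <deque>

  namespace {
  // Illustrative only: schedule strictly top-down in ready order.
  class FIFOScheduler : public MachineSchedStrategy {
    std::deque<SUnit*> ReadyQ;
  public:
    virtual void initialize(ScheduleDAGMI *dag) { ReadyQ.clear(); }

    // Returning NULL stops ScheduleDAGMI's scheduling loop.
    virtual SUnit *pickNode(bool &IsTopNode) {
      IsTopNode = true;
      if (ReadyQ.empty()) return NULL;
      SUnit *SU = ReadyQ.front();
      ReadyQ.pop_front();
      return SU;
    }

    // No cycle or hazard bookkeeping in this sketch.
    virtual void schedNode(SUnit *SU, bool IsTopNode) {}

    virtual void releaseTopNode(SUnit *SU) { ReadyQ.push_back(SU); }
    // Ignore bottom releases; this policy never schedules bottom-up, and
    // every node is eventually released top-down anyway.
    virtual void releaseBottomNode(SUnit *SU) {}
  };
  } // namespace

  static ScheduleDAGInstrs *createFIFOSched(MachineSchedContext *C) {
    return new ScheduleDAGMI(C, new FIFOScheduler());
  }
  static MachineSchedRegistry
  FIFOSchedRegistry("fifo-example", "Illustrative FIFO scheduler.",
                    createFIFOSched);

Registering it through MachineSchedRegistry would make it selectable the same
way "default" selects useDefaultMachineSched above (via the -misched option
that MachineSchedOpt parses).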
@@ -592,6 +1396,8 @@ public: return SU; } + virtual void schedNode(SUnit *SU, bool IsTopNode) {} + virtual void releaseTopNode(SUnit *SU) { TopQ.push(SU); } diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp index 74ba94d..d8dece6 100644 --- a/lib/CodeGen/MachineVerifier.cpp +++ b/lib/CodeGen/MachineVerifier.cpp @@ -89,8 +89,8 @@ namespace { void addRegWithSubRegs(RegVector &RV, unsigned Reg) { RV.push_back(Reg); if (TargetRegisterInfo::isPhysicalRegister(Reg)) - for (const uint16_t *R = TRI->getSubRegisters(Reg); *R; R++) - RV.push_back(*R); + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + RV.push_back(*SubRegs); } struct BBInfo { @@ -191,9 +191,11 @@ namespace { void visitMachineFunctionBefore(); void visitMachineBasicBlockBefore(const MachineBasicBlock *MBB); + void visitMachineBundleBefore(const MachineInstr *MI); void visitMachineInstrBefore(const MachineInstr *MI); void visitMachineOperand(const MachineOperand *MO, unsigned MONum); void visitMachineInstrAfter(const MachineInstr *MI); + void visitMachineBundleAfter(const MachineInstr *MI); void visitMachineBasicBlockAfter(const MachineBasicBlock *MBB); void visitMachineFunctionAfter(); @@ -288,6 +290,8 @@ bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) { for (MachineFunction::const_iterator MFI = MF.begin(), MFE = MF.end(); MFI!=MFE; ++MFI) { visitMachineBasicBlockBefore(MFI); + // Keep track of the current bundle header. + const MachineInstr *CurBundle = 0; for (MachineBasicBlock::const_instr_iterator MBBI = MFI->instr_begin(), MBBE = MFI->instr_end(); MBBI != MBBE; ++MBBI) { if (MBBI->getParent() != MFI) { @@ -295,15 +299,21 @@ bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) { *OS << "Instruction: " << *MBBI; continue; } - // Skip BUNDLE instruction for now. FIXME: We should add code to verify - // the BUNDLE's specifically. - if (MBBI->isBundle()) - continue; + // Is this a bundle header? 
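+      // (An instruction not marked inside-bundle is itself a header, so a
+      // normal unbundled instruction forms a one-instruction bundle.)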
+ if (!MBBI->isInsideBundle()) { + if (CurBundle) + visitMachineBundleAfter(CurBundle); + CurBundle = MBBI; + visitMachineBundleBefore(CurBundle); + } else if (!CurBundle) + report("No bundle header", MBBI); visitMachineInstrBefore(MBBI); for (unsigned I = 0, E = MBBI->getNumOperands(); I != E; ++I) visitMachineOperand(&MBBI->getOperand(I), I); visitMachineInstrAfter(MBBI); } + if (CurBundle) + visitMachineBundleAfter(CurBundle); visitMachineBasicBlockAfter(MFI); } visitMachineFunctionAfter(); @@ -384,10 +394,10 @@ void MachineVerifier::visitMachineFunctionBefore() { // A sub-register of a reserved register is also reserved for (int Reg = regsReserved.find_first(); Reg>=0; Reg = regsReserved.find_next(Reg)) { - for (const uint16_t *Sub = TRI->getSubRegisters(Reg); *Sub; ++Sub) { + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { // FIXME: This should probably be: - // assert(regsReserved.test(*Sub) && "Non-reserved sub-register"); - regsReserved.set(*Sub); + // assert(regsReserved.test(*SubRegs) && "Non-reserved sub-register"); + regsReserved.set(*SubRegs); } } @@ -466,8 +476,8 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { report("MBB exits via unconditional fall-through but its successor " "differs from its CFG successor!", MBB); } - if (!MBB->empty() && MBB->back().isBarrier() && - !TII->isPredicated(&MBB->back())) { + if (!MBB->empty() && getBundleStart(&MBB->back())->isBarrier() && + !TII->isPredicated(getBundleStart(&MBB->back()))) { report("MBB exits via unconditional fall-through but ends with a " "barrier instruction!", MBB); } @@ -487,10 +497,10 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { if (MBB->empty()) { report("MBB exits via unconditional branch but doesn't contain " "any instructions!", MBB); - } else if (!MBB->back().isBarrier()) { + } else if (!getBundleStart(&MBB->back())->isBarrier()) { report("MBB exits via unconditional branch but doesn't end with a " "barrier instruction!", MBB); - } else if (!MBB->back().isTerminator()) { + } else if (!getBundleStart(&MBB->back())->isTerminator()) { report("MBB exits via unconditional branch but the branch isn't a " "terminator instruction!", MBB); } @@ -510,10 +520,10 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { if (MBB->empty()) { report("MBB exits via conditional branch/fall-through but doesn't " "contain any instructions!", MBB); - } else if (MBB->back().isBarrier()) { + } else if (getBundleStart(&MBB->back())->isBarrier()) { report("MBB exits via conditional branch/fall-through but ends with a " "barrier instruction!", MBB); - } else if (!MBB->back().isTerminator()) { + } else if (!getBundleStart(&MBB->back())->isTerminator()) { report("MBB exits via conditional branch/fall-through but the branch " "isn't a terminator instruction!", MBB); } @@ -530,10 +540,10 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { if (MBB->empty()) { report("MBB exits via conditional branch/branch but doesn't " "contain any instructions!", MBB); - } else if (!MBB->back().isBarrier()) { + } else if (!getBundleStart(&MBB->back())->isBarrier()) { report("MBB exits via conditional branch/branch but doesn't end with a " "barrier instruction!", MBB); - } else if (!MBB->back().isTerminator()) { + } else if (!getBundleStart(&MBB->back())->isTerminator()) { report("MBB exits via conditional branch/branch but the branch " "isn't a terminator instruction!", MBB); } @@ -554,8 +564,8 @@ 
MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { continue; } regsLive.insert(*I); - for (const uint16_t *R = TRI->getSubRegisters(*I); *R; R++) - regsLive.insert(*R); + for (MCSubRegIterator SubRegs(*I, TRI); SubRegs.isValid(); ++SubRegs) + regsLive.insert(*SubRegs); } regsLiveInButUnused = regsLive; @@ -564,8 +574,8 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { BitVector PR = MFI->getPristineRegs(MBB); for (int I = PR.find_first(); I>0; I = PR.find_next(I)) { regsLive.insert(I); - for (const uint16_t *R = TRI->getSubRegisters(I); *R; R++) - regsLive.insert(*R); + for (MCSubRegIterator SubRegs(I, TRI); SubRegs.isValid(); ++SubRegs) + regsLive.insert(*SubRegs); } regsKilled.clear(); @@ -575,6 +585,30 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { lastIndex = Indexes->getMBBStartIdx(MBB); } +// This function gets called for all bundle headers, including normal +// stand-alone unbundled instructions. +void MachineVerifier::visitMachineBundleBefore(const MachineInstr *MI) { + if (Indexes && Indexes->hasIndex(MI)) { + SlotIndex idx = Indexes->getInstructionIndex(MI); + if (!(idx > lastIndex)) { + report("Instruction index out of order", MI); + *OS << "Last instruction was at " << lastIndex << '\n'; + } + lastIndex = idx; + } + + // Ensure non-terminators don't follow terminators. + // Ignore predicated terminators formed by if conversion. + // FIXME: If conversion shouldn't need to violate this rule. + if (MI->isTerminator() && !TII->isPredicated(MI)) { + if (!FirstTerminator) + FirstTerminator = MI; + } else if (FirstTerminator) { + report("Non-terminator instruction after the first terminator", MI); + *OS << "First terminator was:\t" << *FirstTerminator; + } +} + void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) { const MCInstrDesc &MCID = MI->getDesc(); if (MI->getNumOperands() < MCID.getNumOperands()) { @@ -608,17 +642,6 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) { } } - // Ensure non-terminators don't follow terminators. - // Ignore predicated terminators formed by if conversion. - // FIXME: If conversion shouldn't need to violate this rule. 
- if (MI->isTerminator() && !TII->isPredicated(MI)) { - if (!FirstTerminator) - FirstTerminator = MI; - } else if (FirstTerminator) { - report("Non-terminator instruction after the first terminator", MI); - *OS << "First terminator was:\t" << *FirstTerminator; - } - StringRef ErrorInfo; if (!TII->verifyInstruction(MI, ErrorInfo)) report(ErrorInfo.data(), MI); @@ -634,7 +657,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { if (MONum < MCID.getNumDefs()) { if (!MO->isReg()) report("Explicit definition must be a register", MO, MONum); - else if (!MO->isDef()) + else if (!MO->isDef() && !MCOI.isOptionalDef()) report("Explicit definition marked as use", MO, MONum); else if (MO->isImplicit()) report("Explicit definition marked as implicit", MO, MONum); @@ -672,7 +695,8 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { report("Illegal subregister index for physical register", MO, MONum); return; } - if (const TargetRegisterClass *DRC = TII->getRegClass(MCID,MONum,TRI)) { + if (const TargetRegisterClass *DRC = + TII->getRegClass(MCID, MONum, TRI, *MF)) { if (!DRC->contains(Reg)) { report("Illegal physical register for instruction", MO, MONum); *OS << TRI->getName(Reg) << " is not a " @@ -698,7 +722,8 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { return; } } - if (const TargetRegisterClass *DRC = TII->getRegClass(MCID,MONum,TRI)) { + if (const TargetRegisterClass *DRC = + TII->getRegClass(MCID, MONum, TRI, *MF)) { if (SubIdx) { const TargetRegisterClass *SuperRC = TRI->getLargestLegalSuperClass(RC); @@ -812,6 +837,8 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { // Reserved registers may be used even when 'dead'. if (!isReserved(Reg)) report("Using an undefined physical register", MO, MONum); + } else if (MRI->def_empty(Reg)) { + report("Reading virtual register without a def", MO, MONum); } else { BBInfo &MInfo = MBBInfoMap[MI->getParent()]; // We don't know which virtual registers are live in, so only complain @@ -841,12 +868,13 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { // Check LiveInts for a live range, but only for virtual registers. if (LiveInts && TargetRegisterInfo::isVirtualRegister(Reg) && !LiveInts->isNotInMIMap(MI)) { - SlotIndex DefIdx = LiveInts->getInstructionIndex(MI).getRegSlot(); + SlotIndex DefIdx = LiveInts->getInstructionIndex(MI); + DefIdx = DefIdx.getRegSlot(MO->isEarlyClobber()); if (LiveInts->hasInterval(Reg)) { const LiveInterval &LI = LiveInts->getInterval(Reg); if (const VNInfo *VNI = LI.getVNInfoAt(DefIdx)) { assert(VNI && "NULL valno is not allowed"); - if (VNI->def != DefIdx && !MO->isEarlyClobber()) { + if (VNI->def != DefIdx) { report("Inconsistent valno->def", MO, MONum); *OS << "Valno " << VNI->id << " is not defined at " << DefIdx << " in " << LI << '\n'; @@ -863,6 +891,13 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { } void MachineVerifier::visitMachineInstrAfter(const MachineInstr *MI) { +} + +// This function gets called after visiting all instructions in a bundle. The +// argument points to the bundle header. +// Normal stand-alone instructions are also considered 'bundles', and this +// function is called for all of them. 
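+// Liveness bookkeeping is per bundle: the kills and defs recorded while
+// visiting each bundled instruction are applied to regsLive here at the
+// bundle boundary rather than after every individual instruction.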
+void MachineVerifier::visitMachineBundleAfter(const MachineInstr *MI) { BBInfo &MInfo = MBBInfoMap[MI->getParent()]; set_union(MInfo.regsKilled, regsKilled); set_subtract(regsLive, regsKilled); regsKilled.clear(); @@ -876,15 +911,6 @@ void MachineVerifier::visitMachineInstrAfter(const MachineInstr *MI) { } set_subtract(regsLive, regsDead); regsDead.clear(); set_union(regsLive, regsDefined); regsDefined.clear(); - - if (Indexes && Indexes->hasIndex(MI)) { - SlotIndex idx = Indexes->getInstructionIndex(MI); - if (!(idx > lastIndex)) { - report("Instruction index out of order", MI); - *OS << "Last instruction was at " << lastIndex << '\n'; - } - lastIndex = idx; - } } void @@ -1025,7 +1051,21 @@ void MachineVerifier::visitMachineFunctionAfter() { // Now check liveness info if available calcRegsRequired(); - if (MRI->isSSA() && !MF->empty()) { + // Check for killed virtual registers that should be live out. + for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end(); + MFI != MFE; ++MFI) { + BBInfo &MInfo = MBBInfoMap[MFI]; + for (RegSet::iterator + I = MInfo.vregsRequired.begin(), E = MInfo.vregsRequired.end(); I != E; + ++I) + if (MInfo.regsKilled.count(*I)) { + report("Virtual register killed in block, but needed live out.", MFI); + *OS << "Virtual register " << PrintReg(*I) + << " is used after the block.\n"; + } + } + + if (!MF->empty()) { BBInfo &MInfo = MBBInfoMap[&MF->front()]; for (RegSet::iterator I = MInfo.vregsRequired.begin(), E = MInfo.vregsRequired.end(); I != E; @@ -1069,20 +1109,21 @@ void MachineVerifier::verifyLiveVariables() { void MachineVerifier::verifyLiveIntervals() { assert(LiveInts && "Don't call verifyLiveIntervals without LiveInts"); - for (LiveIntervals::const_iterator LVI = LiveInts->begin(), - LVE = LiveInts->end(); LVI != LVE; ++LVI) { - const LiveInterval &LI = *LVI->second; + for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { + unsigned Reg = TargetRegisterInfo::index2VirtReg(i); // Spilling and splitting may leave unused registers around. Skip them. - if (MRI->use_empty(LI.reg)) + if (MRI->reg_nodbg_empty(Reg)) continue; - // Physical registers have much weirdness going on, mostly from coalescing. - // We should probably fix it, but for now just ignore them. - if (TargetRegisterInfo::isPhysicalRegister(LI.reg)) + if (!LiveInts->hasInterval(Reg)) { + report("Missing live interval for virtual register", MF); + *OS << PrintReg(Reg, TRI) << " still has defs or uses\n"; continue; + } - assert(LVI->first == LI.reg && "Invalid reg to interval mapping"); + const LiveInterval &LI = LiveInts->getInterval(Reg); + assert(Reg == LI.reg && "Invalid reg to interval mapping"); for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end(); I!=E; ++I) { @@ -1307,15 +1348,18 @@ void MachineVerifier::verifyLiveIntervals() { ++MFI; continue; } + + // Is VNI a PHI-def in the current block? + bool IsPHI = VNI->isPHIDef() && + VNI->def == LiveInts->getMBBStartIdx(MFI); + // Check that VNI is live-out of all predecessors. for (MachineBasicBlock::const_pred_iterator PI = MFI->pred_begin(), PE = MFI->pred_end(); PI != PE; ++PI) { SlotIndex PEnd = LiveInts->getMBBEndIdx(*PI); const VNInfo *PVNI = LI.getVNInfoBefore(PEnd); - if (VNI->isPHIDef() && VNI->def == LiveInts->getMBBStartIdx(MFI)) - continue; - + // All predecessors must have a live-out value. 
if (!PVNI) { report("Register not marked live out of predecessor", *PI); *OS << "Valno #" << VNI->id << " live into BB#" << MFI->getNumber() @@ -1324,12 +1368,14 @@ void MachineVerifier::verifyLiveIntervals() { continue; } - if (PVNI != VNI) { + // Only PHI-defs can take different predecessor values. + if (!IsPHI && PVNI != VNI) { report("Different value live out of predecessor", *PI); *OS << "Valno #" << PVNI->id << " live out of BB#" << (*PI)->getNumber() << '@' << PEnd << "\nValno #" << VNI->id << " live into BB#" << MFI->getNumber() - << '@' << LiveInts->getMBBStartIdx(MFI) << " in " << LI << '\n'; + << '@' << LiveInts->getMBBStartIdx(MFI) << " in " + << PrintReg(Reg) << ": " << LI << '\n'; } } if (&*MFI == EndMBB) @@ -1357,4 +1403,3 @@ void MachineVerifier::verifyLiveIntervals() { } } } - diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp index 0ed4c34..e6e23da 100644 --- a/lib/CodeGen/PHIElimination.cpp +++ b/lib/CodeGen/PHIElimination.cpp @@ -171,23 +171,30 @@ bool PHIElimination::EliminatePHINodes(MachineFunction &MF, return true; } +/// isImplicitlyDefined - Return true if all defs of VirtReg are implicit-defs. +/// This includes registers with no defs. +static bool isImplicitlyDefined(unsigned VirtReg, + const MachineRegisterInfo *MRI) { + for (MachineRegisterInfo::def_iterator DI = MRI->def_begin(VirtReg), + DE = MRI->def_end(); DI != DE; ++DI) + if (!DI->isImplicitDef()) + return false; + return true; +} + /// isSourceDefinedByImplicitDef - Return true if all sources of the phi node /// are implicit_def's. static bool isSourceDefinedByImplicitDef(const MachineInstr *MPhi, const MachineRegisterInfo *MRI) { - for (unsigned i = 1; i != MPhi->getNumOperands(); i += 2) { - unsigned SrcReg = MPhi->getOperand(i).getReg(); - const MachineInstr *DefMI = MRI->getVRegDef(SrcReg); - if (!DefMI || !DefMI->isImplicitDef()) + for (unsigned i = 1; i != MPhi->getNumOperands(); i += 2) + if (!isImplicitlyDefined(MPhi->getOperand(i).getReg(), MRI)) return false; - } return true; } - /// LowerAtomicPHINode - Lower the PHI node at the top of the specified block, -/// under the assuption that it needs to be lowered in a way that supports +/// under the assumption that it needs to be lowered in a way that supports /// atomic execution of PHIs. This lowering method is always correct all of the /// time. /// @@ -287,7 +294,8 @@ void PHIElimination::LowerAtomicPHINode( for (int i = NumSrcs - 1; i >= 0; --i) { unsigned SrcReg = MPhi->getOperand(i*2+1).getReg(); unsigned SrcSubReg = MPhi->getOperand(i*2+1).getSubReg(); - + bool SrcUndef = MPhi->getOperand(i*2+1).isUndef() || + isImplicitlyDefined(SrcReg, MRI); assert(TargetRegisterInfo::isVirtualRegister(SrcReg) && "Machine PHI Operands must all be virtual registers!"); @@ -295,14 +303,6 @@ void PHIElimination::LowerAtomicPHINode( // path the PHI. MachineBasicBlock &opBlock = *MPhi->getOperand(i*2+2).getMBB(); - // If source is defined by an implicit def, there is no need to insert a - // copy. - MachineInstr *DefMI = MRI->getVRegDef(SrcReg); - if (DefMI->isImplicitDef()) { - ImpDefs.insert(DefMI); - continue; - } - // Check to make sure we haven't already emitted the copy for this block. // This can happen because PHI nodes may have multiple entries for the same // basic block. @@ -315,12 +315,27 @@ void PHIElimination::LowerAtomicPHINode( findPHICopyInsertPoint(&opBlock, &MBB, SrcReg); // Insert the copy. 
- if (!reusedIncoming && IncomingReg) - BuildMI(opBlock, InsertPos, MPhi->getDebugLoc(), - TII->get(TargetOpcode::COPY), IncomingReg).addReg(SrcReg, 0, SrcSubReg); + if (!reusedIncoming && IncomingReg) { + if (SrcUndef) { + // The source register is undefined, so there is no need for a real + // COPY, but we still need to ensure joint dominance by defs. + // Insert an IMPLICIT_DEF instruction. + BuildMI(opBlock, InsertPos, MPhi->getDebugLoc(), + TII->get(TargetOpcode::IMPLICIT_DEF), IncomingReg); + + // Clean up the old implicit-def, if there even was one. + if (MachineInstr *DefMI = MRI->getVRegDef(SrcReg)) + if (DefMI->isImplicitDef()) + ImpDefs.insert(DefMI); + } else { + BuildMI(opBlock, InsertPos, MPhi->getDebugLoc(), + TII->get(TargetOpcode::COPY), IncomingReg) + .addReg(SrcReg, 0, SrcSubReg); + } + } // Now update live variable information if we have it. Otherwise we're done - if (!LV) continue; + if (SrcUndef || !LV) continue; // We want to be able to insert a kill of the register if this PHI (aka, the // copy we just inserted) is the last use of the source value. Live @@ -340,39 +355,35 @@ void PHIElimination::LowerAtomicPHINode( // add a kill marker in this block saying that it kills the incoming value! if (!ValueIsUsed && !LV->isLiveOut(SrcReg, opBlock)) { // In our final twist, we have to decide which instruction kills the - // register. In most cases this is the copy, however, the first - // terminator instruction at the end of the block may also use the value. - // In this case, we should mark *it* as being the killing block, not the - // copy. - MachineBasicBlock::iterator KillInst; - MachineBasicBlock::iterator Term = opBlock.getFirstTerminator(); - if (Term != opBlock.end() && Term->readsRegister(SrcReg)) { - KillInst = Term; - - // Check that no other terminators use values. -#ifndef NDEBUG - for (MachineBasicBlock::iterator TI = llvm::next(Term); - TI != opBlock.end(); ++TI) { - if (TI->isDebugValue()) - continue; - assert(!TI->readsRegister(SrcReg) && - "Terminator instructions cannot use virtual registers unless" - "they are the first terminator in a block!"); - } -#endif - } else if (reusedIncoming || !IncomingReg) { - // We may have to rewind a bit if we didn't insert a copy this time. - KillInst = Term; - while (KillInst != opBlock.begin()) { - --KillInst; - if (KillInst->isDebugValue()) - continue; - if (KillInst->readsRegister(SrcReg)) - break; + // register. In most cases this is the copy, however, terminator + // instructions at the end of the block may also use the value. In this + // case, we should mark the last such terminator as being the killing + // block, not the copy. + MachineBasicBlock::iterator KillInst = opBlock.end(); + MachineBasicBlock::iterator FirstTerm = opBlock.getFirstTerminator(); + for (MachineBasicBlock::iterator Term = FirstTerm; + Term != opBlock.end(); ++Term) { + if (Term->readsRegister(SrcReg)) + KillInst = Term; + } + + if (KillInst == opBlock.end()) { + // No terminator uses the register. + + if (reusedIncoming || !IncomingReg) { + // We may have to rewind a bit if we didn't insert a copy this time. + KillInst = FirstTerm; + while (KillInst != opBlock.begin()) { + --KillInst; + if (KillInst->isDebugValue()) + continue; + if (KillInst->readsRegister(SrcReg)) + break; + } + } else { + // We just inserted this copy. + KillInst = prior(InsertPos); } - } else { - // We just inserted this copy. 
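[Editorial sketch] The rewritten kill search above scans the whole terminator range and keeps the *last* terminator that reads the source register, where the old code only examined the first terminator. A standalone model of that scan, with an invented Inst type rather than MachineBasicBlock iterators:

    #include <iostream>
    #include <vector>

    struct Inst {
      const char *Name;
      bool ReadsSrcReg; // whether this instruction reads the register
    };

    // Return the index of the last terminator reading the register, or -1.
    // Mirrors the loop that walks [FirstTerm, end()) and keeps updating
    // KillInst on every reader it finds.
    static int findKillTerminator(const std::vector<Inst> &Terms) {
      int Kill = -1;
      for (int i = 0, e = (int)Terms.size(); i != e; ++i)
        if (Terms[i].ReadsSrcReg)
          Kill = i; // keep the *last* reader
      return Kill;
    }

    int main() {
      std::vector<Inst> Terms = {{"brcond", true}, {"br", true}};
      std::cout << "kill terminator index: " << findKillTerminator(Terms)
                << '\n'; // prints 1: the final branch is the last reader
    }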
- KillInst = prior(InsertPos); } assert(KillInst->readsRegister(SrcReg) && "Cannot find kill instruction"); @@ -412,28 +423,71 @@ bool PHIElimination::SplitPHIEdges(MachineFunction &MF, if (MBB.empty() || !MBB.front().isPHI() || MBB.isLandingPad()) return false; // Quick exit for basic blocks without PHIs. + const MachineLoop *CurLoop = MLI ? MLI->getLoopFor(&MBB) : 0; + bool IsLoopHeader = CurLoop && &MBB == CurLoop->getHeader(); + bool Changed = false; for (MachineBasicBlock::iterator BBI = MBB.begin(), BBE = MBB.end(); BBI != BBE && BBI->isPHI(); ++BBI) { for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2) { unsigned Reg = BBI->getOperand(i).getReg(); MachineBasicBlock *PreMBB = BBI->getOperand(i+1).getMBB(); - // We break edges when registers are live out from the predecessor block - // (not considering PHI nodes). If the register is live in to this block - // anyway, we would gain nothing from splitting. + // Is there a critical edge from PreMBB to MBB? + if (PreMBB->succ_size() == 1) + continue; + // Avoid splitting backedges of loops. It would introduce small // out-of-line blocks into the loop which is very bad for code placement. - if (PreMBB != &MBB && - !LV.isLiveIn(Reg, MBB) && LV.isLiveOut(Reg, *PreMBB)) { - if (!MLI || - !(MLI->getLoopFor(PreMBB) == MLI->getLoopFor(&MBB) && - MLI->isLoopHeader(&MBB))) { - if (PreMBB->SplitCriticalEdge(&MBB, this)) { - Changed = true; - ++NumCriticalEdgesSplit; - } - } + if (PreMBB == &MBB) + continue; + const MachineLoop *PreLoop = MLI ? MLI->getLoopFor(PreMBB) : 0; + if (IsLoopHeader && PreLoop == CurLoop) + continue; + + // LV doesn't consider a phi use live-out, so isLiveOut only returns true + // when the source register is live-out for some other reason than a phi + // use. That means the copy we will insert in PreMBB won't be a kill, and + // there is a risk it may not be coalesced away. + // + // If the copy would be a kill, there is no need to split the edge. + if (!LV.isLiveOut(Reg, *PreMBB)) + continue; + + DEBUG(dbgs() << PrintReg(Reg) << " live-out before critical edge BB#" + << PreMBB->getNumber() << " -> BB#" << MBB.getNumber() + << ": " << *BBI); + + // If Reg is not live-in to MBB, it means it must be live-in to some + // other PreMBB successor, and we can avoid the interference by splitting + // the edge. + // + // If Reg *is* live-in to MBB, the interference is inevitable and a copy + // is likely to be left after coalescing. If we are looking at a loop + // exiting edge, split it so we won't insert code in the loop, otherwise + // don't bother. + bool ShouldSplit = !LV.isLiveIn(Reg, MBB); + + // Check for a loop exiting edge. + if (!ShouldSplit && CurLoop != PreLoop) { + DEBUG({ + dbgs() << "Split wouldn't help, maybe avoid loop copies?\n"; + if (PreLoop) dbgs() << "PreLoop: " << *PreLoop; + if (CurLoop) dbgs() << "CurLoop: " << *CurLoop; + }); + // This edge could be entering a loop, exiting a loop, or it could be + // both: Jumping directly from one loop to the header of a sibling + // loop. + // Split unless this edge is entering CurLoop from an outer loop.
+ ShouldSplit = PreLoop && !PreLoop->contains(CurLoop); + } + if (!ShouldSplit) + continue; + if (!PreMBB->SplitCriticalEdge(&MBB, this)) { + DEBUG(dbgs() << "Failed to split critical edge.\n"); + continue; } + Changed = true; + ++NumCriticalEdgesSplit; } } return Changed; diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp index 13d1bbc..69d6d00 100644 --- a/lib/CodeGen/Passes.cpp +++ b/lib/CodeGen/Passes.cpp @@ -22,6 +22,7 @@ #include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/Assembly/PrintModulePass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -48,6 +49,8 @@ static cl::opt<bool> DisableSSC("disable-ssc", cl::Hidden, cl::desc("Disable Stack Slot Coloring")); static cl::opt<bool> DisableMachineDCE("disable-machine-dce", cl::Hidden, cl::desc("Disable Machine Dead Code Elimination")); +static cl::opt<bool> EnableEarlyIfConversion("enable-early-ifcvt", cl::Hidden, + cl::desc("Enable Early If-conversion")); static cl::opt<bool> DisableMachineLICM("disable-machine-licm", cl::Hidden, cl::desc("Disable Machine LICM")); static cl::opt<bool> DisableMachineCSE("disable-machine-cse", cl::Hidden, @@ -80,15 +83,19 @@ static cl::opt<bool> PrintGCInfo("print-gc", cl::Hidden, static cl::opt<bool> VerifyMachineCode("verify-machineinstrs", cl::Hidden, cl::desc("Verify generated machine code"), cl::init(getenv("LLVM_VERIFY_MACHINEINSTRS")!=NULL)); +static cl::opt<std::string> +PrintMachineInstrs("print-machineinstrs", cl::ValueOptional, + cl::desc("Print machine instrs"), + cl::value_desc("pass-name"), cl::init("option-unspecified")); /// Allow standard passes to be disabled by command line options. This supports /// simple binary flags that either suppress the pass or do nothing. /// i.e. -disable-mypass=false has no effect. /// These should be converted to boolOrDefault in order to use applyOverride. -static AnalysisID applyDisable(AnalysisID ID, bool Override) { if (Override) - return &NoPassID; - return ID; + return 0; + return PassID; } /// Allow Pass selection to be overriden by command line options. This supports @@ -101,13 +108,13 @@ static AnalysisID applyOverride(AnalysisID TargetID, cl::boolOrDefault Override, case cl::BOU_UNSET: return TargetID; case cl::BOU_TRUE: - if (TargetID != &NoPassID) + if (TargetID) return TargetID; - if (StandardID == &NoPassID) + if (StandardID == 0) report_fatal_error("Target cannot enable pass"); return StandardID; case cl::BOU_FALSE: - return &NoPassID; + return 0; } llvm_unreachable("Invalid command line option state"); } @@ -149,6 +156,9 @@ static AnalysisID overridePass(AnalysisID StandardID, AnalysisID TargetID) { if (StandardID == &DeadMachineInstructionElimID) return applyDisable(TargetID, DisableMachineDCE); + if (StandardID == &EarlyIfConverterID) + return applyDisable(TargetID, !EnableEarlyIfConversion); + if (StandardID == &MachineLICMID) return applyDisable(TargetID, DisableMachineLICM); @@ -178,9 +188,6 @@ INITIALIZE_PASS(TargetPassConfig, "targetpassconfig", "Target Pass Configuration", false, false) char TargetPassConfig::ID = 0; -static char NoPassIDAnchor = 0; -char &llvm::NoPassID = NoPassIDAnchor; - // Pseudo Pass IDs.
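[Editorial sketch] With the NoPassID sentinel removed above, "no pass" is represented by a null AnalysisID throughout the disable/override logic. A standalone model of that convention follows; the types and the anchor variable are invented, and the real applyOverride additionally calls report_fatal_error when a target tries to enable a pass that was never in the standard pipeline, which this sketch elides.

    #include <cassert>

    // Pass identities are opaque pointers; null now means "no pass".
    typedef const void *AnalysisID;
    enum BoolOrDefault { BOU_UNSET, BOU_TRUE, BOU_FALSE };

    static AnalysisID applyDisable(AnalysisID PassID, bool Disable) {
      return Disable ? nullptr : PassID; // disabled passes become null IDs
    }

    static AnalysisID applyOverride(AnalysisID TargetID, BoolOrDefault Override,
                                    AnalysisID StandardID) {
      switch (Override) {
      case BOU_UNSET: return TargetID;
      case BOU_TRUE:  return TargetID ? TargetID : StandardID;
      case BOU_FALSE: return nullptr;
      }
      return nullptr;
    }

    int main() {
      static char PassAnchor; // LLVM-style anchor: the address is the identity
      AnalysisID ID = &PassAnchor;
      assert(applyDisable(ID, true) == nullptr);
      assert(applyOverride(nullptr, BOU_TRUE, ID) == ID); // re-enable standard
      assert(applyOverride(ID, BOU_FALSE, ID) == nullptr);
    }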
char TargetPassConfig::EarlyTailDuplicateID = 0; char TargetPassConfig::PostRAMachineLICMID = 0; @@ -193,9 +200,13 @@ public: // that are part of a standard pass pipeline without overridding the entire // pipeline. This mechanism allows target options to inherit a standard pass's // user interface. For example, a target may disable a standard pass by - // default by substituting NoPass, and the user may still enable that standard - // pass with an explicit command line option. + // default by substituting a pass ID of zero, and the user may still enable + // that standard pass with an explicit command line option. DenseMap<AnalysisID,AnalysisID> TargetPasses; + + /// Store the pairs of <AnalysisID, AnalysisID> of which the second pass + /// is inserted after each instance of the first one. + SmallVector<std::pair<AnalysisID, AnalysisID>, 4> InsertedPasses; }; } // namespace llvm @@ -207,7 +218,8 @@ TargetPassConfig::~TargetPassConfig() { // Out of line constructor provides default values for pass options and // registers all common codegen passes. TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm) - : ImmutablePass(ID), TM(tm), PM(pm), Impl(0), Initialized(false), + : ImmutablePass(ID), PM(&pm), StartAfter(0), StopAfter(0), + Started(true), Stopped(false), TM(tm), Impl(0), Initialized(false), DisableVerify(false), EnableTailMerge(true) { @@ -218,11 +230,22 @@ TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm) initializeCodeGen(*PassRegistry::getPassRegistry()); // Substitute Pseudo Pass IDs for real ones. - substitutePass(EarlyTailDuplicateID, TailDuplicateID); - substitutePass(PostRAMachineLICMID, MachineLICMID); + substitutePass(&EarlyTailDuplicateID, &TailDuplicateID); + substitutePass(&PostRAMachineLICMID, &MachineLICMID); + + // Disable early if-conversion. Targets that are ready can enable it. + disablePass(&EarlyIfConverterID); // Temporarily disable experimental passes. - substitutePass(MachineSchedulerID, NoPassID); + substitutePass(&MachineSchedulerID, 0); +} + +/// Insert InsertedPassID pass after TargetPassID. +void TargetPassConfig::insertPass(AnalysisID TargetPassID, + AnalysisID InsertedPassID) { + assert(TargetPassID != InsertedPassID && "Insert a pass after itself!"); + std::pair<AnalysisID, AnalysisID> P(TargetPassID, InsertedPassID); + Impl->InsertedPasses.push_back(P); } /// createPassConfig - Create a pass configuration object to be used by @@ -234,7 +257,7 @@ TargetPassConfig *LLVMTargetMachine::createPassConfig(PassManagerBase &PM) { } TargetPassConfig::TargetPassConfig() - : ImmutablePass(ID), PM(*(PassManagerBase*)0) { + : ImmutablePass(ID), PM(0) { llvm_unreachable("TargetPassConfig should not be constructed on-the-fly"); } @@ -244,8 +267,9 @@ void TargetPassConfig::setOpt(bool &Opt, bool Val) { Opt = Val; } -void TargetPassConfig::substitutePass(char &StandardID, char &TargetID) { - Impl->TargetPasses[&StandardID] = &TargetID; +void TargetPassConfig::substitutePass(AnalysisID StandardID, + AnalysisID TargetID) { + Impl->TargetPasses[StandardID] = TargetID; } AnalysisID TargetPassConfig::getPassSubstitution(AnalysisID ID) const { @@ -256,29 +280,62 @@ AnalysisID TargetPassConfig::getPassSubstitution(AnalysisID ID) const { return I->second; } -/// Add a CodeGen pass at this point in the pipeline after checking for target -/// and command line overrides. -AnalysisID TargetPassConfig::addPass(char &ID) { +/// Add a pass to the PassManager if that pass is supposed to be run. 
If the +/// Started/Stopped flags indicate either that the compilation should start at +/// a later pass or that it should stop after an earlier pass, then do not add +/// the pass. Finally, compare the current pass against the StartAfter +/// and StopAfter options and change the Started/Stopped flags accordingly. +void TargetPassConfig::addPass(Pass *P) { assert(!Initialized && "PassConfig is immutable"); - AnalysisID TargetID = getPassSubstitution(&ID); - AnalysisID FinalID = overridePass(&ID, TargetID); - if (FinalID == &NoPassID) + // Cache the Pass ID here in case the pass manager finds this pass is + // redundant with ones already scheduled / available, and deletes it. + // Fundamentally, once we add the pass to the manager, we no longer own it + // and shouldn't reference it. + AnalysisID PassID = P->getPassID(); + + if (Started && !Stopped) + PM->add(P); + if (StopAfter == PassID) + Stopped = true; + if (StartAfter == PassID) + Started = true; + if (Stopped && !Started) + report_fatal_error("Cannot stop compilation after pass that is not run"); +} + +/// Add a CodeGen pass at this point in the pipeline after checking for target +/// and command line overrides. +AnalysisID TargetPassConfig::addPass(AnalysisID PassID) { + AnalysisID TargetID = getPassSubstitution(PassID); + AnalysisID FinalID = overridePass(PassID, TargetID); + if (FinalID == 0) return FinalID; Pass *P = Pass::createPass(FinalID); if (!P) llvm_unreachable("Pass ID not registered"); - PM.add(P); + addPass(P); + // Add the passes after the pass P if there is any. + for (SmallVector<std::pair<AnalysisID, AnalysisID>, 4>::iterator + I = Impl->InsertedPasses.begin(), E = Impl->InsertedPasses.end(); + I != E; ++I) { + if ((*I).first == PassID) { + assert((*I).second && "Illegal Pass ID!"); + Pass *NP = Pass::createPass((*I).second); + assert(NP && "Pass ID not registered"); + addPass(NP); + } + } return FinalID; } -void TargetPassConfig::printAndVerify(const char *Banner) const { +void TargetPassConfig::printAndVerify(const char *Banner) { if (TM->shouldPrintMachineCode()) - PM.add(createMachineFunctionPrinterPass(dbgs(), Banner)); + addPass(createMachineFunctionPrinterPass(dbgs(), Banner)); if (VerifyMachineCode) - PM.add(createMachineVerifierPass(Banner)); + addPass(createMachineVerifierPass(Banner)); } /// Add common target configurable passes that perform LLVM IR to IR transforms @@ -288,46 +345,73 @@ void TargetPassConfig::addIRPasses() { // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that // BasicAliasAnalysis wins if they disagree. This is intended to help // support "obvious" type-punning idioms. - PM.add(createTypeBasedAliasAnalysisPass()); - PM.add(createBasicAliasAnalysisPass()); + addPass(createTypeBasedAliasAnalysisPass()); + addPass(createBasicAliasAnalysisPass()); // Before running any passes, run the verifier to determine if the input // coming from the front-end and/or optimizer is valid. if (!DisableVerify) - PM.add(createVerifierPass()); + addPass(createVerifierPass()); // Run loop strength reduction before anything else. 
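[Editorial sketch] The Started/Stopped gating in the new addPass() overload above defines a window of the pipeline that actually runs: passes are skipped until StartAfter has been seen, and skipped again once StopAfter has run. A self-contained C++ model of that window, with string IDs standing in for AnalysisIDs (an invented simplification; the real code also diagnoses a stop that precedes the start):

    #include <iostream>
    #include <string>
    #include <vector>

    struct PipelineGate {
      std::string StartAfter, StopAfter;
      bool Started, Stopped;
      std::vector<std::string> Scheduled;

      // With no StartAfter given, the pipeline starts immediately.
      PipelineGate(std::string Start, std::string Stop)
          : StartAfter(Start), StopAfter(Stop),
            Started(Start.empty()), Stopped(false) {}

      void addPass(const std::string &ID) {
        if (Started && !Stopped)
          Scheduled.push_back(ID); // only passes inside the window run
        if (ID == StopAfter)
          Stopped = true;          // StopAfter itself still ran
        if (ID == StartAfter)
          Started = true;          // everything after this point runs
      }
    };

    int main() {
      PipelineGate G("isel", "regalloc");
      for (const char *P : {"verify", "isel", "coalesce", "regalloc", "emit"})
        G.addPass(P);
      for (const std::string &P : G.Scheduled)
        std::cout << P << '\n'; // coalesce, regalloc
    }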
if (getOptLevel() != CodeGenOpt::None && !DisableLSR) { - PM.add(createLoopStrengthReducePass(getTargetLowering())); + addPass(createLoopStrengthReducePass(getTargetLowering())); if (PrintLSR) - PM.add(createPrintFunctionPass("\n\n*** Code after LSR ***\n", &dbgs())); + addPass(createPrintFunctionPass("\n\n*** Code after LSR ***\n", &dbgs())); } - PM.add(createGCLoweringPass()); + addPass(createGCLoweringPass()); // Make sure that no unreachable blocks are instruction selected. - PM.add(createUnreachableBlockEliminationPass()); + addPass(createUnreachableBlockEliminationPass()); +} + +/// Turn exception handling constructs into something the code generators can +/// handle. +void TargetPassConfig::addPassesToHandleExceptions() { + switch (TM->getMCAsmInfo()->getExceptionHandlingType()) { + case ExceptionHandling::SjLj: + // SjLj piggy-backs on dwarf for this bit. The cleanups done apply to both + // Dwarf EH prepare needs to be run after SjLj prepare. Otherwise, + // catch info can get misplaced when a selector ends up more than one block + // removed from the parent invoke(s). This could happen when a landing + // pad is shared by multiple invokes and is also a target of a normal + // edge from elsewhere. + addPass(createSjLjEHPreparePass(TM->getTargetLowering())); + // FALLTHROUGH + case ExceptionHandling::DwarfCFI: + case ExceptionHandling::ARM: + case ExceptionHandling::Win64: + addPass(createDwarfEHPass(TM)); + break; + case ExceptionHandling::None: + addPass(createLowerInvokePass(TM->getTargetLowering())); + + // The lower invoke pass may create unreachable code. Remove it. + addPass(createUnreachableBlockEliminationPass()); + break; + } } /// Add common passes that perform LLVM IR to IR transforms in preparation for /// instruction selection. void TargetPassConfig::addISelPrepare() { if (getOptLevel() != CodeGenOpt::None && !DisableCGP) - PM.add(createCodeGenPreparePass(getTargetLowering())); + addPass(createCodeGenPreparePass(getTargetLowering())); - PM.add(createStackProtectorPass(getTargetLowering())); + addPass(createStackProtectorPass(getTargetLowering())); addPreISel(); if (PrintISelInput) - PM.add(createPrintFunctionPass("\n\n" - "*** Final LLVM Code input to ISel ***\n", - &dbgs())); + addPass(createPrintFunctionPass("\n\n" + "*** Final LLVM Code input to ISel ***\n", + &dbgs())); // All passes which modify the LLVM IR are now complete; run the verifier // to ensure that the IR is valid. if (!DisableVerify) - PM.add(createVerifierPass()); + addPass(createVerifierPass()); } /// Add the complete set of target-independent postISel code generator passes. @@ -349,11 +433,26 @@ void TargetPassConfig::addISelPrepare() { /// TODO: We could use a single addPre/Post(ID) hook to allow pass injection /// before/after any target-independent pass. But it's currently overkill. void TargetPassConfig::addMachinePasses() { + // Insert a machine instr printer pass after the specified pass. + // If -print-machineinstrs specified, print machineinstrs after all passes. 
+ if (StringRef(PrintMachineInstrs.getValue()).equals("")) + TM->Options.PrintMachineCode = true; + else if (!StringRef(PrintMachineInstrs.getValue()) + .equals("option-unspecified")) { + const PassRegistry *PR = PassRegistry::getPassRegistry(); + const PassInfo *TPI = PR->getPassInfo(PrintMachineInstrs.getValue()); + const PassInfo *IPI = PR->getPassInfo(StringRef("print-machineinstrs")); + assert (TPI && IPI && "Pass ID not registered!"); + const char *TID = (char *)(TPI->getTypeInfo()); + const char *IID = (char *)(IPI->getTypeInfo()); + insertPass(TID, IID); + } + // Print the instruction selected machine code... printAndVerify("After Instruction Selection"); // Expand pseudo-instructions emitted by ISel. - addPass(ExpandISelPseudosID); + addPass(&ExpandISelPseudosID); // Add passes that optimize machine instructions in SSA form. if (getOptLevel() != CodeGenOpt::None) { @@ -362,7 +461,7 @@ void TargetPassConfig::addMachinePasses() { else { // If the target requests it, assign local variables to stack slots relative // to one another and simplify frame index references where possible. - addPass(LocalStackSlotAllocationID); + addPass(&LocalStackSlotAllocationID); } // Run pre-ra passes. @@ -381,7 +480,7 @@ void TargetPassConfig::addMachinePasses() { printAndVerify("After PostRegAlloc passes"); // Insert prolog/epilog code. Eliminate abstract frame index references... - addPass(PrologEpilogCodeInserterID); + addPass(&PrologEpilogCodeInserterID); printAndVerify("After PrologEpilogCodeInserter"); /// Add passes that optimize machine instructions after register allocation. @@ -389,7 +488,7 @@ void TargetPassConfig::addMachinePasses() { addMachineLateOptimization(); // Expand pseudo instructions before second scheduling pass. - addPass(ExpandPostRAPseudosID); + addPass(&ExpandPostRAPseudosID); printAndVerify("After ExpandPostRAPseudos"); // Run pre-sched2 passes. @@ -398,14 +497,14 @@ void TargetPassConfig::addMachinePasses() { // Second pass scheduler. if (getOptLevel() != CodeGenOpt::None) { - addPass(PostRASchedulerID); + addPass(&PostRASchedulerID); printAndVerify("After PostRAScheduler"); } // GC - addPass(GCMachineCodeAnalysisID); + addPass(&GCMachineCodeAnalysisID); if (PrintGCInfo) - PM.add(createGCInfoPrinter(dbgs())); + addPass(createGCInfoPrinter(dbgs())); // Basic block placement. if (getOptLevel() != CodeGenOpt::None) @@ -418,30 +517,31 @@ void TargetPassConfig::addMachinePasses() { /// Add passes that optimize machine instructions in SSA form. void TargetPassConfig::addMachineSSAOptimization() { // Pre-ra tail duplication. - if (addPass(EarlyTailDuplicateID) != &NoPassID) + if (addPass(&EarlyTailDuplicateID)) printAndVerify("After Pre-RegAlloc TailDuplicate"); // Optimize PHIs before DCE: removing dead PHI cycles may make more // instructions dead. - addPass(OptimizePHIsID); + addPass(&OptimizePHIsID); // If the target requests it, assign local variables to stack slots relative // to one another and simplify frame index references where possible. - addPass(LocalStackSlotAllocationID); + addPass(&LocalStackSlotAllocationID); // With optimization, dead code should already be eliminated. However // there is one known exception: lowered code for arguments that are only // used by tail calls, where the tail calls reuse the incoming stack // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll). 
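[Editorial sketch] The -print-machineinstrs plumbing above reduces to the InsertedPasses pair list: whenever the target pass is scheduled, every pass paired with it is scheduled immediately afterwards. A standalone model with strings standing in for AnalysisIDs (invented; the real code resolves names through the PassRegistry first):

    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    typedef std::string AnalysisID;

    static std::vector<std::pair<AnalysisID, AnalysisID>> InsertedPasses;

    static void insertPass(AnalysisID Target, AnalysisID Inserted) {
      InsertedPasses.push_back({Target, Inserted});
    }

    static void addPass(const AnalysisID &ID, std::vector<AnalysisID> &PM) {
      PM.push_back(ID);
      for (auto &P : InsertedPasses)
        if (P.first == ID)
          PM.push_back(P.second); // e.g. the machineinstr printer
    }

    int main() {
      insertPass("phi-node-elimination", "print-machineinstrs");
      std::vector<AnalysisID> PM;
      for (const char *P : {"isel", "phi-node-elimination", "regalloc"})
        addPass(P, PM);
      for (auto &P : PM)
        std::cout << P << '\n'; // printer lands right after PHI elimination
    }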
- addPass(DeadMachineInstructionElimID); + addPass(&DeadMachineInstructionElimID); printAndVerify("After codegen DCE pass"); - addPass(MachineLICMID); - addPass(MachineCSEID); - addPass(MachineSinkingID); + addPass(&EarlyIfConverterID); + addPass(&MachineLICMID); + addPass(&MachineCSEID); + addPass(&MachineSinkingID); printAndVerify("After Machine LICM, CSE and Sinking passes"); - addPass(PeepholeOptimizerID); + addPass(&PeepholeOptimizerID); printAndVerify("After codegen peephole optimization pass"); } @@ -519,10 +619,10 @@ FunctionPass *TargetPassConfig::createRegAllocPass(bool Optimized) { /// Add the minimum set of target-independent passes that are required for /// register allocation. No coalescing or scheduling. void TargetPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { - addPass(PHIEliminationID); - addPass(TwoAddressInstructionPassID); + addPass(&PHIEliminationID); + addPass(&TwoAddressInstructionPassID); - PM.add(RegAllocPass); + addPass(RegAllocPass); printAndVerify("After Register Allocation"); } @@ -530,42 +630,46 @@ void TargetPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { /// optimized register allocation, including coalescing, machine instruction /// scheduling, and register allocation itself. void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { + addPass(&ProcessImplicitDefsID); + // LiveVariables currently requires pure SSA form. // // FIXME: Once TwoAddressInstruction pass no longer uses kill flags, // LiveVariables can be removed completely, and LiveIntervals can be directly // computed. (We still either need to regenerate kill flags after regalloc, or // preferably fix the scavenger to not depend on them). - addPass(LiveVariablesID); + addPass(&LiveVariablesID); // Add passes that move from transformed SSA into conventional SSA. This is a // "copy coalescing" problem. // if (!EnableStrongPHIElim) { // Edge splitting is smarter with machine loop info. - addPass(MachineLoopInfoID); - addPass(PHIEliminationID); + addPass(&MachineLoopInfoID); + addPass(&PHIEliminationID); } - addPass(TwoAddressInstructionPassID); - - // FIXME: Either remove this pass completely, or fix it so that it works on - // SSA form. We could modify LiveIntervals to be independent of this pass, But - // it would be even better to simply eliminate *all* IMPLICIT_DEFs before - // leaving SSA. - addPass(ProcessImplicitDefsID); + addPass(&TwoAddressInstructionPassID); if (EnableStrongPHIElim) - addPass(StrongPHIEliminationID); + addPass(&StrongPHIEliminationID); - addPass(RegisterCoalescerID); + addPass(&RegisterCoalescerID); // PreRA instruction scheduling. - if (addPass(MachineSchedulerID) != &NoPassID) + if (addPass(&MachineSchedulerID)) printAndVerify("After Machine Scheduling"); // Add the selected register allocation pass. - PM.add(RegAllocPass); - printAndVerify("After Register Allocation"); + addPass(RegAllocPass); + printAndVerify("After Register Allocation, before rewriter"); + + // Allow targets to change the register assignments before rewriting. + if (addPreRewrite()) + printAndVerify("After pre-rewrite passes"); + + // Finally rewrite virtual registers. 
+ addPass(&VirtRegRewriterID); + printAndVerify("After Virtual Register Rewriter"); // FinalizeRegAlloc is convenient until MachineInstrBundles is more mature, // but eventually, all users of it should probably be moved to addPostRA and @@ -579,12 +683,12 @@ void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { // // FIXME: Re-enable coloring with register when it's capable of adding // kill markers. - addPass(StackSlotColoringID); + addPass(&StackSlotColoringID); // Run post-ra machine LICM to hoist reloads / remats. // // FIXME: can this move into MachineLateOptimization? - addPass(PostRAMachineLICMID); + addPass(&PostRAMachineLICMID); printAndVerify("After StackSlotColoring and postra Machine LICM"); } @@ -596,33 +700,33 @@ void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { /// Add passes that optimize machine instructions after register allocation. void TargetPassConfig::addMachineLateOptimization() { // Branch folding must be run after regalloc and prolog/epilog insertion. - if (addPass(BranchFolderPassID) != &NoPassID) + if (addPass(&BranchFolderPassID)) printAndVerify("After BranchFolding"); // Tail duplication. - if (addPass(TailDuplicateID) != &NoPassID) + if (addPass(&TailDuplicateID)) printAndVerify("After TailDuplicate"); // Copy propagation. - if (addPass(MachineCopyPropagationID) != &NoPassID) + if (addPass(&MachineCopyPropagationID)) printAndVerify("After copy propagation pass"); } /// Add standard basic block placement passes. void TargetPassConfig::addBlockPlacement() { - AnalysisID ID = &NoPassID; + AnalysisID PassID = 0; if (!DisableBlockPlacement) { // MachineBlockPlacement is a new pass which subsumes the functionality of // CodPlacementOpt. The old code placement pass can be restored by // disabling block placement, but eventually it will be removed. - ID = addPass(MachineBlockPlacementID); + PassID = addPass(&MachineBlockPlacementID); } else { - ID = addPass(CodePlacementOptID); + PassID = addPass(&CodePlacementOptID); } - if (ID != &NoPassID) { + if (PassID) { // Run a separate pass to collect block placement statistics. if (EnableBlockPlacementStats) - addPass(MachineBlockPlacementStatsID); + addPass(&MachineBlockPlacementStatsID); printAndVerify("After machine block placement."); } diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp index 9c5c029..91c33c4 100644 --- a/lib/CodeGen/PeepholeOptimizer.cpp +++ b/lib/CodeGen/PeepholeOptimizer.cpp @@ -31,6 +31,15 @@ // same flag that the "cmp" instruction sets and that "bz" uses, then we can // eliminate the "cmp" instruction. // +// Another instance, in this code: +// +// sub r1, r3 | sub r1, imm +// cmp r3, r1 or cmp r1, r3 | cmp r1, imm +// bge L1 +// +// If the branch instruction can use flag from "sub", then we can replace +// "sub" with "subs" and eliminate the "cmp" instruction. 
+// // - Optimize Bitcast pairs: // // v1 = bitcast v0 @@ -95,14 +104,14 @@ namespace { } private: - bool OptimizeBitcastInstr(MachineInstr *MI, MachineBasicBlock *MBB); - bool OptimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB); - bool OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, + bool optimizeBitcastInstr(MachineInstr *MI, MachineBasicBlock *MBB); + bool optimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB); + bool optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, SmallPtrSet<MachineInstr*, 8> &LocalMIs); bool isMoveImmediate(MachineInstr *MI, SmallSet<unsigned, 4> &ImmDefRegs, DenseMap<unsigned, MachineInstr*> &ImmDefMIs); - bool FoldImmediate(MachineInstr *MI, MachineBasicBlock *MBB, + bool foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB, SmallSet<unsigned, 4> &ImmDefRegs, DenseMap<unsigned, MachineInstr*> &ImmDefMIs); }; @@ -116,7 +125,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_END(PeepholeOptimizer, "peephole-opts", "Peephole Optimizations", false, false) -/// OptimizeExtInstr - If instruction is a copy-like instruction, i.e. it reads +/// optimizeExtInstr - If instruction is a copy-like instruction, i.e. it reads /// a single register and writes a single register and it does not modify the /// source, and if the source value is preserved as a sub-register of the /// result, then replace all reachable uses of the source with the subreg of the @@ -126,7 +135,7 @@ INITIALIZE_PASS_END(PeepholeOptimizer, "peephole-opts", /// the code. Since this code does not currently share EXTRACTs, just ignore all /// debug uses. bool PeepholeOptimizer:: -OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, +optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, SmallPtrSet<MachineInstr*, 8> &LocalMIs) { unsigned SrcReg, DstReg, SubIdx; if (!TII->isCoalescableExtInstr(*MI, SrcReg, DstReg, SubIdx)) @@ -136,16 +145,30 @@ OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, TargetRegisterInfo::isPhysicalRegister(SrcReg)) return false; - MachineRegisterInfo::use_nodbg_iterator UI = MRI->use_nodbg_begin(SrcReg); - if (++UI == MRI->use_nodbg_end()) + if (MRI->hasOneNonDBGUse(SrcReg)) // No other uses. return false; + // Ensure DstReg can get a register class that actually supports + // sub-registers. Don't change the class until we commit. + const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg); + DstRC = TM->getRegisterInfo()->getSubClassWithSubReg(DstRC, SubIdx); + if (!DstRC) + return false; + + // The ext instr may be operating on a sub-register of SrcReg as well. + // PPC::EXTSW is a 32 -> 64-bit sign extension, but it reads a 64-bit + // register. + // If UseSrcSubIdx is Set, SubIdx also applies to SrcReg, and only uses of + // SrcReg:SubIdx should be replaced. + bool UseSrcSubIdx = TM->getRegisterInfo()-> + getSubClassWithSubReg(MRI->getRegClass(SrcReg), SubIdx) != 0; + // The source has other uses. See if we can replace the other uses with use of // the result of the extension. 
SmallPtrSet<MachineBasicBlock*, 4> ReachedBBs; - UI = MRI->use_nodbg_begin(DstReg); - for (MachineRegisterInfo::use_nodbg_iterator UE = MRI->use_nodbg_end(); + for (MachineRegisterInfo::use_nodbg_iterator + UI = MRI->use_nodbg_begin(DstReg), UE = MRI->use_nodbg_end(); UI != UE; ++UI) ReachedBBs.insert(UI->getParent()); @@ -156,8 +179,8 @@ OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, SmallVector<MachineOperand*, 8> ExtendedUses; bool ExtendLife = true; - UI = MRI->use_nodbg_begin(SrcReg); - for (MachineRegisterInfo::use_nodbg_iterator UE = MRI->use_nodbg_end(); + for (MachineRegisterInfo::use_nodbg_iterator + UI = MRI->use_nodbg_begin(SrcReg), UE = MRI->use_nodbg_end(); UI != UE; ++UI) { MachineOperand &UseMO = UI.getOperand(); MachineInstr *UseMI = &*UI; @@ -169,6 +192,10 @@ OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, continue; } + // Only accept uses of SrcReg:SubIdx. + if (UseSrcSubIdx && UseMO.getSubReg() != SubIdx) + continue; + // It's an error to translate this: // // %reg1025 = <sext> %reg1024 @@ -223,9 +250,9 @@ OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, // Look for PHI uses of the extended result, we don't want to extend the // liveness of a PHI input. It breaks all kinds of assumptions down // stream. A PHI use is expected to be the kill of its source values. - UI = MRI->use_nodbg_begin(DstReg); for (MachineRegisterInfo::use_nodbg_iterator - UE = MRI->use_nodbg_end(); UI != UE; ++UI) + UI = MRI->use_nodbg_begin(DstReg), UE = MRI->use_nodbg_end(); + UI != UE; ++UI) if (UI->isPHI()) PHIBBs.insert(UI->getParent()); @@ -238,14 +265,20 @@ OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, continue; // About to add uses of DstReg, clear DstReg's kill flags. - if (!Changed) + if (!Changed) { MRI->clearKillFlags(DstReg); + MRI->constrainRegClass(DstReg, DstRC); + } unsigned NewVR = MRI->createVirtualRegister(RC); - BuildMI(*UseMBB, UseMI, UseMI->getDebugLoc(), - TII->get(TargetOpcode::COPY), NewVR) + MachineInstr *Copy = BuildMI(*UseMBB, UseMI, UseMI->getDebugLoc(), + TII->get(TargetOpcode::COPY), NewVR) .addReg(DstReg, 0, SubIdx); - + // SubIdx applies to both SrcReg and DstReg when UseSrcSubIdx is set. + if (UseSrcSubIdx) { + Copy->getOperand(0).setSubReg(SubIdx); + Copy->getOperand(0).setIsUndef(); + } UseMO->setReg(NewVR); ++NumReuse; Changed = true; @@ -255,7 +288,7 @@ OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, return Changed; } -/// OptimizeBitcastInstr - If the instruction is a bitcast instruction A that +/// optimizeBitcastInstr - If the instruction is a bitcast instruction A that /// cannot be optimized away during isel (e.g. ARM::VMOVSR, which bitcast /// a value cross register classes), and the source is defined by another /// bitcast instruction B. And if the register class of source of B matches @@ -265,7 +298,7 @@ OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, /// %vreg3<def> = VMOVRS %vreg0 /// Replace all uses of vreg3 with vreg1. 
-bool PeepholeOptimizer::OptimizeBitcastInstr(MachineInstr *MI, +bool PeepholeOptimizer::optimizeBitcastInstr(MachineInstr *MI, MachineBasicBlock *MBB) { unsigned NumDefs = MI->getDesc().getNumDefs(); unsigned NumSrcs = MI->getDesc().getNumOperands() - NumDefs; @@ -327,22 +360,23 @@ bool PeepholeOptimizer::OptimizeBitcastInstr(MachineInstr *MI, return true; } -/// OptimizeCmpInstr - If the instruction is a compare and the previous +/// optimizeCmpInstr - If the instruction is a compare and the previous /// instruction it's comparing against all ready sets (or could be modified to /// set) the same flag as the compare, then we can remove the comparison and use /// the flag from the previous instruction. -bool PeepholeOptimizer::OptimizeCmpInstr(MachineInstr *MI, +bool PeepholeOptimizer::optimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB) { // If this instruction is a comparison against zero and isn't comparing a // physical register, we can try to optimize it. - unsigned SrcReg; + unsigned SrcReg, SrcReg2; int CmpMask, CmpValue; - if (!TII->AnalyzeCompare(MI, SrcReg, CmpMask, CmpValue) || - TargetRegisterInfo::isPhysicalRegister(SrcReg)) + if (!TII->analyzeCompare(MI, SrcReg, SrcReg2, CmpMask, CmpValue) || + TargetRegisterInfo::isPhysicalRegister(SrcReg) || + (SrcReg2 != 0 && TargetRegisterInfo::isPhysicalRegister(SrcReg2))) return false; // Attempt to optimize the comparison instruction. - if (TII->OptimizeCompareInstr(MI, SrcReg, CmpMask, CmpValue, MRI)) { + if (TII->optimizeCompareInstr(MI, SrcReg, SrcReg2, CmpMask, CmpValue, MRI)) { ++NumCmps; return true; } @@ -368,10 +402,10 @@ bool PeepholeOptimizer::isMoveImmediate(MachineInstr *MI, return false; } -/// FoldImmediate - Try folding register operands that are defined by move +/// foldImmediate - Try folding register operands that are defined by move /// immediate instructions, i.e. a trivial constant folding optimization, if /// and only if the def and use are in the same BB. -bool PeepholeOptimizer::FoldImmediate(MachineInstr *MI, MachineBasicBlock *MBB, +bool PeepholeOptimizer::foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB, SmallSet<unsigned, 4> &ImmDefRegs, DenseMap<unsigned, MachineInstr*> &ImmDefMIs) { for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) { @@ -430,7 +464,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { } if (MI->isBitcast()) { - if (OptimizeBitcastInstr(MI, MBB)) { + if (optimizeBitcastInstr(MI, MBB)) { // MI is deleted. LocalMIs.erase(MI); Changed = true; @@ -438,7 +472,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { continue; } } else if (MI->isCompare()) { - if (OptimizeCmpInstr(MI, MBB)) { + if (optimizeCmpInstr(MI, MBB)) { // MI is deleted. 
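[Editorial sketch] The widened guard above reflects analyzeCompare now reporting two source registers, so register-register compares can be optimized too; the peephole still bails out when either source is physical. A standalone model of just that guard, using an invented register-numbering convention (high bit marks virtual registers), not LLVM's TargetRegisterInfo:

    #include <cassert>

    static const unsigned FirstVirtualReg = 1u << 31; // invented convention

    static bool isPhysicalRegister(unsigned Reg) {
      return Reg != 0 && Reg < FirstVirtualReg;
    }

    // True when the compare may be handed to the optimization: neither the
    // first source nor a present second source is a physical register.
    static bool compareIsCandidate(unsigned SrcReg, unsigned SrcReg2) {
      if (isPhysicalRegister(SrcReg))
        return false;
      if (SrcReg2 != 0 && isPhysicalRegister(SrcReg2))
        return false; // reg-reg compare against a physreg: leave it alone
      return true;
    }

    int main() {
      unsigned V0 = FirstVirtualReg + 0, V1 = FirstVirtualReg + 1;
      assert(compareIsCandidate(V0, 0));  // cmp vreg, imm
      assert(compareIsCandidate(V0, V1)); // cmp vreg, vreg
      assert(!compareIsCandidate(V0, 5)); // second operand is physical
    }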
LocalMIs.erase(MI); Changed = true; @@ -450,9 +484,9 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { if (isMoveImmediate(MI, ImmDefRegs, ImmDefMIs)) { SeenMoveImm = true; } else { - Changed |= OptimizeExtInstr(MI, MBB, LocalMIs); + Changed |= optimizeExtInstr(MI, MBB, LocalMIs); if (SeenMoveImm) - Changed |= FoldImmediate(MI, MBB, ImmDefRegs, ImmDefMIs); + Changed |= foldImmediate(MI, MBB, ImmDefRegs, ImmDefMIs); } First = false; diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp index 24d3e5a..7449ff5 100644 --- a/lib/CodeGen/PostRASchedulerList.cpp +++ b/lib/CodeGen/PostRASchedulerList.cpp @@ -22,7 +22,6 @@ #include "AntiDepBreaker.h" #include "AggressiveAntiDepBreaker.h" #include "CriticalAntiDepBreaker.h" -#include "RegisterClassInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/LatencyPriorityQueue.h" #include "llvm/CodeGen/SchedulerRegistry.h" @@ -31,6 +30,7 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/CodeGen/ScheduleHazardRecognizer.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -78,7 +78,6 @@ AntiDepBreaker::~AntiDepBreaker() { } namespace { class PostRAScheduler : public MachineFunctionPass { - AliasAnalysis *AA; const TargetInstrInfo *TII; RegisterClassInfo RegClassInfo; @@ -206,6 +205,10 @@ SchedulePostRATDList::SchedulePostRATDList( const InstrItineraryData *InstrItins = TM.getInstrItineraryData(); HazardRec = TM.getInstrInfo()->CreateTargetPostRAHazardRecognizer(InstrItins, this); + + assert((AntiDepMode == TargetSubtargetInfo::ANTIDEP_NONE || + MRI.tracksLiveness()) && + "Live-ins must be accurate for anti-dependency breaking"); AntiDepBreak = ((AntiDepMode == TargetSubtargetInfo::ANTIDEP_ALL) ? (AntiDepBreaker *)new AggressiveAntiDepBreaker(MF, RCI, CriticalPathRCs) : @@ -423,9 +426,8 @@ void SchedulePostRATDList::StartBlockForKills(MachineBasicBlock *BB) { unsigned Reg = *I; LiveRegs.set(Reg); // Repeat, for all subregs. - for (const uint16_t *Subreg = TRI->getSubRegisters(Reg); - *Subreg; ++Subreg) - LiveRegs.set(*Subreg); + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + LiveRegs.set(*SubRegs); } } else { @@ -437,9 +439,8 @@ void SchedulePostRATDList::StartBlockForKills(MachineBasicBlock *BB) { unsigned Reg = *I; LiveRegs.set(Reg); // Repeat, for all subregs. - for (const uint16_t *Subreg = TRI->getSubRegisters(Reg); - *Subreg; ++Subreg) - LiveRegs.set(*Subreg); + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + LiveRegs.set(*SubRegs); } } } @@ -464,10 +465,9 @@ bool SchedulePostRATDList::ToggleKillFlag(MachineInstr *MI, MO.setIsKill(false); bool AllDead = true; const unsigned SuperReg = MO.getReg(); - for (const uint16_t *Subreg = TRI->getSubRegisters(SuperReg); - *Subreg; ++Subreg) { - if (LiveRegs.test(*Subreg)) { - MI->addOperand(MachineOperand::CreateReg(*Subreg, + for (MCSubRegIterator SubRegs(SuperReg, TRI); SubRegs.isValid(); ++SubRegs) { + if (LiveRegs.test(*SubRegs)) { + MI->addOperand(MachineOperand::CreateReg(*SubRegs, true /*IsDef*/, true /*IsImp*/, false /*IsKill*/, @@ -517,9 +517,8 @@ void SchedulePostRATDList::FixupKills(MachineBasicBlock *MBB) { LiveRegs.reset(Reg); // Repeat for all subregs. 
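[Editorial sketch] The PostRAScheduler hunks that follow replace null-terminated getSubRegisters() arrays with MCSubRegIterator-style loops; the shape of that iteration is shown below in a self-contained form. The iterator and the sub-register table are invented stand-ins, not the MC layer's real data:

    #include <bitset>
    #include <iostream>

    // Minimal iterator over a 0-terminated sub-register list.
    struct SubRegIterator {
      const unsigned *P;
      explicit SubRegIterator(const unsigned *List) : P(List) {}
      bool isValid() const { return *P != 0; }
      void operator++() { ++P; }
      unsigned operator*() const { return *P; }
    };

    int main() {
      std::bitset<64> LiveRegs;
      unsigned Reg = 10;
      static const unsigned SubRegsOf10[] = {11, 12, 0}; // e.g. two halves

      LiveRegs.set(Reg);
      // Repeat for all sub-registers, as in StartBlockForKills().
      for (SubRegIterator SubRegs(SubRegsOf10); SubRegs.isValid(); ++SubRegs)
        LiveRegs.set(*SubRegs);

      std::cout << LiveRegs.count() << " live registers\n"; // prints 3
    }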
- for (const uint16_t *Subreg = TRI->getSubRegisters(Reg); - *Subreg; ++Subreg) - LiveRegs.reset(*Subreg); + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + LiveRegs.reset(*SubRegs); } // Examine all used registers and set/clear kill flag. When a @@ -536,9 +535,8 @@ void SchedulePostRATDList::FixupKills(MachineBasicBlock *MBB) { if (!killedRegs.test(Reg)) { kill = true; // A register is not killed if any subregs are live... - for (const uint16_t *Subreg = TRI->getSubRegisters(Reg); - *Subreg; ++Subreg) { - if (LiveRegs.test(*Subreg)) { + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { + if (LiveRegs.test(*SubRegs)) { kill = false; break; } @@ -570,9 +568,8 @@ void SchedulePostRATDList::FixupKills(MachineBasicBlock *MBB) { LiveRegs.set(Reg); - for (const uint16_t *Subreg = TRI->getSubRegisters(Reg); - *Subreg; ++Subreg) - LiveRegs.set(*Subreg); + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + LiveRegs.set(*SubRegs); } } } diff --git a/lib/CodeGen/ProcessImplicitDefs.cpp b/lib/CodeGen/ProcessImplicitDefs.cpp index 1ad3479..34d075c 100644 --- a/lib/CodeGen/ProcessImplicitDefs.cpp +++ b/lib/CodeGen/ProcessImplicitDefs.cpp @@ -9,297 +9,163 @@ #define DEBUG_TYPE "processimplicitdefs" -#include "llvm/CodeGen/ProcessImplicitDefs.h" - -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SetVector.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetRegisterInfo.h" - using namespace llvm; +namespace { +/// Process IMPLICIT_DEF instructions and make sure there is one implicit_def +/// for each use. Add isUndef marker to implicit_def defs and their uses. 
+class ProcessImplicitDefs : public MachineFunctionPass { + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + MachineRegisterInfo *MRI; + + SmallSetVector<MachineInstr*, 16> WorkList; + + void processImplicitDef(MachineInstr *MI); + bool canTurnIntoImplicitDef(MachineInstr *MI); + +public: + static char ID; + + ProcessImplicitDefs() : MachineFunctionPass(ID) { + initializeProcessImplicitDefsPass(*PassRegistry::getPassRegistry()); + } + + virtual void getAnalysisUsage(AnalysisUsage &au) const; + + virtual bool runOnMachineFunction(MachineFunction &fn); +}; +} // end anonymous namespace + char ProcessImplicitDefs::ID = 0; char &llvm::ProcessImplicitDefsID = ProcessImplicitDefs::ID; INITIALIZE_PASS_BEGIN(ProcessImplicitDefs, "processimpdefs", "Process Implicit Definitions", false, false) -INITIALIZE_PASS_DEPENDENCY(LiveVariables) INITIALIZE_PASS_END(ProcessImplicitDefs, "processimpdefs", "Process Implicit Definitions", false, false) void ProcessImplicitDefs::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); AU.addPreserved<AliasAnalysis>(); - AU.addPreserved<LiveVariables>(); - AU.addPreservedID(MachineLoopInfoID); - AU.addPreservedID(MachineDominatorsID); - AU.addPreservedID(TwoAddressInstructionPassID); - AU.addPreservedID(PHIEliminationID); MachineFunctionPass::getAnalysisUsage(AU); } -bool -ProcessImplicitDefs::CanTurnIntoImplicitDef(MachineInstr *MI, - unsigned Reg, unsigned OpIdx, - SmallSet<unsigned, 8> &ImpDefRegs) { - switch(OpIdx) { - case 1: - return MI->isCopy() && (!MI->getOperand(0).readsReg() || - ImpDefRegs.count(MI->getOperand(0).getReg())); - case 2: - return MI->isSubregToReg() && (!MI->getOperand(0).readsReg() || - ImpDefRegs.count(MI->getOperand(0).getReg())); - default: return false; - } -} - -static bool isUndefCopy(MachineInstr *MI, unsigned Reg, - SmallSet<unsigned, 8> &ImpDefRegs) { - if (MI->isCopy()) { - MachineOperand &MO0 = MI->getOperand(0); - MachineOperand &MO1 = MI->getOperand(1); - if (MO1.getReg() != Reg) - return false; - if (!MO0.readsReg() || ImpDefRegs.count(MO0.getReg())) - return true; +bool ProcessImplicitDefs::canTurnIntoImplicitDef(MachineInstr *MI) { + if (!MI->isCopyLike() && + !MI->isInsertSubreg() && + !MI->isRegSequence() && + !MI->isPHI()) return false; - } - return false; + for (MIOperands MO(MI); MO.isValid(); ++MO) + if (MO->isReg() && MO->isUse() && MO->readsReg()) + return false; + return true; } -/// processImplicitDefs - Process IMPLICIT_DEF instructions and make sure -/// there is one implicit_def for each use. Add isUndef marker to -/// implicit_def defs and their uses. 
-bool ProcessImplicitDefs::runOnMachineFunction(MachineFunction &fn) { - - DEBUG(dbgs() << "********** PROCESS IMPLICIT DEFS **********\n" - << "********** Function: " - << ((Value*)fn.getFunction())->getName() << '\n'); - - bool Changed = false; - - TII = fn.getTarget().getInstrInfo(); - TRI = fn.getTarget().getRegisterInfo(); - MRI = &fn.getRegInfo(); - LV = getAnalysisIfAvailable<LiveVariables>(); - - SmallSet<unsigned, 8> ImpDefRegs; - SmallVector<MachineInstr*, 8> ImpDefMIs; - SmallVector<MachineInstr*, 4> RUses; - SmallPtrSet<MachineBasicBlock*,16> Visited; - SmallPtrSet<MachineInstr*, 8> ModInsts; - - MachineBasicBlock *Entry = fn.begin(); - for (df_ext_iterator<MachineBasicBlock*, SmallPtrSet<MachineBasicBlock*,16> > - DFI = df_ext_begin(Entry, Visited), E = df_ext_end(Entry, Visited); - DFI != E; ++DFI) { - MachineBasicBlock *MBB = *DFI; - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); - I != E; ) { - MachineInstr *MI = &*I; - ++I; - if (MI->isImplicitDef()) { - ImpDefMIs.push_back(MI); - // Is this a sub-register read-modify-write? - if (MI->getOperand(0).readsReg()) - continue; - unsigned Reg = MI->getOperand(0).getReg(); - ImpDefRegs.insert(Reg); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { - for (const uint16_t *SS = TRI->getSubRegisters(Reg); *SS; ++SS) - ImpDefRegs.insert(*SS); - } +void ProcessImplicitDefs::processImplicitDef(MachineInstr *MI) { + DEBUG(dbgs() << "Processing " << *MI); + unsigned Reg = MI->getOperand(0).getReg(); + + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + // For virtual registers, mark all uses as <undef>, and convert users to + // implicit-def when possible. + for (MachineRegisterInfo::use_nodbg_iterator UI = + MRI->use_nodbg_begin(Reg), + UE = MRI->use_nodbg_end(); UI != UE; ++UI) { + MachineOperand &MO = UI.getOperand(); + MO.setIsUndef(); + MachineInstr *UserMI = MO.getParent(); + if (!canTurnIntoImplicitDef(UserMI)) continue; - } - - // Eliminate %reg1032:sub<def> = COPY undef. - if (MI->isCopy() && MI->getOperand(0).readsReg()) { - MachineOperand &MO = MI->getOperand(1); - if (MO.isUndef() || ImpDefRegs.count(MO.getReg())) { - if (LV && MO.isKill()) { - LiveVariables::VarInfo& vi = LV->getVarInfo(MO.getReg()); - vi.removeKill(MI); - } - unsigned Reg = MI->getOperand(0).getReg(); - MI->eraseFromParent(); - Changed = true; - - // A REG_SEQUENCE may have been expanded into partial definitions. - // If this was the last one, mark Reg as implicitly defined. - if (TargetRegisterInfo::isVirtualRegister(Reg) && MRI->def_empty(Reg)) - ImpDefRegs.insert(Reg); - continue; - } - } - - bool ChangedToImpDef = false; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand& MO = MI->getOperand(i); - if (!MO.isReg() || !MO.readsReg()) - continue; - unsigned Reg = MO.getReg(); - if (!Reg) - continue; - if (!ImpDefRegs.count(Reg)) - continue; - // Use is a copy, just turn it into an implicit_def. - if (CanTurnIntoImplicitDef(MI, Reg, i, ImpDefRegs)) { - bool isKill = MO.isKill(); - MI->setDesc(TII->get(TargetOpcode::IMPLICIT_DEF)); - for (int j = MI->getNumOperands() - 1, ee = 0; j > ee; --j) - MI->RemoveOperand(j); - if (isKill) { - ImpDefRegs.erase(Reg); - if (LV) { - LiveVariables::VarInfo& vi = LV->getVarInfo(Reg); - vi.removeKill(MI); - } - } - ChangedToImpDef = true; - Changed = true; - break; - } - - Changed = true; - MO.setIsUndef(); - // This is a partial register redef of an implicit def. - // Make sure the whole register is defined by the instruction.
- if (MO.isDef()) { - MI->addRegisterDefined(Reg); - continue; - } - if (MO.isKill() || MI->isRegTiedToDefOperand(i)) { - // Make sure other reads of Reg are also marked <undef>. - for (unsigned j = i+1; j != e; ++j) { - MachineOperand &MOJ = MI->getOperand(j); - if (MOJ.isReg() && MOJ.getReg() == Reg && MOJ.readsReg()) - MOJ.setIsUndef(); - } - ImpDefRegs.erase(Reg); - } - } - - if (ChangedToImpDef) { - // Backtrack to process this new implicit_def. - --I; - } else { - for (unsigned i = 0; i != MI->getNumOperands(); ++i) { - MachineOperand& MO = MI->getOperand(i); - if (!MO.isReg() || !MO.isDef()) - continue; - ImpDefRegs.erase(MO.getReg()); - } - } + DEBUG(dbgs() << "Converting to IMPLICIT_DEF: " << *UserMI); + UserMI->setDesc(TII->get(TargetOpcode::IMPLICIT_DEF)); + WorkList.insert(UserMI); } + MI->eraseFromParent(); + return; + } - // Any outstanding liveout implicit_def's? - for (unsigned i = 0, e = ImpDefMIs.size(); i != e; ++i) { - MachineInstr *MI = ImpDefMIs[i]; - unsigned Reg = MI->getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg) || - !ImpDefRegs.count(Reg)) { - // Delete all "local" implicit_def's. That include those which define - // physical registers since they cannot be liveout. - MI->eraseFromParent(); - Changed = true; + // This is a physreg implicit-def. + // Look for the first instruction to use or define an alias. + MachineBasicBlock::instr_iterator UserMI = MI; + MachineBasicBlock::instr_iterator UserE = MI->getParent()->instr_end(); + bool Found = false; + for (++UserMI; UserMI != UserE; ++UserMI) { + for (MIOperands MO(UserMI); MO.isValid(); ++MO) { + if (!MO->isReg()) continue; - } - - // If there are multiple defs of the same register and at least one - // is not an implicit_def, do not insert implicit_def's before the - // uses. - bool Skip = false; - SmallVector<MachineInstr*, 4> DeadImpDefs; - for (MachineRegisterInfo::def_iterator DI = MRI->def_begin(Reg), - DE = MRI->def_end(); DI != DE; ++DI) { - MachineInstr *DeadImpDef = &*DI; - if (!DeadImpDef->isImplicitDef()) { - Skip = true; - break; - } - DeadImpDefs.push_back(DeadImpDef); - } - if (Skip) + unsigned UserReg = MO->getReg(); + if (!TargetRegisterInfo::isPhysicalRegister(UserReg) || + !TRI->regsOverlap(Reg, UserReg)) continue; + // UserMI uses or redefines Reg. Set <undef> flags on all uses. + Found = true; + if (MO->isUse()) + MO->setIsUndef(); + } + if (Found) + break; + } - // The only implicit_def which we want to keep are those that are live - // out of its block. - for (unsigned j = 0, ee = DeadImpDefs.size(); j != ee; ++j) - DeadImpDefs[j]->eraseFromParent(); - Changed = true; - - // Process each use instruction once. - for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(Reg), - UE = MRI->use_end(); UI != UE; ++UI) { - if (UI.getOperand().isUndef()) - continue; - MachineInstr *RMI = &*UI; - if (ModInsts.insert(RMI)) - RUses.push_back(RMI); - } + // If we found the using MI, we can erase the IMPLICIT_DEF. + if (Found) { + DEBUG(dbgs() << "Physreg user: " << *UserMI); + MI->eraseFromParent(); + return; + } - for (unsigned i = 0, e = RUses.size(); i != e; ++i) { - MachineInstr *RMI = RUses[i]; + // Using instr wasn't found, it could be in another block. + // Leave the physreg IMPLICIT_DEF, but trim any extra operands. + for (unsigned i = MI->getNumOperands() - 1; i; --i) + MI->RemoveOperand(i); + DEBUG(dbgs() << "Keeping physreg: " << *MI); +} - // Turn a copy use into an implicit_def. 
- if (isUndefCopy(RMI, Reg, ImpDefRegs)) { - RMI->setDesc(TII->get(TargetOpcode::IMPLICIT_DEF)); +/// processImplicitDefs - Process IMPLICIT_DEF instructions and turn them into +/// <undef> operands. +bool ProcessImplicitDefs::runOnMachineFunction(MachineFunction &MF) { - bool isKill = false; - SmallVector<unsigned, 4> Ops; - for (unsigned j = 0, ee = RMI->getNumOperands(); j != ee; ++j) { - MachineOperand &RRMO = RMI->getOperand(j); - if (RRMO.isReg() && RRMO.getReg() == Reg) { - Ops.push_back(j); - if (RRMO.isKill()) - isKill = true; - } - } - // Leave the other operands along. - for (unsigned j = 0, ee = Ops.size(); j != ee; ++j) { - unsigned OpIdx = Ops[j]; - RMI->RemoveOperand(OpIdx-j); - } + DEBUG(dbgs() << "********** PROCESS IMPLICIT DEFS **********\n" + << "********** Function: " + << ((Value*)MF.getFunction())->getName() << '\n'); - // Update LiveVariables varinfo if the instruction is a kill. - if (LV && isKill) { - LiveVariables::VarInfo& vi = LV->getVarInfo(Reg); - vi.removeKill(RMI); - } - continue; - } + bool Changed = false; - // Replace Reg with a new vreg that's marked implicit. - const TargetRegisterClass* RC = MRI->getRegClass(Reg); - unsigned NewVReg = MRI->createVirtualRegister(RC); - bool isKill = true; - for (unsigned j = 0, ee = RMI->getNumOperands(); j != ee; ++j) { - MachineOperand &RRMO = RMI->getOperand(j); - if (RRMO.isReg() && RRMO.getReg() == Reg) { - RRMO.setReg(NewVReg); - RRMO.setIsUndef(); - if (isKill) { - // Only the first operand of NewVReg is marked kill. - RRMO.setIsKill(); - isKill = false; - } - } - } - } - RUses.clear(); - ModInsts.clear(); - } - ImpDefRegs.clear(); - ImpDefMIs.clear(); + TII = MF.getTarget().getInstrInfo(); + TRI = MF.getTarget().getRegisterInfo(); + MRI = &MF.getRegInfo(); + assert(MRI->isSSA() && "ProcessImplicitDefs only works on SSA form."); + assert(WorkList.empty() && "Inconsistent worklist state"); + + for (MachineFunction::iterator MFI = MF.begin(), MFE = MF.end(); + MFI != MFE; ++MFI) { + // Scan the basic block for implicit defs. + for (MachineBasicBlock::instr_iterator MBBI = MFI->instr_begin(), + MBBE = MFI->instr_end(); MBBI != MBBE; ++MBBI) + if (MBBI->isImplicitDef()) + WorkList.insert(MBBI); + + if (WorkList.empty()) + continue; + + DEBUG(dbgs() << "BB#" << MFI->getNumber() << " has " << WorkList.size() + << " implicit defs.\n"); + Changed = true; + + // Drain the WorkList to recursively process any new implicit defs. + do processImplicitDef(WorkList.pop_back_val()); + while (!WorkList.empty()); } - return Changed; } - diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp index 458915e..c791ffb 100644 --- a/lib/CodeGen/PrologEpilogInserter.cpp +++ b/lib/CodeGen/PrologEpilogInserter.cpp @@ -302,7 +302,7 @@ void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) { const TargetRegisterInfo *TRI = Fn.getTarget().getRegisterInfo(); MachineBasicBlock::iterator I; - if (! ShrinkWrapThisFunction) { + if (!ShrinkWrapThisFunction) { // Spill using target interface. 
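[Editorial sketch] The rewritten ProcessImplicitDefs above is a worklist algorithm: each block's IMPLICIT_DEFs seed the list, and any user converted to IMPLICIT_DEF goes back on the list so it is processed in turn. A self-contained C++ model of that drain loop, with an invented Instr type in place of MachineInstr:

    #include <iostream>
    #include <set>
    #include <vector>

    struct Instr {
      int Id;
      std::vector<Instr *> Users; // instructions reading this def
      bool ConvertibleToImpDef;   // e.g. a COPY with no other readable uses
    };

    static void processImplicitDef(Instr *MI, std::set<Instr *> &WorkList) {
      for (Instr *User : MI->Users)
        if (User->ConvertibleToImpDef) {
          std::cout << "converting user " << User->Id << " to IMPLICIT_DEF\n";
          WorkList.insert(User); // new implicit-def is reprocessed recursively
        }
    }

    int main() {
      Instr Copy{2, {}, true};
      Instr ImpDef{1, {&Copy}, false};
      std::set<Instr *> WorkList = {&ImpDef};
      // Drain the worklist, picking up new implicit-defs as they appear.
      while (!WorkList.empty()) {
        Instr *MI = *WorkList.begin();
        WorkList.erase(WorkList.begin());
        processImplicitDef(MI, WorkList);
      }
    }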
I = EntryBlock->begin(); if (!TFI->spillCalleeSavedRegisters(*EntryBlock, I, CSI, TRI)) { diff --git a/lib/CodeGen/RegAllocBase.cpp b/lib/CodeGen/RegAllocBase.cpp index b00eceb..993dbc7 100644 --- a/lib/CodeGen/RegAllocBase.cpp +++ b/lib/CodeGen/RegAllocBase.cpp @@ -14,6 +14,7 @@ #define DEBUG_TYPE "regalloc" #include "RegAllocBase.h" +#include "LiveRegMatrix.h" #include "Spiller.h" #include "VirtRegMap.h" #include "llvm/ADT/Statistic.h" @@ -34,8 +35,6 @@ using namespace llvm; -STATISTIC(NumAssigned , "Number of registers assigned"); -STATISTIC(NumUnassigned , "Number of registers unassigned"); STATISTIC(NumNewQueued , "Number of new live ranges queued"); // Temporary verification option until we can put verification inside @@ -47,85 +46,20 @@ VerifyRegAlloc("verify-regalloc", cl::location(RegAllocBase::VerifyEnabled), const char *RegAllocBase::TimerGroupName = "Register Allocation"; bool RegAllocBase::VerifyEnabled = false; -#ifndef NDEBUG -// Verify each LiveIntervalUnion. -void RegAllocBase::verify() { - LiveVirtRegBitSet VisitedVRegs; - OwningArrayPtr<LiveVirtRegBitSet> - unionVRegs(new LiveVirtRegBitSet[PhysReg2LiveUnion.numRegs()]); - - // Verify disjoint unions. - for (unsigned PhysReg = 0; PhysReg < PhysReg2LiveUnion.numRegs(); ++PhysReg) { - DEBUG(PhysReg2LiveUnion[PhysReg].print(dbgs(), TRI)); - LiveVirtRegBitSet &VRegs = unionVRegs[PhysReg]; - PhysReg2LiveUnion[PhysReg].verify(VRegs); - // Union + intersection test could be done efficiently in one pass, but - // don't add a method to SparseBitVector unless we really need it. - assert(!VisitedVRegs.intersects(VRegs) && "vreg in multiple unions"); - VisitedVRegs |= VRegs; - } - - // Verify vreg coverage. - for (LiveIntervals::iterator liItr = LIS->begin(), liEnd = LIS->end(); - liItr != liEnd; ++liItr) { - unsigned reg = liItr->first; - if (TargetRegisterInfo::isPhysicalRegister(reg)) continue; - if (!VRM->hasPhys(reg)) continue; // spilled? - unsigned PhysReg = VRM->getPhys(reg); - if (!unionVRegs[PhysReg].test(reg)) { - dbgs() << "LiveVirtReg " << reg << " not in union " << - TRI->getName(PhysReg) << "\n"; - llvm_unreachable("unallocated live vreg"); - } - } - // FIXME: I'm not sure how to verify spilled intervals. -} -#endif //!NDEBUG - //===----------------------------------------------------------------------===// // RegAllocBase Implementation //===----------------------------------------------------------------------===// -// Instantiate a LiveIntervalUnion for each physical register. 
-void RegAllocBase::LiveUnionArray::init(LiveIntervalUnion::Allocator &allocator, - unsigned NRegs) { - NumRegs = NRegs; - Array = - static_cast<LiveIntervalUnion*>(malloc(sizeof(LiveIntervalUnion)*NRegs)); - for (unsigned r = 0; r != NRegs; ++r) - new(Array + r) LiveIntervalUnion(r, allocator); -} - -void RegAllocBase::init(VirtRegMap &vrm, LiveIntervals &lis) { - NamedRegionTimer T("Initialize", TimerGroupName, TimePassesIsEnabled); +void RegAllocBase::init(VirtRegMap &vrm, + LiveIntervals &lis, + LiveRegMatrix &mat) { TRI = &vrm.getTargetRegInfo(); MRI = &vrm.getRegInfo(); VRM = &vrm; LIS = &lis; + Matrix = &mat; MRI->freezeReservedRegs(vrm.getMachineFunction()); RegClassInfo.runOnMachineFunction(vrm.getMachineFunction()); - - const unsigned NumRegs = TRI->getNumRegs(); - if (NumRegs != PhysReg2LiveUnion.numRegs()) { - PhysReg2LiveUnion.init(UnionAllocator, NumRegs); - // Cache an interferece query for each physical reg - Queries.reset(new LiveIntervalUnion::Query[PhysReg2LiveUnion.numRegs()]); - } -} - -void RegAllocBase::LiveUnionArray::clear() { - if (!Array) - return; - for (unsigned r = 0; r != NumRegs; ++r) - Array[r].~LiveIntervalUnion(); - free(Array); - NumRegs = 0; - Array = 0; -} - -void RegAllocBase::releaseMemory() { - for (unsigned r = 0, e = PhysReg2LiveUnion.numRegs(); r != e; ++r) - PhysReg2LiveUnion[r].clear(); } // Visit all the live registers. If they are already assigned to a physical @@ -133,35 +67,14 @@ void RegAllocBase::releaseMemory() { // them on the priority queue for later assignment. void RegAllocBase::seedLiveRegs() { NamedRegionTimer T("Seed Live Regs", TimerGroupName, TimePassesIsEnabled); - for (LiveIntervals::iterator I = LIS->begin(), E = LIS->end(); I != E; ++I) { - unsigned RegNum = I->first; - LiveInterval &VirtReg = *I->second; - if (TargetRegisterInfo::isPhysicalRegister(RegNum)) - PhysReg2LiveUnion[RegNum].unify(VirtReg); - else - enqueue(&VirtReg); + for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { + unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + if (MRI->reg_nodbg_empty(Reg)) + continue; + enqueue(&LIS->getInterval(Reg)); } } -void RegAllocBase::assign(LiveInterval &VirtReg, unsigned PhysReg) { - DEBUG(dbgs() << "assigning " << PrintReg(VirtReg.reg, TRI) - << " to " << PrintReg(PhysReg, TRI) << '\n'); - assert(!VRM->hasPhys(VirtReg.reg) && "Duplicate VirtReg assignment"); - VRM->assignVirt2Phys(VirtReg.reg, PhysReg); - MRI->setPhysRegUsed(PhysReg); - PhysReg2LiveUnion[PhysReg].unify(VirtReg); - ++NumAssigned; -} - -void RegAllocBase::unassign(LiveInterval &VirtReg, unsigned PhysReg) { - DEBUG(dbgs() << "unassigning " << PrintReg(VirtReg.reg, TRI) - << " from " << PrintReg(PhysReg, TRI) << '\n'); - assert(VRM->getPhys(VirtReg.reg) == PhysReg && "Inconsistent unassign"); - PhysReg2LiveUnion[PhysReg].extract(VirtReg); - VRM->clearVirt(VirtReg.reg); - ++NumUnassigned; -} - // Top-level driver to manage the queue of unassigned VirtRegs and call the // selectOrSplit implementation. void RegAllocBase::allocatePhysRegs() { @@ -179,14 +92,14 @@ void RegAllocBase::allocatePhysRegs() { } // Invalidate all interference queries, live ranges could have changed. - invalidateVirtRegs(); + Matrix->invalidateVirtRegs(); // selectOrSplit requests the allocator to return an available physical // register if possible and populate a list of new live intervals that // result from splitting. 
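
The new seedLiveRegs() loop above enumerates virtual registers densely via index2VirtReg() instead of walking the LiveIntervals map. That works because virtual register numbers are kept disjoint from physical ones, typically by stamping a high bit, so index and register number round-trip in O(1). A toy version of the mapping (the exact encoding in TargetRegisterInfo may differ; this constant is illustrative):

#include <cassert>

const unsigned VirtRegFlag = 1u << 31;

unsigned index2VirtReg(unsigned Index) { return Index | VirtRegFlag; }
unsigned virtReg2Index(unsigned Reg) { return Reg & ~VirtRegFlag; }
bool isVirtualRegister(unsigned Reg) { return (Reg & VirtRegFlag) != 0; }

int main() {
  for (unsigned i = 0; i != 8; ++i) {
    unsigned Reg = index2VirtReg(i);
    assert(isVirtualRegister(Reg) && virtReg2Index(Reg) == i);
  }
}
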
DEBUG(dbgs() << "\nselectOrSplit " << MRI->getRegClass(VirtReg->reg)->getName() - << ':' << *VirtReg << '\n'); + << ':' << PrintReg(VirtReg->reg) << ' ' << *VirtReg << '\n'); typedef SmallVector<LiveInterval*, 4> VirtRegVec; VirtRegVec SplitVRegs; unsigned AvailablePhysReg = selectOrSplit(*VirtReg, SplitVRegs); @@ -211,7 +124,7 @@ void RegAllocBase::allocatePhysRegs() { } if (AvailablePhysReg) - assign(*VirtReg, AvailablePhysReg); + Matrix->assign(*VirtReg, AvailablePhysReg); for (VirtRegVec::iterator I = SplitVRegs.begin(), E = SplitVRegs.end(); I != E; ++I) { @@ -230,51 +143,3 @@ void RegAllocBase::allocatePhysRegs() { } } } - -// Check if this live virtual register interferes with a physical register. If -// not, then check for interference on each register that aliases with the -// physical register. Return the interfering register. -unsigned RegAllocBase::checkPhysRegInterference(LiveInterval &VirtReg, - unsigned PhysReg) { - for (const uint16_t *AliasI = TRI->getOverlaps(PhysReg); *AliasI; ++AliasI) - if (query(VirtReg, *AliasI).checkInterference()) - return *AliasI; - return 0; -} - -// Add newly allocated physical registers to the MBB live in sets. -void RegAllocBase::addMBBLiveIns(MachineFunction *MF) { - NamedRegionTimer T("MBB Live Ins", TimerGroupName, TimePassesIsEnabled); - SlotIndexes *Indexes = LIS->getSlotIndexes(); - if (MF->size() <= 1) - return; - - LiveIntervalUnion::SegmentIter SI; - for (unsigned PhysReg = 0; PhysReg < PhysReg2LiveUnion.numRegs(); ++PhysReg) { - LiveIntervalUnion &LiveUnion = PhysReg2LiveUnion[PhysReg]; - if (LiveUnion.empty()) - continue; - DEBUG(dbgs() << PrintReg(PhysReg, TRI) << " live-in:"); - MachineFunction::iterator MBB = llvm::next(MF->begin()); - MachineFunction::iterator MFE = MF->end(); - SlotIndex Start, Stop; - tie(Start, Stop) = Indexes->getMBBRange(MBB); - SI.setMap(LiveUnion.getMap()); - SI.find(Start); - while (SI.valid()) { - if (SI.start() <= Start) { - if (!MBB->isLiveIn(PhysReg)) - MBB->addLiveIn(PhysReg); - DEBUG(dbgs() << "\tBB#" << MBB->getNumber() << ':' - << PrintReg(SI.value()->reg, TRI)); - } else if (SI.start() > Stop) - MBB = Indexes->getMBBFromIndex(SI.start().getPrevIndex()); - if (++MBB == MFE) - break; - tie(Start, Stop) = Indexes->getMBBRange(MBB); - SI.advanceTo(Start); - } - DEBUG(dbgs() << '\n'); - } -} - diff --git a/lib/CodeGen/RegAllocBase.h b/lib/CodeGen/RegAllocBase.h index 072fe2b..db0c8e1 100644 --- a/lib/CodeGen/RegAllocBase.h +++ b/lib/CodeGen/RegAllocBase.h @@ -37,9 +37,9 @@ #ifndef LLVM_CODEGEN_REGALLOCBASE #define LLVM_CODEGEN_REGALLOCBASE -#include "llvm/ADT/OwningPtr.h" #include "LiveIntervalUnion.h" -#include "RegisterClassInfo.h" +#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/ADT/OwningPtr.h" namespace llvm { @@ -47,6 +47,7 @@ template<typename T> class SmallVectorImpl; class TargetRegisterInfo; class VirtRegMap; class LiveIntervals; +class LiveRegMatrix; class Spiller; /// RegAllocBase provides the register allocation driver and interface that can @@ -56,69 +57,20 @@ class Spiller; /// live range splitting. They must also override enqueue/dequeue to provide an /// assignment order. class RegAllocBase { - LiveIntervalUnion::Allocator UnionAllocator; - - // Cache tag for PhysReg2LiveUnion entries. Increment whenever virtual - // registers may have changed. - unsigned UserTag; - - // Array of LiveIntervalUnions indexed by physical register. 
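
The shape of the allocatePhysRegs() driver above: pop the highest-priority live range, ask selectOrSplit() for a physreg, and requeue whatever splitting produced. A plain-C++ sketch with hypothetical types and stubbed decisions; returning 0 from selectOrSplit means "new ranges were queued instead of an assignment":

#include <vector>

struct LiveRange { unsigned Reg; float Weight; };

// Stubs for illustration: a real allocator consults interference here.
unsigned selectOrSplit(LiveRange &LR, std::vector<LiveRange *> &SplitRanges) {
  (void)SplitRanges;
  return LR.Reg % 2 ? LR.Reg : 0;
}
void assign(LiveRange &, unsigned) {}

void allocatePhysRegs(std::vector<LiveRange *> Queue) {
  while (!Queue.empty()) {
    LiveRange *LR = Queue.back();
    Queue.pop_back();
    std::vector<LiveRange *> SplitRanges;
    if (unsigned PhysReg = selectOrSplit(*LR, SplitRanges))
      assign(*LR, PhysReg);        // direct assignment succeeded
    // Otherwise allocate the split pieces on later iterations.
    Queue.insert(Queue.end(), SplitRanges.begin(), SplitRanges.end());
  }
}
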
- class LiveUnionArray { - unsigned NumRegs; - LiveIntervalUnion *Array; - public: - LiveUnionArray(): NumRegs(0), Array(0) {} - ~LiveUnionArray() { clear(); } - - unsigned numRegs() const { return NumRegs; } - - void init(LiveIntervalUnion::Allocator &, unsigned NRegs); - - void clear(); - - LiveIntervalUnion& operator[](unsigned PhysReg) { - assert(PhysReg < NumRegs && "physReg out of bounds"); - return Array[PhysReg]; - } - }; - - LiveUnionArray PhysReg2LiveUnion; - - // Current queries, one per physreg. They must be reinitialized each time we - // query on a new live virtual register. - OwningArrayPtr<LiveIntervalUnion::Query> Queries; - protected: const TargetRegisterInfo *TRI; MachineRegisterInfo *MRI; VirtRegMap *VRM; LiveIntervals *LIS; + LiveRegMatrix *Matrix; RegisterClassInfo RegClassInfo; - RegAllocBase(): UserTag(0), TRI(0), MRI(0), VRM(0), LIS(0) {} + RegAllocBase(): TRI(0), MRI(0), VRM(0), LIS(0), Matrix(0) {} virtual ~RegAllocBase() {} // A RegAlloc pass should call this before allocatePhysRegs. - void init(VirtRegMap &vrm, LiveIntervals &lis); - - // Get an initialized query to check interferences between lvr and preg. Note - // that Query::init must be called at least once for each physical register - // before querying a new live virtual register. This ties Queries and - // PhysReg2LiveUnion together. - LiveIntervalUnion::Query &query(LiveInterval &VirtReg, unsigned PhysReg) { - Queries[PhysReg].init(UserTag, &VirtReg, &PhysReg2LiveUnion[PhysReg]); - return Queries[PhysReg]; - } - - // Get direct access to the underlying LiveIntervalUnion for PhysReg. - LiveIntervalUnion &getLiveUnion(unsigned PhysReg) { - return PhysReg2LiveUnion[PhysReg]; - } - - // Invalidate all cached information about virtual registers - live ranges may - // have changed. - void invalidateVirtRegs() { ++UserTag; } + void init(VirtRegMap &vrm, LiveIntervals &lis, LiveRegMatrix &mat); // The top-level driver. The output is a VirtRegMap that us updated with // physical register assignments. @@ -140,31 +92,6 @@ protected: virtual unsigned selectOrSplit(LiveInterval &VirtReg, SmallVectorImpl<LiveInterval*> &splitLVRs) = 0; - // A RegAlloc pass should call this when PassManager releases its memory. - virtual void releaseMemory(); - - // Helper for checking interference between a live virtual register and a - // physical register, including all its register aliases. If an interference - // exists, return the interfering register, which may be preg or an alias. - unsigned checkPhysRegInterference(LiveInterval& VirtReg, unsigned PhysReg); - - /// assign - Assign VirtReg to PhysReg. - /// This should not be called from selectOrSplit for the current register. - void assign(LiveInterval &VirtReg, unsigned PhysReg); - - /// unassign - Undo a previous assignment of VirtReg to PhysReg. - /// This can be invoked from selectOrSplit, but be careful to guarantee that - /// allocation is making progress. - void unassign(LiveInterval &VirtReg, unsigned PhysReg); - - /// addMBBLiveIns - Add physreg liveins to basic blocks. - void addMBBLiveIns(MachineFunction *); - -#ifndef NDEBUG - // Verify each LiveIntervalUnion. - void verify(); -#endif - // Use this group name for NamedRegionTimer. 
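
The UserTag member removed above (now owned by LiveRegMatrix) implements O(1) mass invalidation: each cached interference query stores the generation tag it was computed under, and bumping the tag invalidates every entry at once, exactly what invalidateVirtRegs() did with ++UserTag. A minimal standalone version of the idiom (names are hypothetical):

struct TaggedCache {
  unsigned Tag = 1;        // current generation
  unsigned EntryTag = 0;   // generation the cached value was computed under
  int Value = 0;

  int get() {
    if (EntryTag != Tag) { // stale: recompute lazily
      Value = computeValue();
      EntryTag = Tag;
    }
    return Value;
  }
  void invalidateAll() { ++Tag; } // O(1), no per-entry work
  int computeValue() { return 42; }
};
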
static const char *TimerGroupName; diff --git a/lib/CodeGen/RegAllocBasic.cpp b/lib/CodeGen/RegAllocBasic.cpp index 77ee314..3a03807 100644 --- a/lib/CodeGen/RegAllocBasic.cpp +++ b/lib/CodeGen/RegAllocBasic.cpp @@ -13,11 +13,12 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "regalloc" +#include "AllocationOrder.h" #include "RegAllocBase.h" #include "LiveDebugVariables.h" -#include "RenderMachineFunction.h" #include "Spiller.h" #include "VirtRegMap.h" +#include "LiveRegMatrix.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Function.h" #include "llvm/PassAnalysisSupport.h" @@ -64,10 +65,6 @@ class RABasic : public MachineFunctionPass, public RegAllocBase // context MachineFunction *MF; - // analyses - LiveStacks *LS; - RenderMachineFunction *RMF; - // state std::auto_ptr<Spiller> SpillerInstance; std::priority_queue<LiveInterval*, std::vector<LiveInterval*>, @@ -118,9 +115,6 @@ public: bool spillInterferences(LiveInterval &VirtReg, unsigned PhysReg, SmallVectorImpl<LiveInterval*> &SplitVRegs); - void spillReg(LiveInterval &VirtReg, unsigned PhysReg, - SmallVectorImpl<LiveInterval*> &SplitVRegs); - static char ID; }; @@ -139,7 +133,7 @@ RABasic::RABasic(): MachineFunctionPass(ID) { initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry()); initializeMachineLoopInfoPass(*PassRegistry::getPassRegistry()); initializeVirtRegMapPass(*PassRegistry::getPassRegistry()); - initializeRenderMachineFunctionPass(*PassRegistry::getPassRegistry()); + initializeLiveRegMatrixPass(*PassRegistry::getPassRegistry()); } void RABasic::getAnalysisUsage(AnalysisUsage &AU) const { @@ -147,6 +141,7 @@ void RABasic::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<AliasAnalysis>(); AU.addPreserved<AliasAnalysis>(); AU.addRequired<LiveIntervals>(); + AU.addPreserved<LiveIntervals>(); AU.addPreserved<SlotIndexes>(); AU.addRequired<LiveDebugVariables>(); AU.addPreserved<LiveDebugVariables>(); @@ -159,41 +154,15 @@ void RABasic::getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved<MachineLoopInfo>(); AU.addRequired<VirtRegMap>(); AU.addPreserved<VirtRegMap>(); - DEBUG(AU.addRequired<RenderMachineFunction>()); + AU.addRequired<LiveRegMatrix>(); + AU.addPreserved<LiveRegMatrix>(); MachineFunctionPass::getAnalysisUsage(AU); } void RABasic::releaseMemory() { SpillerInstance.reset(0); - RegAllocBase::releaseMemory(); } -// Helper for spillInterferences() that spills all interfering vregs currently -// assigned to this physical register. -void RABasic::spillReg(LiveInterval& VirtReg, unsigned PhysReg, - SmallVectorImpl<LiveInterval*> &SplitVRegs) { - LiveIntervalUnion::Query &Q = query(VirtReg, PhysReg); - assert(Q.seenAllInterferences() && "need collectInterferences()"); - const SmallVectorImpl<LiveInterval*> &PendingSpills = Q.interferingVRegs(); - - for (SmallVectorImpl<LiveInterval*>::const_iterator I = PendingSpills.begin(), - E = PendingSpills.end(); I != E; ++I) { - LiveInterval &SpilledVReg = **I; - DEBUG(dbgs() << "extracting from " << - TRI->getName(PhysReg) << " " << SpilledVReg << '\n'); - - // Deallocate the interfering vreg by removing it from the union. - // A LiveInterval instance may not be in a union during modification! - unassign(SpilledVReg, PhysReg); - - // Spill the extracted interval. - LiveRangeEdit LRE(SpilledVReg, SplitVRegs, *MF, *LIS, VRM); - spiller().spill(LRE); - } - // After extracting segments, the query's results are invalid. But keep the - // contents valid until we're done accessing pendingSpills. 
- Q.clear(); -} // Spill or split all live virtual registers currently unified under PhysReg // that interfere with VirtReg. The newly spilled or split live intervals are @@ -202,22 +171,41 @@ bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg, SmallVectorImpl<LiveInterval*> &SplitVRegs) { // Record each interference and determine if all are spillable before mutating // either the union or live intervals. - unsigned NumInterferences = 0; + SmallVector<LiveInterval*, 8> Intfs; + // Collect interferences assigned to any alias of the physical register. - for (const uint16_t *asI = TRI->getOverlaps(PhysReg); *asI; ++asI) { - LiveIntervalUnion::Query &QAlias = query(VirtReg, *asI); - NumInterferences += QAlias.collectInterferingVRegs(); - if (QAlias.seenUnspillableVReg()) { + for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { + LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units); + Q.collectInterferingVRegs(); + if (Q.seenUnspillableVReg()) return false; + for (unsigned i = Q.interferingVRegs().size(); i; --i) { + LiveInterval *Intf = Q.interferingVRegs()[i - 1]; + if (!Intf->isSpillable() || Intf->weight > VirtReg.weight) + return false; + Intfs.push_back(Intf); } } DEBUG(dbgs() << "spilling " << TRI->getName(PhysReg) << " interferences with " << VirtReg << "\n"); - assert(NumInterferences > 0 && "expect interference"); + assert(!Intfs.empty() && "expected interference"); // Spill each interfering vreg allocated to PhysReg or an alias. - for (const uint16_t *AliasI = TRI->getOverlaps(PhysReg); *AliasI; ++AliasI) - spillReg(VirtReg, *AliasI, SplitVRegs); + for (unsigned i = 0, e = Intfs.size(); i != e; ++i) { + LiveInterval &Spill = *Intfs[i]; + + // Skip duplicates. + if (!VRM->hasPhys(Spill.reg)) + continue; + + // Deallocate the interfering vreg by removing it from the union. + // A LiveInterval instance may not be in a union during modification! + Matrix->unassign(Spill); + + // Spill the extracted interval. + LiveRangeEdit LRE(&Spill, SplitVRegs, *MF, *LIS, VRM); + spiller().spill(LRE); + } return true; } @@ -235,49 +223,36 @@ bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg, // selectOrSplit(). unsigned RABasic::selectOrSplit(LiveInterval &VirtReg, SmallVectorImpl<LiveInterval*> &SplitVRegs) { - // Check for register mask interference. When live ranges cross calls, the - // set of usable registers is reduced to the callee-saved ones. - bool CrossRegMasks = LIS->checkRegMaskInterference(VirtReg, UsableRegs); - // Populate a list of physical register spill candidates. SmallVector<unsigned, 8> PhysRegSpillCands; // Check for an available register in this class. - ArrayRef<unsigned> Order = - RegClassInfo.getOrder(MRI->getRegClass(VirtReg.reg)); - for (ArrayRef<unsigned>::iterator I = Order.begin(), E = Order.end(); I != E; - ++I) { - unsigned PhysReg = *I; - - // If PhysReg is clobbered by a register mask, it isn't useful for - // allocation or spilling. - if (CrossRegMasks && !UsableRegs.test(PhysReg)) - continue; - - // Check interference and as a side effect, intialize queries for this - // VirtReg and its aliases. - unsigned interfReg = checkPhysRegInterference(VirtReg, PhysReg); - if (interfReg == 0) { - // Found an available register. + AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo); + while (unsigned PhysReg = Order.next()) { + // Check for interference in PhysReg + switch (Matrix->checkInterference(VirtReg, PhysReg)) { + case LiveRegMatrix::IK_Free: + // PhysReg is available, allocate it. 
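
The pattern in the new spillInterferences() above: first collect every interfering range across all register units, bailing out on anything unspillable or heavier than VirtReg, and only then start unassigning. Mutation invalidates the queries, and the same range can appear under several units, hence the hasPhys() duplicate check. A standalone sketch with hypothetical types:

#include <vector>

struct Range { bool Spillable; float Weight; bool Assigned; };

bool spillInterferences(float VirtRegWeight,
                        const std::vector<std::vector<Range *>> &PerUnit) {
  std::vector<Range *> Intfs;
  for (const std::vector<Range *> &Unit : PerUnit)
    for (Range *R : Unit) {
      if (!R->Spillable || R->Weight > VirtRegWeight)
        return false;          // give up before touching anything
      Intfs.push_back(R);
    }
  for (Range *R : Intfs) {
    if (!R->Assigned)
      continue;                // duplicate across units, already handled
    R->Assigned = false;       // unassign, then spill it
  }
  return true;
}
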
return PhysReg; - } - LiveIntervalUnion::Query &IntfQ = query(VirtReg, interfReg); - IntfQ.collectInterferingVRegs(1); - LiveInterval *interferingVirtReg = IntfQ.interferingVRegs().front(); - // The current VirtReg must either be spillable, or one of its interferences - // must have less spill weight. - if (interferingVirtReg->weight < VirtReg.weight ) { + case LiveRegMatrix::IK_VirtReg: + // Only virtual registers in the way, we may be able to spill them. PhysRegSpillCands.push_back(PhysReg); + continue; + + default: + // RegMask or RegUnit interference. + continue; } } + // Try to spill another interfering reg with less spill weight. for (SmallVectorImpl<unsigned>::iterator PhysRegI = PhysRegSpillCands.begin(), - PhysRegE = PhysRegSpillCands.end(); PhysRegI != PhysRegE; ++PhysRegI) { - - if (!spillInterferences(VirtReg, *PhysRegI, SplitVRegs)) continue; + PhysRegE = PhysRegSpillCands.end(); PhysRegI != PhysRegE; ++PhysRegI) { + if (!spillInterferences(VirtReg, *PhysRegI, SplitVRegs)) + continue; - assert(checkPhysRegInterference(VirtReg, *PhysRegI) == 0 && + assert(!Matrix->checkInterference(VirtReg, *PhysRegI) && "Interference after spill."); // Tell the caller to allocate to this newly freed physical register. return *PhysRegI; @@ -287,7 +262,7 @@ unsigned RABasic::selectOrSplit(LiveInterval &VirtReg, DEBUG(dbgs() << "spilling: " << VirtReg << '\n'); if (!VirtReg.isSpillable()) return ~0u; - LiveRangeEdit LRE(VirtReg, SplitVRegs, *MF, *LIS, VRM); + LiveRangeEdit LRE(&VirtReg, SplitVRegs, *MF, *LIS, VRM); spiller().spill(LRE); // The live virtual register requesting allocation was spilled, so tell @@ -301,53 +276,17 @@ bool RABasic::runOnMachineFunction(MachineFunction &mf) { << ((Value*)mf.getFunction())->getName() << '\n'); MF = &mf; - DEBUG(RMF = &getAnalysis<RenderMachineFunction>()); - - RegAllocBase::init(getAnalysis<VirtRegMap>(), getAnalysis<LiveIntervals>()); + RegAllocBase::init(getAnalysis<VirtRegMap>(), + getAnalysis<LiveIntervals>(), + getAnalysis<LiveRegMatrix>()); SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM)); allocatePhysRegs(); - addMBBLiveIns(MF); - // Diagnostic output before rewriting DEBUG(dbgs() << "Post alloc VirtRegMap:\n" << *VRM << "\n"); - // optional HTML output - DEBUG(RMF->renderMachineFunction("After basic register allocation.", VRM)); - - // FIXME: Verification currently must run before VirtRegRewriter. We should - // make the rewriter a separate pass and override verifyAnalysis instead. When - // that happens, verification naturally falls under VerifyMachineCode. -#ifndef NDEBUG - if (VerifyEnabled) { - // Verify accuracy of LiveIntervals. The standard machine code verifier - // ensures that each LiveIntervals covers all uses of the virtual reg. - - // FIXME: MachineVerifier is badly broken when using the standard - // spiller. Always use -spiller=inline with -verify-regalloc. Even with the - // inline spiller, some tests fail to verify because the coalescer does not - // always generate verifiable code. - MF->verify(this, "In RABasic::verify"); - - // Verify that LiveIntervals are partitioned into unions and disjoint within - // the unions. - verify(); - } -#endif // !NDEBUG - - // Run rewriter - VRM->rewrite(LIS->getSlotIndexes()); - - // Write out new DBG_VALUE instructions. - getAnalysis<LiveDebugVariables>().emitDebugValues(VRM); - - // All machine operands and other references to virtual registers have been - // replaced. Remove the virtual registers and release all the transient data. 
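
The switch in the new selectOrSplit() above sorts checkInterference() results into three buckets: IK_Free is taken immediately, IK_VirtReg is remembered as a spill candidate, and anything else (fixed regunit or regmask interference) disqualifies the register. The enum ordering also explains the later "> IK_VirtReg" test in RAGreedy: kinds above IK_VirtReg cannot be evicted. A standalone sketch with stubbed interference:

#include <vector>

enum InterferenceKind { IK_Free, IK_VirtReg, IK_RegUnit, IK_RegMask };

InterferenceKind checkInterference(unsigned PhysReg) {
  return PhysReg % 2 ? IK_VirtReg : IK_RegUnit; // stub for illustration
}

unsigned pickOrCollect(const std::vector<unsigned> &Order,
                       std::vector<unsigned> &SpillCands) {
  for (unsigned PhysReg : Order) {
    switch (checkInterference(PhysReg)) {
    case IK_Free:
      return PhysReg;                // available, allocate it
    case IK_VirtReg:
      SpillCands.push_back(PhysReg); // maybe evictable later
      continue;
    default:
      continue;                      // fixed interference, unusable
    }
  }
  return 0;
}
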
- VRM->clearAllVirt(); - MRI->clearVirtRegs(); releaseMemory(); - return true; } diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp index e09b7f8..8325f20 100644 --- a/lib/CodeGen/RegAllocFast.cpp +++ b/lib/CodeGen/RegAllocFast.cpp @@ -13,7 +13,6 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "regalloc" -#include "RegisterClassInfo.h" #include "llvm/BasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" @@ -22,6 +21,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegAllocRegistry.h" +#include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Support/CommandLine.h" @@ -77,7 +77,7 @@ namespace { explicit LiveReg(unsigned v) : LastUse(0), VirtReg(v), PhysReg(0), LastOpNum(0), Dirty(false) {} - unsigned getSparseSetKey() const { + unsigned getSparseSetIndex() const { return TargetRegisterInfo::virtReg2Index(VirtReg); } }; @@ -354,8 +354,8 @@ void RAFast::usePhysReg(MachineOperand &MO) { } // Maybe a superregister is reserved? - for (const uint16_t *AS = TRI->getAliasSet(PhysReg); - unsigned Alias = *AS; ++AS) { + for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) { + unsigned Alias = *AI; switch (PhysRegState[Alias]) { case regDisabled: break; @@ -408,8 +408,8 @@ void RAFast::definePhysReg(MachineInstr *MI, unsigned PhysReg, // This is a disabled register, disable all aliases. PhysRegState[PhysReg] = NewState; - for (const uint16_t *AS = TRI->getAliasSet(PhysReg); - unsigned Alias = *AS; ++AS) { + for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) { + unsigned Alias = *AI; switch (unsigned VirtReg = PhysRegState[Alias]) { case regDisabled: break; @@ -456,8 +456,8 @@ unsigned RAFast::calcSpillCost(unsigned PhysReg) const { // This is a disabled register, add up cost of aliases. DEBUG(dbgs() << PrintReg(PhysReg, TRI) << " is disabled.\n"); unsigned Cost = 0; - for (const uint16_t *AS = TRI->getAliasSet(PhysReg); - unsigned Alias = *AS; ++AS) { + for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) { + unsigned Alias = *AI; if (UsedInInstr.test(Alias)) return spillImpossible; switch (unsigned VirtReg = PhysRegState[Alias]) { @@ -659,9 +659,10 @@ RAFast::reloadVirtReg(MachineInstr *MI, unsigned OpNum, // Return true if the operand kills its register. bool RAFast::setPhysReg(MachineInstr *MI, unsigned OpNum, unsigned PhysReg) { MachineOperand &MO = MI->getOperand(OpNum); + bool Dead = MO.isDead(); if (!MO.getSubReg()) { MO.setReg(PhysReg); - return MO.isKill() || MO.isDead(); + return MO.isKill() || Dead; } // Handle subregister index. @@ -674,7 +675,13 @@ bool RAFast::setPhysReg(MachineInstr *MI, unsigned OpNum, unsigned PhysReg) { MI->addRegisterKilled(PhysReg, TRI, true); return true; } - return MO.isDead(); + + // A <def,read-undef> of a sub-register requires an implicit def of the full + // register. 
+ if (MO.isDef() && MO.isUndef()) + MI->addRegisterDefined(PhysReg, TRI); + + return Dead; } // Handle special instruction operand like early clobbers and tied ops when @@ -704,13 +711,10 @@ void RAFast::handleThroughOperands(MachineInstr *MI, if (!MO.isReg() || !MO.isDef()) continue; unsigned Reg = MO.getReg(); if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue; - UsedInInstr.set(Reg); - if (ThroughRegs.count(PhysRegState[Reg])) - definePhysReg(MI, Reg, regFree); - for (const uint16_t *AS = TRI->getAliasSet(Reg); *AS; ++AS) { - UsedInInstr.set(*AS); - if (ThroughRegs.count(PhysRegState[*AS])) - definePhysReg(MI, *AS, regFree); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) { + UsedInInstr.set(*AI); + if (ThroughRegs.count(PhysRegState[*AI])) + definePhysReg(MI, *AI, regFree); } } @@ -1029,9 +1033,8 @@ void RAFast::AllocateBasicBlock() { if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue; // Look for physreg defs and tied uses. if (!MO.isDef() && !MI->isRegTiedToDefOperand(i)) continue; - UsedInInstr.set(Reg); - for (const uint16_t *AS = TRI->getAliasSet(Reg); *AS; ++AS) - UsedInInstr.set(*AS); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + UsedInInstr.set(*AI); } } diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp index 3f2a617..6ac5428 100644 --- a/lib/CodeGen/RegAllocGreedy.cpp +++ b/lib/CodeGen/RegAllocGreedy.cpp @@ -16,6 +16,7 @@ #include "AllocationOrder.h" #include "InterferenceCache.h" #include "LiveDebugVariables.h" +#include "LiveRegMatrix.h" #include "RegAllocBase.h" #include "Spiller.h" #include "SpillPlacement.h" @@ -73,7 +74,6 @@ class RAGreedy : public MachineFunctionPass, // analyses SlotIndexes *Indexes; - LiveStacks *LS; MachineDominatorTree *DomTree; MachineLoopInfo *Loops; EdgeBundles *Bundles; @@ -168,19 +168,6 @@ class RAGreedy : public MachineFunctionPass, } }; - // Register mask interference. The current VirtReg is checked for register - // mask interference on entry to selectOrSplit(). If there is no - // interference, UsableRegs is left empty. If there is interference, - // UsableRegs has a bit mask of registers that can be used without register - // mask interference. - BitVector UsableRegs; - - /// clobberedByRegMask - Returns true if PhysReg is not directly usable - /// because of register mask clobbers. - bool clobberedByRegMask(unsigned PhysReg) const { - return !UsableRegs.empty() && !UsableRegs.test(PhysReg); - } - // splitting state. 
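
The loops rewritten above replace null-terminated uint16_t alias arrays with an iterator carrying an isValid() predicate, whose boolean constructor argument also lets the register itself be included in the walk. A minimal standalone iterator with the same shape, over a hypothetical data layout:

#include <cstdint>

class AliasIterator {
  const uint16_t *Pos; // walks a 0-terminated alias list
  uint16_t SelfReg;    // the register itself
  bool EmitSelf;       // yield SelfReg first when IncludeSelf is set

public:
  AliasIterator(const uint16_t *List, uint16_t Self, bool IncludeSelf)
      : Pos(List), SelfReg(Self), EmitSelf(IncludeSelf) {}

  bool isValid() const { return EmitSelf || *Pos != 0; }
  uint16_t operator*() const { return EmitSelf ? SelfReg : *Pos; }
  AliasIterator &operator++() {
    if (EmitSelf)
      EmitSelf = false;
    else
      ++Pos;
    return *this;
  }
};

// Usage mirrors the rewritten loops:
//   for (AliasIterator AI(List, Reg, true); AI.isValid(); ++AI)
//     UsedInInstr.set(*AI);
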
std::auto_ptr<SplitAnalysis> SA; std::auto_ptr<SplitEditor> SE; @@ -286,6 +273,8 @@ private: SmallVectorImpl<LiveInterval*>&); unsigned tryBlockSplit(LiveInterval&, AllocationOrder&, SmallVectorImpl<LiveInterval*>&); + unsigned tryInstructionSplit(LiveInterval&, AllocationOrder&, + SmallVectorImpl<LiveInterval*>&); unsigned tryLocalSplit(LiveInterval&, AllocationOrder&, SmallVectorImpl<LiveInterval*>&); unsigned trySplit(LiveInterval&, AllocationOrder&, @@ -327,6 +316,7 @@ RAGreedy::RAGreedy(): MachineFunctionPass(ID) { initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry()); initializeMachineLoopInfoPass(*PassRegistry::getPassRegistry()); initializeVirtRegMapPass(*PassRegistry::getPassRegistry()); + initializeLiveRegMatrixPass(*PassRegistry::getPassRegistry()); initializeEdgeBundlesPass(*PassRegistry::getPassRegistry()); initializeSpillPlacementPass(*PassRegistry::getPassRegistry()); } @@ -336,6 +326,7 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<AliasAnalysis>(); AU.addPreserved<AliasAnalysis>(); AU.addRequired<LiveIntervals>(); + AU.addPreserved<LiveIntervals>(); AU.addRequired<SlotIndexes>(); AU.addPreserved<SlotIndexes>(); AU.addRequired<LiveDebugVariables>(); @@ -349,6 +340,8 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved<MachineLoopInfo>(); AU.addRequired<VirtRegMap>(); AU.addPreserved<VirtRegMap>(); + AU.addRequired<LiveRegMatrix>(); + AU.addPreserved<LiveRegMatrix>(); AU.addRequired<EdgeBundles>(); AU.addRequired<SpillPlacement>(); MachineFunctionPass::getAnalysisUsage(AU); @@ -360,8 +353,8 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const { //===----------------------------------------------------------------------===// bool RAGreedy::LRE_CanEraseVirtReg(unsigned VirtReg) { - if (unsigned PhysReg = VRM->getPhys(VirtReg)) { - unassign(LIS->getInterval(VirtReg), PhysReg); + if (VRM->hasPhys(VirtReg)) { + Matrix->unassign(LIS->getInterval(VirtReg)); return true; } // Unassigned virtreg is probably in the priority queue. @@ -370,13 +363,12 @@ bool RAGreedy::LRE_CanEraseVirtReg(unsigned VirtReg) { } void RAGreedy::LRE_WillShrinkVirtReg(unsigned VirtReg) { - unsigned PhysReg = VRM->getPhys(VirtReg); - if (!PhysReg) + if (!VRM->hasPhys(VirtReg)) return; // Register is assigned, put it back on the queue for reassignment. LiveInterval &LI = LIS->getInterval(VirtReg); - unassign(LI, PhysReg); + Matrix->unassign(LI); enqueue(&LI); } @@ -398,7 +390,6 @@ void RAGreedy::releaseMemory() { SpillerInstance.reset(0); ExtraRegInfo.clear(); GlobalCand.clear(); - RegAllocBase::releaseMemory(); } void RAGreedy::enqueue(LiveInterval *LI) { @@ -450,12 +441,9 @@ unsigned RAGreedy::tryAssign(LiveInterval &VirtReg, SmallVectorImpl<LiveInterval*> &NewVRegs) { Order.rewind(); unsigned PhysReg; - while ((PhysReg = Order.next())) { - if (clobberedByRegMask(PhysReg)) - continue; - if (!checkPhysRegInterference(VirtReg, PhysReg)) + while ((PhysReg = Order.next())) + if (!Matrix->checkInterference(VirtReg, PhysReg)) break; - } if (!PhysReg || Order.isHint(PhysReg)) return PhysReg; @@ -464,7 +452,7 @@ unsigned RAGreedy::tryAssign(LiveInterval &VirtReg, // If we missed a simple hint, try to cheaply evict interference from the // preferred register. 
if (unsigned Hint = MRI->getSimpleHint(VirtReg.reg)) - if (Order.isHint(Hint) && !clobberedByRegMask(Hint)) { + if (Order.isHint(Hint)) { DEBUG(dbgs() << "missed hint " << PrintReg(Hint, TRI) << '\n'); EvictionCost MaxCost(1); if (canEvictInterference(VirtReg, Hint, true, MaxCost)) { @@ -527,6 +515,10 @@ bool RAGreedy::shouldEvict(LiveInterval &A, bool IsHint, /// @returns True when interference can be evicted cheaper than MaxCost. bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, unsigned PhysReg, bool IsHint, EvictionCost &MaxCost) { + // It is only possible to evict virtual register interference. + if (Matrix->checkInterference(VirtReg, PhysReg) > LiveRegMatrix::IK_VirtReg) + return false; + // Find VirtReg's cascade number. This will be unassigned if VirtReg was never // involved in an eviction before. If a cascade number was assigned, deny // evicting anything with the same or a newer cascade number. This prevents @@ -539,8 +531,8 @@ bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, unsigned PhysReg, Cascade = NextCascade; EvictionCost Cost; - for (const uint16_t *AliasI = TRI->getOverlaps(PhysReg); *AliasI; ++AliasI) { - LiveIntervalUnion::Query &Q = query(VirtReg, *AliasI); + for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { + LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units); // If there is 10 or more interferences, chances are one is heavier. if (Q.collectInterferingVRegs(10) >= 10) return false; @@ -548,15 +540,21 @@ bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, unsigned PhysReg, // Check if any interfering live range is heavier than MaxWeight. for (unsigned i = Q.interferingVRegs().size(); i; --i) { LiveInterval *Intf = Q.interferingVRegs()[i - 1]; - if (TargetRegisterInfo::isPhysicalRegister(Intf->reg)) - return false; + assert(TargetRegisterInfo::isVirtualRegister(Intf->reg) && + "Only expecting virtual register interference from query"); // Never evict spill products. They cannot split or spill. if (getStage(*Intf) == RS_Done) return false; // Once a live range becomes small enough, it is urgent that we find a // register for it. This is indicated by an infinite spill weight. These // urgent live ranges get to evict almost anything. - bool Urgent = !VirtReg.isSpillable() && Intf->isSpillable(); + // + // Also allow urgent evictions of unspillable ranges from a strictly + // larger allocation order. + bool Urgent = !VirtReg.isSpillable() && + (Intf->isSpillable() || + RegClassInfo.getNumAllocatableRegs(MRI->getRegClass(VirtReg.reg)) < + RegClassInfo.getNumAllocatableRegs(MRI->getRegClass(Intf->reg))); // Only evict older cascades or live ranges without a cascade. unsigned IntfCascade = ExtraRegInfo[Intf->reg].Cascade; if (Cascade <= IntfCascade) { @@ -597,19 +595,29 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, unsigned PhysReg, DEBUG(dbgs() << "evicting " << PrintReg(PhysReg, TRI) << " interference: Cascade " << Cascade << '\n'); - for (const uint16_t *AliasI = TRI->getOverlaps(PhysReg); *AliasI; ++AliasI) { - LiveIntervalUnion::Query &Q = query(VirtReg, *AliasI); + + // Collect all interfering virtregs first. 
+ SmallVector<LiveInterval*, 8> Intfs; + for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { + LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units); assert(Q.seenAllInterferences() && "Didn't check all interfererences."); - for (unsigned i = 0, e = Q.interferingVRegs().size(); i != e; ++i) { - LiveInterval *Intf = Q.interferingVRegs()[i]; - unassign(*Intf, VRM->getPhys(Intf->reg)); - assert((ExtraRegInfo[Intf->reg].Cascade < Cascade || - VirtReg.isSpillable() < Intf->isSpillable()) && - "Cannot decrease cascade number, illegal eviction"); - ExtraRegInfo[Intf->reg].Cascade = Cascade; - ++NumEvicted; - NewVRegs.push_back(Intf); - } + ArrayRef<LiveInterval*> IVR = Q.interferingVRegs(); + Intfs.append(IVR.begin(), IVR.end()); + } + + // Evict them second. This will invalidate the queries. + for (unsigned i = 0, e = Intfs.size(); i != e; ++i) { + LiveInterval *Intf = Intfs[i]; + // The same VirtReg may be present in multiple RegUnits. Skip duplicates. + if (!VRM->hasPhys(Intf->reg)) + continue; + Matrix->unassign(*Intf); + assert((ExtraRegInfo[Intf->reg].Cascade < Cascade || + VirtReg.isSpillable() < Intf->isSpillable()) && + "Cannot decrease cascade number, illegal eviction"); + ExtraRegInfo[Intf->reg].Cascade = Cascade; + ++NumEvicted; + NewVRegs.push_back(Intf); } } @@ -636,8 +644,6 @@ unsigned RAGreedy::tryEvict(LiveInterval &VirtReg, Order.rewind(); while (unsigned PhysReg = Order.next()) { - if (clobberedByRegMask(PhysReg)) - continue; if (TRI->getCostPerUse(PhysReg) >= CostPerUseLimit) continue; // The first use of a callee-saved register in a function has cost 1. @@ -1183,7 +1189,7 @@ unsigned RAGreedy::tryRegionSplit(LiveInterval &VirtReg, AllocationOrder &Order, return 0; // Prepare split editor. - LiveRangeEdit LREdit(VirtReg, NewVRegs, *MF, *LIS, VRM, this); + LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this); SE->reset(LREdit, SplitSpillMode); // Assign all edge bundles to the preferred candidate, or NoCand. @@ -1231,7 +1237,7 @@ unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order, assert(&SA->getParent() == &VirtReg && "Live range wasn't analyzed"); unsigned Reg = VirtReg.reg; bool SingleInstrs = RegClassInfo.isProperSubClass(MRI->getRegClass(Reg)); - LiveRangeEdit LREdit(VirtReg, NewVRegs, *MF, *LIS, VRM, this); + LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this); SE->reset(LREdit, SplitSpillMode); ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks(); for (unsigned i = 0; i != UseBlocks.size(); ++i) { @@ -1265,6 +1271,65 @@ unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order, return 0; } + +//===----------------------------------------------------------------------===// +// Per-Instruction Splitting +//===----------------------------------------------------------------------===// + +/// tryInstructionSplit - Split a live range around individual instructions. +/// This is normally not worthwhile since the spiller is doing essentially the +/// same thing. However, when the live range is in a constrained register +/// class, it may help to insert copies such that parts of the live range can +/// be moved to a larger register class. +/// +/// This is similar to spilling to a larger register class. +unsigned +RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order, + SmallVectorImpl<LiveInterval*> &NewVRegs) { + // There is no point to this if there are no larger sub-classes. 
+ if (!RegClassInfo.isProperSubClass(MRI->getRegClass(VirtReg.reg))) + return 0; + + // Always enable split spill mode, since we're effectively spilling to a + // register. + LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this); + SE->reset(LREdit, SplitEditor::SM_Size); + + ArrayRef<SlotIndex> Uses = SA->getUseSlots(); + if (Uses.size() <= 1) + return 0; + + DEBUG(dbgs() << "Split around " << Uses.size() << " individual instrs.\n"); + + // Split around every non-copy instruction. + for (unsigned i = 0; i != Uses.size(); ++i) { + if (const MachineInstr *MI = Indexes->getInstructionFromIndex(Uses[i])) + if (MI->isFullCopy()) { + DEBUG(dbgs() << " skip:\t" << Uses[i] << '\t' << *MI); + continue; + } + SE->openIntv(); + SlotIndex SegStart = SE->enterIntvBefore(Uses[i]); + SlotIndex SegStop = SE->leaveIntvAfter(Uses[i]); + SE->useIntv(SegStart, SegStop); + } + + if (LREdit.empty()) { + DEBUG(dbgs() << "All uses were copies.\n"); + return 0; + } + + SmallVector<unsigned, 8> IntvMap; + SE->finish(&IntvMap); + DebugVars->splitRegister(VirtReg.reg, LREdit.regs()); + ExtraRegInfo.resize(MRI->getNumVirtRegs()); + + // Assign all new registers to RS_Spill. This was the last chance. + setStage(LREdit.begin(), LREdit.end(), RS_Spill); + return 0; +} + + //===----------------------------------------------------------------------===// // Local Splitting //===----------------------------------------------------------------------===// @@ -1291,9 +1356,9 @@ void RAGreedy::calcGapWeights(unsigned PhysReg, GapWeight.assign(NumGaps, 0.0f); // Add interference from each overlapping register. - for (const uint16_t *AI = TRI->getOverlaps(PhysReg); *AI; ++AI) { - if (!query(const_cast<LiveInterval&>(SA->getParent()), *AI) - .checkInterference()) + for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { + if (!Matrix->query(const_cast<LiveInterval&>(SA->getParent()), *Units) + .checkInterference()) continue; // We know that VirtReg is a continuous interval from FirstInstr to @@ -1303,7 +1368,8 @@ void RAGreedy::calcGapWeights(unsigned PhysReg, // surrounding the instruction. The exception is interference before // StartIdx and after StopIdx. // - LiveIntervalUnion::SegmentIter IntI = getLiveUnion(*AI).find(StartIdx); + LiveIntervalUnion::SegmentIter IntI = + Matrix->getLiveUnions()[*Units] .find(StartIdx); for (unsigned Gap = 0; IntI.valid() && IntI.start() < StopIdx; ++IntI) { // Skip the gaps before IntI. while (Uses[Gap+1].getBoundaryIndex() < IntI.start()) @@ -1323,6 +1389,30 @@ void RAGreedy::calcGapWeights(unsigned PhysReg, break; } } + + // Add fixed interference. + for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { + const LiveInterval &LI = LIS->getRegUnit(*Units); + LiveInterval::const_iterator I = LI.find(StartIdx); + LiveInterval::const_iterator E = LI.end(); + + // Same loop as above. Mark any overlapped gaps as HUGE_VALF. + for (unsigned Gap = 0; I != E && I->start < StopIdx; ++I) { + while (Uses[Gap+1].getBoundaryIndex() < I->start) + if (++Gap == NumGaps) + break; + if (Gap == NumGaps) + break; + + for (; Gap != NumGaps; ++Gap) { + GapWeight[Gap] = HUGE_VALF; + if (Uses[Gap+1].getBaseIndex() >= I->end) + break; + } + if (Gap == NumGaps) + break; + } + } } /// tryLocalSplit - Try to split VirtReg into smaller intervals inside its only @@ -1355,7 +1445,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order, // If VirtReg is live across any register mask operands, compute a list of // gaps with register masks. 
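
Conceptually, tryInstructionSplit() above opens one tiny interval per non-copy use: each use slot U becomes a segment from just before U to just after it, and everything between segments stays in the spillable remainder, which can then live in a larger register class. A toy version over integer slots (illustrative only; real slots are SlotIndexes and the boundaries come from enterIntvBefore/leaveIntvAfter):

#include <utility>
#include <vector>

std::vector<std::pair<int, int>>
splitAroundUses(const std::vector<int> &Uses,
                const std::vector<bool> &IsCopy) {
  std::vector<std::pair<int, int>> Segments;
  for (size_t i = 0; i != Uses.size(); ++i) {
    if (IsCopy[i])
      continue; // full copies can stay in the constrained class
    Segments.push_back({Uses[i] - 1, Uses[i] + 1}); // enter before, leave after
  }
  return Segments;
}
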
SmallVector<unsigned, 8> RegMaskGaps; - if (!UsableRegs.empty()) { + if (Matrix->checkRegMaskInterference(VirtReg)) { // Get regmask slots for the whole block. ArrayRef<SlotIndex> RMS = LIS->getRegMaskSlotsInBlock(BI.MBB->getNumber()); DEBUG(dbgs() << RMS.size() << " regmasks in block:"); @@ -1417,7 +1507,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order, calcGapWeights(PhysReg, GapWeight); // Remove any gaps with regmask clobbers. - if (clobberedByRegMask(PhysReg)) + if (Matrix->checkRegMaskInterference(VirtReg, PhysReg)) for (unsigned i = 0, e = RegMaskGaps.size(); i != e; ++i) GapWeight[RegMaskGaps[i]] = HUGE_VALF; @@ -1512,7 +1602,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order, << '-' << Uses[BestAfter] << ", " << BestDiff << ", " << (BestAfter - BestBefore + 1) << " instrs\n"); - LiveRangeEdit LREdit(VirtReg, NewVRegs, *MF, *LIS, VRM, this); + LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this); SE->reset(LREdit); SE->openIntv(); @@ -1561,7 +1651,10 @@ unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order, if (LIS->intervalIsInOneMBB(VirtReg)) { NamedRegionTimer T("Local Splitting", TimerGroupName, TimePassesIsEnabled); SA->analyze(&VirtReg); - return tryLocalSplit(VirtReg, Order, NewVRegs); + unsigned PhysReg = tryLocalSplit(VirtReg, Order, NewVRegs); + if (PhysReg || !NewVRegs.empty()) + return PhysReg; + return tryInstructionSplit(VirtReg, Order, NewVRegs); } NamedRegionTimer T("Global Splitting", TimerGroupName, TimePassesIsEnabled); @@ -1574,7 +1667,7 @@ unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order, // an assertion when the coalescer is fixed. if (SA->didRepairRange()) { // VirtReg has changed, so all cached queries are invalid. - invalidateVirtRegs(); + Matrix->invalidateVirtRegs(); if (unsigned PhysReg = tryAssign(VirtReg, Order, NewVRegs)) return PhysReg; } @@ -1599,11 +1692,6 @@ unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order, unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg, SmallVectorImpl<LiveInterval*> &NewVRegs) { - // Check if VirtReg is live across any calls. - UsableRegs.clear(); - if (LIS->checkRegMaskInterference(VirtReg, UsableRegs)) - DEBUG(dbgs() << "Live across regmasks.\n"); - // First try assigning a free register. AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo); if (unsigned PhysReg = tryAssign(VirtReg, Order, NewVRegs)) @@ -1644,7 +1732,7 @@ unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg, // Finally spill VirtReg itself. 
NamedRegionTimer T("Spiller", TimerGroupName, TimePassesIsEnabled); - LiveRangeEdit LRE(VirtReg, NewVRegs, *MF, *LIS, VRM, this); + LiveRangeEdit LRE(&VirtReg, NewVRegs, *MF, *LIS, VRM, this); spiller().spill(LRE); setStage(NewVRegs.begin(), NewVRegs.end(), RS_Done); @@ -1665,7 +1753,9 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { if (VerifyEnabled) MF->verify(this, "Before greedy register allocator"); - RegAllocBase::init(getAnalysis<VirtRegMap>(), getAnalysis<LiveIntervals>()); + RegAllocBase::init(getAnalysis<VirtRegMap>(), + getAnalysis<LiveIntervals>(), + getAnalysis<LiveRegMatrix>()); Indexes = &getAnalysis<SlotIndexes>(); DomTree = &getAnalysis<MachineDominatorTree>(); SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM)); @@ -1679,30 +1769,10 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { ExtraRegInfo.clear(); ExtraRegInfo.resize(MRI->getNumVirtRegs()); NextCascade = 1; - IntfCache.init(MF, &getLiveUnion(0), Indexes, LIS, TRI); + IntfCache.init(MF, Matrix->getLiveUnions(), Indexes, LIS, TRI); GlobalCand.resize(32); // This will grow as needed. allocatePhysRegs(); - addMBBLiveIns(MF); - LIS->addKillFlags(); - - // Run rewriter - { - NamedRegionTimer T("Rewriter", TimerGroupName, TimePassesIsEnabled); - VRM->rewrite(Indexes); - } - - // Write out new DBG_VALUE instructions. - { - NamedRegionTimer T("Emit Debug Info", TimerGroupName, TimePassesIsEnabled); - DebugVars->emitDebugValues(VRM); - } - - // All machine operands and other references to virtual registers have been - // replaced. Remove the virtual registers and release all the transient data. - VRM->clearAllVirt(); - MRI->clearVirtRegs(); releaseMemory(); - return true; } diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp index a284614..d0db26b 100644 --- a/lib/CodeGen/RegAllocPBQP.cpp +++ b/lib/CodeGen/RegAllocPBQP.cpp @@ -31,7 +31,6 @@ #define DEBUG_TYPE "regalloc" -#include "RenderMachineFunction.h" #include "Spiller.h" #include "VirtRegMap.h" #include "RegisterCoalescer.h" @@ -98,7 +97,6 @@ public: initializeLiveStacksPass(*PassRegistry::getPassRegistry()); initializeMachineLoopInfoPass(*PassRegistry::getPassRegistry()); initializeVirtRegMapPass(*PassRegistry::getPassRegistry()); - initializeRenderMachineFunctionPass(*PassRegistry::getPassRegistry()); } /// Return the pass name. @@ -134,7 +132,6 @@ private: const TargetInstrInfo *tii; const MachineLoopInfo *loopInfo; MachineRegisterInfo *mri; - RenderMachineFunction *rmf; std::auto_ptr<Spiller> spiller; LiveIntervals *lis; @@ -196,7 +193,7 @@ std::auto_ptr<PBQPRAProblem> PBQPBuilder::build(MachineFunction *mf, const RegSet &vregs) { typedef std::vector<const LiveInterval*> LIVector; - ArrayRef<SlotIndex> regMaskSlots = lis->getRegMaskSlots(); + LiveIntervals *LIS = const_cast<LiveIntervals*>(lis); MachineRegisterInfo *mri = &mf->getRegInfo(); const TargetRegisterInfo *tri = mf->getTarget().getRegisterInfo(); @@ -205,12 +202,11 @@ std::auto_ptr<PBQPRAProblem> PBQPBuilder::build(MachineFunction *mf, RegSet pregs; // Collect the set of preg intervals, record that they're used in the MF. 
- for (LiveIntervals::const_iterator itr = lis->begin(), end = lis->end(); - itr != end; ++itr) { - if (TargetRegisterInfo::isPhysicalRegister(itr->first)) { - pregs.insert(itr->first); - mri->setPhysRegUsed(itr->first); - } + for (unsigned Reg = 1, e = tri->getNumRegs(); Reg != e; ++Reg) { + if (mri->def_empty(Reg)) + continue; + pregs.insert(Reg); + mri->setPhysRegUsed(Reg); } BitVector reservedRegs = tri->getReservedRegs(*mf); @@ -220,7 +216,11 @@ std::auto_ptr<PBQPRAProblem> PBQPBuilder::build(MachineFunction *mf, vregItr != vregEnd; ++vregItr) { unsigned vreg = *vregItr; const TargetRegisterClass *trc = mri->getRegClass(vreg); - const LiveInterval *vregLI = &lis->getInterval(vreg); + LiveInterval *vregLI = &LIS->getInterval(vreg); + + // Record any overlaps with regmask operands. + BitVector regMaskOverlaps(tri->getNumRegs()); + LIS->checkRegMaskInterference(*vregLI, regMaskOverlaps); // Compute an initial allowed set for the current vreg. typedef std::vector<unsigned> VRAllowed; @@ -228,80 +228,26 @@ std::auto_ptr<PBQPRAProblem> PBQPBuilder::build(MachineFunction *mf, ArrayRef<uint16_t> rawOrder = trc->getRawAllocationOrder(*mf); for (unsigned i = 0; i != rawOrder.size(); ++i) { unsigned preg = rawOrder[i]; - if (!reservedRegs.test(preg)) { - vrAllowed.push_back(preg); - } - } - - RegSet overlappingPRegs; - - // Record physical registers whose ranges overlap. - for (RegSet::const_iterator pregItr = pregs.begin(), - pregEnd = pregs.end(); - pregItr != pregEnd; ++pregItr) { - unsigned preg = *pregItr; - const LiveInterval *pregLI = &lis->getInterval(preg); - - if (pregLI->empty()) { + if (reservedRegs.test(preg)) continue; - } - if (vregLI->overlaps(*pregLI)) - overlappingPRegs.insert(preg); - } + // vregLI crosses a regmask operand that clobbers preg. + if (!regMaskOverlaps.empty() && !regMaskOverlaps.test(preg)) + continue; - // Record any overlaps with regmask operands. - BitVector regMaskOverlaps(tri->getNumRegs()); - for (ArrayRef<SlotIndex>::iterator rmItr = regMaskSlots.begin(), - rmEnd = regMaskSlots.end(); - rmItr != rmEnd; ++rmItr) { - SlotIndex rmIdx = *rmItr; - if (vregLI->liveAt(rmIdx)) { - MachineInstr *rmMI = lis->getInstructionFromIndex(rmIdx); - const uint32_t* regMask = 0; - for (MachineInstr::mop_iterator mopItr = rmMI->operands_begin(), - mopEnd = rmMI->operands_end(); - mopItr != mopEnd; ++mopItr) { - if (mopItr->isRegMask()) { - regMask = mopItr->getRegMask(); - break; - } + // vregLI overlaps fixed regunit interference. + bool Interference = false; + for (MCRegUnitIterator Units(preg, tri); Units.isValid(); ++Units) { + if (vregLI->overlaps(LIS->getRegUnit(*Units))) { + Interference = true; + break; } - assert(regMask != 0 && "Couldn't find register mask."); - regMaskOverlaps.setBitsNotInMask(regMask); } - } + if (Interference) + continue; - for (unsigned preg = 0; preg < tri->getNumRegs(); ++preg) { - if (regMaskOverlaps.test(preg)) - overlappingPRegs.insert(preg); - } - - for (RegSet::const_iterator pregItr = overlappingPRegs.begin(), - pregEnd = overlappingPRegs.end(); - pregItr != pregEnd; ++pregItr) { - unsigned preg = *pregItr; - - // Remove the register from the allowed set. - VRAllowed::iterator eraseItr = - std::find(vrAllowed.begin(), vrAllowed.end(), preg); - - if (eraseItr != vrAllowed.end()) { - vrAllowed.erase(eraseItr); - } - - // Also remove any aliases. 
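
The rewritten PBQP loop above builds the allowed set by filtering the raw allocation order up front instead of erasing overlapping registers afterwards: a preg survives only if it is not reserved, not clobbered by a regmask the vreg crosses, and free of fixed interference on all of its register units. The standalone shape of that filter, with stub predicates in place of the real queries:

#include <vector>

// Stub predicates for illustration only.
bool isReserved(unsigned PReg) { return PReg == 0; }
bool clobberedByRegMask(unsigned) { return false; }
bool overlapsFixedRegUnit(unsigned) { return false; }

std::vector<unsigned> buildAllowedSet(const std::vector<unsigned> &RawOrder) {
  std::vector<unsigned> Allowed;
  for (unsigned PReg : RawOrder) {
    if (isReserved(PReg) || clobberedByRegMask(PReg) ||
        overlapsFixedRegUnit(PReg))
      continue; // filtered out up front instead of erased afterwards
    Allowed.push_back(PReg);
  }
  return Allowed;
}
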
- const uint16_t *aliasItr = tri->getAliasSet(preg); - if (aliasItr != 0) { - for (; *aliasItr != 0; ++aliasItr) { - VRAllowed::iterator eraseItr = - std::find(vrAllowed.begin(), vrAllowed.end(), *aliasItr); - - if (eraseItr != vrAllowed.end()) { - vrAllowed.erase(eraseItr); - } - } - } + // preg is usable for this virtual register. + vrAllowed.push_back(preg); } // Construct the node. @@ -379,7 +325,7 @@ std::auto_ptr<PBQPRAProblem> PBQPBuilderWithCoalescing::build( PBQP::Graph &g = p->getGraph(); const TargetMachine &tm = mf->getTarget(); - CoalescerPair cp(*tm.getInstrInfo(), *tm.getRegisterInfo()); + CoalescerPair cp(*tm.getRegisterInfo()); // Scan the machine function and add a coalescing cost whenever CoalescerPair // gives the Ok. @@ -498,21 +444,17 @@ void RegAllocPBQP::getAnalysisUsage(AnalysisUsage &au) const { au.addRequired<MachineLoopInfo>(); au.addPreserved<MachineLoopInfo>(); au.addRequired<VirtRegMap>(); - au.addRequired<RenderMachineFunction>(); MachineFunctionPass::getAnalysisUsage(au); } void RegAllocPBQP::findVRegIntervalsToAlloc() { // Iterate over all live ranges. - for (LiveIntervals::iterator itr = lis->begin(), end = lis->end(); - itr != end; ++itr) { - - // Ignore physical ones. - if (TargetRegisterInfo::isPhysicalRegister(itr->first)) + for (unsigned i = 0, e = mri->getNumVirtRegs(); i != e; ++i) { + unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + if (mri->reg_nodbg_empty(Reg)) continue; - - LiveInterval *li = itr->second; + LiveInterval *li = &lis->getInterval(Reg); // If this live interval is non-empty we will use pbqp to allocate it. // Empty intervals we allocate in a simple post-processing stage in @@ -544,16 +486,17 @@ bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAProblem &problem, if (problem.isPRegOption(vreg, alloc)) { unsigned preg = problem.getPRegForOption(vreg, alloc); - DEBUG(dbgs() << "VREG " << vreg << " -> " << tri->getName(preg) << "\n"); + DEBUG(dbgs() << "VREG " << PrintReg(vreg, tri) << " -> " + << tri->getName(preg) << "\n"); assert(preg != 0 && "Invalid preg selected."); vrm->assignVirt2Phys(vreg, preg); } else if (problem.isSpillOption(vreg, alloc)) { vregsToAlloc.erase(vreg); SmallVector<LiveInterval*, 8> newSpills; - LiveRangeEdit LRE(lis->getInterval(vreg), newSpills, *mf, *lis, vrm); + LiveRangeEdit LRE(&lis->getInterval(vreg), newSpills, *mf, *lis, vrm); spiller->spill(LRE); - DEBUG(dbgs() << "VREG " << vreg << " -> SPILLED (Cost: " + DEBUG(dbgs() << "VREG " << PrintReg(vreg, tri) << " -> SPILLED (Cost: " << LRE.getParent().weight << ", New vregs: "); // Copy any newly inserted live intervals into the list of regs to @@ -561,7 +504,7 @@ bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAProblem &problem, for (LiveRangeEdit::iterator itr = LRE.begin(), end = LRE.end(); itr != end; ++itr) { assert(!(*itr)->empty() && "Empty spill range."); - DEBUG(dbgs() << (*itr)->reg << " "); + DEBUG(dbgs() << PrintReg((*itr)->reg, tri) << " "); vregsToAlloc.insert((*itr)->reg); } @@ -579,9 +522,6 @@ bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAProblem &problem, void RegAllocPBQP::finalizeAlloc() const { - typedef LiveIntervals::iterator LIIterator; - typedef LiveInterval::Ranges::const_iterator LRIterator; - // First allocate registers for the empty intervals. for (RegSet::const_iterator itr = emptyIntervalVRegs.begin(), end = emptyIntervalVRegs.end(); @@ -597,51 +537,6 @@ void RegAllocPBQP::finalizeAlloc() const { vrm->assignVirt2Phys(li->reg, physReg); } - - // Finally iterate over the basic blocks to compute and set the live-in sets. 
- SmallVector<MachineBasicBlock*, 8> liveInMBBs; - MachineBasicBlock *entryMBB = &*mf->begin(); - - for (LIIterator liItr = lis->begin(), liEnd = lis->end(); - liItr != liEnd; ++liItr) { - - const LiveInterval *li = liItr->second; - unsigned reg = 0; - - // Get the physical register for this interval - if (TargetRegisterInfo::isPhysicalRegister(li->reg)) { - reg = li->reg; - } else if (vrm->isAssignedReg(li->reg)) { - reg = vrm->getPhys(li->reg); - } else { - // Ranges which are assigned a stack slot only are ignored. - continue; - } - - if (reg == 0) { - // Filter out zero regs - they're for intervals that were spilled. - continue; - } - - // Iterate over the ranges of the current interval... - for (LRIterator lrItr = li->begin(), lrEnd = li->end(); - lrItr != lrEnd; ++lrItr) { - - // Find the set of basic blocks which this range is live into... - if (lis->findLiveInMBBs(lrItr->start, lrItr->end, liveInMBBs)) { - // And add the physreg for this interval to their live-in sets. - for (unsigned i = 0; i != liveInMBBs.size(); ++i) { - if (liveInMBBs[i] != entryMBB) { - if (!liveInMBBs[i]->isLiveIn(reg)) { - liveInMBBs[i]->addLiveIn(reg); - } - } - } - liveInMBBs.clear(); - } - } - } - } bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { @@ -655,7 +550,6 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { lis = &getAnalysis<LiveIntervals>(); lss = &getAnalysis<LiveStacks>(); loopInfo = &getAnalysis<MachineLoopInfo>(); - rmf = &getAnalysis<RenderMachineFunction>(); vrm = &getAnalysis<VirtRegMap>(); spiller.reset(createInlineSpiller(*this, MF, *vrm)); @@ -719,22 +613,11 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { // Finalise allocation, allocate empty ranges. finalizeAlloc(); - - rmf->renderMachineFunction("After PBQP register allocation.", vrm); - vregsToAlloc.clear(); emptyIntervalVRegs.clear(); DEBUG(dbgs() << "Post alloc VirtRegMap:\n" << *vrm << "\n"); - // Run rewriter - vrm->rewrite(lis->getSlotIndexes()); - - // All machine operands and other references to virtual registers have been - // replaced. Remove the virtual registers. - vrm->clearAllVirt(); - mri->clearVirtRegs(); - return true; } diff --git a/lib/CodeGen/RegisterClassInfo.cpp b/lib/CodeGen/RegisterClassInfo.cpp index 17165fa..652bc30 100644 --- a/lib/CodeGen/RegisterClassInfo.cpp +++ b/lib/CodeGen/RegisterClassInfo.cpp @@ -15,8 +15,8 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "regalloc" -#include "RegisterClassInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -50,9 +50,8 @@ void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) { CSRNum.clear(); CSRNum.resize(TRI->getNumRegs(), 0); for (unsigned N = 0; unsigned Reg = CSR[N]; ++N) - for (const uint16_t *AS = TRI->getOverlaps(Reg); - unsigned Alias = *AS; ++AS) - CSRNum[Alias] = N + 1; // 0 means no CSR, 1 means CalleeSaved[0], ... + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + CSRNum[*AI] = N + 1; // 0 means no CSR, 1 means CalleeSaved[0], ... 
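
The CSRNum map filled in above packs "no CSR" and "index into CalleeSaved[]" into one small integer by storing index+1 and reserving 0 for none, which is what the comment "0 means no CSR, 1 means CalleeSaved[0]" records and what getLastCalleeSavedAlias() decodes with N-1. A sketch of the encoding (hypothetical container types):

#include <cstdint>
#include <vector>

struct CSRMap {
  std::vector<uint8_t> CSRNum;       // reg -> CSR index + 1, 0 = none
  std::vector<uint16_t> CalleeSaved; // the CSR list itself

  unsigned lastCalleeSavedAlias(unsigned Reg) const {
    if (unsigned N = CSRNum[Reg])
      return CalleeSaved[N - 1];
    return 0; // Reg overlaps no callee-saved register
  }
};
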
Update = true; } CalleeSaved = CSR; diff --git a/lib/CodeGen/RegisterClassInfo.h b/lib/CodeGen/RegisterClassInfo.h deleted file mode 100644 index 400e1f4..0000000 --- a/lib/CodeGen/RegisterClassInfo.h +++ /dev/null @@ -1,132 +0,0 @@ -//===-- RegisterClassInfo.h - Dynamic Register Class Info -*- C++ -*-------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the RegisterClassInfo class which provides dynamic -// information about target register classes. Callee saved and reserved -// registers depends on calling conventions and other dynamic information, so -// some things cannot be determined statically. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_CODEGEN_REGISTERCLASSINFO_H -#define LLVM_CODEGEN_REGISTERCLASSINFO_H - -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/ADT/OwningPtr.h" -#include "llvm/Target/TargetRegisterInfo.h" - -namespace llvm { - -class RegisterClassInfo { - struct RCInfo { - unsigned Tag; - unsigned NumRegs; - bool ProperSubClass; - OwningArrayPtr<unsigned> Order; - - RCInfo() : Tag(0), NumRegs(0), ProperSubClass(false) {} - operator ArrayRef<unsigned>() const { - return makeArrayRef(Order.get(), NumRegs); - } - }; - - // Brief cached information for each register class. - OwningArrayPtr<RCInfo> RegClass; - - // Tag changes whenever cached information needs to be recomputed. An RCInfo - // entry is valid when its tag matches. - unsigned Tag; - - const MachineFunction *MF; - const TargetRegisterInfo *TRI; - - // Callee saved registers of last MF. Assumed to be valid until the next - // runOnFunction() call. - const uint16_t *CalleeSaved; - - // Map register number to CalleeSaved index + 1; - SmallVector<uint8_t, 4> CSRNum; - - // Reserved registers in the current MF. - BitVector Reserved; - - // Compute all information about RC. - void compute(const TargetRegisterClass *RC) const; - - // Return an up-to-date RCInfo for RC. - const RCInfo &get(const TargetRegisterClass *RC) const { - const RCInfo &RCI = RegClass[RC->getID()]; - if (Tag != RCI.Tag) - compute(RC); - return RCI; - } - -public: - RegisterClassInfo(); - - /// runOnFunction - Prepare to answer questions about MF. This must be called - /// before any other methods are used. - void runOnMachineFunction(const MachineFunction &MF); - - /// getNumAllocatableRegs - Returns the number of actually allocatable - /// registers in RC in the current function. - unsigned getNumAllocatableRegs(const TargetRegisterClass *RC) const { - return get(RC).NumRegs; - } - - /// getOrder - Returns the preferred allocation order for RC. The order - /// contains no reserved registers, and registers that alias callee saved - /// registers come last. - ArrayRef<unsigned> getOrder(const TargetRegisterClass *RC) const { - return get(RC); - } - - /// isProperSubClass - Returns true if RC has a legal super-class with more - /// allocatable registers. - /// - /// Register classes like GR32_NOSP are not proper sub-classes because %esp - /// is not allocatable. Similarly, tGPR is not a proper sub-class in Thumb - /// mode because the GPR super-class is not legal. 
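
getOrder(), documented in the header being deleted above, promises an allocation order with no reserved registers and with CSR-aliasing registers last, so cheap registers are tried first. One way to get that ordering while preserving the target's relative order is a stable partition; a minimal sketch under the assumption that AliasesCSR is indexed by register number (the real computation also drops reserved registers):

#include <algorithm>
#include <vector>

std::vector<unsigned> computeOrder(std::vector<unsigned> Order,
                                   const std::vector<bool> &AliasesCSR) {
  std::stable_partition(Order.begin(), Order.end(),
                        [&](unsigned Reg) { return !AliasesCSR[Reg]; });
  return Order;
}
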
- bool isProperSubClass(const TargetRegisterClass *RC) const { - return get(RC).ProperSubClass; - } - - /// getLastCalleeSavedAlias - Returns the last callee saved register that - /// overlaps PhysReg, or 0 if Reg doesn't overlap a CSR. - unsigned getLastCalleeSavedAlias(unsigned PhysReg) const { - assert(TargetRegisterInfo::isPhysicalRegister(PhysReg)); - if (unsigned N = CSRNum[PhysReg]) - return CalleeSaved[N-1]; - return 0; - } - - /// isReserved - Returns true when PhysReg is a reserved register. - /// - /// Reserved registers may belong to an allocatable register class, but the - /// target has explicitly requested that they are not used. - /// - bool isReserved(unsigned PhysReg) const { - return Reserved.test(PhysReg); - } - - /// isAllocatable - Returns true when PhysReg belongs to an allocatable - /// register class and it hasn't been reserved. - /// - /// Allocatable registers may show up in the allocation order of some virtual - /// register, so a register allocator needs to track its liveness and - /// availability. - bool isAllocatable(unsigned PhysReg) const { - return TRI->isInAllocatableClass(PhysReg) && !isReserved(PhysReg); - } -}; -} // end namespace llvm - -#endif - diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp index 75f88ca..733312f 100644 --- a/lib/CodeGen/RegisterCoalescer.cpp +++ b/lib/CodeGen/RegisterCoalescer.cpp @@ -16,34 +16,35 @@ #define DEBUG_TYPE "regalloc" #include "RegisterCoalescer.h" #include "LiveDebugVariables.h" -#include "RegisterClassInfo.h" #include "VirtRegMap.h" #include "llvm/Pass.h" #include "llvm/Value.h" +#include "llvm/ADT/OwningPtr.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/LiveRangeEdit.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetOptions.h" +#include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/OwningPtr.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetRegisterInfo.h" #include <algorithm> #include <cmath> using namespace llvm; @@ -53,8 +54,6 @@ STATISTIC(numCrossRCs , "Number of cross class joins performed"); STATISTIC(numCommutes , "Number of instruction commuting performed"); STATISTIC(numExtends , "Number of copies extended"); STATISTIC(NumReMats , "Number of instructions re-materialized"); -STATISTIC(numPeep , "Number of identity moves eliminated after coalescing"); -STATISTIC(numAborts , "Number of times interval joining 
aborted"); STATISTIC(NumInflated , "Number of register classes inflated"); static cl::opt<bool> @@ -63,22 +62,13 @@ EnableJoining("join-liveintervals", cl::init(true)); static cl::opt<bool> -DisableCrossClassJoin("disable-cross-class-join", - cl::desc("Avoid coalescing cross register class copies"), - cl::init(false), cl::Hidden); - -static cl::opt<bool> -EnablePhysicalJoin("join-physregs", - cl::desc("Join physical register copies"), - cl::init(false), cl::Hidden); - -static cl::opt<bool> VerifyCoalescing("verify-coalescing", cl::desc("Verify machine instrs before and after register coalescing"), cl::Hidden); namespace { - class RegisterCoalescer : public MachineFunctionPass { + class RegisterCoalescer : public MachineFunctionPass, + private LiveRangeEdit::Delegate { MachineFunction* MF; MachineRegisterInfo* MRI; const TargetMachine* TM; @@ -90,87 +80,83 @@ namespace { AliasAnalysis *AA; RegisterClassInfo RegClassInfo; - /// JoinedCopies - Keep track of copies eliminated due to coalescing. - /// - SmallPtrSet<MachineInstr*, 32> JoinedCopies; + /// WorkList - Copy instructions yet to be coalesced. + SmallVector<MachineInstr*, 8> WorkList; + + /// ErasedInstrs - Set of instruction pointers that have been erased, and + /// that may be present in WorkList. + SmallPtrSet<MachineInstr*, 8> ErasedInstrs; + + /// Dead instructions that are about to be deleted. + SmallVector<MachineInstr*, 8> DeadDefs; - /// ReMatCopies - Keep track of copies eliminated due to remat. - /// - SmallPtrSet<MachineInstr*, 32> ReMatCopies; + /// Virtual registers to be considered for register class inflation. + SmallVector<unsigned, 8> InflateRegs; - /// ReMatDefs - Keep track of definition instructions which have - /// been remat'ed. - SmallPtrSet<MachineInstr*, 8> ReMatDefs; + /// Recursively eliminate dead defs in DeadDefs. + void eliminateDeadDefs(); - /// joinIntervals - join compatible live intervals - void joinIntervals(); + /// LiveRangeEdit callback. + void LRE_WillEraseInstruction(MachineInstr *MI); - /// CopyCoalesceInMBB - Coalesce copies in the specified MBB, putting - /// copies that cannot yet be coalesced into the "TryAgain" list. - void CopyCoalesceInMBB(MachineBasicBlock *MBB, - std::vector<MachineInstr*> &TryAgain); + /// joinAllIntervals - join compatible live intervals + void joinAllIntervals(); - /// JoinCopy - Attempt to join intervals corresponding to SrcReg/DstReg, + /// copyCoalesceInMBB - Coalesce copies in the specified MBB, putting + /// copies that cannot yet be coalesced into WorkList. + void copyCoalesceInMBB(MachineBasicBlock *MBB); + + /// copyCoalesceWorkList - Try to coalesce all copies in WorkList after + /// position From. Return true if any progress was made. + bool copyCoalesceWorkList(unsigned From = 0); + + /// joinCopy - Attempt to join intervals corresponding to SrcReg/DstReg, /// which are the src/dst of the copy instruction CopyMI. This returns /// true if the copy was successfully coalesced away. If it is not /// currently possible to coalesce this interval, but it may be possible if /// other things get coalesced, then it returns true by reference in /// 'Again'. - bool JoinCopy(MachineInstr *TheCopy, bool &Again); + bool joinCopy(MachineInstr *TheCopy, bool &Again); - /// JoinIntervals - Attempt to join these two intervals. On failure, this + /// joinIntervals - Attempt to join these two intervals. On failure, this /// returns false. The output "SrcInt" will not have been modified, so we /// can use this information below to update aliases. 
- bool JoinIntervals(CoalescerPair &CP); + bool joinIntervals(CoalescerPair &CP); - /// AdjustCopiesBackFrom - We found a non-trivially-coalescable copy. If + /// Attempt joining with a reserved physreg. + bool joinReservedPhysReg(CoalescerPair &CP); + + /// adjustCopiesBackFrom - We found a non-trivially-coalescable copy. If /// the source value number is defined by a copy from the destination reg /// see if we can merge these two destination reg valno# into a single /// value number, eliminating a copy. - bool AdjustCopiesBackFrom(const CoalescerPair &CP, MachineInstr *CopyMI); + bool adjustCopiesBackFrom(const CoalescerPair &CP, MachineInstr *CopyMI); - /// HasOtherReachingDefs - Return true if there are definitions of IntB + /// hasOtherReachingDefs - Return true if there are definitions of IntB /// other than BValNo val# that can reach uses of AValno val# of IntA. - bool HasOtherReachingDefs(LiveInterval &IntA, LiveInterval &IntB, + bool hasOtherReachingDefs(LiveInterval &IntA, LiveInterval &IntB, VNInfo *AValNo, VNInfo *BValNo); - /// RemoveCopyByCommutingDef - We found a non-trivially-coalescable copy. + /// removeCopyByCommutingDef - We found a non-trivially-coalescable copy. /// If the source value number is defined by a commutable instruction and /// its other operand is coalesced to the copy dest register, see if we /// can transform the copy into a noop by commuting the definition. - bool RemoveCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI); + bool removeCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI); - /// ReMaterializeTrivialDef - If the source of a copy is defined by a + /// reMaterializeTrivialDef - If the source of a copy is defined by a /// trivial computation, replace the copy by rematerialize the definition. - /// If PreserveSrcInt is true, make sure SrcInt is valid after the call. - bool ReMaterializeTrivialDef(LiveInterval &SrcInt, bool PreserveSrcInt, - unsigned DstReg, MachineInstr *CopyMI); - - /// shouldJoinPhys - Return true if a physreg copy should be joined. - bool shouldJoinPhys(CoalescerPair &CP); - - /// isWinToJoinCrossClass - Return true if it's profitable to coalesce - /// two virtual registers from different register classes. - bool isWinToJoinCrossClass(unsigned SrcReg, - unsigned DstReg, - const TargetRegisterClass *SrcRC, - const TargetRegisterClass *DstRC, - const TargetRegisterClass *NewRC); - - /// UpdateRegDefsUses - Replace all defs and uses of SrcReg to DstReg and + bool reMaterializeTrivialDef(LiveInterval &SrcInt, unsigned DstReg, + MachineInstr *CopyMI); + + /// canJoinPhys - Return true if a physreg copy should be joined. + bool canJoinPhys(CoalescerPair &CP); + + /// updateRegDefsUses - Replace all defs and uses of SrcReg to DstReg and /// update the subregister number if it is not zero. If DstReg is a /// physical register and the existing subregister number of the def / use /// being updated is not zero, make sure to set it to the correct physical /// subregister. - void UpdateRegDefsUses(const CoalescerPair &CP); - - /// RemoveDeadDef - If a def of a live interval is now determined dead, - /// remove the val# it defines. If the live interval becomes empty, remove - /// it as well. - bool RemoveDeadDef(LiveInterval &li, MachineInstr *DefMI); - - /// markAsJoined - Remember that CopyMI has already been joined. - void markAsJoined(MachineInstr *CopyMI); + void updateRegDefsUses(unsigned SrcReg, unsigned DstReg, unsigned SubIdx); /// eliminateUndefCopy - Handle copies of undef values. 
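// [Editor's gloss] A copy of an <undef> value reads a register that has no
// live definition at that point (the use operand carries an <undef> flag);
// there is nothing to join, so joinCopy() below simply erases the
// instruction once eliminateUndefCopy() confirms the case.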
bool eliminateUndefCopy(MachineInstr *CopyMI, const CoalescerPair &CP); @@ -233,7 +219,8 @@ static bool isMoveInstr(const TargetRegisterInfo &tri, const MachineInstr *MI, } bool CoalescerPair::setRegisters(const MachineInstr *MI) { - SrcReg = DstReg = SubIdx = 0; + SrcReg = DstReg = 0; + SrcIdx = DstIdx = 0; NewRC = 0; Flipped = CrossClass = false; @@ -271,39 +258,44 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) { } } else { // Both registers are virtual. + const TargetRegisterClass *SrcRC = MRI.getRegClass(Src); + const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); // Both registers have subreg indices. if (SrcSub && DstSub) { - // For now we only handle the case of identical indices in commensurate - // registers: Dreg:ssub_1 + Dreg:ssub_1 -> Dreg - // FIXME: Handle Qreg:ssub_3 + Dreg:ssub_1 as QReg:dsub_1 + Dreg. - if (SrcSub != DstSub) + // Copies between different sub-registers are never coalescable. + if (Src == Dst && SrcSub != DstSub) return false; - const TargetRegisterClass *SrcRC = MRI.getRegClass(Src); - const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); - if (!TRI.getCommonSubClass(DstRC, SrcRC)) + + NewRC = TRI.getCommonSuperRegClass(SrcRC, SrcSub, DstRC, DstSub, + SrcIdx, DstIdx); + if (!NewRC) return false; - SrcSub = DstSub = 0; + } else if (DstSub) { + // SrcReg will be merged with a sub-register of DstReg. + SrcIdx = DstSub; + NewRC = TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub); + } else if (SrcSub) { + // DstReg will be merged with a sub-register of SrcReg. + DstIdx = SrcSub; + NewRC = TRI.getMatchingSuperRegClass(SrcRC, DstRC, SrcSub); + } else { + // This is a straight copy without sub-registers. + NewRC = TRI.getCommonSubClass(DstRC, SrcRC); } - // There can be no SrcSub. - if (SrcSub) { + // The combined constraint may be impossible to satisfy. + if (!NewRC) + return false; + + // Prefer SrcReg to be a sub-register of DstReg. + // FIXME: Coalescer should support subregs symmetrically. + if (DstIdx && !SrcIdx) { std::swap(Src, Dst); - DstSub = SrcSub; - SrcSub = 0; - assert(!Flipped && "Unexpected flip"); - Flipped = true; + std::swap(SrcIdx, DstIdx); + Flipped = !Flipped; } - // Find the new register class. - const TargetRegisterClass *SrcRC = MRI.getRegClass(Src); - const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); - if (DstSub) - NewRC = TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub); - else - NewRC = TRI.getCommonSubClass(DstRC, SrcRC); - if (!NewRC) - return false; CrossClass = NewRC != DstRC || NewRC != SrcRC; } // Check our invariants @@ -312,14 +304,14 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) { "Cannot have a physical SubIdx"); SrcReg = Src; DstReg = Dst; - SubIdx = DstSub; return true; } bool CoalescerPair::flip() { - if (SubIdx || TargetRegisterInfo::isPhysicalRegister(DstReg)) + if (TargetRegisterInfo::isPhysicalRegister(DstReg)) return false; std::swap(SrcReg, DstReg); + std::swap(SrcIdx, DstIdx); Flipped = !Flipped; return true; } @@ -343,7 +335,7 @@ bool CoalescerPair::isCoalescable(const MachineInstr *MI) const { if (TargetRegisterInfo::isPhysicalRegister(DstReg)) { if (!TargetRegisterInfo::isPhysicalRegister(Dst)) return false; - assert(!SubIdx && "Inconsistent CoalescerPair state."); + assert(!DstIdx && !SrcIdx && "Inconsistent CoalescerPair state."); // DstSub could be set for a physreg from INSERT_SUBREG. 
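// [Editor's note] TRI.getSubReg maps (physreg, sub-index) to the concrete
// aliasing physreg; on ARM, for instance, getSubReg(ARM::Q0, ARM::dsub_1)
// is ARM::D1. That is why a physreg DstSub can simply be folded into Dst: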
if (DstSub) Dst = TRI.getSubReg(Dst, DstSub); @@ -357,7 +349,7 @@ bool CoalescerPair::isCoalescable(const MachineInstr *MI) const { if (DstReg != Dst) return false; // Registers match, do the subregisters line up? - return compose(TRI, SubIdx, SrcSub) == DstSub; + return compose(TRI, SrcIdx, SrcSub) == compose(TRI, DstIdx, DstSub); } } @@ -375,19 +367,18 @@ void RegisterCoalescer::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } -void RegisterCoalescer::markAsJoined(MachineInstr *CopyMI) { - /// Joined copies are not deleted immediately, but kept in JoinedCopies. - JoinedCopies.insert(CopyMI); +void RegisterCoalescer::eliminateDeadDefs() { + SmallVector<LiveInterval*, 8> NewRegs; + LiveRangeEdit(0, NewRegs, *MF, *LIS, 0, this).eliminateDeadDefs(DeadDefs); +} - /// Mark all register operands of CopyMI as <undef> so they won't affect dead - /// code elimination. - for (MachineInstr::mop_iterator I = CopyMI->operands_begin(), - E = CopyMI->operands_end(); I != E; ++I) - if (I->isReg()) - I->setIsUndef(true); +// Callback from eliminateDeadDefs(). +void RegisterCoalescer::LRE_WillEraseInstruction(MachineInstr *MI) { + // MI may be in WorkList. Make sure we don't visit it. + ErasedInstrs.insert(MI); } -/// AdjustCopiesBackFrom - We found a non-trivially-coalescable copy with IntA +/// adjustCopiesBackFrom - We found a non-trivially-coalescable copy with IntA /// being the source and IntB being the dest, thus this defines a value number /// in IntB. If the source value number (in IntA) is defined by a copy from B, /// see if we can merge these two pieces of B into a single value number, @@ -402,12 +393,10 @@ void RegisterCoalescer::markAsJoined(MachineInstr *CopyMI) { /// /// This returns true if an interval was modified. /// -bool RegisterCoalescer::AdjustCopiesBackFrom(const CoalescerPair &CP, - MachineInstr *CopyMI) { - // Bail if there is no dst interval - can happen when merging physical subreg - // operations. - if (!LIS->hasInterval(CP.getDstReg())) - return false; +bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP, + MachineInstr *CopyMI) { + assert(!CP.isPartial() && "This doesn't work for partial copies."); + assert(!CP.isPhys() && "This doesn't work for physreg copies."); LiveInterval &IntA = LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg()); @@ -457,24 +446,7 @@ bool RegisterCoalescer::AdjustCopiesBackFrom(const CoalescerPair &CP, // IntB, we can merge them. if (ValLR+1 != BLR) return false; - // If a live interval is a physical register, conservatively check if any - // of its aliases is overlapping the live interval of the virtual register. - // If so, do not coalesce. - if (TargetRegisterInfo::isPhysicalRegister(IntB.reg)) { - for (const uint16_t *AS = TRI->getAliasSet(IntB.reg); *AS; ++AS) - if (LIS->hasInterval(*AS) && IntA.overlaps(LIS->getInterval(*AS))) { - DEBUG({ - dbgs() << "\t\tInterfere with alias "; - LIS->getInterval(*AS).print(dbgs(), TRI); - }); - return false; - } - } - - DEBUG({ - dbgs() << "Extending: "; - IntB.print(dbgs(), TRI); - }); + DEBUG(dbgs() << "Extending: " << PrintReg(IntB.reg, TRI)); SlotIndex FillerStart = ValLR->end, FillerEnd = BLR->start; // We are about to delete CopyMI, so need to remove it as the 'instruction @@ -487,19 +459,6 @@ bool RegisterCoalescer::AdjustCopiesBackFrom(const CoalescerPair &CP, // two value numbers. 
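// [Editor's illustration; instruction placement is schematic]
//
//   A3 = B0
//   ...              <- filler range [ValLR->end, BLR->start)
//   B1 = A3          <- CopyMI, defines BLR
//
// The gap between the end of B0's live range and the start of B1's is
// exactly [FillerStart, FillerEnd); adding it to IntB lets the two pieces
// of B share a single value number: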
IntB.addRange(LiveRange(FillerStart, FillerEnd, BValNo)); - // If the IntB live range is assigned to a physical register, and if that - // physreg has sub-registers, update their live intervals as well. - if (TargetRegisterInfo::isPhysicalRegister(IntB.reg)) { - for (const uint16_t *SR = TRI->getSubRegisters(IntB.reg); *SR; ++SR) { - if (!LIS->hasInterval(*SR)) - continue; - LiveInterval &SRLI = LIS->getInterval(*SR); - SRLI.addRange(LiveRange(FillerStart, FillerEnd, - SRLI.getNextValue(FillerStart, - LIS->getVNInfoAllocator()))); - } - } - // Okay, merge "B1" into the same value number as "B0". if (BValNo != ValLR->valno) { // If B1 is killed by a PHI, then the merged live range must also be killed @@ -509,11 +468,7 @@ bool RegisterCoalescer::AdjustCopiesBackFrom(const CoalescerPair &CP, if (HasPHIKill) ValLR->valno->setHasPHIKill(true); } - DEBUG({ - dbgs() << " result = "; - IntB.print(dbgs(), TRI); - dbgs() << "\n"; - }); + DEBUG(dbgs() << " result = " << IntB << '\n'); // If the source instruction was killing the source register before the // merge, unset the isKill marker given the live range has been extended. @@ -525,8 +480,7 @@ bool RegisterCoalescer::AdjustCopiesBackFrom(const CoalescerPair &CP, // Rewrite the copy. If the copy instruction was killing the destination // register before the merge, find the last use and trim the live range. That // will also add the isKill marker. - CopyMI->substituteRegister(IntA.reg, IntB.reg, CP.getSubIdx(), - *TRI); + CopyMI->substituteRegister(IntA.reg, IntB.reg, 0, *TRI); if (ALR->end == CopyIdx) LIS->shrinkToUses(&IntA); @@ -534,12 +488,12 @@ bool RegisterCoalescer::AdjustCopiesBackFrom(const CoalescerPair &CP, return true; } -/// HasOtherReachingDefs - Return true if there are definitions of IntB +/// hasOtherReachingDefs - Return true if there are definitions of IntB /// other than BValNo val# that can reach uses of AValno val# of IntA. -bool RegisterCoalescer::HasOtherReachingDefs(LiveInterval &IntA, - LiveInterval &IntB, - VNInfo *AValNo, - VNInfo *BValNo) { +bool RegisterCoalescer::hasOtherReachingDefs(LiveInterval &IntA, + LiveInterval &IntB, + VNInfo *AValNo, + VNInfo *BValNo) { for (LiveInterval::iterator AI = IntA.begin(), AE = IntA.end(); AI != AE; ++AI) { if (AI->valno != AValNo) continue; @@ -559,7 +513,7 @@ bool RegisterCoalescer::HasOtherReachingDefs(LiveInterval &IntA, return false; } -/// RemoveCopyByCommutingDef - We found a non-trivially-coalescable copy with +/// removeCopyByCommutingDef - We found a non-trivially-coalescable copy with /// IntA being the source and IntB being the dest, thus this defines a value /// number in IntB. If the source value number (in IntA) is defined by a /// commutable instruction and its other operand is coalesced to the copy dest @@ -582,18 +536,9 @@ bool RegisterCoalescer::HasOtherReachingDefs(LiveInterval &IntA, /// /// This returns true if an interval was modified. /// -bool RegisterCoalescer::RemoveCopyByCommutingDef(const CoalescerPair &CP, - MachineInstr *CopyMI) { - // FIXME: For now, only eliminate the copy by commuting its def when the - // source register is a virtual register. We want to guard against cases - // where the copy is a back edge copy and commuting the def lengthen the - // live interval of the source register to the entire loop. - if (CP.isPhys() && CP.isFlipped()) - return false; - - // Bail if there is no dst interval. 
- if (!LIS->hasInterval(CP.getDstReg())) - return false; +bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, + MachineInstr *CopyMI) { + assert (!CP.isPhys()); SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot(); @@ -647,17 +592,9 @@ bool RegisterCoalescer::RemoveCopyByCommutingDef(const CoalescerPair &CP, // Make sure there are no other definitions of IntB that would reach the // uses which the new definition can reach. - if (HasOtherReachingDefs(IntA, IntB, AValNo, BValNo)) + if (hasOtherReachingDefs(IntA, IntB, AValNo, BValNo)) return false; - // Abort if the aliases of IntB.reg have values that are not simply the - // clobbers from the superreg. - if (TargetRegisterInfo::isPhysicalRegister(IntB.reg)) - for (const uint16_t *AS = TRI->getAliasSet(IntB.reg); *AS; ++AS) - if (LIS->hasInterval(*AS) && - HasOtherReachingDefs(IntA, LIS->getInterval(*AS), AValNo, 0)) - return false; - // If some of the uses of IntA.reg is already coalesced away, return false. // It's not possible to determine whether it's safe to perform the coalescing. for (MachineRegisterInfo::use_nodbg_iterator UI = @@ -666,13 +603,14 @@ bool RegisterCoalescer::RemoveCopyByCommutingDef(const CoalescerPair &CP, MachineInstr *UseMI = &*UI; SlotIndex UseIdx = LIS->getInstructionIndex(UseMI); LiveInterval::iterator ULR = IntA.FindLiveRangeContaining(UseIdx); - if (ULR == IntA.end()) + if (ULR == IntA.end() || ULR->valno != AValNo) continue; - if (ULR->valno == AValNo && JoinedCopies.count(UseMI)) + // If this use is tied to a def, we can't rewrite the register. + if (UseMI->isRegTiedToDefOperand(UI.getOperandNo())) return false; } - DEBUG(dbgs() << "\tRemoveCopyByCommutingDef: " << AValNo->def << '\t' + DEBUG(dbgs() << "\tremoveCopyByCommutingDef: " << AValNo->def << '\t' << *DefMI); // At this point we have decided that it is legal to do this @@ -709,8 +647,6 @@ bool RegisterCoalescer::RemoveCopyByCommutingDef(const CoalescerPair &CP, MachineOperand &UseMO = UI.getOperand(); MachineInstr *UseMI = &*UI; ++UI; - if (JoinedCopies.count(UseMI)) - continue; if (UseMI->isDebugValue()) { // FIXME These don't have an instruction index. Not clear we have enough // info to decide whether to do this replacement or not. For now do it. @@ -742,7 +678,9 @@ bool RegisterCoalescer::RemoveCopyByCommutingDef(const CoalescerPair &CP, DEBUG(dbgs() << "\t\tnoop: " << DefIdx << '\t' << *UseMI); assert(DVNI->def == DefIdx); BValNo = IntB.MergeValueNumberInto(BValNo, DVNI); - markAsJoined(UseMI); + ErasedInstrs.insert(UseMI); + LIS->RemoveMachineInstrFromMaps(UseMI); + UseMI->eraseFromParent(); } // Extend BValNo by merging in IntA live ranges of AValNo. Val# definition @@ -762,12 +700,11 @@ bool RegisterCoalescer::RemoveCopyByCommutingDef(const CoalescerPair &CP, return true; } -/// ReMaterializeTrivialDef - If the source of a copy is defined by a trivial +/// reMaterializeTrivialDef - If the source of a copy is defined by a trivial /// computation, replace the copy by rematerialize the definition. 
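// [Editor's sketch; the opcode and vreg numbers are illustrative only]
//
//   %vreg1<def> = MOV32ri 42        ; trivially rematerializable def
//   ...
//   %vreg2<def> = COPY %vreg1       ; CopyMI
//
// becomes
//
//   %vreg1<def> = MOV32ri 42
//   ...
//   %vreg2<def> = MOV32ri 42        ; fresh def, copy erased
//
// after which SrcInt is shrunk; if %vreg1's original def just went dead it
// lands in DeadDefs and eliminateDeadDefs() cleans it up.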
-bool RegisterCoalescer::ReMaterializeTrivialDef(LiveInterval &SrcInt, - bool preserveSrcInt, - unsigned DstReg, - MachineInstr *CopyMI) { +bool RegisterCoalescer::reMaterializeTrivialDef(LiveInterval &SrcInt, + unsigned DstReg, + MachineInstr *CopyMI) { SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot(true); LiveInterval::iterator SrcLR = SrcInt.FindLiveRangeContaining(CopyIdx); assert(SrcLR != SrcInt.end() && "Live range not found!"); @@ -792,7 +729,7 @@ bool RegisterCoalescer::ReMaterializeTrivialDef(LiveInterval &SrcInt, // Make sure the copy destination register class fits the instruction // definition register class. The mismatch can happen as a result of earlier // extract_subreg, insert_subreg, subreg_to_reg coalescing. - const TargetRegisterClass *RC = TII->getRegClass(MCID, 0, TRI); + const TargetRegisterClass *RC = TII->getRegClass(MCID, 0, TRI, *MF); if (TargetRegisterInfo::isVirtualRegister(DstReg)) { if (MRI->getRegClass(DstReg) != RC) return false; @@ -838,23 +775,21 @@ bool RegisterCoalescer::ReMaterializeTrivialDef(LiveInterval &SrcInt, SlotIndex NewMIIdx = LIS->getInstructionIndex(NewMI); for (unsigned i = 0, e = NewMIImplDefs.size(); i != e; ++i) { - unsigned reg = NewMIImplDefs[i]; - LiveInterval &li = LIS->getInterval(reg); - VNInfo *DeadDefVN = li.getNextValue(NewMIIdx.getRegSlot(), - LIS->getVNInfoAllocator()); - LiveRange lr(NewMIIdx.getRegSlot(), NewMIIdx.getDeadSlot(), DeadDefVN); - li.addRange(lr); + unsigned Reg = NewMIImplDefs[i]; + for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) + if (LiveInterval *LI = LIS->getCachedRegUnit(*Units)) + LI->createDeadDef(NewMIIdx.getRegSlot(), LIS->getVNInfoAllocator()); } CopyMI->eraseFromParent(); - ReMatCopies.insert(CopyMI); - ReMatDefs.insert(DefMI); + ErasedInstrs.insert(CopyMI); DEBUG(dbgs() << "Remat: " << *NewMI); ++NumReMats; // The source interval can become smaller because we removed a use. - if (preserveSrcInt) - LIS->shrinkToUses(&SrcInt); + LIS->shrinkToUses(&SrcInt, &DeadDefs); + if (!DeadDefs.empty()) + eliminateDeadDefs(); return true; } @@ -902,51 +837,40 @@ bool RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI, return true; } -/// UpdateRegDefsUses - Replace all defs and uses of SrcReg to DstReg and +/// updateRegDefsUses - Replace all defs and uses of SrcReg to DstReg and /// update the subregister number if it is not zero. If DstReg is a /// physical register and the existing subregister number of the def / use /// being updated is not zero, make sure to set it to the correct physical /// subregister. -void -RegisterCoalescer::UpdateRegDefsUses(const CoalescerPair &CP) { - bool DstIsPhys = CP.isPhys(); - unsigned SrcReg = CP.getSrcReg(); - unsigned DstReg = CP.getDstReg(); - unsigned SubIdx = CP.getSubIdx(); +void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg, + unsigned DstReg, + unsigned SubIdx) { + bool DstIsPhys = TargetRegisterInfo::isPhysicalRegister(DstReg); + LiveInterval *DstInt = DstIsPhys ? 0 : &LIS->getInterval(DstReg); // Update LiveDebugVariables. LDV->renameRegister(SrcReg, DstReg, SubIdx); for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(SrcReg); MachineInstr *UseMI = I.skipInstruction();) { - // A PhysReg copy that won't be coalesced can perhaps be rematerialized - // instead. 
- if (DstIsPhys) { - if (UseMI->isFullCopy() && - UseMI->getOperand(1).getReg() == SrcReg && - UseMI->getOperand(0).getReg() != SrcReg && - UseMI->getOperand(0).getReg() != DstReg && - !JoinedCopies.count(UseMI) && - ReMaterializeTrivialDef(LIS->getInterval(SrcReg), false, - UseMI->getOperand(0).getReg(), UseMI)) - continue; - } - SmallVector<unsigned,8> Ops; bool Reads, Writes; tie(Reads, Writes) = UseMI->readsWritesVirtualRegister(SrcReg, &Ops); + // If SrcReg wasn't read, it may still be the case that DstReg is live-in + // because SrcReg is a sub-register. + if (DstInt && !Reads && SubIdx) + Reads = DstInt->liveAt(LIS->getInstructionIndex(UseMI)); + // Replace SrcReg with DstReg in all UseMI operands. for (unsigned i = 0, e = Ops.size(); i != e; ++i) { MachineOperand &MO = UseMI->getOperand(Ops[i]); - // Make sure we don't create read-modify-write defs accidentally. We - // assume here that a SrcReg def cannot be joined into a live DstReg. If - // RegisterCoalescer starts tracking partially live registers, we will - // need to check the actual LiveInterval to determine if DstReg is live - // here. - if (SubIdx && !Reads) - MO.setIsUndef(); + // Adjust <undef> flags in case of sub-register joins. We don't want to + // turn a full def into a read-modify-write sub-register def and vice + // versa. + if (SubIdx && MO.isDef()) + MO.setIsUndef(!Reads); if (DstIsPhys) MO.substPhysReg(DstReg, *TRI); @@ -954,10 +878,6 @@ RegisterCoalescer::UpdateRegDefsUses(const CoalescerPair &CP) { MO.substVirtReg(DstReg, SubIdx, *TRI); } - // This instruction is a copy that will be removed. - if (JoinedCopies.count(UseMI)) - continue; - DEBUG({ dbgs() << "\t\tupdated: "; if (!UseMI->isDebugValue()) @@ -967,210 +887,107 @@ RegisterCoalescer::UpdateRegDefsUses(const CoalescerPair &CP) { } } -/// removeIntervalIfEmpty - Check if the live interval of a physical register -/// is empty, if so remove it and also remove the empty intervals of its -/// sub-registers. Return true if live interval is removed. -static bool removeIntervalIfEmpty(LiveInterval &li, LiveIntervals *LIS, - const TargetRegisterInfo *TRI) { - if (li.empty()) { - if (TargetRegisterInfo::isPhysicalRegister(li.reg)) - for (const uint16_t* SR = TRI->getSubRegisters(li.reg); *SR; ++SR) { - if (!LIS->hasInterval(*SR)) - continue; - LiveInterval &sli = LIS->getInterval(*SR); - if (sli.empty()) - LIS->removeInterval(*SR); - } - LIS->removeInterval(li.reg); - return true; - } - return false; -} - -/// RemoveDeadDef - If a def of a live interval is now determined dead, remove -/// the val# it defines. If the live interval becomes empty, remove it as well. -bool RegisterCoalescer::RemoveDeadDef(LiveInterval &li, - MachineInstr *DefMI) { - SlotIndex DefIdx = LIS->getInstructionIndex(DefMI).getRegSlot(); - LiveInterval::iterator MLR = li.FindLiveRangeContaining(DefIdx); - if (DefIdx != MLR->valno->def) - return false; - li.removeValNo(MLR->valno); - return removeIntervalIfEmpty(li, LIS, TRI); -} - -/// shouldJoinPhys - Return true if a copy involving a physreg should be joined. -/// We need to be careful about coalescing a source physical register with a -/// virtual register. Once the coalescing is done, it cannot be broken and these -/// are not spillable! If the destination interval uses are far away, think -/// twice about coalescing them! 
-bool RegisterCoalescer::shouldJoinPhys(CoalescerPair &CP) { - bool Allocatable = LIS->isAllocatable(CP.getDstReg()); - LiveInterval &JoinVInt = LIS->getInterval(CP.getSrcReg()); - +/// canJoinPhys - Return true if a copy involving a physreg should be joined. +bool RegisterCoalescer::canJoinPhys(CoalescerPair &CP) { /// Always join simple intervals that are defined by a single copy from a /// reserved register. This doesn't increase register pressure, so it is /// always beneficial. - if (!Allocatable && CP.isFlipped() && JoinVInt.containsOneValue()) - return true; - - if (!EnablePhysicalJoin) { - DEBUG(dbgs() << "\tPhysreg joins disabled.\n"); + if (!RegClassInfo.isReserved(CP.getDstReg())) { + DEBUG(dbgs() << "\tCan only merge into reserved registers.\n"); return false; } - // Only coalesce to allocatable physreg, we don't want to risk modifying - // reserved registers. - if (!Allocatable) { - DEBUG(dbgs() << "\tRegister is an unallocatable physreg.\n"); - return false; // Not coalescable. - } - - // Don't join with physregs that have a ridiculous number of live - // ranges. The data structure performance is really bad when that - // happens. - if (LIS->hasInterval(CP.getDstReg()) && - LIS->getInterval(CP.getDstReg()).ranges.size() > 1000) { - ++numAborts; - DEBUG(dbgs() - << "\tPhysical register live interval too complicated, abort!\n"); - return false; - } - - // FIXME: Why are we skipping this test for partial copies? - // CodeGen/X86/phys_subreg_coalesce-3.ll needs it. - if (!CP.isPartial()) { - const TargetRegisterClass *RC = MRI->getRegClass(CP.getSrcReg()); - unsigned Threshold = RegClassInfo.getNumAllocatableRegs(RC) * 2; - unsigned Length = LIS->getApproximateInstructionCount(JoinVInt); - if (Length > Threshold) { - ++numAborts; - DEBUG(dbgs() << "\tMay tie down a physical register, abort!\n"); - return false; - } - } - return true; -} - -/// isWinToJoinCrossClass - Return true if it's profitable to coalesce -/// two virtual registers from different register classes. -bool -RegisterCoalescer::isWinToJoinCrossClass(unsigned SrcReg, - unsigned DstReg, - const TargetRegisterClass *SrcRC, - const TargetRegisterClass *DstRC, - const TargetRegisterClass *NewRC) { - unsigned NewRCCount = RegClassInfo.getNumAllocatableRegs(NewRC); - // This heuristics is good enough in practice, but it's obviously not *right*. - // 4 is a magic number that works well enough for x86, ARM, etc. It filter - // out all but the most restrictive register classes. - if (NewRCCount > 4 || - // Early exit if the function is fairly small, coalesce aggressively if - // that's the case. For really special register classes with 3 or - // fewer registers, be a bit more careful. - (LIS->getFuncInstructionCount() / NewRCCount) < 8) - return true; - LiveInterval &SrcInt = LIS->getInterval(SrcReg); - LiveInterval &DstInt = LIS->getInterval(DstReg); - unsigned SrcSize = LIS->getApproximateInstructionCount(SrcInt); - unsigned DstSize = LIS->getApproximateInstructionCount(DstInt); - - // Coalesce aggressively if the intervals are small compared to the number of - // registers in the new class. The number 4 is fairly arbitrary, chosen to be - // less aggressive than the 8 used for the whole function size. - const unsigned ThresSize = 4 * NewRCCount; - if (SrcSize <= ThresSize && DstSize <= ThresSize) + LiveInterval &JoinVInt = LIS->getInterval(CP.getSrcReg()); + if (CP.isFlipped() && JoinVInt.containsOneValue()) return true; - // Estimate *register use density*. If it doubles or more, abort. 
- unsigned SrcUses = std::distance(MRI->use_nodbg_begin(SrcReg), - MRI->use_nodbg_end()); - unsigned DstUses = std::distance(MRI->use_nodbg_begin(DstReg), - MRI->use_nodbg_end()); - unsigned NewUses = SrcUses + DstUses; - unsigned NewSize = SrcSize + DstSize; - if (SrcRC != NewRC && SrcSize > ThresSize) { - unsigned SrcRCCount = RegClassInfo.getNumAllocatableRegs(SrcRC); - if (NewUses*SrcSize*SrcRCCount > 2*SrcUses*NewSize*NewRCCount) - return false; - } - if (DstRC != NewRC && DstSize > ThresSize) { - unsigned DstRCCount = RegClassInfo.getNumAllocatableRegs(DstRC); - if (NewUses*DstSize*DstRCCount > 2*DstUses*NewSize*NewRCCount) - return false; - } - return true; + DEBUG(dbgs() << "\tCannot join defs into reserved register.\n"); + return false; } - -/// JoinCopy - Attempt to join intervals corresponding to SrcReg/DstReg, +/// joinCopy - Attempt to join intervals corresponding to SrcReg/DstReg, /// which are the src/dst of the copy instruction CopyMI. This returns true /// if the copy was successfully coalesced away. If it is not currently /// possible to coalesce this interval, but it may be possible if other /// things get coalesced, then it returns true by reference in 'Again'. -bool RegisterCoalescer::JoinCopy(MachineInstr *CopyMI, bool &Again) { +bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { Again = false; - if (JoinedCopies.count(CopyMI) || ReMatCopies.count(CopyMI)) - return false; // Already done. - DEBUG(dbgs() << LIS->getInstructionIndex(CopyMI) << '\t' << *CopyMI); - CoalescerPair CP(*TII, *TRI); + CoalescerPair CP(*TRI); if (!CP.setRegisters(CopyMI)) { DEBUG(dbgs() << "\tNot coalescable.\n"); return false; } - // If they are already joined we continue. - if (CP.getSrcReg() == CP.getDstReg()) { - markAsJoined(CopyMI); - DEBUG(dbgs() << "\tCopy already coalesced.\n"); - return false; // Not coalescable. + // Dead code elimination. This really should be handled by MachineDCE, but + // sometimes dead copies slip through, and we can't generate invalid live + // ranges. + if (!CP.isPhys() && CopyMI->allDefsAreDead()) { + DEBUG(dbgs() << "\tCopy is dead.\n"); + DeadDefs.push_back(CopyMI); + eliminateDeadDefs(); + return true; } // Eliminate undefs. if (!CP.isPhys() && eliminateUndefCopy(CopyMI, CP)) { - markAsJoined(CopyMI); DEBUG(dbgs() << "\tEliminated copy of <undef> value.\n"); + LIS->RemoveMachineInstrFromMaps(CopyMI); + CopyMI->eraseFromParent(); return false; // Not coalescable. } - DEBUG(dbgs() << "\tConsidering merging " << PrintReg(CP.getSrcReg(), TRI) - << " with " << PrintReg(CP.getDstReg(), TRI, CP.getSubIdx()) - << "\n"); + // Coalesced copies are normally removed immediately, but transformations + // like removeCopyByCommutingDef() can inadvertently create identity copies. + // When that happens, just join the values and remove the copy. + if (CP.getSrcReg() == CP.getDstReg()) { + LiveInterval &LI = LIS->getInterval(CP.getSrcReg()); + DEBUG(dbgs() << "\tCopy already coalesced: " << LI << '\n'); + LiveRangeQuery LRQ(LI, LIS->getInstructionIndex(CopyMI)); + if (VNInfo *DefVNI = LRQ.valueDefined()) { + VNInfo *ReadVNI = LRQ.valueIn(); + assert(ReadVNI && "No value before copy and no <undef> flag."); + assert(ReadVNI != DefVNI && "Cannot read and define the same value."); + LI.MergeValueNumberInto(DefVNI, ReadVNI); + DEBUG(dbgs() << "\tMerged values: " << LI << '\n'); + } + LIS->RemoveMachineInstrFromMaps(CopyMI); + CopyMI->eraseFromParent(); + return true; + } // Enforce policies. 
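// [Editor's note] At this point the cheap cases are done: fully dead copies
// went through eliminateDeadDefs(), copies of <undef> values were erased,
// and identity copies (SrcReg == DstReg, e.g. left behind by
// removeCopyByCommutingDef) had their two value numbers merged. The policy
// below is what remains: physreg joins are only attempted into reserved
// registers; everything else falls through to the full interval join.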
if (CP.isPhys()) { - if (!shouldJoinPhys(CP)) { + DEBUG(dbgs() << "\tConsidering merging " << PrintReg(CP.getSrcReg(), TRI) + << " with " << PrintReg(CP.getDstReg(), TRI, CP.getSrcIdx()) + << '\n'); + if (!canJoinPhys(CP)) { // Before giving up coalescing, if definition of source is defined by // trivial computation, try rematerializing it. if (!CP.isFlipped() && - ReMaterializeTrivialDef(LIS->getInterval(CP.getSrcReg()), true, + reMaterializeTrivialDef(LIS->getInterval(CP.getSrcReg()), CP.getDstReg(), CopyMI)) return true; return false; } } else { - // Avoid constraining virtual register regclass too much. - if (CP.isCrossClass()) { - DEBUG(dbgs() << "\tCross-class to " << CP.getNewRC()->getName() << ".\n"); - if (DisableCrossClassJoin) { - DEBUG(dbgs() << "\tCross-class joins disabled.\n"); - return false; - } - if (!isWinToJoinCrossClass(CP.getSrcReg(), CP.getDstReg(), - MRI->getRegClass(CP.getSrcReg()), - MRI->getRegClass(CP.getDstReg()), - CP.getNewRC())) { - DEBUG(dbgs() << "\tAvoid coalescing to constrained register class.\n"); - Again = true; // May be possible to coalesce later. - return false; - } - } + DEBUG({ + dbgs() << "\tConsidering merging to " << CP.getNewRC()->getName() + << " with "; + if (CP.getDstIdx() && CP.getSrcIdx()) + dbgs() << PrintReg(CP.getDstReg()) << " in " + << TRI->getSubRegIndexName(CP.getDstIdx()) << " and " + << PrintReg(CP.getSrcReg()) << " in " + << TRI->getSubRegIndexName(CP.getSrcIdx()) << '\n'; + else + dbgs() << PrintReg(CP.getSrcReg(), TRI) << " in " + << PrintReg(CP.getDstReg(), TRI, CP.getSrcIdx()) << '\n'; + }); // When possible, let DstReg be the larger interval. - if (!CP.getSubIdx() && LIS->getInterval(CP.getSrcReg()).ranges.size() > + if (!CP.isPartial() && LIS->getInterval(CP.getSrcReg()).ranges.size() > LIS->getInterval(CP.getDstReg()).ranges.size()) CP.flip(); } @@ -1179,21 +996,22 @@ bool RegisterCoalescer::JoinCopy(MachineInstr *CopyMI, bool &Again) { // Otherwise, if one of the intervals being joined is a physreg, this method // always canonicalizes DstInt to be it. The output "SrcInt" will not have // been modified, so we can use this information below to update aliases. - if (!JoinIntervals(CP)) { + if (!joinIntervals(CP)) { // Coalescing failed. // If definition of source is defined by trivial computation, try // rematerializing it. if (!CP.isFlipped() && - ReMaterializeTrivialDef(LIS->getInterval(CP.getSrcReg()), true, + reMaterializeTrivialDef(LIS->getInterval(CP.getSrcReg()), CP.getDstReg(), CopyMI)) return true; // If we can eliminate the copy without merging the live ranges, do so now. - if (!CP.isPartial()) { - if (AdjustCopiesBackFrom(CP, CopyMI) || - RemoveCopyByCommutingDef(CP, CopyMI)) { - markAsJoined(CopyMI); + if (!CP.isPartial() && !CP.isPhys()) { + if (adjustCopiesBackFrom(CP, CopyMI) || + removeCopyByCommutingDef(CP, CopyMI)) { + LIS->RemoveMachineInstrFromMaps(CopyMI); + CopyMI->eraseFromParent(); DEBUG(dbgs() << "\tTrivial!\n"); return true; } @@ -1212,29 +1030,21 @@ bool RegisterCoalescer::JoinCopy(MachineInstr *CopyMI, bool &Again) { MRI->setRegClass(CP.getDstReg(), CP.getNewRC()); } - // Remember to delete the copy instruction. - markAsJoined(CopyMI); + // Removing sub-register copies can ease the register class constraints. + // Make sure we attempt to inflate the register class of DstReg. + if (!CP.isPhys() && RegClassInfo.isProperSubClass(CP.getNewRC())) + InflateRegs.push_back(CP.getDstReg()); - UpdateRegDefsUses(CP); + // CopyMI has been erased by joinIntervals at this point. 
Remove it from + // ErasedInstrs since copyCoalesceWorkList() won't add a successful join back + // to the work list. This keeps ErasedInstrs from growing needlessly. + ErasedInstrs.erase(CopyMI); - // If we have extended the live range of a physical register, make sure we - // update live-in lists as well. - if (CP.isPhys()) { - SmallVector<MachineBasicBlock*, 16> BlockSeq; - // JoinIntervals invalidates the VNInfos in SrcInt, but we only need the - // ranges for this, and they are preserved. - LiveInterval &SrcInt = LIS->getInterval(CP.getSrcReg()); - for (LiveInterval::const_iterator I = SrcInt.begin(), E = SrcInt.end(); - I != E; ++I ) { - LIS->findLiveInMBBs(I->start, I->end, BlockSeq); - for (unsigned idx = 0, size = BlockSeq.size(); idx != size; ++idx) { - MachineBasicBlock &block = *BlockSeq[idx]; - if (!block.isLiveIn(CP.getDstReg())) - block.addLiveIn(CP.getDstReg()); - } - BlockSeq.clear(); - } - } + // Rewrite all SrcReg operands to DstReg. + // Also update DstReg operands to include DstIdx if it is set. + if (CP.getDstIdx()) + updateRegDefsUses(CP.getDstReg(), CP.getDstReg(), CP.getDstIdx()); + updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx()); // SrcReg is guaranteed to be the register whose live interval that is // being merged. @@ -1244,16 +1054,51 @@ bool RegisterCoalescer::JoinCopy(MachineInstr *CopyMI, bool &Again) { TRI->UpdateRegAllocHint(CP.getSrcReg(), CP.getDstReg(), *MF); DEBUG({ - LiveInterval &DstInt = LIS->getInterval(CP.getDstReg()); - dbgs() << "\tJoined. Result = "; - DstInt.print(dbgs(), TRI); - dbgs() << "\n"; + dbgs() << "\tJoined. Result = " << PrintReg(CP.getDstReg(), TRI); + if (!CP.isPhys()) + dbgs() << LIS->getInterval(CP.getDstReg()); + dbgs() << '\n'; }); ++numJoins; return true; } +/// Attempt joining with a reserved physreg. +bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) { + assert(CP.isPhys() && "Must be a physreg copy"); + assert(RegClassInfo.isReserved(CP.getDstReg()) && "Not a reserved register"); + LiveInterval &RHS = LIS->getInterval(CP.getSrcReg()); + DEBUG(dbgs() << "\t\tRHS = " << PrintReg(CP.getSrcReg()) << ' ' << RHS + << '\n'); + + assert(CP.isFlipped() && RHS.containsOneValue() && + "Invalid join with reserved register"); + + // Optimization for reserved registers like ESP. We can only merge with a + // reserved physreg if RHS has a single value that is a copy of CP.DstReg(). + // The live range of the reserved register will look like a set of dead defs + // - we don't properly track the live range of reserved registers. + + // Deny any overlapping intervals. This depends on all the reserved + // register live ranges to look like dead defs. + for (MCRegUnitIterator UI(CP.getDstReg(), TRI); UI.isValid(); ++UI) + if (RHS.overlaps(LIS->getRegUnit(*UI))) { + DEBUG(dbgs() << "\t\tInterference: " << PrintRegUnit(*UI, TRI) << '\n'); + return false; + } + + // Skip any value computations, we are not adding new values to the + // reserved register. Also skip merging the live ranges, the reserved + // register live range doesn't need to be accurate as long as all the + // defs are there. + + // We don't track kills for reserved registers. + MRI->clearKillFlags(CP.getSrcReg()); + + return true; +} + /// ComputeUltimateVN - Assuming we are going to join two live intervals, /// compute what the resultant value numbers for each value in the input two /// ranges will be. 
This is complicated by copies between the two which can @@ -1320,144 +1165,70 @@ static bool RegistersDefinedFromSameValue(LiveIntervals &li, const TargetRegisterInfo &tri, CoalescerPair &CP, VNInfo *VNI, - LiveRange *LR, + VNInfo *OtherVNI, SmallVector<MachineInstr*, 8> &DupCopies) { // FIXME: This is very conservative. For example, we don't handle // physical registers. MachineInstr *MI = li.getInstructionFromIndex(VNI->def); - if (!MI || !MI->isFullCopy() || CP.isPartial() || CP.isPhys()) + if (!MI || CP.isPartial() || CP.isPhys()) return false; - unsigned Dst = MI->getOperand(0).getReg(); - unsigned Src = MI->getOperand(1).getReg(); - - if (!TargetRegisterInfo::isVirtualRegister(Src) || - !TargetRegisterInfo::isVirtualRegister(Dst)) + unsigned A = CP.getDstReg(); + if (!TargetRegisterInfo::isVirtualRegister(A)) return false; - unsigned A = CP.getDstReg(); unsigned B = CP.getSrcReg(); - - if (B == Dst) - std::swap(A, B); - assert(Dst == A); - - VNInfo *Other = LR->valno; - const MachineInstr *OtherMI = li.getInstructionFromIndex(Other->def); - - if (!OtherMI || !OtherMI->isFullCopy()) + if (!TargetRegisterInfo::isVirtualRegister(B)) return false; - unsigned OtherDst = OtherMI->getOperand(0).getReg(); - unsigned OtherSrc = OtherMI->getOperand(1).getReg(); - - if (!TargetRegisterInfo::isVirtualRegister(OtherSrc) || - !TargetRegisterInfo::isVirtualRegister(OtherDst)) + MachineInstr *OtherMI = li.getInstructionFromIndex(OtherVNI->def); + if (!OtherMI) return false; - assert(OtherDst == B); - - if (Src != OtherSrc) - return false; + if (MI->isImplicitDef()) { + DupCopies.push_back(MI); + return true; + } else { + if (!MI->isFullCopy()) + return false; + unsigned Src = MI->getOperand(1).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(Src)) + return false; + if (!OtherMI->isFullCopy()) + return false; + unsigned OtherSrc = OtherMI->getOperand(1).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(OtherSrc)) + return false; - // If the copies use two different value numbers of X, we cannot merge - // A and B. - LiveInterval &SrcInt = li.getInterval(Src); - // getVNInfoBefore returns NULL for undef copies. In this case, the - // optimization is still safe. - if (SrcInt.getVNInfoBefore(Other->def) != SrcInt.getVNInfoBefore(VNI->def)) - return false; + if (Src != OtherSrc) + return false; - DupCopies.push_back(MI); + // If the copies use two different value numbers of X, we cannot merge + // A and B. + LiveInterval &SrcInt = li.getInterval(Src); + // getVNInfoBefore returns NULL for undef copies. In this case, the + // optimization is still safe. + if (SrcInt.getVNInfoBefore(OtherVNI->def) != + SrcInt.getVNInfoBefore(VNI->def)) + return false; - return true; + DupCopies.push_back(MI); + return true; + } } -/// JoinIntervals - Attempt to join these two intervals. On failure, this +/// joinIntervals - Attempt to join these two intervals. On failure, this /// returns false. -bool RegisterCoalescer::JoinIntervals(CoalescerPair &CP) { - LiveInterval &RHS = LIS->getInterval(CP.getSrcReg()); - DEBUG({ dbgs() << "\t\tRHS = "; RHS.print(dbgs(), TRI); dbgs() << "\n"; }); - - // If a live interval is a physical register, check for interference with any - // aliases. The interference check implemented here is a bit more conservative - // than the full interference check below. We allow overlapping live ranges - // only when one is a copy of the other. - if (CP.isPhys()) { - // Optimization for reserved registers like ESP.
- // We can only merge with a reserved physreg if RHS has a single value that - // is a copy of CP.DstReg(). The live range of the reserved register will - // look like a set of dead defs - we don't properly track the live range of - // reserved registers. - if (RegClassInfo.isReserved(CP.getDstReg())) { - assert(CP.isFlipped() && RHS.containsOneValue() && - "Invalid join with reserved register"); - // Deny any overlapping intervals. This depends on all the reserved - // register live ranges to look like dead defs. - for (const uint16_t *AS = TRI->getOverlaps(CP.getDstReg()); *AS; ++AS) { - if (!LIS->hasInterval(*AS)) { - // Make sure at least DstReg itself exists before attempting a join. - if (*AS == CP.getDstReg()) - LIS->getOrCreateInterval(CP.getDstReg()); - continue; - } - if (RHS.overlaps(LIS->getInterval(*AS))) { - DEBUG(dbgs() << "\t\tInterference: " << PrintReg(*AS, TRI) << '\n'); - return false; - } - } - // Skip any value computations, we are not adding new values to the - // reserved register. Also skip merging the live ranges, the reserved - // register live range doesn't need to be accurate as long as all the - // defs are there. - return true; - } - - // Check if a register mask clobbers DstReg. - BitVector UsableRegs; - if (LIS->checkRegMaskInterference(RHS, UsableRegs) && - !UsableRegs.test(CP.getDstReg())) { - DEBUG(dbgs() << "\t\tRegister mask interference.\n"); - return false; - } +bool RegisterCoalescer::joinIntervals(CoalescerPair &CP) { + // Handle physreg joins separately. + if (CP.isPhys()) + return joinReservedPhysReg(CP); - for (const uint16_t *AS = TRI->getAliasSet(CP.getDstReg()); *AS; ++AS){ - if (!LIS->hasInterval(*AS)) - continue; - const LiveInterval &LHS = LIS->getInterval(*AS); - LiveInterval::const_iterator LI = LHS.begin(); - for (LiveInterval::const_iterator RI = RHS.begin(), RE = RHS.end(); - RI != RE; ++RI) { - LI = std::lower_bound(LI, LHS.end(), RI->start); - // Does LHS have an overlapping live range starting before RI? - if ((LI != LHS.begin() && LI[-1].end > RI->start) && - (RI->start != RI->valno->def || - !CP.isCoalescable(LIS->getInstructionFromIndex(RI->start)))) { - DEBUG({ - dbgs() << "\t\tInterference from alias: "; - LHS.print(dbgs(), TRI); - dbgs() << "\n\t\tOverlap at " << RI->start << " and no copy.\n"; - }); - return false; - } - - // Check that LHS ranges beginning in this range are copies. - for (; LI != LHS.end() && LI->start < RI->end; ++LI) { - if (LI->start != LI->valno->def || - !CP.isCoalescable(LIS->getInstructionFromIndex(LI->start))) { - DEBUG({ - dbgs() << "\t\tInterference from alias: "; - LHS.print(dbgs(), TRI); - dbgs() << "\n\t\tDef at " << LI->start << " is not a copy.\n"; - }); - return false; - } - } - } - } - } + LiveInterval &RHS = LIS->getInterval(CP.getSrcReg()); + DEBUG(dbgs() << "\t\tRHS = " << PrintReg(CP.getSrcReg()) << ' ' << RHS + << '\n'); // Compute the final value assignment, assuming that the live ranges can be // coalesced. @@ -1468,9 +1239,11 @@ bool RegisterCoalescer::JoinIntervals(CoalescerPair &CP) { SmallVector<VNInfo*, 16> NewVNInfo; SmallVector<MachineInstr*, 8> DupCopies; + SmallVector<MachineInstr*, 8> DeadCopies; LiveInterval &LHS = LIS->getOrCreateInterval(CP.getDstReg()); - DEBUG({ dbgs() << "\t\tLHS = "; LHS.print(dbgs(), TRI); dbgs() << "\n"; }); + DEBUG(dbgs() << "\t\tLHS = " << PrintReg(CP.getDstReg(), TRI) << ' ' << LHS + << '\n'); // Loop over the value numbers of the LHS, seeing if any are defined from // the RHS. 
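[Editor's sketch] The surrounding hunks map every value number to its final assignment and then walk both range lists, rejecting the join if two overlapping ranges land on different final values. A self-contained C++ model of that test (plain containers, not the LLVM API; all names are the editor's):

  #include <cstdio>
  #include <vector>

  struct Range { int Start, End, ValNo; };   // half-open [Start, End)

  // FinalLHS[v] / FinalRHS[v]: value number v's assignment in the joined
  // interval (the LHSValNoAssignments / RHSValNoAssignments idea).
  static bool joinable(const std::vector<Range> &LHS,
                       const std::vector<Range> &RHS,
                       const std::vector<int> &FinalLHS,
                       const std::vector<int> &FinalRHS) {
    for (unsigned i = 0; i != LHS.size(); ++i)
      for (unsigned j = 0; j != RHS.size(); ++j) {
        const Range &I = LHS[i], &J = RHS[j];
        if (I.End > J.Start && J.End > I.Start &&     // ranges overlap...
            FinalLHS[I.ValNo] != FinalRHS[J.ValNo])   // ...on distinct values
          return false;                               // cannot coalesce
      }
    return true;
  }

  int main() {
    // LHS's only value was proven to be a copy of RHS's, so both map to
    // final value 0 and the overlap on [10,20) is harmless.
    Range A = { 0, 20, 0 };    // aggregate init, C++03-compatible
    Range B = { 10, 30, 0 };
    std::vector<Range> LHS(1, A), RHS(1, B);
    std::vector<int> FinalLHS(1, 0), FinalRHS(1, 0);
    std::printf("joinable: %d\n", joinable(LHS, RHS, FinalLHS, FinalRHS));
    return 0;
  }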
@@ -1481,21 +1254,24 @@ bool RegisterCoalescer::JoinIntervals(CoalescerPair &CP) { continue; MachineInstr *MI = LIS->getInstructionFromIndex(VNI->def); assert(MI && "Missing def"); - if (!MI->isCopyLike()) // Src not defined by a copy? + if (!MI->isCopyLike() && !MI->isImplicitDef()) // Src not defined by a copy? continue; // Figure out the value # from the RHS. - LiveRange *lr = RHS.getLiveRangeContaining(VNI->def.getPrevSlot()); + VNInfo *OtherVNI = RHS.getVNInfoBefore(VNI->def); // The copy could be to an aliased physreg. - if (!lr) continue; + if (!OtherVNI) + continue; // DstReg is known to be a register in the LHS interval. If the src is // from the RHS interval, we can use its value #. - if (!CP.isCoalescable(MI) && - !RegistersDefinedFromSameValue(*LIS, *TRI, CP, VNI, lr, DupCopies)) + if (CP.isCoalescable(MI)) + DeadCopies.push_back(MI); + else if (!RegistersDefinedFromSameValue(*LIS, *TRI, CP, VNI, OtherVNI, + DupCopies)) continue; - LHSValsDefinedFromRHS[VNI] = lr->valno; + LHSValsDefinedFromRHS[VNI] = OtherVNI; } // Loop over the value numbers of the RHS, seeing if any are defined from @@ -1507,21 +1283,24 @@ bool RegisterCoalescer::JoinIntervals(CoalescerPair &CP) { continue; MachineInstr *MI = LIS->getInstructionFromIndex(VNI->def); assert(MI && "Missing def"); - if (!MI->isCopyLike()) // Src not defined by a copy? + if (!MI->isCopyLike() && !MI->isImplicitDef()) // Src not defined by a copy? continue; // Figure out the value # from the LHS. - LiveRange *lr = LHS.getLiveRangeContaining(VNI->def.getPrevSlot()); + VNInfo *OtherVNI = LHS.getVNInfoBefore(VNI->def); // The copy could be to an aliased physreg. - if (!lr) continue; + if (!OtherVNI) + continue; // DstReg is known to be a register in the RHS interval. If the src is // from the LHS interval, we can use its value #. - if (!CP.isCoalescable(MI) && - !RegistersDefinedFromSameValue(*LIS, *TRI, CP, VNI, lr, DupCopies)) + if (CP.isCoalescable(MI)) + DeadCopies.push_back(MI); + else if (!RegistersDefinedFromSameValue(*LIS, *TRI, CP, VNI, OtherVNI, + DupCopies)) continue; - RHSValsDefinedFromLHS[VNI] = lr->valno; + RHSValsDefinedFromLHS[VNI] = OtherVNI; } LHSValNoAssignments.resize(LHS.getNumValNums(), -1); @@ -1563,6 +1342,10 @@ bool RegisterCoalescer::JoinIntervals(CoalescerPair &CP) { LiveInterval::const_iterator J = RHS.begin(); LiveInterval::const_iterator JE = RHS.end(); + // Collect interval end points that will no longer be kills. + SmallVector<MachineInstr*, 8> LHSOldKills; + SmallVector<MachineInstr*, 8> RHSOldKills; + // Skip ahead until the first place of potential sharing. if (I != IE && J != JE) { if (I->start < J->start) { @@ -1576,20 +1359,21 @@ bool RegisterCoalescer::JoinIntervals(CoalescerPair &CP) { while (I != IE && J != JE) { // Determine if these two live ranges overlap. - bool Overlaps; - if (I->start < J->start) { - Overlaps = I->end > J->start; - } else { - Overlaps = J->end > I->start; - } - // If so, check value # info to determine if they are really different. - if (Overlaps) { + if (I->end > J->start && J->end > I->start) { // If the live range overlap will map to the same value number in the // result liverange, we can still coalesce them. If not, we can't. if (LHSValNoAssignments[I->valno->id] != RHSValNoAssignments[J->valno->id]) return false; + + // Extended live ranges should no longer be killed. 
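// [Editor's note] A kill flag marks the last use of a register inside its
// live range; once the join extends a range past that use the flag is
// stale. The old end points are collected here, and their kill flags are
// cleared (via clearRegisterKills) only after the join is known to succeed.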
+ if (!I->end.isBlock() && I->end < J->end) + if (MachineInstr *MI = LIS->getInstructionFromIndex(I->end)) + LHSOldKills.push_back(MI); + if (!J->end.isBlock() && J->end < I->end) + if (MachineInstr *MI = LIS->getInstructionFromIndex(J->end)) + RHSOldKills.push_back(MI); } if (I->end < J->end) @@ -1616,29 +1400,48 @@ bool RegisterCoalescer::JoinIntervals(CoalescerPair &CP) { NewVNInfo[RHSValID]->setHasPHIKill(true); } + // Clear kill flags where live ranges are extended. + while (!LHSOldKills.empty()) + LHSOldKills.pop_back_val()->clearRegisterKills(LHS.reg, TRI); + while (!RHSOldKills.empty()) + RHSOldKills.pop_back_val()->clearRegisterKills(RHS.reg, TRI); + if (LHSValNoAssignments.empty()) LHSValNoAssignments.push_back(-1); if (RHSValNoAssignments.empty()) RHSValNoAssignments.push_back(-1); + // Now erase all the redundant copies. + for (unsigned i = 0, e = DeadCopies.size(); i != e; ++i) { + MachineInstr *MI = DeadCopies[i]; + if (!ErasedInstrs.insert(MI)) + continue; + DEBUG(dbgs() << "\t\terased:\t" << LIS->getInstructionIndex(MI) + << '\t' << *MI); + LIS->RemoveMachineInstrFromMaps(MI); + MI->eraseFromParent(); + } + SmallVector<unsigned, 8> SourceRegisters; for (SmallVector<MachineInstr*, 8>::iterator I = DupCopies.begin(), E = DupCopies.end(); I != E; ++I) { MachineInstr *MI = *I; + if (!ErasedInstrs.insert(MI)) + continue; - // We have pretended that the assignment to B in + // If MI is a copy, then we have pretended that the assignment to B in // A = X // B = X // was actually a copy from A. Now that we decided to coalesce A and B, // transform the code into // A = X - // X = X - // and mark the X as coalesced to keep the illusion. - unsigned Src = MI->getOperand(1).getReg(); - SourceRegisters.push_back(Src); - MI->getOperand(0).substVirtReg(Src, 0, *TRI); - - markAsJoined(MI); + // In the case of the implicit_def, we just have to remove it. + if (!MI->isImplicitDef()) { + unsigned Src = MI->getOperand(1).getReg(); + SourceRegisters.push_back(Src); + } + LIS->RemoveMachineInstrFromMaps(MI); + MI->eraseFromParent(); } // If B = X was the last use of X in a liverange, we have to shrink it now @@ -1678,73 +1481,58 @@ namespace { }; } -void RegisterCoalescer::CopyCoalesceInMBB(MachineBasicBlock *MBB, - std::vector<MachineInstr*> &TryAgain) { - DEBUG(dbgs() << MBB->getName() << ":\n"); - - SmallVector<MachineInstr*, 8> VirtCopies; - SmallVector<MachineInstr*, 8> PhysCopies; - SmallVector<MachineInstr*, 8> ImpDefCopies; - for (MachineBasicBlock::iterator MII = MBB->begin(), E = MBB->end(); - MII != E;) { - MachineInstr *Inst = MII++; - - // If this isn't a copy nor a extract_subreg, we can't join intervals. - unsigned SrcReg, DstReg; - if (Inst->isCopy()) { - DstReg = Inst->getOperand(0).getReg(); - SrcReg = Inst->getOperand(1).getReg(); - } else if (Inst->isSubregToReg()) { - DstReg = Inst->getOperand(0).getReg(); - SrcReg = Inst->getOperand(2).getReg(); - } else +// Try joining WorkList copies starting from index From. +// Null out any successful joins. 
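// [Editor's note] WorkList holds raw MachineInstr pointers, and joinCopy()
// may erase an instruction that is still queued (for instance a dead copy
// swept up by eliminateDeadDefs()); the freed node can even be recycled for
// a new instruction. ErasedInstrs.erase(WorkList[i]) below both detects the
// stale pointer and drops it from the set in one step.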
+bool RegisterCoalescer::copyCoalesceWorkList(unsigned From) { + assert(From <= WorkList.size() && "Out of range"); + bool Progress = false; + for (unsigned i = From, e = WorkList.size(); i != e; ++i) { + if (!WorkList[i]) continue; - - bool SrcIsPhys = TargetRegisterInfo::isPhysicalRegister(SrcReg); - bool DstIsPhys = TargetRegisterInfo::isPhysicalRegister(DstReg); - if (LIS->hasInterval(SrcReg) && LIS->getInterval(SrcReg).empty()) - ImpDefCopies.push_back(Inst); - else if (SrcIsPhys || DstIsPhys) - PhysCopies.push_back(Inst); - else - VirtCopies.push_back(Inst); - } - - // Try coalescing implicit copies and insert_subreg <undef> first, - // followed by copies to / from physical registers, then finally copies - // from virtual registers to virtual registers. - for (unsigned i = 0, e = ImpDefCopies.size(); i != e; ++i) { - MachineInstr *TheCopy = ImpDefCopies[i]; - bool Again = false; - if (!JoinCopy(TheCopy, Again)) - if (Again) - TryAgain.push_back(TheCopy); - } - for (unsigned i = 0, e = PhysCopies.size(); i != e; ++i) { - MachineInstr *TheCopy = PhysCopies[i]; - bool Again = false; - if (!JoinCopy(TheCopy, Again)) - if (Again) - TryAgain.push_back(TheCopy); - } - for (unsigned i = 0, e = VirtCopies.size(); i != e; ++i) { - MachineInstr *TheCopy = VirtCopies[i]; + // Skip instruction pointers that have already been erased, for example by + // dead code elimination. + if (ErasedInstrs.erase(WorkList[i])) { + WorkList[i] = 0; + continue; + } bool Again = false; - if (!JoinCopy(TheCopy, Again)) - if (Again) - TryAgain.push_back(TheCopy); + bool Success = joinCopy(WorkList[i], Again); + Progress |= Success; + if (Success || !Again) + WorkList[i] = 0; } + return Progress; } -void RegisterCoalescer::joinIntervals() { +void +RegisterCoalescer::copyCoalesceInMBB(MachineBasicBlock *MBB) { + DEBUG(dbgs() << MBB->getName() << ":\n"); + + // Collect all copy-like instructions in MBB. Don't start coalescing anything + // yet, it might invalidate the iterator. + const unsigned PrevSize = WorkList.size(); + for (MachineBasicBlock::iterator MII = MBB->begin(), E = MBB->end(); + MII != E; ++MII) + if (MII->isCopyLike()) + WorkList.push_back(MII); + + // Try coalescing the collected copies immediately, and remove the nulls. + // This prevents the WorkList from getting too large since most copies are + // joinable on the first attempt. + if (copyCoalesceWorkList(PrevSize)) + WorkList.erase(std::remove(WorkList.begin() + PrevSize, WorkList.end(), + (MachineInstr*)0), WorkList.end()); +} + +void RegisterCoalescer::joinAllIntervals() { DEBUG(dbgs() << "********** JOINING INTERVALS ***********\n"); + assert(WorkList.empty() && "Old data still around."); - std::vector<MachineInstr*> TryAgainList; if (Loops->empty()) { // If there are no loops in the function, join intervals in function order. for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) - CopyCoalesceInMBB(I, TryAgainList); + copyCoalesceInMBB(I); } else { // Otherwise, join intervals in inner loops before other intervals. // Unfortunately we can't just iterate over loop hierarchy here because @@ -1763,34 +1551,20 @@ void RegisterCoalescer::joinIntervals() { // Finally, join intervals in loop nest order. for (unsigned i = 0, e = MBBs.size(); i != e; ++i) - CopyCoalesceInMBB(MBBs[i].second, TryAgainList); + copyCoalesceInMBB(MBBs[i].second); } // Joining intervals can allow other intervals to be joined. Iteratively join // until we make no progress. 
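A simplified model of the worklist discipline implemented above, which replaces the deleted try-again loop shown next: process every live entry, null out the ones that are finished, and compact afterwards with the standard erase/remove idiom. Here int* stands in for MachineInstr* and the function name is illustrative.

#include <algorithm>
#include <vector>

static bool processWorkList(std::vector<int*> &WorkList) {
  bool Progress = false;
  for (unsigned i = 0, e = WorkList.size(); i != e; ++i) {
    if (!WorkList[i])
      continue;          // slot already retired
    Progress = true;     // assume every remaining entry succeeds
    WorkList[i] = 0;     // null out instead of erasing mid-iteration
  }
  WorkList.erase(std::remove(WorkList.begin(), WorkList.end(), (int*)0),
                 WorkList.end());
  return Progress;
}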
- bool ProgressMade = true; - while (ProgressMade) { - ProgressMade = false; - - for (unsigned i = 0, e = TryAgainList.size(); i != e; ++i) { - MachineInstr *&TheCopy = TryAgainList[i]; - if (!TheCopy) - continue; - - bool Again = false; - bool Success = JoinCopy(TheCopy, Again); - if (Success || !Again) { - TheCopy= 0; // Mark this one as done. - ProgressMade = true; - } - } - } + while (copyCoalesceWorkList()) + /* empty */ ; } void RegisterCoalescer::releaseMemory() { - JoinedCopies.clear(); - ReMatCopies.clear(); - ReMatDefs.clear(); + ErasedInstrs.clear(); + WorkList.clear(); + DeadDefs.clear(); + InflateRegs.clear(); } bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) { @@ -1814,138 +1588,11 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) { RegClassInfo.runOnMachineFunction(fn); // Join (coalesce) intervals if requested. - if (EnableJoining) { - joinIntervals(); - DEBUG({ - dbgs() << "********** INTERVALS POST JOINING **********\n"; - for (LiveIntervals::iterator I = LIS->begin(), E = LIS->end(); - I != E; ++I){ - I->second->print(dbgs(), TRI); - dbgs() << "\n"; - } - }); - } - - // Perform a final pass over the instructions and compute spill weights - // and remove identity moves. - SmallVector<unsigned, 4> DeadDefs, InflateRegs; - for (MachineFunction::iterator mbbi = MF->begin(), mbbe = MF->end(); - mbbi != mbbe; ++mbbi) { - MachineBasicBlock* mbb = mbbi; - for (MachineBasicBlock::iterator mii = mbb->begin(), mie = mbb->end(); - mii != mie; ) { - MachineInstr *MI = mii; - if (JoinedCopies.count(MI)) { - // Delete all coalesced copies. - bool DoDelete = true; - assert(MI->isCopyLike() && "Unrecognized copy instruction"); - unsigned SrcReg = MI->getOperand(MI->isSubregToReg() ? 2 : 1).getReg(); - unsigned DstReg = MI->getOperand(0).getReg(); - - // Collect candidates for register class inflation. - if (TargetRegisterInfo::isVirtualRegister(SrcReg) && - RegClassInfo.isProperSubClass(MRI->getRegClass(SrcReg))) - InflateRegs.push_back(SrcReg); - if (TargetRegisterInfo::isVirtualRegister(DstReg) && - RegClassInfo.isProperSubClass(MRI->getRegClass(DstReg))) - InflateRegs.push_back(DstReg); - - if (TargetRegisterInfo::isPhysicalRegister(SrcReg) && - MI->getNumOperands() > 2) - // Do not delete extract_subreg, insert_subreg of physical - // registers unless the definition is dead. e.g. - // %DO<def> = INSERT_SUBREG %D0<undef>, %S0<kill>, 1 - // or else the scavenger may complain. LowerSubregs will - // delete them later. - DoDelete = false; - - if (MI->allDefsAreDead()) { - if (TargetRegisterInfo::isVirtualRegister(SrcReg) && - LIS->hasInterval(SrcReg)) - LIS->shrinkToUses(&LIS->getInterval(SrcReg)); - DoDelete = true; - } - if (!DoDelete) { - // We need the instruction to adjust liveness, so make it a KILL. - if (MI->isSubregToReg()) { - MI->RemoveOperand(3); - MI->RemoveOperand(1); - } - MI->setDesc(TII->get(TargetOpcode::KILL)); - mii = llvm::next(mii); - } else { - LIS->RemoveMachineInstrFromMaps(MI); - mii = mbbi->erase(mii); - ++numPeep; - } - continue; - } - - // Now check if this is a remat'ed def instruction which is now dead. - if (ReMatDefs.count(MI)) { - bool isDead = true; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg()) - continue; - unsigned Reg = MO.getReg(); - if (!Reg) - continue; - DeadDefs.push_back(Reg); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { - // Remat may also enable register class inflation. 
- if (RegClassInfo.isProperSubClass(MRI->getRegClass(Reg))) - InflateRegs.push_back(Reg); - } - if (MO.isDead()) - continue; - if (TargetRegisterInfo::isPhysicalRegister(Reg) || - !MRI->use_nodbg_empty(Reg)) { - isDead = false; - break; - } - } - if (isDead) { - while (!DeadDefs.empty()) { - unsigned DeadDef = DeadDefs.back(); - DeadDefs.pop_back(); - RemoveDeadDef(LIS->getInterval(DeadDef), MI); - } - LIS->RemoveMachineInstrFromMaps(mii); - mii = mbbi->erase(mii); - continue; - } else - DeadDefs.clear(); - } - - ++mii; - - // Check for now unnecessary kill flags. - if (LIS->isNotInMIMap(MI)) continue; - SlotIndex DefIdx = LIS->getInstructionIndex(MI).getRegSlot(); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg() || !MO.isKill()) continue; - unsigned reg = MO.getReg(); - if (!reg || !LIS->hasInterval(reg)) continue; - if (!LIS->getInterval(reg).killedAt(DefIdx)) { - MO.setIsKill(false); - continue; - } - // When leaving a kill flag on a physreg, check if any subregs should - // remain alive. - if (!TargetRegisterInfo::isPhysicalRegister(reg)) - continue; - for (const uint16_t *SR = TRI->getSubRegisters(reg); - unsigned S = *SR; ++SR) - if (LIS->hasInterval(S) && LIS->getInterval(S).liveAt(DefIdx)) - MI->addRegisterDefined(S, TRI); - } - } - } + if (EnableJoining) + joinAllIntervals(); // After deleting a lot of copies, register classes may be less constrained. - // Removing sub-register opreands may alow GR32_ABCD -> GR32 and DPR_VFP2 -> + // Removing sub-register operands may allow GR32_ABCD -> GR32 and DPR_VFP2 -> // DPR inflation. array_pod_sort(InflateRegs.begin(), InflateRegs.end()); InflateRegs.erase(std::unique(InflateRegs.begin(), InflateRegs.end()), diff --git a/lib/CodeGen/RegisterCoalescer.h b/lib/CodeGen/RegisterCoalescer.h index 310b933..8a6df98 100644 --- a/lib/CodeGen/RegisterCoalescer.h +++ b/lib/CodeGen/RegisterCoalescer.h @@ -26,7 +26,6 @@ namespace llvm { /// two registers can be coalesced, CoalescerPair can determine if a copy /// instruction would become an identity copy after coalescing. class CoalescerPair { - const TargetInstrInfo &TII; const TargetRegisterInfo &TRI; /// DstReg - The register that will be left after coalescing. It can be a @@ -36,10 +35,13 @@ namespace llvm { /// SrcReg - the virtual register that will be coalesced into dstReg. unsigned SrcReg; - /// subReg_ - The subregister index of srcReg in DstReg. It is possible the - /// coalesce SrcReg into a subreg of the larger DstReg when DstReg is a - /// virtual register. - unsigned SubIdx; + /// DstIdx - The sub-register index of the old DstReg in the new coalesced + /// register. + unsigned DstIdx; + + /// SrcIdx - The sub-register index of the old SrcReg in the new coalesced + /// register. + unsigned SrcIdx; /// Partial - True when the original copy was a partial subregister copy. bool Partial; @@ -52,12 +54,13 @@ namespace llvm { bool Flipped; /// NewRC - The register class of the coalesced register, or NULL if DstReg - /// is a physreg. + /// is a physreg. This register class may be a super-register of both + /// SrcReg and DstReg. 
const TargetRegisterClass *NewRC; public: - CoalescerPair(const TargetInstrInfo &tii, const TargetRegisterInfo &tri) - : TII(tii), TRI(tri), DstReg(0), SrcReg(0), SubIdx(0), + CoalescerPair(const TargetRegisterInfo &tri) + : TRI(tri), DstReg(0), SrcReg(0), DstIdx(0), SrcIdx(0), Partial(false), CrossClass(false), Flipped(false), NewRC(0) {} /// setRegisters - set registers to match the copy instruction MI. Return @@ -94,9 +97,13 @@ namespace llvm { /// getSrcReg - Return the virtual register that will be coalesced away. unsigned getSrcReg() const { return SrcReg; } - /// getSubIdx - Return the subregister index in DstReg that SrcReg will be - /// coalesced into, or 0. - unsigned getSubIdx() const { return SubIdx; } + /// getDstIdx - Return the subregister index that DstReg will be coalesced + /// into, or 0. + unsigned getDstIdx() const { return DstIdx; } + + /// getSrcIdx - Return the subregister index that SrcReg will be coalesced + /// into, or 0. + unsigned getSrcIdx() const { return SrcIdx; } /// getNewRC - Return the register class of the coalesced register. const TargetRegisterClass *getNewRC() const { return NewRC; } diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp new file mode 100644 index 0000000..43448c8 --- /dev/null +++ b/lib/CodeGen/RegisterPressure.cpp @@ -0,0 +1,841 @@ +//===-- RegisterPressure.cpp - Dynamic Register Pressure ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the RegisterPressure class which can be used to track +// MachineInstr level register pressure. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/RegisterPressure.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +/// Increase register pressure for each set impacted by this register class. +static void increaseSetPressure(std::vector<unsigned> &CurrSetPressure, + std::vector<unsigned> &MaxSetPressure, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) { + unsigned Weight = TRI->getRegClassWeight(RC).RegWeight; + for (const int *PSet = TRI->getRegClassPressureSets(RC); + *PSet != -1; ++PSet) { + CurrSetPressure[*PSet] += Weight; + if (&CurrSetPressure != &MaxSetPressure + && CurrSetPressure[*PSet] > MaxSetPressure[*PSet]) { + MaxSetPressure[*PSet] = CurrSetPressure[*PSet]; + } + } +} + +/// Decrease register pressure for each set impacted by this register class. +static void decreaseSetPressure(std::vector<unsigned> &CurrSetPressure, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) { + unsigned Weight = TRI->getRegClassWeight(RC).RegWeight; + for (const int *PSet = TRI->getRegClassPressureSets(RC); + *PSet != -1; ++PSet) { + assert(CurrSetPressure[*PSet] >= Weight && "register pressure underflow"); + CurrSetPressure[*PSet] -= Weight; + } +} + +/// Directly increase pressure only within this RegisterPressure result. 
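A simplified model of the two set-pressure helpers above, assuming the -1-terminated pressure-set lists that TargetRegisterInfo::getRegClassPressureSets returns and vectors sized to the number of pressure sets; the increase/decrease members that follow apply the same helpers directly to the stored maximum.

#include <vector>

static void bumpSets(std::vector<unsigned> &Curr, std::vector<unsigned> &Max,
                     const int *PSets, unsigned Weight) {
  for (; *PSets != -1; ++PSets) {
    Curr[*PSets] += Weight;
    if (Curr[*PSets] > Max[*PSets])
      Max[*PSets] = Curr[*PSets]; // maintain the high water mark
  }
}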
+void RegisterPressure::increase(const TargetRegisterClass *RC,
+                                const TargetRegisterInfo *TRI) {
+  increaseSetPressure(MaxSetPressure, MaxSetPressure, RC, TRI);
+}
+
+/// Directly decrease pressure only within this RegisterPressure result.
+void RegisterPressure::decrease(const TargetRegisterClass *RC,
+                                const TargetRegisterInfo *TRI) {
+  decreaseSetPressure(MaxSetPressure, RC, TRI);
+}
+
+void RegisterPressure::dump(const TargetRegisterInfo *TRI) {
+  dbgs() << "Live In: ";
+  for (unsigned i = 0, e = LiveInRegs.size(); i < e; ++i)
+    dbgs() << PrintReg(LiveInRegs[i], TRI) << " ";
+  dbgs() << '\n';
+  dbgs() << "Live Out: ";
+  for (unsigned i = 0, e = LiveOutRegs.size(); i < e; ++i)
+    dbgs() << PrintReg(LiveOutRegs[i], TRI) << " ";
+  dbgs() << '\n';
+  for (unsigned i = 0, e = MaxSetPressure.size(); i < e; ++i) {
+    if (MaxSetPressure[i] != 0)
+      dbgs() << TRI->getRegPressureSetName(i) << "=" << MaxSetPressure[i]
+             << '\n';
+  }
+}
+
+/// Increase the current pressure as impacted by these physical registers and
+/// bump the high water mark if needed.
+void RegPressureTracker::increasePhysRegPressure(ArrayRef<unsigned> Regs) {
+  for (unsigned I = 0, E = Regs.size(); I != E; ++I)
+    increaseSetPressure(CurrSetPressure, P.MaxSetPressure,
+                        TRI->getMinimalPhysRegClass(Regs[I]), TRI);
+}
+
+/// Simply decrease the current pressure as impacted by these physical
+/// registers.
+void RegPressureTracker::decreasePhysRegPressure(ArrayRef<unsigned> Regs) {
+  for (unsigned I = 0, E = Regs.size(); I != E; ++I)
+    decreaseSetPressure(CurrSetPressure, TRI->getMinimalPhysRegClass(Regs[I]),
+                        TRI);
+}
+
+/// Increase the current pressure as impacted by these virtual registers and
+/// bump the high water mark if needed.
+void RegPressureTracker::increaseVirtRegPressure(ArrayRef<unsigned> Regs) {
+  for (unsigned I = 0, E = Regs.size(); I != E; ++I)
+    increaseSetPressure(CurrSetPressure, P.MaxSetPressure,
+                        MRI->getRegClass(Regs[I]), TRI);
+}
+
+/// Simply decrease the current pressure as impacted by these virtual registers.
+void RegPressureTracker::decreaseVirtRegPressure(ArrayRef<unsigned> Regs) {
+  for (unsigned I = 0, E = Regs.size(); I != E; ++I)
+    decreaseSetPressure(CurrSetPressure, MRI->getRegClass(Regs[I]), TRI);
+}
+
+/// Clear the result so it can be used for another round of pressure tracking.
+void IntervalPressure::reset() {
+  TopIdx = BottomIdx = SlotIndex();
+  MaxSetPressure.clear();
+  LiveInRegs.clear();
+  LiveOutRegs.clear();
+}
+
+/// Clear the result so it can be used for another round of pressure tracking.
+void RegionPressure::reset() {
+  TopPos = BottomPos = MachineBasicBlock::const_iterator();
+  MaxSetPressure.clear();
+  LiveInRegs.clear();
+  LiveOutRegs.clear();
+}
+
+/// If the current top is greater than the next index, open it.
+/// We happen to need the SlotIndex for the next top for pressure update.
+void IntervalPressure::openTop(SlotIndex NextTop) {
+  if (TopIdx <= NextTop)
+    return;
+  TopIdx = SlotIndex();
+  LiveInRegs.clear();
+}
+
+/// If the current top is the previous instruction (before receding), open it.
+void RegionPressure::openTop(MachineBasicBlock::const_iterator PrevTop) {
+  if (TopPos != PrevTop)
+    return;
+  TopPos = MachineBasicBlock::const_iterator();
+  LiveInRegs.clear();
+}
+
+/// If the current bottom is not greater than the previous index, open it.
+void IntervalPressure::openBottom(SlotIndex PrevBottom) { + if (BottomIdx > PrevBottom) + return; + BottomIdx = SlotIndex(); + LiveInRegs.clear(); +} + +/// If the current bottom is the previous instr (before advancing), open it. +void RegionPressure::openBottom(MachineBasicBlock::const_iterator PrevBottom) { + if (BottomPos != PrevBottom) + return; + BottomPos = MachineBasicBlock::const_iterator(); + LiveInRegs.clear(); +} + +/// Setup the RegPressureTracker. +/// +/// TODO: Add support for pressure without LiveIntervals. +void RegPressureTracker::init(const MachineFunction *mf, + const RegisterClassInfo *rci, + const LiveIntervals *lis, + const MachineBasicBlock *mbb, + MachineBasicBlock::const_iterator pos) +{ + MF = mf; + TRI = MF->getTarget().getRegisterInfo(); + RCI = rci; + MRI = &MF->getRegInfo(); + MBB = mbb; + + if (RequireIntervals) { + assert(lis && "IntervalPressure requires LiveIntervals"); + LIS = lis; + } + + CurrPos = pos; + while (CurrPos != MBB->end() && CurrPos->isDebugValue()) + ++CurrPos; + + CurrSetPressure.assign(TRI->getNumRegPressureSets(), 0); + + if (RequireIntervals) + static_cast<IntervalPressure&>(P).reset(); + else + static_cast<RegionPressure&>(P).reset(); + P.MaxSetPressure = CurrSetPressure; + + LivePhysRegs.clear(); + LivePhysRegs.setUniverse(TRI->getNumRegs()); + LiveVirtRegs.clear(); + LiveVirtRegs.setUniverse(MRI->getNumVirtRegs()); +} + +/// Does this pressure result have a valid top position and live ins. +bool RegPressureTracker::isTopClosed() const { + if (RequireIntervals) + return static_cast<IntervalPressure&>(P).TopIdx.isValid(); + return (static_cast<RegionPressure&>(P).TopPos == + MachineBasicBlock::const_iterator()); +} + +/// Does this pressure result have a valid bottom position and live outs. +bool RegPressureTracker::isBottomClosed() const { + if (RequireIntervals) + return static_cast<IntervalPressure&>(P).BottomIdx.isValid(); + return (static_cast<RegionPressure&>(P).BottomPos == + MachineBasicBlock::const_iterator()); +} + +/// Set the boundary for the top of the region and summarize live ins. +void RegPressureTracker::closeTop() { + if (RequireIntervals) + static_cast<IntervalPressure&>(P).TopIdx = + LIS->getInstructionIndex(CurrPos).getRegSlot(); + else + static_cast<RegionPressure&>(P).TopPos = CurrPos; + + assert(P.LiveInRegs.empty() && "inconsistent max pressure result"); + P.LiveInRegs.reserve(LivePhysRegs.size() + LiveVirtRegs.size()); + P.LiveInRegs.append(LivePhysRegs.begin(), LivePhysRegs.end()); + for (SparseSet<unsigned>::const_iterator I = + LiveVirtRegs.begin(), E = LiveVirtRegs.end(); I != E; ++I) + P.LiveInRegs.push_back(*I); + std::sort(P.LiveInRegs.begin(), P.LiveInRegs.end()); + P.LiveInRegs.erase(std::unique(P.LiveInRegs.begin(), P.LiveInRegs.end()), + P.LiveInRegs.end()); +} + +/// Set the boundary for the bottom of the region and summarize live outs. 
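closeTop above, and closeBottom just below, merge the physical and virtual live registers into one vector and then deduplicate it with the standard sort/unique/erase idiom; a standalone sketch:

#include <algorithm>
#include <vector>

static void sortAndDedup(std::vector<unsigned> &Regs) {
  std::sort(Regs.begin(), Regs.end());
  Regs.erase(std::unique(Regs.begin(), Regs.end()), Regs.end());
}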
+void RegPressureTracker::closeBottom() {
+  if (RequireIntervals) {
+    if (CurrPos == MBB->end())
+      static_cast<IntervalPressure&>(P).BottomIdx = LIS->getMBBEndIdx(MBB);
+    else
+      static_cast<IntervalPressure&>(P).BottomIdx =
+        LIS->getInstructionIndex(CurrPos).getRegSlot();
+  } else
+    static_cast<RegionPressure&>(P).BottomPos = CurrPos;
+
+  assert(P.LiveOutRegs.empty() && "inconsistent max pressure result");
+  P.LiveOutRegs.reserve(LivePhysRegs.size() + LiveVirtRegs.size());
+  P.LiveOutRegs.append(LivePhysRegs.begin(), LivePhysRegs.end());
+  for (SparseSet<unsigned>::const_iterator I =
+         LiveVirtRegs.begin(), E = LiveVirtRegs.end(); I != E; ++I)
+    P.LiveOutRegs.push_back(*I);
+  std::sort(P.LiveOutRegs.begin(), P.LiveOutRegs.end());
+  P.LiveOutRegs.erase(std::unique(P.LiveOutRegs.begin(), P.LiveOutRegs.end()),
+                      P.LiveOutRegs.end());
+}
+
+/// Finalize the region boundaries and record live ins and live outs.
+void RegPressureTracker::closeRegion() {
+  if (!isTopClosed() && !isBottomClosed()) {
+    assert(LivePhysRegs.empty() && LiveVirtRegs.empty() &&
+           "no region boundary");
+    return;
+  }
+  if (!isBottomClosed())
+    closeBottom();
+  else if (!isTopClosed())
+    closeTop();
+  // If both top and bottom are closed, do nothing.
+}
+
+/// Return true if Reg aliases a register in Regs SparseSet.
+static bool hasRegAlias(unsigned Reg, SparseSet<unsigned> &Regs,
+                        const TargetRegisterInfo *TRI) {
+  assert(!TargetRegisterInfo::isVirtualRegister(Reg) && "only for physregs");
+  for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+    if (Regs.count(*AI))
+      return true;
+  return false;
+}
+
+/// Return an iterator to the register in the unsorted Regs SmallVector that
+/// aliases Reg, or Regs.end() if there is none. This is only valid for
+/// physical registers.
+static SmallVectorImpl<unsigned>::iterator
+findRegAlias(unsigned Reg, SmallVectorImpl<unsigned> &Regs,
+             const TargetRegisterInfo *TRI) {
+  for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+    SmallVectorImpl<unsigned>::iterator I =
+      std::find(Regs.begin(), Regs.end(), *AI);
+    if (I != Regs.end())
+      return I;
+  }
+  return Regs.end();
+}
+
+/// Find Reg in the Regs SmallVector, returning an iterator to the matching
+/// entry or Regs.end(). For virtual registers, do a linear search. For
+/// physical registers, also check for aliases.
+static SmallVectorImpl<unsigned>::iterator
+findReg(unsigned Reg, bool isVReg, SmallVectorImpl<unsigned> &Regs,
+        const TargetRegisterInfo *TRI) {
+  if (isVReg)
+    return std::find(Regs.begin(), Regs.end(), Reg);
+  return findRegAlias(Reg, Regs, TRI);
+}
+
+/// Collect this instruction's unique uses and defs into SmallVectors for
+/// processing defs and uses in order.
+template<bool isVReg>
+struct RegisterOperands {
+  SmallVector<unsigned, 8> Uses;
+  SmallVector<unsigned, 8> Defs;
+  SmallVector<unsigned, 8> DeadDefs;
+
+  /// Push this operand's register onto the correct vector.
+  void collect(const MachineOperand &MO, const TargetRegisterInfo *TRI) {
+    if (MO.readsReg()) {
+      if (findReg(MO.getReg(), isVReg, Uses, TRI) == Uses.end())
+        Uses.push_back(MO.getReg());
+    }
+    if (MO.isDef()) {
+      if (MO.isDead()) {
+        if (findReg(MO.getReg(), isVReg, DeadDefs, TRI) == DeadDefs.end())
+          DeadDefs.push_back(MO.getReg());
+      }
+      else {
+        if (findReg(MO.getReg(), isVReg, Defs, TRI) == Defs.end())
+          Defs.push_back(MO.getReg());
+      }
+    }
+  }
+};
+typedef RegisterOperands<false> PhysRegOperands;
+typedef RegisterOperands<true> VirtRegOperands;
+
+/// Collect physical and virtual register operands.
+static void collectOperands(const MachineInstr *MI, + PhysRegOperands &PhysRegOpers, + VirtRegOperands &VirtRegOpers, + const TargetRegisterInfo *TRI, + const RegisterClassInfo *RCI) { + for(ConstMIBundleOperands OperI(MI); OperI.isValid(); ++OperI) { + const MachineOperand &MO = *OperI; + if (!MO.isReg() || !MO.getReg()) + continue; + + if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) + VirtRegOpers.collect(MO, TRI); + else if (RCI->isAllocatable(MO.getReg())) + PhysRegOpers.collect(MO, TRI); + } + // Remove redundant physreg dead defs. + for (unsigned i = PhysRegOpers.DeadDefs.size(); i > 0; --i) { + unsigned Reg = PhysRegOpers.DeadDefs[i-1]; + if (findRegAlias(Reg, PhysRegOpers.Defs, TRI) != PhysRegOpers.Defs.end()) + PhysRegOpers.DeadDefs.erase(&PhysRegOpers.DeadDefs[i-1]); + } +} + +/// Force liveness of registers. +void RegPressureTracker::addLiveRegs(ArrayRef<unsigned> Regs) { + for (unsigned i = 0, e = Regs.size(); i != e; ++i) { + if (TargetRegisterInfo::isVirtualRegister(Regs[i])) { + if (LiveVirtRegs.insert(Regs[i]).second) + increaseVirtRegPressure(Regs[i]); + } + else { + if (!hasRegAlias(Regs[i], LivePhysRegs, TRI)) { + LivePhysRegs.insert(Regs[i]); + increasePhysRegPressure(Regs[i]); + } + } + } +} + +/// Add PhysReg to the live in set and increase max pressure. +void RegPressureTracker::discoverPhysLiveIn(unsigned Reg) { + assert(!LivePhysRegs.count(Reg) && "avoid bumping max pressure twice"); + if (findRegAlias(Reg, P.LiveInRegs, TRI) != P.LiveInRegs.end()) + return; + + // At live in discovery, unconditionally increase the high water mark. + P.LiveInRegs.push_back(Reg); + P.increase(TRI->getMinimalPhysRegClass(Reg), TRI); +} + +/// Add PhysReg to the live out set and increase max pressure. +void RegPressureTracker::discoverPhysLiveOut(unsigned Reg) { + assert(!LivePhysRegs.count(Reg) && "avoid bumping max pressure twice"); + if (findRegAlias(Reg, P.LiveOutRegs, TRI) != P.LiveOutRegs.end()) + return; + + // At live out discovery, unconditionally increase the high water mark. + P.LiveOutRegs.push_back(Reg); + P.increase(TRI->getMinimalPhysRegClass(Reg), TRI); +} + +/// Add VirtReg to the live in set and increase max pressure. +void RegPressureTracker::discoverVirtLiveIn(unsigned Reg) { + assert(!LiveVirtRegs.count(Reg) && "avoid bumping max pressure twice"); + if (std::find(P.LiveInRegs.begin(), P.LiveInRegs.end(), Reg) != + P.LiveInRegs.end()) + return; + + // At live in discovery, unconditionally increase the high water mark. + P.LiveInRegs.push_back(Reg); + P.increase(MRI->getRegClass(Reg), TRI); +} + +/// Add VirtReg to the live out set and increase max pressure. +void RegPressureTracker::discoverVirtLiveOut(unsigned Reg) { + assert(!LiveVirtRegs.count(Reg) && "avoid bumping max pressure twice"); + if (std::find(P.LiveOutRegs.begin(), P.LiveOutRegs.end(), Reg) != + P.LiveOutRegs.end()) + return; + + // At live out discovery, unconditionally increase the high water mark. + P.LiveOutRegs.push_back(Reg); + P.increase(MRI->getRegClass(Reg), TRI); +} + +/// Recede across the previous instruction. +bool RegPressureTracker::recede() { + // Check for the top of the analyzable region. + if (CurrPos == MBB->begin()) { + closeRegion(); + return false; + } + if (!isBottomClosed()) + closeBottom(); + + // Open the top of the region using block iterators. + if (!RequireIntervals && isTopClosed()) + static_cast<RegionPressure&>(P).openTop(CurrPos); + + // Find the previous instruction. 
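The do/while immediately below first backs up one position and then keeps receding over debug values. The same shape over a plain container, as a hedged sketch where the predicate stands in for isDebugValue():

#include <vector>

static std::vector<int>::iterator
recedeSkipping(std::vector<int> &V, std::vector<int>::iterator It,
               bool (*Skip)(int)) {
  // The caller has already ruled out It == V.begin(), mirroring the
  // CurrPos == MBB->begin() check above.
  do
    --It;
  while (It != V.begin() && Skip(*It));
  return It; // may still satisfy Skip() if only skippable entries remain
}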
+ do + --CurrPos; + while (CurrPos != MBB->begin() && CurrPos->isDebugValue()); + + if (CurrPos->isDebugValue()) { + closeRegion(); + return false; + } + SlotIndex SlotIdx; + if (RequireIntervals) + SlotIdx = LIS->getInstructionIndex(CurrPos).getRegSlot(); + + // Open the top of the region using slot indexes. + if (RequireIntervals && isTopClosed()) + static_cast<IntervalPressure&>(P).openTop(SlotIdx); + + PhysRegOperands PhysRegOpers; + VirtRegOperands VirtRegOpers; + collectOperands(CurrPos, PhysRegOpers, VirtRegOpers, TRI, RCI); + + // Boost pressure for all dead defs together. + increasePhysRegPressure(PhysRegOpers.DeadDefs); + increaseVirtRegPressure(VirtRegOpers.DeadDefs); + decreasePhysRegPressure(PhysRegOpers.DeadDefs); + decreaseVirtRegPressure(VirtRegOpers.DeadDefs); + + // Kill liveness at live defs. + // TODO: consider earlyclobbers? + for (unsigned i = 0, e = PhysRegOpers.Defs.size(); i < e; ++i) { + unsigned Reg = PhysRegOpers.Defs[i]; + if (LivePhysRegs.erase(Reg)) + decreasePhysRegPressure(Reg); + else + discoverPhysLiveOut(Reg); + } + for (unsigned i = 0, e = VirtRegOpers.Defs.size(); i < e; ++i) { + unsigned Reg = VirtRegOpers.Defs[i]; + if (LiveVirtRegs.erase(Reg)) + decreaseVirtRegPressure(Reg); + else + discoverVirtLiveOut(Reg); + } + + // Generate liveness for uses. + for (unsigned i = 0, e = PhysRegOpers.Uses.size(); i < e; ++i) { + unsigned Reg = PhysRegOpers.Uses[i]; + if (!hasRegAlias(Reg, LivePhysRegs, TRI)) { + increasePhysRegPressure(Reg); + LivePhysRegs.insert(Reg); + } + } + for (unsigned i = 0, e = VirtRegOpers.Uses.size(); i < e; ++i) { + unsigned Reg = VirtRegOpers.Uses[i]; + if (!LiveVirtRegs.count(Reg)) { + // Adjust liveouts if LiveIntervals are available. + if (RequireIntervals) { + const LiveInterval *LI = &LIS->getInterval(Reg); + if (!LI->killedAt(SlotIdx)) + discoverVirtLiveOut(Reg); + } + increaseVirtRegPressure(Reg); + LiveVirtRegs.insert(Reg); + } + } + return true; +} + +/// Advance across the current instruction. +bool RegPressureTracker::advance() { + // Check for the bottom of the analyzable region. + if (CurrPos == MBB->end()) { + closeRegion(); + return false; + } + if (!isTopClosed()) + closeTop(); + + SlotIndex SlotIdx; + if (RequireIntervals) + SlotIdx = LIS->getInstructionIndex(CurrPos).getRegSlot(); + + // Open the bottom of the region using slot indexes. + if (isBottomClosed()) { + if (RequireIntervals) + static_cast<IntervalPressure&>(P).openBottom(SlotIdx); + else + static_cast<RegionPressure&>(P).openBottom(CurrPos); + } + + PhysRegOperands PhysRegOpers; + VirtRegOperands VirtRegOpers; + collectOperands(CurrPos, PhysRegOpers, VirtRegOpers, TRI, RCI); + + // Kill liveness at last uses. + for (unsigned i = 0, e = PhysRegOpers.Uses.size(); i < e; ++i) { + unsigned Reg = PhysRegOpers.Uses[i]; + if (!hasRegAlias(Reg, LivePhysRegs, TRI)) + discoverPhysLiveIn(Reg); + else { + // Allocatable physregs are always single-use before regalloc. + decreasePhysRegPressure(Reg); + LivePhysRegs.erase(Reg); + } + } + for (unsigned i = 0, e = VirtRegOpers.Uses.size(); i < e; ++i) { + unsigned Reg = VirtRegOpers.Uses[i]; + if (RequireIntervals) { + const LiveInterval *LI = &LIS->getInterval(Reg); + if (LI->killedAt(SlotIdx)) { + if (LiveVirtRegs.erase(Reg)) + decreaseVirtRegPressure(Reg); + else + discoverVirtLiveIn(Reg); + } + } + else if (!LiveVirtRegs.count(Reg)) { + discoverVirtLiveIn(Reg); + increaseVirtRegPressure(Reg); + } + } + + // Generate liveness for defs. 
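One idiom worth spelling out before the def loops that follow: the paired increase/decrease over DeadDefs, used in recede() above and again near the end of advance(), looks like a no-op but deliberately records a transient spike in the high water mark only. In miniature:

static void spikeMaxOnly(unsigned &Curr, unsigned &Max, unsigned Weight) {
  Curr += Weight;
  if (Curr > Max)
    Max = Curr;   // the transient spike is captured here
  Curr -= Weight; // current pressure is rolled back unchanged
}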
+ for (unsigned i = 0, e = PhysRegOpers.Defs.size(); i < e; ++i) { + unsigned Reg = PhysRegOpers.Defs[i]; + if (!hasRegAlias(Reg, LivePhysRegs, TRI)) { + increasePhysRegPressure(Reg); + LivePhysRegs.insert(Reg); + } + } + for (unsigned i = 0, e = VirtRegOpers.Defs.size(); i < e; ++i) { + unsigned Reg = VirtRegOpers.Defs[i]; + if (LiveVirtRegs.insert(Reg).second) + increaseVirtRegPressure(Reg); + } + + // Boost pressure for all dead defs together. + increasePhysRegPressure(PhysRegOpers.DeadDefs); + increaseVirtRegPressure(VirtRegOpers.DeadDefs); + decreasePhysRegPressure(PhysRegOpers.DeadDefs); + decreaseVirtRegPressure(VirtRegOpers.DeadDefs); + + // Find the next instruction. + do + ++CurrPos; + while (CurrPos != MBB->end() && CurrPos->isDebugValue()); + return true; +} + +/// Find the max change in excess pressure across all sets. +static void computeExcessPressureDelta(ArrayRef<unsigned> OldPressureVec, + ArrayRef<unsigned> NewPressureVec, + RegPressureDelta &Delta, + const TargetRegisterInfo *TRI) { + int ExcessUnits = 0; + unsigned PSetID = ~0U; + for (unsigned i = 0, e = OldPressureVec.size(); i < e; ++i) { + unsigned POld = OldPressureVec[i]; + unsigned PNew = NewPressureVec[i]; + int PDiff = (int)PNew - (int)POld; + if (!PDiff) // No change in this set in the common case. + continue; + // Only consider change beyond the limit. + unsigned Limit = TRI->getRegPressureSetLimit(i); + if (Limit > POld) { + if (Limit > PNew) + PDiff = 0; // Under the limit + else + PDiff = PNew - Limit; // Just exceeded limit. + } + else if (Limit > PNew) + PDiff = Limit - POld; // Just obeyed limit. + + if (std::abs(PDiff) > std::abs(ExcessUnits)) { + ExcessUnits = PDiff; + PSetID = i; + } + } + Delta.Excess.PSetID = PSetID; + Delta.Excess.UnitIncrease = ExcessUnits; +} + +/// Find the max change in max pressure that either surpasses a critical PSet +/// limit or exceeds the current MaxPressureLimit. +/// +/// FIXME: comparing each element of the old and new MaxPressure vectors here is +/// silly. It's done now to demonstrate the concept but will go away with a +/// RegPressureTracker API change to work with pressure differences. +static void computeMaxPressureDelta(ArrayRef<unsigned> OldMaxPressureVec, + ArrayRef<unsigned> NewMaxPressureVec, + ArrayRef<PressureElement> CriticalPSets, + ArrayRef<unsigned> MaxPressureLimit, + RegPressureDelta &Delta) { + Delta.CriticalMax = PressureElement(); + Delta.CurrentMax = PressureElement(); + + unsigned CritIdx = 0, CritEnd = CriticalPSets.size(); + for (unsigned i = 0, e = OldMaxPressureVec.size(); i < e; ++i) { + unsigned POld = OldMaxPressureVec[i]; + unsigned PNew = NewMaxPressureVec[i]; + if (PNew == POld) // No change in this set in the common case. + continue; + + while (CritIdx != CritEnd && CriticalPSets[CritIdx].PSetID < i) + ++CritIdx; + + if (CritIdx != CritEnd && CriticalPSets[CritIdx].PSetID == i) { + int PDiff = (int)PNew - (int)CriticalPSets[CritIdx].UnitIncrease; + if (PDiff > Delta.CriticalMax.UnitIncrease) { + Delta.CriticalMax.PSetID = i; + Delta.CriticalMax.UnitIncrease = PDiff; + } + } + + // Find the greatest increase above MaxPressureLimit. + // (Ignores negative MDiff). + int MDiff = (int)PNew - (int)MaxPressureLimit[i]; + if (MDiff > Delta.CurrentMax.UnitIncrease) { + Delta.CurrentMax.PSetID = i; + Delta.CurrentMax.UnitIncrease = PNew; + } + } +} + +/// Record the upward impact of a single instruction on current register +/// pressure. Unlike the advance/recede pressure tracking interface, this does +/// not discover live in/outs. 
+///
+/// This is intended for speculative queries. It leaves pressure inconsistent
+/// with the current position, so must be restored by the caller.
+void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) {
+  // Account for register pressure similar to RegPressureTracker::recede().
+  PhysRegOperands PhysRegOpers;
+  VirtRegOperands VirtRegOpers;
+  collectOperands(MI, PhysRegOpers, VirtRegOpers, TRI, RCI);
+
+  // Boost max pressure for all dead defs together. Since CurrSetPressure is
+  // restored by the matching decreases below, only the MaxSetPressure high
+  // water mark records the dead defs.
+  increasePhysRegPressure(PhysRegOpers.DeadDefs);
+  increaseVirtRegPressure(VirtRegOpers.DeadDefs);
+  decreasePhysRegPressure(PhysRegOpers.DeadDefs);
+  decreaseVirtRegPressure(VirtRegOpers.DeadDefs);
+
+  // Kill liveness at live defs.
+  decreasePhysRegPressure(PhysRegOpers.Defs);
+  decreaseVirtRegPressure(VirtRegOpers.Defs);
+
+  // Generate liveness for uses.
+  for (unsigned i = 0, e = PhysRegOpers.Uses.size(); i < e; ++i) {
+    unsigned Reg = PhysRegOpers.Uses[i];
+    if (!hasRegAlias(Reg, LivePhysRegs, TRI))
+      increasePhysRegPressure(Reg);
+  }
+  for (unsigned i = 0, e = VirtRegOpers.Uses.size(); i < e; ++i) {
+    unsigned Reg = VirtRegOpers.Uses[i];
+    if (!LiveVirtRegs.count(Reg))
+      increaseVirtRegPressure(Reg);
+  }
+}
+
+/// Consider the pressure increase caused by traversing this instruction
+/// bottom-up. Find the pressure set with the most change beyond its pressure
+/// limit based on the tracker's current pressure, and return the change in
+/// number of register units of that pressure set introduced by this
+/// instruction.
+///
+/// This assumes that the current LiveOut set is sufficient.
+///
+/// FIXME: This is expensive for an on-the-fly query. We need to cache the
+/// result per-SUnit with enough information to adjust for the current
+/// scheduling position. But this works as a proof of concept.
+void RegPressureTracker::
+getMaxUpwardPressureDelta(const MachineInstr *MI, RegPressureDelta &Delta,
+                          ArrayRef<PressureElement> CriticalPSets,
+                          ArrayRef<unsigned> MaxPressureLimit) {
+  // Snapshot Pressure.
+  // FIXME: The snapshot heap space should persist. But I'm planning to
+  // summarize the pressure effect so we don't need to snapshot at all.
+  std::vector<unsigned> SavedPressure = CurrSetPressure;
+  std::vector<unsigned> SavedMaxPressure = P.MaxSetPressure;
+
+  bumpUpwardPressure(MI);
+
+  computeExcessPressureDelta(SavedPressure, CurrSetPressure, Delta, TRI);
+  computeMaxPressureDelta(SavedMaxPressure, P.MaxSetPressure, CriticalPSets,
+                          MaxPressureLimit, Delta);
+  assert(Delta.CriticalMax.UnitIncrease >= 0 &&
+         Delta.CurrentMax.UnitIncrease >= 0 && "cannot decrease max pressure");
+
+  // Restore the tracker's state.
+  P.MaxSetPressure.swap(SavedMaxPressure);
+  CurrSetPressure.swap(SavedPressure);
+}
+
+/// Helper to find a vreg use between two indices [PriorUseIdx, NextUseIdx).
+static bool findUseBetween(unsigned Reg,
+                           SlotIndex PriorUseIdx, SlotIndex NextUseIdx,
+                           const MachineRegisterInfo *MRI,
+                           const LiveIntervals *LIS) {
+  for (MachineRegisterInfo::use_nodbg_iterator
+         UI = MRI->use_nodbg_begin(Reg), UE = MRI->use_nodbg_end();
+       UI != UE; UI.skipInstruction()) {
+    const MachineInstr* MI = &*UI;
+    SlotIndex InstSlot = LIS->getInstructionIndex(MI).getRegSlot();
+    if (InstSlot >= PriorUseIdx && InstSlot < NextUseIdx)
+      return true;
+  }
+  return false;
+}
+
+/// Record the downward impact of a single instruction on current register
+/// pressure. Unlike the advance/recede pressure tracking interface, this does
+/// not discover live in/outs.
+/// +/// This is intended for speculative queries. It leaves pressure inconsistent +/// with the current position, so must be restored by the caller. +void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) { + // Account for register pressure similar to RegPressureTracker::recede(). + PhysRegOperands PhysRegOpers; + VirtRegOperands VirtRegOpers; + collectOperands(MI, PhysRegOpers, VirtRegOpers, TRI, RCI); + + // Kill liveness at last uses. Assume allocatable physregs are single-use + // rather than checking LiveIntervals. + decreasePhysRegPressure(PhysRegOpers.Uses); + if (RequireIntervals) { + SlotIndex SlotIdx = LIS->getInstructionIndex(MI).getRegSlot(); + for (unsigned i = 0, e = VirtRegOpers.Uses.size(); i < e; ++i) { + unsigned Reg = VirtRegOpers.Uses[i]; + const LiveInterval *LI = &LIS->getInterval(Reg); + // FIXME: allow the caller to pass in the list of vreg uses that remain to + // be bottom-scheduled to avoid searching uses at each query. + SlotIndex CurrIdx = LIS->getInstructionIndex(CurrPos).getRegSlot(); + if (LI->killedAt(SlotIdx) + && !findUseBetween(Reg, CurrIdx, SlotIdx, MRI, LIS)) { + decreaseVirtRegPressure(Reg); + } + } + } + + // Generate liveness for defs. + increasePhysRegPressure(PhysRegOpers.Defs); + increaseVirtRegPressure(VirtRegOpers.Defs); + + // Boost pressure for all dead defs together. + increasePhysRegPressure(PhysRegOpers.DeadDefs); + increaseVirtRegPressure(VirtRegOpers.DeadDefs); + decreasePhysRegPressure(PhysRegOpers.DeadDefs); + decreaseVirtRegPressure(VirtRegOpers.DeadDefs); +} + +/// Consider the pressure increase caused by traversing this instruction +/// top-down. Find the register class with the most change in its pressure limit +/// based on the tracker's current pressure, and return the number of excess +/// register units of that pressure set introduced by this instruction. +/// +/// This assumes that the current LiveIn set is sufficient. +void RegPressureTracker:: +getMaxDownwardPressureDelta(const MachineInstr *MI, RegPressureDelta &Delta, + ArrayRef<PressureElement> CriticalPSets, + ArrayRef<unsigned> MaxPressureLimit) { + // Snapshot Pressure. + std::vector<unsigned> SavedPressure = CurrSetPressure; + std::vector<unsigned> SavedMaxPressure = P.MaxSetPressure; + + bumpDownwardPressure(MI); + + computeExcessPressureDelta(SavedPressure, CurrSetPressure, Delta, TRI); + computeMaxPressureDelta(SavedMaxPressure, P.MaxSetPressure, CriticalPSets, + MaxPressureLimit, Delta); + assert(Delta.CriticalMax.UnitIncrease >= 0 && + Delta.CurrentMax.UnitIncrease >= 0 && "cannot decrease max pressure"); + + // Restore the tracker's state. + P.MaxSetPressure.swap(SavedMaxPressure); + CurrSetPressure.swap(SavedPressure); +} + +/// Get the pressure of each PSet after traversing this instruction bottom-up. +void RegPressureTracker:: +getUpwardPressure(const MachineInstr *MI, + std::vector<unsigned> &PressureResult, + std::vector<unsigned> &MaxPressureResult) { + // Snapshot pressure. + PressureResult = CurrSetPressure; + MaxPressureResult = P.MaxSetPressure; + + bumpUpwardPressure(MI); + + // Current pressure becomes the result. Restore current pressure. + P.MaxSetPressure.swap(MaxPressureResult); + CurrSetPressure.swap(PressureResult); +} + +/// Get the pressure of each PSet after traversing this instruction top-down. +void RegPressureTracker:: +getDownwardPressure(const MachineInstr *MI, + std::vector<unsigned> &PressureResult, + std::vector<unsigned> &MaxPressureResult) { + // Snapshot pressure. 
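The snapshot/bump/swap pattern completed just below, and shared by the pressure query methods above, in miniature: copy the state, mutate it speculatively, then swap so the caller receives the mutated values while the tracker gets its original state back without another copy. A hedged sketch:

#include <vector>

static std::vector<unsigned> queryAfterBump(std::vector<unsigned> &State) {
  std::vector<unsigned> Result = State; // snapshot (assumes State non-empty)
  State[0] += 1;                        // stand-in for bumpDownwardPressure()
  Result.swap(State);                   // Result: bumped, State: restored
  return Result;
}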
+ PressureResult = CurrSetPressure; + MaxPressureResult = P.MaxSetPressure; + + bumpDownwardPressure(MI); + + // Current pressure becomes the result. Restore current pressure. + P.MaxSetPressure.swap(MaxPressureResult); + CurrSetPressure.swap(PressureResult); +} diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp index 03bd82e..d673794 100644 --- a/lib/CodeGen/RegisterScavenging.cpp +++ b/lib/CodeGen/RegisterScavenging.cpp @@ -37,16 +37,13 @@ using namespace llvm; void RegScavenger::setUsed(unsigned Reg) { RegsAvailable.reset(Reg); - for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg); - unsigned SubReg = *SubRegs; ++SubRegs) - RegsAvailable.reset(SubReg); + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + RegsAvailable.reset(*SubRegs); } bool RegScavenger::isAliasUsed(unsigned Reg) const { - if (isUsed(Reg)) - return true; - for (const uint16_t *R = TRI->getAliasSet(Reg); *R; ++R) - if (isUsed(*R)) + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + if (isUsed(*AI)) return true; return false; } @@ -114,8 +111,8 @@ void RegScavenger::enterBasicBlock(MachineBasicBlock *mbb) { void RegScavenger::addRegWithSubRegs(BitVector &BV, unsigned Reg) { BV.set(Reg); - for (const uint16_t *R = TRI->getSubRegisters(Reg); *R; R++) - BV.set(*R); + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + BV.set(*SubRegs); } void RegScavenger::forward() { @@ -195,9 +192,8 @@ void RegScavenger::forward() { // Ideally we would like a way to model this, but leaving the // insert_subreg around causes both correctness and performance issues. bool SubUsed = false; - for (const uint16_t *SubRegs = TRI->getSubRegisters(Reg); - unsigned SubReg = *SubRegs; ++SubRegs) - if (isUsed(SubReg)) { + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + if (isUsed(*SubRegs)) { SubUsed = true; break; } @@ -296,9 +292,8 @@ unsigned RegScavenger::findSurvivorReg(MachineBasicBlock::iterator StartMI, isVirtKillInsn = true; continue; } - Candidates.reset(MO.getReg()); - for (const uint16_t *R = TRI->getAliasSet(MO.getReg()); *R; R++) - Candidates.reset(*R); + for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid(); ++AI) + Candidates.reset(*AI); } // If we're not in a virtual reg's live range, this is a valid // restore point. diff --git a/lib/CodeGen/RenderMachineFunction.cpp b/lib/CodeGen/RenderMachineFunction.cpp deleted file mode 100644 index 6020908..0000000 --- a/lib/CodeGen/RenderMachineFunction.cpp +++ /dev/null @@ -1,1013 +0,0 @@ -//===-- llvm/CodeGen/RenderMachineFunction.cpp - MF->HTML -----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "rendermf" - -#include "RenderMachineFunction.h" - -#include "VirtRegMap.h" - -#include "llvm/Function.h" -#include "llvm/Module.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" - -#include <sstream> - -using namespace llvm; - -char RenderMachineFunction::ID = 0; -INITIALIZE_PASS_BEGIN(RenderMachineFunction, "rendermf", - "Render machine functions (and related info) to HTML pages", - false, false) -INITIALIZE_PASS_DEPENDENCY(SlotIndexes) -INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_END(RenderMachineFunction, "rendermf", - "Render machine functions (and related info) to HTML pages", - false, false) - -static cl::opt<std::string> -outputFileSuffix("rmf-file-suffix", - cl::desc("Appended to function name to get output file name " - "(default: \".html\")"), - cl::init(".html"), cl::Hidden); - -static cl::opt<std::string> -machineFuncsToRender("rmf-funcs", - cl::desc("Comma separated list of functions to render" - ", or \"*\"."), - cl::init(""), cl::Hidden); - -static cl::opt<std::string> -pressureClasses("rmf-classes", - cl::desc("Register classes to render pressure for."), - cl::init(""), cl::Hidden); - -static cl::opt<std::string> -showIntervals("rmf-intervals", - cl::desc("Live intervals to show alongside code."), - cl::init(""), cl::Hidden); - -static cl::opt<bool> -filterEmpty("rmf-filter-empty-intervals", - cl::desc("Don't display empty intervals."), - cl::init(true), cl::Hidden); - -static cl::opt<bool> -showEmptyIndexes("rmf-empty-indexes", - cl::desc("Render indexes not associated with instructions or " - "MBB starts."), - cl::init(false), cl::Hidden); - -static cl::opt<bool> -useFancyVerticals("rmf-fancy-verts", - cl::desc("Use SVG for vertical text."), - cl::init(true), cl::Hidden); - -static cl::opt<bool> -prettyHTML("rmf-pretty-html", - cl::desc("Pretty print HTML. 
For debugging the renderer only.."), - cl::init(false), cl::Hidden); - - -namespace llvm { - - bool MFRenderingOptions::renderingOptionsProcessed; - std::set<std::string> MFRenderingOptions::mfNamesToRender; - bool MFRenderingOptions::renderAllMFs = false; - - std::set<std::string> MFRenderingOptions::classNamesToRender; - bool MFRenderingOptions::renderAllClasses = false; - - std::set<std::pair<unsigned, unsigned> > - MFRenderingOptions::intervalNumsToRender; - unsigned MFRenderingOptions::intervalTypesToRender = ExplicitOnly; - - template <typename OutputItr> - void MFRenderingOptions::splitComaSeperatedList(const std::string &s, - OutputItr outItr) { - std::string::const_iterator curPos = s.begin(); - std::string::const_iterator nextComa = std::find(curPos, s.end(), ','); - while (nextComa != s.end()) { - std::string elem; - std::copy(curPos, nextComa, std::back_inserter(elem)); - *outItr = elem; - ++outItr; - curPos = llvm::next(nextComa); - nextComa = std::find(curPos, s.end(), ','); - } - - if (curPos != s.end()) { - std::string elem; - std::copy(curPos, s.end(), std::back_inserter(elem)); - *outItr = elem; - ++outItr; - } - } - - void MFRenderingOptions::processOptions() { - if (!renderingOptionsProcessed) { - processFuncNames(); - processRegClassNames(); - processIntervalNumbers(); - renderingOptionsProcessed = true; - } - } - - void MFRenderingOptions::processFuncNames() { - if (machineFuncsToRender == "*") { - renderAllMFs = true; - } else { - splitComaSeperatedList(machineFuncsToRender, - std::inserter(mfNamesToRender, - mfNamesToRender.begin())); - } - } - - void MFRenderingOptions::processRegClassNames() { - if (pressureClasses == "*") { - renderAllClasses = true; - } else { - splitComaSeperatedList(pressureClasses, - std::inserter(classNamesToRender, - classNamesToRender.begin())); - } - } - - void MFRenderingOptions::processIntervalNumbers() { - std::set<std::string> intervalRanges; - splitComaSeperatedList(showIntervals, - std::inserter(intervalRanges, - intervalRanges.begin())); - std::for_each(intervalRanges.begin(), intervalRanges.end(), - processIntervalRange); - } - - void MFRenderingOptions::processIntervalRange( - const std::string &intervalRangeStr) { - if (intervalRangeStr == "*") { - intervalTypesToRender |= All; - } else if (intervalRangeStr == "virt-nospills*") { - intervalTypesToRender |= VirtNoSpills; - } else if (intervalRangeStr == "spills*") { - intervalTypesToRender |= VirtSpills; - } else if (intervalRangeStr == "virt*") { - intervalTypesToRender |= AllVirt; - } else if (intervalRangeStr == "phys*") { - intervalTypesToRender |= AllPhys; - } else { - std::istringstream iss(intervalRangeStr); - unsigned reg1, reg2; - if ((iss >> reg1 >> std::ws)) { - if (iss.eof()) { - intervalNumsToRender.insert(std::make_pair(reg1, reg1 + 1)); - } else { - char c; - iss >> c; - if (c == '-' && (iss >> reg2)) { - intervalNumsToRender.insert(std::make_pair(reg1, reg2 + 1)); - } else { - dbgs() << "Warning: Invalid interval range \"" - << intervalRangeStr << "\" in -rmf-intervals. Skipping.\n"; - } - } - } else { - dbgs() << "Warning: Invalid interval number \"" - << intervalRangeStr << "\" in -rmf-intervals. 
Skipping.\n"; - } - } - } - - void MFRenderingOptions::setup(MachineFunction *mf, - const TargetRegisterInfo *tri, - LiveIntervals *lis, - const RenderMachineFunction *rmf) { - this->mf = mf; - this->tri = tri; - this->lis = lis; - this->rmf = rmf; - - clear(); - } - - void MFRenderingOptions::clear() { - regClassesTranslatedToCurrentFunction = false; - regClassSet.clear(); - - intervalsTranslatedToCurrentFunction = false; - intervalSet.clear(); - } - - void MFRenderingOptions::resetRenderSpecificOptions() { - intervalSet.clear(); - intervalsTranslatedToCurrentFunction = false; - } - - bool MFRenderingOptions::shouldRenderCurrentMachineFunction() const { - processOptions(); - - return (renderAllMFs || - mfNamesToRender.find(mf->getFunction()->getName()) != - mfNamesToRender.end()); - } - - const MFRenderingOptions::RegClassSet& MFRenderingOptions::regClasses() const{ - translateRegClassNamesToCurrentFunction(); - return regClassSet; - } - - const MFRenderingOptions::IntervalSet& MFRenderingOptions::intervals() const { - translateIntervalNumbersToCurrentFunction(); - return intervalSet; - } - - bool MFRenderingOptions::renderEmptyIndexes() const { - return showEmptyIndexes; - } - - bool MFRenderingOptions::fancyVerticals() const { - return useFancyVerticals; - } - - void MFRenderingOptions::translateRegClassNamesToCurrentFunction() const { - if (!regClassesTranslatedToCurrentFunction) { - processOptions(); - for (TargetRegisterInfo::regclass_iterator rcItr = tri->regclass_begin(), - rcEnd = tri->regclass_end(); - rcItr != rcEnd; ++rcItr) { - const TargetRegisterClass *trc = *rcItr; - if (renderAllClasses || - classNamesToRender.find(trc->getName()) != - classNamesToRender.end()) { - regClassSet.insert(trc); - } - } - regClassesTranslatedToCurrentFunction = true; - } - } - - void MFRenderingOptions::translateIntervalNumbersToCurrentFunction() const { - if (!intervalsTranslatedToCurrentFunction) { - processOptions(); - - // If we're not just doing explicit then do a copy over all matching - // types. - if (intervalTypesToRender != ExplicitOnly) { - for (LiveIntervals::iterator liItr = lis->begin(), liEnd = lis->end(); - liItr != liEnd; ++liItr) { - LiveInterval *li = liItr->second; - - if (filterEmpty && li->empty()) - continue; - - if ((TargetRegisterInfo::isPhysicalRegister(li->reg) && - (intervalTypesToRender & AllPhys))) { - intervalSet.insert(li); - } else if (TargetRegisterInfo::isVirtualRegister(li->reg)) { - if (((intervalTypesToRender & VirtNoSpills) && !rmf->isSpill(li)) || - ((intervalTypesToRender & VirtSpills) && rmf->isSpill(li))) { - intervalSet.insert(li); - } - } - } - } - - // If we need to process the explicit list... 
- if (intervalTypesToRender != All) { - for (std::set<std::pair<unsigned, unsigned> >::const_iterator - regRangeItr = intervalNumsToRender.begin(), - regRangeEnd = intervalNumsToRender.end(); - regRangeItr != regRangeEnd; ++regRangeItr) { - const std::pair<unsigned, unsigned> &range = *regRangeItr; - for (unsigned reg = range.first; reg != range.second; ++reg) { - if (lis->hasInterval(reg)) { - intervalSet.insert(&lis->getInterval(reg)); - } - } - } - } - - intervalsTranslatedToCurrentFunction = true; - } - } - - // ---------- TargetRegisterExtraInformation implementation ---------- - - TargetRegisterExtraInfo::TargetRegisterExtraInfo() - : mapsPopulated(false) { - } - - void TargetRegisterExtraInfo::setup(MachineFunction *mf, - MachineRegisterInfo *mri, - const TargetRegisterInfo *tri, - LiveIntervals *lis) { - this->mf = mf; - this->mri = mri; - this->tri = tri; - this->lis = lis; - } - - void TargetRegisterExtraInfo::reset() { - if (!mapsPopulated) { - initWorst(); - //initBounds(); - initCapacity(); - mapsPopulated = true; - } - - resetPressureAndLiveStates(); - } - - void TargetRegisterExtraInfo::clear() { - prWorst.clear(); - vrWorst.clear(); - capacityMap.clear(); - pressureMap.clear(); - //liveStatesMap.clear(); - mapsPopulated = false; - } - - void TargetRegisterExtraInfo::initWorst() { - assert(!mapsPopulated && prWorst.empty() && vrWorst.empty() && - "Worst map already initialised?"); - - // Start with the physical registers. - for (unsigned preg = 1; preg < tri->getNumRegs(); ++preg) { - WorstMapLine &pregLine = prWorst[preg]; - - for (TargetRegisterInfo::regclass_iterator rcItr = tri->regclass_begin(), - rcEnd = tri->regclass_end(); - rcItr != rcEnd; ++rcItr) { - const TargetRegisterClass *trc = *rcItr; - - unsigned numOverlaps = 0; - for (TargetRegisterClass::iterator rItr = trc->begin(), - rEnd = trc->end(); - rItr != rEnd; ++rItr) { - unsigned trcPReg = *rItr; - if (tri->regsOverlap(preg, trcPReg)) - ++numOverlaps; - } - - pregLine[trc] = numOverlaps; - } - } - - // Now the register classes. 
- for (TargetRegisterInfo::regclass_iterator rc1Itr = tri->regclass_begin(), - rcEnd = tri->regclass_end(); - rc1Itr != rcEnd; ++rc1Itr) { - const TargetRegisterClass *trc1 = *rc1Itr; - WorstMapLine &classLine = vrWorst[trc1]; - - for (TargetRegisterInfo::regclass_iterator rc2Itr = tri->regclass_begin(); - rc2Itr != rcEnd; ++rc2Itr) { - const TargetRegisterClass *trc2 = *rc2Itr; - - unsigned worst = 0; - - for (TargetRegisterClass::iterator trc1Itr = trc1->begin(), - trc1End = trc1->end(); - trc1Itr != trc1End; ++trc1Itr) { - unsigned trc1Reg = *trc1Itr; - unsigned trc1RegWorst = 0; - - for (TargetRegisterClass::iterator trc2Itr = trc2->begin(), - trc2End = trc2->end(); - trc2Itr != trc2End; ++trc2Itr) { - unsigned trc2Reg = *trc2Itr; - if (tri->regsOverlap(trc1Reg, trc2Reg)) - ++trc1RegWorst; - } - if (trc1RegWorst > worst) { - worst = trc1RegWorst; - } - } - - if (worst != 0) { - classLine[trc2] = worst; - } - } - } - } - - unsigned TargetRegisterExtraInfo::getWorst( - unsigned reg, - const TargetRegisterClass *trc) const { - const WorstMapLine *wml = 0; - if (TargetRegisterInfo::isPhysicalRegister(reg)) { - PRWorstMap::const_iterator prwItr = prWorst.find(reg); - assert(prwItr != prWorst.end() && "Missing prWorst entry."); - wml = &prwItr->second; - } else { - const TargetRegisterClass *regTRC = mri->getRegClass(reg); - VRWorstMap::const_iterator vrwItr = vrWorst.find(regTRC); - assert(vrwItr != vrWorst.end() && "Missing vrWorst entry."); - wml = &vrwItr->second; - } - - WorstMapLine::const_iterator wmlItr = wml->find(trc); - if (wmlItr == wml->end()) - return 0; - - return wmlItr->second; - } - - void TargetRegisterExtraInfo::initCapacity() { - assert(!mapsPopulated && capacityMap.empty() && - "Capacity map already initialised?"); - - for (TargetRegisterInfo::regclass_iterator rcItr = tri->regclass_begin(), - rcEnd = tri->regclass_end(); - rcItr != rcEnd; ++rcItr) { - const TargetRegisterClass *trc = *rcItr; - unsigned capacity = trc->getRawAllocationOrder(*mf).size(); - - if (capacity != 0) - capacityMap[trc] = capacity; - } - } - - unsigned TargetRegisterExtraInfo::getCapacity( - const TargetRegisterClass *trc) const { - CapacityMap::const_iterator cmItr = capacityMap.find(trc); - assert(cmItr != capacityMap.end() && - "vreg with unallocable register class"); - return cmItr->second; - } - - void TargetRegisterExtraInfo::resetPressureAndLiveStates() { - pressureMap.clear(); - //liveStatesMap.clear(); - - // Iterate over all slots. - - - // Iterate over all live intervals. - for (LiveIntervals::iterator liItr = lis->begin(), - liEnd = lis->end(); - liItr != liEnd; ++liItr) { - LiveInterval *li = liItr->second; - - if (TargetRegisterInfo::isPhysicalRegister(li->reg)) - continue; - - // For all ranges in the current interal. - for (LiveInterval::iterator lrItr = li->begin(), - lrEnd = li->end(); - lrItr != lrEnd; ++lrItr) { - LiveRange *lr = &*lrItr; - - // For all slots in the current range. - for (SlotIndex i = lr->start; i != lr->end; i = i.getNextSlot()) { - - // Record increased pressure at index for all overlapping classes. 
- for (TargetRegisterInfo::regclass_iterator - rcItr = tri->regclass_begin(), - rcEnd = tri->regclass_end(); - rcItr != rcEnd; ++rcItr) { - const TargetRegisterClass *trc = *rcItr; - - if (trc->getRawAllocationOrder(*mf).empty()) - continue; - - unsigned worstAtI = getWorst(li->reg, trc); - - if (worstAtI != 0) { - pressureMap[i][trc] += worstAtI; - } - } - } - } - } - } - - unsigned TargetRegisterExtraInfo::getPressureAtSlot( - const TargetRegisterClass *trc, - SlotIndex i) const { - PressureMap::const_iterator pmItr = pressureMap.find(i); - if (pmItr == pressureMap.end()) - return 0; - const PressureMapLine &pmLine = pmItr->second; - PressureMapLine::const_iterator pmlItr = pmLine.find(trc); - if (pmlItr == pmLine.end()) - return 0; - return pmlItr->second; - } - - bool TargetRegisterExtraInfo::classOverCapacityAtSlot( - const TargetRegisterClass *trc, - SlotIndex i) const { - return (getPressureAtSlot(trc, i) > getCapacity(trc)); - } - - // ---------- MachineFunctionRenderer implementation ---------- - - void RenderMachineFunction::Spacer::print(raw_ostream &os) const { - if (!prettyHTML) - return; - for (unsigned i = 0; i < ns; ++i) { - os << " "; - } - } - - RenderMachineFunction::Spacer RenderMachineFunction::s(unsigned ns) const { - return Spacer(ns); - } - - raw_ostream& operator<<(raw_ostream &os, const RenderMachineFunction::Spacer &s) { - s.print(os); - return os; - } - - template <typename Iterator> - std::string RenderMachineFunction::escapeChars(Iterator sBegin, Iterator sEnd) const { - std::string r; - - for (Iterator sItr = sBegin; sItr != sEnd; ++sItr) { - char c = *sItr; - - switch (c) { - case '<': r.append("<"); break; - case '>': r.append(">"); break; - case '&': r.append("&"); break; - case ' ': r.append(" "); break; - case '\"': r.append("""); break; - default: r.push_back(c); break; - } - } - - return r; - } - - RenderMachineFunction::LiveState - RenderMachineFunction::getLiveStateAt(const LiveInterval *li, - SlotIndex i) const { - const MachineInstr *mi = sis->getInstructionFromIndex(i); - - // For uses/defs recorded use/def indexes override current liveness and - // instruction operands (Only for the interval which records the indexes). - // FIXME: This is all wrong, uses and defs share the same slots. - if (i.isEarlyClobber() || i.isRegister()) { - UseDefs::const_iterator udItr = useDefs.find(li); - if (udItr != useDefs.end()) { - const SlotSet &slotSet = udItr->second; - if (slotSet.count(i)) { - if (i.isEarlyClobber()) { - return Used; - } - // else - return Defined; - } - } - } - - // If the slot is a load/store, or there's no info in the use/def set then - // use liveness and instruction operand info. - if (li->liveAt(i)) { - - if (mi == 0) { - if (vrm == 0 || - (vrm->getStackSlot(li->reg) == VirtRegMap::NO_STACK_SLOT)) { - return AliveReg; - } else { - return AliveStack; - } - } else { - if (i.isRegister() && mi->definesRegister(li->reg, tri)) { - return Defined; - } else if (i.isEarlyClobber() && mi->readsRegister(li->reg)) { - return Used; - } else { - if (vrm == 0 || - (vrm->getStackSlot(li->reg) == VirtRegMap::NO_STACK_SLOT)) { - return AliveReg; - } else { - return AliveStack; - } - } - } - } - return Dead; - } - - RenderMachineFunction::PressureState - RenderMachineFunction::getPressureStateAt(const TargetRegisterClass *trc, - SlotIndex i) const { - if (trei.getPressureAtSlot(trc, i) == 0) { - return Zero; - } else if (trei.classOverCapacityAtSlot(trc, i)){ - return High; - } - return Low; - } - - /// \brief Render a machine instruction. 
-  /// \brief Render a machine instruction.
-  void RenderMachineFunction::renderMachineInstr(raw_ostream &os,
-                                                 const MachineInstr *mi) const {
-    std::string s;
-    raw_string_ostream oss(s);
-    oss << *mi;
-
-    os << escapeChars(oss.str());
-  }
-
-  template <typename T>
-  void RenderMachineFunction::renderVertical(const Spacer &indent,
-                                             raw_ostream &os,
-                                             const T &t) const {
-    if (ro.fancyVerticals()) {
-      os << indent << "<object\n"
-         << indent + s(2) << "class=\"obj\"\n"
-         << indent + s(2) << "type=\"image/svg+xml\"\n"
-         << indent + s(2) << "width=\"14px\"\n"
-         << indent + s(2) << "height=\"55px\"\n"
-         << indent + s(2) << "data=\"data:image/svg+xml,\n"
-         << indent + s(4) << "<svg xmlns='http://www.w3.org/2000/svg'>\n"
-         << indent + s(6) << "<text x='-55' y='10' "
-                             "font-family='Courier' font-size='12' "
-                             "transform='rotate(-90)' "
-                             "text-rendering='optimizeSpeed' "
-                             "fill='#000'>" << t << "</text>\n"
-         << indent + s(4) << "</svg>\">\n"
-         << indent << "</object>\n";
-    } else {
-      std::ostringstream oss;
-      oss << t;
-      std::string tStr(oss.str());
-
-      os << indent;
-      for (std::string::iterator tStrItr = tStr.begin(), tStrEnd = tStr.end();
-           tStrItr != tStrEnd; ++tStrItr) {
-        os << *tStrItr << "<br/>";
-      }
-      os << "\n";
-    }
-  }
-
-  void RenderMachineFunction::insertCSS(const Spacer &indent,
-                                        raw_ostream &os) const {
-    os << indent << "<style type=\"text/css\">\n"
-       << indent + s(2) << "body { font-color: black; }\n"
-       << indent + s(2) << "table.code td { font-family: monospace; "
-                           "border-width: 0px; border-style: solid; "
-                           "border-bottom: 1px solid #dddddd; white-space: nowrap; }\n"
-       << indent + s(2) << "table.code td.p-z { background-color: #000000; }\n"
-       << indent + s(2) << "table.code td.p-l { background-color: #00ff00; }\n"
-       << indent + s(2) << "table.code td.p-h { background-color: #ff0000; }\n"
-       << indent + s(2) << "table.code td.l-n { background-color: #ffffff; }\n"
-       << indent + s(2) << "table.code td.l-d { background-color: #ff0000; }\n"
-       << indent + s(2) << "table.code td.l-u { background-color: #ffff00; }\n"
-       << indent + s(2) << "table.code td.l-r { background-color: #000000; }\n"
-       << indent + s(2) << "table.code td.l-s { background-color: #770000; }\n"
-       << indent + s(2) << "table.code th { border-width: 0px; "
-                           "border-style: solid; }\n"
-       << indent << "</style>\n";
-  }
-
-  void RenderMachineFunction::renderFunctionSummary(
-                                    const Spacer &indent, raw_ostream &os,
-                                    const char * const renderContextStr) const {
-    os << indent << "<h1>Function: " << mf->getFunction()->getName()
-       << "</h1>\n"
-       << indent << "<h2>Rendering context: " << renderContextStr << "</h2>\n";
-  }
-
-
-  void RenderMachineFunction::renderPressureTableLegend(
-                                                      const Spacer &indent,
-                                                      raw_ostream &os) const {
-    os << indent << "<h2>Rendering Pressure Legend:</h2>\n"
-       << indent << "<table class=\"code\">\n"
-       << indent + s(2) << "<tr>\n"
-       << indent + s(4) << "<th>Pressure</th><th>Description</th>"
-                           "<th>Appearance</th>\n"
-       << indent + s(2) << "</tr>\n"
-       << indent + s(2) << "<tr>\n"
-       << indent + s(4) << "<td>No Pressure</td>"
-                           "<td>No physical registers of this class requested.</td>"
-                           "<td class=\"p-z\">&nbsp;</td>\n"
-       << indent + s(2) << "</tr>\n"
-       << indent + s(2) << "<tr>\n"
-       << indent + s(4) << "<td>Low Pressure</td>"
-                           "<td>Sufficient physical registers to meet demand.</td>"
-                           "<td class=\"p-l\">&nbsp;</td>\n"
-       << indent + s(2) << "</tr>\n"
-       << indent + s(2) << "<tr>\n"
-       << indent + s(4) << "<td>High Pressure</td>"
-                           "<td>Potentially insufficient physical registers to meet demand.</td>"
-                           "<td class=\"p-h\">&nbsp;</td>\n"
s(2) << "</tr>\n" - << indent << "</table>\n"; - } - - template <typename CellType> - void RenderMachineFunction::renderCellsWithRLE( - const Spacer &indent, raw_ostream &os, - const std::pair<CellType, unsigned> &rleAccumulator, - const std::map<CellType, std::string> &cellTypeStrs) const { - - if (rleAccumulator.second == 0) - return; - - typename std::map<CellType, std::string>::const_iterator ctsItr = - cellTypeStrs.find(rleAccumulator.first); - - assert(ctsItr != cellTypeStrs.end() && "No string for given cell type."); - - os << indent + s(4) << "<td class=\"" << ctsItr->second << "\""; - if (rleAccumulator.second > 1) - os << " colspan=" << rleAccumulator.second; - os << "></td>\n"; - } - - - void RenderMachineFunction::renderCodeTablePlusPI(const Spacer &indent, - raw_ostream &os) const { - - std::map<LiveState, std::string> lsStrs; - lsStrs[Dead] = "l-n"; - lsStrs[Defined] = "l-d"; - lsStrs[Used] = "l-u"; - lsStrs[AliveReg] = "l-r"; - lsStrs[AliveStack] = "l-s"; - - std::map<PressureState, std::string> psStrs; - psStrs[Zero] = "p-z"; - psStrs[Low] = "p-l"; - psStrs[High] = "p-h"; - - // Open the table... - - os << indent << "<table cellpadding=0 cellspacing=0 class=\"code\">\n" - << indent + s(2) << "<tr>\n"; - - // Render the header row... - - os << indent + s(4) << "<th>index</th>\n" - << indent + s(4) << "<th>instr</th>\n"; - - // Render class names if necessary... - if (!ro.regClasses().empty()) { - for (MFRenderingOptions::RegClassSet::const_iterator - rcItr = ro.regClasses().begin(), - rcEnd = ro.regClasses().end(); - rcItr != rcEnd; ++rcItr) { - const TargetRegisterClass *trc = *rcItr; - os << indent + s(4) << "<th>\n"; - renderVertical(indent + s(6), os, trc->getName()); - os << indent + s(4) << "</th>\n"; - } - } - - // FIXME: Is there a nicer way to insert space between columns in HTML? - if (!ro.regClasses().empty() && !ro.intervals().empty()) - os << indent + s(4) << "<th> </th>\n"; - - // Render interval numbers if necessary... - if (!ro.intervals().empty()) { - for (MFRenderingOptions::IntervalSet::const_iterator - liItr = ro.intervals().begin(), - liEnd = ro.intervals().end(); - liItr != liEnd; ++liItr) { - - const LiveInterval *li = *liItr; - os << indent + s(4) << "<th>\n"; - renderVertical(indent + s(6), os, li->reg); - os << indent + s(4) << "</th>\n"; - } - } - - os << indent + s(2) << "</tr>\n"; - - // End header row, start with the data rows... - - MachineInstr *mi = 0; - - // Data rows: - for (SlotIndex i = sis->getZeroIndex(); i != sis->getLastIndex(); - i = i.getNextSlot()) { - - // Render the slot column. - os << indent + s(2) << "<tr height=6ex>\n"; - - // Render the code column. - if (i.isBlock()) { - MachineBasicBlock *mbb = sis->getMBBFromIndex(i); - mi = sis->getInstructionFromIndex(i); - - if (i == sis->getMBBStartIdx(mbb) || mi != 0 || - ro.renderEmptyIndexes()) { - os << indent + s(4) << "<td rowspan=4>" << i << " </td>\n" - << indent + s(4) << "<td rowspan=4>\n"; - - if (i == sis->getMBBStartIdx(mbb)) { - os << indent + s(6) << "BB#" << mbb->getNumber() << ": \n"; - } else if (mi != 0) { - os << indent + s(6) << " "; - renderMachineInstr(os, mi); - } else { - // Empty interval - leave blank. - } - os << indent + s(4) << "</td>\n"; - } else { - i = i.getDeadSlot(); // <- Will be incremented to the next index. - continue; - } - } - - // Render the class columns. 
-      // Render the class columns.
-      if (!ro.regClasses().empty()) {
-        std::pair<PressureState, unsigned> psRLEAccumulator(Zero, 0);
-        for (MFRenderingOptions::RegClassSet::const_iterator
-               rcItr = ro.regClasses().begin(),
-               rcEnd = ro.regClasses().end();
-             rcItr != rcEnd; ++rcItr) {
-          const TargetRegisterClass *trc = *rcItr;
-          PressureState newPressure = getPressureStateAt(trc, i);
-
-          if (newPressure == psRLEAccumulator.first) {
-            ++psRLEAccumulator.second;
-          } else {
-            renderCellsWithRLE(indent + s(4), os, psRLEAccumulator, psStrs);
-            psRLEAccumulator.first = newPressure;
-            psRLEAccumulator.second = 1;
-          }
-        }
-        renderCellsWithRLE(indent + s(4), os, psRLEAccumulator, psStrs);
-      }
-
-      // FIXME: Is there a nicer way to insert space between columns in HTML?
-      if (!ro.regClasses().empty() && !ro.intervals().empty())
-        os << indent + s(4) << "<td width=2em></td>\n";
-
-      if (!ro.intervals().empty()) {
-        std::pair<LiveState, unsigned> lsRLEAccumulator(Dead, 0);
-        for (MFRenderingOptions::IntervalSet::const_iterator
-               liItr = ro.intervals().begin(),
-               liEnd = ro.intervals().end();
-             liItr != liEnd; ++liItr) {
-          const LiveInterval *li = *liItr;
-          LiveState newLiveness = getLiveStateAt(li, i);
-
-          if (newLiveness == lsRLEAccumulator.first) {
-            ++lsRLEAccumulator.second;
-          } else {
-            renderCellsWithRLE(indent + s(4), os, lsRLEAccumulator, lsStrs);
-            lsRLEAccumulator.first = newLiveness;
-            lsRLEAccumulator.second = 1;
-          }
-        }
-        renderCellsWithRLE(indent + s(4), os, lsRLEAccumulator, lsStrs);
-      }
-      os << indent + s(2) << "</tr>\n";
-    }
-
-    os << indent << "</table>\n";
-
-    if (!ro.regClasses().empty())
-      renderPressureTableLegend(indent, os);
-  }
-
-  void RenderMachineFunction::renderFunctionPage(
-                                    raw_ostream &os,
-                                    const char * const renderContextStr) const {
-    os << "<html>\n"
-       << s(2) << "<head>\n"
-       << s(4) << "<title>" << fqn << "</title>\n";
-
-    insertCSS(s(4), os);
-
-    os << s(2) << "</head>\n"
-       << s(2) << "<body>\n";
-
-    renderFunctionSummary(s(4), os, renderContextStr);
-
-    os << s(4) << "<br/><br/><br/>\n";
-
-    //renderLiveIntervalInfoTable(" ", os);
-
-    os << s(4) << "<br/><br/><br/>\n";
-
-    renderCodeTablePlusPI(s(4), os);
-
-    os << s(2) << "</body>\n"
-       << "</html>\n";
-  }
-
-  void RenderMachineFunction::getAnalysisUsage(AnalysisUsage &au) const {
-    au.addRequired<SlotIndexes>();
-    au.addRequired<LiveIntervals>();
-    au.setPreservesAll();
-    MachineFunctionPass::getAnalysisUsage(au);
-  }
-
-  bool RenderMachineFunction::runOnMachineFunction(MachineFunction &fn) {
-
-    mf = &fn;
-    mri = &mf->getRegInfo();
-    tri = mf->getTarget().getRegisterInfo();
-    lis = &getAnalysis<LiveIntervals>();
-    sis = &getAnalysis<SlotIndexes>();
-
-    trei.setup(mf, mri, tri, lis);
-    ro.setup(mf, tri, lis, this);
-    spillIntervals.clear();
-    spillFor.clear();
-    useDefs.clear();
-
-    fqn = mf->getFunction()->getParent()->getModuleIdentifier() + "."
+ - mf->getFunction()->getName().str(); - - return false; - } - - void RenderMachineFunction::releaseMemory() { - trei.clear(); - ro.clear(); - spillIntervals.clear(); - spillFor.clear(); - useDefs.clear(); - } - - void RenderMachineFunction::rememberUseDefs(const LiveInterval *li) { - - if (!ro.shouldRenderCurrentMachineFunction()) - return; - - for (MachineRegisterInfo::reg_iterator rItr = mri->reg_begin(li->reg), - rEnd = mri->reg_end(); - rItr != rEnd; ++rItr) { - const MachineInstr *mi = &*rItr; - if (mi->readsRegister(li->reg)) { - useDefs[li].insert(lis->getInstructionIndex(mi).getRegSlot(true)); - } - if (mi->definesRegister(li->reg)) { - useDefs[li].insert(lis->getInstructionIndex(mi).getRegSlot()); - } - } - } - - void RenderMachineFunction::rememberSpills( - const LiveInterval *li, - const std::vector<LiveInterval*> &spills) { - - if (!ro.shouldRenderCurrentMachineFunction()) - return; - - for (std::vector<LiveInterval*>::const_iterator siItr = spills.begin(), - siEnd = spills.end(); - siItr != siEnd; ++siItr) { - const LiveInterval *spill = *siItr; - spillIntervals[li].insert(spill); - spillFor[spill] = li; - } - } - - bool RenderMachineFunction::isSpill(const LiveInterval *li) const { - SpillForMap::const_iterator sfItr = spillFor.find(li); - if (sfItr == spillFor.end()) - return false; - return true; - } - - void RenderMachineFunction::renderMachineFunction( - const char *renderContextStr, - const VirtRegMap *vrm, - const char *renderSuffix) { - if (!ro.shouldRenderCurrentMachineFunction()) - return; - - this->vrm = vrm; - trei.reset(); - - std::string rpFileName(mf->getFunction()->getName().str() + - (renderSuffix ? renderSuffix : "") + - outputFileSuffix); - - std::string errMsg; - raw_fd_ostream outFile(rpFileName.c_str(), errMsg, raw_fd_ostream::F_Binary); - - renderFunctionPage(outFile, renderContextStr); - - ro.resetRenderSpecificOptions(); - } - - std::string RenderMachineFunction::escapeChars(const std::string &s) const { - return escapeChars(s.begin(), s.end()); - } - -} diff --git a/lib/CodeGen/RenderMachineFunction.h b/lib/CodeGen/RenderMachineFunction.h deleted file mode 100644 index 8571992..0000000 --- a/lib/CodeGen/RenderMachineFunction.h +++ /dev/null @@ -1,338 +0,0 @@ -//===-- llvm/CodeGen/RenderMachineFunction.h - MF->HTML -*- C++ -*---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_CODEGEN_RENDERMACHINEFUNCTION_H -#define LLVM_CODEGEN_RENDERMACHINEFUNCTION_H - -#include "llvm/CodeGen/LiveInterval.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/SlotIndexes.h" -#include "llvm/Target/TargetRegisterInfo.h" - -#include <algorithm> -#include <map> -#include <set> -#include <string> - -namespace llvm { - - class LiveInterval; - class LiveIntervals; - class MachineInstr; - class MachineRegisterInfo; - class RenderMachineFunction; - class TargetRegisterClass; - class TargetRegisterInfo; - class VirtRegMap; - class raw_ostream; - - /// \brief Helper class to process rendering options. Tries to be as lazy as - /// possible. 
- class MFRenderingOptions { - public: - - struct RegClassComp { - bool operator()(const TargetRegisterClass *trc1, - const TargetRegisterClass *trc2) const { - std::string trc1Name(trc1->getName()), trc2Name(trc2->getName()); - return std::lexicographical_compare(trc1Name.begin(), trc1Name.end(), - trc2Name.begin(), trc2Name.end()); - } - }; - - typedef std::set<const TargetRegisterClass*, RegClassComp> RegClassSet; - - struct IntervalComp { - bool operator()(const LiveInterval *li1, const LiveInterval *li2) const { - return li1->reg < li2->reg; - } - }; - - typedef std::set<const LiveInterval*, IntervalComp> IntervalSet; - - /// Initialise the rendering options. - void setup(MachineFunction *mf, const TargetRegisterInfo *tri, - LiveIntervals *lis, const RenderMachineFunction *rmf); - - /// Clear translations of options to the current function. - void clear(); - - /// Reset any options computed for this specific rendering. - void resetRenderSpecificOptions(); - - /// Should we render the current function. - bool shouldRenderCurrentMachineFunction() const; - - /// Return the set of register classes to render pressure for. - const RegClassSet& regClasses() const; - - /// Return the set of live intervals to render liveness for. - const IntervalSet& intervals() const; - - /// Render indexes which are not associated with instructions / MBB starts. - bool renderEmptyIndexes() const; - - /// Return whether or not to render using SVG for fancy vertical text. - bool fancyVerticals() const; - - private: - - static bool renderingOptionsProcessed; - static std::set<std::string> mfNamesToRender; - static bool renderAllMFs; - - static std::set<std::string> classNamesToRender; - static bool renderAllClasses; - - - static std::set<std::pair<unsigned, unsigned> > intervalNumsToRender; - typedef enum { ExplicitOnly = 0, - AllPhys = 1, - VirtNoSpills = 2, - VirtSpills = 4, - AllVirt = 6, - All = 7 } - IntervalTypesToRender; - static unsigned intervalTypesToRender; - - template <typename OutputItr> - static void splitComaSeperatedList(const std::string &s, OutputItr outItr); - - static void processOptions(); - - static void processFuncNames(); - static void processRegClassNames(); - static void processIntervalNumbers(); - - static void processIntervalRange(const std::string &intervalRangeStr); - - MachineFunction *mf; - const TargetRegisterInfo *tri; - LiveIntervals *lis; - const RenderMachineFunction *rmf; - - mutable bool regClassesTranslatedToCurrentFunction; - mutable RegClassSet regClassSet; - - mutable bool intervalsTranslatedToCurrentFunction; - mutable IntervalSet intervalSet; - - void translateRegClassNamesToCurrentFunction() const; - - void translateIntervalNumbersToCurrentFunction() const; - }; - - /// \brief Provide extra information about the physical and virtual registers - /// in the function being compiled. - class TargetRegisterExtraInfo { - public: - TargetRegisterExtraInfo(); - - /// \brief Set up TargetRegisterExtraInfo with pointers to necessary - /// sources of information. - void setup(MachineFunction *mf, MachineRegisterInfo *mri, - const TargetRegisterInfo *tri, LiveIntervals *lis); - - /// \brief Recompute tables for changed function. - void reset(); - - /// \brief Free all tables in TargetRegisterExtraInfo. - void clear(); - - /// \brief Maximum number of registers from trc which alias reg. - unsigned getWorst(unsigned reg, const TargetRegisterClass *trc) const; - - /// \brief Returns the number of allocable registers in trc. 
-      unsigned getCapacity(const TargetRegisterClass *trc) const;
-
-      /// \brief Return the number of registers of class trc that may be
-      /// needed at slot i.
-      unsigned getPressureAtSlot(const TargetRegisterClass *trc,
-                                 SlotIndex i) const;
-
-      /// \brief Return true if the number of registers of type trc that may be
-      /// needed at slot i is greater than the capacity of trc.
-      bool classOverCapacityAtSlot(const TargetRegisterClass *trc,
-                                   SlotIndex i) const;
-
-    private:
-
-      MachineFunction *mf;
-      MachineRegisterInfo *mri;
-      const TargetRegisterInfo *tri;
-      LiveIntervals *lis;
-
-      typedef std::map<const TargetRegisterClass*, unsigned> WorstMapLine;
-      typedef std::map<const TargetRegisterClass*, WorstMapLine> VRWorstMap;
-      VRWorstMap vrWorst;
-
-      typedef std::map<unsigned, WorstMapLine> PRWorstMap;
-      PRWorstMap prWorst;
-
-      typedef std::map<const TargetRegisterClass*, unsigned> CapacityMap;
-      CapacityMap capacityMap;
-
-      typedef std::map<const TargetRegisterClass*, unsigned> PressureMapLine;
-      typedef std::map<SlotIndex, PressureMapLine> PressureMap;
-      PressureMap pressureMap;
-
-      bool mapsPopulated;
-
-      /// \brief Initialise the 'worst' table.
-      void initWorst();
-
-      /// \brief Initialise the 'capacity' table.
-      void initCapacity();
-
-      /// \brief Initialise/Reset the 'pressure' and live states tables.
-      void resetPressureAndLiveStates();
-    };
-
-    /// \brief Render MachineFunction objects and related information to an
-    /// HTML page.
-    class RenderMachineFunction : public MachineFunctionPass {
-    public:
-      static char ID;
-
-      RenderMachineFunction() : MachineFunctionPass(ID) {
-        initializeRenderMachineFunctionPass(*PassRegistry::getPassRegistry());
-      }
-
-      virtual void getAnalysisUsage(AnalysisUsage &au) const;
-
-      virtual bool runOnMachineFunction(MachineFunction &fn);
-
-      virtual void releaseMemory();
-
-      void rememberUseDefs(const LiveInterval *li);
-
-      void rememberSpills(const LiveInterval *li,
-                          const std::vector<LiveInterval*> &spills);
-
-      bool isSpill(const LiveInterval *li) const;
-
-      /// \brief Render this machine function to HTML.
-      ///
-      /// @param renderContextStr This parameter will be included in the top of
-      ///                         the html file to explain where (in the
-      ///                         codegen pipeline) this function was rendered
-      ///                         from. Set it to something like
-      ///                         "Pre-register-allocation".
-      /// @param vrm              If non-null the VRM will be queried to determine
-      ///                         whether a virtual register was allocated to a
-      ///                         physical register or spilled.
-      /// @param renderSuffix     This string will be appended to the function
-      ///                         name (before the output file suffix) to enable
-      ///                         multiple renderings from the same function.
-      void renderMachineFunction(const char *renderContextStr,
-                                 const VirtRegMap *vrm = 0,
-                                 const char *renderSuffix = 0);
-
-    private:
-      class Spacer;
-      friend raw_ostream& operator<<(raw_ostream &os, const Spacer &s);
-
-      std::string fqn;
-
-      MachineFunction *mf;
-      MachineRegisterInfo *mri;
-      const TargetRegisterInfo *tri;
-      LiveIntervals *lis;
-      SlotIndexes *sis;
-      const VirtRegMap *vrm;
-
-      TargetRegisterExtraInfo trei;
-      MFRenderingOptions ro;
-
-
-
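// A self-contained illustration of the comparator-ordered sets that
// MFRenderingOptions (declared earlier in this header) uses via RegClassComp
// and IntervalComp: ordering the rendered register classes by name keeps
// column order deterministic from run to run. This is only a sketch; the
// class names are made up and std::string stands in for TargetRegisterClass.
#include <iostream>
#include <set>
#include <string>

struct NameComp {
  bool operator()(const std::string *a, const std::string *b) const {
    return *a < *b; // lexicographic, like RegClassComp's name comparison
  }
};

int main() {
  std::string gpr = "GPR", fpr = "FPR", qpr = "QPR";
  // Insertion order is irrelevant; iteration order is always by name.
  std::set<const std::string *, NameComp> classes = {&qpr, &gpr, &fpr};
  for (const std::string *c : classes)
    std::cout << *c << "\n"; // FPR, GPR, QPR - a stable render order
}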
-      // Utilities.
-      typedef enum { Dead, Defined, Used, AliveReg, AliveStack } LiveState;
-      LiveState getLiveStateAt(const LiveInterval *li, SlotIndex i) const;
-
-      typedef enum { Zero, Low, High } PressureState;
-      PressureState getPressureStateAt(const TargetRegisterClass *trc,
-                                       SlotIndex i) const;
-
-      typedef std::map<const LiveInterval*, std::set<const LiveInterval*> >
-        SpillIntervals;
-      SpillIntervals spillIntervals;
-
-      typedef std::map<const LiveInterval*, const LiveInterval*> SpillForMap;
-      SpillForMap spillFor;
-
-      typedef std::set<SlotIndex> SlotSet;
-      typedef std::map<const LiveInterval*, SlotSet> UseDefs;
-      UseDefs useDefs;
-
-      // ---------- Rendering methods ----------
-
-      /// For inserting spaces when pretty printing.
-      class Spacer {
-      public:
-        explicit Spacer(unsigned numSpaces) : ns(numSpaces) {}
-        Spacer operator+(const Spacer &o) const { return Spacer(ns + o.ns); }
-        void print(raw_ostream &os) const;
-      private:
-        unsigned ns;
-      };
-
-      Spacer s(unsigned ns) const;
-
-      template <typename Iterator>
-      std::string escapeChars(Iterator sBegin, Iterator sEnd) const;
-
-      /// \brief Render a machine instruction.
-      void renderMachineInstr(raw_ostream &os,
-                              const MachineInstr *mi) const;
-
-      /// \brief Render vertical text.
-      template <typename T>
-      void renderVertical(const Spacer &indent,
-                          raw_ostream &os,
-                          const T &t) const;
-
-      /// \brief Insert CSS layout info.
-      void insertCSS(const Spacer &indent,
-                     raw_ostream &os) const;
-
-      /// \brief Render a brief summary of the function (including rendering
-      /// context).
-      void renderFunctionSummary(const Spacer &indent,
-                                 raw_ostream &os,
-                                 const char * const renderContextStr) const;
-
-      /// \brief Render a legend for the pressure table.
-      void renderPressureTableLegend(const Spacer &indent,
-                                     raw_ostream &os) const;
-
-      /// \brief Render a consecutive set of HTML cells of the same class using
-      /// the colspan attribute for run-length encoding.
-      template <typename CellType>
-      void renderCellsWithRLE(
-                     const Spacer &indent, raw_ostream &os,
-                     const std::pair<CellType, unsigned> &rleAccumulator,
-                     const std::map<CellType, std::string> &cellTypeStrs) const;
-
-      /// \brief Render code listing, potentially with register pressure
-      /// and live intervals shown alongside.
-      void renderCodeTablePlusPI(const Spacer &indent,
-                                 raw_ostream &os) const;
-
-      /// \brief Render the HTML page representing the MachineFunction.
-      void renderFunctionPage(raw_ostream &os,
-                              const char * const renderContextStr) const;
-
-      std::string escapeChars(const std::string &s) const;
-    };
-}
-
-#endif /* LLVM_CODEGEN_RENDERMACHINEFUNCTION_H */
diff --git a/lib/CodeGen/ScheduleDAG.cpp b/lib/CodeGen/ScheduleDAG.cpp
index 8fd6426..752f8e4 100644
--- a/lib/CodeGen/ScheduleDAG.cpp
+++ b/lib/CodeGen/ScheduleDAG.cpp
@@ -64,10 +64,27 @@ const MCInstrDesc *ScheduleDAG::getNodeDesc(const SDNode *Node) const {
 /// specified node.
 bool SUnit::addPred(const SDep &D) {
   // If this node already has this dependence, don't add a redundant one.
-  for (SmallVector<SDep, 4>::const_iterator I = Preds.begin(), E = Preds.end();
-       I != E; ++I)
-    if (*I == D)
+  for (SmallVector<SDep, 4>::iterator I = Preds.begin(), E = Preds.end();
+       I != E; ++I) {
+    if (I->overlaps(D)) {
+      // Extend the latency if needed. Equivalent to removePred(I) + addPred(D).
+      if (I->getLatency() < D.getLatency()) {
+        SUnit *PredSU = I->getSUnit();
+        // Find the corresponding successor in N.
+        SDep ForwardD = *I;
+        ForwardD.setSUnit(this);
+        for (SmallVector<SDep, 4>::iterator II = PredSU->Succs.begin(),
+               EE = PredSU->Succs.end(); II != EE; ++II) {
+          if (*II == ForwardD) {
+            II->setLatency(D.getLatency());
+            break;
+          }
+        }
+        I->setLatency(D.getLatency());
+      }
       return false;
+    }
+  }
 
   // Now add a corresponding succ to N.
   SDep P = D;
   P.setSUnit(this);
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index d46eb89..110f478 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -21,17 +21,24 @@
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/RegisterPressure.h"
 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
 #include "llvm/MC/MCInstrItineraries.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
 using namespace llvm;
 
+static cl::opt<bool> EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,
+    cl::ZeroOrMore, cl::init(false),
+    cl::desc("Enable use of AA during MI DAG construction"));
+
 ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
                                      const MachineLoopInfo &mli,
                                      const MachineDominatorTree &mdt,
@@ -40,7 +47,7 @@ ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
   : ScheduleDAG(mf), MLI(mli), MDT(mdt), MFI(mf.getFrameInfo()),
     InstrItins(mf.getTarget().getInstrItineraryData()), LIS(lis),
     IsPostRA(IsPostRAFlag), UnitLatencies(false), CanHandleTerminators(false),
-    LoopRegs(MLI, MDT), FirstDbgValue(0) {
+    LoopRegs(MDT), FirstDbgValue(0) {
   assert((IsPostRA || LIS) && "PreRA scheduling requires LiveIntervals");
   DbgValues.clear();
   assert(!(IsPostRA && MRI.getNumVirtRegs()) &&
@@ -126,7 +133,8 @@ static const Value *getUnderlyingObjectForInstr(const MachineInstr *MI,
   return 0;
 }
 
-void ScheduleDAGInstrs::startBlock(MachineBasicBlock *BB) {
+void ScheduleDAGInstrs::startBlock(MachineBasicBlock *bb) {
+  BB = bb;
   LoopRegs.Deps.clear();
   if (MachineLoop *ML = MLI.getLoopFor(BB))
     if (BB == ML->getLoopLatch())
@@ -134,7 +142,8 @@ void ScheduleDAGInstrs::startBlock(MachineBasicBlock *BB) {
 }
 
 void ScheduleDAGInstrs::finishBlock() {
-  // Nothing to do.
+  // Subclasses should no longer refer to the old block.
+  BB = 0;
 }
 
 /// Initialize the map with the number of registers.
@@ -159,7 +168,7 @@ void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb,
                                     MachineBasicBlock::iterator begin,
                                     MachineBasicBlock::iterator end,
                                     unsigned endcount) {
-  BB = bb;
+  assert(bb == BB && "startBlock should set BB");
   RegionBegin = begin;
   RegionEnd = end;
   EndIndex = endcount;
@@ -232,7 +241,8 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU,
   unsigned SpecialAddressLatency = ST.getSpecialAddressLatency();
   unsigned DataLatency = SU->Latency;
 
-  for (const uint16_t *Alias = TRI->getOverlaps(MO.getReg()); *Alias; ++Alias) {
+  for (MCRegAliasIterator Alias(MO.getReg(), TRI, true);
+       Alias.isValid(); ++Alias) {
     if (!Uses.contains(*Alias))
       continue;
     std::vector<SUnit*> &UseList = Uses[*Alias];
@@ -261,10 +271,12 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU,
       // Adjust the dependence latency using operand def/use
       // information (if any), and then allow the target to
      // perform its own adjustments.
- const SDep& dep = SDep(SU, SDep::Data, LDataLatency, *Alias); + SDep dep(SU, SDep::Data, LDataLatency, *Alias); if (!UnitLatencies) { - computeOperandLatency(SU, UseSU, const_cast<SDep &>(dep)); - ST.adjustSchedDependency(SU, UseSU, const_cast<SDep &>(dep)); + unsigned Latency = computeOperandLatency(SU, UseSU, dep); + dep.setLatency(Latency); + + ST.adjustSchedDependency(SU, UseSU, dep); } UseSU->addPred(dep); } @@ -285,7 +297,8 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) { // TODO: Using a latency of 1 here for output dependencies assumes // there's no cost for reusing registers. SDep::Kind Kind = MO.isUse() ? SDep::Anti : SDep::Output; - for (const uint16_t *Alias = TRI->getOverlaps(MO.getReg()); *Alias; ++Alias) { + for (MCRegAliasIterator Alias(MO.getReg(), TRI, true); + Alias.isValid(); ++Alias) { if (!Defs.contains(*Alias)) continue; std::vector<SUnit *> &DefList = Defs[*Alias]; @@ -400,8 +413,10 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) { // SSA defs do not have output/anti dependencies. // The current operand is a def, so we have at least one. - if (llvm::next(MRI.def_begin(Reg)) == MRI.def_end()) - return; + // + // FIXME: This optimization is disabled pending PR13112. + //if (llvm::next(MRI.def_begin(Reg)) == MRI.def_end()) + // return; // Add output dependence to the next nearest def of this vreg. // @@ -410,7 +425,7 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) { // uses. We're conservative for now until we have a way to guarantee the uses // are not eliminated sometime during scheduling. The output dependence edge // is also useful if output latency exceeds def-use latency. - VReg2SUnitMap::iterator DefI = findVRegDef(Reg); + VReg2SUnitMap::iterator DefI = VRegDefs.find(Reg); if (DefI == VRegDefs.end()) VRegDefs.insert(VReg2SUnit(Reg, SU)); else { @@ -436,10 +451,11 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) { // Lookup this operand's reaching definition. assert(LIS && "vreg dependencies requires LiveIntervals"); - SlotIndex UseIdx = LIS->getInstructionIndex(MI).getRegSlot(); - LiveInterval *LI = &LIS->getInterval(Reg); - VNInfo *VNI = LI->getVNInfoBefore(UseIdx); + LiveRangeQuery LRQ(LIS->getInterval(Reg), LIS->getInstructionIndex(MI)); + VNInfo *VNI = LRQ.valueIn(); + // VNI will be valid because MachineOperand::readsReg() is checked by caller. + assert(VNI && "No value to read by operand"); MachineInstr *Def = LIS->getInstructionFromIndex(VNI->def); // Phis and other noninstructions (after coalescing) have a NULL Def. if (Def) { @@ -449,11 +465,13 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) { // Create a data dependence. // // TODO: Handle "special" address latencies cleanly. - const SDep &dep = SDep(DefSU, SDep::Data, DefSU->Latency, Reg); + SDep dep(DefSU, SDep::Data, DefSU->Latency, Reg); if (!UnitLatencies) { // Adjust the dependence latency using operand def/use information, then // allow the target to perform its own adjustments. - computeOperandLatency(DefSU, SU, const_cast<SDep &>(dep)); + unsigned Latency = computeOperandLatency(DefSU, SU, const_cast<SDep &>(dep)); + dep.setLatency(Latency); + const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>(); ST.adjustSchedDependency(DefSU, SU, const_cast<SDep &>(dep)); } @@ -462,11 +480,217 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) { } // Add antidependence to the following def of the vreg it uses. 
-  VReg2SUnitMap::iterator DefI = findVRegDef(Reg);
+  VReg2SUnitMap::iterator DefI = VRegDefs.find(Reg);
   if (DefI != VRegDefs.end() && DefI->SU != SU)
     DefI->SU->addPred(SDep(SU, SDep::Anti, 0, Reg));
 }
 
+/// Return true if MI is an instruction we are unable to reason about
+/// (like a call or something with unmodeled side effects).
+static inline bool isGlobalMemoryObject(AliasAnalysis *AA, MachineInstr *MI) {
+  if (MI->isCall() || MI->hasUnmodeledSideEffects() ||
+      (MI->hasVolatileMemoryRef() &&
+       (!MI->mayLoad() || !MI->isInvariantLoad(AA))))
+    return true;
+  return false;
+}
+
+// This MI might have either incomplete info, or be known to be unsafe
+// to deal with (i.e. a volatile object).
+static inline bool isUnsafeMemoryObject(MachineInstr *MI,
+                                        const MachineFrameInfo *MFI) {
+  if (!MI || MI->memoperands_empty())
+    return true;
+  // We purposefully do not check for hasOneMemOperand() here
+  // in the hope of triggering an assert downstream in order to
+  // finish the implementation.
+  if ((*MI->memoperands_begin())->isVolatile() ||
+      MI->hasUnmodeledSideEffects())
+    return true;
+
+  const Value *V = (*MI->memoperands_begin())->getValue();
+  if (!V)
+    return true;
+
+  V = getUnderlyingObject(V);
+  if (const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V)) {
+    // Similarly to getUnderlyingObjectForInstr:
+    // For now, ignore PseudoSourceValues which may alias LLVM IR values
+    // because the code that uses this function has no way to cope with
+    // such aliases.
+    if (PSV->isAliased(MFI))
+      return true;
+  }
+  // Does this pointer refer to a distinct and identifiable object?
+  if (!isIdentifiedObject(V))
+    return true;
+
+  return false;
+}
+
+/// This returns true if the two MIs need a chain edge between them.
+/// If these are not even memory operations, we still may need
+/// chain deps between them. The question really is: could
+/// these two MIs be reordered during scheduling from a memory dependency
+/// point of view?
+static bool MIsNeedChainEdge(AliasAnalysis *AA, const MachineFrameInfo *MFI,
+                             MachineInstr *MIa,
+                             MachineInstr *MIb) {
+  // Cover a trivial case: no edge is needed from a node to itself.
+  if (MIa == MIb)
+    return false;
+
+  if (isUnsafeMemoryObject(MIa, MFI) || isUnsafeMemoryObject(MIb, MFI))
+    return true;
+
+  // If we are dealing with two "normal" loads, we do not need an edge
+  // between them; they could be reordered.
+  if (!MIa->mayStore() && !MIb->mayStore())
+    return false;
+
+  // To this point analysis is generic. From here on we do need AA.
+  if (!AA)
+    return true;
+
+  MachineMemOperand *MMOa = *MIa->memoperands_begin();
+  MachineMemOperand *MMOb = *MIb->memoperands_begin();
+
+  // FIXME: Need to handle multiple memory operands to support all targets.
+  if (!MIa->hasOneMemOperand() || !MIb->hasOneMemOperand())
+    llvm_unreachable("Multiple memory operands.");
+
+  // The following interface to AA is fashioned after DAGCombiner::isAlias
+  // and operates with MachineMemOperand offset with some important
+  // assumptions:
+  //   - LLVM fundamentally assumes flat address spaces.
+  //   - MachineOperand offset can *only* result from legalization and
+  //     cannot affect queries other than the trivial case of overlap
+  //     checking.
+  //   - These offsets never wrap and never step outside
+  //     of allocated objects.
+  //   - There should never be any negative offsets here.
+  //
+  // FIXME: Modify API to hide this math from "user"
+  // FIXME: Even before we go to AA we can reason locally about some
+  // memory objects. It can save compile time, and possibly catch some
+  // corner cases not currently covered.
+
+  assert ((MMOa->getOffset() >= 0) && "Negative MachineMemOperand offset");
+  assert ((MMOb->getOffset() >= 0) && "Negative MachineMemOperand offset");
+
+  int64_t MinOffset = std::min(MMOa->getOffset(), MMOb->getOffset());
+  int64_t Overlapa = MMOa->getSize() + MMOa->getOffset() - MinOffset;
+  int64_t Overlapb = MMOb->getSize() + MMOb->getOffset() - MinOffset;
+
+  AliasAnalysis::AliasResult AAResult = AA->alias(
+      AliasAnalysis::Location(MMOa->getValue(), Overlapa,
+                              MMOa->getTBAAInfo()),
+      AliasAnalysis::Location(MMOb->getValue(), Overlapb,
+                              MMOb->getTBAAInfo()));
+
+  return (AAResult != AliasAnalysis::NoAlias);
+}
+
+/// This recursive function iterates over chain deps of SUb looking for
+/// the "latest" node that needs a chain edge to SUa.
+static unsigned
+iterateChainSucc(AliasAnalysis *AA, const MachineFrameInfo *MFI,
+                 SUnit *SUa, SUnit *SUb, SUnit *ExitSU, unsigned *Depth,
+                 SmallPtrSet<const SUnit*, 16> &Visited) {
+  if (!SUa || !SUb || SUb == ExitSU)
+    return *Depth;
+
+  // Remember visited nodes.
+  if (!Visited.insert(SUb))
+    return *Depth;
+  // If there is _some_ dependency already in place, do not
+  // descend any further.
+  // TODO: Need to make sure that if that dependency got eliminated or ignored
+  // for any reason in the future, we would not violate DAG topology.
+  // Currently it does not happen, but makes an implicit assumption about
+  // future implementation.
+  //
+  // Independently, if we encounter a node that is some sort of global
+  // object (like a call) we already have a full set of dependencies to it
+  // and we can stop descending.
+  if (SUa->isSucc(SUb) ||
+      isGlobalMemoryObject(AA, SUb->getInstr()))
+    return *Depth;
+
+  // If we do need an edge, or we have exceeded the depth budget,
+  // add that edge to the predecessors chain of SUb,
+  // and stop descending.
+  if (*Depth > 200 ||
+      MIsNeedChainEdge(AA, MFI, SUa->getInstr(), SUb->getInstr())) {
+    SUb->addPred(SDep(SUa, SDep::Order, /*Latency=*/0, /*Reg=*/0,
+                      /*isNormalMemory=*/true));
+    return *Depth;
+  }
+  // Track current depth.
+  (*Depth)++;
+  // Iterate over chain dependencies only.
+  for (SUnit::const_succ_iterator I = SUb->Succs.begin(), E = SUb->Succs.end();
+       I != E; ++I)
+    if (I->isCtrl())
+      iterateChainSucc (AA, MFI, SUa, I->getSUnit(), ExitSU, Depth, Visited);
+  return *Depth;
+}
+
+/// This function assumes that "downward" from SU there exists a
+/// tail/leaf of the already constructed DAG. It iterates downward and
+/// checks whether SU may alias any node dominated
+/// by it.
+static void adjustChainDeps(AliasAnalysis *AA, const MachineFrameInfo *MFI,
+                            SUnit *SU, SUnit *ExitSU, std::set<SUnit *> &CheckList,
+                            unsigned LatencyToLoad) {
+  if (!SU)
+    return;
+
+  SmallPtrSet<const SUnit*, 16> Visited;
+  unsigned Depth = 0;
+
+  for (std::set<SUnit *>::iterator I = CheckList.begin(), IE = CheckList.end();
+       I != IE; ++I) {
+    if (SU == *I)
+      continue;
+    if (MIsNeedChainEdge(AA, MFI, SU->getInstr(), (*I)->getInstr())) {
+      unsigned Latency = ((*I)->getInstr()->mayLoad()) ? LatencyToLoad : 0;
+      (*I)->addPred(SDep(SU, SDep::Order, Latency, /*Reg=*/0,
+                         /*isNormalMemory=*/true));
+    }
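// A worked, self-contained example of the offset/size overlap arithmetic that
// MIsNeedChainEdge feeds to the alias query above. The two "memory operands"
// are hypothetical: a 4-byte access at offset 8 and an 8-byte access at
// offset 4 from the same underlying object.
#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  int64_t OffsetA = 8, SizeA = 4; // bytes [8, 12)
  int64_t OffsetB = 4, SizeB = 8; // bytes [4, 12)

  // Mirror the computation above: extend each access size so that both
  // locations are measured from the smaller of the two offsets.
  int64_t MinOffset = std::min(OffsetA, OffsetB);
  int64_t Overlapa = SizeA + OffsetA - MinOffset; // 4 + 8 - 4 = 8
  int64_t Overlapb = SizeB + OffsetB - MinOffset; // 8 + 4 - 4 = 8

  // Both queries now describe ranges that start at MinOffset, so a NoAlias
  // answer from the alias analysis also covers the original, offset accesses.
  std::cout << "MinOffset=" << MinOffset << " Overlapa=" << Overlapa
            << " Overlapb=" << Overlapb << "\n";
}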
+    // Now go through all the chain successors and iterate from them.
+    // Keep track of visited nodes.
+    for (SUnit::const_succ_iterator J = (*I)->Succs.begin(),
+         JE = (*I)->Succs.end(); J != JE; ++J)
+      if (J->isCtrl())
+        iterateChainSucc (AA, MFI, SU, J->getSUnit(),
+                          ExitSU, &Depth, Visited);
+  }
+}
+
+/// Check whether two objects need a chain edge; if so, add it,
+/// otherwise remember the rejected SU.
+static inline
+void addChainDependency (AliasAnalysis *AA, const MachineFrameInfo *MFI,
+                         SUnit *SUa, SUnit *SUb,
+                         std::set<SUnit *> &RejectList,
+                         unsigned TrueMemOrderLatency = 0,
+                         bool isNormalMemory = false) {
+  // If this is a false dependency,
+  // do not add the edge, but remember the rejected node.
+  if (!EnableAASchedMI ||
+      MIsNeedChainEdge(AA, MFI, SUa->getInstr(), SUb->getInstr()))
+    SUb->addPred(SDep(SUa, SDep::Order, TrueMemOrderLatency, /*Reg=*/0,
+                      isNormalMemory));
+  else {
+    // Duplicate entries should be ignored.
+    RejectList.insert(SUb);
+    DEBUG(dbgs() << "\tReject chain dep between SU("
+          << SUa->NodeNum << ") and SU("
+          << SUb->NodeNum << ")\n");
+  }
+}
+
 /// Create an SUnit for each real instruction, numbered in top-down topological
 /// order. The instruction order A < B implies that no edge exists from B to A.
 ///
@@ -502,7 +726,11 @@ void ScheduleDAGInstrs::initSUnits() {
   }
 }
 
-void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
+/// If RPTracker is non-null, compute register pressure as a side effect. The
+/// DAG builder is an efficient place to do it because it already visits
+/// operands.
+void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
+                                        RegPressureTracker *RPTracker) {
   // Create an SUnit for each real instruction.
   initSUnits();
 
@@ -518,6 +746,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
   // that are known not to alias
   std::map<const Value *, SUnit *> AliasMemDefs, NonAliasMemDefs;
   std::map<const Value *, std::vector<SUnit *> > AliasMemUses, NonAliasMemUses;
+  std::set<SUnit*> RejectMemNodes;
 
   // Remove any stale debug info; sometimes BuildSchedGraph is called again
   // without emitting the info from the previous call.
@@ -553,6 +782,10 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
       PrevMI = MI;
       continue;
     }
+    if (RPTracker) {
+      RPTracker->recede();
+      assert(RPTracker->getPos() == prior(MII) && "RPTracker can't find MI");
+    }
 
     assert((!MI->isTerminator() || CanHandleTerminators) && !MI->isLabel() &&
            "Cannot schedule terminators or labels!");
@@ -587,11 +820,8 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
     // after stack slots are lowered to actual addresses.
     // TODO: Use an AliasAnalysis and do real alias-analysis queries, and
     // produce more precise dependence information.
-#define STORE_LOAD_LATENCY 1
-    unsigned TrueMemOrderLatency = 0;
-    if (MI->isCall() || MI->hasUnmodeledSideEffects() ||
-        (MI->hasVolatileMemoryRef() &&
-         (!MI->mayLoad() || !MI->isInvariantLoad(AA)))) {
+    unsigned TrueMemOrderLatency = MI->mayStore() ? 1 : 0;
+    if (isGlobalMemoryObject(AA, MI)) {
       // Be conservative with these and add dependencies on all memory
       // references, even those that are known to not alias.
       for (std::map<const Value *, SUnit *>::iterator I =
             NonAliasMemDefs.begin(), E = NonAliasMemDefs.end(); I != E; ++I) {
         I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
       }
       for (std::map<const Value *, std::vector<SUnit *> >::iterator I =
            NonAliasMemUses.begin(), E = NonAliasMemUses.end(); I != E; ++I) {
         for (unsigned i = 0, e = I->second.size(); i != e; ++i)
           I->second[i]->addPred(SDep(SU, SDep::Order, TrueMemOrderLatency));
       }
-      NonAliasMemDefs.clear();
-      NonAliasMemUses.clear();
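// A compact, self-contained sketch of the reject-list pattern used by
// addChainDependency above: an edge is added only when the (here, mocked)
// alias check says two nodes may conflict; otherwise the node is remembered
// so that a later barrier-like node can still be wired to it by
// adjustChainDeps. Node numbers and the conflict rule are made up.
#include <iostream>
#include <set>
#include <utility>
#include <vector>

using Edge = std::pair<int, int>; // (from, to)

static bool mayConflict(int A, int B) {
  // Stand-in for MIsNeedChainEdge: pretend only same-parity nodes conflict.
  return (A % 2) == (B % 2);
}

int main() {
  std::vector<Edge> edges;
  std::set<int> rejected; // duplicate inserts are ignored, as with std::set
  int SUa = 4;
  for (int SUb : {1, 2, 3}) {
    if (mayConflict(SUa, SUb))
      edges.push_back({SUa, SUb}); // a chain edge is required
    else
      rejected.insert(SUb); // revisit these when a barrier is seen
  }
  std::cout << edges.size() << " edge(s), " << rejected.size()
            << " rejected node(s)\n"; // 1 edge (4->2), 2 rejected
}
// Remembering rejections instead of dropping them is what keeps the graph
// safe once a barrier clears the def/use maps, as the code below does.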
       // Add SU to the barrier chain.
       if (BarrierChain)
         BarrierChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
       BarrierChain = SU;
+      // This is a barrier event that acts as a pivotal node in the DAG,
+      // so it is safe to clear the list of exposed nodes.
+      adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes,
+                      TrueMemOrderLatency);
+      RejectMemNodes.clear();
+      NonAliasMemDefs.clear();
+      NonAliasMemUses.clear();
 
       // fall-through
     new_alias_chain:
       // Chain all possibly aliasing memory references through SU.
-      if (AliasChain)
-        AliasChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
+      if (AliasChain) {
+        unsigned ChainLatency = 0;
+        if (AliasChain->getInstr()->mayLoad())
+          ChainLatency = TrueMemOrderLatency;
+        addChainDependency(AA, MFI, SU, AliasChain, RejectMemNodes,
+                           ChainLatency);
+      }
       AliasChain = SU;
       for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k)
-        PendingLoads[k]->addPred(SDep(SU, SDep::Order, TrueMemOrderLatency));
+        addChainDependency(AA, MFI, SU, PendingLoads[k], RejectMemNodes,
+                           TrueMemOrderLatency);
       for (std::map<const Value *, SUnit *>::iterator I = AliasMemDefs.begin(),
-           E = AliasMemDefs.end(); I != E; ++I) {
-        I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
-      }
+           E = AliasMemDefs.end(); I != E; ++I)
+        addChainDependency(AA, MFI, SU, I->second, RejectMemNodes);
       for (std::map<const Value *, std::vector<SUnit *> >::iterator I =
            AliasMemUses.begin(), E = AliasMemUses.end(); I != E; ++I) {
         for (unsigned i = 0, e = I->second.size(); i != e; ++i)
-          I->second[i]->addPred(SDep(SU, SDep::Order, TrueMemOrderLatency));
+          addChainDependency(AA, MFI, SU, I->second[i], RejectMemNodes,
+                             TrueMemOrderLatency);
       }
+      adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes,
+                      TrueMemOrderLatency);
       PendingLoads.clear();
       AliasMemDefs.clear();
       AliasMemUses.clear();
     } else if (MI->mayStore()) {
       bool MayAlias = true;
-      TrueMemOrderLatency = STORE_LOAD_LATENCY;
       if (const Value *V = getUnderlyingObjectForInstr(MI, MFI, MayAlias)) {
         // A store to a specific PseudoSourceValue. Add precise dependencies.
         // Record the def in MemDefs, first adding a dep if there is
@@ -642,8 +884,8 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
         std::map<const Value *, SUnit *>::iterator IE =
           ((MayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
         if (I != IE) {
-          I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0, /*Reg=*/0,
-                                  /*isNormalMemory=*/true));
+          addChainDependency(AA, MFI, SU, I->second, RejectMemNodes,
+                             0, true);
           I->second = SU;
         } else {
           if (MayAlias)
@@ -658,20 +900,28 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
           ((MayAlias) ? AliasMemUses.end() : NonAliasMemUses.end());
         if (J != JE) {
           for (unsigned i = 0, e = J->second.size(); i != e; ++i)
-            J->second[i]->addPred(SDep(SU, SDep::Order, TrueMemOrderLatency,
-                                       /*Reg=*/0, /*isNormalMemory=*/true));
+            addChainDependency(AA, MFI, SU, J->second[i], RejectMemNodes,
+                               TrueMemOrderLatency, true);
           J->second.clear();
         }
         if (MayAlias) {
           // Add dependencies from all the PendingLoads, i.e. loads
           // with no underlying object.
           for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k)
-            PendingLoads[k]->addPred(SDep(SU, SDep::Order, TrueMemOrderLatency));
+            addChainDependency(AA, MFI, SU, PendingLoads[k], RejectMemNodes,
+                               TrueMemOrderLatency);
           // Add dependence on alias chain, if needed.
           if (AliasChain)
-            AliasChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
+            addChainDependency(AA, MFI, SU, AliasChain, RejectMemNodes);
+          // But we should also check dependent instructions for the
+          // SU in question.
+          adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes,
+                          TrueMemOrderLatency);
         }
         // Add dependence on barrier chain, if needed.
+        // There is no point in checking aliasing on a barrier event. Even if
+        // SU and the barrier _could_ be reordered, they should not. In
+        // addition, we have lost all RejectMemNodes below the barrier.
         if (BarrierChain)
           BarrierChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
       } else {
@@ -688,7 +938,6 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
                                  /*isArtificial=*/true));
     } else if (MI->mayLoad()) {
       bool MayAlias = true;
-      TrueMemOrderLatency = 0;
       if (MI->isInvariantLoad(AA)) {
         // Invariant load, no chain dependencies needed!
       } else {
@@ -700,8 +949,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
           std::map<const Value *, SUnit *>::iterator IE =
             ((MayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
           if (I != IE)
-            I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0, /*Reg=*/0,
-                                    /*isNormalMemory=*/true));
+            addChainDependency(AA, MFI, SU, I->second, RejectMemNodes, 0, true);
           if (MayAlias)
             AliasMemUses[V].push_back(SU);
           else
@@ -711,15 +959,16 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
           // potentially aliasing stores.
           for (std::map<const Value *, SUnit *>::iterator I =
                  AliasMemDefs.begin(), E = AliasMemDefs.end(); I != E; ++I)
-            I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
+            addChainDependency(AA, MFI, SU, I->second, RejectMemNodes);
 
           PendingLoads.push_back(SU);
           MayAlias = true;
         }
-
+        if (MayAlias)
+          adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes, /*Latency=*/0);
         // Add dependencies on alias and barrier chains, if needed.
         if (MayAlias && AliasChain)
-          AliasChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
+          addChainDependency(AA, MFI, SU, AliasChain, RejectMemNodes);
         if (BarrierChain)
           BarrierChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
       }
@@ -735,8 +984,9 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
 }
 
 void ScheduleDAGInstrs::computeLatency(SUnit *SU) {
-  // Compute the latency for the node.
-  if (!InstrItins || InstrItins->isEmpty()) {
+  // Compute the latency for the node. We only provide a default for missing
+  // itineraries. Empty itineraries still have latency properties.
+  if (!InstrItins) {
     SU->Latency = 1;
 
     // Simplistic target-independent heuristic: assume that loads take
@@ -748,63 +998,15 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) {
   }
 }
 
-void ScheduleDAGInstrs::computeOperandLatency(SUnit *Def, SUnit *Use,
-                                              SDep& dep) const {
-  if (!InstrItins || InstrItins->isEmpty())
-    return;
-
+unsigned ScheduleDAGInstrs::computeOperandLatency(SUnit *Def, SUnit *Use,
+                                                  const SDep& dep,
+                                                  bool FindMin) const {
   // For a data dependency with a known register...
   if ((dep.getKind() != SDep::Data) || (dep.getReg() == 0))
-    return;
-
-  const unsigned Reg = dep.getReg();
-
-  // ... find the definition of the register in the defining
-  // instruction
-  MachineInstr *DefMI = Def->getInstr();
-  int DefIdx = DefMI->findRegisterDefOperandIdx(Reg);
-  if (DefIdx != -1) {
-    const MachineOperand &MO = DefMI->getOperand(DefIdx);
-    if (MO.isReg() && MO.isImplicit() &&
-        DefIdx >= (int)DefMI->getDesc().getNumOperands()) {
-      // This is an implicit def, getOperandLatency() won't return the correct
-      // latency. e.g.
-      //   %D6<def>, %D7<def> = VLD1q16 %R2<kill>, 0, ..., %Q3<imp-def>
-      //   %Q1<def> = VMULv8i16 %Q1<kill>, %Q3<kill>, ...
-      // What we want is to compute latency between def of %D6/%D7 and use of
-      // %Q3 instead.
-      unsigned Op2 = DefMI->findRegisterDefOperandIdx(Reg, false, true, TRI);
-      if (DefMI->getOperand(Op2).isReg())
-        DefIdx = Op2;
-    }
-    MachineInstr *UseMI = Use->getInstr();
-    // For all uses of the register, calculate the maximum latency.
-    int Latency = -1;
-    if (UseMI) {
-      for (unsigned i = 0, e = UseMI->getNumOperands(); i != e; ++i) {
-        const MachineOperand &MO = UseMI->getOperand(i);
-        if (!MO.isReg() || !MO.isUse())
-          continue;
-        unsigned MOReg = MO.getReg();
-        if (MOReg != Reg)
-          continue;
-
-        int UseCycle = TII->getOperandLatency(InstrItins, DefMI, DefIdx,
-                                              UseMI, i);
-        Latency = std::max(Latency, UseCycle);
-      }
-    } else {
-      // If UseMI is null, then it must be a scheduling barrier.
-      if (!InstrItins || InstrItins->isEmpty())
-        return;
-      unsigned DefClass = DefMI->getDesc().getSchedClass();
-      Latency = InstrItins->getOperandCycle(DefClass, DefIdx);
-    }
+    return 1;
 
-    // If we found a latency, then replace the existing dependence latency.
-    if (Latency >= 0)
-      dep.setLatency(Latency);
-  }
+  return TII->computeOperandLatency(InstrItins, TRI, Def->getInstr(),
+                                    Use->getInstr(), dep.getReg(), FindMin);
 }
 
 void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const {
diff --git a/lib/CodeGen/ScoreboardHazardRecognizer.cpp b/lib/CodeGen/ScoreboardHazardRecognizer.cpp
index 3d22035..e675366 100644
--- a/lib/CodeGen/ScoreboardHazardRecognizer.cpp
+++ b/lib/CodeGen/ScoreboardHazardRecognizer.cpp
@@ -39,13 +39,11 @@ ScoreboardHazardRecognizer(const InstrItineraryData *II,
   DebugType = ParentDebugType;
 #endif
 
-  // Determine the maximum depth of any itinerary. This determines the
-  // depth of the scoreboard. We always make the scoreboard at least 1
-  // cycle deep to avoid dealing with the boundary condition.
+  // Determine the maximum depth of any itinerary. This determines the depth of
+  // the scoreboard. We always make the scoreboard at least 1 cycle deep to
+  // avoid dealing with the boundary condition.
   unsigned ScoreboardDepth = 1;
   if (ItinData && !ItinData->isEmpty()) {
-    IssueWidth = ItinData->IssueWidth;
-
     for (unsigned idx = 0; ; ++idx) {
       if (ItinData->isEndMarker(idx))
         break;
@@ -63,16 +61,26 @@ ScoreboardHazardRecognizer(const InstrItineraryData *II,
       // Find the next power-of-2 >= ItinDepth
       while (ItinDepth > ScoreboardDepth) {
         ScoreboardDepth *= 2;
+        // Don't set MaxLookAhead until we find at least one nonzero stage.
+        // This way, an itinerary with no stages has MaxLookAhead==0, which
+        // completely bypasses the scoreboard hazard logic.
+        MaxLookAhead = ScoreboardDepth;
       }
     }
-    MaxLookAhead = ScoreboardDepth;
   }
 
   ReservedScoreboard.reset(ScoreboardDepth);
   RequiredScoreboard.reset(ScoreboardDepth);
 
-  DEBUG(dbgs() << "Using scoreboard hazard recognizer: Depth = "
-        << ScoreboardDepth << '\n');
+  // If MaxLookAhead is not set above, then we are not enabled.
+  if (!isEnabled())
+    DEBUG(dbgs() << "Disabled scoreboard hazard recognizer\n");
+  else {
+    // A nonempty itinerary must have a SchedModel.
+ IssueWidth = ItinData->SchedModel->IssueWidth; + DEBUG(dbgs() << "Using scoreboard hazard recognizer: Depth = " + << ScoreboardDepth << '\n'); + } } void ScoreboardHazardRecognizer::Reset() { @@ -151,7 +159,7 @@ ScoreboardHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { } if (!freeUnits) { - DEBUG(dbgs() << "*** Hazard in cycle " << (cycle + i) << ", "); + DEBUG(dbgs() << "*** Hazard in cycle +" << StageCycle << ", "); DEBUG(dbgs() << "SU(" << SU->NodeNum << "): "); DEBUG(DAG->dumpNode(SU)); return Hazard; diff --git a/lib/CodeGen/SelectionDAG/CMakeLists.txt b/lib/CodeGen/SelectionDAG/CMakeLists.txt index a6bdc3b..75e8167 100644 --- a/lib/CodeGen/SelectionDAG/CMakeLists.txt +++ b/lib/CodeGen/SelectionDAG/CMakeLists.txt @@ -23,3 +23,5 @@ add_llvm_library(LLVMSelectionDAG TargetLowering.cpp TargetSelectionDAGInfo.cpp ) + +add_dependencies(LLVMSelectionDAG intrinsics_gen) diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 0914c66..747bc44 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -215,6 +215,7 @@ namespace { SDValue visitFADD(SDNode *N); SDValue visitFSUB(SDNode *N); SDValue visitFMUL(SDNode *N); + SDValue visitFMA(SDNode *N); SDValue visitFDIV(SDNode *N); SDValue visitFREM(SDNode *N); SDValue visitFCOPYSIGN(SDNode *N); @@ -328,15 +329,12 @@ namespace { class WorkListRemover : public SelectionDAG::DAGUpdateListener { DAGCombiner &DC; public: - explicit WorkListRemover(DAGCombiner &dc) : DC(dc) {} + explicit WorkListRemover(DAGCombiner &dc) + : SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {} virtual void NodeDeleted(SDNode *N, SDNode *E) { DC.removeFromWorkList(N); } - - virtual void NodeUpdated(SDNode *N) { - // Ignore updates. - } }; } @@ -619,8 +617,7 @@ SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo, N->getValueType(i) == To[i].getValueType()) && "Cannot combine value to value of different type!")); WorkListRemover DeadNodes(*this); - DAG.ReplaceAllUsesWith(N, To, &DeadNodes); - + DAG.ReplaceAllUsesWith(N, To); if (AddTo) { // Push the new nodes and any users onto the worklist for (unsigned i = 0, e = NumTo; i != e; ++i) { @@ -650,7 +647,7 @@ CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) { // Replace all uses. If any nodes become isomorphic to other nodes and // are deleted, make sure to remove them from our worklist. WorkListRemover DeadNodes(*this); - DAG.ReplaceAllUsesOfValueWith(TLO.Old, TLO.New, &DeadNodes); + DAG.ReplaceAllUsesOfValueWith(TLO.Old, TLO.New); // Push the new node and any (possibly new) users onto the worklist. 
AddToWorkList(TLO.New.getNode()); @@ -707,9 +704,8 @@ void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad) { Trunc.getNode()->dump(&DAG); dbgs() << '\n'); WorkListRemover DeadNodes(*this); - DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), Trunc, &DeadNodes); - DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), SDValue(ExtLoad, 1), - &DeadNodes); + DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), Trunc); + DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), SDValue(ExtLoad, 1)); removeFromWorkList(Load); DAG.DeleteNode(Load); AddToWorkList(Trunc.getNode()); @@ -961,8 +957,8 @@ bool DAGCombiner::PromoteLoad(SDValue Op) { Result.getNode()->dump(&DAG); dbgs() << '\n'); WorkListRemover DeadNodes(*this); - DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result, &DeadNodes); - DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1), &DeadNodes); + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1)); removeFromWorkList(N); DAG.DeleteNode(N); AddToWorkList(Result.getNode()); @@ -1047,12 +1043,12 @@ void DAGCombiner::Run(CombineLevel AtLevel) { DAG.TransferDbgValues(SDValue(N, 0), RV); WorkListRemover DeadNodes(*this); if (N->getNumValues() == RV.getNode()->getNumValues()) - DAG.ReplaceAllUsesWith(N, RV.getNode(), &DeadNodes); + DAG.ReplaceAllUsesWith(N, RV.getNode()); else { assert(N->getValueType(0) == RV.getValueType() && N->getNumValues() == 1 && "Type mismatch"); SDValue OpV = RV; - DAG.ReplaceAllUsesWith(N, &OpV, &DeadNodes); + DAG.ReplaceAllUsesWith(N, &OpV); } // Push the new node and any users onto the worklist @@ -1131,6 +1127,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::FADD: return visitFADD(N); case ISD::FSUB: return visitFSUB(N); case ISD::FMUL: return visitFMUL(N); + case ISD::FMA: return visitFMA(N); case ISD::FDIV: return visitFDIV(N); case ISD::FREM: return visitFREM(N); case ISD::FCOPYSIGN: return visitFCOPYSIGN(N); @@ -1325,10 +1322,12 @@ SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) { // Replacing results may cause a different MERGE_VALUES to suddenly // be CSE'd with N, and carry its uses with it. Iterate until no // uses remain, to ensure that the node can be safely deleted. + // First add the users of this node to the work list so that they + // can be tried again once they have new operands. + AddUsersToWorkList(N); do { for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) - DAG.ReplaceAllUsesOfValueWith(SDValue(N, i), N->getOperand(i), - &DeadNodes); + DAG.ReplaceAllUsesOfValueWith(SDValue(N, i), N->getOperand(i)); } while (!N->use_empty()); removeFromWorkList(N); DAG.DeleteNode(N); @@ -1640,7 +1639,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { if (N1.getOpcode() == ISD::ADD && N0C && N1C1) { SDValue NewC = DAG.getConstant((N0C->getAPIntValue() - N1C1->getAPIntValue()), VT); return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, NewC, - N1.getOperand(0)); + N1.getOperand(0)); } // fold ((A+(B+or-C))-B) -> A+or-C if (N0.getOpcode() == ISD::ADD && @@ -2341,7 +2340,7 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) { // We also handle SCALAR_TO_VECTOR because xor/or/and operations are cheaper // on scalars. 
if ((N0.getOpcode() == ISD::BITCAST || N0.getOpcode() == ISD::SCALAR_TO_VECTOR) - && Level == AfterLegalizeVectorOps) { + && Level == AfterLegalizeTypes) { SDValue In0 = N0.getOperand(0); SDValue In1 = N1.getOperand(0); EVT In0Ty = In0.getValueType(); @@ -2528,7 +2527,14 @@ SDValue DAGCombiner::visitAND(SDNode *N) { Load->getOffset(), Load->getMemoryVT(), Load->getMemOperand()); // Replace uses of the EXTLOAD with the new ZEXTLOAD. - CombineTo(Load, NewLoad.getValue(0), NewLoad.getValue(1)); + if (Load->getNumValues() == 3) { + // PRE/POST_INC loads have 3 values. + SDValue To[] = { NewLoad.getValue(0), NewLoad.getValue(1), + NewLoad.getValue(2) }; + CombineTo(Load, To, 3, true); + } else { + CombineTo(Load, NewLoad.getValue(0), NewLoad.getValue(1)); + } } // Fold the AND away, taking care not to fold to the old load node if we @@ -2710,6 +2716,34 @@ SDValue DAGCombiner::visitAND(SDNode *N) { } } + if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::SRL && + VT.getSizeInBits() <= 64) { + if (ConstantSDNode *ADDI = dyn_cast<ConstantSDNode>(N0.getOperand(1))) { + APInt ADDC = ADDI->getAPIntValue(); + if (!TLI.isLegalAddImmediate(ADDC.getSExtValue())) { + // Look for (and (add x, c1), (lshr y, c2)). If C1 wasn't a legal + // immediate for an add, but it is legal if its top c2 bits are set, + // transform the ADD so the immediate doesn't need to be materialized + // in a register. + if (ConstantSDNode *SRLI = dyn_cast<ConstantSDNode>(N1.getOperand(1))) { + APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(), + SRLI->getZExtValue()); + if (DAG.MaskedValueIsZero(N0.getOperand(1), Mask)) { + ADDC |= Mask; + if (TLI.isLegalAddImmediate(ADDC.getSExtValue())) { + SDValue NewAdd = + DAG.getNode(ISD::ADD, N0.getDebugLoc(), VT, + N0.getOperand(0), DAG.getConstant(ADDC, VT)); + CombineTo(N0.getNode(), NewAdd); + return SDValue(N, 0); // Return N so it doesn't get rechecked! + } + } + } + } + } + } + + return SDValue(); } @@ -4526,8 +4560,10 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { SDValue Op = N0.getOperand(0); if (Op.getValueType().bitsLT(VT)) { Op = DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, Op); + AddToWorkList(Op.getNode()); } else if (Op.getValueType().bitsGT(VT)) { Op = DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, Op); + AddToWorkList(Op.getNode()); } return DAG.getZeroExtendInReg(Op, N->getDebugLoc(), N0.getValueType().getScalarType()); @@ -5012,6 +5048,10 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); EVT PtrType = N0.getOperand(1).getValueType(); + if (PtrType == MVT::Untyped || PtrType.isExtended()) + // It's not possible to generate a constant of extended or untyped type. + return SDValue(); + // For big endian targets, we need to adjust the offset to the pointer to // load the correct bytes. if (TLI.isBigEndian()) { @@ -5041,8 +5081,7 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { // Replace the old load's chain with the new load's chain. WorkListRemover DeadNodes(*this); - DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1), - &DeadNodes); + DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1)); // Shift the result left, if we've swallowed a left shift. SDValue Result = Load; @@ -5225,7 +5264,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { SDValue EltNo = N0->getOperand(1); if (isa<ConstantSDNode>(EltNo) && isTypeLegal(NVT)) { int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); - + EVT IndexTy = N0->getOperand(1).getValueType(); int Index = isLE ? 
(Elt*SizeRatio) : (Elt*SizeRatio + (SizeRatio-1)); SDValue V = DAG.getNode(ISD::BITCAST, N->getDebugLoc(), @@ -5233,7 +5272,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, N->getDebugLoc(), TrTy, V, - DAG.getConstant(Index, MVT::i32)); + DAG.getConstant(Index, IndexTy)); } } @@ -5607,7 +5646,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { if (FoldedVOp.getNode()) return FoldedVOp; } - // fold (fadd c1, c2) -> (fadd c1, c2) + // fold (fadd c1, c2) -> c1 + c2 if (N0CFP && N1CFP && VT != MVT::ppcf128) return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0, N1); // canonicalize constant to RHS @@ -5636,6 +5675,26 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0.getOperand(1), N1)); + // FADD -> FMA combines: + if ((DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast || + DAG.getTarget().Options.UnsafeFPMath) && + DAG.getTarget().getTargetLowering()->isFMAFasterThanMulAndAdd(VT) && + TLI.isOperationLegal(ISD::FMA, VT)) { + + // fold (fadd (fmul x, y), z) -> (fma x, y, z) + if (N0.getOpcode() == ISD::FMUL && N0->hasOneUse()) { + return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT, + N0.getOperand(0), N0.getOperand(1), N1); + } + + // fold (fadd x, (fmul y, z)) -> (fma x, y, z) + // Note: Commutes FADD operands. + if (N1.getOpcode() == ISD::FMUL && N1->hasOneUse()) { + return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT, + N1.getOperand(0), N1.getOperand(1), N0); + } + } + return SDValue(); } @@ -5673,9 +5732,13 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { GetNegatedExpression(N1, DAG, LegalOperations)); // If 'unsafe math' is enabled, fold + // (fsub x, x) -> 0.0 & // (fsub x, (fadd x, y)) -> (fneg y) & // (fsub x, (fadd y, x)) -> (fneg y) if (DAG.getTarget().Options.UnsafeFPMath) { + if (N0 == N1) + return DAG.getConstantFP(0.0f, VT); + if (N1.getOpcode() == ISD::FADD) { SDValue N10 = N1->getOperand(0); SDValue N11 = N1->getOperand(1); @@ -5689,6 +5752,29 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { } } + // FSUB -> FMA combines: + if ((DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast || + DAG.getTarget().Options.UnsafeFPMath) && + DAG.getTarget().getTargetLowering()->isFMAFasterThanMulAndAdd(VT) && + TLI.isOperationLegal(ISD::FMA, VT)) { + + // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) + if (N0.getOpcode() == ISD::FMUL && N0->hasOneUse()) { + return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT, + N0.getOperand(0), N0.getOperand(1), + DAG.getNode(ISD::FNEG, N1->getDebugLoc(), VT, N1)); + } + + // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) + // Note: Commutes FSUB operands. 
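These FADD/FSUB -> FMA folds are gated behind fast or unsafe FP math because a fused multiply-add rounds once where the separate multiply and add round twice, so the two forms can differ in the last bits. A self-contained demo of that difference using std::fma:

#include <cmath>
#include <cstdio>

int main() {
  double x = 1.0 + 0x1p-27, y = 1.0 - 0x1p-27, z = -1.0;
  double separate = x * y + z;         // product rounded, then sum rounded
  double fused    = std::fma(x, y, z); // single rounding of x*y + z
  // x*y is exactly 1 - 2^-54, which rounds to 1.0, so 'separate' is 0,
  // while the fused form keeps the exact residual.
  std::printf("%a\n%a\n", separate, fused); // 0x0p+0 vs -0x1p-54
}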
+ if (N1.getOpcode() == ISD::FMUL && N1->hasOneUse()) {
+ return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT,
+ DAG.getNode(ISD::FNEG, N1->getDebugLoc(), VT,
+ N1.getOperand(0)),
+ N1.getOperand(1), N0);
+ }
+ }
+
 return SDValue();
 }
 
@@ -5720,6 +5806,9 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
 if (DAG.getTarget().Options.UnsafeFPMath &&
 ISD::isBuildVectorAllZeros(N1.getNode()))
 return N1;
+ // fold (fmul A, 1.0) -> A
+ if (N1CFP && N1CFP->isExactlyValue(1.0))
+ return N0;
 // fold (fmul X, 2.0) -> (fadd X, X)
 if (N1CFP && N1CFP->isExactlyValue(+2.0))
 return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0, N0);
@@ -5753,6 +5842,26 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
 return SDValue();
 }
 
+SDValue DAGCombiner::visitFMA(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+ ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+ EVT VT = N->getValueType(0);
+
+ if (N0CFP && N0CFP->isExactlyValue(1.0))
+ return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N1, N2);
+ if (N1CFP && N1CFP->isExactlyValue(1.0))
+ return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0, N2);
+
+ // Canonicalize (fma c, x, y) -> (fma x, c, y)
+ if (N0CFP && !N1CFP)
+ return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT, N1, N0, N2);
+
+ return SDValue();
+}
+
 SDValue DAGCombiner::visitFDIV(SDNode *N) {
 SDValue N0 = N->getOperand(0);
 SDValue N1 = N->getOperand(1);
@@ -5893,6 +6002,38 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
 return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), VT, N0);
 }
 
+ // The next optimizations are desirable only if SELECT_CC can be lowered.
+ // Check against MVT::Other for SELECT_CC, which is a workaround for targets
+ // having to say they don't support SELECT_CC on every type the DAG knows
+ // about, since there is no way to mark an opcode illegal at all value types
+ // (See also visitSELECT)
+ if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, MVT::Other)) {
+ // fold (sint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0, cc)
+ if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 &&
+ !VT.isVector() &&
+ (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) {
+ SDValue Ops[] =
+ { N0.getOperand(0), N0.getOperand(1),
+ DAG.getConstantFP(-1.0, VT), DAG.getConstantFP(0.0, VT),
+ N0.getOperand(2) };
+ return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), VT, Ops, 5);
+ }
+
+ // fold (sint_to_fp (zext (setcc x, y, cc))) ->
+ // (select_cc x, y, 1.0, 0.0, cc)
+ if (N0.getOpcode() == ISD::ZERO_EXTEND &&
+ N0.getOperand(0).getOpcode() == ISD::SETCC && !VT.isVector() &&
+ (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) {
+ SDValue Ops[] =
+ { N0.getOperand(0).getOperand(0), N0.getOperand(0).getOperand(1),
+ DAG.getConstantFP(1.0, VT), DAG.getConstantFP(0.0, VT),
+ N0.getOperand(0).getOperand(2) };
+ return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), VT, Ops, 5);
+ }
+ }
+
 return SDValue();
 }
 
@@ -5918,6 +6059,25 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
 return DAG.getNode(ISD::SINT_TO_FP, N->getDebugLoc(), VT, N0);
 }
 
+ // The next optimizations are desirable only if SELECT_CC can be lowered.
+ // Check against MVT::Other for SELECT_CC, which is a workaround for targets
+ // having to say they don't support SELECT_CC on every type the DAG knows
+ // about, since there is no way to mark an opcode illegal at all value types
+ // (See also visitSELECT)
+ if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, MVT::Other)) {
+ // fold (uint_to_fp (setcc x, y, cc)) -> (select_cc x, y, 1.0, 0.0, cc)
+
+ if (N0.getOpcode() == ISD::SETCC && !VT.isVector() &&
+ (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) {
+ SDValue Ops[] =
+ { N0.getOperand(0), N0.getOperand(1),
+ DAG.getConstantFP(1.0, VT), DAG.getConstantFP(0.0, VT),
+ N0.getOperand(2) };
+ return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), VT, Ops, 5);
+ }
+ }
+
 return SDValue();
 }
 
@@ -6185,7 +6345,7 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) {
 }
 // Replace the uses of SRL with SETCC
 WorkListRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(N1, SetCC, &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(N1, SetCC);
 removeFromWorkList(N1.getNode());
 DAG.DeleteNode(N1.getNode());
 return SDValue(N, 0); // Return N so it doesn't get rechecked!
@@ -6214,7 +6374,7 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) {
 Tmp.getNode()->dump(&DAG);
 dbgs() << '\n');
 WorkListRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(N1, Tmp, &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(N1, Tmp);
 removeFromWorkList(TheXor);
 DAG.DeleteNode(TheXor);
 return DAG.getNode(ISD::BRCOND, N->getDebugLoc(),
@@ -6240,7 +6400,7 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) {
 Equal ? ISD::SETEQ : ISD::SETNE);
 // Replace the uses of XOR with SETCC
 WorkListRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(N1, SetCC, &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(N1, SetCC);
 removeFromWorkList(N1.getNode());
 DAG.DeleteNode(N1.getNode());
 return DAG.getNode(ISD::BRCOND, N->getDebugLoc(),
@@ -6431,21 +6591,17 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
 dbgs() << '\n');
 WorkListRemover DeadNodes(*this);
 if (isLoad) {
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0),
- &DeadNodes);
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2),
- &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
 } else {
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1),
- &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
 }
 
 // Finally, since the node is now dead, remove it from the graph.
 DAG.DeleteNode(N);
 
 // Replace the uses of Ptr with uses of the updated base value.
- DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(isLoad ? 1 : 0),
- &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(isLoad ? 1 : 0));
 removeFromWorkList(Ptr.getNode());
 DAG.DeleteNode(Ptr.getNode());
 
@@ -6559,13 +6715,10 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
 dbgs() << '\n');
 WorkListRemover DeadNodes(*this);
 if (isLoad) {
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0),
- &DeadNodes);
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2),
- &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
 } else {
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1),
- &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
 }
 
 // Finally, since the node is now dead, remove it from the graph.
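Why the chain moves from result 1 to result 2 in the load case above: a plain load yields the pair (value, chain), while a pre/post-indexed load grows a third result for the written-back base pointer. A schematic of the index mapping, with hypothetical enum names:

// A plain load has results (value, chain); an indexed load adds a result
// for the updated base. Index maps matching the replacements above:
enum PlainLoadResult   { PL_Value = 0, PL_Chain = 1 };
enum IndexedLoadResult { IL_Value = 0, IL_NewBase = 1, IL_Chain = 2 };
// Old value 0 -> new value 0; old chain 1 -> new value 2; and every use of
// the separately incremented pointer is redirected to new value 1.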
@@ -6573,8 +6726,7 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
 // Replace the uses of Use with uses of the updated base value.
 DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0),
- Result.getValue(isLoad ? 1 : 0),
- &DeadNodes);
+ Result.getValue(isLoad ? 1 : 0));
 removeFromWorkList(Op);
 DAG.DeleteNode(Op);
 return true;
@@ -6609,7 +6761,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
 Chain.getNode()->dump(&DAG);
 dbgs() << "\n");
 WorkListRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain, &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
 
 if (N->use_empty()) {
 removeFromWorkList(N);
@@ -6629,11 +6781,10 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
 Undef.getNode()->dump(&DAG);
 dbgs() << " and 2 other values\n");
 WorkListRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef, &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef);
 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1),
- DAG.getUNDEF(N->getValueType(1)),
- &DeadNodes);
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain, &DeadNodes);
+ DAG.getUNDEF(N->getValueType(1)));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain);
 removeFromWorkList(N);
 DAG.DeleteNode(N);
 return SDValue(N, 0); // Return N so it doesn't get rechecked!
@@ -6955,8 +7106,7 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
 AddToWorkList(NewLD.getNode());
 AddToWorkList(NewVal.getNode());
 WorkListRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1),
- &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
 ++OpsNarrowed;
 return NewST;
 }
@@ -7013,8 +7163,7 @@ SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
 AddToWorkList(NewLD.getNode());
 AddToWorkList(NewST.getNode());
 WorkListRemover DeadNodes(*this);
- DAG.ReplaceAllUsesOfValueWith(Value.getValue(1), NewLD.getValue(1),
- &DeadNodes);
+ DAG.ReplaceAllUsesOfValueWith(Value.getValue(1), NewLD.getValue(1));
 ++LdStFP2Int;
 return NewST;
 }
@@ -7058,7 +7207,8 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
 SDValue Tmp;
 switch (CFP->getValueType(0).getSimpleVT().SimpleTy) {
 default: llvm_unreachable("Unknown FP type");
- case MVT::f80: // We don't do this for these yet.
+ case MVT::f16: // We don't do this for these yet.
+ case MVT::f80:
 case MVT::f128:
 case MVT::ppcf128:
 break;
@@ -7323,8 +7473,9 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
 OrigElt -= NumElem;
 }
 
+ EVT IndexTy = N->getOperand(1).getValueType();
 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, N->getDebugLoc(), NVT,
- InVec, DAG.getConstant(OrigElt, MVT::i32));
+ InVec, DAG.getConstant(OrigElt, IndexTy));
 }
 
 // Perform only after legalization to ensure build_vector / vector_shuffle
@@ -7472,7 +7623,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
 WorkListRemover DeadNodes(*this);
 SDValue From[] = { SDValue(N, 0), SDValue(LN0,1) };
 SDValue To[] = { Load, Chain };
- DAG.ReplaceAllUsesOfValuesWith(From, To, 2, &DeadNodes);
+ DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
 // Since we're explicitly calling ReplaceAllUses, add the new node to the
 // worklist explicitly as well.
 AddToWorkList(Load.getNode());
@@ -7489,6 +7640,11 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
 unsigned NumInScalars = N->getNumOperands();
 DebugLoc dl = N->getDebugLoc();
 EVT VT = N->getValueType(0);
+
+ // A vector built entirely of undefs is undef.
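The check that follows relies on the new ISD::allOperandsUndef helper, which replaces the hand-rolled AllUndef flag removed later in this hunk. A toy analog of what such a scan does, with stand-in types rather than LLVM's:

#include <vector>

struct Val { bool undef; };

// One shared scan instead of each combine tracking its own AllUndef flag.
// Whether zero operands should count as "all undef" is a judgment call;
// here an empty node is conservatively not undef.
static bool allOperandsUndef(const std::vector<Val> &ops) {
  if (ops.empty())
    return false;
  for (const Val &v : ops)
    if (!v.undef)
      return false;
  return true;  // e.g. a build_vector of all undefs folds to undef
}

int main() { return allOperandsUndef({{true}, {true}}) ? 0 : 1; }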
+ if (ISD::allOperandsUndef(N)) + return DAG.getUNDEF(VT); + // Check to see if this is a BUILD_VECTOR of a bunch of values // which come from any_extend or zero_extend nodes. If so, we can create // a new BUILD_VECTOR using bit-casts which may enable other BUILD_VECTOR @@ -7496,12 +7652,11 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { // using shuffles. EVT SourceType = MVT::Other; bool AllAnyExt = true; - bool AllUndef = true; + for (unsigned i = 0; i != NumInScalars; ++i) { SDValue In = N->getOperand(i); // Ignore undef inputs. if (In.getOpcode() == ISD::UNDEF) continue; - AllUndef = false; bool AnyExt = In.getOpcode() == ISD::ANY_EXTEND; bool ZeroExt = In.getOpcode() == ISD::ZERO_EXTEND; @@ -7529,9 +7684,6 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { AllAnyExt &= AnyExt; } - if (AllUndef) - return DAG.getUNDEF(VT); - // In order to have valid types, all of the inputs must be extended from the // same source type and all of the inputs must be any or zero extend. // Scalar sizes must be a power of two. @@ -7707,6 +7859,10 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { if (N->getNumOperands() == 1) return N->getOperand(0); + // Check if all of the operands are undefs. + if (ISD::allOperandsUndef(N)) + return DAG.getUNDEF(N->getValueType(0)); + return SDValue(); } diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp index 0c1ac69..e5ea6e6 100644 --- a/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -40,6 +40,7 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "isel" +#include "llvm/DebugInfo.h" #include "llvm/Function.h" #include "llvm/GlobalVariable.h" #include "llvm/Instructions.h" @@ -51,7 +52,6 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/Analysis/Loads.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetInstrInfo.h" @@ -484,7 +484,7 @@ bool FastISel::SelectGetElementPtr(const User *I) { if (const ConstantInt *CI = dyn_cast<ConstantInt>(Idx)) { if (CI->isZero()) continue; // N = N + Offset - TotalOffs += + TotalOffs += TD.getTypeAllocSize(Ty)*cast<ConstantInt>(CI)->getSExtValue(); if (TotalOffs >= MaxOffs) { N = FastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT); @@ -573,7 +573,10 @@ bool FastISel::SelectCall(const User *I) { // At -O0 we don't care about the lifetime intrinsics. case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: + // The donothing intrinsic does, well, nothing. + case Intrinsic::donothing: return true; + case Intrinsic::dbg_declare: { const DbgDeclareInst *DI = cast<DbgDeclareInst>(Call); if (!DIVariable(DI->getVariable()).Verify() || @@ -642,7 +645,7 @@ bool FastISel::SelectCall(const User *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) .addCImm(CI).addImm(DI->getOffset()) .addMetadata(DI->getVariable()); - else + else BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) .addImm(CI->getZExtValue()).addImm(DI->getOffset()) .addMetadata(DI->getVariable()); @@ -792,7 +795,7 @@ FastISel::SelectInstruction(const Instruction *I) { DL = DebugLoc(); return true; } - // Remove dead code. However, ignore call instructions since we've flushed + // Remove dead code. However, ignore call instructions since we've flushed // the local value map and recomputed the insert point. 
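Returning to the SelectGetElementPtr hunk above: constant GEP indices are folded into one running byte offset, which is flushed into a single add whenever it crosses MaxOffs. A scalar sketch of that accumulation, with made-up element sizes, indices, and threshold:

#include <cstdint>
#include <cstdio>

int main() {
  const int64_t MaxOffs = 2048;           // hypothetical flush threshold
  int64_t ElemSize[] = {16, 4, 1};        // alloc size at each indexed level
  int64_t Index[]    = {3, 7, 9};         // constant GEP indices
  int64_t Addr = 0x1000, TotalOffs = 0;
  for (int i = 0; i < 3; ++i) {
    TotalOffs += ElemSize[i] * Index[i];  // N = N + Offset, folded
    if (TotalOffs >= MaxOffs) {           // emit one add, reset accumulator
      Addr += TotalOffs;
      TotalOffs = 0;
    }
  }
  if (TotalOffs) Addr += TotalOffs;       // final flush
  std::printf("%#llx\n", (unsigned long long)Addr); // 0x1055
}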
if (!isa<CallInst>(I)) { recomputeInsertPt(); @@ -1306,6 +1309,30 @@ unsigned FastISel::FastEmitInst_rri(unsigned MachineInstOpcode, return ResultReg; } +unsigned FastISel::FastEmitInst_rrii(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill, + uint64_t Imm1, uint64_t Imm2) { + unsigned ResultReg = createResultReg(RC); + const MCInstrDesc &II = TII.get(MachineInstOpcode); + + if (II.getNumDefs() >= 1) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addReg(Op1, Op1IsKill * RegState::Kill) + .addImm(Imm1).addImm(Imm2); + else { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addReg(Op1, Op1IsKill * RegState::Kill) + .addImm(Imm1).addImm(Imm2); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + ResultReg).addReg(II.ImplicitDefs[0]); + } + return ResultReg; +} + unsigned FastISel::FastEmitInst_i(unsigned MachineInstOpcode, const TargetRegisterClass *RC, uint64_t Imm) { @@ -1345,6 +1372,8 @@ unsigned FastISel::FastEmitInst_extractsubreg(MVT RetVT, unsigned ResultReg = createResultReg(TLI.getRegClassFor(RetVT)); assert(TargetRegisterInfo::isVirtualRegister(Op0) && "Cannot yet extract from physregs"); + const TargetRegisterClass *RC = MRI.getRegClass(Op0); + MRI.constrainRegClass(Op0, TRI.getSubClassWithSubReg(RC, Idx)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), ResultReg) .addReg(Op0, getKillRegState(Op0IsKill), Idx); diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index 8dde919..3e18ea7 100644 --- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -15,13 +15,13 @@ #define DEBUG_TYPE "function-lowering-info" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/DebugInfo.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/LLVMContext.h" #include "llvm/Module.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFrameInfo.h" diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 1467d88..936c126 100644 --- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -48,16 +48,31 @@ unsigned InstrEmitter::CountResults(SDNode *Node) { return N; } -/// CountOperands - The inputs to target nodes have any actual inputs first, +/// countOperands - The inputs to target nodes have any actual inputs first, /// followed by an optional chain operand, then an optional glue operand. /// Compute the number of actual operands that will go into the resulting /// MachineInstr. -unsigned InstrEmitter::CountOperands(SDNode *Node) { +/// +/// Also count physreg RegisterSDNode and RegisterMaskSDNode operands preceding +/// the chain and glue. These operands may be implicit on the machine instr. +static unsigned countOperands(SDNode *Node, unsigned &NumImpUses) { unsigned N = Node->getNumOperands(); while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue) --N; if (N && Node->getOperand(N - 1).getValueType() == MVT::Other) --N; // Ignore chain if it exists. + + // Count RegisterSDNode and RegisterMaskSDNode operands for NumImpUses. 
+ for (unsigned I = N; I; --I) { + if (isa<RegisterMaskSDNode>(Node->getOperand(I - 1))) + continue; + if (RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Node->getOperand(I - 1))) + if (TargetRegisterInfo::isPhysicalRegister(RN->getReg())) + continue; + NumImpUses = N - I; + break; + } + return N; } @@ -114,8 +129,10 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned, if (User->isMachineOpcode()) { const MCInstrDesc &II = TII->get(User->getMachineOpcode()); const TargetRegisterClass *RC = 0; - if (i+II.getNumDefs() < II.getNumOperands()) - RC = TII->getRegClass(II, i+II.getNumDefs(), TRI); + if (i+II.getNumDefs() < II.getNumOperands()) { + RC = TRI->getAllocatableClass( + TII->getRegClass(II, i+II.getNumDefs(), TRI, *MF)); + } if (!UseRC) UseRC = RC; else if (RC) { @@ -196,7 +213,8 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node, MachineInstr *MI, // is a vreg in the same register class, use the CopyToReg'd destination // register instead of creating a new vreg. unsigned VRBase = 0; - const TargetRegisterClass *RC = TII->getRegClass(II, i, TRI); + const TargetRegisterClass *RC = + TRI->getAllocatableClass(TII->getRegClass(II, i, TRI, *MF)); if (II.OpInfo[i].isOptionalDef()) { // Optional def must be a physical register. unsigned NumResults = CountResults(Node); @@ -293,7 +311,7 @@ InstrEmitter::AddRegisterOperand(MachineInstr *MI, SDValue Op, if (II) { const TargetRegisterClass *DstRC = 0; if (IIOpNum < II->getNumOperands()) - DstRC = TII->getRegClass(*II, IIOpNum, TRI); + DstRC = TRI->getAllocatableClass(TII->getRegClass(*II,IIOpNum,TRI,*MF)); assert((DstRC || (MI->isVariadic() && IIOpNum >= MCID.getNumOperands())) && "Don't have operand info for this instruction!"); if (DstRC && !MRI->constrainRegClass(VReg, DstRC, MinRCSize)) { @@ -334,8 +352,7 @@ InstrEmitter::AddRegisterOperand(MachineInstr *MI, SDValue Op, /// AddOperand - Add the specified operand to the specified machine instr. II /// specifies the instruction information for the node, and IIOpNum is the -/// operand number (in the II) that we are adding. IIOpNum and II are used for -/// assertions only. +/// operand number (in the II) that we are adding. void InstrEmitter::AddOperand(MachineInstr *MI, SDValue Op, unsigned IIOpNum, const MCInstrDesc *II, @@ -350,7 +367,11 @@ void InstrEmitter::AddOperand(MachineInstr *MI, SDValue Op, const ConstantFP *CFP = F->getConstantFPValue(); MI->addOperand(MachineOperand::CreateFPImm(CFP)); } else if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(Op)) { - MI->addOperand(MachineOperand::CreateReg(R->getReg(), false)); + // Turn additional physreg operands into implicit uses on non-variadic + // instructions. This is used by call and return instructions passing + // arguments in registers. 
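Before the Imp computation below, a compact toy of the two cooperating pieces in this file: counting the trailing physical-register and regmask operands (countOperands) and then marking any operand beyond the instruction description's expected count as implicit (AddOperand). The types are stand-ins, not the real SDNode operands or MCInstrDesc:

#include <cstddef>
#include <vector>

struct Operand { bool isPhysReg; bool isRegMask; bool implicit = false; };

// How many operands at the tail are physregs/regmasks (NumImpUses).
static size_t countTrailingImpUses(const std::vector<Operand> &ops) {
  size_t i = ops.size();
  while (i && (ops[i - 1].isPhysReg || ops[i - 1].isRegMask))
    --i;
  return ops.size() - i;
}

// Extra physreg operands on a non-variadic instruction (e.g. argument
// registers on a call) become implicit uses on the machine instr.
static void markImplicit(std::vector<Operand> &ops, size_t numDescOperands,
                         bool isVariadic) {
  for (size_t i = 0; i < ops.size(); ++i)
    if (i >= numDescOperands && !isVariadic && ops[i].isPhysReg)
      ops[i].implicit = true;
}

int main() {
  std::vector<Operand> ops = {{false, false}, {true, false}, {true, false}};
  size_t imp = countTrailingImpUses(ops);          // 2 trailing physregs
  markImplicit(ops, /*numDescOperands=*/1, /*isVariadic=*/false);
  return (int)imp - 2;                             // exits 0
}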
+ bool Imp = II && (IIOpNum >= II->getNumOperands() && !II->isVariadic()); + MI->addOperand(MachineOperand::CreateReg(R->getReg(), false, Imp)); } else if (RegisterMaskSDNode *RM = dyn_cast<RegisterMaskSDNode>(Op)) { MI->addOperand(MachineOperand::CreateRegMask(RM->getRegMask())); } else if (GlobalAddressSDNode *TGA = dyn_cast<GlobalAddressSDNode>(Op)) { @@ -458,7 +479,8 @@ void InstrEmitter::EmitSubregNode(SDNode *Node, unsigned SrcReg, DstReg, DefSubIdx; if (DefMI && TII->isCoalescableExtInstr(*DefMI, SrcReg, DstReg, DefSubIdx) && - SubIdx == DefSubIdx) { + SubIdx == DefSubIdx && + TRC == MRI->getRegClass(SrcReg)) { // Optimize these: // r1025 = s/zext r1024, 4 // r1026 = extract_subreg r1025, 4 @@ -467,6 +489,7 @@ void InstrEmitter::EmitSubregNode(SDNode *Node, VRBase = MRI->createVirtualRegister(TRC); BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY), VRBase).addReg(SrcReg); + MRI->clearKillFlags(SrcReg); } else { // VReg may not support a SubIdx sub-register, and we may need to // constrain its register class or issue a COPY to a compatible register @@ -548,7 +571,8 @@ InstrEmitter::EmitCopyToRegClassNode(SDNode *Node, // Create the new VReg in the destination class and emit a copy. unsigned DstRCIdx = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue(); - const TargetRegisterClass *DstRC = TRI->getRegClass(DstRCIdx); + const TargetRegisterClass *DstRC = + TRI->getAllocatableClass(TRI->getRegClass(DstRCIdx)); unsigned NewVReg = MRI->createVirtualRegister(DstRC); BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY), NewVReg).addReg(VReg); @@ -566,7 +590,7 @@ void InstrEmitter::EmitRegSequence(SDNode *Node, bool IsClone, bool IsCloned) { unsigned DstRCIdx = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue(); const TargetRegisterClass *RC = TRI->getRegClass(DstRCIdx); - unsigned NewVReg = MRI->createVirtualRegister(RC); + unsigned NewVReg = MRI->createVirtualRegister(TRI->getAllocatableClass(RC)); MachineInstr *MI = BuildMI(*MF, Node->getDebugLoc(), TII->get(TargetOpcode::REG_SEQUENCE), NewVReg); unsigned NumOps = Node->getNumOperands(); @@ -691,7 +715,8 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned, const MCInstrDesc &II = TII->get(Opc); unsigned NumResults = CountResults(Node); - unsigned NodeOperands = CountOperands(Node); + unsigned NumImpUses = 0; + unsigned NodeOperands = countOperands(Node, NumImpUses); bool HasPhysRegOuts = NumResults > II.getNumDefs() && II.getImplicitDefs()!=0; #ifndef NDEBUG unsigned NumMIOperands = NodeOperands + NumResults; @@ -700,7 +725,8 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned, "Too few operands for a variadic node!"); else assert(NumMIOperands >= II.getNumOperands() && - NumMIOperands <= II.getNumOperands()+II.getNumImplicitDefs() && + NumMIOperands <= II.getNumOperands() + II.getNumImplicitDefs() + + NumImpUses && "#operands for dag node doesn't match .td file!"); #endif diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.h b/lib/CodeGen/SelectionDAG/InstrEmitter.h index c081f38..9eddee9 100644 --- a/lib/CodeGen/SelectionDAG/InstrEmitter.h +++ b/lib/CodeGen/SelectionDAG/InstrEmitter.h @@ -105,12 +105,6 @@ public: /// (which do not go into the machine instrs.) static unsigned CountResults(SDNode *Node); - /// CountOperands - The inputs to target nodes have any actual inputs first, - /// followed by an optional chain operand, then flag operands. Compute - /// the number of actual operands that will go into the resulting - /// MachineInstr. 
- static unsigned CountOperands(SDNode *Node); - /// EmitDbgValue - Generate machine instruction for a dbg_value node. /// MachineInstr *EmitDbgValue(SDDbgValue *SD, diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index a96a997..b0776af 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -11,7 +11,11 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Analysis/DebugInfo.h" +#include "llvm/CallingConv.h" +#include "llvm/Constants.h" +#include "llvm/DebugInfo.h" +#include "llvm/DerivedTypes.h" +#include "llvm/LLVMContext.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" @@ -20,10 +24,6 @@ #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/CallingConv.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/LLVMContext.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -70,6 +70,9 @@ private: SDValue OptimizeFloatStore(StoreSDNode *ST); + void LegalizeLoadOps(SDNode *Node); + void LegalizeStoreOps(SDNode *Node); + /// PerformInsertVectorEltInMemory - Some target cannot handle a variable /// insertion index for the INSERT_VECTOR_ELT instruction. In this case, it /// is necessary to spill the vector being inserted into to memory, perform @@ -150,21 +153,21 @@ public: // Node replacement helpers void ReplacedNode(SDNode *N) { if (N->use_empty()) { - DAG.RemoveDeadNode(N, this); + DAG.RemoveDeadNode(N); } else { ForgetNode(N); } } void ReplaceNode(SDNode *Old, SDNode *New) { - DAG.ReplaceAllUsesWith(Old, New, this); + DAG.ReplaceAllUsesWith(Old, New); ReplacedNode(Old); } void ReplaceNode(SDValue Old, SDValue New) { - DAG.ReplaceAllUsesWith(Old, New, this); + DAG.ReplaceAllUsesWith(Old, New); ReplacedNode(Old.getNode()); } void ReplaceNode(SDNode *Old, const SDValue *New) { - DAG.ReplaceAllUsesWith(Old, New, this); + DAG.ReplaceAllUsesWith(Old, New); ReplacedNode(Old); } }; @@ -203,7 +206,8 @@ SelectionDAGLegalize::ShuffleWithNarrowerEltType(EVT NVT, EVT VT, DebugLoc dl, } SelectionDAGLegalize::SelectionDAGLegalize(SelectionDAG &dag) - : TM(dag.getTarget()), TLI(dag.getTargetLoweringInfo()), + : SelectionDAG::DAGUpdateListener(dag), + TM(dag.getTarget()), TLI(dag.getTargetLoweringInfo()), DAG(dag) { } @@ -638,9 +642,8 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) { // probably means that we need to integrate dag combiner and legalizer // together. // We generally can't do this one for long doubles. - SDValue Tmp1 = ST->getChain(); - SDValue Tmp2 = ST->getBasePtr(); - SDValue Tmp3; + SDValue Chain = ST->getChain(); + SDValue Ptr = ST->getBasePtr(); unsigned Alignment = ST->getAlignment(); bool isVolatile = ST->isVolatile(); bool isNonTemporal = ST->isNonTemporal(); @@ -648,19 +651,19 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) { if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(ST->getValue())) { if (CFP->getValueType(0) == MVT::f32 && TLI.isTypeLegal(MVT::i32)) { - Tmp3 = DAG.getConstant(CFP->getValueAPF(). + SDValue Con = DAG.getConstant(CFP->getValueAPF(). 
bitcastToAPInt().zextOrTrunc(32), MVT::i32); - return DAG.getStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), + return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(), isVolatile, isNonTemporal, Alignment); } if (CFP->getValueType(0) == MVT::f64) { // If this target supports 64-bit registers, do a single 64-bit store. if (TLI.isTypeLegal(MVT::i64)) { - Tmp3 = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt(). + SDValue Con = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt(). zextOrTrunc(64), MVT::i64); - return DAG.getStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), + return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(), isVolatile, isNonTemporal, Alignment); } @@ -673,11 +676,11 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) { SDValue Hi = DAG.getConstant(IntVal.lshr(32).trunc(32), MVT::i32); if (TLI.isBigEndian()) std::swap(Lo, Hi); - Lo = DAG.getStore(Tmp1, dl, Lo, Tmp2, ST->getPointerInfo(), isVolatile, + Lo = DAG.getStore(Chain, dl, Lo, Ptr, ST->getPointerInfo(), isVolatile, isNonTemporal, Alignment); - Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2, + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getIntPtrConstant(4)); - Hi = DAG.getStore(Tmp1, dl, Hi, Tmp2, + Hi = DAG.getStore(Chain, dl, Hi, Ptr, ST->getPointerInfo().getWithOffset(4), isVolatile, isNonTemporal, MinAlign(Alignment, 4U)); @@ -688,14 +691,448 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) { return SDValue(0, 0); } +void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { + StoreSDNode *ST = cast<StoreSDNode>(Node); + SDValue Chain = ST->getChain(); + SDValue Ptr = ST->getBasePtr(); + DebugLoc dl = Node->getDebugLoc(); + + unsigned Alignment = ST->getAlignment(); + bool isVolatile = ST->isVolatile(); + bool isNonTemporal = ST->isNonTemporal(); + + if (!ST->isTruncatingStore()) { + if (SDNode *OptStore = OptimizeFloatStore(ST).getNode()) { + ReplaceNode(ST, OptStore); + return; + } + + { + SDValue Value = ST->getValue(); + EVT VT = Value.getValueType(); + switch (TLI.getOperationAction(ISD::STORE, VT)) { + default: llvm_unreachable("This action is not supported yet!"); + case TargetLowering::Legal: + // If this is an unaligned store and the target doesn't support it, + // expand it. + if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT())) { + Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext()); + unsigned ABIAlignment= TLI.getTargetData()->getABITypeAlignment(Ty); + if (ST->getAlignment() < ABIAlignment) + ExpandUnalignedStore(cast<StoreSDNode>(Node), + DAG, TLI, this); + } + break; + case TargetLowering::Custom: { + SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG); + if (Res.getNode()) + ReplaceNode(SDValue(Node, 0), Res); + return; + } + case TargetLowering::Promote: { + assert(VT.isVector() && "Unknown legal promote case!"); + Value = DAG.getNode(ISD::BITCAST, dl, + TLI.getTypeToPromoteTo(ISD::STORE, VT), Value); + SDValue Result = + DAG.getStore(Chain, dl, Value, Ptr, + ST->getPointerInfo(), isVolatile, + isNonTemporal, Alignment); + ReplaceNode(SDValue(Node, 0), Result); + break; + } + } + return; + } + } else { + SDValue Value = ST->getValue(); + + EVT StVT = ST->getMemoryVT(); + unsigned StWidth = StVT.getSizeInBits(); + + if (StWidth != StVT.getStoreSizeInBits()) { + // Promote to a byte-sized store with upper bits zero if not + // storing an integral number of bytes. 
For example, promote + // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1) + EVT NVT = EVT::getIntegerVT(*DAG.getContext(), + StVT.getStoreSizeInBits()); + Value = DAG.getZeroExtendInReg(Value, dl, StVT); + SDValue Result = + DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), + NVT, isVolatile, isNonTemporal, Alignment); + ReplaceNode(SDValue(Node, 0), Result); + } else if (StWidth & (StWidth - 1)) { + // If not storing a power-of-2 number of bits, expand as two stores. + assert(!StVT.isVector() && "Unsupported truncstore!"); + unsigned RoundWidth = 1 << Log2_32(StWidth); + assert(RoundWidth < StWidth); + unsigned ExtraWidth = StWidth - RoundWidth; + assert(ExtraWidth < RoundWidth); + assert(!(RoundWidth % 8) && !(ExtraWidth % 8) && + "Store size not an integral number of bytes!"); + EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth); + EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth); + SDValue Lo, Hi; + unsigned IncrementSize; + + if (TLI.isLittleEndian()) { + // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 X, TRUNCSTORE@+2:i8 (srl X, 16) + // Store the bottom RoundWidth bits. + Lo = DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), + RoundVT, + isVolatile, isNonTemporal, Alignment); + + // Store the remaining ExtraWidth bits. + IncrementSize = RoundWidth / 8; + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, + DAG.getIntPtrConstant(IncrementSize)); + Hi = DAG.getNode(ISD::SRL, dl, Value.getValueType(), Value, + DAG.getConstant(RoundWidth, + TLI.getShiftAmountTy(Value.getValueType()))); + Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr, + ST->getPointerInfo().getWithOffset(IncrementSize), + ExtraVT, isVolatile, isNonTemporal, + MinAlign(Alignment, IncrementSize)); + } else { + // Big endian - avoid unaligned stores. + // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 (srl X, 8), TRUNCSTORE@+2:i8 X + // Store the top RoundWidth bits. + Hi = DAG.getNode(ISD::SRL, dl, Value.getValueType(), Value, + DAG.getConstant(ExtraWidth, + TLI.getShiftAmountTy(Value.getValueType()))); + Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr, ST->getPointerInfo(), + RoundVT, isVolatile, isNonTemporal, Alignment); + + // Store the remaining ExtraWidth bits. + IncrementSize = RoundWidth / 8; + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, + DAG.getIntPtrConstant(IncrementSize)); + Lo = DAG.getTruncStore(Chain, dl, Value, Ptr, + ST->getPointerInfo().getWithOffset(IncrementSize), + ExtraVT, isVolatile, isNonTemporal, + MinAlign(Alignment, IncrementSize)); + } + + // The order of the stores doesn't matter. + SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); + ReplaceNode(SDValue(Node, 0), Result); + } else { + switch (TLI.getTruncStoreAction(ST->getValue().getValueType(), StVT)) { + default: llvm_unreachable("This action is not supported yet!"); + case TargetLowering::Legal: + // If this is an unaligned store and the target doesn't support it, + // expand it. 
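The guard that follows recurs throughout the store and load paths: expansion is needed only when the target rejects unaligned memory accesses and the access's known alignment is below the type's ABI alignment. As a standalone predicate, where the inputs stand in for the TLI and TargetData queries:

#include <cstdio>

static bool needsExpansion(bool targetAllowsUnaligned,
                           unsigned accessAlign, unsigned abiAlign) {
  if (targetAllowsUnaligned)
    return false;                 // hardware copes; leave the access alone
  return accessAlign < abiAlign;  // else split into smaller aligned ops
}

int main() {
  // e.g. an i32 with 4-byte ABI alignment, known only 2-byte aligned,
  // on a strict-alignment target:
  std::printf("%d\n", needsExpansion(false, 2, 4)); // 1: expand
  std::printf("%d\n", needsExpansion(true, 2, 4));  // 0: legal as-is
}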
+ if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT())) { + Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext()); + unsigned ABIAlignment= TLI.getTargetData()->getABITypeAlignment(Ty); + if (ST->getAlignment() < ABIAlignment) + ExpandUnalignedStore(cast<StoreSDNode>(Node), DAG, TLI, this); + } + break; + case TargetLowering::Custom: { + SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG); + if (Res.getNode()) + ReplaceNode(SDValue(Node, 0), Res); + return; + } + case TargetLowering::Expand: + assert(!StVT.isVector() && + "Vector Stores are handled in LegalizeVectorOps"); + + // TRUNCSTORE:i16 i32 -> STORE i16 + assert(TLI.isTypeLegal(StVT) && + "Do not know how to expand this store!"); + Value = DAG.getNode(ISD::TRUNCATE, dl, StVT, Value); + SDValue Result = + DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), + isVolatile, isNonTemporal, Alignment); + ReplaceNode(SDValue(Node, 0), Result); + break; + } + } + } +} + +void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { + LoadSDNode *LD = cast<LoadSDNode>(Node); + SDValue Chain = LD->getChain(); // The chain. + SDValue Ptr = LD->getBasePtr(); // The base pointer. + SDValue Value; // The value returned by the load op. + DebugLoc dl = Node->getDebugLoc(); + + ISD::LoadExtType ExtType = LD->getExtensionType(); + if (ExtType == ISD::NON_EXTLOAD) { + EVT VT = Node->getValueType(0); + SDValue RVal = SDValue(Node, 0); + SDValue RChain = SDValue(Node, 1); + + switch (TLI.getOperationAction(Node->getOpcode(), VT)) { + default: llvm_unreachable("This action is not supported yet!"); + case TargetLowering::Legal: + // If this is an unaligned load and the target doesn't support it, + // expand it. + if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT())) { + Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext()); + unsigned ABIAlignment = + TLI.getTargetData()->getABITypeAlignment(Ty); + if (LD->getAlignment() < ABIAlignment){ + ExpandUnalignedLoad(cast<LoadSDNode>(Node), + DAG, TLI, RVal, RChain); + } + } + break; + case TargetLowering::Custom: { + SDValue Res = TLI.LowerOperation(RVal, DAG); + if (Res.getNode()) { + RVal = Res; + RChain = Res.getValue(1); + } + break; + } + case TargetLowering::Promote: { + // Only promote a load of vector type to another. + assert(VT.isVector() && "Cannot promote this load!"); + // Change base type to a different vector type. + EVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), VT); + + SDValue Res = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo(), + LD->isVolatile(), LD->isNonTemporal(), + LD->isInvariant(), LD->getAlignment()); + RVal = DAG.getNode(ISD::BITCAST, dl, VT, Res); + RChain = Res.getValue(1); + break; + } + } + if (RChain.getNode() != Node) { + assert(RVal.getNode() != Node && "Load must be completely replaced"); + DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), RVal); + DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), RChain); + ReplacedNode(Node); + } + return; + } + + EVT SrcVT = LD->getMemoryVT(); + unsigned SrcWidth = SrcVT.getSizeInBits(); + unsigned Alignment = LD->getAlignment(); + bool isVolatile = LD->isVolatile(); + bool isNonTemporal = LD->isNonTemporal(); + + if (SrcWidth != SrcVT.getStoreSizeInBits() && + // Some targets pretend to have an i1 loading operation, and actually + // load an i8. This trick is correct for ZEXTLOAD because the top 7 + // bits are guaranteed to be zero; it helps the optimizers understand + // that these bits are zero. It is also useful for EXTLOAD, since it + // tells the optimizers that those bits are undefined. 
It would be + // nice to have an effective generic way of getting these benefits... + // Until such a way is found, don't insist on promoting i1 here. + (SrcVT != MVT::i1 || + TLI.getLoadExtAction(ExtType, MVT::i1) == TargetLowering::Promote)) { + // Promote to a byte-sized load if not loading an integral number of + // bytes. For example, promote EXTLOAD:i20 -> EXTLOAD:i24. + unsigned NewWidth = SrcVT.getStoreSizeInBits(); + EVT NVT = EVT::getIntegerVT(*DAG.getContext(), NewWidth); + SDValue Ch; + + // The extra bits are guaranteed to be zero, since we stored them that + // way. A zext load from NVT thus automatically gives zext from SrcVT. + + ISD::LoadExtType NewExtType = + ExtType == ISD::ZEXTLOAD ? ISD::ZEXTLOAD : ISD::EXTLOAD; + + SDValue Result = + DAG.getExtLoad(NewExtType, dl, Node->getValueType(0), + Chain, Ptr, LD->getPointerInfo(), + NVT, isVolatile, isNonTemporal, Alignment); + + Ch = Result.getValue(1); // The chain. + + if (ExtType == ISD::SEXTLOAD) + // Having the top bits zero doesn't help when sign extending. + Result = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, + Result.getValueType(), + Result, DAG.getValueType(SrcVT)); + else if (ExtType == ISD::ZEXTLOAD || NVT == Result.getValueType()) + // All the top bits are guaranteed to be zero - inform the optimizers. + Result = DAG.getNode(ISD::AssertZext, dl, + Result.getValueType(), Result, + DAG.getValueType(SrcVT)); + + Value = Result; + Chain = Ch; + } else if (SrcWidth & (SrcWidth - 1)) { + // If not loading a power-of-2 number of bits, expand as two loads. + assert(!SrcVT.isVector() && "Unsupported extload!"); + unsigned RoundWidth = 1 << Log2_32(SrcWidth); + assert(RoundWidth < SrcWidth); + unsigned ExtraWidth = SrcWidth - RoundWidth; + assert(ExtraWidth < RoundWidth); + assert(!(RoundWidth % 8) && !(ExtraWidth % 8) && + "Load size not an integral number of bytes!"); + EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth); + EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth); + SDValue Lo, Hi, Ch; + unsigned IncrementSize; + + if (TLI.isLittleEndian()) { + // EXTLOAD:i24 -> ZEXTLOAD:i16 | (shl EXTLOAD@+2:i8, 16) + // Load the bottom RoundWidth bits. + Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0), + Chain, Ptr, + LD->getPointerInfo(), RoundVT, isVolatile, + isNonTemporal, Alignment); + + // Load the remaining ExtraWidth bits. + IncrementSize = RoundWidth / 8; + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, + DAG.getIntPtrConstant(IncrementSize)); + Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr, + LD->getPointerInfo().getWithOffset(IncrementSize), + ExtraVT, isVolatile, isNonTemporal, + MinAlign(Alignment, IncrementSize)); + + // Build a factor node to remember that this load is independent of + // the other one. + Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), + Hi.getValue(1)); + + // Move the top bits to the right place. + Hi = DAG.getNode(ISD::SHL, dl, Hi.getValueType(), Hi, + DAG.getConstant(RoundWidth, + TLI.getShiftAmountTy(Hi.getValueType()))); + + // Join the hi and lo parts. + Value = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi); + } else { + // Big endian - avoid unaligned loads. + // EXTLOAD:i24 -> (shl EXTLOAD:i16, 8) | ZEXTLOAD@+2:i8 + // Load the top RoundWidth bits. + Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr, + LD->getPointerInfo(), RoundVT, isVolatile, + isNonTemporal, Alignment); + + // Load the remaining ExtraWidth bits. 
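The little-endian branch above is the classic i24 split: a zero-extending i16 load of the low bits plus an i8 extload at offset 2, shifted into place and OR'd together. A host-side demo of the same arithmetic (assumes a little-endian host, mirroring the isLittleEndian() path):

#include <cstdint>
#include <cstdio>
#include <cstring>

// EXTLOAD:i24 -> ZEXTLOAD:i16 | (shl EXTLOAD@+2:i8, 16)
static uint32_t loadI24LE(const uint8_t *p) {
  uint16_t lo; uint8_t hi;
  std::memcpy(&lo, p, 2);       // RoundWidth = 16: bottom bits, zext
  std::memcpy(&hi, p + 2, 1);   // ExtraWidth = 8, at offset RoundWidth/8
  return (uint32_t)lo | ((uint32_t)hi << 16); // join the hi and lo parts
}

int main() {
  const uint8_t bytes[3] = {0x56, 0x34, 0x12}; // i24 value 0x123456, LE
  std::printf("%#x\n", loadI24LE(bytes));      // prints 0x123456
}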
+ IncrementSize = RoundWidth / 8; + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, + DAG.getIntPtrConstant(IncrementSize)); + Lo = DAG.getExtLoad(ISD::ZEXTLOAD, + dl, Node->getValueType(0), Chain, Ptr, + LD->getPointerInfo().getWithOffset(IncrementSize), + ExtraVT, isVolatile, isNonTemporal, + MinAlign(Alignment, IncrementSize)); + + // Build a factor node to remember that this load is independent of + // the other one. + Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), + Hi.getValue(1)); + + // Move the top bits to the right place. + Hi = DAG.getNode(ISD::SHL, dl, Hi.getValueType(), Hi, + DAG.getConstant(ExtraWidth, + TLI.getShiftAmountTy(Hi.getValueType()))); + + // Join the hi and lo parts. + Value = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi); + } + + Chain = Ch; + } else { + bool isCustom = false; + switch (TLI.getLoadExtAction(ExtType, SrcVT)) { + default: llvm_unreachable("This action is not supported yet!"); + case TargetLowering::Custom: + isCustom = true; + // FALLTHROUGH + case TargetLowering::Legal: { + Value = SDValue(Node, 0); + Chain = SDValue(Node, 1); + + if (isCustom) { + SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG); + if (Res.getNode()) { + Value = Res; + Chain = Res.getValue(1); + } + } else { + // If this is an unaligned load and the target doesn't support it, + // expand it. + if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT())) { + Type *Ty = + LD->getMemoryVT().getTypeForEVT(*DAG.getContext()); + unsigned ABIAlignment = + TLI.getTargetData()->getABITypeAlignment(Ty); + if (LD->getAlignment() < ABIAlignment){ + ExpandUnalignedLoad(cast<LoadSDNode>(Node), + DAG, TLI, Value, Chain); + } + } + } + break; + } + case TargetLowering::Expand: + if (!TLI.isLoadExtLegal(ISD::EXTLOAD, SrcVT) && TLI.isTypeLegal(SrcVT)) { + SDValue Load = DAG.getLoad(SrcVT, dl, Chain, Ptr, + LD->getPointerInfo(), + LD->isVolatile(), LD->isNonTemporal(), + LD->isInvariant(), LD->getAlignment()); + unsigned ExtendOp; + switch (ExtType) { + case ISD::EXTLOAD: + ExtendOp = (SrcVT.isFloatingPoint() ? + ISD::FP_EXTEND : ISD::ANY_EXTEND); + break; + case ISD::SEXTLOAD: ExtendOp = ISD::SIGN_EXTEND; break; + case ISD::ZEXTLOAD: ExtendOp = ISD::ZERO_EXTEND; break; + default: llvm_unreachable("Unexpected extend load type!"); + } + Value = DAG.getNode(ExtendOp, dl, Node->getValueType(0), Load); + Chain = Load.getValue(1); + break; + } + + assert(!SrcVT.isVector() && + "Vector Loads are handled in LegalizeVectorOps"); + + // FIXME: This does not work for vectors on most targets. Sign- and + // zero-extend operations are currently folded into extending loads, + // whether they are legal or not, and then we end up here without any + // support for legalizing them. + assert(ExtType != ISD::EXTLOAD && + "EXTLOAD should always be supported!"); + // Turn the unsupported load into an EXTLOAD followed by an explicit + // zero/sign extend inreg. + SDValue Result = DAG.getExtLoad(ISD::EXTLOAD, dl, Node->getValueType(0), + Chain, Ptr, LD->getPointerInfo(), SrcVT, + LD->isVolatile(), LD->isNonTemporal(), + LD->getAlignment()); + SDValue ValRes; + if (ExtType == ISD::SEXTLOAD) + ValRes = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, + Result.getValueType(), + Result, DAG.getValueType(SrcVT)); + else + ValRes = DAG.getZeroExtendInReg(Result, dl, SrcVT.getScalarType()); + Value = ValRes; + Chain = Result.getValue(1); + break; + } + } + + // Since loads produce two values, make sure to remember that we legalized + // both of them. 
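The bookkeeping note above matters because a load node stays alive while either of its results has users; the replacement code that follows must rewire the chain as well as the value before the old node can go away. A toy illustration with a hypothetical two-result node:

#include <cassert>

struct TwoResultNode {
  int uses[2] = {0, 0};  // [0] = value users, [1] = chain users
  bool dead() const { return !uses[0] && !uses[1]; }
};

int main() {
  TwoResultNode Old, New;
  Old.uses[0] = 3;                            // users of the loaded value
  Old.uses[1] = 1;                            // users of the chain
  New.uses[0] += Old.uses[0]; Old.uses[0] = 0; // replace value result only
  assert(!Old.dead());                        // chain users keep Old alive
  New.uses[1] += Old.uses[1]; Old.uses[1] = 0; // chain must move too
  assert(Old.dead());                         // now safe to delete
}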
+ if (Chain.getNode() != Node) { + assert(Value.getNode() != Node && "Load must be completely replaced"); + DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), Value); + DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Chain); + ReplacedNode(Node); + } +} + /// LegalizeOp - Return a legal replacement for the given operation, with /// all legal operands. void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { if (Node->getOpcode() == ISD::TargetConstant) // Allow illegal target nodes. return; - DebugLoc dl = Node->getDebugLoc(); - for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) assert(TLI.getTypeAction(*DAG.getContext(), Node->getValueType(i)) == TargetLowering::TypeLegal && @@ -708,9 +1145,6 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { Node->getOperand(i).getOpcode() == ISD::TargetConstant) && "Unexpected illegal type!"); - SDValue Tmp1, Tmp2, Tmp3, Tmp4; - bool isCustom = false; - // Figure out the correct action; the way to query this varies by opcode TargetLowering::LegalizeAction Action = TargetLowering::Legal; bool SimpleFinishLegalizing = true; @@ -816,9 +1250,7 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { } if (SimpleFinishLegalizing) { - SmallVector<SDValue, 8> Ops; - for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i) - Ops.push_back(Node->getOperand(i)); + SDNode *NewNode = Node; switch (Node->getOpcode()) { default: break; case ISD::SHL: @@ -828,11 +1260,14 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { case ISD::ROTR: // Legalizing shifts/rotates requires adjusting the shift amount // to the appropriate width. - if (!Ops[1].getValueType().isVector()) { - SDValue SAO = DAG.getShiftAmountOperand(Ops[0].getValueType(), Ops[1]); + if (!Node->getOperand(1).getValueType().isVector()) { + SDValue SAO = + DAG.getShiftAmountOperand(Node->getOperand(0).getValueType(), + Node->getOperand(1)); HandleSDNode Handle(SAO); LegalizeOp(SAO.getNode()); - Ops[1] = Handle.getValue(); + NewNode = DAG.UpdateNodeOperands(Node, Node->getOperand(0), + Handle.getValue()); } break; case ISD::SRL_PARTS: @@ -840,18 +1275,21 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { case ISD::SHL_PARTS: // Legalizing shifts/rotates requires adjusting the shift amount // to the appropriate width. - if (!Ops[2].getValueType().isVector()) { - SDValue SAO = DAG.getShiftAmountOperand(Ops[0].getValueType(), Ops[2]); + if (!Node->getOperand(2).getValueType().isVector()) { + SDValue SAO = + DAG.getShiftAmountOperand(Node->getOperand(0).getValueType(), + Node->getOperand(2)); HandleSDNode Handle(SAO); LegalizeOp(SAO.getNode()); - Ops[2] = Handle.getValue(); + NewNode = DAG.UpdateNodeOperands(Node, Node->getOperand(0), + Node->getOperand(1), + Handle.getValue()); } break; } - SDNode *NewNode = DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size()); if (NewNode != Node) { - DAG.ReplaceAllUsesWith(Node, NewNode, this); + DAG.ReplaceAllUsesWith(Node, NewNode); for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) DAG.TransferDbgValues(SDValue(Node, i), SDValue(NewNode, i)); ReplacedNode(Node); @@ -860,27 +1298,27 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { switch (Action) { case TargetLowering::Legal: return; - case TargetLowering::Custom: + case TargetLowering::Custom: { // FIXME: The handling for custom lowering with multiple results is // a complete mess. 
- Tmp1 = TLI.LowerOperation(SDValue(Node, 0), DAG); - if (Tmp1.getNode()) { + SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG); + if (Res.getNode()) { SmallVector<SDValue, 8> ResultVals; for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) { if (e == 1) - ResultVals.push_back(Tmp1); + ResultVals.push_back(Res); else - ResultVals.push_back(Tmp1.getValue(i)); + ResultVals.push_back(Res.getValue(i)); } - if (Tmp1.getNode() != Node || Tmp1.getResNo() != 0) { - DAG.ReplaceAllUsesWith(Node, ResultVals.data(), this); + if (Res.getNode() != Node || Res.getResNo() != 0) { + DAG.ReplaceAllUsesWith(Node, ResultVals.data()); for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) DAG.TransferDbgValues(SDValue(Node, i), ResultVals[i]); ReplacedNode(Node); } return; } - + } // FALL THROUGH case TargetLowering::Expand: ExpandNode(Node); @@ -904,428 +1342,10 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { case ISD::CALLSEQ_END: break; case ISD::LOAD: { - LoadSDNode *LD = cast<LoadSDNode>(Node); - Tmp1 = LD->getChain(); // Legalize the chain. - Tmp2 = LD->getBasePtr(); // Legalize the base pointer. - - ISD::LoadExtType ExtType = LD->getExtensionType(); - if (ExtType == ISD::NON_EXTLOAD) { - EVT VT = Node->getValueType(0); - Tmp3 = SDValue(Node, 0); - Tmp4 = SDValue(Node, 1); - - switch (TLI.getOperationAction(Node->getOpcode(), VT)) { - default: llvm_unreachable("This action is not supported yet!"); - case TargetLowering::Legal: - // If this is an unaligned load and the target doesn't support it, - // expand it. - if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT())) { - Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = TLI.getTargetData()->getABITypeAlignment(Ty); - if (LD->getAlignment() < ABIAlignment){ - ExpandUnalignedLoad(cast<LoadSDNode>(Node), - DAG, TLI, Tmp3, Tmp4); - } - } - break; - case TargetLowering::Custom: - Tmp1 = TLI.LowerOperation(Tmp3, DAG); - if (Tmp1.getNode()) { - Tmp3 = Tmp1; - Tmp4 = Tmp1.getValue(1); - } - break; - case TargetLowering::Promote: { - // Only promote a load of vector type to another. - assert(VT.isVector() && "Cannot promote this load!"); - // Change base type to a different vector type. - EVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), VT); - - Tmp1 = DAG.getLoad(NVT, dl, Tmp1, Tmp2, LD->getPointerInfo(), - LD->isVolatile(), LD->isNonTemporal(), - LD->isInvariant(), LD->getAlignment()); - Tmp3 = DAG.getNode(ISD::BITCAST, dl, VT, Tmp1); - Tmp4 = Tmp1.getValue(1); - break; - } - } - if (Tmp4.getNode() != Node) { - assert(Tmp3.getNode() != Node && "Load must be completely replaced"); - DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), Tmp3); - DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Tmp4); - ReplacedNode(Node); - } - return; - } - - EVT SrcVT = LD->getMemoryVT(); - unsigned SrcWidth = SrcVT.getSizeInBits(); - unsigned Alignment = LD->getAlignment(); - bool isVolatile = LD->isVolatile(); - bool isNonTemporal = LD->isNonTemporal(); - - if (SrcWidth != SrcVT.getStoreSizeInBits() && - // Some targets pretend to have an i1 loading operation, and actually - // load an i8. This trick is correct for ZEXTLOAD because the top 7 - // bits are guaranteed to be zero; it helps the optimizers understand - // that these bits are zero. It is also useful for EXTLOAD, since it - // tells the optimizers that those bits are undefined. It would be - // nice to have an effective generic way of getting these benefits... - // Until such a way is found, don't insist on promoting i1 here. 
- (SrcVT != MVT::i1 || - TLI.getLoadExtAction(ExtType, MVT::i1) == TargetLowering::Promote)) { - // Promote to a byte-sized load if not loading an integral number of - // bytes. For example, promote EXTLOAD:i20 -> EXTLOAD:i24. - unsigned NewWidth = SrcVT.getStoreSizeInBits(); - EVT NVT = EVT::getIntegerVT(*DAG.getContext(), NewWidth); - SDValue Ch; - - // The extra bits are guaranteed to be zero, since we stored them that - // way. A zext load from NVT thus automatically gives zext from SrcVT. - - ISD::LoadExtType NewExtType = - ExtType == ISD::ZEXTLOAD ? ISD::ZEXTLOAD : ISD::EXTLOAD; - - SDValue Result = - DAG.getExtLoad(NewExtType, dl, Node->getValueType(0), - Tmp1, Tmp2, LD->getPointerInfo(), - NVT, isVolatile, isNonTemporal, Alignment); - - Ch = Result.getValue(1); // The chain. - - if (ExtType == ISD::SEXTLOAD) - // Having the top bits zero doesn't help when sign extending. - Result = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, - Result.getValueType(), - Result, DAG.getValueType(SrcVT)); - else if (ExtType == ISD::ZEXTLOAD || NVT == Result.getValueType()) - // All the top bits are guaranteed to be zero - inform the optimizers. - Result = DAG.getNode(ISD::AssertZext, dl, - Result.getValueType(), Result, - DAG.getValueType(SrcVT)); - - Tmp1 = Result; - Tmp2 = Ch; - } else if (SrcWidth & (SrcWidth - 1)) { - // If not loading a power-of-2 number of bits, expand as two loads. - assert(!SrcVT.isVector() && "Unsupported extload!"); - unsigned RoundWidth = 1 << Log2_32(SrcWidth); - assert(RoundWidth < SrcWidth); - unsigned ExtraWidth = SrcWidth - RoundWidth; - assert(ExtraWidth < RoundWidth); - assert(!(RoundWidth % 8) && !(ExtraWidth % 8) && - "Load size not an integral number of bytes!"); - EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth); - EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth); - SDValue Lo, Hi, Ch; - unsigned IncrementSize; - - if (TLI.isLittleEndian()) { - // EXTLOAD:i24 -> ZEXTLOAD:i16 | (shl EXTLOAD@+2:i8, 16) - // Load the bottom RoundWidth bits. - Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0), - Tmp1, Tmp2, - LD->getPointerInfo(), RoundVT, isVolatile, - isNonTemporal, Alignment); - - // Load the remaining ExtraWidth bits. - IncrementSize = RoundWidth / 8; - Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2, - DAG.getIntPtrConstant(IncrementSize)); - Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Tmp1, Tmp2, - LD->getPointerInfo().getWithOffset(IncrementSize), - ExtraVT, isVolatile, isNonTemporal, - MinAlign(Alignment, IncrementSize)); - - // Build a factor node to remember that this load is independent of - // the other one. - Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), - Hi.getValue(1)); - - // Move the top bits to the right place. - Hi = DAG.getNode(ISD::SHL, dl, Hi.getValueType(), Hi, - DAG.getConstant(RoundWidth, - TLI.getShiftAmountTy(Hi.getValueType()))); - - // Join the hi and lo parts. - Tmp1 = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi); - } else { - // Big endian - avoid unaligned loads. - // EXTLOAD:i24 -> (shl EXTLOAD:i16, 8) | ZEXTLOAD@+2:i8 - // Load the top RoundWidth bits. - Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Tmp1, Tmp2, - LD->getPointerInfo(), RoundVT, isVolatile, - isNonTemporal, Alignment); - - // Load the remaining ExtraWidth bits. 
- IncrementSize = RoundWidth / 8; - Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2, - DAG.getIntPtrConstant(IncrementSize)); - Lo = DAG.getExtLoad(ISD::ZEXTLOAD, - dl, Node->getValueType(0), Tmp1, Tmp2, - LD->getPointerInfo().getWithOffset(IncrementSize), - ExtraVT, isVolatile, isNonTemporal, - MinAlign(Alignment, IncrementSize)); - - // Build a factor node to remember that this load is independent of - // the other one. - Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), - Hi.getValue(1)); - - // Move the top bits to the right place. - Hi = DAG.getNode(ISD::SHL, dl, Hi.getValueType(), Hi, - DAG.getConstant(ExtraWidth, - TLI.getShiftAmountTy(Hi.getValueType()))); - - // Join the hi and lo parts. - Tmp1 = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi); - } - - Tmp2 = Ch; - } else { - switch (TLI.getLoadExtAction(ExtType, SrcVT)) { - default: llvm_unreachable("This action is not supported yet!"); - case TargetLowering::Custom: - isCustom = true; - // FALLTHROUGH - case TargetLowering::Legal: - Tmp1 = SDValue(Node, 0); - Tmp2 = SDValue(Node, 1); - - if (isCustom) { - Tmp3 = TLI.LowerOperation(SDValue(Node, 0), DAG); - if (Tmp3.getNode()) { - Tmp1 = Tmp3; - Tmp2 = Tmp3.getValue(1); - } - } else { - // If this is an unaligned load and the target doesn't support it, - // expand it. - if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT())) { - Type *Ty = - LD->getMemoryVT().getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = - TLI.getTargetData()->getABITypeAlignment(Ty); - if (LD->getAlignment() < ABIAlignment){ - ExpandUnalignedLoad(cast<LoadSDNode>(Node), - DAG, TLI, Tmp1, Tmp2); - } - } - } - break; - case TargetLowering::Expand: - if (!TLI.isLoadExtLegal(ISD::EXTLOAD, SrcVT) && TLI.isTypeLegal(SrcVT)) { - SDValue Load = DAG.getLoad(SrcVT, dl, Tmp1, Tmp2, - LD->getPointerInfo(), - LD->isVolatile(), LD->isNonTemporal(), - LD->isInvariant(), LD->getAlignment()); - unsigned ExtendOp; - switch (ExtType) { - case ISD::EXTLOAD: - ExtendOp = (SrcVT.isFloatingPoint() ? - ISD::FP_EXTEND : ISD::ANY_EXTEND); - break; - case ISD::SEXTLOAD: ExtendOp = ISD::SIGN_EXTEND; break; - case ISD::ZEXTLOAD: ExtendOp = ISD::ZERO_EXTEND; break; - default: llvm_unreachable("Unexpected extend load type!"); - } - Tmp1 = DAG.getNode(ExtendOp, dl, Node->getValueType(0), Load); - Tmp2 = Load.getValue(1); - break; - } - - assert(!SrcVT.isVector() && - "Vector Loads are handled in LegalizeVectorOps"); - - // FIXME: This does not work for vectors on most targets. Sign- and - // zero-extend operations are currently folded into extending loads, - // whether they are legal or not, and then we end up here without any - // support for legalizing them. - assert(ExtType != ISD::EXTLOAD && - "EXTLOAD should always be supported!"); - // Turn the unsupported load into an EXTLOAD followed by an explicit - // zero/sign extend inreg. - SDValue Result = DAG.getExtLoad(ISD::EXTLOAD, dl, Node->getValueType(0), - Tmp1, Tmp2, LD->getPointerInfo(), SrcVT, - LD->isVolatile(), LD->isNonTemporal(), - LD->getAlignment()); - SDValue ValRes; - if (ExtType == ISD::SEXTLOAD) - ValRes = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, - Result.getValueType(), - Result, DAG.getValueType(SrcVT)); - else - ValRes = DAG.getZeroExtendInReg(Result, dl, SrcVT.getScalarType()); - Tmp1 = ValRes; - Tmp2 = Result.getValue(1); - break; - } - } - - // Since loads produce two values, make sure to remember that we legalized - // both of them. 
- if (Tmp2.getNode() != Node) { - assert(Tmp1.getNode() != Node && "Load must be completely replaced"); - DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), Tmp1); - DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Tmp2); - ReplacedNode(Node); - } - break; + return LegalizeLoadOps(Node); } case ISD::STORE: { - StoreSDNode *ST = cast<StoreSDNode>(Node); - Tmp1 = ST->getChain(); - Tmp2 = ST->getBasePtr(); - unsigned Alignment = ST->getAlignment(); - bool isVolatile = ST->isVolatile(); - bool isNonTemporal = ST->isNonTemporal(); - - if (!ST->isTruncatingStore()) { - if (SDNode *OptStore = OptimizeFloatStore(ST).getNode()) { - ReplaceNode(ST, OptStore); - break; - } - - { - Tmp3 = ST->getValue(); - EVT VT = Tmp3.getValueType(); - switch (TLI.getOperationAction(ISD::STORE, VT)) { - default: llvm_unreachable("This action is not supported yet!"); - case TargetLowering::Legal: - // If this is an unaligned store and the target doesn't support it, - // expand it. - if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT())) { - Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment= TLI.getTargetData()->getABITypeAlignment(Ty); - if (ST->getAlignment() < ABIAlignment) - ExpandUnalignedStore(cast<StoreSDNode>(Node), - DAG, TLI, this); - } - break; - case TargetLowering::Custom: - Tmp1 = TLI.LowerOperation(SDValue(Node, 0), DAG); - if (Tmp1.getNode()) - ReplaceNode(SDValue(Node, 0), Tmp1); - break; - case TargetLowering::Promote: { - assert(VT.isVector() && "Unknown legal promote case!"); - Tmp3 = DAG.getNode(ISD::BITCAST, dl, - TLI.getTypeToPromoteTo(ISD::STORE, VT), Tmp3); - SDValue Result = - DAG.getStore(Tmp1, dl, Tmp3, Tmp2, - ST->getPointerInfo(), isVolatile, - isNonTemporal, Alignment); - ReplaceNode(SDValue(Node, 0), Result); - break; - } - } - break; - } - } else { - Tmp3 = ST->getValue(); - - EVT StVT = ST->getMemoryVT(); - unsigned StWidth = StVT.getSizeInBits(); - - if (StWidth != StVT.getStoreSizeInBits()) { - // Promote to a byte-sized store with upper bits zero if not - // storing an integral number of bytes. For example, promote - // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1) - EVT NVT = EVT::getIntegerVT(*DAG.getContext(), - StVT.getStoreSizeInBits()); - Tmp3 = DAG.getZeroExtendInReg(Tmp3, dl, StVT); - SDValue Result = - DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), - NVT, isVolatile, isNonTemporal, Alignment); - ReplaceNode(SDValue(Node, 0), Result); - } else if (StWidth & (StWidth - 1)) { - // If not storing a power-of-2 number of bits, expand as two stores. - assert(!StVT.isVector() && "Unsupported truncstore!"); - unsigned RoundWidth = 1 << Log2_32(StWidth); - assert(RoundWidth < StWidth); - unsigned ExtraWidth = StWidth - RoundWidth; - assert(ExtraWidth < RoundWidth); - assert(!(RoundWidth % 8) && !(ExtraWidth % 8) && - "Store size not an integral number of bytes!"); - EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth); - EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth); - SDValue Lo, Hi; - unsigned IncrementSize; - - if (TLI.isLittleEndian()) { - // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 X, TRUNCSTORE@+2:i8 (srl X, 16) - // Store the bottom RoundWidth bits. - Lo = DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), - RoundVT, - isVolatile, isNonTemporal, Alignment); - - // Store the remaining ExtraWidth bits. 
- IncrementSize = RoundWidth / 8; - Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2, - DAG.getIntPtrConstant(IncrementSize)); - Hi = DAG.getNode(ISD::SRL, dl, Tmp3.getValueType(), Tmp3, - DAG.getConstant(RoundWidth, - TLI.getShiftAmountTy(Tmp3.getValueType()))); - Hi = DAG.getTruncStore(Tmp1, dl, Hi, Tmp2, - ST->getPointerInfo().getWithOffset(IncrementSize), - ExtraVT, isVolatile, isNonTemporal, - MinAlign(Alignment, IncrementSize)); - } else { - // Big endian - avoid unaligned stores. - // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 (srl X, 8), TRUNCSTORE@+2:i8 X - // Store the top RoundWidth bits. - Hi = DAG.getNode(ISD::SRL, dl, Tmp3.getValueType(), Tmp3, - DAG.getConstant(ExtraWidth, - TLI.getShiftAmountTy(Tmp3.getValueType()))); - Hi = DAG.getTruncStore(Tmp1, dl, Hi, Tmp2, ST->getPointerInfo(), - RoundVT, isVolatile, isNonTemporal, Alignment); - - // Store the remaining ExtraWidth bits. - IncrementSize = RoundWidth / 8; - Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2, - DAG.getIntPtrConstant(IncrementSize)); - Lo = DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, - ST->getPointerInfo().getWithOffset(IncrementSize), - ExtraVT, isVolatile, isNonTemporal, - MinAlign(Alignment, IncrementSize)); - } - - // The order of the stores doesn't matter. - SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); - ReplaceNode(SDValue(Node, 0), Result); - } else { - switch (TLI.getTruncStoreAction(ST->getValue().getValueType(), StVT)) { - default: llvm_unreachable("This action is not supported yet!"); - case TargetLowering::Legal: - // If this is an unaligned store and the target doesn't support it, - // expand it. - if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT())) { - Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment= TLI.getTargetData()->getABITypeAlignment(Ty); - if (ST->getAlignment() < ABIAlignment) - ExpandUnalignedStore(cast<StoreSDNode>(Node), DAG, TLI, this); - } - break; - case TargetLowering::Custom: - ReplaceNode(SDValue(Node, 0), - TLI.LowerOperation(SDValue(Node, 0), DAG)); - break; - case TargetLowering::Expand: - assert(!StVT.isVector() && - "Vector Stores are handled in LegalizeVectorOps"); - - // TRUNCSTORE:i16 i32 -> STORE i16 - assert(TLI.isTypeLegal(StVT) && "Do not know how to expand this store!"); - Tmp3 = DAG.getNode(ISD::TRUNCATE, dl, StVT, Tmp3); - SDValue Result = - DAG.getStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), - isVolatile, isNonTemporal, Alignment); - ReplaceNode(SDValue(Node, 0), Result); - break; - } - } - } - break; + return LegalizeStoreOps(Node); } } } @@ -1795,11 +1815,13 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, if (isTailCall) InChain = TCChain; - std::pair<SDValue, SDValue> CallInfo = - TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false, + TargetLowering:: + CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, false, 0, TLI.getLibcallCallingConv(LC), isTailCall, /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, Callee, Args, DAG, Node->getDebugLoc()); + std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); + if (!CallInfo.second.getNode()) // It's a tailcall, return the chain (which is the DAG root). 
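The truncating-store and extending-load expansions moved above into LegalizeStoreOps/LegalizeLoadOps split a non-power-of-two-width access into a power-of-two part plus a remainder, exactly as their comments describe (EXTLOAD:i24 -> ZEXTLOAD:i16 | (shl EXTLOAD@+2:i8, 16)). A standalone C++ sketch of the little-endian load case, with plain integer loads standing in for the SelectionDAG nodes; it assumes a little-endian host so that the bare uint16_t load behaves like ZEXTLOAD:i16:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <cstdio>

// Load a 24-bit little-endian value: fetch the bottom RoundWidth (16) bits,
// fetch the remaining ExtraWidth (8) bits at offset RoundWidth/8, then shift
// the high part into place and OR the two halves together.
static uint32_t loadI24LE(const uint8_t *P) {
  uint16_t Lo;                                 // RoundVT piece (ZEXTLOAD:i16)
  uint8_t Hi;                                  // ExtraVT piece (EXTLOAD@+2:i8)
  std::memcpy(&Lo, P, sizeof(Lo));
  std::memcpy(&Hi, P + 2, sizeof(Hi));
  return uint32_t(Lo) | (uint32_t(Hi) << 16);  // join the hi and lo parts
}

int main() {
  const uint8_t Buf[3] = {0x56, 0x34, 0x12};   // 0x123456, stored little-endian
  assert(loadI24LE(Buf) == 0x123456);
  std::printf("0x%06x\n", loadI24LE(Buf));
  return 0;
}

The big-endian variant in the code above mirrors this: it loads the top RoundWidth bits first and shifts by ExtraWidth instead of RoundWidth, so neither half is an unaligned access.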
@@ -1828,11 +1850,13 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT, TLI.getPointerTy()); Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); - std::pair<SDValue,SDValue> CallInfo = - TLI.LowerCallTo(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false, - false, 0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false, + TargetLowering:: + CallLoweringInfo CLI(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false, + false, 0, TLI.getLibcallCallingConv(LC), + /*isTailCall=*/false, /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, Callee, Args, DAG, dl); + std::pair<SDValue,SDValue> CallInfo = TLI.LowerCallTo(CLI); return CallInfo.first; } @@ -1860,11 +1884,12 @@ SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC, TLI.getPointerTy()); Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext()); - std::pair<SDValue, SDValue> CallInfo = - TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false, + TargetLowering:: + CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, false, 0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false, /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, Callee, Args, DAG, Node->getDebugLoc()); + std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); return CallInfo; } @@ -1919,9 +1944,11 @@ static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned, return TLI.getLibcallName(LC) != 0; } -/// UseDivRem - Only issue divrem libcall if both quotient and remainder are +/// useDivRem - Only issue divrem libcall if both quotient and remainder are /// needed. -static bool UseDivRem(SDNode *Node, bool isSigned, bool isDIV) { +static bool useDivRem(SDNode *Node, bool isSigned, bool isDIV) { + // The other use might have been replaced with a divrem already. + unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM; unsigned OtherOpcode = 0; if (isSigned) OtherOpcode = isDIV ? ISD::SREM : ISD::SDIV; @@ -1935,7 +1962,7 @@ static bool UseDivRem(SDNode *Node, bool isSigned, bool isDIV) { SDNode *User = *UI; if (User == Node) continue; - if (User->getOpcode() == OtherOpcode && + if ((User->getOpcode() == OtherOpcode || User->getOpcode() == DivRemOpc) && User->getOperand(0) == Op0 && User->getOperand(1) == Op1) return true; @@ -1992,11 +2019,12 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, TLI.getPointerTy()); DebugLoc dl = Node->getDebugLoc(); - std::pair<SDValue, SDValue> CallInfo = - TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false, + TargetLowering:: + CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, false, 0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false, /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, Callee, Args, DAG, dl); + std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); // Remainder is loaded back from the stack frame. SDValue Rem = DAG.getLoad(RetVT, dl, CallInfo.second, FIPtr, @@ -2570,14 +2598,17 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { // If the target didn't lower this, lower it to '__sync_synchronize()' call // FIXME: handle "fence singlethread" more efficiently. 
TargetLowering::ArgListTy Args; - std::pair<SDValue, SDValue> CallResult = - TLI.LowerCallTo(Node->getOperand(0), Type::getVoidTy(*DAG.getContext()), + TargetLowering:: + CallLoweringInfo CLI(Node->getOperand(0), + Type::getVoidTy(*DAG.getContext()), false, false, false, false, 0, CallingConv::C, /*isTailCall=*/false, /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, DAG.getExternalSymbol("__sync_synchronize", TLI.getPointerTy()), Args, DAG, dl); + std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); + Results.push_back(CallResult.second); break; } @@ -2647,13 +2678,16 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { case ISD::TRAP: { // If this operation is not supported, lower it to 'abort()' call TargetLowering::ArgListTy Args; - std::pair<SDValue, SDValue> CallResult = - TLI.LowerCallTo(Node->getOperand(0), Type::getVoidTy(*DAG.getContext()), + TargetLowering:: + CallLoweringInfo CLI(Node->getOperand(0), + Type::getVoidTy(*DAG.getContext()), false, false, false, false, 0, CallingConv::C, /*isTailCall=*/false, /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, DAG.getExternalSymbol("abort", TLI.getPointerTy()), Args, DAG, dl); + std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); + Results.push_back(CallResult.second); break; } @@ -3059,7 +3093,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { "Don't know how to expand this subtraction!"); Tmp1 = DAG.getNode(ISD::XOR, dl, VT, Node->getOperand(1), DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), VT)); - Tmp1 = DAG.getNode(ISD::ADD, dl, VT, Tmp2, DAG.getConstant(1, VT)); + Tmp1 = DAG.getNode(ISD::ADD, dl, VT, Tmp1, DAG.getConstant(1, VT)); Results.push_back(DAG.getNode(ISD::ADD, dl, VT, Node->getOperand(0), Tmp1)); break; } @@ -3074,7 +3108,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { Tmp3 = Node->getOperand(1); if (TLI.isOperationLegalOrCustom(DivRemOpc, VT) || (isDivRemLibcallAvailable(Node, isSigned, TLI) && - UseDivRem(Node, isSigned, false))) { + useDivRem(Node, isSigned, false))) { Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Tmp2, Tmp3).getValue(1); } else if (TLI.isOperationLegalOrCustom(DivOpc, VT)) { // X % Y -> X-X/Y*Y @@ -3102,7 +3136,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { SDVTList VTs = DAG.getVTList(VT, VT); if (TLI.isOperationLegalOrCustom(DivRemOpc, VT) || (isDivRemLibcallAvailable(Node, isSigned, TLI) && - UseDivRem(Node, isSigned, true))) + useDivRem(Node, isSigned, true))) Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Node->getOperand(0), Node->getOperand(1)); else if (isSigned) diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 95ddb1e..e8e968a 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -588,18 +588,14 @@ SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) { unsigned NumElts = InVT.getVectorNumElements(); assert(NumElts == NVT.getVectorNumElements() && "Dst and Src must have the same number of elements"); - EVT EltVT = InVT.getScalarType(); assert(isPowerOf2_32(NumElts) && "Promoted vector type must be a power of two"); - EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts/2); + SDValue EOp1, EOp2; + GetSplitVector(InOp, EOp1, EOp2); + EVT HalfNVT = EVT::getVectorVT(*DAG.getContext(), NVT.getScalarType(), NumElts/2); - - SDValue EOp1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, InOp, - DAG.getIntPtrConstant(0)); - SDValue EOp2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, 
HalfVT, InOp, - DAG.getIntPtrConstant(NumElts/2)); EOp1 = DAG.getNode(ISD::TRUNCATE, dl, HalfNVT, EOp1); EOp2 = DAG.getNode(ISD::TRUNCATE, dl, HalfNVT, EOp2); @@ -2273,9 +2269,9 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N, // A divide for UMULO will be faster than a function call. Select to // make sure we aren't using 0. SDValue isZero = DAG.getSetCC(dl, TLI.getSetCCResultType(VT), - RHS, DAG.getConstant(0, VT), ISD::SETNE); + RHS, DAG.getConstant(0, VT), ISD::SETNE); SDValue NotZero = DAG.getNode(ISD::SELECT, dl, VT, isZero, - DAG.getConstant(1, VT), RHS); + DAG.getConstant(1, VT), RHS); SDValue DIV = DAG.getNode(ISD::UDIV, DL, LHS.getValueType(), MUL, NotZero); SDValue Overflow; Overflow = DAG.getSetCC(DL, N->getValueType(1), DIV, LHS, ISD::SETNE); @@ -2296,8 +2292,8 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N, SDValue Temp = DAG.CreateStackTemporary(PtrVT); // Temporary for the overflow value, default it to zero. SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, - DAG.getConstant(0, PtrVT), Temp, - MachinePointerInfo(), false, false, 0); + DAG.getConstant(0, PtrVT), Temp, + MachinePointerInfo(), false, false, 0); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; @@ -2319,16 +2315,17 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N, Args.push_back(Entry); SDValue Func = DAG.getExternalSymbol(TLI.getLibcallName(LC), PtrVT); - std::pair<SDValue, SDValue> CallInfo = - TLI.LowerCallTo(Chain, RetTy, true, false, false, false, - 0, TLI.getLibcallCallingConv(LC), - /*isTailCall=*/false, - /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, - Func, Args, DAG, dl); + TargetLowering:: + CallLoweringInfo CLI(Chain, RetTy, true, false, false, false, + 0, TLI.getLibcallCallingConv(LC), + /*isTailCall=*/false, + /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, + Func, Args, DAG, dl); + std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); SplitInteger(CallInfo.first, Lo, Hi); SDValue Temp2 = DAG.getLoad(PtrVT, dl, CallInfo.second, Temp, - MachinePointerInfo(), false, false, false, 0); + MachinePointerInfo(), false, false, false, 0); SDValue Ofl = DAG.getSetCC(dl, N->getValueType(1), Temp2, DAG.getConstant(0, PtrVT), ISD::SETNE); diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index 439aa4d..39337ff 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -628,7 +628,8 @@ namespace { public: explicit NodeUpdateListener(DAGTypeLegalizer &dtl, SmallSetVector<SDNode*, 16> &nta) - : DTL(dtl), NodesToAnalyze(nta) {} + : SelectionDAG::DAGUpdateListener(dtl.getDAG()), + DTL(dtl), NodesToAnalyze(nta) {} virtual void NodeDeleted(SDNode *N, SDNode *E) { assert(N->getNodeId() != DAGTypeLegalizer::ReadyToProcess && @@ -680,7 +681,7 @@ void DAGTypeLegalizer::ReplaceValueWith(SDValue From, SDValue To) { SmallSetVector<SDNode*, 16> NodesToAnalyze; NodeUpdateListener NUL(*this, NodesToAnalyze); do { - DAG.ReplaceAllUsesOfValueWith(From, To, &NUL); + DAG.ReplaceAllUsesOfValueWith(From, To); // The old node may still be present in a map like ExpandedIntegers or // PromotedIntegers. Inform maps about the replacement. 
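The hunks above drop the DAGUpdateListener parameter from ReplaceAllUsesOfValueWith: listeners now hand the DAG to the DAGUpdateListener base class, which (as the SelectionDAG.cpp changes further down show) chains them through an UpdateListeners list and notifies every registered listener on each deletion or update. A minimal sketch of that self-registering pattern, using hypothetical Listener/Graph types rather than the real LLVM classes:

#include <cassert>
#include <cstdio>

struct Graph;

// Listeners register themselves in their constructor and unregister in their
// destructor, so mutation routines no longer take a listener argument.
struct Listener {
  Graph &G;
  Listener *Next;                    // next registered listener in the chain
  explicit Listener(Graph &Gr);
  virtual ~Listener();
  virtual void NodeDeleted(int N) { (void)N; }  // default no-op callback
};

struct Graph {
  Listener *Listeners = nullptr;     // head of the intrusive listener list
  void deleteNode(int N) {
    for (Listener *L = Listeners; L; L = L->Next)  // notify every listener
      L->NodeDeleted(N);
  }
  ~Graph() { assert(!Listeners && "dangling registered listeners"); }
};

Listener::Listener(Graph &Gr) : G(Gr), Next(Gr.Listeners) {
  Gr.Listeners = this;               // push onto the head of the chain
}
Listener::~Listener() {
  assert(G.Listeners == this && "listeners must unregister in LIFO order");
  G.Listeners = Next;
}

struct PrintingListener : Listener {
  explicit PrintingListener(Graph &Gr) : Listener(Gr) {}
  void NodeDeleted(int N) override { std::printf("deleted node %d\n", N); }
};

int main() {
  Graph G;
  {
    PrintingListener PL(G);          // registers itself, like NodeUpdateListener
    G.deleteNode(42);                // PL is notified
  }                                  // destructor unregisters PL
  G.deleteNode(7);                   // no listeners left; nothing printed
  return 0;
}

Stack-scoped registration of this kind is what makes the "Dangling registered DAGUpdateListeners" assert added to the SelectionDAG destructor below safe to enforce.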
@@ -709,7 +710,7 @@ void DAGTypeLegalizer::ReplaceValueWith(SDValue From, SDValue To) { SDValue NewVal(M, i); if (M->getNodeId() == Processed) RemapValue(NewVal); - DAG.ReplaceAllUsesOfValueWith(OldVal, NewVal, &NUL); + DAG.ReplaceAllUsesOfValueWith(OldVal, NewVal); // OldVal may be a target of the ReplacedValues map which was marked // NewNode to force reanalysis because it was updated. Ensure that // anything that ReplacedValues mapped to OldVal will now be mapped @@ -950,7 +951,7 @@ SDValue DAGTypeLegalizer::DisintegrateMERGE_VALUES(SDNode *N, unsigned ResNo) { for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) if (i != ResNo) ReplaceValueWith(SDValue(N, i), SDValue(N->getOperand(i))); - return SDValue(N, ResNo); + return SDValue(N->getOperand(ResNo)); } /// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type @@ -1054,12 +1055,14 @@ SDValue DAGTypeLegalizer::MakeLibCall(RTLIB::Libcall LC, EVT RetVT, TLI.getPointerTy()); Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); - std::pair<SDValue,SDValue> CallInfo = - TLI.LowerCallTo(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false, + TargetLowering:: + CallLoweringInfo CLI(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false, false, 0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false, /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, Callee, Args, DAG, dl); + std::pair<SDValue,SDValue> CallInfo = TLI.LowerCallTo(CLI); + return CallInfo.first; } @@ -1086,11 +1089,12 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC, TLI.getPointerTy()); Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext()); - std::pair<SDValue, SDValue> CallInfo = - TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false, + TargetLowering:: + CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, false, 0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false, /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, Callee, Args, DAG, Node->getDebugLoc()); + std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); return CallInfo; } diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h index e866445..94fc976 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -135,6 +135,8 @@ public: ReplacedValues[SDValue(Old, i)] = SDValue(New, i); } + SelectionDAG &getDAG() const { return DAG; } + private: SDNode *AnalyzeNewNode(SDNode *N); void AnalyzeNewValue(SDValue &Val); @@ -151,7 +153,7 @@ private: /// DisintegrateMERGE_VALUES - Replace each result of the given MERGE_VALUES /// node with the corresponding input operand, except for the result 'ResNo', - /// which is returned. + /// for which the corresponding input operand is returned. 
SDValue DisintegrateMERGE_VALUES(SDNode *N, unsigned ResNo); SDValue GetVectorElementPointer(SDValue VecPtr, EVT EltVT, SDValue Index); @@ -509,10 +511,12 @@ private: void ScalarizeVectorResult(SDNode *N, unsigned OpNo); SDValue ScalarizeVecRes_MERGE_VALUES(SDNode *N, unsigned ResNo); SDValue ScalarizeVecRes_BinOp(SDNode *N); + SDValue ScalarizeVecRes_TernaryOp(SDNode *N); SDValue ScalarizeVecRes_UnaryOp(SDNode *N); SDValue ScalarizeVecRes_InregOp(SDNode *N); SDValue ScalarizeVecRes_BITCAST(SDNode *N); + SDValue ScalarizeVecRes_BUILD_VECTOR(SDNode *N); SDValue ScalarizeVecRes_CONVERT_RNDSAT(SDNode *N); SDValue ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N); SDValue ScalarizeVecRes_FP_ROUND(SDNode *N); @@ -553,6 +557,7 @@ private: // Vector Result Splitting: <128 x ty> -> 2 x <64 x ty>. void SplitVectorResult(SDNode *N, unsigned OpNo); void SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi); diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index a8ff7c6..06f6bd6 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -168,6 +168,7 @@ void DAGTypeLegalizer::ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue OldVec = N->getOperand(0); unsigned OldElts = OldVec.getValueType().getVectorNumElements(); + EVT OldEltVT = OldVec.getValueType().getVectorElementType(); DebugLoc dl = N->getDebugLoc(); // Convert to a vector of the expanded element type, for example @@ -175,6 +176,15 @@ void DAGTypeLegalizer::ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo, EVT OldVT = N->getValueType(0); EVT NewVT = TLI.getTypeToTransformTo(*DAG.getContext(), OldVT); + if (OldVT != OldEltVT) { + // The result of EXTRACT_VECTOR_ELT may be larger than the element type of + // the input vector. If so, extend the elements of the input vector to the + // same bitwidth as the result before expanding. + assert(OldEltVT.bitsLT(OldVT) && "Result type smaller than element type!"); + EVT NVecVT = EVT::getVectorVT(*DAG.getContext(), OldVT, OldElts); + OldVec = DAG.getNode(ISD::ANY_EXTEND, dl, NVecVT, N->getOperand(0)); + } + SDValue NewVec = DAG.getNode(ISD::BITCAST, dl, EVT::getVectorVT(*DAG.getContext(), NewVT, 2*OldElts), diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 9fe4480..704f99b 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -71,6 +71,9 @@ class VectorLegalizer { // operands to a different type and bitcasting the result back to the // original type. SDValue PromoteVectorOp(SDValue Op); + // Implements [SU]INT_TO_FP vector promotion; this is a [zs]ext of the input + // operand to the next size up. 
+ SDValue PromoteVectorOpINT_TO_FP(SDValue Op); public: bool Run(); @@ -231,9 +234,19 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { switch (TLI.getOperationAction(Node->getOpcode(), QueryType)) { case TargetLowering::Promote: - // "Promote" the operation by bitcasting - Result = PromoteVectorOp(Op); - Changed = true; + switch (Op.getOpcode()) { + default: + // "Promote" the operation by bitcasting + Result = PromoteVectorOp(Op); + Changed = true; + break; + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP: + // "Promote" the operation by extending the operand. + Result = PromoteVectorOpINT_TO_FP(Op); + Changed = true; + break; + } break; case TargetLowering::Legal: break; case TargetLowering::Custom: { @@ -293,6 +306,44 @@ SDValue VectorLegalizer::PromoteVectorOp(SDValue Op) { return DAG.getNode(ISD::BITCAST, dl, VT, Op); } +SDValue VectorLegalizer::PromoteVectorOpINT_TO_FP(SDValue Op) { + // INT_TO_FP operations may require the input operand be promoted even + // when the type is otherwise legal. + EVT VT = Op.getOperand(0).getValueType(); + assert(Op.getNode()->getNumValues() == 1 && + "Can't promote a vector with multiple results!"); + + // Normal getTypeToPromoteTo() doesn't work here, as that will promote + // by widening the vector w/ the same element width and twice the number + // of elements. We want the other way around, the same number of elements, + // each twice the width. + // + // Increase the bitwidth of the element to the next pow-of-two + // (which is greater than 8 bits). + unsigned NumElts = VT.getVectorNumElements(); + EVT EltVT = VT.getVectorElementType(); + EltVT = EVT::getIntegerVT(*DAG.getContext(), 2 * EltVT.getSizeInBits()); + assert(EltVT.isSimple() && "Promoting to a non-simple vector type!"); + + // Build a new vector type and check if it is legal. + MVT NVT = MVT::getVectorVT(EltVT.getSimpleVT(), NumElts); + + DebugLoc dl = Op.getDebugLoc(); + SmallVector<SDValue, 4> Operands(Op.getNumOperands()); + + unsigned Opc = Op.getOpcode() == ISD::UINT_TO_FP ? ISD::ZERO_EXTEND : + ISD::SIGN_EXTEND; + for (unsigned j = 0; j != Op.getNumOperands(); ++j) { + if (Op.getOperand(j).getValueType().isVector()) + Operands[j] = DAG.getNode(Opc, dl, NVT, Op.getOperand(j)); + else + Operands[j] = Op.getOperand(j); + } + + return DAG.getNode(Op.getOpcode(), dl, Op.getValueType(), &Operands[0], + Operands.size()); +} + SDValue VectorLegalizer::ExpandLoad(SDValue Op) { DebugLoc dl = Op.getDebugLoc(); diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 5f23f01..4709202 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -48,7 +48,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::MERGE_VALUES: R = ScalarizeVecRes_MERGE_VALUES(N, ResNo);break; case ISD::BITCAST: R = ScalarizeVecRes_BITCAST(N); break; - case ISD::BUILD_VECTOR: R = N->getOperand(0); break; + case ISD::BUILD_VECTOR: R = ScalarizeVecRes_BUILD_VECTOR(N); break; case ISD::CONVERT_RNDSAT: R = ScalarizeVecRes_CONVERT_RNDSAT(N); break; case ISD::EXTRACT_SUBVECTOR: R = ScalarizeVecRes_EXTRACT_SUBVECTOR(N); break; case ISD::FP_ROUND: R = ScalarizeVecRes_FP_ROUND(N); break; @@ -115,6 +115,9 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::SRL: R = ScalarizeVecRes_BinOp(N); break; + case ISD::FMA: + R = ScalarizeVecRes_TernaryOp(N); + break; } // If R is null, the sub-method took care of registering the result. 
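PromoteVectorOpINT_TO_FP, declared and defined in the LegalizeVectorOps.cpp hunks above, widens each integer element to twice its width before converting: zero-extension for UINT_TO_FP and sign-extension for SINT_TO_FP, since a bitcast-style promotion would not preserve the numeric value of the operands. A scalar C++ model of the idea (illustrative only, standing in for a v4i16 -> v4i32 -> v4f32 promotion):

#include <cstdint>
#include <cstdio>

// Signed case: ISD::SIGN_EXTEND each element to 2x width, then convert.
static void sintToFP(const int16_t In[4], float Out[4]) {
  for (int i = 0; i != 4; ++i) {
    int32_t Wide = In[i];               // sign-extend i16 -> i32
    Out[i] = static_cast<float>(Wide);  // SINT_TO_FP at the wider type
  }
}

// Unsigned case: ISD::ZERO_EXTEND each element to 2x width, then convert.
static void uintToFP(const uint16_t In[4], float Out[4]) {
  for (int i = 0; i != 4; ++i) {
    uint32_t Wide = In[i];              // zero-extend i16 -> i32
    Out[i] = static_cast<float>(Wide);  // UINT_TO_FP at the wider type
  }
}

int main() {
  const int16_t S[4] = {-1, 2, -3, 4};
  const uint16_t U[4] = {0xFFFF, 2, 3, 4};
  float FS[4], FU[4];
  sintToFP(S, FS);
  uintToFP(U, FU);
  std::printf("%.0f %.0f\n", FS[0], FU[0]);  // prints: -1 65535
  return 0;
}

The same bit pattern 0xFFFF must become -1.0 when signed and 65535.0 when unsigned, which is why the opcode selects between ZERO_EXTEND and SIGN_EXTEND rather than reusing the generic bitcasting Promote path.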
@@ -129,6 +132,14 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_BinOp(SDNode *N) { LHS.getValueType(), LHS, RHS); } +SDValue DAGTypeLegalizer::ScalarizeVecRes_TernaryOp(SDNode *N) { + SDValue Op0 = GetScalarizedVector(N->getOperand(0)); + SDValue Op1 = GetScalarizedVector(N->getOperand(1)); + SDValue Op2 = GetScalarizedVector(N->getOperand(2)); + return DAG.getNode(N->getOpcode(), N->getDebugLoc(), + Op0.getValueType(), Op0, Op1, Op2); +} + SDValue DAGTypeLegalizer::ScalarizeVecRes_MERGE_VALUES(SDNode *N, unsigned ResNo) { SDValue Op = DisintegrateMERGE_VALUES(N, ResNo); @@ -141,6 +152,16 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_BITCAST(SDNode *N) { NewVT, N->getOperand(0)); } +SDValue DAGTypeLegalizer::ScalarizeVecRes_BUILD_VECTOR(SDNode *N) { + EVT EltVT = N->getValueType(0).getVectorElementType(); + SDValue InOp = N->getOperand(0); + // The BUILD_VECTOR operands may be of wider element types and + // we may need to truncate them back to the requested return type. + if (EltVT.isInteger()) + return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), EltVT, InOp); + return InOp; +} + SDValue DAGTypeLegalizer::ScalarizeVecRes_CONVERT_RNDSAT(SDNode *N) { EVT NewVT = N->getValueType(0).getVectorElementType(); SDValue Op0 = GetScalarizedVector(N->getOperand(0)); @@ -436,7 +457,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { N->dump(&DAG); dbgs() << "\n"); SDValue Lo, Hi; - + // See if the target wants to custom expand this node. if (CustomLowerNode(N, N->getValueType(ResNo), true)) return; @@ -448,7 +469,8 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { N->dump(&DAG); dbgs() << "\n"; #endif - llvm_unreachable("Do not know how to split the result of this operator!"); + report_fatal_error("Do not know how to split the result of this " + "operator!\n"); case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break; case ISD::VSELECT: @@ -529,6 +551,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::FREM: SplitVecRes_BinOp(N, Lo, Hi); break; + case ISD::FMA: + SplitVecRes_TernaryOp(N, Lo, Hi); + break; } // If Lo/Hi is null, the sub-method took care of registering results etc. @@ -548,6 +573,22 @@ void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo, Hi = DAG.getNode(N->getOpcode(), dl, LHSHi.getValueType(), LHSHi, RHSHi); } +void DAGTypeLegalizer::SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, + SDValue &Hi) { + SDValue Op0Lo, Op0Hi; + GetSplitVector(N->getOperand(0), Op0Lo, Op0Hi); + SDValue Op1Lo, Op1Hi; + GetSplitVector(N->getOperand(1), Op1Lo, Op1Hi); + SDValue Op2Lo, Op2Hi; + GetSplitVector(N->getOperand(2), Op2Lo, Op2Hi); + DebugLoc dl = N->getDebugLoc(); + + Lo = DAG.getNode(N->getOpcode(), dl, Op0Lo.getValueType(), + Op0Lo, Op1Lo, Op2Lo); + Hi = DAG.getNode(N->getOpcode(), dl, Op0Hi.getValueType(), + Op0Hi, Op1Hi, Op2Hi); +} + void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) { // We know the result is a vector. 
The input may be either a vector or a @@ -977,7 +1018,9 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { N->dump(&DAG); dbgs() << "\n"; #endif - llvm_unreachable("Do not know how to split this operator's operand!"); + report_fatal_error("Do not know how to split this operator's " + "operand!\n"); + case ISD::SETCC: Res = SplitVecOp_VSETCC(N); break; case ISD::BITCAST: Res = SplitVecOp_BITCAST(N); break; case ISD::EXTRACT_SUBVECTOR: Res = SplitVecOp_EXTRACT_SUBVECTOR(N); break; @@ -1203,15 +1246,15 @@ SDValue DAGTypeLegalizer::SplitVecOp_FP_ROUND(SDNode *N) { DebugLoc DL = N->getDebugLoc(); GetSplitVector(N->getOperand(0), Lo, Hi); EVT InVT = Lo.getValueType(); - + EVT OutVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(), InVT.getVectorNumElements()); - + Lo = DAG.getNode(ISD::FP_ROUND, DL, OutVT, Lo, N->getOperand(1)); Hi = DAG.getNode(ISD::FP_ROUND, DL, OutVT, Hi, N->getOperand(1)); - + return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); -} +} @@ -1755,8 +1798,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) { if (InputWidened) InOp = GetWidenedVector(InOp); for (unsigned j=0; j < NumInElts; ++j) - Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp, - DAG.getIntPtrConstant(j)); + Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp, + DAG.getIntPtrConstant(j)); } SDValue UndefVal = DAG.getUNDEF(EltVT); for (; Idx < WidenNumElts; ++Idx) @@ -1816,7 +1859,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) { InOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InWidenVT, InOp, DAG.getIntPtrConstant(0)); return DAG.getConvertRndSat(WidenVT, dl, InOp, DTyOp, STyOp, RndOp, - SatOp, CvtCode); + SatOp, CvtCode); } } @@ -1832,7 +1875,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) { SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp, DAG.getIntPtrConstant(i)); Ops[i] = DAG.getConvertRndSat(WidenVT, dl, ExtVal, DTyOp, STyOp, RndOp, - SatOp, CvtCode); + SatOp, CvtCode); } SDValue UndefVal = DAG.getUNDEF(EltVT); @@ -1936,7 +1979,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) { Cond1 = GetWidenedVector(Cond1); if (Cond1.getValueType() != CondWidenVT) - Cond1 = ModifyToType(Cond1, CondWidenVT); + Cond1 = ModifyToType(Cond1, CondWidenVT); } SDValue InOp1 = GetWidenedVector(N->getOperand(1)); @@ -2202,7 +2245,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) { SDValue CC = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, WideSETCC, DAG.getIntPtrConstant(0)); - return PromoteTargetBoolean(CC, N->getValueType(0)); + return PromoteTargetBoolean(CC, N->getValueType(0)); } @@ -2371,10 +2414,8 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVector<SDValue, 16> &LdChain, NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff); NewVTWidth = NewVT.getSizeInBits(); L = DAG.getLoad(NewVT, dl, Chain, BasePtr, - LD->getPointerInfo().getWithOffset(Offset), - isVolatile, - isNonTemporal, isInvariant, - MinAlign(Align, Increment)); + LD->getPointerInfo().getWithOffset(Offset), isVolatile, + isNonTemporal, isInvariant, MinAlign(Align, Increment)); LdChain.push_back(L.getValue(1)); if (L->getValueType(0).isVector()) { SmallVector<SDValue, 16> Loads; @@ -2563,7 +2604,7 @@ void DAGTypeLegalizer::GenWidenVectorStores(SmallVector<SDValue, 16>& StChain, Offset += Increment; BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, DAG.getIntPtrConstant(Increment)); - } while (StWidth != 0 && StWidth >= NewVTWidth); + } while 
(StWidth != 0 && StWidth >= NewVTWidth); // Restore index back to be relative to the original widen element type Idx = Idx * NewVTWidth / ValEltWidth; } diff --git a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp index ff0136e..c3794d5 100644 --- a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp +++ b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp @@ -50,7 +50,7 @@ ResourcePriorityQueue::ResourcePriorityQueue(SelectionDAGISel *IS) : const TargetMachine &tm = (*IS->MF).getTarget(); ResourcesModel = tm.getInstrInfo()->CreateTargetScheduleState(&tm,NULL); - // This hard requirment could be relaxed, but for now + // This hard requirement could be relaxed, but for now // do not let it procede. assert (ResourcesModel && "Unimplemented CreateTargetScheduleState."); @@ -318,7 +318,7 @@ void ResourcePriorityQueue::reserveResources(SUnit *SU) { // If packet is now full, reset the state so in the next cycle // we start fresh. - if (Packet.size() >= InstrItins->IssueWidth) { + if (Packet.size() >= InstrItins->SchedModel->IssueWidth) { ResourcesModel->clearResources(); Packet.clear(); } @@ -353,7 +353,7 @@ signed ResourcePriorityQueue::rawRegPressureDelta(SUnit *SU, unsigned RCId) { } /// Estimates change in reg pressure from this SU. -/// It is acheived by trivial tracking of defined +/// It is achieved by trivial tracking of defined /// and used vregs in dependent instructions. /// The RawPressure flag makes this function to ignore /// existing reg file sizes, and report raw def/use diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp index 24da432..b7ce48a 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp @@ -441,19 +441,14 @@ static bool CheckForLiveRegDef(SUnit *SU, unsigned Reg, SmallVector<unsigned, 4> &LRegs, const TargetRegisterInfo *TRI) { bool Added = false; - if (LiveRegDefs[Reg] && LiveRegDefs[Reg] != SU) { - if (RegAdded.insert(Reg)) { - LRegs.push_back(Reg); - Added = true; - } - } - for (const uint16_t *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) - if (LiveRegDefs[*Alias] && LiveRegDefs[*Alias] != SU) { - if (RegAdded.insert(*Alias)) { - LRegs.push_back(*Alias); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) { + if (LiveRegDefs[*AI] && LiveRegDefs[*AI] != SU) { + if (RegAdded.insert(*AI)) { + LRegs.push_back(*AI); Added = true; } } + } return Added; } diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 2cb5d37..bf0a437 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -266,7 +266,8 @@ static void GetCostForDef(const ScheduleDAGSDNodes::RegDefIter &RegDefPos, const TargetLowering *TLI, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI, - unsigned &RegClass, unsigned &Cost) { + unsigned &RegClass, unsigned &Cost, + const MachineFunction &MF) { EVT VT = RegDefPos.GetValue(); // Special handling for untyped values. 
These values can only come from @@ -285,7 +286,7 @@ static void GetCostForDef(const ScheduleDAGSDNodes::RegDefIter &RegDefPos, unsigned Idx = RegDefPos.GetIdx(); const MCInstrDesc Desc = TII->get(Opcode); - const TargetRegisterClass *RC = TII->getRegClass(Desc, Idx, TRI); + const TargetRegisterClass *RC = TII->getRegClass(Desc, Idx, TRI, MF); RegClass = RC->getID(); // FIXME: Cost arbitrarily set to 1 because there doesn't seem to be a // better way to determine it. @@ -852,7 +853,7 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) { } /// After backtracking, the hazard checker needs to be restored to a state -/// corresponding the the current cycle. +/// corresponding the current cycle. void ScheduleDAGRRList::RestoreHazardCheckerBottomUp() { HazardRec->Reset(); @@ -1181,7 +1182,7 @@ static void CheckForLiveRegDef(SUnit *SU, unsigned Reg, SmallSet<unsigned, 4> &RegAdded, SmallVector<unsigned, 4> &LRegs, const TargetRegisterInfo *TRI) { - for (const uint16_t *AliasI = TRI->getOverlaps(Reg); *AliasI; ++AliasI) { + for (MCRegAliasIterator AliasI(Reg, TRI, true); AliasI.isValid(); ++AliasI) { // Check if Ref is live. if (!LiveRegDefs[*AliasI]) continue; @@ -1920,7 +1921,7 @@ bool RegReductionPQBase::HighRegPressure(const SUnit *SU) const { for (ScheduleDAGSDNodes::RegDefIter RegDefPos(PredSU, scheduleDAG); RegDefPos.IsValid(); RegDefPos.Advance()) { unsigned RCId, Cost; - GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost); + GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost, MF); if ((RegPressure[RCId] + Cost) >= RegLimit[RCId]) return true; @@ -2034,7 +2035,7 @@ void RegReductionPQBase::scheduledNode(SUnit *SU) { continue; unsigned RCId, Cost; - GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost); + GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost, MF); RegPressure[RCId] += Cost; break; } @@ -2049,7 +2050,7 @@ void RegReductionPQBase::scheduledNode(SUnit *SU) { if (SkipRegDefs > 0) continue; unsigned RCId, Cost; - GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost); + GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost, MF); if (RegPressure[RCId] < Cost) { // Register pressure tracking is imprecise. This can happen. But we try // hard not to let it happen because it likely results in poor scheduling. @@ -2330,22 +2331,21 @@ static int BUCompareLatency(SUnit *left, SUnit *right, bool checkPref, // and latency. if (!checkPref || (left->SchedulingPref == Sched::ILP || right->SchedulingPref == Sched::ILP)) { - if (DisableSchedCycles) { + // If neither instruction stalls (!LStall && !RStall) and HazardRecognizer + // is enabled, grouping instructions by cycle, then its height is already + // covered so only its depth matters. We also reach this point if both stall + // but have the same height. + if (!SPQ->getHazardRec()->isEnabled()) { if (LHeight != RHeight) return LHeight > RHeight ? 1 : -1; } - else { - // If neither instruction stalls (!LStall && !RStall) then - // its height is already covered so only its depth matters. We also reach - // this if both stall but have the same height. - int LDepth = left->getDepth() - LPenalty; - int RDepth = right->getDepth() - RPenalty; - if (LDepth != RDepth) { - DEBUG(dbgs() << " Comparing latency of SU (" << left->NodeNum - << ") depth " << LDepth << " vs SU (" << right->NodeNum - << ") depth " << RDepth << "\n"); - return LDepth < RDepth ? 
1 : -1; - } + int LDepth = left->getDepth() - LPenalty; + int RDepth = right->getDepth() - RPenalty; + if (LDepth != RDepth) { + DEBUG(dbgs() << " Comparing latency of SU (" << left->NodeNum + << ") depth " << LDepth << " vs SU (" << right->NodeNum + << ") depth " << RDepth << "\n"); + return LDepth < RDepth ? 1 : -1; } if (left->Latency != right->Latency) return left->Latency > right->Latency ? 1 : -1; @@ -2363,7 +2363,7 @@ static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) { bool RHasPhysReg = right->hasPhysRegDefs; if (LHasPhysReg != RHasPhysReg) { #ifndef NDEBUG - const char *PhysRegMsg[] = {" has no physreg", " defines a physreg"}; + const char *const PhysRegMsg[] = {" has no physreg"," defines a physreg"}; #endif DEBUG(dbgs() << " SU (" << left->NodeNum << ") " << PhysRegMsg[LHasPhysReg] << " SU(" << right->NodeNum << ") " diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 69dd813..748668c 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -131,28 +131,16 @@ static void CheckForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, } } -static void AddGlue(SDNode *N, SDValue Glue, bool AddGlue, SelectionDAG *DAG) { - SmallVector<EVT, 4> VTs; - SDNode *GlueDestNode = Glue.getNode(); - - // Don't add glue from a node to itself. - if (GlueDestNode == N) return; - - // Don't add glue to something which already has glue. - if (N->getValueType(N->getNumValues() - 1) == MVT::Glue) return; - - for (unsigned I = 0, E = N->getNumValues(); I != E; ++I) - VTs.push_back(N->getValueType(I)); - - if (AddGlue) - VTs.push_back(MVT::Glue); - +// Helper for AddGlue to clone node operands. +static void CloneNodeWithValues(SDNode *N, SelectionDAG *DAG, + SmallVectorImpl<EVT> &VTs, + SDValue ExtraOper = SDValue()) { SmallVector<SDValue, 4> Ops; for (unsigned I = 0, E = N->getNumOperands(); I != E; ++I) Ops.push_back(N->getOperand(I)); - if (GlueDestNode) - Ops.push_back(Glue); + if (ExtraOper.getNode()) + Ops.push_back(ExtraOper); SDVTList VTList = DAG->getVTList(&VTs[0], VTs.size()); MachineSDNode::mmo_iterator Begin = 0, End = 0; @@ -171,6 +159,46 @@ static void AddGlue(SDNode *N, SDValue Glue, bool AddGlue, SelectionDAG *DAG) { MN->setMemRefs(Begin, End); } +static bool AddGlue(SDNode *N, SDValue Glue, bool AddGlue, SelectionDAG *DAG) { + SmallVector<EVT, 4> VTs; + SDNode *GlueDestNode = Glue.getNode(); + + // Don't add glue from a node to itself. + if (GlueDestNode == N) return false; + + // Don't add a glue operand to something that already uses glue. + if (GlueDestNode && + N->getOperand(N->getNumOperands()-1).getValueType() == MVT::Glue) { + return false; + } + // Don't add glue to something that already has a glue value. + if (N->getValueType(N->getNumValues() - 1) == MVT::Glue) return false; + + for (unsigned I = 0, E = N->getNumValues(); I != E; ++I) + VTs.push_back(N->getValueType(I)); + + if (AddGlue) + VTs.push_back(MVT::Glue); + + CloneNodeWithValues(N, DAG, VTs, Glue); + + return true; +} + +// Cleanup after unsuccessful AddGlue. Use the standard method of morphing the +// node even though simply shrinking the value list is sufficient. 
+static void RemoveUnusedGlue(SDNode *N, SelectionDAG *DAG) { + assert((N->getValueType(N->getNumValues() - 1) == MVT::Glue && + !N->hasAnyUseOfValue(N->getNumValues() - 1)) && + "expected an unused glue value"); + + SmallVector<EVT, 4> VTs; + for (unsigned I = 0, E = N->getNumValues()-1; I != E; ++I) + VTs.push_back(N->getValueType(I)); + + CloneNodeWithValues(N, DAG, VTs); +} + /// ClusterNeighboringLoads - Force nearby loads together by "gluing" them. /// This function finds loads of the same base and different offsets. If the /// offsets are not far apart (target specific), it add MVT::Glue inputs and @@ -238,19 +266,23 @@ void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) { // Cluster loads by adding MVT::Glue outputs and inputs. This also // ensure they are scheduled in order of increasing addresses. SDNode *Lead = Loads[0]; - AddGlue(Lead, SDValue(0, 0), true, DAG); - - SDValue InGlue = SDValue(Lead, Lead->getNumValues() - 1); + SDValue InGlue = SDValue(0, 0); + if (AddGlue(Lead, InGlue, true, DAG)) + InGlue = SDValue(Lead, Lead->getNumValues() - 1); for (unsigned I = 1, E = Loads.size(); I != E; ++I) { bool OutGlue = I < E - 1; SDNode *Load = Loads[I]; - AddGlue(Load, InGlue, OutGlue, DAG); + // If AddGlue fails, we could leave an unused glue value. This should not + // cause any problems. + if (AddGlue(Load, InGlue, OutGlue, DAG)) { + if (OutGlue) + InGlue = SDValue(Load, Load->getNumValues() - 1); - if (OutGlue) - InGlue = SDValue(Load, Load->getNumValues() - 1); - - ++LoadsClustered; + ++LoadsClustered; + } + else if (!OutGlue && InGlue.getNode()) + RemoveUnusedGlue(InGlue.getNode(), DAG); } } diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h index 75940ec..5384576 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h @@ -98,12 +98,6 @@ namespace llvm { /// virtual void computeLatency(SUnit *SU); - /// computeOperandLatency - Override dependence edge latency using - /// operand use/def information - /// - virtual void computeOperandLatency(SUnit *Def, SUnit *Use, - SDep& dep) const { } - virtual void computeOperandLatency(SDNode *Def, SDNode *Use, unsigned OpIdx, SDep& dep) const; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 92671d1..b971b69 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -14,16 +14,16 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "SDNodeOrdering.h" #include "SDNodeDbgValue.h" +#include "llvm/CallingConv.h" #include "llvm/Constants.h" -#include "llvm/Analysis/DebugInfo.h" -#include "llvm/Analysis/ValueTracking.h" +#include "llvm/DebugInfo.h" +#include "llvm/DerivedTypes.h" #include "llvm/Function.h" #include "llvm/GlobalAlias.h" #include "llvm/GlobalVariable.h" #include "llvm/Intrinsics.h" -#include "llvm/DerivedTypes.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Assembly/Writer.h" -#include "llvm/CallingConv.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -71,7 +71,9 @@ static const fltSemantics *EVTToAPFloatSemantics(EVT VT) { } } -SelectionDAG::DAGUpdateListener::~DAGUpdateListener() {} +// Default null implementations of the callbacks. 
+void SelectionDAG::DAGUpdateListener::NodeDeleted(SDNode*, SDNode*) {} +void SelectionDAG::DAGUpdateListener::NodeUpdated(SDNode*) {} //===----------------------------------------------------------------------===// // ConstantFPSDNode Class @@ -217,6 +219,22 @@ bool ISD::isScalarToVector(const SDNode *N) { return true; } +/// allOperandsUndef - Return true if the node has at least one operand +/// and all operands of the specified node are ISD::UNDEF. +bool ISD::allOperandsUndef(const SDNode *N) { + // Return false if the node has no operands. + // This is "logically inconsistent" with the definition of "all" but + // is probably the desired behavior. + if (N->getNumOperands() == 0) + return false; + + for (unsigned i = 0, e = N->getNumOperands(); i != e ; ++i) + if (N->getOperand(i).getOpcode() != ISD::UNDEF) + return false; + + return true; +} + /// getSetCCSwappedOperands - Return the operation corresponding to (Y op X) /// when given the operation for (X op Y). ISD::CondCode ISD::getSetCCSwappedOperands(ISD::CondCode Operation) { @@ -544,16 +562,15 @@ void SelectionDAG::RemoveDeadNodes() { /// RemoveDeadNodes - This method deletes the unreachable nodes in the /// given list, and any nodes that become unreachable as a result. -void SelectionDAG::RemoveDeadNodes(SmallVectorImpl<SDNode *> &DeadNodes, - DAGUpdateListener *UpdateListener) { +void SelectionDAG::RemoveDeadNodes(SmallVectorImpl<SDNode *> &DeadNodes) { // Process the worklist, deleting the nodes and adding their uses to the // worklist. while (!DeadNodes.empty()) { SDNode *N = DeadNodes.pop_back_val(); - if (UpdateListener) - UpdateListener->NodeDeleted(N, 0); + for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next) + DUL->NodeDeleted(N, 0); // Take the node out of the appropriate CSE map. RemoveNodeFromCSEMaps(N); @@ -574,7 +591,7 @@ void SelectionDAG::RemoveDeadNodes(SmallVectorImpl<SDNode *> &DeadNodes, } } -void SelectionDAG::RemoveDeadNode(SDNode *N, DAGUpdateListener *UpdateListener){ +void SelectionDAG::RemoveDeadNode(SDNode *N){ SmallVector<SDNode*, 16> DeadNodes(1, N); // Create a dummy node that adds a reference to the root node, preventing @@ -582,7 +599,7 @@ void SelectionDAG::RemoveDeadNode(SDNode *N, DAGUpdateListener *UpdateListener){ // dead node.) HandleSDNode Dummy(getRoot()); - RemoveDeadNodes(DeadNodes, UpdateListener); + RemoveDeadNodes(DeadNodes); } void SelectionDAG::DeleteNode(SDNode *N) { @@ -684,8 +701,7 @@ bool SelectionDAG::RemoveNodeFromCSEMaps(SDNode *N) { /// node. This transfer can potentially trigger recursive merging. /// void -SelectionDAG::AddModifiedNodeToCSEMaps(SDNode *N, - DAGUpdateListener *UpdateListener) { +SelectionDAG::AddModifiedNodeToCSEMaps(SDNode *N) { // For node types that aren't CSE'd, just act as if no identical node // already exists. if (!doNotCSE(N)) { @@ -694,20 +710,19 @@ SelectionDAG::AddModifiedNodeToCSEMaps(SDNode *N, // If there was already an existing matching node, use ReplaceAllUsesWith // to replace the dead one with the existing one. This can cause // recursive merging of other unrelated nodes down the line. - ReplaceAllUsesWith(N, Existing, UpdateListener); + ReplaceAllUsesWith(N, Existing); - // N is now dead. Inform the listener if it exists and delete it. - if (UpdateListener) - UpdateListener->NodeDeleted(N, Existing); + // N is now dead. Inform the listeners and delete it. 
+ for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next) + DUL->NodeDeleted(N, Existing); DeleteNodeNotInCSEMaps(N); return; } } - // If the node doesn't already exist, we updated it. Inform a listener if - // it exists. - if (UpdateListener) - UpdateListener->NodeUpdated(N); + // If the node doesn't already exist, we updated it. Inform listeners. + for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next) + DUL->NodeUpdated(N); } /// FindModifiedNodeSlot - Find a slot for the specified node if its operands @@ -855,7 +870,7 @@ unsigned SelectionDAG::getEVTAlignment(EVT VT) const { SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL) : TM(tm), TLI(*tm.getTargetLowering()), TSI(*tm.getSelectionDAGInfo()), OptLevel(OL), EntryNode(ISD::EntryToken, DebugLoc(), getVTList(MVT::Other)), - Root(getEntryNode()), Ordering(0) { + Root(getEntryNode()), Ordering(0), UpdateListeners(0) { AllNodes.push_back(&EntryNode); Ordering = new SDNodeOrdering(); DbgInfo = new SDDbgInfo(); @@ -867,6 +882,7 @@ void SelectionDAG::init(MachineFunction &mf) { } SelectionDAG::~SelectionDAG() { + assert(!UpdateListeners && "Dangling registered DAGUpdateListeners"); allnodes_clear(); delete Ordering; delete DbgInfo; @@ -1949,6 +1965,7 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero, APInt InMask = APInt::getLowBitsSet(BitWidth, VT.getSizeInBits()); ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); KnownZero |= (~InMask); + KnownOne &= (~KnownZero); return; } case ISD::FGETSIGN: @@ -2246,8 +2263,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{ } // Handle LOADX separately here. EXTLOAD case will fallthrough. - if (Op.getOpcode() == ISD::LOAD) { - LoadSDNode *LD = cast<LoadSDNode>(Op); + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op)) { unsigned ExtType = LD->getExtensionType(); switch (ExtType) { default: break; @@ -2675,6 +2691,11 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT, if (N1 == N2) return N1; break; case ISD::CONCAT_VECTORS: + // Concat of UNDEFs is UNDEF. + if (N1.getOpcode() == ISD::UNDEF && + N2.getOpcode() == ISD::UNDEF) + return getUNDEF(VT); + // A CONCAT_VECTOR with all operands BUILD_VECTOR can be simplified to // one big BUILD_VECTOR. 
if (N1.getOpcode() == ISD::BUILD_VECTOR && @@ -3708,8 +3729,8 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, DebugLoc dl, SDValue Dst, Entry.Node = Src; Args.push_back(Entry); Entry.Node = Size; Args.push_back(Entry); // FIXME: pass in DebugLoc - std::pair<SDValue,SDValue> CallResult = - TLI.LowerCallTo(Chain, Type::getVoidTy(*getContext()), + TargetLowering:: + CallLoweringInfo CLI(Chain, Type::getVoidTy(*getContext()), false, false, false, false, 0, TLI.getLibcallCallingConv(RTLIB::MEMCPY), /*isTailCall=*/false, @@ -3717,6 +3738,8 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, DebugLoc dl, SDValue Dst, getExternalSymbol(TLI.getLibcallName(RTLIB::MEMCPY), TLI.getPointerTy()), Args, *this, dl); + std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI); + return CallResult.second; } @@ -3761,8 +3784,8 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, DebugLoc dl, SDValue Dst, Entry.Node = Src; Args.push_back(Entry); Entry.Node = Size; Args.push_back(Entry); // FIXME: pass in DebugLoc - std::pair<SDValue,SDValue> CallResult = - TLI.LowerCallTo(Chain, Type::getVoidTy(*getContext()), + TargetLowering:: + CallLoweringInfo CLI(Chain, Type::getVoidTy(*getContext()), false, false, false, false, 0, TLI.getLibcallCallingConv(RTLIB::MEMMOVE), /*isTailCall=*/false, @@ -3770,6 +3793,8 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, DebugLoc dl, SDValue Dst, getExternalSymbol(TLI.getLibcallName(RTLIB::MEMMOVE), TLI.getPointerTy()), Args, *this, dl); + std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI); + return CallResult.second; } @@ -3822,8 +3847,8 @@ SDValue SelectionDAG::getMemset(SDValue Chain, DebugLoc dl, SDValue Dst, Entry.isSExt = false; Args.push_back(Entry); // FIXME: pass in DebugLoc - std::pair<SDValue,SDValue> CallResult = - TLI.LowerCallTo(Chain, Type::getVoidTy(*getContext()), + TargetLowering:: + CallLoweringInfo CLI(Chain, Type::getVoidTy(*getContext()), false, false, false, false, 0, TLI.getLibcallCallingConv(RTLIB::MEMSET), /*isTailCall=*/false, @@ -3831,6 +3856,8 @@ SDValue SelectionDAG::getMemset(SDValue Chain, DebugLoc dl, SDValue Dst, getExternalSymbol(TLI.getLibcallName(RTLIB::MEMSET), TLI.getPointerTy()), Args, *this, dl); + std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI); + return CallResult.second; } @@ -4654,13 +4681,7 @@ SDVTList SelectionDAG::getVTList(const EVT *VTs, unsigned NumVTs) { if (I->NumVTs != NumVTs || VTs[0] != I->VTs[0] || VTs[1] != I->VTs[1]) continue; - bool NoMatch = false; - for (unsigned i = 2; i != NumVTs; ++i) - if (VTs[i] != I->VTs[i]) { - NoMatch = true; - break; - } - if (!NoMatch) + if (std::equal(&VTs[2], &VTs[NumVTs], &I->VTs[2])) return *I; } @@ -5237,11 +5258,7 @@ namespace { /// pointed to by a use iterator is deleted, increment the use iterator /// so that it doesn't dangle. /// -/// This class also manages a "downlink" DAGUpdateListener, to forward -/// messages to ReplaceAllUsesWith's callers. -/// class RAUWUpdateListener : public SelectionDAG::DAGUpdateListener { - SelectionDAG::DAGUpdateListener *DownLink; SDNode::use_iterator &UI; SDNode::use_iterator &UE; @@ -5249,21 +5266,13 @@ class RAUWUpdateListener : public SelectionDAG::DAGUpdateListener { // Increment the iterator as needed. while (UI != UE && N == *UI) ++UI; - - // Then forward the message. - if (DownLink) DownLink->NodeDeleted(N, E); - } - - virtual void NodeUpdated(SDNode *N) { - // Just forward the message. 
- if (DownLink) DownLink->NodeUpdated(N); } public: - RAUWUpdateListener(SelectionDAG::DAGUpdateListener *dl, + RAUWUpdateListener(SelectionDAG &d, SDNode::use_iterator &ui, SDNode::use_iterator &ue) - : DownLink(dl), UI(ui), UE(ue) {} + : SelectionDAG::DAGUpdateListener(d), UI(ui), UE(ue) {} }; } @@ -5273,8 +5282,7 @@ public: /// /// This version assumes From has a single result value. /// -void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To, - DAGUpdateListener *UpdateListener) { +void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To) { SDNode *From = FromN.getNode(); assert(From->getNumValues() == 1 && FromN.getResNo() == 0 && "Cannot replace with this method!"); @@ -5288,7 +5296,7 @@ void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To, // is replaced by To, we don't want to replace of all its users with To // too. See PR3018 for more info. SDNode::use_iterator UI = From->use_begin(), UE = From->use_end(); - RAUWUpdateListener Listener(UpdateListener, UI, UE); + RAUWUpdateListener Listener(*this, UI, UE); while (UI != UE) { SDNode *User = *UI; @@ -5307,7 +5315,7 @@ void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To, // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. - AddModifiedNodeToCSEMaps(User, &Listener); + AddModifiedNodeToCSEMaps(User); } // If we just RAUW'd the root, take note. @@ -5321,8 +5329,7 @@ void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To, /// This version assumes that for each value of From, there is a /// corresponding value in To in the same position with the same type. /// -void SelectionDAG::ReplaceAllUsesWith(SDNode *From, SDNode *To, - DAGUpdateListener *UpdateListener) { +void SelectionDAG::ReplaceAllUsesWith(SDNode *From, SDNode *To) { #ifndef NDEBUG for (unsigned i = 0, e = From->getNumValues(); i != e; ++i) assert((!From->hasAnyUseOfValue(i) || @@ -5337,7 +5344,7 @@ void SelectionDAG::ReplaceAllUsesWith(SDNode *From, SDNode *To, // Iterate over just the existing users of From. See the comments in // the ReplaceAllUsesWith above. SDNode::use_iterator UI = From->use_begin(), UE = From->use_end(); - RAUWUpdateListener Listener(UpdateListener, UI, UE); + RAUWUpdateListener Listener(*this, UI, UE); while (UI != UE) { SDNode *User = *UI; @@ -5356,7 +5363,7 @@ void SelectionDAG::ReplaceAllUsesWith(SDNode *From, SDNode *To, // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. - AddModifiedNodeToCSEMaps(User, &Listener); + AddModifiedNodeToCSEMaps(User); } // If we just RAUW'd the root, take note. @@ -5369,16 +5376,14 @@ void SelectionDAG::ReplaceAllUsesWith(SDNode *From, SDNode *To, /// /// This version can replace From with any result values. To must match the /// number and types of values returned by From. -void SelectionDAG::ReplaceAllUsesWith(SDNode *From, - const SDValue *To, - DAGUpdateListener *UpdateListener) { +void SelectionDAG::ReplaceAllUsesWith(SDNode *From, const SDValue *To) { if (From->getNumValues() == 1) // Handle the simple case efficiently. - return ReplaceAllUsesWith(SDValue(From, 0), To[0], UpdateListener); + return ReplaceAllUsesWith(SDValue(From, 0), To[0]); // Iterate over just the existing users of From. See the comments in // the ReplaceAllUsesWith above. 
SDNode::use_iterator UI = From->use_begin(), UE = From->use_end(); - RAUWUpdateListener Listener(UpdateListener, UI, UE); + RAUWUpdateListener Listener(*this, UI, UE); while (UI != UE) { SDNode *User = *UI; @@ -5398,7 +5403,7 @@ void SelectionDAG::ReplaceAllUsesWith(SDNode *From, // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. - AddModifiedNodeToCSEMaps(User, &Listener); + AddModifiedNodeToCSEMaps(User); } // If we just RAUW'd the root, take note. @@ -5409,14 +5414,13 @@ void SelectionDAG::ReplaceAllUsesWith(SDNode *From, /// ReplaceAllUsesOfValueWith - Replace any uses of From with To, leaving /// uses of other values produced by From.getNode() alone. The Deleted /// vector is handled the same way as for ReplaceAllUsesWith. -void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To, - DAGUpdateListener *UpdateListener){ +void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To){ // Handle the really simple, really trivial case efficiently. if (From == To) return; // Handle the simple, trivial, case efficiently. if (From.getNode()->getNumValues() == 1) { - ReplaceAllUsesWith(From, To, UpdateListener); + ReplaceAllUsesWith(From, To); return; } @@ -5424,7 +5428,7 @@ void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To, // the ReplaceAllUsesWith above. SDNode::use_iterator UI = From.getNode()->use_begin(), UE = From.getNode()->use_end(); - RAUWUpdateListener Listener(UpdateListener, UI, UE); + RAUWUpdateListener Listener(*this, UI, UE); while (UI != UE) { SDNode *User = *UI; bool UserRemovedFromCSEMaps = false; @@ -5460,7 +5464,7 @@ void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To, // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. - AddModifiedNodeToCSEMaps(User, &Listener); + AddModifiedNodeToCSEMaps(User); } // If we just RAUW'd the root, take note. @@ -5489,11 +5493,10 @@ namespace { /// handled the same way as for ReplaceAllUsesWith. void SelectionDAG::ReplaceAllUsesOfValuesWith(const SDValue *From, const SDValue *To, - unsigned Num, - DAGUpdateListener *UpdateListener){ + unsigned Num){ // Handle the simple, trivial case efficiently. if (Num == 1) - return ReplaceAllUsesOfValueWith(*From, *To, UpdateListener); + return ReplaceAllUsesOfValueWith(*From, *To); // Read up all the uses and make records of them. This helps // processing new uses that are introduced during the @@ -5538,7 +5541,7 @@ void SelectionDAG::ReplaceAllUsesOfValuesWith(const SDValue *From, // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. - AddModifiedNodeToCSEMaps(User, UpdateListener); + AddModifiedNodeToCSEMaps(User); } } @@ -5579,7 +5582,7 @@ unsigned SelectionDAG::AssignTopologicalOrder() { } } - // Visit all the nodes. As we iterate, moves nodes into sorted order, + // Visit all the nodes. As we iterate, move nodes into sorted order, // such that by the time the end is reached all nodes will be sorted. 
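The ReplaceAllUsesWith hunks above stop threading a DAGUpdateListener pointer through every call; listeners such as RAUWUpdateListener now hand the SelectionDAG to the base-class constructor and stay registered for exactly as long as they live. A minimal sketch of that self-registering listener pattern, using simplified Graph and Node stand-ins rather than the real LLVM classes:

struct Node;

struct Graph {
  struct Listener {
    Graph &G;
    Listener *Next; // intrusive chain owned by the graph
    explicit Listener(Graph &g) : G(g), Next(g.Listeners) {
      g.Listeners = this; // register on construction
    }
    virtual ~Listener() { G.Listeners = Next; } // unregister (LIFO) on destruction
    virtual void NodeDeleted(Node *) {}
  };

  Listener *Listeners = nullptr;

  void notifyDeleted(Node *N) {
    // The graph walks the whole chain itself, so no call site has to
    // forward notifications to a "downlink" listener anymore.
    for (Listener *L = Listeners; L; L = L->Next)
      L->NodeDeleted(N);
  }
};

Because a listener is live for exactly its scope, the downlink-forwarding code deleted above becomes unnecessary: nesting two listeners simply pushes both onto the chain.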
for (allnodes_iterator I = allnodes_begin(),E = allnodes_end(); I != E; ++I) { SDNode *N = I; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 94cb958..8cbe818 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -21,6 +21,7 @@ #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Constants.h" #include "llvm/CallingConv.h" +#include "llvm/DebugInfo.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" #include "llvm/GlobalVariable.h" @@ -42,7 +43,6 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" @@ -51,6 +51,7 @@ #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/IntegersSubsetMapping.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -843,7 +844,7 @@ void SelectionDAGBuilder::clear() { } /// clearDanglingDebugInfo - Clear the dangling debug information -/// map. This function is seperated from the clear so that debug +/// map. This function is separated from the clear so that debug /// information that is dangling in a basic block can be properly /// resolved in a different basic block. This allows the /// SelectionDAG to resolve dangling debug information attached @@ -941,7 +942,7 @@ void SelectionDAGBuilder::visit(unsigned Opcode, const User &I) { default: llvm_unreachable("Unknown instruction type encountered!"); // Build the switch statement using the Instruction.def file. #define HANDLE_INST(NUM, OPCODE, CLASS) \ - case Instruction::OPCODE: visit##OPCODE((CLASS&)I); break; + case Instruction::OPCODE: visit##OPCODE((const CLASS&)I); break; #include "llvm/Instruction.def" } @@ -1578,17 +1579,18 @@ void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB, } else Cond = DAG.getSetCC(dl, MVT::i1, CondLHS, getValue(CB.CmpRHS), CB.CC); } else { - assert(CB.CC == ISD::SETLE && "Can handle only LE ranges now"); + assert(CB.CC == ISD::SETCC_INVALID && + "Condition is undefined for to-the-range belonging check."); const APInt& Low = cast<ConstantInt>(CB.CmpLHS)->getValue(); const APInt& High = cast<ConstantInt>(CB.CmpRHS)->getValue(); SDValue CmpOp = getValue(CB.CmpMHS); EVT VT = CmpOp.getValueType(); - - if (cast<ConstantInt>(CB.CmpLHS)->isMinValue(true)) { + + if (cast<ConstantInt>(CB.CmpLHS)->isMinValue(false)) { Cond = DAG.getSetCC(dl, MVT::i1, CmpOp, DAG.getConstant(High, VT), - ISD::SETLE); + ISD::SETULE); } else { SDValue SUB = DAG.getNode(ISD::SUB, dl, VT, CmpOp, DAG.getConstant(Low, VT)); @@ -1826,9 +1828,13 @@ void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) { MachineBasicBlock *LandingPad = FuncInfo.MBBMap[I.getSuccessor(1)]; const Value *Callee(I.getCalledValue()); + const Function *Fn = dyn_cast<Function>(Callee); if (isa<InlineAsm>(Callee)) visitInlineAsm(&I); - else + else if (Fn && Fn->isIntrinsic()) { + assert(Fn->getIntrinsicID() == Intrinsic::donothing); + // Ignore invokes to @llvm.donothing: jump directly to the next BB. 
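The visitSwitchCase change above (SETLE to SETULE, plus the SUB rebase when the low bound is not the minimum value) relies on a standard trick: membership in [Lo, Hi] can be tested with one unsigned comparison after subtracting Lo, because values below Lo wrap around to something larger than Hi - Lo. A self-contained sketch with hypothetical values:

#include <cassert>
#include <cstdint>

// x in [lo, hi]  <=>  (x - lo) <= (hi - lo) when compared as unsigned.
bool inRange(uint32_t X, uint32_t Lo, uint32_t Hi) {
  return X - Lo <= Hi - Lo; // one subtract, one unsigned compare
}

int main() {
  assert(inRange(5, 3, 9));
  assert(!inRange(2, 3, 9));  // 2 - 3 wraps to 0xFFFFFFFF, which is > 6
  assert(!inRange(10, 3, 9)); // 10 - 3 == 7, which is > 6
}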
+ } else LowerCallTo(&I, getValue(Callee), false, LandingPad); // If the value of the invoke is used outside of its defining block, make it @@ -1901,8 +1907,6 @@ bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR, const Value* SV, MachineBasicBlock *Default, MachineBasicBlock *SwitchBB) { - Case& BackCase = *(CR.Range.second-1); - // Size is the number of Cases represented by this range. size_t Size = CR.Range.second - CR.Range.first; if (Size > 3) @@ -1970,11 +1974,28 @@ bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR, } } + // Order cases by weight so the most likely case will be checked first. + BranchProbabilityInfo *BPI = FuncInfo.BPI; + if (BPI) { + for (CaseItr I = CR.Range.first, IE = CR.Range.second; I != IE; ++I) { + uint32_t IWeight = BPI->getEdgeWeight(SwitchBB->getBasicBlock(), + I->BB->getBasicBlock()); + for (CaseItr J = CR.Range.first; J < I; ++J) { + uint32_t JWeight = BPI->getEdgeWeight(SwitchBB->getBasicBlock(), + J->BB->getBasicBlock()); + if (IWeight > JWeight) + std::swap(*I, *J); + } + } + } // Rearrange the case blocks so that the last one falls through if possible. - if (NextBlock && Default != NextBlock && BackCase.BB != NextBlock) { + Case &BackCase = *(CR.Range.second-1); + if (Size > 1 && + NextBlock && Default != NextBlock && BackCase.BB != NextBlock) { // The last case block won't fall through into 'NextBlock' if we emit the // branches in this order. See if rearranging a case value would help. - for (CaseItr I = CR.Range.first, E = CR.Range.second-1; I != E; ++I) { + // We start at the bottom as it's the case with the least weight. + for (Case *I = &*(CR.Range.second-2), *E = &*CR.Range.first-1; I != E; --I){ if (I->BB == NextBlock) { std::swap(*I, BackCase); break; @@ -2006,7 +2027,7 @@ bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR, CC = ISD::SETEQ; LHS = SV; RHS = I->High; MHS = NULL; } else { - CC = ISD::SETLE; + CC = ISD::SETCC_INVALID; LHS = I->Low; MHS = SV; RHS = I->High; } @@ -2031,14 +2052,14 @@ bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR, } static inline bool areJTsAllowed(const TargetLowering &TLI) { - return !TLI.getTargetMachine().Options.DisableJumpTables && + return TLI.supportJumpTables() && (TLI.isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) || TLI.isOperationLegalOrCustom(ISD::BRIND, MVT::Other)); } static APInt ComputeRange(const APInt &First, const APInt &Last) { uint32_t BitWidth = std::max(Last.getBitWidth(), First.getBitWidth()) + 1; - APInt LastExt = Last.sext(BitWidth), FirstExt = First.sext(BitWidth); + APInt LastExt = Last.zext(BitWidth), FirstExt = First.zext(BitWidth); return (LastExt - FirstExt + 1ULL); } @@ -2104,7 +2125,7 @@ bool SelectionDAGBuilder::handleJTSwitchCase(CaseRec &CR, const APInt &Low = cast<ConstantInt>(I->Low)->getValue(); const APInt &High = cast<ConstantInt>(I->High)->getValue(); - if (Low.sle(TEI) && TEI.sle(High)) { + if (Low.ule(TEI) && TEI.ule(High)) { DestBBs.push_back(I->BB); if (TEI==High) ++I; @@ -2261,7 +2282,7 @@ bool SelectionDAGBuilder::handleBTSplitSwitchCase(CaseRec& CR, // Create a CaseBlock record representing a conditional branch to // the LHS node if the value being switched on SV is less than C. // Otherwise, branch to LHS. 
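The ordering loop added to handleSmallSwitchRange above walks the (at most three) cases and swaps heavier ones toward the front, so the most probable case is compared first at run time. A standalone sketch of the same heaviest-first ordering, assuming an illustrative CaseInfo in place of the real Case type:

#include <algorithm>
#include <cstdint>
#include <vector>

struct CaseInfo {
  int Value;       // illustrative case value
  uint32_t Weight; // edge weight from BranchProbabilityInfo
};

// Heaviest first, so the hottest case costs a single compare.
void orderByWeight(std::vector<CaseInfo> &Cases) {
  std::stable_sort(Cases.begin(), Cases.end(),
                   [](const CaseInfo &A, const CaseInfo &B) {
                     return A.Weight > B.Weight;
                   });
}

For the tiny ranges this path handles, the quadratic swap loop in the patch and a sort are interchangeable; the fall-through rearrangement that follows it then starts from the lightest case at the bottom.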
- CaseBlock CB(ISD::SETLT, SV, C, NULL, TrueBB, FalseBB, CR.CaseBB);
+ CaseBlock CB(ISD::SETULT, SV, C, NULL, TrueBB, FalseBB, CR.CaseBB);

 if (CR.CaseBB == SwitchBB)
 visitSwitchCase(CB, SwitchBB);
@@ -2333,7 +2354,7 @@ bool SelectionDAGBuilder::handleBitTestsSwitchCase(CaseRec& CR,
 // Optimize the case where all the case values fit in a
 // word without having to subtract minValue. In this case,
 // we can optimize away the subtraction.
- if (minValue.isNonNegative() && maxValue.slt(IntPtrBits)) {
+ if (maxValue.ult(IntPtrBits)) {
 cmpRange = maxValue;
 } else {
 lowBound = minValue;
@@ -2407,57 +2428,46 @@ bool SelectionDAGBuilder::handleBitTestsSwitchCase(CaseRec& CR,
 /// Clusterify - Transform simple list of Cases into list of CaseRange's
 size_t SelectionDAGBuilder::Clusterify(CaseVector& Cases, const SwitchInst& SI) {
- size_t numCmps = 0;
+
+ /// Use a shorter form of declaration, and also
+ /// show that we want to use CRSBuilder as Clusterifier.
+ typedef IntegersSubsetMapping<MachineBasicBlock> Clusterifier;
+
+ Clusterifier TheClusterifier;

- BranchProbabilityInfo *BPI = FuncInfo.BPI;
 // Start with "simple" cases
 for (SwitchInst::ConstCaseIt i = SI.case_begin(), e = SI.case_end();
 i != e; ++i) {
 const BasicBlock *SuccBB = i.getCaseSuccessor();
 MachineBasicBlock *SMBB = FuncInfo.MBBMap[SuccBB];
- uint32_t ExtraWeight = BPI ? BPI->getEdgeWeight(SI.getParent(), SuccBB) : 0;
-
- Cases.push_back(Case(i.getCaseValue(), i.getCaseValue(),
- SMBB, ExtraWeight));
- }
- std::sort(Cases.begin(), Cases.end(), CaseCmp());
-
- // Merge case into clusters
- if (Cases.size() >= 2)
- // Must recompute end() each iteration because it may be
- // invalidated by erase if we hold on to it
- for (CaseItr I = Cases.begin(), J = llvm::next(Cases.begin());
- J != Cases.end(); ) {
- const APInt& nextValue = cast<ConstantInt>(J->Low)->getValue();
- const APInt& currentValue = cast<ConstantInt>(I->High)->getValue();
- MachineBasicBlock* nextBB = J->BB;
- MachineBasicBlock* currentBB = I->BB;
-
- // If the two neighboring cases go to the same destination, merge them
- // into a single case.
- if ((nextValue - currentValue == 1) && (currentBB == nextBB)) {
- I->High = J->High;
- J = Cases.erase(J);
-
- if (BranchProbabilityInfo *BPI = FuncInfo.BPI) {
- uint32_t CurWeight = currentBB->getBasicBlock() ?
- BPI->getEdgeWeight(SI.getParent(), currentBB->getBasicBlock()) : 16;
- uint32_t NextWeight = nextBB->getBasicBlock() ?
- BPI->getEdgeWeight(SI.getParent(), nextBB->getBasicBlock()) : 16;
-
- BPI->setEdgeWeight(SI.getParent(), currentBB->getBasicBlock(),
- CurWeight + NextWeight);
- }
- } else {
- I = J++;
- }
+ TheClusterifier.add(i.getCaseValueEx(), SMBB);
+ }
+
+ TheClusterifier.optimize();
+
+ BranchProbabilityInfo *BPI = FuncInfo.BPI;
+ size_t numCmps = 0;
+ for (Clusterifier::RangeIterator i = TheClusterifier.begin(),
+ e = TheClusterifier.end(); i != e; ++i, ++numCmps) {
+ Clusterifier::Cluster &C = *i;
+ unsigned W = 0;
+ if (BPI) {
+ W = BPI->getEdgeWeight(SI.getParent(), C.second->getBasicBlock());
+ if (!W)
+ W = 16;
+ W *= C.first.Weight;
+ BPI->setEdgeWeight(SI.getParent(), C.second->getBasicBlock(), W);
 }
- for (CaseItr I=Cases.begin(), E=Cases.end(); I!=E; ++I, ++numCmps) {
- if (I->Low != I->High)
- // A range counts double, since it requires two compares.
- ++numCmps;
+ // FIXME: Currently works with ConstantInt based numbers.
+ // Changing it to APInt based is pretty heavy for this commit.
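Both the removed merge loop and the IntegersSubsetMapping that replaces it in Clusterify implement the same idea: cases that are adjacent in value and branch to the same block collapse into one range, so a single range test replaces a run of equality tests. A minimal sketch of that merge over a pre-sorted, disjoint case list, with an illustrative Cluster type:

#include <vector>

struct Cluster {
  long Low, High; // inclusive case-value range
  int DestBB;     // illustrative id of the destination block
};

// Input must be sorted by Low with disjoint ranges (a switch guarantees
// disjointness; sorting is done beforehand).
std::vector<Cluster> clusterify(const std::vector<Cluster> &Sorted) {
  std::vector<Cluster> Out;
  for (const Cluster &C : Sorted) {
    if (!Out.empty() && Out.back().DestBB == C.DestBB &&
        Out.back().High + 1 == C.Low)
      Out.back().High = C.High; // extend the previous cluster
    else
      Out.push_back(C);
  }
  return Out;
}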
+ Cases.push_back(Case(C.first.getLow().toConstantInt(), + C.first.getHigh().toConstantInt(), C.second, W)); + + if (C.first.getLow() != C.first.getHigh()) + // A range counts double, since it requires two compares. + ++numCmps; } return numCmps; @@ -2804,7 +2814,7 @@ void SelectionDAGBuilder::visitExtractElement(const User &I) { } // Utility for visitShuffleVector - Return true if every element in Mask, -// begining from position Pos and ending in Pos+Size, falls within the +// beginning from position Pos and ending in Pos+Size, falls within the // specified sequential range [L, L+Pos). or is undef. static bool isSequentialInRange(const SmallVectorImpl<int> &Mask, unsigned Pos, unsigned Size, int Low) { @@ -4914,6 +4924,11 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::pow: visitPow(I); return 0; + case Intrinsic::fabs: + setValue(&I, DAG.getNode(ISD::FABS, dl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)))); + return 0; case Intrinsic::fma: setValue(&I, DAG.getNode(ISD::FMA, dl, getValue(I.getArgOperand(0)).getValueType(), @@ -4921,6 +4936,29 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { getValue(I.getArgOperand(1)), getValue(I.getArgOperand(2)))); return 0; + case Intrinsic::fmuladd: { + EVT VT = TLI.getValueType(I.getType()); + if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict && + TLI.isOperationLegal(ISD::FMA, VT) && + TLI.isFMAFasterThanMulAndAdd(VT)){ + setValue(&I, DAG.getNode(ISD::FMA, dl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)), + getValue(I.getArgOperand(2)))); + } else { + SDValue Mul = DAG.getNode(ISD::FMUL, dl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1))); + SDValue Add = DAG.getNode(ISD::FADD, dl, + getValue(I.getArgOperand(0)).getValueType(), + Mul, + getValue(I.getArgOperand(2))); + setValue(&I, Add); + } + return 0; + } case Intrinsic::convert_to_fp16: setValue(&I, DAG.getNode(ISD::FP32_TO_FP16, dl, MVT::i16, getValue(I.getArgOperand(0)))); @@ -5050,7 +5088,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } case Intrinsic::gcroot: if (GFI) { - const Value *Alloca = I.getArgOperand(0); + const Value *Alloca = I.getArgOperand(0)->stripPointerCasts(); const Constant *TypeMap = cast<Constant>(I.getArgOperand(1)); FrameIndexSDNode *FI = cast<FrameIndexSDNode>(getValue(Alloca).getNode()); @@ -5077,16 +5115,21 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return 0; } TargetLowering::ArgListTy Args; - std::pair<SDValue, SDValue> Result = - TLI.LowerCallTo(getRoot(), I.getType(), + TargetLowering:: + CallLoweringInfo CLI(getRoot(), I.getType(), false, false, false, false, 0, CallingConv::C, /*isTailCall=*/false, /*doesNotRet=*/false, /*isReturnValueUsed=*/true, DAG.getExternalSymbol(TrapFuncName.data(), TLI.getPointerTy()), Args, DAG, getCurDebugLoc()); + std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI); DAG.setRoot(Result.second); return 0; } + case Intrinsic::debugtrap: { + DAG.setRoot(DAG.getNode(ISD::DEBUGTRAP, dl,MVT::Other, getRoot())); + return 0; + } case Intrinsic::uadd_with_overflow: case Intrinsic::sadd_with_overflow: case Intrinsic::usub_with_overflow: @@ -5139,6 +5182,9 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::lifetime_end: // Discard region information. 
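The new Intrinsic::fmuladd case above chooses between one fused ISD::FMA node and an explicit FMUL followed by FADD, based on the fp-contraction policy and whether the target's FMA beats the separate pair. A scalar sketch of the same decision; AllowContract and FmaIsFast are illustrative stand-ins for TM.Options.AllowFPOpFusion and isFMAFasterThanMulAndAdd:

#include <cmath>

double lowerFMulAdd(double A, double B, double C,
                    bool AllowContract, bool FmaIsFast) {
  if (AllowContract && FmaIsFast)
    return std::fma(A, B, C); // single rounding, like ISD::FMA
  return A * B + C;           // two roundings: FMUL, then FADD
}

The observable difference is rounding: the fused form rounds once, the fallback rounds after the multiply and again after the add, which is why contraction stays opt-in rather than being applied unconditionally.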
return 0; + case Intrinsic::donothing: + // ignore + return 0; } } @@ -5157,14 +5203,13 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, // Check whether the function can return without sret-demotion. SmallVector<ISD::OutputArg, 4> Outs; - SmallVector<uint64_t, 4> Offsets; GetReturnInfo(RetTy, CS.getAttributes().getRetAttributes(), - Outs, TLI, &Offsets); + Outs, TLI); bool CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(), - DAG.getMachineFunction(), - FTy->isVarArg(), Outs, - FTy->getContext()); + DAG.getMachineFunction(), + FTy->isVarArg(), Outs, + FTy->getContext()); SDValue DemoteStackSlot; int DemoteStackIdx = -100; @@ -5247,16 +5292,10 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, if (isTailCall && TM.Options.EnableFastISel) isTailCall = false; - std::pair<SDValue,SDValue> Result = - TLI.LowerCallTo(getRoot(), RetTy, - CS.paramHasAttr(0, Attribute::SExt), - CS.paramHasAttr(0, Attribute::ZExt), FTy->isVarArg(), - CS.paramHasAttr(0, Attribute::InReg), FTy->getNumParams(), - CS.getCallingConv(), - isTailCall, - CS.doesNotReturn(), - !CS.getInstruction()->use_empty(), - Callee, Args, DAG, getCurDebugLoc()); + TargetLowering:: + CallLoweringInfo CLI(getRoot(), RetTy, FTy, isTailCall, Callee, Args, DAG, + getCurDebugLoc(), CS); + std::pair<SDValue,SDValue> Result = TLI.LowerCallTo(CLI); assert((isTailCall || Result.second.getNode()) && "Non-null chain expected with non-tail call!"); assert((Result.second.getNode() || !Result.first.getNode()) && @@ -5272,7 +5311,13 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, ComputeValueVTs(TLI, PtrRetTy, PVTs); assert(PVTs.size() == 1 && "Pointers should fit in one register"); EVT PtrVT = PVTs[0]; - unsigned NumValues = Outs.size(); + + SmallVector<EVT, 4> RetTys; + SmallVector<uint64_t, 4> Offsets; + RetTy = FTy->getReturnType(); + ComputeValueVTs(TLI, RetTy, RetTys, &Offsets); + + unsigned NumValues = RetTys.size(); SmallVector<SDValue, 4> Values(NumValues); SmallVector<SDValue, 4> Chains(NumValues); @@ -5280,8 +5325,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, SDValue Add = DAG.getNode(ISD::ADD, getCurDebugLoc(), PtrVT, DemoteStackSlot, DAG.getConstant(Offsets[i], PtrVT)); - SDValue L = DAG.getLoad(Outs[i].VT, getCurDebugLoc(), Result.second, - Add, + SDValue L = DAG.getLoad(RetTys[i], getCurDebugLoc(), Result.second, Add, MachinePointerInfo::getFixedStack(DemoteStackIdx, Offsets[i]), false, false, false, 1); Values[i] = L; @@ -5292,30 +5336,10 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, MVT::Other, &Chains[0], NumValues); PendingLoads.push_back(Chain); - // Collect the legal value parts into potentially illegal values - // that correspond to the original function's return values. 
- SmallVector<EVT, 4> RetTys; - RetTy = FTy->getReturnType(); - ComputeValueVTs(TLI, RetTy, RetTys); - ISD::NodeType AssertOp = ISD::DELETED_NODE; - SmallVector<SDValue, 4> ReturnValues; - unsigned CurReg = 0; - for (unsigned I = 0, E = RetTys.size(); I != E; ++I) { - EVT VT = RetTys[I]; - EVT RegisterVT = TLI.getRegisterType(RetTy->getContext(), VT); - unsigned NumRegs = TLI.getNumRegisters(RetTy->getContext(), VT); - - SDValue ReturnValue = - getCopyFromParts(DAG, getCurDebugLoc(), &Values[CurReg], NumRegs, - RegisterVT, VT, AssertOp); - ReturnValues.push_back(ReturnValue); - CurReg += NumRegs; - } - setValue(CS.getInstruction(), DAG.getNode(ISD::MERGE_VALUES, getCurDebugLoc(), DAG.getVTList(&RetTys[0], RetTys.size()), - &ReturnValues[0], ReturnValues.size())); + &Values[0], Values.size())); } // Assign order to nodes here. If the call does not produce a result, it won't @@ -5952,11 +5976,11 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { SDISelAsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput]; if (OpInfo.ConstraintVT != Input.ConstraintVT) { - std::pair<unsigned, const TargetRegisterClass*> MatchRC = - TLI.getRegForInlineAsmConstraint(OpInfo.ConstraintCode, + std::pair<unsigned, const TargetRegisterClass*> MatchRC = + TLI.getRegForInlineAsmConstraint(OpInfo.ConstraintCode, OpInfo.ConstraintVT); - std::pair<unsigned, const TargetRegisterClass*> InputRC = - TLI.getRegForInlineAsmConstraint(Input.ConstraintCode, + std::pair<unsigned, const TargetRegisterClass*> InputRC = + TLI.getRegForInlineAsmConstraint(Input.ConstraintCode, Input.ConstraintVT); if ((OpInfo.ConstraintVT.isInteger() != Input.ConstraintVT.isInteger()) || @@ -6225,8 +6249,15 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { assert((OpInfo.ConstraintType == TargetLowering::C_RegisterClass || OpInfo.ConstraintType == TargetLowering::C_Register) && "Unknown constraint type!"); - assert(!OpInfo.isIndirect && - "Don't know how to handle indirect register inputs yet!"); + + // TODO: Support this. + if (OpInfo.isIndirect) { + LLVMContext &Ctx = *DAG.getContext(); + Ctx.emitError(CS.getInstruction(), + "Don't know how to handle indirect register inputs yet " + "for constraint '" + Twine(OpInfo.ConstraintCode) + "'"); + break; + } // Copy the input into the appropriate registers. if (OpInfo.AssignedRegs.Regs.empty()) { @@ -6369,24 +6400,18 @@ void SelectionDAGBuilder::visitVACopy(const CallInst &I) { /// FIXME: When all targets are /// migrated to using LowerCall, this hook should be integrated into SDISel. std::pair<SDValue, SDValue> -TargetLowering::LowerCallTo(SDValue Chain, Type *RetTy, - bool RetSExt, bool RetZExt, bool isVarArg, - bool isInreg, unsigned NumFixedArgs, - CallingConv::ID CallConv, bool isTailCall, - bool doesNotRet, bool isReturnValueUsed, - SDValue Callee, - ArgListTy &Args, SelectionDAG &DAG, - DebugLoc dl) const { +TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { // Handle all of the outgoing arguments. 
- SmallVector<ISD::OutputArg, 32> Outs; - SmallVector<SDValue, 32> OutVals; + CLI.Outs.clear(); + CLI.OutVals.clear(); + ArgListTy &Args = CLI.Args; for (unsigned i = 0, e = Args.size(); i != e; ++i) { SmallVector<EVT, 4> ValueVTs; ComputeValueVTs(*this, Args[i].Ty, ValueVTs); for (unsigned Value = 0, NumValues = ValueVTs.size(); Value != NumValues; ++Value) { EVT VT = ValueVTs[Value]; - Type *ArgTy = VT.getTypeForEVT(RetTy->getContext()); + Type *ArgTy = VT.getTypeForEVT(CLI.RetTy->getContext()); SDValue Op = SDValue(Args[i].Node.getNode(), Args[i].Node.getResNo() + Value); ISD::ArgFlagsTy Flags; @@ -6419,8 +6444,8 @@ TargetLowering::LowerCallTo(SDValue Chain, Type *RetTy, Flags.setNest(); Flags.setOrigAlign(OriginalAlignment); - EVT PartVT = getRegisterType(RetTy->getContext(), VT); - unsigned NumParts = getNumRegisters(RetTy->getContext(), VT); + EVT PartVT = getRegisterType(CLI.RetTy->getContext(), VT); + unsigned NumParts = getNumRegisters(CLI.RetTy->getContext(), VT); SmallVector<SDValue, 4> Parts(NumParts); ISD::NodeType ExtendKind = ISD::ANY_EXTEND; @@ -6429,89 +6454,88 @@ TargetLowering::LowerCallTo(SDValue Chain, Type *RetTy, else if (Args[i].isZExt) ExtendKind = ISD::ZERO_EXTEND; - getCopyToParts(DAG, dl, Op, &Parts[0], NumParts, + getCopyToParts(CLI.DAG, CLI.DL, Op, &Parts[0], NumParts, PartVT, ExtendKind); for (unsigned j = 0; j != NumParts; ++j) { // if it isn't first piece, alignment must be 1 ISD::OutputArg MyFlags(Flags, Parts[j].getValueType(), - i < NumFixedArgs); + i < CLI.NumFixedArgs); if (NumParts > 1 && j == 0) MyFlags.Flags.setSplit(); else if (j != 0) MyFlags.Flags.setOrigAlign(1); - Outs.push_back(MyFlags); - OutVals.push_back(Parts[j]); + CLI.Outs.push_back(MyFlags); + CLI.OutVals.push_back(Parts[j]); } } } // Handle the incoming return values from the call. - SmallVector<ISD::InputArg, 32> Ins; + CLI.Ins.clear(); SmallVector<EVT, 4> RetTys; - ComputeValueVTs(*this, RetTy, RetTys); + ComputeValueVTs(*this, CLI.RetTy, RetTys); for (unsigned I = 0, E = RetTys.size(); I != E; ++I) { EVT VT = RetTys[I]; - EVT RegisterVT = getRegisterType(RetTy->getContext(), VT); - unsigned NumRegs = getNumRegisters(RetTy->getContext(), VT); + EVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), VT); + unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), VT); for (unsigned i = 0; i != NumRegs; ++i) { ISD::InputArg MyFlags; MyFlags.VT = RegisterVT.getSimpleVT(); - MyFlags.Used = isReturnValueUsed; - if (RetSExt) + MyFlags.Used = CLI.IsReturnValueUsed; + if (CLI.RetSExt) MyFlags.Flags.setSExt(); - if (RetZExt) + if (CLI.RetZExt) MyFlags.Flags.setZExt(); - if (isInreg) + if (CLI.IsInReg) MyFlags.Flags.setInReg(); - Ins.push_back(MyFlags); + CLI.Ins.push_back(MyFlags); } } SmallVector<SDValue, 4> InVals; - Chain = LowerCall(Chain, Callee, CallConv, isVarArg, doesNotRet, isTailCall, - Outs, OutVals, Ins, dl, DAG, InVals); + CLI.Chain = LowerCall(CLI, InVals); // Verify that the target's LowerCall behaved as expected. 
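The LowerCallTo conversion above is a parameter-object refactor: roughly a dozen positional flags, plus the Outs/OutVals/Ins scratch vectors, move into a single CallLoweringInfo that both the generic code and each target's LowerCall read from. A generic sketch of the shape, with illustrative field names rather than the LLVM definition:

#include <string>
#include <vector>

struct CallInfo {
  std::string Callee;
  bool IsVarArg = false;
  bool IsTailCall = false;
  bool RetSExt = false;
  unsigned NumFixedArgs = 0;
  std::vector<int> Outs; // scratch state the lowering helper fills in
};

int lowerCall(CallInfo &CLI) {
  CLI.Outs.clear(); // the helper owns its scratch vectors via the struct
  return CLI.IsTailCall ? 0 : static_cast<int>(CLI.NumFixedArgs);
}

int main() {
  CallInfo CLI;          // call sites name the fields they care about
  CLI.Callee = "memcpy"; // instead of lining up ten positional booleans
  CLI.IsTailCall = false;
  return lowerCall(CLI);
}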
- assert(Chain.getNode() && Chain.getValueType() == MVT::Other && + assert(CLI.Chain.getNode() && CLI.Chain.getValueType() == MVT::Other && "LowerCall didn't return a valid chain!"); - assert((!isTailCall || InVals.empty()) && + assert((!CLI.IsTailCall || InVals.empty()) && "LowerCall emitted a return value for a tail call!"); - assert((isTailCall || InVals.size() == Ins.size()) && + assert((CLI.IsTailCall || InVals.size() == CLI.Ins.size()) && "LowerCall didn't emit the correct number of values!"); // For a tail call, the return value is merely live-out and there aren't // any nodes in the DAG representing it. Return a special value to // indicate that a tail call has been emitted and no more Instructions // should be processed in the current block. - if (isTailCall) { - DAG.setRoot(Chain); + if (CLI.IsTailCall) { + CLI.DAG.setRoot(CLI.Chain); return std::make_pair(SDValue(), SDValue()); } - DEBUG(for (unsigned i = 0, e = Ins.size(); i != e; ++i) { + DEBUG(for (unsigned i = 0, e = CLI.Ins.size(); i != e; ++i) { assert(InVals[i].getNode() && "LowerCall emitted a null value!"); - assert(EVT(Ins[i].VT) == InVals[i].getValueType() && + assert(EVT(CLI.Ins[i].VT) == InVals[i].getValueType() && "LowerCall emitted a value with the wrong type!"); }); // Collect the legal value parts into potentially illegal values // that correspond to the original function's return values. ISD::NodeType AssertOp = ISD::DELETED_NODE; - if (RetSExt) + if (CLI.RetSExt) AssertOp = ISD::AssertSext; - else if (RetZExt) + else if (CLI.RetZExt) AssertOp = ISD::AssertZext; SmallVector<SDValue, 4> ReturnValues; unsigned CurReg = 0; for (unsigned I = 0, E = RetTys.size(); I != E; ++I) { EVT VT = RetTys[I]; - EVT RegisterVT = getRegisterType(RetTy->getContext(), VT); - unsigned NumRegs = getNumRegisters(RetTy->getContext(), VT); + EVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), VT); + unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), VT); - ReturnValues.push_back(getCopyFromParts(DAG, dl, &InVals[CurReg], + ReturnValues.push_back(getCopyFromParts(CLI.DAG, CLI.DL, &InVals[CurReg], NumRegs, RegisterVT, VT, AssertOp)); CurReg += NumRegs; @@ -6521,12 +6545,12 @@ TargetLowering::LowerCallTo(SDValue Chain, Type *RetTy, // such a node, so we just return a null return value in that case. In // that case, nothing will actually look at the value. if (ReturnValues.empty()) - return std::make_pair(SDValue(), Chain); + return std::make_pair(SDValue(), CLI.Chain); - SDValue Res = DAG.getNode(ISD::MERGE_VALUES, dl, - DAG.getVTList(&RetTys[0], RetTys.size()), + SDValue Res = CLI.DAG.getNode(ISD::MERGE_VALUES, CLI.DL, + CLI.DAG.getVTList(&RetTys[0], RetTys.size()), &ReturnValues[0], ReturnValues.size()); - return std::make_pair(Res, Chain); + return std::make_pair(Res, CLI.Chain); } void TargetLowering::LowerOperationWrapper(SDNode *N, @@ -6746,7 +6770,7 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) { // Note down frame index. 
if (FrameIndexSDNode *FI = - dyn_cast<FrameIndexSDNode>(ArgValues[0].getNode())) + dyn_cast<FrameIndexSDNode>(ArgValues[0].getNode())) FuncInfo->setArgumentFrameIndex(I, FI->getIndex()); SDValue Res = DAG.getMergeValues(&ArgValues[0], NumValues, diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 8393b41..d0fde6f 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -180,17 +180,6 @@ private: typedef std::vector<CaseRec> CaseRecVector; - /// The comparison function for sorting the switch case values in the vector. - /// WARNING: Case ranges should be disjoint! - struct CaseCmp { - bool operator()(const Case &C1, const Case &C2) { - assert(isa<ConstantInt>(C1.Low) && isa<ConstantInt>(C2.High)); - const ConstantInt* CI1 = cast<const ConstantInt>(C1.Low); - const ConstantInt* CI2 = cast<const ConstantInt>(C2.High); - return CI1->getValue().slt(CI2->getValue()); - } - }; - struct CaseBitsCmp { bool operator()(const CaseBits &C1, const CaseBits &C2) { return C1.Bits > C2.Bits; @@ -351,7 +340,7 @@ public: void clear(); /// clearDanglingDebugInfo - Clear the dangling debug information - /// map. This function is seperated from the clear so that debug + /// map. This function is separated from the clear so that debug /// information that is dangling in a basic block can be properly /// resolved in a different basic block. This allows the /// SelectionDAG to resolve dangling debug information attached diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index f981afb..9fc225f 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "ScheduleDAGSDNodes.h" +#include "llvm/DebugInfo.h" #include "llvm/Function.h" #include "llvm/Intrinsics.h" #include "llvm/Assembly/Writer.h" @@ -19,7 +20,6 @@ #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetIntrinsicInfo.h" #include "llvm/Target/TargetMachine.h" @@ -265,6 +265,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::STACKSAVE: return "stacksave"; case ISD::STACKRESTORE: return "stackrestore"; case ISD::TRAP: return "trap"; + case ISD::DEBUGTRAP: return "debugtrap"; // Bit manipulation case ISD::BSWAP: return "bswap"; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 605509b..287c679 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -14,12 +14,8 @@ #define DEBUG_TYPE "isel" #include "ScheduleDAGSDNodes.h" #include "SelectionDAGBuilder.h" -#include "llvm/CodeGen/FunctionLoweringInfo.h" -#include "llvm/CodeGen/SelectionDAGISel.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/BranchProbabilityInfo.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" #include "llvm/Function.h" #include "llvm/InlineAsm.h" #include "llvm/Instructions.h" @@ -27,7 +23,10 @@ #include "llvm/IntrinsicInst.h" #include "llvm/LLVMContext.h" #include "llvm/Module.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include 
"llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/CodeGen/FastISel.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GCStrategy.h" #include "llvm/CodeGen/GCMetadata.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -38,6 +37,7 @@ #include "llvm/CodeGen/ScheduleHazardRecognizer.h" #include "llvm/CodeGen/SchedulerRegistry.h" #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetIntrinsicInfo.h" #include "llvm/Target/TargetInstrInfo.h" @@ -263,8 +263,6 @@ void TargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, // SelectionDAGISel code //===----------------------------------------------------------------------===// -void SelectionDAGISel::ISelUpdater::anchor() { } - SelectionDAGISel::SelectionDAGISel(const TargetMachine &tm, CodeGenOpt::Level OL) : MachineFunctionPass(ID), TM(tm), TLI(*tm.getTargetLowering()), @@ -451,9 +449,9 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { } } } - done:; } + done: // Determine if there is a call to setjmp in the machine function. MF->setExposesReturnsTwice(Fn.callsFunctionThatReturnsTwice()); @@ -468,8 +466,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { // If To is also scheduled to be replaced, find what its ultimate // replacement is. for (;;) { - DenseMap<unsigned, unsigned>::iterator J = - FuncInfo->RegFixups.find(To); + DenseMap<unsigned, unsigned>::iterator J = FuncInfo->RegFixups.find(To); if (J == E) break; To = J->second; } @@ -703,6 +700,25 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { CurDAG->clear(); } +namespace { +/// ISelUpdater - helper class to handle updates of the instruction selection +/// graph. +class ISelUpdater : public SelectionDAG::DAGUpdateListener { + SelectionDAG::allnodes_iterator &ISelPosition; +public: + ISelUpdater(SelectionDAG &DAG, SelectionDAG::allnodes_iterator &isp) + : SelectionDAG::DAGUpdateListener(DAG), ISelPosition(isp) {} + + /// NodeDeleted - Handle nodes deleted from the graph. If the node being + /// deleted is the current ISelPosition node, update ISelPosition. + /// + virtual void NodeDeleted(SDNode *N, SDNode *E) { + if (ISelPosition == SelectionDAG::allnodes_iterator(N)) + ++ISelPosition; + } +}; +} // end anonymous namespace + void SelectionDAGISel::DoInstructionSelection() { DEBUG(errs() << "===== Instruction selection begins: BB#" << FuncInfo->MBB->getNumber() @@ -719,9 +735,13 @@ void SelectionDAGISel::DoInstructionSelection() { // a reference to the root node, preventing it from being deleted, // and tracking any changes of the root. HandleSDNode Dummy(CurDAG->getRoot()); - ISelPosition = SelectionDAG::allnodes_iterator(CurDAG->getRoot().getNode()); + SelectionDAG::allnodes_iterator ISelPosition (CurDAG->getRoot().getNode()); ++ISelPosition; + // Make sure that ISelPosition gets properly updated when nodes are deleted + // in calls made from this function. + ISelUpdater ISU(*CurDAG, ISelPosition); + // The AllNodes list is now topological-sorted. Visit the // nodes by starting at the end of the list (the root of the // graph) and preceding back toward the beginning (the entry @@ -748,10 +768,8 @@ void SelectionDAGISel::DoInstructionSelection() { // If after the replacement this node is not used any more, // remove this dead node. - if (Node->use_empty()) { // Don't delete EntryToken, etc. 
- ISelUpdater ISU(ISelPosition); - CurDAG->RemoveDeadNode(Node, &ISU); - } + if (Node->use_empty()) // Don't delete EntryToken, etc. + CurDAG->RemoveDeadNode(Node); } CurDAG->setRoot(Dummy.getValue()); @@ -1680,8 +1698,6 @@ UpdateChainsAndGlue(SDNode *NodeToMatch, SDValue InputChain, bool isMorphNodeTo) { SmallVector<SDNode*, 4> NowDeadNodes; - ISelUpdater ISU(ISelPosition); - // Now that all the normal results are replaced, we replace the chain and // glue results if present. if (!ChainNodesMatched.empty()) { @@ -1705,7 +1721,7 @@ UpdateChainsAndGlue(SDNode *NodeToMatch, SDValue InputChain, if (ChainVal.getValueType() == MVT::Glue) ChainVal = ChainVal.getValue(ChainVal->getNumValues()-2); assert(ChainVal.getValueType() == MVT::Other && "Not a chain?"); - CurDAG->ReplaceAllUsesOfValueWith(ChainVal, InputChain, &ISU); + CurDAG->ReplaceAllUsesOfValueWith(ChainVal, InputChain); // If the node became dead and we haven't already seen it, delete it. if (ChainNode->use_empty() && @@ -1728,7 +1744,7 @@ UpdateChainsAndGlue(SDNode *NodeToMatch, SDValue InputChain, assert(FRN->getValueType(FRN->getNumValues()-1) == MVT::Glue && "Doesn't have a glue result"); CurDAG->ReplaceAllUsesOfValueWith(SDValue(FRN, FRN->getNumValues()-1), - InputGlue, &ISU); + InputGlue); // If the node became dead and we haven't already seen it, delete it. if (FRN->use_empty() && @@ -1738,7 +1754,7 @@ UpdateChainsAndGlue(SDNode *NodeToMatch, SDValue InputChain, } if (!NowDeadNodes.empty()) - CurDAG->RemoveDeadNodes(NowDeadNodes, &ISU); + CurDAG->RemoveDeadNodes(NowDeadNodes); DEBUG(errs() << "ISEL: Match complete!\n"); } @@ -1759,7 +1775,7 @@ enum ChainResult { /// The walk we do here is guaranteed to be small because we quickly get down to /// already selected nodes "below" us. static ChainResult -WalkChainUsers(SDNode *ChainedNode, +WalkChainUsers(const SDNode *ChainedNode, SmallVectorImpl<SDNode*> &ChainedNodesInPattern, SmallVectorImpl<SDNode*> &InteriorChainedNodes) { ChainResult Result = CR_Simple; @@ -1992,14 +2008,14 @@ CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex, /// CheckPatternPredicate - Implements OP_CheckPatternPredicate. LLVM_ATTRIBUTE_ALWAYS_INLINE static bool CheckPatternPredicate(const unsigned char *MatcherTable, unsigned &MatcherIndex, - SelectionDAGISel &SDISel) { + const SelectionDAGISel &SDISel) { return SDISel.CheckPatternPredicate(MatcherTable[MatcherIndex++]); } /// CheckNodePredicate - Implements OP_CheckNodePredicate. 
LLVM_ATTRIBUTE_ALWAYS_INLINE static bool CheckNodePredicate(const unsigned char *MatcherTable, unsigned &MatcherIndex, - SelectionDAGISel &SDISel, SDNode *N) { + const SelectionDAGISel &SDISel, SDNode *N) { return SDISel.CheckNodePredicate(N, MatcherTable[MatcherIndex++]); } @@ -2062,7 +2078,7 @@ CheckInteger(const unsigned char *MatcherTable, unsigned &MatcherIndex, LLVM_ATTRIBUTE_ALWAYS_INLINE static bool CheckAndImm(const unsigned char *MatcherTable, unsigned &MatcherIndex, - SDValue N, SelectionDAGISel &SDISel) { + SDValue N, const SelectionDAGISel &SDISel) { int64_t Val = MatcherTable[MatcherIndex++]; if (Val & 128) Val = GetVBR(Val, MatcherTable, MatcherIndex); @@ -2075,7 +2091,7 @@ CheckAndImm(const unsigned char *MatcherTable, unsigned &MatcherIndex, LLVM_ATTRIBUTE_ALWAYS_INLINE static bool CheckOrImm(const unsigned char *MatcherTable, unsigned &MatcherIndex, - SDValue N, SelectionDAGISel &SDISel) { + SDValue N, const SelectionDAGISel &SDISel) { int64_t Val = MatcherTable[MatcherIndex++]; if (Val & 128) Val = GetVBR(Val, MatcherTable, MatcherIndex); @@ -2094,7 +2110,8 @@ CheckOrImm(const unsigned char *MatcherTable, unsigned &MatcherIndex, /// MatcherIndex to continue with. static unsigned IsPredicateKnownToFail(const unsigned char *Table, unsigned Index, SDValue N, - bool &Result, SelectionDAGISel &SDISel, + bool &Result, + const SelectionDAGISel &SDISel, SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes) { switch (Table[Index++]) { default: @@ -2759,9 +2776,14 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable, (SDNode*) 0)); } - } else { + } else if (NodeToMatch->getOpcode() != ISD::DELETED_NODE) { Res = MorphNode(NodeToMatch, TargetOpc, VTList, Ops.data(), Ops.size(), EmitNodeInfo); + } else { + // NodeToMatch was eliminated by CSE when the target changed the DAG. + // We will visit the equivalent node later. 
+ DEBUG(dbgs() << "Node was eliminated by CSE\n"); + return 0; } // If the node had chain/glue results, update our notion of the current @@ -2959,6 +2981,7 @@ void SelectionDAGISel::CannotYetSelect(SDNode *N) { N->getOpcode() != ISD::INTRINSIC_WO_CHAIN && N->getOpcode() != ISD::INTRINSIC_VOID) { N->printrFull(Msg, CurDAG); + Msg << "\nIn function: " << MF->getFunction()->getName(); } else { bool HasInputChain = N->getOperand(0).getValueType() == MVT::Other; unsigned iid = diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp index 6cde05a..173ffac 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp @@ -13,13 +13,13 @@ #include "ScheduleDAGSDNodes.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" #include "llvm/Function.h" #include "llvm/Assembly/Writer.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Support/Debug.h" diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index e341e15..dff9b2c 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -25,6 +25,7 @@ #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" @@ -32,13 +33,6 @@ #include <cctype> using namespace llvm; -/// We are in the process of implementing a new TypeLegalization action -/// - the promotion of vector elements. This feature is disabled by default -/// and only enabled using this flag. -static cl::opt<bool> -AllowPromoteIntElem("promote-elements", cl::Hidden, cl::init(true), - cl::desc("Allow promotion of integer vector element types")); - /// InitLibcallNames - Set default libcall names. /// static void InitLibcallNames(const char **Names) { @@ -521,8 +515,7 @@ static void InitCmpLibcallCCs(ISD::CondCode *CCs) { /// NOTE: The constructor takes ownership of TLOF. TargetLowering::TargetLowering(const TargetMachine &tm, const TargetLoweringObjectFile *tlof) - : TM(tm), TD(TM.getTargetData()), TLOF(*tlof), - mayPromoteElements(AllowPromoteIntElem) { + : TM(tm), TD(TM.getTargetData()), TLOF(*tlof) { // All operations default to being supported. 
memset(OpActions, 0, sizeof(OpActions)); memset(LoadExtActions, 0, sizeof(LoadExtActions)); @@ -604,6 +597,7 @@ TargetLowering::TargetLowering(const TargetMachine &tm, IntDivIsCheap = false; Pow2DivIsCheap = false; JumpIsExpensive = false; + predictableSelectIsExpensive = false; StackPointerRegisterToSaveRestore = 0; ExceptionPointerRegister = 0; ExceptionSelectorRegister = 0; @@ -618,6 +612,7 @@ TargetLowering::TargetLowering(const TargetMachine &tm, MinStackArgumentAlignment = 1; ShouldFoldAtomicFences = false; InsertFencesForAtomic = false; + SupportJumpTables = true; InitLibcallNames(LibcallRoutineNames); InitCmpLibcallCCs(CmpLibcallCCs); @@ -708,42 +703,34 @@ bool TargetLowering::isLegalRC(const TargetRegisterClass *RC) const { return false; } -/// hasLegalSuperRegRegClasses - Return true if the specified register class -/// has one or more super-reg register classes that are legal. -bool -TargetLowering::hasLegalSuperRegRegClasses(const TargetRegisterClass *RC) const{ - if (*RC->superregclasses_begin() == 0) - return false; - for (TargetRegisterInfo::regclass_iterator I = RC->superregclasses_begin(), - E = RC->superregclasses_end(); I != E; ++I) { - const TargetRegisterClass *RRC = *I; - if (isLegalRC(RRC)) - return true; - } - return false; -} - /// findRepresentativeClass - Return the largest legal super-reg register class /// of the register class for the specified type and its associated "cost". std::pair<const TargetRegisterClass*, uint8_t> TargetLowering::findRepresentativeClass(EVT VT) const { + const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); const TargetRegisterClass *RC = RegClassForVT[VT.getSimpleVT().SimpleTy]; if (!RC) return std::make_pair(RC, 0); + + // Compute the set of all super-register classes. + BitVector SuperRegRC(TRI->getNumRegClasses()); + for (SuperRegClassIterator RCI(RC, TRI); RCI.isValid(); ++RCI) + SuperRegRC.setBitsInMask(RCI.getMask()); + + // Find the first legal register class with the largest spill size. const TargetRegisterClass *BestRC = RC; - for (TargetRegisterInfo::regclass_iterator I = RC->superregclasses_begin(), - E = RC->superregclasses_end(); I != E; ++I) { - const TargetRegisterClass *RRC = *I; - if (RRC->isASubClass() || !isLegalRC(RRC)) + for (int i = SuperRegRC.find_first(); i >= 0; i = SuperRegRC.find_next(i)) { + const TargetRegisterClass *SuperRC = TRI->getRegClass(i); + // We want the largest possible spill size. + if (SuperRC->getSize() <= BestRC->getSize()) + continue; + if (!isLegalRC(SuperRC)) continue; - if (!hasLegalSuperRegRegClasses(RRC)) - return std::make_pair(RRC, 1); - BestRC = RRC; + BestRC = SuperRC; } return std::make_pair(BestRC, 1); } - /// computeRegisterProperties - Once all of the register classes are added, /// this allows us to compute derived properties we expose. void TargetLowering::computeRegisterProperties() { @@ -835,11 +822,8 @@ void TargetLowering::computeRegisterProperties() { unsigned NElts = VT.getVectorNumElements(); if (NElts != 1) { bool IsLegalWiderType = false; - // If we allow the promotion of vector elements using a flag, - // then return TypePromoteInteger on vector elements. // First try to promote the elements of integer vectors. If no legal // promotion was found, fallback to the widen-vector method. 
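The findRepresentativeClass rewrite above first collects every super-register class into a BitVector via the register-class masks, then scans the set bits for the largest class that is still legal. A small sketch of that collect-then-scan shape, using plain vectors in place of TargetRegisterInfo:

#include <cstddef>
#include <vector>

// Pick the largest legal candidate; Start is the fallback when nothing
// beats it. Candidate/SpillSize/Legal stand in for the super-class bit
// set, getSize(), and isLegalRC().
std::size_t pickLargestLegal(const std::vector<bool> &Candidate,
                             const std::vector<unsigned> &SpillSize,
                             const std::vector<bool> &Legal,
                             std::size_t Start) {
  std::size_t Best = Start;
  for (std::size_t i = 0; i < Candidate.size(); ++i) {
    // Want a strictly larger spill size, and the class must be legal.
    if (!Candidate[i] || SpillSize[i] <= SpillSize[Best] || !Legal[i])
      continue;
    Best = i;
  }
  return Best;
}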
- if (mayPromoteElements) for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) { EVT SVT = (MVT::SimpleValueType)nVT; // Promote vectors of integers to vectors with the same number @@ -940,9 +924,12 @@ unsigned TargetLowering::getVectorTypeBreakdown(LLVMContext &Context, EVT VT, unsigned NumElts = VT.getVectorNumElements(); // If there is a wider vector type with the same element type as this one, - // we should widen to that legal vector type. This handles things like - // <2 x float> -> <4 x float>. - if (NumElts != 1 && getTypeAction(Context, VT) == TypeWidenVector) { + // or a promoted vector type that has the same number of elements which + // are wider, then we should convert to that legal vector type. + // This handles things like <2 x float> -> <4 x float> and + // <4 x i1> -> <4 x i32>. + LegalizeTypeAction TA = getTypeAction(Context, VT); + if (NumElts != 1 && (TA == TypeWidenVector || TA == TypePromoteInteger)) { RegisterVT = getTypeToTransformTo(Context, VT); if (isTypeLegal(RegisterVT)) { IntermediateVT = RegisterVT; @@ -1000,13 +987,11 @@ unsigned TargetLowering::getVectorTypeBreakdown(LLVMContext &Context, EVT VT, /// TODO: Move this out of TargetLowering.cpp. void llvm::GetReturnInfo(Type* ReturnType, Attributes attr, SmallVectorImpl<ISD::OutputArg> &Outs, - const TargetLowering &TLI, - SmallVectorImpl<uint64_t> *Offsets) { + const TargetLowering &TLI) { SmallVector<EVT, 4> ValueVTs; ComputeValueVTs(TLI, ReturnType, ValueVTs); unsigned NumValues = ValueVTs.size(); if (NumValues == 0) return; - unsigned Offset = 0; for (unsigned j = 0, f = NumValues; j != f; ++j) { EVT VT = ValueVTs[j]; @@ -1029,8 +1014,6 @@ void llvm::GetReturnInfo(Type* ReturnType, Attributes attr, unsigned NumParts = TLI.getNumRegisters(ReturnType->getContext(), VT); EVT PartVT = TLI.getRegisterType(ReturnType->getContext(), VT); - unsigned PartSize = TLI.getTargetData()->getTypeAllocSize( - PartVT.getTypeForEVT(ReturnType->getContext())); // 'inreg' on function refers to return value ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy(); @@ -1045,10 +1028,6 @@ void llvm::GetReturnInfo(Type* ReturnType, Attributes attr, for (unsigned i = 0; i < NumParts; ++i) { Outs.push_back(ISD::OutputArg(Flags, PartVT, /*isFixed=*/true)); - if (Offsets) { - Offsets->push_back(Offset); - Offset += PartSize; - } } } } @@ -2019,7 +1998,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, } } - // Make sure we're not loosing bits from the constant. + // Make sure we're not losing bits from the constant. if (MinBits < C1.getBitWidth() && MinBits > C1.getActiveBits()) { EVT MinVT = EVT::getIntegerVT(*DAG.getContext(), MinBits); if (isTypeDesirableForOp(ISD::SETCC, MinVT)) { @@ -2343,6 +2322,55 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, } } } + + if (C1.getMinSignedBits() <= 64 && + !isLegalICmpImmediate(C1.getSExtValue())) { + // (X & -256) == 256 -> (X >> 8) == 1 + if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && + N0.getOpcode() == ISD::AND && N0.hasOneUse()) { + if (ConstantSDNode *AndRHS = + dyn_cast<ConstantSDNode>(N0.getOperand(1))) { + const APInt &AndRHSC = AndRHS->getAPIntValue(); + if ((-AndRHSC).isPowerOf2() && (AndRHSC & C1) == C1) { + unsigned ShiftBits = AndRHSC.countTrailingZeros(); + EVT ShiftTy = DCI.isBeforeLegalize() ? 
+ getPointerTy() : getShiftAmountTy(N0.getValueType()); + EVT CmpTy = N0.getValueType(); + SDValue Shift = DAG.getNode(ISD::SRL, dl, CmpTy, N0.getOperand(0), + DAG.getConstant(ShiftBits, ShiftTy)); + SDValue CmpRHS = DAG.getConstant(C1.lshr(ShiftBits), CmpTy); + return DAG.getSetCC(dl, VT, Shift, CmpRHS, Cond); + } + } + } else if (Cond == ISD::SETULT || Cond == ISD::SETUGE || + Cond == ISD::SETULE || Cond == ISD::SETUGT) { + bool AdjOne = (Cond == ISD::SETULE || Cond == ISD::SETUGT); + // X < 0x100000000 -> (X >> 32) < 1 + // X >= 0x100000000 -> (X >> 32) >= 1 + // X <= 0x0ffffffff -> (X >> 32) < 1 + // X > 0x0ffffffff -> (X >> 32) >= 1 + unsigned ShiftBits; + APInt NewC = C1; + ISD::CondCode NewCond = Cond; + if (AdjOne) { + ShiftBits = C1.countTrailingOnes(); + NewC = NewC + 1; + NewCond = (Cond == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; + } else { + ShiftBits = C1.countTrailingZeros(); + } + NewC = NewC.lshr(ShiftBits); + if (ShiftBits && isLegalICmpImmediate(NewC.getSExtValue())) { + EVT ShiftTy = DCI.isBeforeLegalize() ? + getPointerTy() : getShiftAmountTy(N0.getValueType()); + EVT CmpTy = N0.getValueType(); + SDValue Shift = DAG.getNode(ISD::SRL, dl, CmpTy, N0, + DAG.getConstant(ShiftBits, ShiftTy)); + SDValue CmpRHS = DAG.getConstant(NewC, CmpTy); + return DAG.getSetCC(dl, VT, Shift, CmpRHS, NewCond); + } + } + } } if (isa<ConstantFPSDNode>(N0.getNode())) { @@ -2411,21 +2439,28 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, } if (N0 == N1) { + // The sext(setcc()) => setcc() optimization relies on the appropriate + // constant being emitted. + uint64_t EqVal; + switch (getBooleanContents(N0.getValueType().isVector())) { + case UndefinedBooleanContent: + case ZeroOrOneBooleanContent: + EqVal = ISD::isTrueWhenEqual(Cond); + break; + case ZeroOrNegativeOneBooleanContent: + EqVal = ISD::isTrueWhenEqual(Cond) ? -1 : 0; + break; + } + // We can always fold X == X for integer setcc's. if (N0.getValueType().isInteger()) { - switch (getBooleanContents(N0.getValueType().isVector())) { - case UndefinedBooleanContent: - case ZeroOrOneBooleanContent: - return DAG.getConstant(ISD::isTrueWhenEqual(Cond), VT); - case ZeroOrNegativeOneBooleanContent: - return DAG.getConstant(ISD::isTrueWhenEqual(Cond) ? -1 : 0, VT); - } + return DAG.getConstant(EqVal, VT); } unsigned UOF = ISD::getUnorderedFlavor(Cond); if (UOF == 2) // FP operators that are undefined on NaNs. - return DAG.getConstant(ISD::isTrueWhenEqual(Cond), VT); + return DAG.getConstant(EqVal, VT); if (UOF == unsigned(ISD::isTrueWhenEqual(Cond))) - return DAG.getConstant(UOF, VT); + return DAG.getConstant(EqVal, VT); // Otherwise, we can't fold it. However, we can simplify it to SETUO/SETO // if it is not already. ISD::CondCode NewCond = UOF == 0 ? 
ISD::SETO : ISD::SETUO; @@ -2998,10 +3033,12 @@ TargetLowering::AsmOperandInfoVector TargetLowering::ParseConstraints( AsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput]; if (OpInfo.ConstraintVT != Input.ConstraintVT) { - std::pair<unsigned, const TargetRegisterClass*> MatchRC = - getRegForInlineAsmConstraint(OpInfo.ConstraintCode, OpInfo.ConstraintVT); - std::pair<unsigned, const TargetRegisterClass*> InputRC = - getRegForInlineAsmConstraint(Input.ConstraintCode, Input.ConstraintVT); + std::pair<unsigned, const TargetRegisterClass*> MatchRC = + getRegForInlineAsmConstraint(OpInfo.ConstraintCode, + OpInfo.ConstraintVT); + std::pair<unsigned, const TargetRegisterClass*> InputRC = + getRegForInlineAsmConstraint(Input.ConstraintCode, + Input.ConstraintVT); if ((OpInfo.ConstraintVT.isInteger() != Input.ConstraintVT.isInteger()) || (MatchRC.second != InputRC.second)) { diff --git a/lib/CodeGen/ShadowStackGC.cpp b/lib/CodeGen/ShadowStackGC.cpp index 0016047..8a6b120 100644 --- a/lib/CodeGen/ShadowStackGC.cpp +++ b/lib/CodeGen/ShadowStackGC.cpp @@ -26,13 +26,13 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "shadowstackgc" -#include "llvm/CodeGen/GCs.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/CodeGen/GCStrategy.h" +#include "llvm/IRBuilder.h" #include "llvm/IntrinsicInst.h" #include "llvm/Module.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/CodeGen/GCStrategy.h" +#include "llvm/CodeGen/GCs.h" #include "llvm/Support/CallSite.h" -#include "llvm/Support/IRBuilder.h" using namespace llvm; diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp index 9a86f32..980bd74 100644 --- a/lib/CodeGen/SjLjEHPrepare.cpp +++ b/lib/CodeGen/SjLjEHPrepare.cpp @@ -13,28 +13,28 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "sjljehprepare" -#include "llvm/Transforms/Scalar.h" #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" +#include "llvm/IRBuilder.h" #include "llvm/Instructions.h" #include "llvm/Intrinsics.h" #include "llvm/LLVMContext.h" #include "llvm/Module.h" #include "llvm/Pass.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/IRBuilder.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include <set> using namespace llvm; diff --git a/lib/CodeGen/SlotIndexes.cpp b/lib/CodeGen/SlotIndexes.cpp index 26cf259..c8c3fb3 100644 --- a/lib/CodeGen/SlotIndexes.cpp +++ b/lib/CodeGen/SlotIndexes.cpp @@ -62,7 +62,6 @@ bool SlotIndexes::runOnMachineFunction(MachineFunction &fn) { assert(mi2iMap.empty() && "MachineInstr -> Index mapping non-empty at initial numbering?"); - functionSize = 0; unsigned index = 0; MBBRanges.resize(mf->getNumBlockIDs()); idx2MBBMap.reserve(mf->size()); @@ -89,8 +88,6 @@ bool 
SlotIndexes::runOnMachineFunction(MachineFunction &fn) { // Save this base index in the maps. mi2iMap.insert(std::make_pair(mi, SlotIndex(&indexList.back(), SlotIndex::Slot_Block))); - - ++functionSize; } // We insert one blank instructions between basic blocks. diff --git a/lib/CodeGen/SpillPlacement.cpp b/lib/CodeGen/SpillPlacement.cpp index 6f33f54..320128a 100644 --- a/lib/CodeGen/SpillPlacement.cpp +++ b/lib/CodeGen/SpillPlacement.cpp @@ -207,6 +207,17 @@ void SpillPlacement::activate(unsigned n) { return; ActiveNodes->set(n); nodes[n].clear(); + + // Very large bundles usually come from big switches, indirect branches, + // landing pads, or loops with many 'continue' statements. It is difficult to + // allocate registers when so many different blocks are involved. + // + // Give a small negative bias to large bundles such that 1/32 of the + // connected blocks need to be interested before we consider expanding the + // region through the bundle. This helps compile time by limiting the number + // of blocks visited and the number of links in the Hopfield network. + if (bundles->getBlocks(n).size() > 100) + nodes[n].Bias = -0.0625f; } diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp index 9959f74..9a751c1 100644 --- a/lib/CodeGen/SplitKit.cpp +++ b/lib/CodeGen/SplitKit.cpp @@ -345,9 +345,11 @@ void SplitEditor::reset(LiveRangeEdit &LRE, ComplementSpillMode SM) { Values.clear(); // Reset the LiveRangeCalc instances needed for this spill mode. - LRCalc[0].reset(&VRM.getMachineFunction()); + LRCalc[0].reset(&VRM.getMachineFunction(), LIS.getSlotIndexes(), &MDT, + &LIS.getVNInfoAllocator()); if (SpillMode) - LRCalc[1].reset(&VRM.getMachineFunction()); + LRCalc[1].reset(&VRM.getMachineFunction(), LIS.getSlotIndexes(), &MDT, + &LIS.getVNInfoAllocator()); // We don't need an AliasAnalysis since we will only be performing // cheap-as-a-copy remats anyway. 
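The SpillPlacement change above gives bundles with more than 100 blocks a fixed negative bias: -0.0625f is -1/32, so, per the comment, roughly one in 32 connected blocks has to pull toward the bundle before the Hopfield network lets the region grow through it. A tiny sketch of the heuristic, with an illustrative node type:

struct HopfieldNode {
  float Bias = 0.0f;
};

void activateNode(HopfieldNode &N, unsigned BlocksInBundle) {
  // Large bundles (big switches, indirect branches, landing pads) are
  // expensive to grow through; bias them slightly against expansion.
  if (BlocksInBundle > 100)
    N.Bias = -0.0625f; // -1/32
}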
@@ -924,11 +926,9 @@ bool SplitEditor::transferValues() { DEBUG(dbgs() << '\n'); } - LRCalc[0].calculateValues(LIS.getSlotIndexes(), &MDT, - &LIS.getVNInfoAllocator()); + LRCalc[0].calculateValues(); if (SpillMode) - LRCalc[1].calculateValues(LIS.getSlotIndexes(), &MDT, - &LIS.getVNInfoAllocator()); + LRCalc[1].calculateValues(); return Skipped; } @@ -953,8 +953,7 @@ void SplitEditor::extendPHIKillRanges() { if (Edit->getParent().liveAt(LastUse)) { assert(RegAssign.lookup(LastUse) == RegIdx && "Different register assignment in phi predecessor"); - LRC.extend(LI, End, - LIS.getSlotIndexes(), &MDT, &LIS.getVNInfoAllocator()); + LRC.extend(LI, End); } } } @@ -1004,8 +1003,7 @@ void SplitEditor::rewriteAssigned(bool ExtendRanges) { } else Idx = Idx.getRegSlot(true); - getLRCalc(RegIdx).extend(LI, Idx.getNextSlot(), LIS.getSlotIndexes(), - &MDT, &LIS.getVNInfoAllocator()); + getLRCalc(RegIdx).extend(LI, Idx.getNextSlot()); } } diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp index 1e940b1..20da36e 100644 --- a/lib/CodeGen/StackSlotColoring.cpp +++ b/lib/CodeGen/StackSlotColoring.cpp @@ -46,7 +46,6 @@ STATISTIC(NumDead, "Number of trivially dead stack accesses eliminated"); namespace { class StackSlotColoring : public MachineFunctionPass { - bool ColorWithRegs; LiveStacks* LS; MachineFrameInfo *MFI; const TargetInstrInfo *TII; @@ -82,7 +81,7 @@ namespace { public: static char ID; // Pass identification StackSlotColoring() : - MachineFunctionPass(ID), ColorWithRegs(false), NextColor(-1) { + MachineFunctionPass(ID), NextColor(-1) { initializeStackSlotColoringPass(*PassRegistry::getPassRegistry()); } diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp index 8ebfbca..a813fa6 100644 --- a/lib/CodeGen/TailDuplication.cpp +++ b/lib/CodeGen/TailDuplication.cpp @@ -20,12 +20,15 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineSSAUpdater.h" +#include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" @@ -57,8 +60,10 @@ namespace { /// TailDuplicatePass - Perform tail duplication. class TailDuplicatePass : public MachineFunctionPass { const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; MachineModuleInfo *MMI; MachineRegisterInfo *MRI; + OwningPtr<RegScavenger> RS; bool PreRegAlloc; // SSAUpdateVRs - A list of virtual registers for which to update SSA form. 
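Aside: the SplitKit hunks above change LiveRangeCalc so that its context (slot indexes, dominator tree, VNInfo allocator) is captured once in reset() instead of being threaded through every calculateValues()/extend() call. A toy sketch of that reset-once pattern, under assumed names (not the real LLVM classes):

  #include <cassert>

  struct RangeCalcLike {
    const void *Indexes, *DomTree, *Alloc;
    RangeCalcLike() : Indexes(0), DomTree(0), Alloc(0) {}
    void reset(const void *Idx, const void *DT, const void *A) {
      Indexes = Idx; DomTree = DT; Alloc = A;  // configure once per function
    }
    void calculateValues() {                   // then call with no arguments
      assert(Indexes && DomTree && Alloc && "reset() not called");
    }
  };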
@@ -124,9 +129,13 @@ INITIALIZE_PASS(TailDuplicatePass, "tailduplication", "Tail Duplication", bool TailDuplicatePass::runOnMachineFunction(MachineFunction &MF) { TII = MF.getTarget().getInstrInfo(); + TRI = MF.getTarget().getRegisterInfo(); MRI = &MF.getRegInfo(); MMI = getAnalysisIfAvailable<MachineModuleInfo>(); PreRegAlloc = MRI->isSSA(); + RS.reset(); + if (MRI->tracksLiveness() && TRI->trackLivenessAfterRegAlloc(MF)) + RS.reset(new RegScavenger()); bool MadeChange = false; while (TailDuplicateBlocks(MF)) @@ -272,8 +281,8 @@ TailDuplicatePass::TailDuplicateAndUpdate(MachineBasicBlock *MBB, continue; unsigned Dst = Copy->getOperand(0).getReg(); unsigned Src = Copy->getOperand(1).getReg(); - MachineRegisterInfo::use_iterator UI = MRI->use_begin(Src); - if (++UI == MRI->use_end()) { + if (MRI->hasOneNonDBGUse(Src) && + MRI->constrainRegClass(Src, MRI->getRegClass(Dst))) { // Copy is the only use. Do trivial copy propagation here. MRI->replaceRegWith(Dst, Src); Copy->eraseFromParent(); @@ -429,8 +438,10 @@ void TailDuplicatePass::DuplicateInstruction(MachineInstr *MI, AddSSAUpdateEntry(Reg, NewReg, PredBB); } else { DenseMap<unsigned, unsigned>::iterator VI = LocalVRMap.find(Reg); - if (VI != LocalVRMap.end()) + if (VI != LocalVRMap.end()) { MO.setReg(VI->second); + MRI->constrainRegClass(VI->second, MRI->getRegClass(Reg)); + } } } PredBB->insert(PredBB->instr_end(), NewMI); @@ -775,6 +786,23 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, // Remove PredBB's unconditional branch. TII->RemoveBranch(*PredBB); + if (RS && !TailBB->livein_empty()) { + // Update PredBB livein. + RS->enterBasicBlock(PredBB); + if (!PredBB->empty()) + RS->forward(prior(PredBB->end())); + BitVector RegsLiveAtExit(TRI->getNumRegs()); + RS->getRegsUsed(RegsLiveAtExit, false); + for (MachineBasicBlock::livein_iterator I = TailBB->livein_begin(), + E = TailBB->livein_end(); I != E; ++I) { + if (!RegsLiveAtExit[*I]) + // If a register is previously livein to the tail but it's not live + // at the end of predecessor BB, then it should be added to its + // livein list. + PredBB->addLiveIn(*I); + } + } + // Clone the contents of TailBB into PredBB. DenseMap<unsigned, unsigned> LocalVRMap; SmallVector<std::pair<unsigned,unsigned>, 4> CopyInfos; diff --git a/lib/CodeGen/TargetInstrInfoImpl.cpp b/lib/CodeGen/TargetInstrInfoImpl.cpp index 2beb928..a3d6771 100644 --- a/lib/CodeGen/TargetInstrInfoImpl.cpp +++ b/lib/CodeGen/TargetInstrInfoImpl.cpp @@ -501,6 +501,14 @@ CreateTargetHazardRecognizer(const TargetMachine *TM, return new ScheduleHazardRecognizer(); } +// Default implementation of CreateTargetMIHazardRecognizer. +ScheduleHazardRecognizer *TargetInstrInfoImpl:: +CreateTargetMIHazardRecognizer(const InstrItineraryData *II, + const ScheduleDAG *DAG) const { + return (ScheduleHazardRecognizer *) + new ScoreboardHazardRecognizer(II, DAG, "misched"); +} + // Default implementation of CreateTargetPostRAHazardRecognizer. ScheduleHazardRecognizer *TargetInstrInfoImpl:: CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, @@ -509,6 +517,10 @@ CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, new ScoreboardHazardRecognizer(II, DAG, "post-RA-sched"); } +//===----------------------------------------------------------------------===// +// SelectionDAG latency interface. 
+//===----------------------------------------------------------------------===// + int TargetInstrInfoImpl::getOperandLatency(const InstrItineraryData *ItinData, SDNode *DefNode, unsigned DefIdx, @@ -537,3 +549,199 @@ int TargetInstrInfoImpl::getInstrLatency(const InstrItineraryData *ItinData, return ItinData->getStageLatency(get(N->getMachineOpcode()).getSchedClass()); } +//===----------------------------------------------------------------------===// +// MachineInstr latency interface. +//===----------------------------------------------------------------------===// + +unsigned +TargetInstrInfoImpl::getNumMicroOps(const InstrItineraryData *ItinData, + const MachineInstr *MI) const { + if (!ItinData || ItinData->isEmpty()) + return 1; + + unsigned Class = MI->getDesc().getSchedClass(); + int UOps = ItinData->Itineraries[Class].NumMicroOps; + if (UOps >= 0) + return UOps; + + // The # of u-ops is dynamically determined. The specific target should + // override this function to return the right number. + return 1; +} + +/// Return the default expected latency for a def based on its opcode. +unsigned TargetInstrInfo::defaultDefLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI) const { + if (DefMI->mayLoad()) + return ItinData->SchedModel->LoadLatency; + if (isHighLatencyDef(DefMI->getOpcode())) + return ItinData->SchedModel->HighLatency; + return 1; +} + +unsigned TargetInstrInfoImpl:: +getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost) const { + // Default to one cycle for no itinerary. However, an "empty" itinerary may + // still have a MinLatency property, which getStageLatency checks. + if (!ItinData) + return MI->mayLoad() ? 2 : 1; + + return ItinData->getStageLatency(MI->getDesc().getSchedClass()); +} + +bool TargetInstrInfoImpl::hasLowDefLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI, + unsigned DefIdx) const { + if (!ItinData || ItinData->isEmpty()) + return false; + + unsigned DefClass = DefMI->getDesc().getSchedClass(); + int DefCycle = ItinData->getOperandCycle(DefClass, DefIdx); + return (DefCycle != -1 && DefCycle <= 1); +} + +/// Both DefMI and UseMI must be valid. By default, call directly to the +/// itinerary. This may be overridden by the target. +int TargetInstrInfoImpl:: +getOperandLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx) const { + unsigned DefClass = DefMI->getDesc().getSchedClass(); + unsigned UseClass = UseMI->getDesc().getSchedClass(); + return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx); +} + +/// If we can determine the operand latency from the def only, without itinerary /// lookup, do so. Otherwise return -1. +static int computeDefOperandLatency( + const TargetInstrInfo *TII, const InstrItineraryData *ItinData, + const MachineInstr *DefMI, bool FindMin) { + + // Let the target hook getInstrLatency handle missing itineraries. + if (!ItinData) + return TII->getInstrLatency(ItinData, DefMI); + + // Return a latency based on the itinerary properties and defining instruction + // if possible. Some common subtargets don't require per-operand latency, + // especially for minimum latencies. + if (FindMin) { + // If MinLatency is valid, call getInstrLatency. This uses Stage latency if + // it exists before defaulting to MinLatency.
+ if (ItinData->SchedModel->MinLatency >= 0) + return TII->getInstrLatency(ItinData, DefMI); + + // If MinLatency is invalid, OperandLatency is interpreted as MinLatency. + // For empty itineraries, short-circuit the check and default to one cycle. + if (ItinData->isEmpty()) + return 1; + } + else if (ItinData->isEmpty()) + return TII->defaultDefLatency(ItinData, DefMI); + + // ...operand lookup required + return -1; +} + +/// computeOperandLatency - Compute and return the latency of the given data +/// dependent def and use when the operand indices are already known. +/// +/// FindMin may be set to get the minimum vs. expected latency. +unsigned TargetInstrInfo:: +computeOperandLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx, + bool FindMin) const { + + int DefLatency = computeDefOperandLatency(this, ItinData, DefMI, FindMin); + if (DefLatency >= 0) + return DefLatency; + + assert(ItinData && !ItinData->isEmpty() && "computeDefOperandLatency fail"); + + int OperLatency = getOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx); + if (OperLatency >= 0) + return OperLatency; + + // No operand latency was found. + unsigned InstrLatency = getInstrLatency(ItinData, DefMI); + + // Expected latency is the max of the stage latency and itinerary props. + if (!FindMin) + InstrLatency = std::max(InstrLatency, defaultDefLatency(ItinData, DefMI)); + return InstrLatency; +} + +/// computeOperandLatency - Compute and return the latency of the given data +/// dependent def and use. DefMI must be a valid def. UseMI may be NULL for an +/// unknown use. Depending on the subtarget's itinerary properties, this may or +/// may not need to call getOperandLatency(). +/// +/// FindMin may be set to get the minimum vs. expected latency. Minimum +/// latency is used for scheduling groups, while expected latency is for +/// instruction cost and critical path. +/// +/// For most subtargets, we don't need DefIdx or UseIdx to compute min latency. +/// DefMI must be a valid definition, but UseMI may be NULL for an unknown use. +unsigned TargetInstrInfo:: +computeOperandLatency(const InstrItineraryData *ItinData, + const TargetRegisterInfo *TRI, + const MachineInstr *DefMI, const MachineInstr *UseMI, + unsigned Reg, bool FindMin) const { + + int DefLatency = computeDefOperandLatency(this, ItinData, DefMI, FindMin); + if (DefLatency >= 0) + return DefLatency; + + assert(ItinData && !ItinData->isEmpty() && "computeDefOperandLatency fail"); + + // Find the definition of the register in the defining instruction. + int DefIdx = DefMI->findRegisterDefOperandIdx(Reg); + if (DefIdx != -1) { + const MachineOperand &MO = DefMI->getOperand(DefIdx); + if (MO.isReg() && MO.isImplicit() && + DefIdx >= (int)DefMI->getDesc().getNumOperands()) { + // This is an implicit def, getOperandLatency() won't return the correct + // latency. e.g. + // %D6<def>, %D7<def> = VLD1q16 %R2<kill>, 0, ..., %Q3<imp-def> + // %Q1<def> = VMULv8i16 %Q1<kill>, %Q3<kill>, ... + // What we want is to compute latency between def of %D6/%D7 and use of + // %Q3 instead. + unsigned Op2 = DefMI->findRegisterDefOperandIdx(Reg, false, true, TRI); + if (DefMI->getOperand(Op2).isReg()) + DefIdx = Op2; + } + // For all uses of the register, calculate the maximum latency + int OperLatency = -1; + + // If UseMI is null, then it must be a scheduling barrier.
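Aside: the decision order implemented by computeDefOperandLatency() and the computeOperandLatency() overloads above (the second continues just below) is easier to see stripped of the LLVM types. A simplified, hypothetical restatement with plain ints, not the TargetInstrInfo interface:

  #include <algorithm>

  // Decision order: (1) no itinerary -> whole-instruction latency;
  // (2) FindMin with a valid MinLatency -> whole-instruction latency;
  // (3) FindMin with an empty itinerary -> one cycle;
  // (4) empty itinerary -> default def latency (load/high-latency/1);
  // (5) otherwise -> per-operand lookup, with the whole-instruction and
  //     default def latencies as a floor for expected (non-min) latency.
  int latencySketch(bool HasItin, bool EmptyItin, bool FindMin,
                    int MinLatency, int InstrLat, int DefaultDefLat,
                    int OperLat /* -1 when the per-operand lookup fails */) {
    if (!HasItin) return InstrLat;
    if (FindMin) {
      if (MinLatency >= 0) return InstrLat;
      if (EmptyItin) return 1;
    } else if (EmptyItin) {
      return DefaultDefLat;
    }
    if (OperLat >= 0) return OperLat;
    return FindMin ? InstrLat : std::max(InstrLat, DefaultDefLat);
  }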
+ if (!UseMI) { + unsigned DefClass = DefMI->getDesc().getSchedClass(); + OperLatency = ItinData->getOperandCycle(DefClass, DefIdx); + } + else { + for (unsigned i = 0, e = UseMI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = UseMI->getOperand(i); + if (!MO.isReg() || !MO.isUse()) + continue; + unsigned MOReg = MO.getReg(); + if (MOReg != Reg) + continue; + + int UseCycle = getOperandLatency(ItinData, DefMI, DefIdx, UseMI, i); + OperLatency = std::max(OperLatency, UseCycle); + } + } + // If we found an operand latency, we're done. + if (OperLatency >= 0) + return OperLatency; + } + // No operand latency was found. + unsigned InstrLatency = getInstrLatency(ItinData, DefMI); + + // Expected latency is the max of the stage latency and itinerary props. + if (!FindMin) + InstrLatency = std::max(InstrLatency, defaultDefLatency(ItinData, DefMI)); + return InstrLatency; +} diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index 9925185..2a2fa9e 100644 --- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -93,8 +93,9 @@ getELFKindForNamedSection(StringRef Name, SectionKind K) { // N.B.: The defaults used in here are no the same ones used in MC. // We follow gcc, MC follows gas. For example, given ".section .eh_frame", // both gas and MC will produce a section with no flags. Given - // section(".eh_frame") gcc will produce - // .section .eh_frame,"a",@progbits + // section(".eh_frame") gcc will produce: + // + // .section .eh_frame,"a",@progbits if (Name.empty() || Name[0] != '.') return K; // Some lame default implementation based on some magic section names. @@ -349,10 +350,17 @@ TargetLoweringObjectFileELF::getStaticCtorSection(unsigned Priority) const { if (Priority == 65535) return StaticCtorSection; - std::string Name = std::string(".ctors.") + utostr(65535 - Priority); - return getContext().getELFSection(Name, ELF::SHT_PROGBITS, - ELF::SHF_ALLOC |ELF::SHF_WRITE, - SectionKind::getDataRel()); + if (UseInitArray) { + std::string Name = std::string(".init_array.") + utostr(Priority); + return getContext().getELFSection(Name, ELF::SHT_INIT_ARRAY, + ELF::SHF_ALLOC | ELF::SHF_WRITE, + SectionKind::getDataRel()); + } else { + std::string Name = std::string(".ctors.") + utostr(65535 - Priority); + return getContext().getELFSection(Name, ELF::SHT_PROGBITS, + ELF::SHF_ALLOC |ELF::SHF_WRITE, + SectionKind::getDataRel()); + } } const MCSection * @@ -362,10 +370,35 @@ TargetLoweringObjectFileELF::getStaticDtorSection(unsigned Priority) const { if (Priority == 65535) return StaticDtorSection; - std::string Name = std::string(".dtors.") + utostr(65535 - Priority); - return getContext().getELFSection(Name, ELF::SHT_PROGBITS, - ELF::SHF_ALLOC |ELF::SHF_WRITE, - SectionKind::getDataRel()); + if (UseInitArray) { + std::string Name = std::string(".fini_array.") + utostr(Priority); + return getContext().getELFSection(Name, ELF::SHT_FINI_ARRAY, + ELF::SHF_ALLOC | ELF::SHF_WRITE, + SectionKind::getDataRel()); + } else { + std::string Name = std::string(".dtors.") + utostr(65535 - Priority); + return getContext().getELFSection(Name, ELF::SHT_PROGBITS, + ELF::SHF_ALLOC |ELF::SHF_WRITE, + SectionKind::getDataRel()); + } +} + +void +TargetLoweringObjectFileELF::InitializeELF(bool UseInitArray_) { + UseInitArray = UseInitArray_; + if (!UseInitArray) + return; + + StaticCtorSection = + getContext().getELFSection(".init_array", ELF::SHT_INIT_ARRAY, + ELF::SHF_WRITE | + ELF::SHF_ALLOC, + 
SectionKind::getDataRel()); + StaticDtorSection = + getContext().getELFSection(".fini_array", ELF::SHT_FINI_ARRAY, + ELF::SHF_WRITE | + ELF::SHF_ALLOC, + SectionKind::getDataRel()); } //===----------------------------------------------------------------------===// @@ -379,7 +412,7 @@ emitModuleFlags(MCStreamer &Streamer, ArrayRef<Module::ModuleFlagEntry> ModuleFlags, Mangler *Mang, const TargetMachine &TM) const { unsigned VersionVal = 0; - unsigned GCFlags = 0; + unsigned ImageInfoFlags = 0; StringRef SectionVal; for (ArrayRef<Module::ModuleFlagEntry>::iterator @@ -396,8 +429,9 @@ emitModuleFlags(MCStreamer &Streamer, if (Key == "Objective-C Image Info Version") VersionVal = cast<ConstantInt>(Val)->getZExtValue(); else if (Key == "Objective-C Garbage Collection" || - Key == "Objective-C GC Only") - GCFlags |= cast<ConstantInt>(Val)->getZExtValue(); + Key == "Objective-C GC Only" || + Key == "Objective-C Is Simulated") + ImageInfoFlags |= cast<ConstantInt>(Val)->getZExtValue(); else if (Key == "Objective-C Image Info Section") SectionVal = cast<MDString>(Val)->getString(); } @@ -424,7 +458,7 @@ emitModuleFlags(MCStreamer &Streamer, Streamer.EmitLabel(getContext(). GetOrCreateSymbol(StringRef("L_OBJC_IMAGE_INFO"))); Streamer.EmitIntValue(VersionVal, 4); - Streamer.EmitIntValue(GCFlags, 4); + Streamer.EmitIntValue(ImageInfoFlags, 4); Streamer.AddBlankLine(); } diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp index c30b133..e4c0119 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -102,7 +102,7 @@ namespace { MachineInstr *FindLastUseInMBB(unsigned Reg, MachineBasicBlock *MBB, unsigned Dist); - bool isProfitableToCommute(unsigned regB, unsigned regC, + bool isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC, MachineInstr *MI, MachineBasicBlock *MBB, unsigned Dist); @@ -483,32 +483,6 @@ static bool isTwoAddrUse(MachineInstr &MI, unsigned Reg, unsigned &DstReg) { return false; } -/// findLocalKill - Look for an instruction below MI in the MBB that kills the -/// specified register. Returns null if there are any other Reg use between the -/// instructions. -static -MachineInstr *findLocalKill(unsigned Reg, MachineBasicBlock *MBB, - MachineInstr *MI, MachineRegisterInfo *MRI, - DenseMap<MachineInstr*, unsigned> &DistanceMap) { - MachineInstr *KillMI = 0; - for (MachineRegisterInfo::use_nodbg_iterator - UI = MRI->use_nodbg_begin(Reg), - UE = MRI->use_nodbg_end(); UI != UE; ++UI) { - MachineInstr *UseMI = &*UI; - if (UseMI == MI || UseMI->getParent() != MBB) - continue; - if (DistanceMap.count(UseMI)) - continue; - if (!UI.getOperand().isKill()) - return 0; - if (KillMI) - return 0; // -O0 kill markers cannot be trusted? - KillMI = UseMI; - } - - return KillMI; -} - /// findOnlyInterestingUse - Given a register, if has a single in-basic block /// use, return the use instruction if it's a copy or a two-address use. static @@ -567,7 +541,8 @@ regsAreCompatible(unsigned RegA, unsigned RegB, const TargetRegisterInfo *TRI) { /// isProfitableToReMat - Return true if it's potentially profitable to commute /// the two-address instruction that's being processed. 
bool -TwoAddressInstructionPass::isProfitableToCommute(unsigned regB, unsigned regC, +TwoAddressInstructionPass::isProfitableToCommute(unsigned regA, unsigned regB, + unsigned regC, MachineInstr *MI, MachineBasicBlock *MBB, unsigned Dist) { if (OptLevel == CodeGenOpt::None) @@ -604,15 +579,15 @@ TwoAddressInstructionPass::isProfitableToCommute(unsigned regB, unsigned regC, // %reg1026<def> = ADD %reg1024, %reg1025 // r0 = MOV %reg1026 // Commute the ADD to hopefully eliminate an otherwise unavoidable copy. - unsigned FromRegB = getMappedReg(regB, SrcRegMap); - unsigned FromRegC = getMappedReg(regC, SrcRegMap); - unsigned ToRegB = getMappedReg(regB, DstRegMap); - unsigned ToRegC = getMappedReg(regC, DstRegMap); - if ((FromRegB && ToRegB && !regsAreCompatible(FromRegB, ToRegB, TRI)) && - ((!FromRegC && !ToRegC) || - regsAreCompatible(FromRegB, ToRegC, TRI) || - regsAreCompatible(FromRegC, ToRegB, TRI))) - return true; + unsigned ToRegA = getMappedReg(regA, DstRegMap); + if (ToRegA) { + unsigned FromRegB = getMappedReg(regB, SrcRegMap); + unsigned FromRegC = getMappedReg(regC, SrcRegMap); + bool BComp = !FromRegB || regsAreCompatible(FromRegB, ToRegA, TRI); + bool CComp = !FromRegC || regsAreCompatible(FromRegC, ToRegA, TRI); + if (BComp != CComp) + return !BComp && CComp; + } // If there is a use of regC between its last def (could be livein) and this // instruction, then bail. @@ -904,14 +879,19 @@ TwoAddressInstructionPass::RescheduleMIBelowKill(MachineBasicBlock *MBB, MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, unsigned Reg) { + // Bail immediately if we don't have LV available. We use it to find kills + // efficiently. + if (!LV) + return false; + MachineInstr *MI = &*mi; DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(MI); if (DI == DistanceMap.end()) // Must be created from unfolded load. Don't waste time trying this. return false; - MachineInstr *KillMI = findLocalKill(Reg, MBB, mi, MRI, DistanceMap); - if (!KillMI || KillMI->isCopy() || KillMI->isCopyLike()) + MachineInstr *KillMI = LV->getVarInfo(Reg).findKill(MBB); + if (!KillMI || MI == KillMI || KillMI->isCopy() || KillMI->isCopyLike()) // Don't mess with copies, they may be coalesced later. return false; @@ -998,6 +978,12 @@ TwoAddressInstructionPass::RescheduleMIBelowKill(MachineBasicBlock *MBB, ((MO.isKill() && Uses.count(MOReg)) || Kills.count(MOReg))) // Don't want to extend other live ranges and update kills. return false; + if (MOReg == Reg && !MO.isKill()) + // We can't schedule across a use of the register in question. + return false; + // Ensure that if this is the register in question, it's the kill we expect.
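Aside: the rewritten isProfitableToCommute() test above reduces to a single rule: with regA's desired destination register known (ToRegA), commute only when regC's mapped source is compatible with it and regB's is not. A hypothetical standalone restatement:

  // BComp/CComp: regB's/regC's mapped source register is either absent or
  // compatible with regA's destination (ToRegA). Swapping B and C pays off
  // only when the two sides disagree and C is the compatible one.
  bool shouldAggressivelyCommute(bool HaveToRegA, bool BComp, bool CComp) {
    if (!HaveToRegA)
      return false;            // no destination hint; leave it to later checks
    if (BComp != CComp)
      return !BComp && CComp;  // true exactly when C helps and B does not
    return false;
  }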
+ assert((MOReg != Reg || OtherMI == KillMI) && + "Found multiple kills of a register in a basic block"); } } } @@ -1011,20 +997,11 @@ TwoAddressInstructionPass::RescheduleMIBelowKill(MachineBasicBlock *MBB, MBB->splice(KillPos, MBB, From, To); DistanceMap.erase(DI); - if (LV) { - // Update live variables - LV->removeVirtualRegisterKilled(Reg, KillMI); - LV->addVirtualRegisterKilled(Reg, MI); - } else { - for (unsigned i = 0, e = KillMI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = KillMI->getOperand(i); - if (!MO.isReg() || !MO.isUse() || MO.getReg() != Reg) - continue; - MO.setIsKill(false); - } - MI->addRegisterKilled(Reg, 0); - } + // Update live variables + LV->removeVirtualRegisterKilled(Reg, KillMI); + LV->addVirtualRegisterKilled(Reg, MI); + DEBUG(dbgs() << "\trescheduled below kill: " << *KillMI); return true; } @@ -1045,7 +1022,7 @@ bool TwoAddressInstructionPass::isDefTooClose(unsigned Reg, unsigned Dist, return true; // Below MI unsigned DefDist = DDI->second; assert(Dist > DefDist && "Visited def already?"); - if (TII->getInstrLatency(InstrItins, DefMI) > (int)(Dist - DefDist)) + if (TII->getInstrLatency(InstrItins, DefMI) > (Dist - DefDist)) return true; } return false; @@ -1060,14 +1037,19 @@ TwoAddressInstructionPass::RescheduleKillAboveMI(MachineBasicBlock *MBB, MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, unsigned Reg) { + // Bail immediately if we don't have LV available. We use it to find kills + // efficiently. + if (!LV) + return false; + MachineInstr *MI = &*mi; DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(MI); if (DI == DistanceMap.end()) // Must be created from unfolded load. Don't waste time trying this. return false; - MachineInstr *KillMI = findLocalKill(Reg, MBB, mi, MRI, DistanceMap); - if (!KillMI || KillMI->isCopy() || KillMI->isCopyLike()) + MachineInstr *KillMI = LV->getVarInfo(Reg).findKill(MBB); + if (!KillMI || MI == KillMI || KillMI->isCopy() || KillMI->isCopyLike()) // Don't mess with copies, they may be coalesced later. return false; @@ -1093,6 +1075,8 @@ TwoAddressInstructionPass::RescheduleKillAboveMI(MachineBasicBlock *MBB, continue; if (isDefTooClose(MOReg, DI->second, MI, MBB)) return false; + if (MOReg == Reg && !MO.isKill()) + return false; Uses.insert(MOReg); if (MO.isKill() && MOReg != Reg) Kills.insert(MOReg); @@ -1134,6 +1118,9 @@ TwoAddressInstructionPass::RescheduleKillAboveMI(MachineBasicBlock *MBB, if (Kills.count(MOReg)) // Don't want to extend other live ranges and update kills. return false; + if (OtherMI != MI && MOReg == Reg && !MO.isKill()) + // We can't schedule across a use of the register in question. + return false; } else { OtherDefs.push_back(MOReg); } @@ -1164,19 +1151,11 @@ TwoAddressInstructionPass::RescheduleKillAboveMI(MachineBasicBlock *MBB, nmi = llvm::prior(InsertPos); // Backtrack so we process the moved instr. 
DistanceMap.erase(DI); - if (LV) { - // Update live variables - LV->removeVirtualRegisterKilled(Reg, KillMI); - LV->addVirtualRegisterKilled(Reg, MI); - } else { - for (unsigned i = 0, e = KillMI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = KillMI->getOperand(i); - if (!MO.isReg() || !MO.isUse() || MO.getReg() != Reg) - continue; - MO.setIsKill(false); - } - MI->addRegisterKilled(Reg, 0); - } + // Update live variables + LV->removeVirtualRegisterKilled(Reg, KillMI); + LV->addVirtualRegisterKilled(Reg, MI); + + DEBUG(dbgs() << "\trescheduled kill: " << *KillMI); return true; } @@ -1208,9 +1187,13 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi, if (!regBKilled && MI.getOperand(DstIdx).isDead() && DeleteUnusedInstr(mi, nmi, mbbi, Dist)) { ++NumDeletes; - return true; // Done with this instruction. + DEBUG(dbgs() << "\tdeleted unused instruction.\n"); + return true; // Done with this instruction. } + if (TargetRegisterInfo::isVirtualRegister(regA)) + ScanUses(regA, &*mbbi, Processed); + // Check if it is profitable to commute the operands. unsigned SrcOp1, SrcOp2; unsigned regC = 0; @@ -1230,7 +1213,7 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi, // If C dies but B does not, swap the B and C operands. // This makes the live ranges of A and C joinable. TryCommute = true; - else if (isProfitableToCommute(regB, regC, &MI, mbbi, Dist)) { + else if (isProfitableToCommute(regA, regB, regC, &MI, mbbi, Dist)) { TryCommute = true; AggressiveCommute = true; } @@ -1252,9 +1235,6 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi, return true; } - if (TargetRegisterInfo::isVirtualRegister(regA)) - ScanUses(regA, &*mbbi, Processed); - if (MI.isConvertibleTo3Addr()) { // This instruction is potentially convertible to a true // three-address instruction. Check if it is profitable. @@ -1298,7 +1278,8 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi, // Unfold the load. DEBUG(dbgs() << "2addr: UNFOLDING: " << MI); const TargetRegisterClass *RC = - TII->getRegClass(UnfoldMCID, LoadRegIndex, TRI); + TRI->getAllocatableClass( + TII->getRegClass(UnfoldMCID, LoadRegIndex, TRI, MF)); unsigned Reg = MRI->createVirtualRegister(RC); SmallVector<MachineInstr *, 2> NewMIs; if (!TII->unfoldMemoryOperand(MF, &MI, Reg, @@ -1454,31 +1435,50 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { "two address instruction invalid"); unsigned regB = mi->getOperand(SrcIdx).getReg(); + + // Deal with <undef> uses immediately - simply rewrite the src operand. + if (mi->getOperand(SrcIdx).isUndef()) { + unsigned DstReg = mi->getOperand(DstIdx).getReg(); + // Constrain the DstReg register class if required. + if (TargetRegisterInfo::isVirtualRegister(DstReg)) + if (const TargetRegisterClass *RC = TII->getRegClass(MCID, SrcIdx, + TRI, MF)) + MRI->constrainRegClass(DstReg, RC); + mi->getOperand(SrcIdx).setReg(DstReg); + DEBUG(dbgs() << "\t\trewrite undef:\t" << *mi); + continue; + } TiedOperands[regB].push_back(std::make_pair(SrcIdx, DstIdx)); } + // If the instruction has a single pair of tied operands, try some + // transformations that may either eliminate the tied operands or + // improve the opportunities for coalescing away the register copy.
+ if (TiedOperands.size() == 1) { + SmallVector<std::pair<unsigned, unsigned>, 4> &TiedPairs + = TiedOperands.begin()->second; + if (TiedPairs.size() == 1) { + unsigned SrcIdx = TiedPairs[0].first; + unsigned DstIdx = TiedPairs[0].second; + unsigned SrcReg = mi->getOperand(SrcIdx).getReg(); + unsigned DstReg = mi->getOperand(DstIdx).getReg(); + if (SrcReg != DstReg && + TryInstructionTransform(mi, nmi, mbbi, SrcIdx, DstIdx, Dist, + Processed)) { + // The tied operands have been eliminated or shifted further down the + // block to ease elimination. Continue processing with 'nmi'. + TiedOperands.clear(); + mi = nmi; + continue; + } + } + } + // Now iterate over the information collected above. for (TiedOperandMap::iterator OI = TiedOperands.begin(), OE = TiedOperands.end(); OI != OE; ++OI) { SmallVector<std::pair<unsigned, unsigned>, 4> &TiedPairs = OI->second; - // If the instruction has a single pair of tied operands, try some - // transformations that may either eliminate the tied operands or - // improve the opportunities for coalescing away the register copy. - if (TiedOperands.size() == 1 && TiedPairs.size() == 1) { - unsigned SrcIdx = TiedPairs[0].first; - unsigned DstIdx = TiedPairs[0].second; - - // If the registers are already equal, nothing needs to be done. - if (mi->getOperand(SrcIdx).getReg() == - mi->getOperand(DstIdx).getReg()) - break; // Done with this instruction. - - if (TryInstructionTransform(mi, nmi, mbbi, SrcIdx, DstIdx, Dist, - Processed)) - break; // The tied operands have been eliminated. - } - bool IsEarlyClobber = false; bool RemovedKillFlag = false; bool AllUsesCopied = true; @@ -1519,8 +1519,9 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { #endif // Emit a copy or rematerialize the definition. + bool isCopy = false; const TargetRegisterClass *rc = MRI->getRegClass(regB); - MachineInstr *DefMI = MRI->getVRegDef(regB); + MachineInstr *DefMI = MRI->getUniqueVRegDef(regB); // If it's safe and profitable, remat the definition instead of // copying it. if (DefMI && @@ -1535,10 +1536,11 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { } else { BuildMI(*mbbi, mi, mi->getDebugLoc(), TII->get(TargetOpcode::COPY), regA).addReg(regB); + isCopy = true; } - MachineBasicBlock::iterator prevMI = prior(mi); // Update DistanceMap. + MachineBasicBlock::iterator prevMI = prior(mi); DistanceMap.insert(std::make_pair(prevMI, Dist)); DistanceMap[mi] = ++Dist; @@ -1551,7 +1553,17 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { MO.setIsKill(false); RemovedKillFlag = true; } + + // Make sure regA is a legal regclass for the SrcIdx operand. + if (TargetRegisterInfo::isVirtualRegister(regA) && + TargetRegisterInfo::isVirtualRegister(regB)) + MRI->constrainRegClass(regA, MRI->getRegClass(regB)); + MO.setReg(regA); + + if (isCopy) + // Propagate SrcRegMap. + SrcRegMap[regA] = regB; } if (AllUsesCopied) { @@ -1587,27 +1599,32 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { } } - // Schedule the source copy / remat inserted to form two-address - // instruction. FIXME: Does it matter the distance map may not be - // accurate after it's scheduled? - TII->scheduleTwoAddrSource(prior(mi), mi, *TRI); + // We didn't change anything if there was a single tied pair, and that + // pair didn't require copies. + if (AllUsesCopied || TiedPairs.size() > 1) { + MadeChange = true; - MadeChange = true; + // Schedule the source copy / remat inserted to form two-address + // instruction. 
FIXME: Does it matter the distance map may not be + // accurate after it's scheduled? + TII->scheduleTwoAddrSource(prior(mi), mi, *TRI); + } DEBUG(dbgs() << "\t\trewrite to:\t" << *mi); + } - // Rewrite INSERT_SUBREG as COPY now that we no longer need SSA form. - if (mi->isInsertSubreg()) { - // From %reg = INSERT_SUBREG %reg, %subreg, subidx - // To %reg:subidx = COPY %subreg - unsigned SubIdx = mi->getOperand(3).getImm(); - mi->RemoveOperand(3); - assert(mi->getOperand(0).getSubReg() == 0 && "Unexpected subreg idx"); - mi->getOperand(0).setSubReg(SubIdx); - mi->RemoveOperand(1); - mi->setDesc(TII->get(TargetOpcode::COPY)); - DEBUG(dbgs() << "\t\tconvert to:\t" << *mi); - } + // Rewrite INSERT_SUBREG as COPY now that we no longer need SSA form. + if (mi->isInsertSubreg()) { + // From %reg = INSERT_SUBREG %reg, %subreg, subidx + // To %reg:subidx = COPY %subreg + unsigned SubIdx = mi->getOperand(3).getImm(); + mi->RemoveOperand(3); + assert(mi->getOperand(0).getSubReg() == 0 && "Unexpected subreg idx"); + mi->getOperand(0).setSubReg(SubIdx); + mi->getOperand(0).setIsUndef(mi->getOperand(1).isUndef()); + mi->RemoveOperand(1); + mi->setDesc(TII->get(TargetOpcode::COPY)); + DEBUG(dbgs() << "\t\tconvert to:\t" << *mi); } // Clear TiedOperands here instead of at the top of the loop @@ -1694,9 +1711,10 @@ TwoAddressInstructionPass::CoalesceExtSubRegs(SmallVector<unsigned,4> &Srcs, continue; // Check that the instructions are all in the same basic block. - MachineInstr *SrcDefMI = MRI->getVRegDef(SrcReg); - MachineInstr *DstDefMI = MRI->getVRegDef(DstReg); - if (SrcDefMI->getParent() != DstDefMI->getParent()) + MachineInstr *SrcDefMI = MRI->getUniqueVRegDef(SrcReg); + MachineInstr *DstDefMI = MRI->getUniqueVRegDef(DstReg); + if (!SrcDefMI || !DstDefMI || + SrcDefMI->getParent() != DstDefMI->getParent()) continue; // If there are no other uses than copies which feed into @@ -1832,6 +1850,11 @@ bool TwoAddressInstructionPass::EliminateRegSequences() { SmallVector<unsigned, 4> RealSrcs; SmallSet<unsigned, 4> Seen; for (unsigned i = 1, e = MI->getNumOperands(); i < e; i += 2) { + // Nothing needs to be inserted for <undef> operands. 
+ if (MI->getOperand(i).isUndef()) { + MI->getOperand(i).setReg(0); + continue; + } unsigned SrcReg = MI->getOperand(i).getReg(); unsigned SrcSubIdx = MI->getOperand(i).getSubReg(); unsigned SubIdx = MI->getOperand(i+1).getImm(); @@ -1841,7 +1864,7 @@ bool TwoAddressInstructionPass::EliminateRegSequences() { MachineInstr *DefMI = NULL; if (!MI->getOperand(i).getSubReg() && !TargetRegisterInfo::isPhysicalRegister(SrcReg)) { - DefMI = MRI->getVRegDef(SrcReg); + DefMI = MRI->getUniqueVRegDef(SrcReg); } if (DefMI && DefMI->isImplicitDef()) { diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp index 3bab93b..93840f0 100644 --- a/lib/CodeGen/VirtRegMap.cpp +++ b/lib/CodeGen/VirtRegMap.cpp @@ -18,12 +18,14 @@ #define DEBUG_TYPE "regalloc" #include "VirtRegMap.h" +#include "LiveDebugVariables.h" #include "llvm/Function.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" @@ -104,11 +106,149 @@ void VirtRegMap::assignVirt2StackSlot(unsigned virtReg, int SS) { Virt2StackSlotMap[virtReg] = SS; } -void VirtRegMap::rewrite(SlotIndexes *Indexes) { +void VirtRegMap::print(raw_ostream &OS, const Module*) const { + OS << "********** REGISTER MAP **********\n"; + for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { + unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + if (Virt2PhysMap[Reg] != (unsigned)VirtRegMap::NO_PHYS_REG) { + OS << '[' << PrintReg(Reg, TRI) << " -> " + << PrintReg(Virt2PhysMap[Reg], TRI) << "] " + << MRI->getRegClass(Reg)->getName() << "\n"; + } + } + + for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { + unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + if (Virt2StackSlotMap[Reg] != VirtRegMap::NO_STACK_SLOT) { + OS << '[' << PrintReg(Reg, TRI) << " -> fi#" << Virt2StackSlotMap[Reg] + << "] " << MRI->getRegClass(Reg)->getName() << "\n"; + } + } + OS << '\n'; +} + +void VirtRegMap::dump() const { + print(dbgs()); +} + +//===----------------------------------------------------------------------===// +// VirtRegRewriter +//===----------------------------------------------------------------------===// +// +// The VirtRegRewriter is the last of the register allocator passes. +// It rewrites virtual registers to physical registers as specified in the +// VirtRegMap analysis. It also updates live-in information on basic blocks +// according to LiveIntervals. 
+// +namespace { +class VirtRegRewriter : public MachineFunctionPass { + MachineFunction *MF; + const TargetMachine *TM; + const TargetRegisterInfo *TRI; + const TargetInstrInfo *TII; + MachineRegisterInfo *MRI; + SlotIndexes *Indexes; + LiveIntervals *LIS; + VirtRegMap *VRM; + + void rewrite(); + void addMBBLiveIns(); +public: + static char ID; + VirtRegRewriter() : MachineFunctionPass(ID) {} + + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + + virtual bool runOnMachineFunction(MachineFunction&); +}; +} // end anonymous namespace + +char &llvm::VirtRegRewriterID = VirtRegRewriter::ID; + +INITIALIZE_PASS_BEGIN(VirtRegRewriter, "virtregrewriter", + "Virtual Register Rewriter", false, false) +INITIALIZE_PASS_DEPENDENCY(SlotIndexes) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(LiveDebugVariables) +INITIALIZE_PASS_DEPENDENCY(VirtRegMap) +INITIALIZE_PASS_END(VirtRegRewriter, "virtregrewriter", + "Virtual Register Rewriter", false, false) + +char VirtRegRewriter::ID = 0; + +void VirtRegRewriter::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired<LiveIntervals>(); + AU.addRequired<SlotIndexes>(); + AU.addPreserved<SlotIndexes>(); + AU.addRequired<LiveDebugVariables>(); + AU.addRequired<VirtRegMap>(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +bool VirtRegRewriter::runOnMachineFunction(MachineFunction &fn) { + MF = &fn; + TM = &MF->getTarget(); + TRI = TM->getRegisterInfo(); + TII = TM->getInstrInfo(); + MRI = &MF->getRegInfo(); + Indexes = &getAnalysis<SlotIndexes>(); + LIS = &getAnalysis<LiveIntervals>(); + VRM = &getAnalysis<VirtRegMap>(); DEBUG(dbgs() << "********** REWRITE VIRTUAL REGISTERS **********\n" << "********** Function: " << MF->getFunction()->getName() << '\n'); - DEBUG(dump()); + DEBUG(VRM->dump()); + + // Add kill flags while we still have virtual registers. + LIS->addKillFlags(); + + // Live-in lists on basic blocks are required for physregs. + addMBBLiveIns(); + + // Rewrite virtual registers. + rewrite(); + + // Write out new DBG_VALUE instructions. + getAnalysis<LiveDebugVariables>().emitDebugValues(VRM); + + // All machine operands and other references to virtual registers have been + // replaced. Remove the virtual registers and release all the transient data. + VRM->clearAllVirt(); + MRI->clearVirtRegs(); + return true; +} + +// Compute MBB live-in lists from virtual register live ranges and their +// assignments. +void VirtRegRewriter::addMBBLiveIns() { + SmallVector<MachineBasicBlock*, 16> LiveIn; + for (unsigned Idx = 0, IdxE = MRI->getNumVirtRegs(); Idx != IdxE; ++Idx) { + unsigned VirtReg = TargetRegisterInfo::index2VirtReg(Idx); + if (MRI->reg_nodbg_empty(VirtReg)) + continue; + LiveInterval &LI = LIS->getInterval(VirtReg); + if (LI.empty() || LIS->intervalIsInOneMBB(LI)) + continue; + // This is a virtual register that is live across basic blocks. Its + // assigned PhysReg must be marked as live-in to those blocks. + unsigned PhysReg = VRM->getPhys(VirtReg); + assert(PhysReg != VirtRegMap::NO_PHYS_REG && "Unmapped virtual register."); + + // Scan the segments of LI. 
+ for (LiveInterval::const_iterator I = LI.begin(), E = LI.end(); I != E; + ++I) { + if (!Indexes->findLiveInMBBs(I->start, I->end, LiveIn)) + continue; + for (unsigned i = 0, e = LiveIn.size(); i != e; ++i) + if (!LiveIn[i]->isLiveIn(PhysReg)) + LiveIn[i]->addLiveIn(PhysReg); + LiveIn.clear(); + } + } +} + +void VirtRegRewriter::rewrite() { SmallVector<unsigned, 8> SuperDeads; SmallVector<unsigned, 8> SuperDefs; SmallVector<unsigned, 8> SuperKills; @@ -135,8 +275,9 @@ void VirtRegMap::rewrite(SlotIndexes *Indexes) { if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg())) continue; unsigned VirtReg = MO.getReg(); - unsigned PhysReg = getPhys(VirtReg); - assert(PhysReg != NO_PHYS_REG && "Instruction uses unmapped VirtReg"); + unsigned PhysReg = VRM->getPhys(VirtReg); + assert(PhysReg != VirtRegMap::NO_PHYS_REG && + "Instruction uses unmapped VirtReg"); assert(!Reserved.test(PhysReg) && "Reserved register assignment"); // Preserve semantics of sub-register operands. @@ -207,31 +348,3 @@ void VirtRegMap::rewrite(SlotIndexes *Indexes) { if (!MRI->reg_nodbg_empty(Reg)) MRI->setPhysRegUsed(Reg); } - -void VirtRegMap::print(raw_ostream &OS, const Module* M) const { - const TargetRegisterInfo* TRI = MF->getTarget().getRegisterInfo(); - const MachineRegisterInfo &MRI = MF->getRegInfo(); - - OS << "********** REGISTER MAP **********\n"; - for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(i); - if (Virt2PhysMap[Reg] != (unsigned)VirtRegMap::NO_PHYS_REG) { - OS << '[' << PrintReg(Reg, TRI) << " -> " - << PrintReg(Virt2PhysMap[Reg], TRI) << "] " - << MRI.getRegClass(Reg)->getName() << "\n"; - } - } - - for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(i); - if (Virt2StackSlotMap[Reg] != VirtRegMap::NO_STACK_SLOT) { - OS << '[' << PrintReg(Reg, TRI) << " -> fi#" << Virt2StackSlotMap[Reg] - << "] " << MRI.getRegClass(Reg)->getName() << "\n"; - } - } - OS << '\n'; -} - -void VirtRegMap::dump() const { - print(dbgs()); -} diff --git a/lib/CodeGen/VirtRegMap.h b/lib/CodeGen/VirtRegMap.h index 8cac311..c320985 100644 --- a/lib/CodeGen/VirtRegMap.h +++ b/lib/CodeGen/VirtRegMap.h @@ -177,13 +177,6 @@ namespace llvm { /// the specified stack slot void assignVirt2StackSlot(unsigned virtReg, int frameIndex); - /// rewrite - Rewrite all instructions in MF to use only physical registers - /// by mapping all virtual register operands to their assigned physical - /// registers. - /// - /// @param Indexes Optionally remove deleted instructions from indexes. 
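Aside: stripped of the machine-IR plumbing, the rewrite step in VirtRegRewriter::rewrite() above is a single map lookup per register operand. A toy, self-contained version with stand-in types (not the LLVM classes):

  #include <cassert>
  #include <vector>

  // Stand-ins for MachineOperand and VirtRegMap; illustrative only.
  struct Operand { bool IsVirt; unsigned Reg; };
  struct VRMapLike {
    std::vector<unsigned> Virt2Phys;              // 0 means "unmapped"
    unsigned getPhys(unsigned V) const { return Virt2Phys[V]; }
  };

  void rewriteOperands(std::vector<Operand> &Ops, const VRMapLike &VRM) {
    for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
      if (!Ops[i].IsVirt)
        continue;                                 // physical regs stay put
      unsigned Phys = VRM.getPhys(Ops[i].Reg);
      assert(Phys != 0 && "Instruction uses unmapped VirtReg");
      Ops[i].Reg = Phys;                          // rewrite virt -> phys
      Ops[i].IsVirt = false;
    }
  }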
- void rewrite(SlotIndexes *Indexes); - void print(raw_ostream &OS, const Module* M = 0) const; void dump() const; }; diff --git a/lib/DebugInfo/DWARFCompileUnit.cpp b/lib/DebugInfo/DWARFCompileUnit.cpp index 24bf97f..b27d57b 100644 --- a/lib/DebugInfo/DWARFCompileUnit.cpp +++ b/lib/DebugInfo/DWARFCompileUnit.cpp @@ -82,7 +82,7 @@ void DWARFCompileUnit::clear() { Abbrevs = 0; AddrSize = 0; BaseAddr = 0; - DieArray.clear(); + clearDIEs(false); } void DWARFCompileUnit::dump(raw_ostream &OS) { @@ -97,6 +97,13 @@ void DWARFCompileUnit::dump(raw_ostream &OS) { getCompileUnitDIE(false)->dump(OS, this, -1U); } +const char *DWARFCompileUnit::getCompilationDir() { + extractDIEsIfNeeded(true); + if (DieArray.empty()) + return 0; + return DieArray[0].getAttributeValueAsString(this, DW_AT_comp_dir, 0); +} + void DWARFCompileUnit::setDIERelations() { if (DieArray.empty()) return; @@ -201,7 +208,7 @@ size_t DWARFCompileUnit::extractDIEsIfNeeded(bool cu_die_only) { } void DWARFCompileUnit::clearDIEs(bool keep_compile_unit_die) { - if (DieArray.size() > 1) { + if (DieArray.size() > (unsigned)keep_compile_unit_die) { // std::vectors never get any smaller when resized to a smaller size, // or when clear() or erase() are called, the size will report that it // is smaller, but the memory allocated remains intact (call capacity() @@ -227,8 +234,8 @@ DWARFCompileUnit::buildAddressRangeTable(DWARFDebugAranges *debug_aranges, // all compile units to stay loaded when they weren't needed. So we can end // up parsing the DWARF and then throwing them all away to keep memory usage // down. - const bool clear_dies = extractDIEsIfNeeded(false) > 1; - + const bool clear_dies = extractDIEsIfNeeded(false) > 1 && + clear_dies_if_already_not_parsed; DieArray[0].buildAddressRangeTable(this, debug_aranges); // Keep memory down by clearing DIEs if this generate function @@ -236,3 +243,13 @@ DWARFCompileUnit::buildAddressRangeTable(DWARFDebugAranges *debug_aranges, if (clear_dies) clearDIEs(true); } + +const DWARFDebugInfoEntryMinimal* +DWARFCompileUnit::getFunctionDIEForAddress(int64_t address) { + extractDIEsIfNeeded(false); + for (size_t i = 0, n = DieArray.size(); i != n; i++) { + if (DieArray[i].addressRangeContainsAddress(this, address)) + return &DieArray[i]; + } + return 0; +} diff --git a/lib/DebugInfo/DWARFCompileUnit.h b/lib/DebugInfo/DWARFCompileUnit.h index d916729..b34a596 100644 --- a/lib/DebugInfo/DWARFCompileUnit.h +++ b/lib/DebugInfo/DWARFCompileUnit.h @@ -43,7 +43,7 @@ public: const DWARFAbbreviationDeclarationSet *abbrevs); /// extractDIEsIfNeeded - Parses a compile unit and indexes its DIEs if it - /// hasn't already been done. + /// hasn't already been done. Returns the number of DIEs parsed at this call. size_t extractDIEsIfNeeded(bool cu_die_only); void clear(); void dump(raw_ostream &OS); @@ -78,6 +78,8 @@ public: return &DieArray[0]; } + const char *getCompilationDir(); + /// setDIERelations - We read in all of the DIE entries into our flat list /// of DIE entries and now we need to go back through all of them and set the /// parent, sibling and child pointers for quick DIE navigation. @@ -104,6 +106,11 @@ public: void buildAddressRangeTable(DWARFDebugAranges *debug_aranges, bool clear_dies_if_already_not_parsed); + /// getFunctionDIEForAddress - Returns pointer to parsed subprogram DIE, + /// address ranges of which contain the provided address, + /// or NULL if there is no such subprogram. The pointer + /// is valid until DWARFCompileUnit::clear() or clearDIEs() is called. 
+ const DWARFDebugInfoEntryMinimal *getFunctionDIEForAddress(int64_t address); }; } diff --git a/lib/DebugInfo/DWARFContext.cpp b/lib/DebugInfo/DWARFContext.cpp index dccadc4..a4e0d8e 100644 --- a/lib/DebugInfo/DWARFContext.cpp +++ b/lib/DebugInfo/DWARFContext.cpp @@ -8,8 +8,10 @@ //===----------------------------------------------------------------------===// #include "DWARFContext.h" +#include "llvm/ADT/SmallString.h" #include "llvm/Support/Dwarf.h" #include "llvm/Support/Format.h" +#include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> using namespace llvm; @@ -140,30 +142,66 @@ DWARFCompileUnit *DWARFContext::getCompileUnitForOffset(uint32_t offset) { return 0; } -DILineInfo DWARFContext::getLineInfoForAddress(uint64_t address) { +DILineInfo DWARFContext::getLineInfoForAddress(uint64_t address, + DILineInfoSpecifier specifier) { // First, get the offset of the compile unit. uint32_t cuOffset = getDebugAranges()->findAddress(address); // Retrieve the compile unit. DWARFCompileUnit *cu = getCompileUnitForOffset(cuOffset); if (!cu) - return DILineInfo("<invalid>", 0, 0); - // Get the line table for this compile unit. - const DWARFDebugLine::LineTable *lineTable = getLineTableForCompileUnit(cu); - if (!lineTable) - return DILineInfo("<invalid>", 0, 0); - // Get the index of the row we're looking for in the line table. - uint64_t hiPC = - cu->getCompileUnitDIE()->getAttributeValueAsUnsigned(cu, DW_AT_high_pc, - -1ULL); - uint32_t rowIndex = lineTable->lookupAddress(address, hiPC); - if (rowIndex == -1U) - return DILineInfo("<invalid>", 0, 0); - - // From here, contruct the DILineInfo. - const DWARFDebugLine::Row &row = lineTable->Rows[rowIndex]; - const std::string &fileName = lineTable->Prologue.FileNames[row.File-1].Name; - - return DILineInfo(fileName.c_str(), row.Line, row.Column); + return DILineInfo(); + SmallString<16> fileName("<invalid>"); + SmallString<16> functionName("<invalid>"); + uint32_t line = 0; + uint32_t column = 0; + if (specifier.needs(DILineInfoSpecifier::FunctionName)) { + const DWARFDebugInfoEntryMinimal *function_die = + cu->getFunctionDIEForAddress(address); + if (function_die) { + if (const char *name = function_die->getSubprogramName(cu)) + functionName = name; + } + } + if (specifier.needs(DILineInfoSpecifier::FileLineInfo)) { + // Get the line table for this compile unit. + const DWARFDebugLine::LineTable *lineTable = getLineTableForCompileUnit(cu); + if (lineTable) { + // Get the index of the row we're looking for in the line table. + uint64_t hiPC = cu->getCompileUnitDIE()->getAttributeValueAsUnsigned( + cu, DW_AT_high_pc, -1ULL); + uint32_t rowIndex = lineTable->lookupAddress(address, hiPC); + if (rowIndex != -1U) { + const DWARFDebugLine::Row &row = lineTable->Rows[rowIndex]; + // Take file/line info from the line table. + const DWARFDebugLine::FileNameEntry &fileNameEntry = + lineTable->Prologue.FileNames[row.File - 1]; + fileName = fileNameEntry.Name; + if (specifier.needs(DILineInfoSpecifier::AbsoluteFilePath) && + sys::path::is_relative(fileName.str())) { + // Append include directory of file (if it is present in line table) + // and compilation directory of compile unit to make path absolute. 
+ const char *includeDir = 0; + if (uint64_t includeDirIndex = fileNameEntry.DirIdx) { + includeDir = lineTable->Prologue + .IncludeDirectories[includeDirIndex - 1]; + } + SmallString<16> absFileName; + if (includeDir == 0 || sys::path::is_relative(includeDir)) { + if (const char *compilationDir = cu->getCompilationDir()) + sys::path::append(absFileName, compilationDir); + } + if (includeDir) { + sys::path::append(absFileName, includeDir); + } + sys::path::append(absFileName, fileName.str()); + fileName = absFileName; + } + line = row.Line; + column = row.Column; + } + } + } + return DILineInfo(fileName, functionName, line, column); } void DWARFContextInMemory::anchor() { } diff --git a/lib/DebugInfo/DWARFContext.h b/lib/DebugInfo/DWARFContext.h index d2e763a..e55a27e 100644 --- a/lib/DebugInfo/DWARFContext.h +++ b/lib/DebugInfo/DWARFContext.h @@ -66,7 +66,8 @@ public: const DWARFDebugLine::LineTable * getLineTableForCompileUnit(DWARFCompileUnit *cu); - virtual DILineInfo getLineInfoForAddress(uint64_t address); + virtual DILineInfo getLineInfoForAddress(uint64_t address, + DILineInfoSpecifier specifier = DILineInfoSpecifier()); bool isLittleEndian() const { return IsLittleEndian; } diff --git a/lib/DebugInfo/DWARFDebugAranges.cpp b/lib/DebugInfo/DWARFDebugAranges.cpp index 1788145..ef470e5 100644 --- a/lib/DebugInfo/DWARFDebugAranges.cpp +++ b/lib/DebugInfo/DWARFDebugAranges.cpp @@ -93,6 +93,7 @@ bool DWARFDebugAranges::generate(DWARFContext *ctx) { cu->buildAddressRangeTable(this, true); } } + sort(true, /* overlap size */ 0); return !isEmpty(); } @@ -221,4 +222,3 @@ bool DWARFDebugAranges::getMaxRange(uint64_t &LoPC, uint64_t &HiPC) const { HiPC = Aranges.back().HiPC(); return true; } - diff --git a/lib/DebugInfo/DWARFDebugInfoEntry.cpp b/lib/DebugInfo/DWARFDebugInfoEntry.cpp index 236db97..429a36c 100644 --- a/lib/DebugInfo/DWARFDebugInfoEntry.cpp +++ b/lib/DebugInfo/DWARFDebugInfoEntry.cpp @@ -440,3 +440,54 @@ DWARFDebugInfoEntryMinimal::buildAddressRangeTable(const DWARFCompileUnit *cu, } } } + +bool +DWARFDebugInfoEntryMinimal::addressRangeContainsAddress( + const DWARFCompileUnit *cu, const uint64_t address) const { + if (!isNULL() && getTag() == DW_TAG_subprogram) { + uint64_t hi_pc = -1ULL; + uint64_t lo_pc = getAttributeValueAsUnsigned(cu, DW_AT_low_pc, -1ULL); + if (lo_pc != -1ULL) + hi_pc = getAttributeValueAsUnsigned(cu, DW_AT_high_pc, -1ULL); + if (hi_pc != -1ULL) { + return (lo_pc <= address && address < hi_pc); + } + } + return false; +} + +const char* +DWARFDebugInfoEntryMinimal::getSubprogramName( + const DWARFCompileUnit *cu) const { + if (isNULL() || getTag() != DW_TAG_subprogram) + return 0; + // Try to get mangled name if possible. + if (const char *name = + getAttributeValueAsString(cu, DW_AT_MIPS_linkage_name, 0)) + return name; + if (const char *name = getAttributeValueAsString(cu, DW_AT_linkage_name, 0)) + return name; + if (const char *name = getAttributeValueAsString(cu, DW_AT_name, 0)) + return name; + // Try to get name from specification DIE. + uint32_t spec_ref = + getAttributeValueAsReference(cu, DW_AT_specification, -1U); + if (spec_ref != -1U) { + DWARFDebugInfoEntryMinimal spec_die; + if (spec_die.extract(cu, &spec_ref)) { + if (const char *name = spec_die.getSubprogramName(cu)) + return name; + } + } + // Try to get name from abstract origin DIE. 
+ uint32_t abs_origin_ref = + getAttributeValueAsReference(cu, DW_AT_abstract_origin, -1U); + if (abs_origin_ref != -1U) { + DWARFDebugInfoEntryMinimal abs_origin_die; + if (abs_origin_die.extract(cu, &abs_origin_ref)) { + if (const char *name = abs_origin_die.getSubprogramName(cu)) + return name; + } + } + return 0; +} diff --git a/lib/DebugInfo/DWARFDebugInfoEntry.h b/lib/DebugInfo/DWARFDebugInfoEntry.h index 37b3bcd..d5d86b9 100644 --- a/lib/DebugInfo/DWARFDebugInfoEntry.h +++ b/lib/DebugInfo/DWARFDebugInfoEntry.h @@ -128,6 +128,15 @@ public: void buildAddressRangeTable(const DWARFCompileUnit *cu, DWARFDebugAranges *debug_aranges) const; + + bool addressRangeContainsAddress(const DWARFCompileUnit *cu, + const uint64_t address) const; + + // If a DIE represents a subprogram, returns its mangled name + // (or short name, if mangled is missing). This name may be fetched + // from specification or abstract origin for this subprogram. + // Returns null if no name is found. + const char* getSubprogramName(const DWARFCompileUnit *cu) const; }; } diff --git a/lib/DebugInfo/DWARFDebugLine.h b/lib/DebugInfo/DWARFDebugLine.h index bc6a70b..a8c0669 100644 --- a/lib/DebugInfo/DWARFDebugLine.h +++ b/lib/DebugInfo/DWARFDebugLine.h @@ -12,7 +12,6 @@ #include "llvm/Support/DataExtractor.h" #include <map> -#include <string> #include <vector> namespace llvm { @@ -22,9 +21,9 @@ class raw_ostream; class DWARFDebugLine { public: struct FileNameEntry { - FileNameEntry() : DirIdx(0), ModTime(0), Length(0) {} + FileNameEntry() : Name(0), DirIdx(0), ModTime(0), Length(0) {} - std::string Name; + const char *Name; uint64_t DirIdx; uint64_t ModTime; uint64_t Length; @@ -56,7 +55,7 @@ public: // The number assigned to the first special opcode. uint8_t OpcodeBase; std::vector<uint8_t> StandardOpcodeLengths; - std::vector<std::string> IncludeDirectories; + std::vector<const char*> IncludeDirectories; std::vector<FileNameEntry> FileNames; // Length of the prologue in bytes. 
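Aside: getSubprogramName() above encodes a fixed preference order when naming a subprogram DIE. A self-contained sketch over a hypothetical DIE struct (the real code resolves DWARF attribute references rather than following pointers):

  struct DIE {
    const char *LinkageName;   // DW_AT_MIPS_linkage_name / DW_AT_linkage_name
    const char *Name;          // DW_AT_name
    const DIE *Specification;  // DW_AT_specification
    const DIE *AbstractOrigin; // DW_AT_abstract_origin
  };

  const char *subprogramName(const DIE *D) {
    if (!D) return 0;
    if (D->LinkageName) return D->LinkageName;    // prefer the mangled name
    if (D->Name) return D->Name;                  // then the short name
    if (const char *N = subprogramName(D->Specification))
      return N;                                   // then the declaration DIE
    return subprogramName(D->AbstractOrigin);     // then the inlined origin
  }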
diff --git a/lib/ExecutionEngine/EventListenerCommon.h b/lib/ExecutionEngine/EventListenerCommon.h index 1c07c94..911d1d6 100644 --- a/lib/ExecutionEngine/EventListenerCommon.h +++ b/lib/ExecutionEngine/EventListenerCommon.h @@ -14,8 +14,8 @@ #ifndef EVENT_LISTENER_COMMON_H #define EVENT_LISTENER_COMMON_H +#include "llvm/DebugInfo.h" #include "llvm/Metadata.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/ADT/DenseMap.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/Path.h" diff --git a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp index 5dfa78f..c11c17e 100644 --- a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp +++ b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp @@ -16,11 +16,11 @@ #include "llvm/ExecutionEngine/JITEventListener.h" #define DEBUG_TYPE "amplifier-jit-event-listener" +#include "llvm/DebugInfo.h" #include "llvm/Function.h" #include "llvm/Metadata.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/OwningPtr.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/ExecutionEngine/IntelJITEventsWrapper.h" #include "llvm/Support/Debug.h" @@ -138,7 +138,7 @@ void IntelJITEventListener::NotifyFunctionEmitted( // the first instruction that has one if (FunctionMessage.source_file_name == 0) { MDNode *scope = I->Loc.getScope( - Details.MF->getFunction()->getContext()); + Details.MF->getFunction()->getContext()); FunctionMessage.source_file_name = const_cast<char*>( Filenames.getFullPath(scope)); } @@ -152,7 +152,7 @@ void IntelJITEventListener::NotifyFunctionEmitted( } Wrapper.iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, - &FunctionMessage); + &FunctionMessage); MethodIDs[FnStart] = FunctionMessage.method_id; } diff --git a/lib/ExecutionEngine/IntelJITEvents/LLVMBuild.txt b/lib/ExecutionEngine/IntelJITEvents/LLVMBuild.txt index 80d2273..9c06fda 100644 --- a/lib/ExecutionEngine/IntelJITEvents/LLVMBuild.txt +++ b/lib/ExecutionEngine/IntelJITEvents/LLVMBuild.txt @@ -18,6 +18,6 @@ [common] [component_0] -type = Library +type = OptionalLibrary name = IntelJITEvents parent = ExecutionEngine diff --git a/lib/ExecutionEngine/Interpreter/CMakeLists.txt b/lib/ExecutionEngine/Interpreter/CMakeLists.txt index d331f83..74df8f0 100644 --- a/lib/ExecutionEngine/Interpreter/CMakeLists.txt +++ b/lib/ExecutionEngine/Interpreter/CMakeLists.txt @@ -15,3 +15,5 @@ add_llvm_library(LLVMInterpreter if( LLVM_ENABLE_FFI ) target_link_libraries( LLVMInterpreter ${FFI_LIBRARY_PATH} ) endif() + +add_dependencies(LLVMInterpreter intrinsics_gen) diff --git a/lib/ExecutionEngine/Interpreter/Execution.cpp b/lib/ExecutionEngine/Interpreter/Execution.cpp index af47be9..5202b09 100644 --- a/lib/ExecutionEngine/Interpreter/Execution.cpp +++ b/lib/ExecutionEngine/Interpreter/Execution.cpp @@ -651,11 +651,40 @@ void Interpreter::visitSwitchInst(SwitchInst &I) { // Check to see if any of the cases match... BasicBlock *Dest = 0; for (SwitchInst::CaseIt i = I.case_begin(), e = I.case_end(); i != e; ++i) { - GenericValue CaseVal = getOperandValue(i.getCaseValue(), SF); - if (executeICMP_EQ(CondVal, CaseVal, ElTy).IntVal != 0) { - Dest = cast<BasicBlock>(i.getCaseSuccessor()); - break; + IntegersSubset& Case = i.getCaseValueEx(); + if (Case.isSingleNumber()) { + // FIXME: Currently work with ConstantInt based numbers. 
+ const ConstantInt *CI = Case.getSingleNumber(0).toConstantInt(); + GenericValue Val = getOperandValue(const_cast<ConstantInt*>(CI), SF); + if (executeICMP_EQ(Val, CondVal, ElTy).IntVal != 0) { + Dest = cast<BasicBlock>(i.getCaseSuccessor()); + break; + } } + if (Case.isSingleNumbersOnly()) { + for (unsigned n = 0, en = Case.getNumItems(); n != en; ++n) { + // FIXME: Currently work with ConstantInt based numbers. + const ConstantInt *CI = Case.getSingleNumber(n).toConstantInt(); + GenericValue Val = getOperandValue(const_cast<ConstantInt*>(CI), SF); + if (executeICMP_EQ(Val, CondVal, ElTy).IntVal != 0) { + Dest = cast<BasicBlock>(i.getCaseSuccessor()); + break; + } + } + } else + for (unsigned n = 0, en = Case.getNumItems(); n != en; ++n) { + IntegersSubset::Range r = Case.getItem(n); + // FIXME: Currently work with ConstantInt based numbers. + const ConstantInt *LowCI = r.getLow().toConstantInt(); + const ConstantInt *HighCI = r.getHigh().toConstantInt(); + GenericValue Low = getOperandValue(const_cast<ConstantInt*>(LowCI), SF); + GenericValue High = getOperandValue(const_cast<ConstantInt*>(HighCI), SF); + if (executeICMP_ULE(Low, CondVal, ElTy).IntVal != 0 && + executeICMP_ULE(CondVal, High, ElTy).IntVal != 0) { + Dest = cast<BasicBlock>(i.getCaseSuccessor()); + break; + } + } } if (!Dest) Dest = I.getDefaultDest(); // No cases matched: use default SwitchToNewBasicBlock(Dest, SF); diff --git a/lib/ExecutionEngine/JIT/JITEmitter.cpp b/lib/ExecutionEngine/JIT/JITEmitter.cpp index 504c8bd..ff3a9dc 100644 --- a/lib/ExecutionEngine/JIT/JITEmitter.cpp +++ b/lib/ExecutionEngine/JIT/JITEmitter.cpp @@ -17,9 +17,9 @@ #include "JITDwarfEmitter.h" #include "llvm/ADT/OwningPtr.h" #include "llvm/Constants.h" -#include "llvm/Module.h" +#include "llvm/DebugInfo.h" #include "llvm/DerivedTypes.h" -#include "llvm/Analysis/DebugInfo.h" +#include "llvm/Module.h" #include "llvm/CodeGen/JITCodeEmitter.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineCodeInfo.h" @@ -108,13 +108,18 @@ namespace { /// particular GlobalVariable so that we can reuse them if necessary. GlobalToIndirectSymMapTy GlobalToIndirectSymMap; +#ifndef NDEBUG /// Instance of the JIT this ResolverState serves. JIT *TheJIT; +#endif public: JITResolverState(JIT *jit) : FunctionToLazyStubMap(this), - FunctionToCallSitesMap(this), - TheJIT(jit) {} + FunctionToCallSitesMap(this) { +#ifndef NDEBUG + TheJIT = jit; +#endif + } FunctionToLazyStubMapTy& getFunctionToLazyStubMap( const MutexGuard& locked) { diff --git a/lib/ExecutionEngine/JIT/JITMemoryManager.cpp b/lib/ExecutionEngine/JIT/JITMemoryManager.cpp index 2d1775c..7be6ef8 100644 --- a/lib/ExecutionEngine/JIT/JITMemoryManager.cpp +++ b/lib/ExecutionEngine/JIT/JITMemoryManager.cpp @@ -852,7 +852,7 @@ static int jit_noop() { /// for resolving library symbols, not code generated symbols. /// void *DefaultJITMemoryManager::getPointerToNamedFunction(const std::string &Name, - bool AbortOnFailure) { + bool AbortOnFailure) { // Check to see if this is one of the functions we want to intercept. Note, // we cast to intptr_t here to silence a -pedantic warning that complains // about casting a function pointer to a normal pointer. diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/lib/ExecutionEngine/MCJIT/MCJIT.cpp index 44f89cf..84274c0 100644 --- a/lib/ExecutionEngine/MCJIT/MCJIT.cpp +++ b/lib/ExecutionEngine/MCJIT/MCJIT.cpp @@ -45,7 +45,7 @@ ExecutionEngine *MCJIT::createJIT(Module *M, // If the target supports JIT code generation, create the JIT. 
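// Ownership note (inferred from the MCJITMemoryManager.h change further
// down, not stated explicitly in the patch): the MCJITMemoryManager
// constructed below takes sole ownership of the caller-supplied
// JITMemoryManager via OwningPtr, so clients must not delete the JMM they
// pass in.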
if (TargetJITInfo *TJ = TM->getJITInfo()) - return new MCJIT(M, TM, *TJ, new MCJITMemoryManager(JMM, M), GVsWithCode); + return new MCJIT(M, TM, *TJ, new MCJITMemoryManager(JMM), GVsWithCode); if (ErrorStr) *ErrorStr = "target does not support JIT code generation"; @@ -217,7 +217,7 @@ GenericValue MCJIT::runFunction(Function *F, } void *MCJIT::getPointerToNamedFunction(const std::string &Name, - bool AbortOnFailure){ + bool AbortOnFailure) { if (!isSymbolSearchingDisabled() && MemMgr) { void *ptr = MemMgr->getPointerToNamedFunction(Name, false); if (ptr) @@ -231,7 +231,7 @@ void *MCJIT::getPointerToNamedFunction(const std::string &Name, if (AbortOnFailure) { report_fatal_error("Program used external function '"+Name+ - "' which could not be resolved!"); + "' which could not be resolved!"); } return 0; } diff --git a/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h b/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h index a68949a..441aaeb 100644 --- a/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h +++ b/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h @@ -22,24 +22,20 @@ namespace llvm { // matching LLVM IR counterparts in the module(s) being compiled. class MCJITMemoryManager : public RTDyldMemoryManager { virtual void anchor(); - JITMemoryManager *JMM; + OwningPtr<JITMemoryManager> JMM; - // FIXME: Multiple modules. - Module *M; public: - MCJITMemoryManager(JITMemoryManager *jmm, Module *m) : - JMM(jmm?jmm:JITMemoryManager::CreateDefaultMemManager()), M(m) {} - // We own the JMM, so make sure to delete it. - ~MCJITMemoryManager() { delete JMM; } + MCJITMemoryManager(JITMemoryManager *jmm) : + JMM(jmm?jmm:JITMemoryManager::CreateDefaultMemManager()) {} uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment, unsigned SectionID) { - return JMM->allocateSpace(Size, Alignment); + return JMM->allocateDataSection(Size, Alignment, SectionID); } uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment, unsigned SectionID) { - return JMM->allocateSpace(Size, Alignment); + return JMM->allocateCodeSection(Size, Alignment, SectionID); } virtual void *getPointerToNamedFunction(const std::string &Name, diff --git a/lib/ExecutionEngine/OProfileJIT/LLVMBuild.txt b/lib/ExecutionEngine/OProfileJIT/LLVMBuild.txt index 4516dfa..e30516e 100644 --- a/lib/ExecutionEngine/OProfileJIT/LLVMBuild.txt +++ b/lib/ExecutionEngine/OProfileJIT/LLVMBuild.txt @@ -18,6 +18,6 @@ [common] [component_0] -type = Library +type = OptionalLibrary name = OProfileJIT parent = ExecutionEngine diff --git a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp index e6142e3..6b8e9d1 100644 --- a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp +++ b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp @@ -16,9 +16,9 @@ #include "llvm/ExecutionEngine/JITEventListener.h" #define DEBUG_TYPE "oprofile-jit-event-listener" +#include "llvm/DebugInfo.h" #include "llvm/Function.h" #include "llvm/ADT/OwningPtr.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/ExecutionEngine/OProfileWrapper.h" #include "llvm/Support/Debug.h" diff --git a/lib/ExecutionEngine/RuntimeDyld/ObjectImage.h b/lib/ExecutionEngine/RuntimeDyld/ObjectImage.h index 8206ead..c3e3572 100644 --- a/lib/ExecutionEngine/RuntimeDyld/ObjectImage.h +++ b/lib/ExecutionEngine/RuntimeDyld/ObjectImage.h @@ -48,7 +48,7 @@ public: virtual void updateSymbolAddress(const object::SymbolRef &Sym, uint64_t Addr) {} - // Subclasses can override this 
method to provide JIT debugging support + // Subclasses can override these methods to provide JIT debugging support virtual void registerWithDebugger() {} virtual void deregisterWithDebugger() {} }; diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp index 1b1840a..b464040 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -39,7 +39,7 @@ namespace { // Resolve the relocations for all symbols we currently know about. void RuntimeDyldImpl::resolveRelocations() { // First, resolve relocations associated with external symbols. - resolveSymbols(); + resolveExternalSymbols(); // Just iterate over the sections we have and resolve all the relocations // in them. Gross overkill, but it gets the job done. @@ -59,8 +59,8 @@ void RuntimeDyldImpl::mapSectionAddress(void *LocalAddress, llvm_unreachable("Attempting to remap address of unknown section!"); } -// Subclasses can implement this method to create specialized image instances -// The caller owns the the pointer that is returned. +// Subclasses can implement this method to create specialized image instances. +// The caller owns the pointer that is returned. ObjectImage *RuntimeDyldImpl::createObjectImage(const MemoryBuffer *InputBuffer) { ObjectFile *ObjFile = ObjectFile::createObjectFile(const_cast<MemoryBuffer*> (InputBuffer)); @@ -75,11 +75,15 @@ bool RuntimeDyldImpl::loadObject(const MemoryBuffer *InputBuffer) { Arch = (Triple::ArchType)obj->getArch(); - LocalSymbolMap LocalSymbols; // Functions and data symbols from the - // object file. - ObjSectionToIDMap LocalSections; // Used sections from the object file - CommonSymbolMap CommonSymbols; // Common symbols requiring allocation - uint64_t CommonSize = 0; + // Symbols found in this object + StringMap<SymbolLoc> LocalSymbols; + // Used sections from the object file + ObjSectionToIDMap LocalSections; + + // Common symbols requiring allocation, and the total size required to + // allocate all common symbols. 
+ CommonSymbolMap CommonSymbols; + uint64_t CommonSize = 0; error_code err; // Parse symbols @@ -106,28 +110,29 @@ bool RuntimeDyldImpl::loadObject(const MemoryBuffer *InputBuffer) { if (SymType == object::SymbolRef::ST_Function || SymType == object::SymbolRef::ST_Data) { uint64_t FileOffset; - StringRef sData; + StringRef SectionData; section_iterator si = obj->end_sections(); Check(i->getFileOffset(FileOffset)); Check(i->getSection(si)); if (si == obj->end_sections()) continue; - Check(si->getContents(sData)); + Check(si->getContents(SectionData)); const uint8_t* SymPtr = (const uint8_t*)InputBuffer->getBufferStart() + (uintptr_t)FileOffset; - uintptr_t SectOffset = (uintptr_t)(SymPtr - (const uint8_t*)sData.begin()); + uintptr_t SectOffset = (uintptr_t)(SymPtr - + (const uint8_t*)SectionData.begin()); unsigned SectionID = findOrEmitSection(*obj, *si, SymType == object::SymbolRef::ST_Function, LocalSections); - bool isGlobal = flags & SymbolRef::SF_Global; LocalSymbols[Name.data()] = SymbolLoc(SectionID, SectOffset); DEBUG(dbgs() << "\tFileOffset: " << format("%p", (uintptr_t)FileOffset) << " flags: " << flags << " SID: " << SectionID << " Offset: " << format("%p", SectOffset)); + bool isGlobal = flags & SymbolRef::SF_Global; if (isGlobal) - SymbolTable[Name] = SymbolLoc(SectionID, SectOffset); + GlobalSymbolTable[Name] = SymbolLoc(SectionID, SectOffset); } } DEBUG(dbgs() << "\tType: " << SymType << " Name: " << Name << "\n"); @@ -137,7 +142,7 @@ bool RuntimeDyldImpl::loadObject(const MemoryBuffer *InputBuffer) { if (CommonSize != 0) emitCommonSymbols(*obj, CommonSymbols, CommonSize, LocalSymbols); - // Parse and proccess relocations + // Parse and process relocations DEBUG(dbgs() << "Parse relocations:\n"); for (section_iterator si = obj->begin_sections(), se = obj->end_sections(); si != se; si.increment(err)) { @@ -150,7 +155,7 @@ bool RuntimeDyldImpl::loadObject(const MemoryBuffer *InputBuffer) { e = si->end_relocations(); i != e; i.increment(err)) { Check(err); - // If it's first relocation in this section, find its SectionID + // If it's the first relocation in this section, find its SectionID if (isFirstRelocation) { SectionID = findOrEmitSection(*obj, *si, true, LocalSections); DEBUG(dbgs() << "\tSectionID: " << SectionID << "\n"); @@ -177,10 +182,10 @@ bool RuntimeDyldImpl::loadObject(const MemoryBuffer *InputBuffer) { return false; } -unsigned RuntimeDyldImpl::emitCommonSymbols(ObjectImage &Obj, - const CommonSymbolMap &Map, - uint64_t TotalSize, - LocalSymbolMap &LocalSymbols) { +void RuntimeDyldImpl::emitCommonSymbols(ObjectImage &Obj, + const CommonSymbolMap &CommonSymbols, + uint64_t TotalSize, + SymbolTableMap &SymbolTable) { // Allocate memory for the section unsigned SectionID = Sections.size(); uint8_t *Addr = MemMgr->allocateDataSection(TotalSize, sizeof(void*), @@ -197,18 +202,16 @@ unsigned RuntimeDyldImpl::emitCommonSymbols(ObjectImage &Obj, << "\n"); // Assign the address of each symbol - for (CommonSymbolMap::const_iterator it = Map.begin(), itEnd = Map.end(); - it != itEnd; it++) { - uint64_t Size = it->second; + for (CommonSymbolMap::const_iterator it = CommonSymbols.begin(), + itEnd = CommonSymbols.end(); it != itEnd; it++) { StringRef Name; it->first.getName(Name); Obj.updateSymbolAddress(it->first, (uint64_t)Addr); - LocalSymbols[Name.data()] = SymbolLoc(SectionID, Offset); + SymbolTable[Name.data()] = SymbolLoc(SectionID, Offset); + uint64_t Size = it->second; Offset += Size; Addr += Size; } - - return SectionID; } unsigned 
RuntimeDyldImpl::emitSection(ObjectImage &Obj, @@ -274,8 +277,8 @@ unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj, } else { // Even if we didn't load the section, we need to record an entry for it - // to handle later processing (and by 'handle' I mean don't do anything - // with these sections). + // to handle later processing (and by 'handle' I mean don't do anything + // with these sections). Allocate = 0; Addr = 0; DEBUG(dbgs() << "emitSection SectionID: " << SectionID @@ -307,28 +310,26 @@ unsigned RuntimeDyldImpl::findOrEmitSection(ObjectImage &Obj, return SectionID; } -void RuntimeDyldImpl::AddRelocation(const RelocationValueRef &Value, - unsigned SectionID, uintptr_t Offset, - uint32_t RelType) { - DEBUG(dbgs() << "AddRelocation SymNamePtr: " << format("%p", Value.SymbolName) - << " SID: " << Value.SectionID - << " Addend: " << format("%p", Value.Addend) - << " Offset: " << format("%p", Offset) - << " RelType: " << format("%x", RelType) - << "\n"); +void RuntimeDyldImpl::addRelocationForSection(const RelocationEntry &RE, + unsigned SectionID) { + Relocations[SectionID].push_back(RE); +} - if (Value.SymbolName == 0) { - Relocations[Value.SectionID].push_back(RelocationEntry( - SectionID, - Offset, - RelType, - Value.Addend)); - } else - SymbolRelocations[Value.SymbolName].push_back(RelocationEntry( - SectionID, - Offset, - RelType, - Value.Addend)); +void RuntimeDyldImpl::addRelocationForSymbol(const RelocationEntry &RE, + StringRef SymbolName) { + // Relocation by symbol. If the symbol is found in the global symbol table, + // create an appropriate section relocation. Otherwise, add it to + // ExternalSymbolRelocations. + SymbolTableMap::const_iterator Loc = + GlobalSymbolTable.find(SymbolName); + if (Loc == GlobalSymbolTable.end()) { + ExternalSymbolRelocations[SymbolName].push_back(RE); + } else { + // Copy the RE since we want to modify its addend. + RelocationEntry RECopy = RE; + RECopy.Addend += Loc->second.second; + Relocations[Loc->second.first].push_back(RECopy); + } } uint8_t *RuntimeDyldImpl::createStubFunction(uint8_t *Addr) { @@ -369,12 +370,12 @@ void RuntimeDyldImpl::resolveRelocationEntry(const RelocationEntry &RE, uint8_t *Target = Sections[RE.SectionID].Address + RE.Offset; DEBUG(dbgs() << "\tSectionID: " << RE.SectionID << " + " << RE.Offset << " (" << format("%p", Target) << ")" - << " Data: " << RE.Data + << " RelType: " << RE.RelType << " Addend: " << RE.Addend << "\n"); resolveRelocation(Target, Sections[RE.SectionID].LoadAddress + RE.Offset, - Value, RE.Data, RE.Addend); + Value, RE.RelType, RE.Addend); } } @@ -385,16 +386,14 @@ void RuntimeDyldImpl::resolveRelocationList(const RelocationList &Relocs, } } -// resolveSymbols - Resolve any relocations to the specified symbols if -// we know where it lives. -void RuntimeDyldImpl::resolveSymbols() { - StringMap<RelocationList>::iterator i = SymbolRelocations.begin(), - e = SymbolRelocations.end(); +void RuntimeDyldImpl::resolveExternalSymbols() { + StringMap<RelocationList>::iterator i = ExternalSymbolRelocations.begin(), + e = ExternalSymbolRelocations.end(); for (; i != e; i++) { StringRef Name = i->first(); RelocationList &Relocs = i->second; - StringMap<SymbolLoc>::const_iterator Loc = SymbolTable.find(Name); - if (Loc == SymbolTable.end()) { + SymbolTableMap::const_iterator Loc = GlobalSymbolTable.find(Name); + if (Loc == GlobalSymbolTable.end()) { // This is an external symbol, try to get it address from // MemoryManager. 
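// Resolution sketch: a name absent from GlobalSymbolTable is assumed to
// live outside the loaded modules, so the memory manager supplies its
// address (e.g. a dlsym-style lookup), and every relocation queued under
// that name in ExternalSymbolRelocations is then patched in one pass by
// resolveRelocationList().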
uint8_t *Addr = (uint8_t*) MemMgr->getPointerToNamedFunction(Name.data(), @@ -404,15 +403,7 @@ << "\n"); resolveRelocationList(Relocs, (uintptr_t)Addr); } else { - // Change the relocation to be section relative rather than symbol - // relative and move it to the resolved relocation list. - DEBUG(dbgs() << "Resolving symbol '" << Name << "'\n"); - for (int i = 0, e = Relocs.size(); i != e; ++i) { - RelocationEntry Entry = Relocs[i]; - Entry.Addend += Loc->second.second; - Relocations[Loc->second.first].push_back(Entry); - } - Relocs.clear(); + report_fatal_error("Expected external symbol"); } } } diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index db6da8c..39aed34 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -334,28 +334,31 @@ void RuntimeDyldELF::resolveRelocation(uint8_t *LocalAddress, void RuntimeDyldELF::processRelocationRef(const ObjRelocationInfo &Rel, ObjectImage &Obj, ObjSectionToIDMap &ObjSectionToID, - LocalSymbolMap &Symbols, + const SymbolTableMap &Symbols, StubMap &Stubs) { uint32_t RelType = (uint32_t)(Rel.Type & 0xffffffffL); intptr_t Addend = (intptr_t)Rel.AdditionalInfo; - RelocationValueRef Value; - StringRef TargetName; const SymbolRef &Symbol = Rel.Symbol; + + // Obtain the symbol name which is referenced in the relocation + StringRef TargetName; Symbol.getName(TargetName); DEBUG(dbgs() << "\t\tRelType: " << RelType << " Addend: " << Addend << " TargetName: " << TargetName << "\n"); - // First look the symbol in object file symbols. - LocalSymbolMap::iterator lsi = Symbols.find(TargetName.data()); + RelocationValueRef Value; + // First search for the symbol in the local symbol table + SymbolTableMap::const_iterator lsi = Symbols.find(TargetName.data()); if (lsi != Symbols.end()) { Value.SectionID = lsi->second.first; Value.Addend = lsi->second.second; } else { - // Second look the symbol in global symbol table. - StringMap<SymbolLoc>::iterator gsi = SymbolTable.find(TargetName.data()); - if (gsi != SymbolTable.end()) { + // Search for the symbol in the global symbol table + SymbolTableMap::const_iterator gsi = + GlobalSymbolTable.find(TargetName.data()); + if (gsi != GlobalSymbolTable.end()) { Value.SectionID = gsi->second.first; Value.Addend = gsi->second.second; } else { @@ -366,7 +369,7 @@ void RuntimeDyldELF::processRelocationRef(const ObjRelocationInfo &Rel, // TODO: Now ELF SymbolRef::ST_Debug = STT_SECTION; this is not obvious // and can be changed by other developers. Maybe the best way is to add // a new symbol type ST_Section to SymbolRef and use it.
- section_iterator si = Obj.end_sections(); + section_iterator si(Obj.end_sections()); Symbol.getSection(si); if (si == Obj.end_sections()) llvm_unreachable("Symbol section not found, bad object file format!"); @@ -410,14 +413,24 @@ void RuntimeDyldELF::processRelocationRef(const ObjRelocationInfo &Rel, Stubs[Value] = Section.StubOffset; uint8_t *StubTargetAddr = createStubFunction(Section.Address + Section.StubOffset); - AddRelocation(Value, Rel.SectionID, - StubTargetAddr - Section.Address, ELF::R_ARM_ABS32); + RelocationEntry RE(Rel.SectionID, StubTargetAddr - Section.Address, + ELF::R_ARM_ABS32, Value.Addend); + if (Value.SymbolName) + addRelocationForSymbol(RE, Value.SymbolName); + else + addRelocationForSection(RE, Value.SectionID); + resolveRelocation(Target, (uint64_t)Target, (uint64_t)Section.Address + Section.StubOffset, RelType, 0); Section.StubOffset += getMaxStubSize(); } - } else - AddRelocation(Value, Rel.SectionID, Rel.Offset, RelType); + } else { + RelocationEntry RE(Rel.SectionID, Rel.Offset, RelType, Value.Addend); + if (Value.SymbolName) + addRelocationForSymbol(RE, Value.SymbolName); + else + addRelocationForSection(RE, Value.SectionID); + } } bool RuntimeDyldELF::isCompatibleFormat(const MemoryBuffer *InputBuffer) const { diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h index e7f6fab..e413f78 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h @@ -51,7 +51,8 @@ protected: virtual void processRelocationRef(const ObjRelocationInfo &Rel, ObjectImage &Obj, ObjSectionToIDMap &ObjSectionToID, - LocalSymbolMap &Symbols, StubMap &Stubs); + const SymbolTableMap &Symbols, + StubMap &Stubs); virtual ObjectImage *createObjectImage(const MemoryBuffer *InputBuffer); virtual void handleObjectLoaded(ObjectImage *Obj); diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h index 2dea13f..c38ca69 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h @@ -14,60 +14,83 @@ #ifndef LLVM_RUNTIME_DYLD_IMPL_H #define LLVM_RUNTIME_DYLD_IMPL_H +#include "ObjectImage.h" #include "llvm/ExecutionEngine/RuntimeDyld.h" -#include "llvm/Object/ObjectFile.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/StringMap.h" -#include "llvm/ADT/Twine.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Support/Memory.h" -#include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/system_error.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Object/ObjectFile.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/ADT/Triple.h" -#include <map> #include "llvm/Support/Format.h" -#include "ObjectImage.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/system_error.h" +#include <map> using namespace llvm; using namespace llvm::object; namespace llvm { +class MemoryBuffer; +class Twine; + + +/// SectionEntry - represents a section emitted into memory by the dynamic +/// linker. class SectionEntry { public: - uint8_t* Address; + /// Address - address in the linker's memory where the section resides. + uint8_t *Address; + + /// Size - section size. size_t Size; - uint64_t LoadAddress; // For each section, the address it will be - // considered to live at for relocations. 
The same - // as the pointer to the above memory block for - // hosted JITs. - uintptr_t StubOffset; // It's used for architecturies with stub - // functions for far relocations like ARM. - uintptr_t ObjAddress; // Section address in object file. It's use for - // calculate MachO relocation addend - SectionEntry(uint8_t* address, size_t size, uintptr_t stubOffset, + + /// LoadAddress - the address of the section in the target process's memory. + /// Used for situations in which JIT-ed code is being executed in the address + /// space of a separate process. If the code executes in the same address + /// space where it was JIT-ed, this just equals Address. + uint64_t LoadAddress; + + /// StubOffset - used for architectures with stub functions for far + /// relocations (like ARM). + uintptr_t StubOffset; + + /// ObjAddress - address of the section in the in-memory object file. Used + /// for calculating relocations in some object formats (like MachO). + uintptr_t ObjAddress; + + SectionEntry(uint8_t *address, size_t size, uintptr_t stubOffset, uintptr_t objAddress) : Address(address), Size(size), LoadAddress((uintptr_t)address), StubOffset(stubOffset), ObjAddress(objAddress) {} }; +/// RelocationEntry - used to represent relocations internally in the dynamic +/// linker. class RelocationEntry { public: - unsigned SectionID; // Section the relocation is contained in. - uintptr_t Offset; // Offset into the section for the relocation. - uint32_t Data; // Relocatino data. Including type of relocation - // and another flags and parameners from - intptr_t Addend; // Addend encoded in the instruction itself, if any, - // plus the offset into the source section for - // the symbol once the relocation is resolvable. - RelocationEntry(unsigned id, uint64_t offset, uint32_t data, int64_t addend) - : SectionID(id), Offset(offset), Data(data), Addend(addend) {} + /// SectionID - the section this relocation points to. + unsigned SectionID; + + /// Offset - offset into the section. + uintptr_t Offset; + + /// RelType - relocation type. + uint32_t RelType; + + /// Addend - the relocation addend encoded in the instruction itself. Also + /// used to make a relocation section relative instead of symbol relative. + intptr_t Addend; + + RelocationEntry(unsigned id, uint64_t offset, uint32_t type, int64_t addend) + : SectionID(id), Offset(offset), RelType(type), Addend(addend) {} }; -// Raw relocation data from object file +/// ObjRelocationInfo - relocation information as read from the object file. +/// Used to pass around data taken from object::RelocationRef, together with +/// the section to which the relocation points (represented by a SectionID). class ObjRelocationInfo { public: unsigned SectionID; @@ -97,7 +120,8 @@ protected: // The MemoryManager to load objects into. RTDyldMemoryManager *MemMgr; - // A list of emmitted sections. + // A list of all sections emitted by the dynamic linker. These sections are + // referenced in the code by means of their index in this list - SectionID. typedef SmallVector<SectionEntry, 64> SectionList; SectionList Sections; @@ -105,11 +129,11 @@ protected: // references it. typedef std::map<SectionRef, unsigned> ObjSectionToIDMap; - // Master symbol table. As modules are loaded and external symbols are - // resolved, their addresses are stored here as a SectionID/Offset pair. + // A global symbol table for symbols from all loaded modules. Maps the + // symbol name to a (SectionID, offset in section) pair. 
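// Illustrative entry (not from the patch): if loading a module placed
// "foo" at offset 0x40 of the section with SectionID 2, then
// GlobalSymbolTable["foo"] == SymbolLoc(2, 0x40), and getSymbolAddress("foo")
// below returns getSectionAddress(2) + 0x40.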
typedef std::pair<unsigned, uintptr_t> SymbolLoc; - StringMap<SymbolLoc> SymbolTable; - typedef DenseMap<const char*, SymbolLoc> LocalSymbolMap; + typedef StringMap<SymbolLoc> SymbolTableMap; + SymbolTableMap GlobalSymbolTable; // Keep a map of common symbols to their sizes typedef std::map<SymbolRef, unsigned> CommonSymbolMap; @@ -121,12 +145,14 @@ protected: // in the relocation list where it's stored. typedef SmallVector<RelocationEntry, 64> RelocationList; // Relocations to sections already loaded. Indexed by SectionID which is the - // source of the address. The target where the address will be writen is + // source of the address. The target where the address will be written is // SectionID/Offset in the relocation itself. DenseMap<unsigned, RelocationList> Relocations; - // Relocations to external symbols that are not yet resolved. - // Indexed by symbol name. - StringMap<RelocationList> SymbolRelocations; + + // Relocations to external symbols that are not yet resolved. Symbols are + // external when they aren't found in the global symbol table of all loaded + // modules. This map is indexed by symbol name. + StringMap<RelocationList> ExternalSymbolRelocations; typedef std::map<RelocationValueRef, uintptr_t> StubMap; @@ -153,16 +179,17 @@ protected: return (uint8_t*)Sections[SectionID].Address; } - /// \brief Emits a section containing common symbols. - /// \return SectionID. - unsigned emitCommonSymbols(ObjectImage &Obj, - const CommonSymbolMap &Map, - uint64_t TotalSize, - LocalSymbolMap &Symbols); + /// \brief Given the common symbols discovered in the object file, emit a + /// new section for them and update the symbol mappings in the object and + /// symbol table. + void emitCommonSymbols(ObjectImage &Obj, + const CommonSymbolMap &CommonSymbols, + uint64_t TotalSize, + SymbolTableMap &SymbolTable); /// \brief Emits section data from the object file to the MemoryManager. /// \param IsCode if it's true then allocateCodeSection() will be - /// used for emmits, else allocateDataSection() will be used. + /// used for emits, else allocateDataSection() will be used. /// \return SectionID. unsigned emitSection(ObjectImage &Obj, const SectionRef &Section, @@ -178,10 +205,12 @@ protected: bool IsCode, ObjSectionToIDMap &LocalSections); - /// \brief If Value.SymbolName is NULL then store relocation to the - /// Relocations, else store it in the SymbolRelocations. - void AddRelocation(const RelocationValueRef &Value, unsigned SectionID, - uintptr_t Offset, uint32_t RelType); + // \brief Add a relocation entry that uses the given section. + void addRelocationForSection(const RelocationEntry &RE, unsigned SectionID); + + // \brief Add a relocation entry that uses the given symbol. This symbol may + // be found in the global symbol table, or it may be external. + void addRelocationForSymbol(const RelocationEntry &RE, StringRef SymbolName); /// \brief Emits long jump instruction to Addr. /// \return Pointer to the memory area for emitting target address. @@ -203,14 +232,16 @@ protected: uint32_t Type, int64_t Addend) = 0; - /// \brief Parses the object file relocation and store it to Relocations - /// or SymbolRelocations. Its depend from object file type. + /// \brief Parses the object file relocation and stores it to Relocations + /// or SymbolRelocations (this depends on the object file type). 
virtual void processRelocationRef(const ObjRelocationInfo &Rel, ObjectImage &Obj, ObjSectionToIDMap &ObjSectionToID, - LocalSymbolMap &Symbols, StubMap &Stubs) = 0; + const SymbolTableMap &Symbols, + StubMap &Stubs) = 0; - void resolveSymbols(); + /// \brief Resolve relocations to external symbols. + void resolveExternalSymbols(); virtual ObjectImage *createObjectImage(const MemoryBuffer *InputBuffer); virtual void handleObjectLoaded(ObjectImage *Obj) { @@ -228,9 +259,9 @@ public: void *getSymbolAddress(StringRef Name) { // FIXME: Just look up as a function for now. Overly simple of course. // Work in progress. - if (SymbolTable.find(Name) == SymbolTable.end()) + if (GlobalSymbolTable.find(Name) == GlobalSymbolTable.end()) return 0; - SymbolLoc Loc = SymbolTable.lookup(Name); + SymbolLoc Loc = GlobalSymbolTable.lookup(Name); return getSectionAddress(Loc.first) + Loc.second; } diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp index b7f515d..0e3a9d4 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp @@ -30,7 +30,8 @@ void RuntimeDyldMachO::resolveRelocation(uint8_t *LocalAddress, unsigned MachoType = (Type >> 28) & 0xf; unsigned Size = 1 << ((Type >> 25) & 3); - DEBUG(dbgs() << "resolveRelocation LocalAddress: " << format("%p", LocalAddress) + DEBUG(dbgs() << "resolveRelocation LocalAddress: " + << format("%p", LocalAddress) << " FinalAddress: " << format("%p", FinalAddress) << " Value: " << format("%p", Value) << " Addend: " << Addend @@ -53,12 +54,12 @@ void RuntimeDyldMachO::resolveRelocation(uint8_t *LocalAddress, break; case Triple::x86: resolveI386Relocation(LocalAddress, - FinalAddress, - (uintptr_t)Value, - isPCRel, - Type, - Size, - Addend); + FinalAddress, + (uintptr_t)Value, + isPCRel, + Type, + Size, + Addend); break; case Triple::arm: // Fall through. case Triple::thumb: @@ -73,14 +74,13 @@ void RuntimeDyldMachO::resolveRelocation(uint8_t *LocalAddress, } } -bool RuntimeDyldMachO:: -resolveI386Relocation(uint8_t *LocalAddress, - uint64_t FinalAddress, - uint64_t Value, - bool isPCRel, - unsigned Type, - unsigned Size, - int64_t Addend) { +bool RuntimeDyldMachO::resolveI386Relocation(uint8_t *LocalAddress, + uint64_t FinalAddress, + uint64_t Value, + bool isPCRel, + unsigned Type, + unsigned Size, + int64_t Addend) { if (isPCRel) Value -= FinalAddress + 4; // see resolveX86_64Relocation @@ -102,14 +102,13 @@ resolveI386Relocation(uint8_t *LocalAddress, } } -bool RuntimeDyldMachO:: -resolveX86_64Relocation(uint8_t *LocalAddress, - uint64_t FinalAddress, - uint64_t Value, - bool isPCRel, - unsigned Type, - unsigned Size, - int64_t Addend) { +bool RuntimeDyldMachO::resolveX86_64Relocation(uint8_t *LocalAddress, + uint64_t FinalAddress, + uint64_t Value, + bool isPCRel, + unsigned Type, + unsigned Size, + int64_t Addend) { // If the relocation is PC-relative, the value to be encoded is the // pointer difference. 
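// Worked instance (illustrative): for a 4-byte PC-relative fixup at
// FinalAddress 0x1000 targeting Value 0x2000, the encoded delta is
// 0x2000 - (0x1000 + 4) = 0xffc, i.e. relative to the end of the fixup,
// matching the "Value -= FinalAddress + 4" in the i386 path above.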
if (isPCRel) @@ -144,14 +143,13 @@ resolveX86_64Relocation(uint8_t *LocalAddress, } } -bool RuntimeDyldMachO:: -resolveARMRelocation(uint8_t *LocalAddress, - uint64_t FinalAddress, - uint64_t Value, - bool isPCRel, - unsigned Type, - unsigned Size, - int64_t Addend) { +bool RuntimeDyldMachO::resolveARMRelocation(uint8_t *LocalAddress, + uint64_t FinalAddress, + uint64_t Value, + bool isPCRel, + unsigned Type, + unsigned Size, + int64_t Addend) { // If the relocation is PC-relative, the value to be encoded is the // pointer difference. if (isPCRel) { @@ -207,7 +205,7 @@ resolveARMRelocation(uint8_t *LocalAddress, void RuntimeDyldMachO::processRelocationRef(const ObjRelocationInfo &Rel, ObjectImage &Obj, ObjSectionToIDMap &ObjSectionToID, - LocalSymbolMap &Symbols, + const SymbolTableMap &Symbols, StubMap &Stubs) { uint32_t RelType = (uint32_t) (Rel.Type & 0xffffffffL); @@ -217,18 +215,19 @@ void RuntimeDyldMachO::processRelocationRef(const ObjRelocationInfo &Rel, bool isExtern = (RelType >> 27) & 1; if (isExtern) { + // Obtain the symbol name which is referenced in the relocation StringRef TargetName; const SymbolRef &Symbol = Rel.Symbol; Symbol.getName(TargetName); - // First look the symbol in object file symbols. - LocalSymbolMap::iterator lsi = Symbols.find(TargetName.data()); + // First search for the symbol in the local symbol table + SymbolTableMap::const_iterator lsi = Symbols.find(TargetName.data()); if (lsi != Symbols.end()) { Value.SectionID = lsi->second.first; Value.Addend = lsi->second.second; } else { - // Second look the symbol in global symbol table. - StringMap<SymbolLoc>::iterator gsi = SymbolTable.find(TargetName.data()); - if (gsi != SymbolTable.end()) { + // Search for the symbol in the global symbol table + SymbolTableMap::const_iterator gsi = GlobalSymbolTable.find(TargetName.data()); + if (gsi != GlobalSymbolTable.end()) { Value.SectionID = gsi->second.first; Value.Addend = gsi->second.second; } else @@ -249,8 +248,8 @@ void RuntimeDyldMachO::processRelocationRef(const ObjRelocationInfo &Rel, Value.SectionID = findOrEmitSection(Obj, *si, true, ObjSectionToID); Value.Addend = *(const intptr_t *)Target; if (Value.Addend) { - // The MachO addend is offset from the current section, we need set it - // as offset from destination section + // The MachO addend is an offset from the current section. 
We need it + // to be an offset from the destination section Value.Addend += Section.ObjAddress - Sections[Value.SectionID].ObjAddress; } } @@ -269,19 +268,29 @@ void RuntimeDyldMachO::processRelocationRef(const ObjRelocationInfo &Rel, Stubs[Value] = Section.StubOffset; uint8_t *StubTargetAddr = createStubFunction(Section.Address + Section.StubOffset); - AddRelocation(Value, Rel.SectionID, StubTargetAddr - Section.Address, - macho::RIT_Vanilla); + RelocationEntry RE(Rel.SectionID, StubTargetAddr - Section.Address, + macho::RIT_Vanilla, Value.Addend); + if (Value.SymbolName) + addRelocationForSymbol(RE, Value.SymbolName); + else + addRelocationForSection(RE, Value.SectionID); resolveRelocation(Target, (uint64_t)Target, (uint64_t)Section.Address + Section.StubOffset, RelType, 0); Section.StubOffset += getMaxStubSize(); } - } else - AddRelocation(Value, Rel.SectionID, Rel.Offset, RelType); + } else { + RelocationEntry RE(Rel.SectionID, Rel.Offset, RelType, Value.Addend); + if (Value.SymbolName) + addRelocationForSymbol(RE, Value.SymbolName); + else + addRelocationForSection(RE, Value.SectionID); + } } -bool RuntimeDyldMachO::isCompatibleFormat(const MemoryBuffer *InputBuffer) const { +bool RuntimeDyldMachO::isCompatibleFormat( + const MemoryBuffer *InputBuffer) const { StringRef Magic = InputBuffer->getBuffer().slice(0, 4); if (Magic == "\xFE\xED\xFA\xCE") return true; if (Magic == "\xCE\xFA\xED\xFE") return true; diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h index 418d130..707664c 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h @@ -51,7 +51,8 @@ protected: virtual void processRelocationRef(const ObjRelocationInfo &Rel, ObjectImage &Obj, ObjSectionToIDMap &ObjSectionToID, - LocalSymbolMap &Symbols, StubMap &Stubs); + const SymbolTableMap &Symbols, + StubMap &Stubs); public: virtual void resolveRelocation(uint8_t *LocalAddress, diff --git a/lib/ExecutionEngine/TargetSelect.cpp b/lib/ExecutionEngine/TargetSelect.cpp index 42364f9..7cdd669 100644 --- a/lib/ExecutionEngine/TargetSelect.cpp +++ b/lib/ExecutionEngine/TargetSelect.cpp @@ -26,11 +26,7 @@ using namespace llvm; TargetMachine *EngineBuilder::selectTarget() { - StringRef MArch = ""; - StringRef MCPU = ""; - SmallVector<std::string, 1> MAttrs; - Triple TT(M->getTargetTriple()); - + Triple TT(LLVM_HOSTTRIPLE); return selectTarget(TT, MArch, MCPU, MAttrs); } @@ -56,8 +52,9 @@ TargetMachine *EngineBuilder::selectTarget(const Triple &TargetTriple, } if (!TheTarget) { - *ErrorStr = "No available targets are compatible with this -march, " - "see -version for the available targets.\n"; + if (ErrorStr) + *ErrorStr = "No available targets are compatible with this -march, " + "see -version for the available targets.\n"; return 0; } diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index 765fcc8..afba2e8 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -25,6 +25,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm-c/Linker.h" #include <cctype> using namespace llvm; @@ -595,12 +596,12 @@ void ModuleLinker::computeTypeMapping() { // example. When the source module got loaded into the same LLVMContext, if // it had the same type, it would have been renamed to "%foo.42 = { i32 }". 
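// Usage sketch for the C linker API added at the bottom of this file's
// diff (hypothetical client code, not part of the patch):
//   char *msg = 0;
//   LLVMBool failed = LLVMLinkModules(Dest, Src, LLVMLinkerDestroySource,
//                                     &msg);
//   if (failed)
//     fprintf(stderr, "link failed: %s\n", msg);
//   free(msg); // LLVMLinkModules strdup()s a message when OutMessages != 0.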
std::vector<StructType*> SrcStructTypes; - SrcM->findUsedStructTypes(SrcStructTypes); + SrcM->findUsedStructTypes(SrcStructTypes, true); SmallPtrSet<StructType*, 32> SrcStructTypesSet(SrcStructTypes.begin(), SrcStructTypes.end()); std::vector<StructType*> DstStructTypes; - DstM->findUsedStructTypes(DstStructTypes); + DstM->findUsedStructTypes(DstStructTypes, true); SmallPtrSet<StructType*, 32> DstStructTypesSet(DstStructTypes.begin(), DstStructTypes.end()); @@ -683,7 +684,7 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV, GlobalVariable *NG = new GlobalVariable(*DstGV->getParent(), NewType, SrcGV->isConstant(), DstGV->getLinkage(), /*init*/0, /*name*/"", DstGV, - DstGV->isThreadLocal(), + DstGV->getThreadLocalMode(), DstGV->getType()->getAddressSpace()); // Propagate alignment, visibility and section info. @@ -758,7 +759,7 @@ bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) { new GlobalVariable(*DstM, TypeMap.get(SGV->getType()->getElementType()), SGV->isConstant(), SGV->getLinkage(), /*init*/0, SGV->getName(), /*insertbefore*/0, - SGV->isThreadLocal(), + SGV->getThreadLocalMode(), SGV->getType()->getAddressSpace()); // Propagate alignment, visibility and section info. copyGVAttributes(NewDGV, SGV); @@ -1335,3 +1336,17 @@ bool Linker::LinkModules(Module *Dest, Module *Src, unsigned Mode, return false; } + +//===----------------------------------------------------------------------===// +// C API. +//===----------------------------------------------------------------------===// + +LLVMBool LLVMLinkModules(LLVMModuleRef Dest, LLVMModuleRef Src, + LLVMLinkerMode Mode, char **OutMessages) { + std::string Messages; + LLVMBool Result = Linker::LinkModules(unwrap(Dest), unwrap(Src), + Mode, OutMessages? &Messages : 0); + if (OutMessages) + *OutMessages = strdup(Messages.c_str()); + return Result; +} diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp index 9fc33b6..7203b9a 100644 --- a/lib/MC/ELFObjectWriter.cpp +++ b/lib/MC/ELFObjectWriter.cpp @@ -627,7 +627,7 @@ void ELFObjectWriter::WriteSymbolTable(MCDataFragment *SymtabF, const MCSymbol *ELFObjectWriter::SymbolToReloc(const MCAssembler &Asm, const MCValue &Target, - const MCFragment &F, + const MCFragment &F, const MCFixup &Fixup, bool IsPCRel) const { const MCSymbol &Symbol = Target.getSymA()->getSymbol(); @@ -1061,11 +1061,19 @@ void ELFObjectWriter::WriteRelocationsFragment(const MCAssembler &Asm, entry.Index += LocalSymbolData.size(); if (is64Bit()) { String64(*F, entry.r_offset); + if (TargetObjectWriter->isN64()) { + String32(*F, entry.Index); - struct ELF::Elf64_Rela ERE64; - ERE64.setSymbolAndType(entry.Index, entry.Type); - String64(*F, ERE64.r_info); - + String8(*F, TargetObjectWriter->getRSsym(entry.Type)); + String8(*F, TargetObjectWriter->getRType3(entry.Type)); + String8(*F, TargetObjectWriter->getRType2(entry.Type)); + String8(*F, TargetObjectWriter->getRType(entry.Type)); + } + else { + struct ELF::Elf64_Rela ERE64; + ERE64.setSymbolAndType(entry.Index, entry.Type); + String64(*F, ERE64.r_info); + } if (hasRelocationAddend()) String64(*F, entry.r_addend); } else { diff --git a/lib/MC/MCAsmBackend.cpp b/lib/MC/MCAsmBackend.cpp index 0b2e4ae..2e447b0 100644 --- a/lib/MC/MCAsmBackend.cpp +++ b/lib/MC/MCAsmBackend.cpp @@ -39,7 +39,7 @@ MCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { { "FK_SecRel_4", 0, 32, 0 }, { "FK_SecRel_8", 0, 64, 0 } }; - + assert((size_t)Kind <= sizeof(Builtins) / sizeof(Builtins[0]) && "Unknown fixup kind"); return Builtins[Kind]; diff --git 
a/lib/MC/MCAsmInfo.cpp b/lib/MC/MCAsmInfo.cpp index 8286c1d..8da2e0e 100644 --- a/lib/MC/MCAsmInfo.cpp +++ b/lib/MC/MCAsmInfo.cpp @@ -50,6 +50,7 @@ MCAsmInfo::MCAsmInfo() { AllowNameToStartWithDigit = false; AllowPeriodsInName = true; AllowUTF8 = true; + UseDataRegionDirectives = false; ZeroDirective = "\t.zero\t"; AsciiDirective = "\t.ascii\t"; AscizDirective = "\t.asciz\t"; @@ -57,12 +58,6 @@ MCAsmInfo::MCAsmInfo() { Data16bitsDirective = "\t.short\t"; Data32bitsDirective = "\t.long\t"; Data64bitsDirective = "\t.quad\t"; - DataBegin = "$d."; - CodeBegin = "$a."; - JT8Begin = "$d."; - JT16Begin = "$d."; - JT32Begin = "$d."; - SupportsDataRegions = false; SunStyleELFSectionSwitchSyntax = false; UsesELFSectionDirectiveForBSS = false; AlignDirective = "\t.align\t"; @@ -89,14 +84,10 @@ MCAsmInfo::MCAsmInfo() { SupportsDebugInformation = false; ExceptionsType = ExceptionHandling::None; DwarfUsesInlineInfoSection = false; - DwarfRequiresRelocationForSectionOffset = true; DwarfSectionOffsetDirective = 0; - DwarfUsesLabelOffsetForRanges = true; - DwarfUsesRelocationsForStringPool = true; + DwarfUsesRelocationsAcrossSections = true; DwarfRegNumForCFI = false; HasMicrosoftFastStdCallMangling = false; - - AsmTransCBE = 0; } MCAsmInfo::~MCAsmInfo() { diff --git a/lib/MC/MCAsmInfoCOFF.cpp b/lib/MC/MCAsmInfoCOFF.cpp index 881d992..678e75a 100644 --- a/lib/MC/MCAsmInfoCOFF.cpp +++ b/lib/MC/MCAsmInfoCOFF.cpp @@ -26,7 +26,7 @@ MCAsmInfoCOFF::MCAsmInfoCOFF() { PrivateGlobalPrefix = "L"; // Prefix for private global symbols WeakRefDirective = "\t.weak\t"; LinkOnceDirective = "\t.linkonce discard\n"; - + // Doesn't support visibility: HiddenVisibilityAttr = HiddenDeclarationVisibilityAttr = MCSA_Invalid; ProtectedVisibilityAttr = MCSA_Invalid; @@ -36,8 +36,6 @@ MCAsmInfoCOFF::MCAsmInfoCOFF() { SupportsDebugInformation = true; DwarfSectionOffsetDirective = "\t.secrel32\t"; HasMicrosoftFastStdCallMangling = true; - - SupportsDataRegions = false; } void MCAsmInfoMicrosoft::anchor() { } diff --git a/lib/MC/MCAsmInfoDarwin.cpp b/lib/MC/MCAsmInfoDarwin.cpp index c1e2635..8e0ac23 100644 --- a/lib/MC/MCAsmInfoDarwin.cpp +++ b/lib/MC/MCAsmInfoDarwin.cpp @@ -18,7 +18,7 @@ #include "llvm/MC/MCStreamer.h" using namespace llvm; -void MCAsmInfoDarwin::anchor() { } +void MCAsmInfoDarwin::anchor() { } MCAsmInfoDarwin::MCAsmInfoDarwin() { // Common settings for all Darwin targets. @@ -43,13 +43,6 @@ MCAsmInfoDarwin::MCAsmInfoDarwin() { HasMachoTBSSDirective = true; // Uses .tbss HasStaticCtorDtorReferenceInStaticMode = true; - CodeBegin = "L$start$code$"; - DataBegin = "L$start$data$"; - JT8Begin = "L$start$jt8$"; - JT16Begin = "L$start$jt16$"; - JT32Begin = "L$start$jt32$"; - SupportsDataRegions = true; - // FIXME: Darwin 10 and newer don't need this. LinkerRequiresNonEmptyDwarfLines = true; @@ -61,12 +54,10 @@ MCAsmInfoDarwin::MCAsmInfoDarwin() { // Doesn't support protected visibility. 
ProtectedVisibilityAttr = MCSA_Invalid; - + HasDotTypeDotSizeDirective = false; HasNoDeadStrip = true; HasSymbolResolver = true; - DwarfRequiresRelocationForSectionOffset = false; - DwarfUsesLabelOffsetForRanges = false; - DwarfUsesRelocationsForStringPool = false; + DwarfUsesRelocationsAcrossSections = false; } diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp index 11f0f72..373df4b 100644 --- a/lib/MC/MCAsmStreamer.cpp +++ b/lib/MC/MCAsmStreamer.cpp @@ -138,6 +138,7 @@ public: virtual void EmitEHSymAttributes(const MCSymbol *Symbol, MCSymbol *EHSymbol); virtual void EmitAssemblerFlag(MCAssemblerFlag Flag); + virtual void EmitDataRegion(MCDataRegionType Kind); virtual void EmitThumbFunc(MCSymbol *Func); virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value); @@ -170,7 +171,7 @@ public: unsigned ByteAlignment); virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0, - unsigned Size = 0, unsigned ByteAlignment = 0); + uint64_t Size = 0, unsigned ByteAlignment = 0); virtual void EmitTBSSSymbol (const MCSection *Section, MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment = 0); @@ -352,6 +353,21 @@ void MCAsmStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { EmitEOL(); } +void MCAsmStreamer::EmitDataRegion(MCDataRegionType Kind) { + MCContext &Ctx = getContext(); + const MCAsmInfo &MAI = Ctx.getAsmInfo(); + if (!MAI.doesSupportDataRegionDirectives()) + return; + switch (Kind) { + case MCDR_DataRegion: OS << "\t.data_region"; break; + case MCDR_DataRegionJT8: OS << "\t.data_region jt8"; break; + case MCDR_DataRegionJT16: OS << "\t.data_region jt16"; break; + case MCDR_DataRegionJT32: OS << "\t.data_region jt32"; break; + case MCDR_DataRegionEnd: OS << "\t.end_data_region"; break; + } + EmitEOL(); +} + void MCAsmStreamer::EmitThumbFunc(MCSymbol *Func) { // This needs to emit to a temporary string to get properly quoted // MCSymbols when they have spaces in them. @@ -513,7 +529,7 @@ void MCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, } void MCAsmStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol, - unsigned Size, unsigned ByteAlignment) { + uint64_t Size, unsigned ByteAlignment) { // Note: a .zerofill directive does not switch sections. OS << ".zerofill "; @@ -826,7 +842,7 @@ void MCAsmStreamer::EmitDwarfLocDirective(unsigned FileNo, unsigned Line, if (IsVerboseAsm) { OS.PadToColumn(MAI.getCommentColumn()); - OS << MAI.getCommentString() << ' ' << FileName << ':' + OS << MAI.getCommentString() << ' ' << FileName << ':' << Line << ':' << Column; } EmitEOL(); @@ -1009,7 +1025,7 @@ void MCAsmStreamer::EmitCFISignalFrame() { if (!UseCFI) return; - OS << "\t.cif_signal_frame"; + OS << "\t.cfi_signal_frame"; EmitEOL(); } diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp index bb67868..0aa0c98 100644 --- a/lib/MC/MCAssembler.cpp +++ b/lib/MC/MCAssembler.cpp @@ -409,7 +409,7 @@ static void WriteFragmentData(const MCAssembler &Asm, const MCAsmLayout &Layout, // See if we are aligning with nops, and if so do that first to try to fill // the Count bytes. Then if that did not fill any bytes or there are any - // bytes left to fill use the the Value and ValueSize to fill the rest. + // bytes left to fill use the Value and ValueSize to fill the rest. // If we are aligning with nops, ask that target to emit the right data. 
if (AF.hasEmitNops()) { if (!Asm.getBackend().writeNopData(Count, OW)) diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp index d3c4fb1..b5b14b9 100644 --- a/lib/MC/MCContext.cpp +++ b/lib/MC/MCContext.cpp @@ -274,11 +274,11 @@ unsigned MCContext::GetDwarfFile(StringRef Directory, StringRef FileName, if (Directory.empty()) { // Separate the directory part from the basename of the FileName. - std::pair<StringRef, StringRef> Slash = FileName.rsplit('/'); - Directory = Slash.second; - if (!Directory.empty()) { - Directory = Slash.first; - FileName = Slash.second; + StringRef tFileName = sys::path::filename(FileName); + if (!tFileName.empty()) { + Directory = sys::path::parent_path(FileName); + if (!Directory.empty()) + FileName = tFileName; } } diff --git a/lib/MC/MCDisassembler/Disassembler.h b/lib/MC/MCDisassembler/Disassembler.h index 880a31a..322abd5 100644 --- a/lib/MC/MCDisassembler/Disassembler.h +++ b/lib/MC/MCDisassembler/Disassembler.h @@ -99,6 +99,14 @@ public: DisAsm.reset(disAsm); IP.reset(iP); } + const std::string &getTripleName() const { return TripleName; } + void *getDisInfo() const { return DisInfo; } + int getTagType() const { return TagType; } + LLVMOpInfoCallback getGetOpInfo() const { return GetOpInfo; } + LLVMSymbolLookupCallback getSymbolLookupCallback() const { + return SymbolLookUp; + } + const Target *getTarget() const { return TheTarget; } const MCDisassembler *getDisAsm() const { return DisAsm.get(); } const MCAsmInfo *getAsmInfo() const { return MAI.get(); } MCInstPrinter *getIP() { return IP.get(); } diff --git a/lib/MC/MCDisassembler/EDDisassembler.cpp b/lib/MC/MCDisassembler/EDDisassembler.cpp index b2672ca..1226f1a 100644 --- a/lib/MC/MCDisassembler/EDDisassembler.cpp +++ b/lib/MC/MCDisassembler/EDDisassembler.cpp @@ -44,7 +44,7 @@ struct TripleMap { const char *String; }; -static struct TripleMap triplemap[] = { +static const struct TripleMap triplemap[] = { { Triple::x86, "i386-unknown-unknown" }, { Triple::x86_64, "x86_64-unknown-unknown" }, { Triple::arm, "arm-unknown-unknown" }, @@ -256,7 +256,7 @@ void EDDisassembler::initMaps(const MCRegisterInfo ®isterInfo) { unsigned registerIndex; for (registerIndex = 0; registerIndex < numRegisters; ++registerIndex) { - const char* registerName = registerInfo.get(registerIndex).Name; + const char* registerName = registerInfo.getName(registerIndex); RegVec.push_back(registerName); RegRMap[registerName] = registerIndex; diff --git a/lib/MC/MCDisassembler/EDMain.cpp b/lib/MC/MCDisassembler/EDMain.cpp index c658717..5c065db 100644 --- a/lib/MC/MCDisassembler/EDMain.cpp +++ b/lib/MC/MCDisassembler/EDMain.cpp @@ -4,7 +4,7 @@ // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. -// +// //===----------------------------------------------------------------------===// // // This file implements the enhanced disassembler's public C API. 
@@ -34,9 +34,9 @@ int EDGetDisassembler(EDDisassemblerRef *disassembler, Syntax = EDDisassembler::kEDAssemblySyntaxARMUAL; break; } - + EDDisassemblerRef ret = EDDisassembler::getDisassembler(triple, Syntax); - + if (!ret) return -1; *disassembler = ret; @@ -70,18 +70,18 @@ unsigned int EDCreateInsts(EDInstRef *insts, uint64_t address, void *arg) { unsigned int index; - + for (index = 0; index < count; ++index) { EDInst *inst = ((EDDisassembler*)disassembler)->createInst(byteReader, address, arg); - + if (!inst) return index; - + insts[index] = inst; address += inst->byteSize(); } - + return count; } @@ -165,14 +165,14 @@ int EDTokenIsRegister(EDTokenRef token) { int EDTokenIsNegativeLiteral(EDTokenRef token) { if (((EDToken*)token)->type() != EDToken::kTokenLiteral) return -1; - + return ((EDToken*)token)->literalSign(); } int EDLiteralTokenAbsoluteValue(uint64_t *value, EDTokenRef token) { if (((EDToken*)token)->type() != EDToken::kTokenLiteral) return -1; - + return ((EDToken*)token)->literalAbsoluteValue(*value); } @@ -180,7 +180,7 @@ int EDRegisterTokenValue(unsigned *registerID, EDTokenRef token) { if (((EDToken*)token)->type() != EDToken::kTokenRegister) return -1; - + return ((EDToken*)token)->registerID(*registerID); } @@ -231,7 +231,7 @@ struct ByteReaderWrapper { EDByteBlock_t byteBlock; }; -static int readerWrapperCallback(uint8_t *byte, +static int readerWrapperCallback(uint8_t *byte, uint64_t address, void *arg) { struct ByteReaderWrapper *wrapper = (struct ByteReaderWrapper *)arg; @@ -245,13 +245,9 @@ unsigned int EDBlockCreateInsts(EDInstRef *insts, uint64_t address) { struct ByteReaderWrapper wrapper; wrapper.byteBlock = byteBlock; - - return EDCreateInsts(insts, - count, - disassembler, - readerWrapperCallback, - address, - (void*)&wrapper); + + return EDCreateInsts(insts, count, disassembler, readerWrapperCallback, + address, (void*)&wrapper); } int EDBlockEvaluateOperand(uint64_t *result, EDOperandRef operand, diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp index 84a34f1..75eaf80 100644 --- a/lib/MC/MCDwarf.cpp +++ b/lib/MC/MCDwarf.cpp @@ -36,7 +36,7 @@ using namespace llvm; // First special line opcode - leave room for the standard opcodes. // Note: If you want to change this, you'll have to update the -// "standard_opcode_lengths" table that is emitted in DwarfFileTable::Emit(). +// "standard_opcode_lengths" table that is emitted in DwarfFileTable::Emit(). #define DWARF2_LINE_OPCODE_BASE 13 // Minimum line offset in a special line info. opcode. This value @@ -105,7 +105,7 @@ void MCLineEntry::Make(MCStreamer *MCOS, const MCSection *Section) { // // This helper routine returns an expression of End - Start + IntVal . -// +// static inline const MCExpr *MakeStartMinusEndExpr(const MCStreamer &MCOS, const MCSymbol &Start, const MCSymbol &End, @@ -198,7 +198,7 @@ static inline void EmitDwarfLineTable(MCStreamer *MCOS, // Set the value of the symbol, as we are at the end of the section. MCOS->EmitLabel(SectionEnd); - // Switch back the the dwarf line section. + // Switch back the dwarf line section. MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfLineSection()); const MCAsmInfo &asmInfo = MCOS->getContext().getAsmInfo(); @@ -310,7 +310,7 @@ const MCSymbol *MCDwarfFileTable::Emit(MCStreamer *MCOS) { if (MCOS->getContext().getAsmInfo().getLinkerRequiresNonEmptyDwarfLines() && MCLineSectionOrder.begin() == MCLineSectionOrder.end()) { // The darwin9 linker has a bug (see PR8715). 
For 32-bit architectures - // it requires: + // it requires: // total_length >= prologue_length + 10 // We are 4 bytes short, since we have total_length = 51 and // prologue_length = 45 @@ -354,7 +354,7 @@ void MCDwarfLineAddr::Encode(int64_t LineDelta, uint64_t AddrDelta, AddrDelta = ScaleAddrDelta(AddrDelta); // A LineDelta of INT64_MAX is a signal that this is actually a - // DW_LNE_end_sequence. We cannot use special opcodes here, since we want the + // DW_LNE_end_sequence. We cannot use special opcodes here, since we want the // end_sequence to emit the matrix entry. if (LineDelta == INT64_MAX) { if (AddrDelta == MAX_SPECIAL_ADDR_DELTA) @@ -552,7 +552,7 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS, const MCSymbol *LineSectionSymbol) { MCContext &context = MCOS->getContext(); - MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfInfoSection()); + MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfInfoSection()); // Create a symbol at the start and end of this section used in here for the // expression to calculate the length in the header. @@ -705,7 +705,7 @@ void MCGenDwarfInfo::Emit(MCStreamer *MCOS, const MCSymbol *LineSectionSymbol) { MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfInfoSection()); MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfAbbrevSection()); MCSymbol *AbbrevSectionSymbol; - if (AsmInfo.doesDwarfRequireRelocationForSectionOffset()) { + if (AsmInfo.doesDwarfUseRelocationsAcrossSections()) { AbbrevSectionSymbol = context.CreateTempSymbol(); MCOS->EmitLabel(AbbrevSectionSymbol); } else { @@ -766,7 +766,7 @@ void MCGenDwarfLabelEntry::Make(MCSymbol *Symbol, MCStreamer *MCOS, MCOS->EmitLabel(Label); // Create an entry for the info and add it to the other entries. - MCGenDwarfLabelEntry *Entry = + MCGenDwarfLabelEntry *Entry = new MCGenDwarfLabelEntry(Name, FileNumber, LineNumber, Label); MCOS->getContext().addMCGenDwarfLabelEntry(Entry); } @@ -1285,7 +1285,7 @@ MCSymbol *FrameEmitterImpl::EmitFDE(MCStreamer &streamer, 0); if (verboseAsm) streamer.AddComment("FDE CIE Offset"); streamer.EmitAbsValue(offset, 4); - } else if (!asmInfo.doesDwarfRequireRelocationForSectionOffset()) { + } else if (!asmInfo.doesDwarfUseRelocationsAcrossSections()) { const MCExpr *offset = MakeStartMinusEndExpr(streamer, *SectionStart, cieStart, 0); streamer.EmitAbsValue(offset, 4); diff --git a/lib/MC/MCELFObjectTargetWriter.cpp b/lib/MC/MCELFObjectTargetWriter.cpp index 171ab4d..6eb6914 100644 --- a/lib/MC/MCELFObjectTargetWriter.cpp +++ b/lib/MC/MCELFObjectTargetWriter.cpp @@ -15,9 +15,11 @@ using namespace llvm; MCELFObjectTargetWriter::MCELFObjectTargetWriter(bool Is64Bit_, uint8_t OSABI_, uint16_t EMachine_, - bool HasRelocationAddend_) + bool HasRelocationAddend_, + bool IsN64_) : OSABI(OSABI_), EMachine(EMachine_), - HasRelocationAddend(HasRelocationAddend_), Is64Bit(Is64Bit_) { + HasRelocationAddend(HasRelocationAddend_), Is64Bit(Is64Bit_), + IsN64(IsN64_){ } /// Default e_flags = 0 diff --git a/lib/MC/MCELFStreamer.cpp b/lib/MC/MCELFStreamer.cpp index 6c4d0e3..2d342dc 100644 --- a/lib/MC/MCELFStreamer.cpp +++ b/lib/MC/MCELFStreamer.cpp @@ -13,6 +13,8 @@ #include "MCELF.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Twine.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" @@ -89,7 +91,7 @@ public: unsigned ByteAlignment); virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0, - unsigned Size = 0, unsigned ByteAlignment = 0) { +
uint64_t Size = 0, unsigned ByteAlignment = 0) { llvm_unreachable("ELF doesn't support this directive"); } virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol, diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp index 7880155..0eb7fcc 100644 --- a/lib/MC/MCExpr.cpp +++ b/lib/MC/MCExpr.cpp @@ -202,6 +202,8 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) { case VK_PPC_DARWIN_LO16: return "lo16"; case VK_PPC_GAS_HA16: return "ha"; case VK_PPC_GAS_LO16: return "l"; + case VK_PPC_TPREL16_HA: return "tprel@ha"; + case VK_PPC_TPREL16_LO: return "tprel@l"; case VK_Mips_GPREL: return "GPREL"; case VK_Mips_GOT_CALL: return "GOT_CALL"; case VK_Mips_GOT16: return "GOT16"; @@ -220,6 +222,8 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) { case VK_Mips_GOT_DISP: return "GOT_DISP"; case VK_Mips_GOT_PAGE: return "GOT_PAGE"; case VK_Mips_GOT_OFST: return "GOT_OFST"; + case VK_Mips_HIGHER: return "HIGHER"; + case VK_Mips_HIGHEST: return "HIGHEST"; } llvm_unreachable("Invalid variant kind"); } diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp index bc6cf77..b75fe2c 100644 --- a/lib/MC/MCMachOStreamer.cpp +++ b/lib/MC/MCMachOStreamer.cpp @@ -1,4 +1,3 @@ -//===- lib/MC/MCMachOStreamer.cpp - Mach-O Object Output ------------===// // // The LLVM Compiler Infrastructure // @@ -33,6 +32,8 @@ class MCMachOStreamer : public MCObjectStreamer { private: virtual void EmitInstToData(const MCInst &Inst); + void EmitDataRegion(DataRegionData::KindTy Kind); + void EmitDataRegionEnd(); public: MCMachOStreamer(MCContext &Context, MCAsmBackend &MAB, raw_ostream &OS, MCCodeEmitter *Emitter) @@ -46,6 +47,7 @@ public: virtual void EmitEHSymAttributes(const MCSymbol *Symbol, MCSymbol *EHSymbol); virtual void EmitAssemblerFlag(MCAssemblerFlag Flag); + virtual void EmitDataRegion(MCDataRegionType Kind); virtual void EmitThumbFunc(MCSymbol *Func); virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value); virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute); @@ -72,7 +74,7 @@ public: llvm_unreachable("macho doesn't support this directive"); } virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0, - unsigned Size = 0, unsigned ByteAlignment = 0); + uint64_t Size = 0, unsigned ByteAlignment = 0); virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment = 0); virtual void EmitBytes(StringRef Data, unsigned AddrSpace); @@ -138,6 +140,26 @@ void MCMachOStreamer::EmitLabel(MCSymbol *Symbol) { SD.setFlags(SD.getFlags() & ~SF_ReferenceTypeMask); } +void MCMachOStreamer::EmitDataRegion(DataRegionData::KindTy Kind) { + // Create a temporary label to mark the start of the data region. + MCSymbol *Start = getContext().CreateTempSymbol(); + EmitLabel(Start); + // Record the region for the object writer to use. + DataRegionData Data = { Kind, Start, NULL }; + std::vector<DataRegionData> &Regions = getAssembler().getDataRegions(); + Regions.push_back(Data); +} + +void MCMachOStreamer::EmitDataRegionEnd() { + std::vector<DataRegionData> &Regions = getAssembler().getDataRegions(); + assert(Regions.size() && "Mismatched .end_data_region!"); + DataRegionData &Data = Regions.back(); + assert(Data.End == NULL && "Mismatched .end_data_region!"); + // Create a temporary label to mark the end of the data region. 
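// The Start label recorded by EmitDataRegion above and the End label created
// here become one data-in-code entry in the object file. For reference, a
// minimal sketch of the directive usage that drives these callbacks
// (hypothetical labels, an assumption, not taken from this patch):
//
//   .data_region jt32        <-- EmitDataRegion(MCDR_DataRegionJT32)
//   .long LJTI0_0
//   .long LJTI0_1
//   .end_data_region         <-- EmitDataRegionEnd()
//
// MachObjectWriter turns each (Start, End) pair into an LC_DATA_IN_CODE
// entry further below.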
+ Data.End = getContext().CreateTempSymbol(); + EmitLabel(Data.End); +} + void MCMachOStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { // Let the target do whatever target specific stuff it needs to do. getAssembler().getBackend().handleAssemblerFlag(Flag); @@ -153,6 +175,26 @@ void MCMachOStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { } } +void MCMachOStreamer::EmitDataRegion(MCDataRegionType Kind) { + switch (Kind) { + case MCDR_DataRegion: + EmitDataRegion(DataRegionData::Data); + return; + case MCDR_DataRegionJT8: + EmitDataRegion(DataRegionData::JumpTable8); + return; + case MCDR_DataRegionJT16: + EmitDataRegion(DataRegionData::JumpTable16); + return; + case MCDR_DataRegionJT32: + EmitDataRegion(DataRegionData::JumpTable32); + return; + case MCDR_DataRegionEnd: + EmitDataRegionEnd(); + return; + } +} + void MCMachOStreamer::EmitThumbFunc(MCSymbol *Symbol) { // Remember that the function is a thumb function. Fixup and relocation // values will need adjusted. @@ -284,7 +326,7 @@ void MCMachOStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, } void MCMachOStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol, - unsigned Size, unsigned ByteAlignment) { + uint64_t Size, unsigned ByteAlignment) { MCSectionData &SectData = getAssembler().getOrCreateSectionData(*Section); // The symbol may not be present, which only creates the section. diff --git a/lib/MC/MCNullStreamer.cpp b/lib/MC/MCNullStreamer.cpp index 7ff2d1b..4c17d91 100644 --- a/lib/MC/MCNullStreamer.cpp +++ b/lib/MC/MCNullStreamer.cpp @@ -63,7 +63,7 @@ namespace { virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment) {} virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0, - unsigned Size = 0, unsigned ByteAlignment = 0) {} + uint64_t Size = 0, unsigned ByteAlignment = 0) {} virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment) {} virtual void EmitBytes(StringRef Data, unsigned AddrSpace) {} @@ -82,7 +82,7 @@ namespace { virtual bool EmitValueToOffset(const MCExpr *Offset, unsigned char Value = 0) { return false; } - + virtual void EmitFileDirective(StringRef Filename) {} virtual bool EmitDwarfFileDirective(unsigned FileNo, StringRef Directory, StringRef Filename) { @@ -99,12 +99,12 @@ namespace { virtual void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) { RecordProcEnd(Frame); } - + /// @} }; } - + MCStreamer *llvm::createNullStreamer(MCContext &Context) { return new MCNullStreamer(Context); } diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp index b22ae33..4e6a1b9 100644 --- a/lib/MC/MCObjectFileInfo.cpp +++ b/lib/MC/MCObjectFileInfo.cpp @@ -169,7 +169,7 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) { Ctx->getMachOSection("__DWARF", "__apple_types", MCSectionMachO::S_ATTR_DEBUG, SectionKind::getMetadata()); - + DwarfAbbrevSection = Ctx->getMachOSection("__DWARF", "__debug_abbrev", MCSectionMachO::S_ATTR_DEBUG, diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp index 8aef43c..2daad0a 100644 --- a/lib/MC/MCParser/AsmParser.cpp +++ b/lib/MC/MCParser/AsmParser.cpp @@ -45,6 +45,8 @@ FatalAssemblerWarnings("fatal-assembler-warnings", namespace { /// \brief Helper class for tracking macro definitions. 
+typedef std::vector<AsmToken> MacroArgument; + struct Macro { StringRef Name; StringRef Body; @@ -178,9 +180,9 @@ private: bool ParseCppHashLineFilenameComment(const SMLoc &L); bool HandleMacroEntry(StringRef Name, SMLoc NameLoc, const Macro *M); - bool expandMacro(SmallString<256> &Buf, StringRef Body, + bool expandMacro(raw_svector_ostream &OS, StringRef Body, const std::vector<StringRef> &Parameters, - const std::vector<std::vector<AsmToken> > &A, + const std::vector<MacroArgument> &A, const SMLoc &L); void HandleMacroExit(); @@ -204,11 +206,18 @@ private: void EatToEndOfStatement(); + bool ParseMacroArgument(MacroArgument &MA); + bool ParseMacroArguments(const Macro *M, std::vector<MacroArgument> &A); + /// \brief Parse up to the end of statement and a return the contents from the /// current token until the end of the statement; the current token on exit /// will be either the EndOfStatement or EOF. StringRef ParseStringToEndOfStatement(); + /// \brief Parse until the end of a statement or a comma is encountered, + /// return the contents from the current token up to the end or comma. + StringRef ParseStringToComma(); + bool ParseAssignment(StringRef Name, bool allow_redef); bool ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc); @@ -245,6 +254,10 @@ private: bool ParseDirectiveIncbin(); // ".incbin" bool ParseDirectiveIf(SMLoc DirectiveLoc); // ".if" + // ".ifb" or ".ifnb", depending on ExpectBlank. + bool ParseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank); + // ".ifc" or ".ifnc", depending on ExpectEqual. + bool ParseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual); // ".ifdef" or ".ifndef", depending on expect_defined bool ParseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined); bool ParseDirectiveElseIf(SMLoc DirectiveLoc); // ".elseif" @@ -257,6 +270,15 @@ private: const MCExpr *ApplyModifierToExpr(const MCExpr *E, MCSymbolRefExpr::VariantKind Variant); + + // Macro-like directives + Macro *ParseMacroLikeBody(SMLoc DirectiveLoc); + void InstantiateMacroLikeBody(Macro *M, SMLoc DirectiveLoc, + raw_svector_ostream &OS); + bool ParseDirectiveRept(SMLoc DirectiveLoc); // ".rept" + bool ParseDirectiveIrp(SMLoc DirectiveLoc); // ".irp" + bool ParseDirectiveIrpc(SMLoc DirectiveLoc); // ".irpc" + bool ParseDirectiveEndr(SMLoc DirectiveLoc); // ".endr" }; /// \brief Generic implementations of directive handling, etc. which is shared @@ -328,6 +350,7 @@ public: AddDirectiveHandler<&GenericAsmParser::ParseDirectiveMacro>(".macro"); AddDirectiveHandler<&GenericAsmParser::ParseDirectiveEndMacro>(".endm"); AddDirectiveHandler<&GenericAsmParser::ParseDirectiveEndMacro>(".endmacro"); + AddDirectiveHandler<&GenericAsmParser::ParseDirectivePurgeMacro>(".purgem"); AddDirectiveHandler<&GenericAsmParser::ParseDirectiveLEB128>(".sleb128"); AddDirectiveHandler<&GenericAsmParser::ParseDirectiveLEB128>(".uleb128"); @@ -359,6 +382,7 @@ public: bool ParseDirectiveMacrosOnOff(StringRef, SMLoc DirectiveLoc); bool ParseDirectiveMacro(StringRef, SMLoc DirectiveLoc); bool ParseDirectiveEndMacro(StringRef, SMLoc DirectiveLoc); + bool ParseDirectivePurgeMacro(StringRef, SMLoc DirectiveLoc); bool ParseDirectiveLEB128(StringRef, SMLoc); }; @@ -456,7 +480,7 @@ bool AsmParser::EnterIncludeFile(const std::string &Filename) { } /// Process the specified .incbin file by seaching for it in the include paths -/// then just emiting the byte contents of the file to the streamer. This +/// then just emitting the byte contents of the file to the streamer. This /// returns true on failure. 
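// An aside on the conditional-assembly directives declared above: .ifb/.ifnb
// test whether their single operand is blank, and .ifc/.ifnc compare two
// comma-separated strings textually (ParseStringToComma exists to grab the
// first operand up to that comma). Example inputs (an assumption, not from
// this patch):
//
//   .ifb \arg                <-- true when the macro argument is blank
//   .long 0
//   .endif
//
//   .ifc foo, foo            <-- raw string comparison; true here
//   .long 1
//   .endif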
bool AsmParser::ProcessIncbinFile(const std::string &Filename) { std::string IncludedFile; @@ -602,6 +626,18 @@ StringRef AsmParser::ParseStringToEndOfStatement() { return StringRef(Start, End - Start); } +StringRef AsmParser::ParseStringToComma() { + const char *Start = getTok().getLoc().getPointer(); + + while (Lexer.isNot(AsmToken::EndOfStatement) && + Lexer.isNot(AsmToken::Comma) && + Lexer.isNot(AsmToken::Eof)) + Lex(); + + const char *End = getTok().getLoc().getPointer(); + return StringRef(Start, End - Start); +} + /// ParseParenExpr - Parse a paren expression and return it. /// NOTE: This assumes the leading '(' has already been consumed. /// @@ -700,7 +736,7 @@ bool AsmParser::ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { IDVal == "f" ? 1 : 0); Res = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, getContext()); - if(IDVal == "b" && Sym->isUndefined()) + if (IDVal == "b" && Sym->isUndefined()) return Error(Loc, "invalid reference to undefined symbol"); EndLoc = Lexer.getLoc(); Lex(); // Eat identifier. @@ -1042,6 +1078,14 @@ bool AsmParser::ParseStatement() { // example. if (IDVal == ".if") return ParseDirectiveIf(IDLoc); + if (IDVal == ".ifb") + return ParseDirectiveIfb(IDLoc, true); + if (IDVal == ".ifnb") + return ParseDirectiveIfb(IDLoc, false); + if (IDVal == ".ifc") + return ParseDirectiveIfc(IDLoc, true); + if (IDVal == ".ifnc") + return ParseDirectiveIfc(IDLoc, false); if (IDVal == ".ifdef") return ParseDirectiveIfdef(IDLoc, true); if (IDVal == ".ifndef" || IDVal == ".ifnotdef") @@ -1123,6 +1167,11 @@ bool AsmParser::ParseStatement() { // Otherwise, we have a normal instruction or directive. if (IDVal[0] == '.' && IDVal != ".") { + + // Target hook for parsing target specific directives. + if (!getTargetParser().ParseDirective(ID)) + return false; + // Assembler features if (IDVal == ".set" || IDVal == ".equ") return ParseDirectiveSet(IDVal, true); @@ -1192,6 +1241,10 @@ bool AsmParser::ParseStatement() { // Symbol attribute directives + if (IDVal == ".extern") { + EatToEndOfStatement(); // .extern is the default, ignore it. + return false; + } if (IDVal == ".globl" || IDVal == ".global") return ParseDirectiveSymbolAttribute(MCSA_Global); if (IDVal == ".indirect_symbol") @@ -1225,22 +1278,27 @@ bool AsmParser::ParseStatement() { if (IDVal == ".incbin") return ParseDirectiveIncbin(); - if (IDVal == ".code16") + if (IDVal == ".code16" || IDVal == ".code16gcc") return TokError(Twine(IDVal) + " not supported yet"); + // Macro-like directives + if (IDVal == ".rept") + return ParseDirectiveRept(IDLoc); + if (IDVal == ".irp") + return ParseDirectiveIrp(IDLoc); + if (IDVal == ".irpc") + return ParseDirectiveIrpc(IDLoc); + if (IDVal == ".endr") + return ParseDirectiveEndr(IDLoc); + // Look up the handler in the handler table. std::pair<MCAsmParserExtension*, DirectiveHandler> Handler = DirectiveMap.lookup(IDVal); if (Handler.first) return (*Handler.second)(Handler.first, IDVal, IDLoc); - // Target hook for parsing target specific directives. 
- if (!getTargetParser().ParseDirective(ID)) - return false; - bool retval = Warning(IDLoc, "ignoring directive for now"); - EatToEndOfStatement(); - return retval; + return Error(IDLoc, "unknown directive"); } CheckForValidSection(); @@ -1339,7 +1397,7 @@ bool AsmParser::ParseCppHashLineFilenameComment(const SMLoc &L) { return false; } -/// DiagHandler - will use the the last parsed cpp hash line filename comment +/// DiagHandler - will use the last parsed cpp hash line filename comment /// for the Filename and LineNo if any in the diagnostic. void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) { const AsmParser *Parser = static_cast<const AsmParser*>(Context); @@ -1393,11 +1451,10 @@ void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) { NewDiag.print(0, OS); } -bool AsmParser::expandMacro(SmallString<256> &Buf, StringRef Body, +bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body, const std::vector<StringRef> &Parameters, - const std::vector<std::vector<AsmToken> > &A, + const std::vector<MacroArgument> &A, const SMLoc &L) { - raw_svector_ostream OS(Buf); unsigned NParameters = Parameters.size(); if (NParameters != 0 && NParameters != A.size()) return Error(L, "Wrong number of arguments"); @@ -1449,7 +1506,7 @@ bool AsmParser::expandMacro(SmallString<256> &Buf, StringRef Body, break; // Otherwise substitute with the token values, with spaces eliminated. - for (std::vector<AsmToken>::const_iterator it = A[Index].begin(), + for (MacroArgument::const_iterator it = A[Index].begin(), ie = A[Index].end(); it != ie; ++it) OS << it->getString(); break; @@ -1472,7 +1529,7 @@ bool AsmParser::expandMacro(SmallString<256> &Buf, StringRef Body, if (Index == NParameters) return Error(L, "Parameter not found"); - for (std::vector<AsmToken>::const_iterator it = A[Index].begin(), + for (MacroArgument::const_iterator it = A[Index].begin(), ie = A[Index].end(); it != ie; ++it) OS << it->getString(); @@ -1482,9 +1539,6 @@ bool AsmParser::expandMacro(SmallString<256> &Buf, StringRef Body, Body = Body.substr(Pos); } - // We include the .endmacro in the buffer as our queue to exit the macro - // instantiation. - OS << ".endmacro\n"; return false; } @@ -1494,55 +1548,92 @@ MacroInstantiation::MacroInstantiation(const Macro *M, SMLoc IL, SMLoc EL, { } -bool AsmParser::HandleMacroEntry(StringRef Name, SMLoc NameLoc, - const Macro *M) { - // Arbitrarily limit macro nesting depth, to match 'as'. We can eliminate - // this, although we should protect against infinite loops. - if (ActiveMacros.size() == 20) - return TokError("macros cannot be nested more than 20 levels deep"); - - // Parse the macro instantiation arguments. - std::vector<std::vector<AsmToken> > MacroArguments; - MacroArguments.push_back(std::vector<AsmToken>()); +/// ParseMacroArgument - Extract AsmTokens for a macro argument. 
+/// This is used for both default macro parameter values and the +/// arguments in macro invocations +bool AsmParser::ParseMacroArgument(MacroArgument &MA) { unsigned ParenLevel = 0; + for (;;) { - if (Lexer.is(AsmToken::Eof)) + SMLoc LastTokenLoc; + + if (Lexer.is(AsmToken::Eof) || Lexer.is(AsmToken::Equal)) return TokError("unexpected token in macro instantiation"); + + // HandleMacroEntry relies on not advancing the lexer here + // to be able to fill in the remaining default parameter values if (Lexer.is(AsmToken::EndOfStatement)) break; + if (ParenLevel == 0 && Lexer.is(AsmToken::Comma)) + break; - // If we aren't inside parentheses and this is a comma, start a new token - // list. - if (ParenLevel == 0 && Lexer.is(AsmToken::Comma)) { - MacroArguments.push_back(std::vector<AsmToken>()); - } else { - // Adjust the current parentheses level. - if (Lexer.is(AsmToken::LParen)) - ++ParenLevel; - else if (Lexer.is(AsmToken::RParen) && ParenLevel) - --ParenLevel; - - // Append the token to the current argument list. - MacroArguments.back().push_back(getTok()); - } + // Adjust the current parentheses level. + if (Lexer.is(AsmToken::LParen)) + ++ParenLevel; + else if (Lexer.is(AsmToken::RParen) && ParenLevel) + --ParenLevel; + + // Append the token to the current argument list. + MA.push_back(getTok()); Lex(); } - // If the last argument didn't end up with any tokens, it's not a real - // argument and we should remove it from the list. This happens with either - // a tailing comma or an empty argument list. - if (MacroArguments.back().empty()) - MacroArguments.pop_back(); + if (ParenLevel != 0) + return TokError("unbalanced parenthesises in macro argument"); + return false; +} + +// Parse the macro instantiation arguments. +bool AsmParser::ParseMacroArguments(const Macro *M, + std::vector<MacroArgument> &A) { + const unsigned NParameters = M ? M->Parameters.size() : 0; + + // Parse two kinds of macro invocations: + // - macros defined without any parameters accept an arbitrary number of them + // - macros defined with parameters accept at most that many of them + for (unsigned Parameter = 0; !NParameters || Parameter < NParameters; + ++Parameter) { + MacroArgument MA; + + if (ParseMacroArgument(MA)) + return true; + + if (!MA.empty()) + A.push_back(MA); + if (Lexer.is(AsmToken::EndOfStatement)) + return false; + + if (Lexer.is(AsmToken::Comma)) + Lex(); + } + return TokError("Too many arguments"); +} + +bool AsmParser::HandleMacroEntry(StringRef Name, SMLoc NameLoc, + const Macro *M) { + // Arbitrarily limit macro nesting depth, to match 'as'. We can eliminate + // this, although we should protect against infinite loops. + if (ActiveMacros.size() == 20) + return TokError("macros cannot be nested more than 20 levels deep"); + + std::vector<MacroArgument> MacroArguments; + if (ParseMacroArguments(M, MacroArguments)) + return true; // Macro instantiation is lexical, unfortunately. We construct a new buffer // to hold the macro body with substitutions. SmallString<256> Buf; StringRef Body = M->Body; + raw_svector_ostream OS(Buf); - if (expandMacro(Buf, Body, M->Parameters, MacroArguments, getTok().getLoc())) + if (expandMacro(OS, Body, M->Parameters, MacroArguments, getTok().getLoc())) return true; + // We include the .endmacro in the buffer as our queue to exit the macro + // instantiation. 
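// In other words, macro invocation is textual: the expanded body is re-lexed
// from a fresh "<instantiation>" MemoryBuffer, and the .endmacro appended
// below is the sentinel that pops back to the original buffer. A sketch of
// the round trip (hypothetical macro, an assumption, not from this patch):
//
//   .macro inc reg
//   addl $1, \reg
//   .endm
//
//   inc %eax                 <-- HandleMacroEntry
//
// expands into an <instantiation> buffer containing:
//
//   addl $1, %eax
//   .endmacro                <-- triggers HandleMacroExit when re-lexed
//
// The .rept/.irp/.irpc directives added further below reuse the same
// mechanism with ".endr" as the sentinel (see InstantiateMacroLikeBody).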
+ OS << ".endmacro\n"; + MemoryBuffer *Instantiation = - MemoryBuffer::getMemBufferCopy(Buf.str(), "<instantiation>"); + MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>"); // Create the macro instantiation object and add to the current macro // instantiation stack. @@ -2295,10 +2386,9 @@ bool AsmParser::ParseDirectiveIncbin() { bool AsmParser::ParseDirectiveIf(SMLoc DirectiveLoc) { TheCondStack.push_back(TheCondState); TheCondState.TheCond = AsmCond::IfCond; - if(TheCondState.Ignore) { + if (TheCondState.Ignore) { EatToEndOfStatement(); - } - else { + } else { int64_t ExprValue; if (ParseAbsoluteExpression(ExprValue)) return true; @@ -2315,6 +2405,61 @@ bool AsmParser::ParseDirectiveIf(SMLoc DirectiveLoc) { return false; } +/// ParseDirectiveIfb +/// ::= .ifb string +bool AsmParser::ParseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank) { + TheCondStack.push_back(TheCondState); + TheCondState.TheCond = AsmCond::IfCond; + + if (TheCondState.Ignore) { + EatToEndOfStatement(); + } else { + StringRef Str = ParseStringToEndOfStatement(); + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.ifb' directive"); + + Lex(); + + TheCondState.CondMet = ExpectBlank == Str.empty(); + TheCondState.Ignore = !TheCondState.CondMet; + } + + return false; +} + +/// ParseDirectiveIfc +/// ::= .ifc string1, string2 +bool AsmParser::ParseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual) { + TheCondStack.push_back(TheCondState); + TheCondState.TheCond = AsmCond::IfCond; + + if (TheCondState.Ignore) { + EatToEndOfStatement(); + } else { + StringRef Str1 = ParseStringToComma(); + + if (getLexer().isNot(AsmToken::Comma)) + return TokError("unexpected token in '.ifc' directive"); + + Lex(); + + StringRef Str2 = ParseStringToEndOfStatement(); + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.ifc' directive"); + + Lex(); + + TheCondState.CondMet = ExpectEqual == (Str1 == Str2); + TheCondState.Ignore = !TheCondState.CondMet; + } + + return false; +} + +/// ParseDirectiveIfdef +/// ::= .ifdef symbol bool AsmParser::ParseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined) { StringRef Name; TheCondStack.push_back(TheCondState); @@ -2853,7 +2998,7 @@ bool GenericAsmParser::ParseDirectiveCFISameValue(StringRef IDVal, /// ParseDirectiveCFIRestore /// ::= .cfi_restore register bool GenericAsmParser::ParseDirectiveCFIRestore(StringRef IDVal, - SMLoc DirectiveLoc) { + SMLoc DirectiveLoc) { int64_t Register = 0; if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc)) return true; @@ -2866,7 +3011,7 @@ bool GenericAsmParser::ParseDirectiveCFIRestore(StringRef IDVal, /// ParseDirectiveCFIEscape /// ::= .cfi_escape expression[,...] 
bool GenericAsmParser::ParseDirectiveCFIEscape(StringRef IDVal, - SMLoc DirectiveLoc) { + SMLoc DirectiveLoc) { std::string Values; int64_t CurrValue; if (getParser().ParseAbsoluteExpression(CurrValue)) @@ -2998,6 +3143,27 @@ bool GenericAsmParser::ParseDirectiveEndMacro(StringRef Directive, "no current macro definition"); } +/// ParseDirectivePurgeMacro +/// ::= .purgem +bool GenericAsmParser::ParseDirectivePurgeMacro(StringRef Directive, + SMLoc DirectiveLoc) { + StringRef Name; + if (getParser().ParseIdentifier(Name)) + return TokError("expected identifier in '.purgem' directive"); + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.purgem' directive"); + + StringMap<Macro*>::iterator I = getParser().MacroMap.find(Name); + if (I == getParser().MacroMap.end()) + return Error(DirectiveLoc, "macro '" + Name + "' is not defined"); + + // Undefine the macro. + delete I->getValue(); + getParser().MacroMap.erase(I); + return false; +} + bool GenericAsmParser::ParseDirectiveLEB128(StringRef DirName, SMLoc) { getParser().CheckForValidSection(); @@ -3017,6 +3183,217 @@ bool GenericAsmParser::ParseDirectiveLEB128(StringRef DirName, SMLoc) { return false; } +Macro *AsmParser::ParseMacroLikeBody(SMLoc DirectiveLoc) { + AsmToken EndToken, StartToken = getTok(); + + unsigned NestLevel = 0; + for (;;) { + // Check whether we have reached the end of the file. + if (getLexer().is(AsmToken::Eof)) { + Error(DirectiveLoc, "no matching '.endr' in definition"); + return 0; + } + + if (Lexer.is(AsmToken::Identifier) && + (getTok().getIdentifier() == ".rept")) { + ++NestLevel; + } + + // Otherwise, check whether we have reached the .endr. + if (Lexer.is(AsmToken::Identifier) && + getTok().getIdentifier() == ".endr") { + if (NestLevel == 0) { + EndToken = getTok(); + Lex(); + if (Lexer.isNot(AsmToken::EndOfStatement)) { + TokError("unexpected token in '.endr' directive"); + return 0; + } + break; + } + --NestLevel; + } + + // Otherwise, scan till the end of the statement. + EatToEndOfStatement(); + } + + const char *BodyStart = StartToken.getLoc().getPointer(); + const char *BodyEnd = EndToken.getLoc().getPointer(); + StringRef Body = StringRef(BodyStart, BodyEnd - BodyStart); + + // We Are Anonymous. + StringRef Name; + std::vector<StringRef> Parameters; + return new Macro(Name, Body, Parameters); +} + +void AsmParser::InstantiateMacroLikeBody(Macro *M, SMLoc DirectiveLoc, + raw_svector_ostream &OS) { + OS << ".endr\n"; + + MemoryBuffer *Instantiation = + MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>"); + + // Create the macro instantiation object and add to the current macro + // instantiation stack. + MacroInstantiation *MI = new MacroInstantiation(M, DirectiveLoc, + getTok().getLoc(), + Instantiation); + ActiveMacros.push_back(MI); + + // Jump to the macro instantiation and prime the lexer. + CurBuffer = SrcMgr.AddNewSourceBuffer(MI->Instantiation, SMLoc()); + Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)); + Lex(); +} + +bool AsmParser::ParseDirectiveRept(SMLoc DirectiveLoc) { + int64_t Count; + if (ParseAbsoluteExpression(Count)) + return TokError("unexpected token in '.rept' directive"); + + if (Count < 0) + return TokError("Count is negative"); + + if (Lexer.isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.rept' directive"); + + // Eat the end of statement. + Lex(); + + // Lex the rept definition. + Macro *M = ParseMacroLikeBody(DirectiveLoc); + if (!M) + return true; + + // Macro instantiation is lexical, unfortunately. 
We construct a new buffer + // to hold the macro body with substitutions. + SmallString<256> Buf; + std::vector<StringRef> Parameters; + const std::vector<MacroArgument> A; + raw_svector_ostream OS(Buf); + while (Count--) { + if (expandMacro(OS, M->Body, Parameters, A, getTok().getLoc())) + return true; + } + InstantiateMacroLikeBody(M, DirectiveLoc, OS); + + return false; +} + +/// ParseDirectiveIrp +/// ::= .irp symbol,values +bool AsmParser::ParseDirectiveIrp(SMLoc DirectiveLoc) { + std::vector<StringRef> Parameters; + StringRef Parameter; + + if (ParseIdentifier(Parameter)) + return TokError("expected identifier in '.irp' directive"); + + Parameters.push_back(Parameter); + + if (Lexer.isNot(AsmToken::Comma)) + return TokError("expected comma in '.irp' directive"); + + Lex(); + + std::vector<MacroArgument> A; + if (ParseMacroArguments(0, A)) + return true; + + // Eat the end of statement. + Lex(); + + // Lex the irp definition. + Macro *M = ParseMacroLikeBody(DirectiveLoc); + if (!M) + return true; + + // Macro instantiation is lexical, unfortunately. We construct a new buffer + // to hold the macro body with substitutions. + SmallString<256> Buf; + raw_svector_ostream OS(Buf); + + for (std::vector<MacroArgument>::iterator i = A.begin(), e = A.end(); i != e; + ++i) { + std::vector<MacroArgument> Args; + Args.push_back(*i); + + if (expandMacro(OS, M->Body, Parameters, Args, getTok().getLoc())) + return true; + } + + InstantiateMacroLikeBody(M, DirectiveLoc, OS); + + return false; +} + +/// ParseDirectiveIrpc +/// ::= .irpc symbol,values +bool AsmParser::ParseDirectiveIrpc(SMLoc DirectiveLoc) { + std::vector<StringRef> Parameters; + StringRef Parameter; + + if (ParseIdentifier(Parameter)) + return TokError("expected identifier in '.irpc' directive"); + + Parameters.push_back(Parameter); + + if (Lexer.isNot(AsmToken::Comma)) + return TokError("expected comma in '.irpc' directive"); + + Lex(); + + std::vector<MacroArgument> A; + if (ParseMacroArguments(0, A)) + return true; + + if (A.size() != 1 || A.front().size() != 1) + return TokError("unexpected token in '.irpc' directive"); + + // Eat the end of statement. + Lex(); + + // Lex the irpc definition. + Macro *M = ParseMacroLikeBody(DirectiveLoc); + if (!M) + return true; + + // Macro instantiation is lexical, unfortunately. We construct a new buffer + // to hold the macro body with substitutions. + SmallString<256> Buf; + raw_svector_ostream OS(Buf); + + StringRef Values = A.front().front().getString(); + std::size_t I, End = Values.size(); + for (I = 0; I < End; ++I) { + MacroArgument Arg; + Arg.push_back(AsmToken(AsmToken::Identifier, Values.slice(I, I+1))); + + std::vector<MacroArgument> Args; + Args.push_back(Arg); + + if (expandMacro(OS, M->Body, Parameters, Args, getTok().getLoc())) + return true; + } + + InstantiateMacroLikeBody(M, DirectiveLoc, OS); + + return false; +} + +bool AsmParser::ParseDirectiveEndr(SMLoc DirectiveLoc) { + if (ActiveMacros.empty()) + return TokError("unexpected '.endr' directive, no current .rept"); + + // The only .repl that should get here are the ones created by + // InstantiateMacroLikeBody. + assert(getLexer().is(AsmToken::EndOfStatement)); + + HandleMacroExit(); + return false; +} /// \brief Create an MCAsmParser instance.
MCAsmParser *llvm::createMCAsmParser(SourceMgr &SM, diff --git a/lib/MC/MCParser/DarwinAsmParser.cpp b/lib/MC/MCParser/DarwinAsmParser.cpp index 6f45068..5662fea 100644 --- a/lib/MC/MCParser/DarwinAsmParser.cpp +++ b/lib/MC/MCParser/DarwinAsmParser.cpp @@ -14,6 +14,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/MemoryBuffer.h" @@ -56,6 +57,9 @@ public: AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveTBSS>(".tbss"); AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveZerofill>(".zerofill"); + AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveDataRegion>(".data_region"); + AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveDataRegionEnd>(".end_data_region"); + // Special section directives. AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveConst>(".const"); AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveConstData>(".const_data"); @@ -113,6 +117,8 @@ public: bool ParseDirectiveSubsectionsViaSymbols(StringRef, SMLoc); bool ParseDirectiveTBSS(StringRef, SMLoc); bool ParseDirectiveZerofill(StringRef, SMLoc); + bool ParseDirectiveDataRegion(StringRef, SMLoc); + bool ParseDirectiveDataRegionEnd(StringRef, SMLoc); // Named Section Directive bool ParseSectionDirectiveConst(StringRef, SMLoc) { @@ -659,6 +665,42 @@ bool DarwinAsmParser::ParseDirectiveZerofill(StringRef, SMLoc) { return false; } +/// ParseDirectiveDataRegion +/// ::= .data_region [ ( jt8 | jt16 | jt32 ) ] +bool DarwinAsmParser::ParseDirectiveDataRegion(StringRef, SMLoc) { + if (getLexer().is(AsmToken::EndOfStatement)) { + Lex(); + getStreamer().EmitDataRegion(MCDR_DataRegion); + return false; + } + StringRef RegionType; + SMLoc Loc = getParser().getTok().getLoc(); + if (getParser().ParseIdentifier(RegionType)) + return TokError("expected region type after '.data_region' directive"); + int Kind = StringSwitch<int>(RegionType) + .Case("jt8", MCDR_DataRegionJT8) + .Case("jt16", MCDR_DataRegionJT16) + .Case("jt32", MCDR_DataRegionJT32) + .Default(-1); + if (Kind == -1) + return Error(Loc, "unknown region type in '.data_region' directive"); + Lex(); + + getStreamer().EmitDataRegion((MCDataRegionType)Kind); + return false; +} + +/// ParseDirectiveDataRegionEnd +/// ::= .end_data_region +bool DarwinAsmParser::ParseDirectiveDataRegionEnd(StringRef, SMLoc) { + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.end_data_region' directive"); + + Lex(); + getStreamer().EmitDataRegion(MCDR_DataRegionEnd); + return false; +} + namespace llvm { MCAsmParserExtension *createDarwinAsmParser() { diff --git a/lib/MC/MCParser/ELFAsmParser.cpp b/lib/MC/MCParser/ELFAsmParser.cpp index ffc400b..9316bb1 100644 --- a/lib/MC/MCParser/ELFAsmParser.cpp +++ b/lib/MC/MCParser/ELFAsmParser.cpp @@ -64,6 +64,7 @@ public: AddDirectiveHandler<&ELFAsmParser::ParseDirectiveType>(".type"); AddDirectiveHandler<&ELFAsmParser::ParseDirectiveIdent>(".ident"); AddDirectiveHandler<&ELFAsmParser::ParseDirectiveSymver>(".symver"); + AddDirectiveHandler<&ELFAsmParser::ParseDirectiveVersion>(".version"); AddDirectiveHandler<&ELFAsmParser::ParseDirectiveWeakref>(".weakref"); AddDirectiveHandler<&ELFAsmParser::ParseDirectiveSymbolAttribute>(".weak"); AddDirectiveHandler<&ELFAsmParser::ParseDirectiveSymbolAttribute>(".local"); @@ -141,6 +142,7 @@ public: bool ParseDirectiveType(StringRef, SMLoc); bool ParseDirectiveIdent(StringRef, 
SMLoc); bool ParseDirectiveSymver(StringRef, SMLoc); + bool ParseDirectiveVersion(StringRef, SMLoc); bool ParseDirectiveWeakref(StringRef, SMLoc); bool ParseDirectiveSymbolAttribute(StringRef, SMLoc); @@ -548,6 +550,32 @@ bool ELFAsmParser::ParseDirectiveSymver(StringRef, SMLoc) { return false; } +/// ParseDirectiveVersion +/// ::= .version string +bool ELFAsmParser::ParseDirectiveVersion(StringRef, SMLoc) { + if (getLexer().isNot(AsmToken::String)) + return TokError("unexpected token in '.version' directive"); + + StringRef Data = getTok().getIdentifier(); + + Lex(); + + const MCSection *Note = + getContext().getELFSection(".note", ELF::SHT_NOTE, 0, + SectionKind::getReadOnly()); + + getStreamer().PushSection(); + getStreamer().SwitchSection(Note); + getStreamer().EmitIntValue(Data.size()+1, 4); // namesz. + getStreamer().EmitIntValue(0, 4); // descsz = 0 (no description). + getStreamer().EmitIntValue(1, 4); // type = NT_VERSION. + getStreamer().EmitBytes(Data, 0); // name. + getStreamer().EmitIntValue(0, 1); // terminate the string. + getStreamer().EmitValueToAlignment(4); // ensure 4 byte alignment. + getStreamer().PopSection(); + return false; +} + /// ParseDirectiveWeakref /// ::= .weakref foo, bar bool ELFAsmParser::ParseDirectiveWeakref(StringRef, SMLoc) { diff --git a/lib/MC/MCPureStreamer.cpp b/lib/MC/MCPureStreamer.cpp index a770c97..9ccab93 100644 --- a/lib/MC/MCPureStreamer.cpp +++ b/lib/MC/MCPureStreamer.cpp @@ -39,7 +39,7 @@ public: virtual void EmitLabel(MCSymbol *Symbol); virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value); virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0, - unsigned Size = 0, unsigned ByteAlignment = 0); + uint64_t Size = 0, unsigned ByteAlignment = 0); virtual void EmitBytes(StringRef Data, unsigned AddrSpace); virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value = 0, unsigned ValueSize = 1, @@ -144,7 +144,7 @@ void MCPureStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) { } void MCPureStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol, - unsigned Size, unsigned ByteAlignment) { + uint64_t Size, unsigned ByteAlignment) { report_fatal_error("not yet implemented in pure streamer"); } diff --git a/lib/MC/MCSectionCOFF.cpp b/lib/MC/MCSectionCOFF.cpp index 90091f0..aac9377 100644 --- a/lib/MC/MCSectionCOFF.cpp +++ b/lib/MC/MCSectionCOFF.cpp @@ -20,7 +20,7 @@ MCSectionCOFF::~MCSectionCOFF() {} // anchor. // should be printed before the section name bool MCSectionCOFF::ShouldOmitSectionDirective(StringRef Name, const MCAsmInfo &MAI) const { - + // FIXME: Does .section .bss/.data/.text work everywhere?? 
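// In practice that means the three standard sections switch with the short
// form while everything else keeps the full directive (syntax shown is an
// assumption, for illustration):
//
//   .text                    <-- ShouldOmitSectionDirective returns true
//   .section .rdata,"dr"     <-- all other COFF sections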
if (Name == ".text" || Name == ".data" || Name == ".bss") return true; @@ -30,7 +30,7 @@ bool MCSectionCOFF::ShouldOmitSectionDirective(StringRef Name, void MCSectionCOFF::PrintSwitchToSection(const MCAsmInfo &MAI, raw_ostream &OS) const { - + // standard sections don't require the '.section' if (ShouldOmitSectionDirective(SectionName, MAI)) { OS << '\t' << getSectionName() << '\n'; @@ -47,7 +47,7 @@ void MCSectionCOFF::PrintSwitchToSection(const MCAsmInfo &MAI, if (getCharacteristics() & COFF::IMAGE_SCN_MEM_DISCARDABLE) OS << 'n'; OS << "\"\n"; - + if (getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT) { switch (Selection) { case COFF::IMAGE_COMDAT_SELECT_NODUPLICATES: diff --git a/lib/MC/MCSectionELF.cpp b/lib/MC/MCSectionELF.cpp index dfd77c3..0775cfa 100644 --- a/lib/MC/MCSectionELF.cpp +++ b/lib/MC/MCSectionELF.cpp @@ -22,7 +22,7 @@ MCSectionELF::~MCSectionELF() {} // anchor. // should be printed before the section name bool MCSectionELF::ShouldOmitSectionDirective(StringRef Name, const MCAsmInfo &MAI) const { - + // FIXME: Does .section .bss/.data/.text work everywhere?? if (Name == ".text" || Name == ".data" || (Name == ".bss" && !MAI.usesELFSectionDirectiveForBSS())) @@ -33,7 +33,7 @@ bool MCSectionELF::ShouldOmitSectionDirective(StringRef Name, void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI, raw_ostream &OS) const { - + if (ShouldOmitSectionDirective(SectionName, MAI)) { OS << '\t' << getSectionName() << '\n'; return; @@ -62,7 +62,7 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI, } // Handle the weird solaris syntax if desired. - if (MAI.usesSunStyleELFSectionSwitchSyntax() && + if (MAI.usesSunStyleELFSectionSwitchSyntax() && !(Flags & ELF::SHF_MERGE)) { if (Flags & ELF::SHF_ALLOC) OS << ",#alloc"; @@ -75,7 +75,7 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI, OS << '\n'; return; } - + OS << ",\""; if (Flags & ELF::SHF_ALLOC) OS << 'a'; @@ -91,13 +91,13 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI, OS << 'S'; if (Flags & ELF::SHF_TLS) OS << 'T'; - + // If there are target-specific flags, print them. if (Flags & ELF::XCORE_SHF_CP_SECTION) OS << 'c'; if (Flags & ELF::XCORE_SHF_DP_SECTION) OS << 'd'; - + OS << '"'; OS << ','; diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp index 43e62ff..e363f28 100644 --- a/lib/MC/MCStreamer.cpp +++ b/lib/MC/MCStreamer.cpp @@ -20,12 +20,9 @@ #include <cstdlib> using namespace llvm; -MCStreamer::MCStreamer(MCContext &Ctx) : Context(Ctx), EmitEHFrame(true), - EmitDebugFrame(false), - CurrentW64UnwindInfo(0), - LastSymbol(0), - UniqueCodeBeginSuffix(0), - UniqueDataBeginSuffix(0) { +MCStreamer::MCStreamer(MCContext &Ctx) + : Context(Ctx), EmitEHFrame(true), EmitDebugFrame(false), + CurrentW64UnwindInfo(0), LastSymbol(0) { const MCSection *section = NULL; SectionStack.push_back(std::make_pair(section, section)); } @@ -183,85 +180,6 @@ void MCStreamer::EmitLabel(MCSymbol *Symbol) { LastSymbol = Symbol; } -void MCStreamer::EmitDataRegion() { - if (RegionIndicator == Data) return; - - MCContext &Context = getContext(); - const MCAsmInfo &MAI = Context.getAsmInfo(); - if (!MAI.getSupportsDataRegions()) return; - - // Generate a unique symbol name. 
- MCSymbol *NewSym = Context.GetOrCreateSymbol(MAI.getDataBeginLabelName() + - Twine(UniqueDataBeginSuffix++)); - EmitLabel(NewSym); - - RegionIndicator = Data; -} - -void MCStreamer::EmitCodeRegion() { - if (RegionIndicator == Code) return; - - MCContext &Context = getContext(); - const MCAsmInfo &MAI = Context.getAsmInfo(); - if (!MAI.getSupportsDataRegions()) return; - - // Generate a unique symbol name. - MCSymbol *NewSym = Context.GetOrCreateSymbol(MAI.getCodeBeginLabelName() + - Twine(UniqueCodeBeginSuffix++)); - EmitLabel(NewSym); - - RegionIndicator = Code; -} - -void MCStreamer::EmitJumpTable8Region() { - if (RegionIndicator == JumpTable8) return; - - MCContext &Context = getContext(); - const MCAsmInfo &MAI = Context.getAsmInfo(); - if (!MAI.getSupportsDataRegions()) return; - - // Generate a unique symbol name. - MCSymbol *NewSym = - Context.GetOrCreateSymbol(MAI.getJumpTable8BeginLabelName() + - Twine(UniqueDataBeginSuffix++)); - EmitLabel(NewSym); - - RegionIndicator = JumpTable8; -} - -void MCStreamer::EmitJumpTable16Region() { - if (RegionIndicator == JumpTable16) return; - - MCContext &Context = getContext(); - const MCAsmInfo &MAI = Context.getAsmInfo(); - if (!MAI.getSupportsDataRegions()) return; - - // Generate a unique symbol name. - MCSymbol *NewSym = - Context.GetOrCreateSymbol(MAI.getJumpTable16BeginLabelName() + - Twine(UniqueDataBeginSuffix++)); - EmitLabel(NewSym); - - RegionIndicator = JumpTable16; -} - - -void MCStreamer::EmitJumpTable32Region() { - if (RegionIndicator == JumpTable32) return; - - MCContext &Context = getContext(); - const MCAsmInfo &MAI = Context.getAsmInfo(); - if (!MAI.getSupportsDataRegions()) return; - - // Generate a unique symbol name. - MCSymbol *NewSym = - Context.GetOrCreateSymbol(MAI.getJumpTable32BeginLabelName() + - Twine(UniqueDataBeginSuffix++)); - EmitLabel(NewSym); - - RegionIndicator = JumpTable32; -} - void MCStreamer::EmitCompactUnwindEncoding(uint32_t CompactUnwindEncoding) { EnsureValidFrame(); MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); @@ -283,7 +201,6 @@ void MCStreamer::EmitCFIStartProc() { EmitCFIStartProcImpl(Frame); FrameInfos.push_back(Frame); - RegionIndicator = Code; } void MCStreamer::EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) { diff --git a/lib/MC/MCSubtargetInfo.cpp b/lib/MC/MCSubtargetInfo.cpp index 86dc108..05c83f7 100644 --- a/lib/MC/MCSubtargetInfo.cpp +++ b/lib/MC/MCSubtargetInfo.cpp @@ -17,11 +17,13 @@ using namespace llvm; +MCSchedModel MCSchedModel::DefaultSchedModel; // For unknown processors. 
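// getSchedModelForCPU below binary-searches a (CPU name, MCSchedModel*)
// table that TableGen emits sorted by name, and substitutes this default
// model when the CPU is unknown. A self-contained sketch of that lookup
// pattern (hypothetical table and values, an assumption, not part of the
// patch):
//
//   #include <algorithm>
//   #include <cstring>
//
//   struct InfoKV { const char *Key; const void *Value; };
//   static bool LessKV(const InfoKV &A, const InfoKV &B) {
//     return std::strcmp(A.Key, B.Key) < 0;
//   }
//
//   static const int DefaultModel = 0, CoreModel = 1;
//   static const InfoKV Table[] = {      // must stay sorted by Key
//     { "core2",   &CoreModel },
//     { "generic", &DefaultModel }
//   };
//
//   static const void *Lookup(const char *CPU) {
//     const InfoKV Key = { CPU, 0 };
//     const InfoKV *E = Table + sizeof(Table) / sizeof(Table[0]);
//     const InfoKV *I = std::lower_bound(Table, E, Key, LessKV);
//     if (I == E || std::strcmp(I->Key, CPU) != 0)
//       return &DefaultModel;            // mirrors the fallback below
//     return I->Value;
//   }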
+ void MCSubtargetInfo::InitMCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS, const SubtargetFeatureKV *PF, const SubtargetFeatureKV *PD, - const SubtargetInfoKV *PI, + const SubtargetInfoKV *ProcSched, const InstrStage *IS, const unsigned *OC, const unsigned *FP, @@ -29,10 +31,10 @@ MCSubtargetInfo::InitMCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS, TargetTriple = TT; ProcFeatures = PF; ProcDesc = PD; - ProcItins = PI; + ProcSchedModel = ProcSched; Stages = IS; OperandCycles = OC; - ForwardingPathes = FP; + ForwardingPaths = FP; NumFeatures = NF; NumProcs = NP; @@ -68,14 +70,14 @@ uint64_t MCSubtargetInfo::ToggleFeature(StringRef FS) { } -InstrItineraryData -MCSubtargetInfo::getInstrItineraryForCPU(StringRef CPU) const { - assert(ProcItins && "Instruction itineraries information not available!"); +MCSchedModel * +MCSubtargetInfo::getSchedModelForCPU(StringRef CPU) const { + assert(ProcSchedModel && "Processor machine model not available!"); #ifndef NDEBUG for (size_t i = 1; i < NumProcs; i++) { - assert(strcmp(ProcItins[i - 1].Key, ProcItins[i].Key) < 0 && - "Itineraries table is not sorted"); + assert(strcmp(ProcSchedModel[i - 1].Key, ProcSchedModel[i].Key) < 0 && + "Processor machine model table is not sorted"); } #endif @@ -83,14 +85,19 @@ MCSubtargetInfo::getInstrItineraryForCPU(StringRef CPU) const { SubtargetInfoKV KV; KV.Key = CPU.data(); const SubtargetInfoKV *Found = - std::lower_bound(ProcItins, ProcItins+NumProcs, KV); - if (Found == ProcItins+NumProcs || StringRef(Found->Key) != CPU) { + std::lower_bound(ProcSchedModel, ProcSchedModel+NumProcs, KV); + if (Found == ProcSchedModel+NumProcs || StringRef(Found->Key) != CPU) { errs() << "'" << CPU << "' is not a recognized processor for this target" << " (ignoring processor)\n"; - return InstrItineraryData(); + return &MCSchedModel::DefaultSchedModel; } + assert(Found->Value && "Missing processor SchedModel value"); + return (MCSchedModel *)Found->Value; +} - return InstrItineraryData(Stages, OperandCycles, ForwardingPathes, - (InstrItinerary *)Found->Value); +InstrItineraryData +MCSubtargetInfo::getInstrItineraryForCPU(StringRef CPU) const { + MCSchedModel *SchedModel = getSchedModelForCPU(CPU); + return InstrItineraryData(SchedModel, Stages, OperandCycles, ForwardingPaths); } diff --git a/lib/MC/MCSymbol.cpp b/lib/MC/MCSymbol.cpp index e013e77..f7f9184 100644 --- a/lib/MC/MCSymbol.cpp +++ b/lib/MC/MCSymbol.cpp @@ -30,7 +30,7 @@ static bool isAcceptableChar(char C) { /// syntactically correct. static bool NameNeedsQuoting(StringRef Str) { assert(!Str.empty() && "Cannot create an empty MCSymbol"); - + // If any of the characters in the string is an unacceptable character, force // quotes. 
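// For instance (an illustration, not from this patch): a name like
// my_func.eh consists only of acceptable characters and is printed bare,
// while a name containing a space or another odd character is wrapped in
// quotes by MCSymbol::print below:
//
//   my_func.eh               <-- printed as-is
//   "strange name"           <-- forced into quotes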
for (unsigned i = 0, e = Str.size(); i != e; ++i) @@ -72,7 +72,7 @@ void MCSymbol::print(raw_ostream &OS) const { OS << getName(); return; } - + OS << '"' << getName() << '"'; } diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp index 8e4066c..5820a22 100644 --- a/lib/MC/MachObjectWriter.cpp +++ b/lib/MC/MachObjectWriter.cpp @@ -21,6 +21,7 @@ #include "llvm/MC/MCMachOSymbolFlags.h" #include "llvm/MC/MCValue.h" #include "llvm/Object/MachOFormat.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include <vector> @@ -351,6 +352,21 @@ void MachObjectWriter::WriteNlist(MachSymbolData &MSD, Write32(Address); } +void MachObjectWriter::WriteLinkeditLoadCommand(uint32_t Type, + uint32_t DataOffset, + uint32_t DataSize) { + uint64_t Start = OS.tell(); + (void) Start; + + Write32(Type); + Write32(macho::LinkeditLoadCommandSize); + Write32(DataOffset); + Write32(DataSize); + + assert(OS.tell() - Start == macho::LinkeditLoadCommandSize); +} + + void MachObjectWriter::RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFragment *Fragment, @@ -654,6 +670,13 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm, macho::DysymtabLoadCommandSize); } + // Add the data-in-code load command size, if used. + unsigned NumDataRegions = Asm.getDataRegions().size(); + if (NumDataRegions) { + ++NumLoadCommands; + LoadCommandsSize += macho::LinkeditLoadCommandSize; + } + // Compute the total size of the section data, as well as its file size and vm // size. uint64_t SectionDataStart = (is64Bit() ? macho::Header64Size : @@ -701,6 +724,15 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm, RelocTableEnd += NumRelocs * macho::RelocationInfoSize; } + // Write the data-in-code load command, if used. + uint64_t DataInCodeTableEnd = RelocTableEnd + NumDataRegions * 8; + if (NumDataRegions) { + uint64_t DataRegionsOffset = RelocTableEnd; + uint64_t DataRegionsSize = NumDataRegions * 8; + WriteLinkeditLoadCommand(macho::LCT_DataInCode, DataRegionsOffset, + DataRegionsSize); + } + // Write the symbol table load command, if used. if (NumSymbols) { unsigned FirstLocalSymbol = 0; @@ -717,10 +749,10 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm, // If used, the indirect symbols are written after the section data. if (NumIndirectSymbols) - IndirectSymbolOffset = RelocTableEnd; + IndirectSymbolOffset = DataInCodeTableEnd; // The symbol table is written after the indirect symbol data. - uint64_t SymbolTableOffset = RelocTableEnd + IndirectSymbolSize; + uint64_t SymbolTableOffset = DataInCodeTableEnd + IndirectSymbolSize; // The string table is written after symbol table. uint64_t StringTableOffset = @@ -760,6 +792,23 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm, } } + // Write out the data-in-code region payload, if there is one. + for (MCAssembler::const_data_region_iterator + it = Asm.data_region_begin(), ie = Asm.data_region_end(); + it != ie; ++it) { + const DataRegionData *Data = &(*it); + uint64_t Start = getSymbolAddress(&Layout.getAssembler().getSymbolData(*Data->Start), Layout); + uint64_t End = getSymbolAddress(&Layout.getAssembler().getSymbolData(*Data->End), Layout); + DEBUG(dbgs() << "data in code region-- kind: " << Data->Kind + << " start: " << Start << "(" << Data->Start->getName() << ")" + << " end: " << End << "(" << Data->End->getName() << ")" + << " size: " << End - Start + << "\n"); + Write32(Start); + Write16(End - Start); + Write16(Data->Kind); + } + // Write the symbol table data, if used. 
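// Each region recorded by the streamer was just written above as one 8-byte
// data-in-code record. A sketch of the on-disk layout (mirroring the
// macho::DataInCodeTableEntry that MachOObject.cpp reads back below; the
// struct shown here is illustrative):
//
//   struct DataInCodeTableEntry {
//     uint32_t Offset;   // Write32(Start): where the data region begins
//     uint16_t Length;   // Write16(End - Start): region size in bytes
//     uint16_t Kind;     // Write16(Data->Kind): data, jt8, jt16, or jt32
//   };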
if (NumSymbols) { // Write the indirect symbol entries. diff --git a/lib/MC/SubtargetFeature.cpp b/lib/MC/SubtargetFeature.cpp index be41579..0a44e77 100644 --- a/lib/MC/SubtargetFeature.cpp +++ b/lib/MC/SubtargetFeature.cpp @@ -92,7 +92,7 @@ static void Split(std::vector<std::string> &V, const StringRef S) { static std::string Join(const std::vector<std::string> &V) { // Start with empty string. std::string Result; - // If the vector is not empty + // If the vector is not empty if (!V.empty()) { // Start with the first feature Result = V[0]; @@ -104,7 +104,7 @@ static std::string Join(const std::vector<std::string> &V) { Result += V[i]; } } - // Return the features string + // Return the features string return Result; } @@ -205,7 +205,7 @@ void SetImpliedBits(uint64_t &Bits, const SubtargetFeatureKV *FeatureEntry, /// ClearImpliedBits - For each feature that (transitively) implies this /// feature, clear it. -/// +/// static void ClearImpliedBits(uint64_t &Bits, const SubtargetFeatureKV *FeatureEntry, const SubtargetFeatureKV *FeatureTable, @@ -252,7 +252,7 @@ SubtargetFeatures::ToggleFeature(uint64_t Bits, const StringRef Feature, return Bits; } - + /// getFeatureBits - Get feature bits a CPU. /// @@ -279,7 +279,7 @@ uint64_t SubtargetFeatures::getFeatureBits(const StringRef CPU, // Check if help is needed if (CPU == "help") Help(CPUTable, CPUTableSize, FeatureTable, FeatureTableSize); - + // Find CPU entry if CPU name is specified. if (!CPU.empty()) { const SubtargetFeatureKV *CPUEntry = Find(CPU, CPUTable, CPUTableSize); @@ -304,11 +304,11 @@ uint64_t SubtargetFeatures::getFeatureBits(const StringRef CPU, // Iterate through each feature for (size_t i = 0, E = Features.size(); i < E; i++) { const StringRef Feature = Features[i]; - + // Check for help if (Feature == "+help") Help(CPUTable, CPUTableSize, FeatureTable, FeatureTableSize); - + // Find feature in table. const SubtargetFeatureKV *FeatureEntry = Find(StripFlag(Feature), FeatureTable, FeatureTableSize); @@ -349,7 +349,7 @@ void *SubtargetFeatures::getItinerary(const StringRef CPU, // Find entry const SubtargetInfoKV *Entry = Find(CPU, Table, TableSize); - + if (Entry) { return Entry->Value; } else { diff --git a/lib/MC/WinCOFFStreamer.cpp b/lib/MC/WinCOFFStreamer.cpp index 67dc649..b026277 100644 --- a/lib/MC/WinCOFFStreamer.cpp +++ b/lib/MC/WinCOFFStreamer.cpp @@ -67,7 +67,7 @@ public: virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment); virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol, - unsigned Size,unsigned ByteAlignment); + uint64_t Size,unsigned ByteAlignment); virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment); virtual void EmitBytes(StringRef Data, unsigned AddrSpace); @@ -324,7 +324,7 @@ void WinCOFFStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, } void WinCOFFStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol, - unsigned Size,unsigned ByteAlignment) { + uint64_t Size,unsigned ByteAlignment) { llvm_unreachable("not implemented"); } diff --git a/lib/Object/Archive.cpp b/lib/Object/Archive.cpp index c5f15ba..2a5951a 100644 --- a/lib/Object/Archive.cpp +++ b/lib/Object/Archive.cpp @@ -28,7 +28,7 @@ struct ArchiveMemberHeader { char UID[6]; char GID[6]; char AccessMode[8]; - char Size[10]; //< Size of data, not including header or padding. + char Size[10]; ///< Size of data, not including header or padding. char Terminator[2]; ///! 
Get the name without looking up long names. @@ -60,11 +60,11 @@ static const ArchiveMemberHeader *ToHeader(const char *base) { static bool isInternalMember(const ArchiveMemberHeader &amh) { - const char *internals[] = { + static const char *const internals[] = { "/", "//", "#_LLVM_SYM_TAB_#" - }; + }; StringRef name = amh.getName(); for (std::size_t i = 0; i < sizeof(internals) / sizeof(*internals); ++i) { diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp index 8024ac8..7766946d 100644 --- a/lib/Object/COFFObjectFile.cpp +++ b/lib/Object/COFFObjectFile.cpp @@ -624,6 +624,28 @@ error_code COFFObjectFile::getSymbolName(const coff_symbol *symbol, return object_error::success; } +ArrayRef<uint8_t> COFFObjectFile::getSymbolAuxData( + const coff_symbol *symbol) const { + const uint8_t *aux = NULL; + + if ( symbol->NumberOfAuxSymbols > 0 ) { + // AUX data comes immediately after the symbol in COFF + aux = reinterpret_cast<const uint8_t *>(symbol + 1); +# ifndef NDEBUG + // Verify that the aux symbol points to a valid entry in the symbol table. + uintptr_t offset = uintptr_t(aux) - uintptr_t(base()); + if (offset < Header->PointerToSymbolTable + || offset >= Header->PointerToSymbolTable + + (Header->NumberOfSymbols * sizeof(coff_symbol))) + report_fatal_error("Aux Symbol data was outside of symbol table."); + + assert((offset - Header->PointerToSymbolTable) % sizeof(coff_symbol) + == 0 && "Aux Symbol data did not point to the beginning of a symbol"); +# endif + } + return ArrayRef<uint8_t>(aux, symbol->NumberOfAuxSymbols * sizeof(coff_symbol)); +} + error_code COFFObjectFile::getSectionName(const coff_section *Sec, StringRef &Res) const { StringRef Name; @@ -696,6 +718,20 @@ error_code COFFObjectFile::getRelocationType(DataRefImpl Rel, return object_error::success; } +const coff_section *COFFObjectFile::getCOFFSection(section_iterator &It) const { + return toSec(It->getRawDataRefImpl()); +} + +const coff_symbol *COFFObjectFile::getCOFFSymbol(symbol_iterator &It) const { + return toSymb(It->getRawDataRefImpl()); +} + +const coff_relocation *COFFObjectFile::getCOFFRelocation( + relocation_iterator &It) const { + return toRel(It->getRawDataRefImpl()); +} + + #define LLVM_COFF_SWITCH_RELOC_TYPE_NAME(enum) \ case COFF::enum: res = #enum; break; diff --git a/lib/Object/MachOObject.cpp b/lib/Object/MachOObject.cpp index b7e5cdc..00dea3f 100644 --- a/lib/Object/MachOObject.cpp +++ b/lib/Object/MachOObject.cpp @@ -357,6 +357,19 @@ void MachOObject::ReadSymbol64TableEntry(uint64_t SymbolTableOffset, ReadInMemoryStruct(*this, Buffer->getBuffer(), Offset, Res); } +template<> +void SwapStruct(macho::DataInCodeTableEntry &Value) { + SwapValue(Value.Offset); + SwapValue(Value.Length); + SwapValue(Value.Kind); +} +void MachOObject::ReadDataInCodeTableEntry(uint64_t TableOffset, + unsigned Index, + InMemoryStruct<macho::DataInCodeTableEntry> &Res) const { + uint64_t Offset = (TableOffset + + Index * sizeof(macho::DataInCodeTableEntry)); + ReadInMemoryStruct(*this, Buffer->getBuffer(), Offset, Res); +} void MachOObject::ReadULEB128s(uint64_t Index, SmallVectorImpl<uint64_t> &Out) const { diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp index 3bcda17..d229671 100644 --- a/lib/Object/MachOObjectFile.cpp +++ b/lib/Object/MachOObjectFile.cpp @@ -598,13 +598,15 @@ error_code MachOObjectFile::isSectionZeroInit(DataRefImpl DRI, if (MachOObj->is64Bit()) { InMemoryStruct<macho::Section64> Sect; getSection64(DRI, Sect); - Result = (Sect->Flags & MachO::SectionTypeZeroFill 
|| - Sect->Flags & MachO::SectionTypeZeroFillLarge); + unsigned SectionType = Sect->Flags & MachO::SectionFlagMaskSectionType; + Result = (SectionType == MachO::SectionTypeZeroFill || + SectionType == MachO::SectionTypeZeroFillLarge); } else { InMemoryStruct<macho::Section> Sect; getSection(DRI, Sect); - Result = (Sect->Flags & MachO::SectionTypeZeroFill || - Sect->Flags & MachO::SectionTypeZeroFillLarge); + unsigned SectionType = Sect->Flags & MachO::SectionFlagMaskSectionType; + Result = (SectionType == MachO::SectionTypeZeroFill || + SectionType == MachO::SectionTypeZeroFillLarge); } return object_error::success; @@ -786,7 +788,7 @@ error_code MachOObjectFile::getRelocationTypeName(DataRefImpl Rel, switch (Arch) { case Triple::x86: { - const char* Table[] = { + static const char *const Table[] = { "GENERIC_RELOC_VANILLA", "GENERIC_RELOC_PAIR", "GENERIC_RELOC_SECTDIFF", @@ -801,7 +803,7 @@ error_code MachOObjectFile::getRelocationTypeName(DataRefImpl Rel, break; } case Triple::x86_64: { - const char* Table[] = { + static const char *const Table[] = { "X86_64_RELOC_UNSIGNED", "X86_64_RELOC_SIGNED", "X86_64_RELOC_BRANCH", @@ -820,7 +822,7 @@ error_code MachOObjectFile::getRelocationTypeName(DataRefImpl Rel, break; } case Triple::arm: { - const char* Table[] = { + static const char *const Table[] = { "ARM_RELOC_VANILLA", "ARM_RELOC_PAIR", "ARM_RELOC_SECTDIFF", @@ -839,7 +841,7 @@ error_code MachOObjectFile::getRelocationTypeName(DataRefImpl Rel, break; } case Triple::ppc: { - const char* Table[] = { + static const char *const Table[] = { "PPC_RELOC_VANILLA", "PPC_RELOC_PAIR", "PPC_RELOC_BR14", diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp index 9b81fe7..38cfaed 100644 --- a/lib/Support/APInt.cpp +++ b/lib/Support/APInt.cpp @@ -1135,7 +1135,7 @@ APInt APInt::lshr(unsigned shiftAmt) const { // If all the bits were shifted out, the result is 0. This avoids issues // with shifting by the size of the integer type, which produces undefined // results. We define these "undefined results" to always be 0. - if (shiftAmt == BitWidth) + if (shiftAmt >= BitWidth) return APInt(BitWidth, 0); // If none of the bits are shifted out, the result is *this. This avoids @@ -1446,7 +1446,7 @@ APInt::mu APInt::magicu(unsigned LeadingZeros) const { APInt signedMin = APInt::getSignedMinValue(d.getBitWidth()); APInt signedMax = APInt::getSignedMaxValue(d.getBitWidth()); - nc = allOnes - (-d).urem(d); + nc = allOnes - (allOnes - d).urem(d); p = d.getBitWidth() - 1; // initialize p q1 = signedMin.udiv(nc); // initialize q1 = 2p/nc r1 = signedMin - q1*nc; // initialize r1 = rem(2p,nc) diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp index e6fdf16..593315d 100644 --- a/lib/Support/CommandLine.cpp +++ b/lib/Support/CommandLine.cpp @@ -219,10 +219,10 @@ static Option *LookupNearestOption(StringRef Arg, if (!Best || Distance < BestDistance) { Best = O; BestDistance = Distance; - if (RHS.empty() || !PermitValue) - NearestString = OptionNames[i]; - else - NearestString = std::string(OptionNames[i]) + "=" + RHS.str(); + if (RHS.empty() || !PermitValue) + NearestString = OptionNames[i]; + else + NearestString = std::string(OptionNames[i]) + "=" + RHS.str(); } } } diff --git a/lib/Support/ConstantRange.cpp b/lib/Support/ConstantRange.cpp index 5206cf1..720ef36 100644 --- a/lib/Support/ConstantRange.cpp +++ b/lib/Support/ConstantRange.cpp @@ -143,16 +143,17 @@ bool ConstantRange::isSignWrappedSet() const { /// getSetSize - Return the number of elements in this set. 
/// APInt ConstantRange::getSetSize() const { - if (isEmptySet()) - return APInt(getBitWidth(), 0); - if (getBitWidth() == 1) { - if (Lower != Upper) // One of T or F in the set... - return APInt(2, 1); - return APInt(2, 2); // Must be full set... + if (isEmptySet()) + return APInt(getBitWidth()+1, 0); + + if (isFullSet()) { + APInt Size(getBitWidth()+1, 0); + Size.setBit(getBitWidth()); + return Size; } - // Simply subtract the bounds... - return Upper - Lower; + // This is also correct for wrapped sets. + return (Upper - Lower).zext(getBitWidth()+1); } /// getUnsignedMax - Return the largest unsigned value contained in the @@ -248,6 +249,12 @@ ConstantRange ConstantRange::subtract(const APInt &Val) const { return ConstantRange(Lower - Val, Upper - Val); } +/// \brief Subtract the specified range from this range (aka relative complement +/// of the sets). +ConstantRange ConstantRange::difference(const ConstantRange &CR) const { + return intersectWith(CR.inverse()); +} + /// intersectWith - Return the range that results from the intersection of this /// range with another range. The resultant range is guaranteed to include all /// elements contained in both input ranges, and to have the smallest possible @@ -288,7 +295,7 @@ ConstantRange ConstantRange::intersectWith(const ConstantRange &CR) const { if (CR.Upper.ult(Upper)) return CR; - if (CR.Upper.ult(Lower)) + if (CR.Upper.ule(Lower)) return ConstantRange(CR.Lower, Upper); if (getSetSize().ult(CR.getSetSize())) @@ -316,7 +323,7 @@ ConstantRange ConstantRange::intersectWith(const ConstantRange &CR) const { return CR; } - if (CR.Upper.ult(Lower)) { + if (CR.Upper.ule(Lower)) { if (CR.Lower.ult(Lower)) return *this; @@ -420,9 +427,13 @@ ConstantRange ConstantRange::zeroExtend(uint32_t DstTySize) const { unsigned SrcTySize = getBitWidth(); assert(SrcTySize < DstTySize && "Not a value extension"); - if (isFullSet() || isWrappedSet()) + if (isFullSet() || isWrappedSet()) { // Change into [0, 1 << src bit width) - return ConstantRange(APInt(DstTySize,0), APInt(DstTySize,1).shl(SrcTySize)); + APInt LowerExt(DstTySize, 0); + if (!Upper) // special case: [X, 0) -- not really wrapping around + LowerExt = Lower.zext(DstTySize); + return ConstantRange(LowerExt, APInt(DstTySize, 1).shl(SrcTySize)); + } return ConstantRange(Lower.zext(DstTySize), Upper.zext(DstTySize)); } @@ -450,10 +461,53 @@ ConstantRange ConstantRange::signExtend(uint32_t DstTySize) const { /// truncated to the specified type. ConstantRange ConstantRange::truncate(uint32_t DstTySize) const { assert(getBitWidth() > DstTySize && "Not a value truncation"); - if (isFullSet() || getSetSize().getActiveBits() > DstTySize) + if (isEmptySet()) + return ConstantRange(DstTySize, /*isFullSet=*/false); + if (isFullSet()) return ConstantRange(DstTySize, /*isFullSet=*/true); - return ConstantRange(Lower.trunc(DstTySize), Upper.trunc(DstTySize)); + APInt MaxValue = APInt::getMaxValue(DstTySize).zext(getBitWidth()); + APInt MaxBitValue(getBitWidth(), 0); + MaxBitValue.setBit(DstTySize); + + APInt LowerDiv(Lower), UpperDiv(Upper); + ConstantRange Union(DstTySize, /*isFullSet=*/false); + + // Analyze wrapped sets in their two parts: [0, Upper) \/ [Lower, MaxValue] + // We use the non-wrapped set code to analyze the [Lower, MaxValue) part, and + // then we do the union with [MaxValue, Upper) + if (isWrappedSet()) { + // if Upper is greater than Max Value, it covers the whole truncated range. 
+ if (Upper.uge(MaxValue)) + return ConstantRange(DstTySize, /*isFullSet=*/true); + + Union = ConstantRange(APInt::getMaxValue(DstTySize),Upper.trunc(DstTySize)); + UpperDiv = APInt::getMaxValue(getBitWidth()); + + // Union covers the MaxValue case, so return if the remaining range is just + // MaxValue. + if (LowerDiv == UpperDiv) + return Union; + } + + // Chop off the most significant bits that are past the destination bitwidth. + if (LowerDiv.uge(MaxValue)) { + APInt Div(getBitWidth(), 0); + APInt::udivrem(LowerDiv, MaxBitValue, Div, LowerDiv); + UpperDiv = UpperDiv - MaxBitValue * Div; + } + + if (UpperDiv.ule(MaxValue)) + return ConstantRange(LowerDiv.trunc(DstTySize), + UpperDiv.trunc(DstTySize)).unionWith(Union); + + // The truncated value wrapps around. Check if we can do better than fullset. + APInt UpperModulo = UpperDiv - MaxBitValue; + if (UpperModulo.ult(LowerDiv)) + return ConstantRange(LowerDiv.trunc(DstTySize), + UpperModulo.trunc(DstTySize)).unionWith(Union); + + return ConstantRange(DstTySize, /*isFullSet=*/true); } /// zextOrTrunc - make this range have the bit width given by \p DstTySize. The @@ -529,8 +583,6 @@ ConstantRange::multiply(const ConstantRange &Other) const { if (isEmptySet() || Other.isEmptySet()) return ConstantRange(getBitWidth(), /*isFullSet=*/false); - if (isFullSet() || Other.isFullSet()) - return ConstantRange(getBitWidth(), /*isFullSet=*/true); APInt this_min = getUnsignedMin().zext(getBitWidth() * 2); APInt this_max = getUnsignedMax().zext(getBitWidth() * 2); diff --git a/lib/Support/CrashRecoveryContext.cpp b/lib/Support/CrashRecoveryContext.cpp index e2af0bc..e175056 100644 --- a/lib/Support/CrashRecoveryContext.cpp +++ b/lib/Support/CrashRecoveryContext.cpp @@ -223,7 +223,7 @@ void CrashRecoveryContext::Disable() { #include <signal.h> -static int Signals[] = { SIGABRT, SIGBUS, SIGFPE, SIGILL, SIGSEGV, SIGTRAP }; +static const int Signals[] = { SIGABRT, SIGBUS, SIGFPE, SIGILL, SIGSEGV, SIGTRAP }; static const unsigned NumSignals = sizeof(Signals) / sizeof(Signals[0]); static struct sigaction PrevActions[NumSignals]; diff --git a/lib/Support/Errno.cpp b/lib/Support/Errno.cpp index 18c6581..dd218f6 100644 --- a/lib/Support/Errno.cpp +++ b/lib/Support/Errno.cpp @@ -52,7 +52,7 @@ std::string StrError(int errnum) { # endif #elif HAVE_DECL_STRERROR_S // "Windows Secure API" if (errnum) - strerror_s(buffer, errnum); + strerror_s(buffer, MaxErrStrLen - 1, errnum); #elif defined(HAVE_STRERROR) // Copy the thread un-safe result of strerror into // the buffer as fast as possible to minimize impact diff --git a/lib/Support/GraphWriter.cpp b/lib/Support/GraphWriter.cpp index 32126ec..f6aaf83 100644 --- a/lib/Support/GraphWriter.cpp +++ b/lib/Support/GraphWriter.cpp @@ -99,7 +99,6 @@ void llvm::DisplayGraph(const sys::Path &Filename, bool wait, case GraphProgram::NEATO: args.push_back("-f"); args.push_back("neato");break; case GraphProgram::TWOPI: args.push_back("-f"); args.push_back("twopi");break; case GraphProgram::CIRCO: args.push_back("-f"); args.push_back("circo");break; - default: errs() << "Unknown graph layout name; using default.\n"; } args.push_back(0); diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp index 0f06964..9a2c39d 100644 --- a/lib/Support/Host.cpp +++ b/lib/Support/Host.cpp @@ -11,7 +11,13 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/Support/DataStream.h" +#include 
"llvm/Support/Debug.h" #include "llvm/Support/Host.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Config/config.h" #include <string.h> @@ -25,6 +31,12 @@ #ifdef _MSC_VER #include <intrin.h> #endif +#if defined(__APPLE__) && (defined(__ppc__) || defined(__powerpc__)) +#include <mach/mach.h> +#include <mach/mach_host.h> +#include <mach/host_info.h> +#include <mach/machine.h> +#endif //===----------------------------------------------------------------------===// // @@ -230,11 +242,18 @@ std::string sys::getHostCPUName() { case 45: return "corei7-avx"; - case 28: // Intel Atom processor. All processors are manufactured using - // the 45 nm process + // Ivy Bridge: + case 58: + return "core-avx-i"; + + case 28: // Most 45 nm Intel Atom processors + case 38: // 45 nm Atom Lincroft + case 39: // 32 nm Atom Medfield + case 53: // 32 nm Atom Midview + case 54: // 32 nm Atom Midview return "atom"; - default: return "i686"; + default: return (Em64T) ? "x86-64" : "i686"; } case 15: { switch (Model) { @@ -315,6 +334,179 @@ std::string sys::getHostCPUName() { } return "generic"; } +#elif defined(__APPLE__) && (defined(__ppc__) || defined(__powerpc__)) +std::string sys::getHostCPUName() { + host_basic_info_data_t hostInfo; + mach_msg_type_number_t infoCount; + + infoCount = HOST_BASIC_INFO_COUNT; + host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo, + &infoCount); + + if (hostInfo.cpu_type != CPU_TYPE_POWERPC) return "generic"; + + switch(hostInfo.cpu_subtype) { + case CPU_SUBTYPE_POWERPC_601: return "601"; + case CPU_SUBTYPE_POWERPC_602: return "602"; + case CPU_SUBTYPE_POWERPC_603: return "603"; + case CPU_SUBTYPE_POWERPC_603e: return "603e"; + case CPU_SUBTYPE_POWERPC_603ev: return "603ev"; + case CPU_SUBTYPE_POWERPC_604: return "604"; + case CPU_SUBTYPE_POWERPC_604e: return "604e"; + case CPU_SUBTYPE_POWERPC_620: return "620"; + case CPU_SUBTYPE_POWERPC_750: return "750"; + case CPU_SUBTYPE_POWERPC_7400: return "7400"; + case CPU_SUBTYPE_POWERPC_7450: return "7450"; + case CPU_SUBTYPE_POWERPC_970: return "970"; + default: ; + } + + return "generic"; +} +#elif defined(__linux__) && (defined(__ppc__) || defined(__powerpc__)) +std::string sys::getHostCPUName() { + // Access to the Processor Version Register (PVR) on PowerPC is privileged, + // and so we must use an operating-system interface to determine the current + // processor type. On Linux, this is exposed through the /proc/cpuinfo file. + const char *generic = "generic"; + + // Note: We cannot mmap /proc/cpuinfo here and then process the resulting + // memory buffer because the 'file' has 0 size (it can be read from only + // as a stream). + + std::string Err; + DataStreamer *DS = getDataFileStreamer("/proc/cpuinfo", &Err); + if (!DS) { + DEBUG(dbgs() << "Unable to open /proc/cpuinfo: " << Err << "\n"); + return generic; + } + + // The cpu line is second (after the 'processor: 0' line), so if this + // buffer is too small then something has changed (or is wrong). + char buffer[1024]; + size_t CPUInfoSize = DS->GetBytes((unsigned char*) buffer, sizeof(buffer)); + delete DS; + + const char *CPUInfoStart = buffer; + const char *CPUInfoEnd = buffer + CPUInfoSize; + + const char *CIP = CPUInfoStart; + + const char *CPUStart = 0; + size_t CPULen = 0; + + // We need to find the first line which starts with cpu, spaces, and a colon. + // After the colon, there may be some additional spaces and then the cpu type. 
+ while (CIP < CPUInfoEnd && CPUStart == 0) { + if (CIP < CPUInfoEnd && *CIP == '\n') + ++CIP; + + if (CIP < CPUInfoEnd && *CIP == 'c') { + ++CIP; + if (CIP < CPUInfoEnd && *CIP == 'p') { + ++CIP; + if (CIP < CPUInfoEnd && *CIP == 'u') { + ++CIP; + while (CIP < CPUInfoEnd && (*CIP == ' ' || *CIP == '\t')) + ++CIP; + + if (CIP < CPUInfoEnd && *CIP == ':') { + ++CIP; + while (CIP < CPUInfoEnd && (*CIP == ' ' || *CIP == '\t')) + ++CIP; + + if (CIP < CPUInfoEnd) { + CPUStart = CIP; + while (CIP < CPUInfoEnd && (*CIP != ' ' && *CIP != '\t' && + *CIP != ',' && *CIP != '\n')) + ++CIP; + CPULen = CIP - CPUStart; + } + } + } + } + } + + if (CPUStart == 0) + while (CIP < CPUInfoEnd && *CIP != '\n') + ++CIP; + } + + if (CPUStart == 0) + return generic; + + return StringSwitch<const char *>(StringRef(CPUStart, CPULen)) + .Case("604e", "604e") + .Case("604", "604") + .Case("7400", "7400") + .Case("7410", "7400") + .Case("7447", "7400") + .Case("7455", "7450") + .Case("G4", "g4") + .Case("POWER4", "970") + .Case("PPC970FX", "970") + .Case("PPC970MP", "970") + .Case("G5", "g5") + .Case("POWER5", "g5") + .Case("A2", "a2") + .Case("POWER6", "pwr6") + .Case("POWER7", "pwr7") + .Default(generic); +} +#elif defined(__linux__) && defined(__arm__) +std::string sys::getHostCPUName() { + // The cpuid register on arm is not accessible from user space. On Linux, + // it is exposed through the /proc/cpuinfo file. + // Note: We cannot mmap /proc/cpuinfo here and then process the resulting + // memory buffer because the 'file' has 0 size (it can be read from only + // as a stream). + + std::string Err; + DataStreamer *DS = getDataFileStreamer("/proc/cpuinfo", &Err); + if (!DS) { + DEBUG(dbgs() << "Unable to open /proc/cpuinfo: " << Err << "\n"); + return "generic"; + } + + // Read 1024 bytes from /proc/cpuinfo, which should contain the CPU part line + // in all cases. + char buffer[1024]; + size_t CPUInfoSize = DS->GetBytes((unsigned char*) buffer, sizeof(buffer)); + delete DS; + + StringRef Str(buffer, CPUInfoSize); + + SmallVector<StringRef, 32> Lines; + Str.split(Lines, "\n"); + + // Look for the CPU implementer line. + StringRef Implementer; + for (unsigned I = 0, E = Lines.size(); I != E; ++I) + if (Lines[I].startswith("CPU implementer")) + Implementer = Lines[I].substr(15).ltrim("\t :"); + + if (Implementer == "0x41") // ARM Ltd. + // Look for the CPU part line. + for (unsigned I = 0, E = Lines.size(); I != E; ++I) + if (Lines[I].startswith("CPU part")) + // The CPU part is a 3 digit hexadecimal number with a 0x prefix. The + // values correspond to the "Part number" in the CP15/c0 register. The + // contents are specified in the various processor manuals. 
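The ARM path below consults two /proc/cpuinfo lines; as a hedged sample (values illustrative, not taken from the patch) they look like:

    CPU implementer : 0x41
    CPU part        : 0xc09

An implementer of 0x41 (ARM Ltd.) with part 0xc09 would make the StringSwitch below return "cortex-a9".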
+ return StringSwitch<const char *>(Lines[I].substr(8).ltrim("\t :")) + .Case("0x926", "arm926ej-s") + .Case("0xb02", "mpcore") + .Case("0xb36", "arm1136j-s") + .Case("0xb56", "arm1156t2-s") + .Case("0xb76", "arm1176jz-s") + .Case("0xc08", "cortex-a8") + .Case("0xc09", "cortex-a9") + .Case("0xc20", "cortex-m0") + .Case("0xc23", "cortex-m3") + .Case("0xc24", "cortex-m4") + .Default("generic"); + + return "generic"; +} #else std::string sys::getHostCPUName() { return "generic"; diff --git a/lib/Support/Memory.cpp b/lib/Support/Memory.cpp index 6806c1d..22f7494 100644 --- a/lib/Support/Memory.cpp +++ b/lib/Support/Memory.cpp @@ -45,7 +45,7 @@ void llvm::sys::Memory::InvalidateInstructionCache(const void *Addr, # if (defined(__POWERPC__) || defined (__ppc__) || \ defined(_POWER) || defined(_ARCH_PPC)) || defined(__arm__) - sys_icache_invalidate(Addr, Len); + sys_icache_invalidate(const_cast<void *>(Addr), Len); # endif #else @@ -67,11 +67,12 @@ void llvm::sys::Memory::InvalidateInstructionCache(const void *Addr, asm volatile("isync"); # elif defined(__arm__) && defined(__GNUC__) // FIXME: Can we safely always call this for __GNUC__ everywhere? - char *Start = (char*) Addr; - char *End = Start + Len; - __clear_cache(Start, End); + const char *Start = static_cast<const char *>(Addr); + const char *End = Start + Len; + __clear_cache(const_cast<char *>(Start), const_cast<char *>(End)); # elif defined(__mips__) - cacheflush((intptr_t)Addr, Len, BCACHE); + const char *Start = static_cast<const char *>(Addr); + cacheflush(const_cast<char *>(Start), Len, BCACHE); # endif #endif // end apple diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp index 16e5c7a..992f03c 100644 --- a/lib/Support/MemoryBuffer.cpp +++ b/lib/Support/MemoryBuffer.cpp @@ -17,6 +17,7 @@ #include "llvm/Config/config.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/Errno.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" #include "llvm/Support/Process.h" #include "llvm/Support/Program.h" @@ -214,6 +215,14 @@ error_code MemoryBuffer::getFile(const char *Filename, OwningPtr<MemoryBuffer> &result, int64_t FileSize, bool RequiresNullTerminator) { + // First check that the "file" is not a directory + bool is_dir = false; + error_code err = sys::fs::is_directory(Filename, is_dir); + if (err) + return err; + if (is_dir) + return make_error_code(errc::is_a_directory); + int OpenFlags = O_RDONLY; #ifdef O_BINARY OpenFlags |= O_BINARY; // Open input file in binary mode on win32. @@ -304,16 +313,6 @@ error_code MemoryBuffer::getOpenFile(int FD, const char *Filename, RealMapOffset)) { result.reset(GetNamedBuffer<MemoryBufferMMapFile>( StringRef(Pages + Delta, MapSize), Filename, RequiresNullTerminator)); - - if (RequiresNullTerminator && result->getBufferEnd()[0] != '\0') { - // There could be a racing issue that resulted in the file being larger - // than the FileSize passed by the caller. We already have an assertion - // for this in MemoryBuffer::init() but have a runtime guarantee that - // the buffer will be null-terminated here, so do a copy that adds a - // null-terminator. 
- result.reset(MemoryBuffer::getMemBufferCopy(result->getBuffer(), - Filename)); - } return error_code::success(); } } diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp index dcddeda..db4a56b 100644 --- a/lib/Support/Path.cpp +++ b/lib/Support/Path.cpp @@ -60,8 +60,11 @@ sys::IdentifyFileType(const char *magic, unsigned length) { case '\177': if (magic[1] == 'E' && magic[2] == 'L' && magic[3] == 'F') { - if (length >= 18 && magic[17] == 0) - switch (magic[16]) { + bool Data2MSB = magic[5] == 2; + unsigned high = Data2MSB ? 16 : 17; + unsigned low = Data2MSB ? 17 : 16; + if (length >= 18 && magic[high] == 0) + switch (magic[low]) { default: break; case 1: return ELF_Relocatable_FileType; case 2: return ELF_Executable_FileType; diff --git a/lib/Support/PathV2.cpp b/lib/Support/PathV2.cpp index e2a69a6..46571c0 100644 --- a/lib/Support/PathV2.cpp +++ b/lib/Support/PathV2.cpp @@ -744,6 +744,8 @@ error_code has_magic(const Twine &path, const Twine &magic, bool &result) { /// @brief Identify the magic in magic. file_magic identify_magic(StringRef magic) { + if (magic.size() < 4) + return file_magic::unknown; switch ((unsigned char)magic[0]) { case 0xDE: // 0x0B17C0DE = BC wraper if (magic[1] == (char)0xC0 && magic[2] == (char)0x17 && diff --git a/lib/Support/SourceMgr.cpp b/lib/Support/SourceMgr.cpp index 15278c5..e4e01be 100644 --- a/lib/Support/SourceMgr.cpp +++ b/lib/Support/SourceMgr.cpp @@ -79,9 +79,10 @@ int SourceMgr::FindBufferContainingLoc(SMLoc Loc) const { return -1; } -/// FindLineNumber - Find the line number for the specified location in the -/// specified file. This is not a fast method. -unsigned SourceMgr::FindLineNumber(SMLoc Loc, int BufferID) const { +/// getLineAndColumn - Find the line and column number for the specified +/// location in the specified file. This is not a fast method. +std::pair<unsigned, unsigned> +SourceMgr::getLineAndColumn(SMLoc Loc, int BufferID) const { if (BufferID == -1) BufferID = FindBufferContainingLoc(Loc); assert(BufferID != -1 && "Invalid Location!"); @@ -91,7 +92,8 @@ unsigned SourceMgr::FindLineNumber(SMLoc Loc, int BufferID) const { // location. unsigned LineNo = 1; - const char *Ptr = Buff->getBufferStart(); + const char *BufStart = Buff->getBufferStart(); + const char *Ptr = BufStart; // If we have a line number cache, and if the query is to a later point in the // same file, start searching from the last query location. This optimizes @@ -108,7 +110,6 @@ unsigned SourceMgr::FindLineNumber(SMLoc Loc, int BufferID) const { for (; SMLoc::getFromPointer(Ptr) != Loc; ++Ptr) if (*Ptr == '\n') ++LineNo; - // Allocate the line number cache if it doesn't exist. if (LineNoCache == 0) LineNoCache = new LineNoCacheTy(); @@ -118,7 +119,10 @@ unsigned SourceMgr::FindLineNumber(SMLoc Loc, int BufferID) const { Cache.LastQueryBufferID = BufferID; Cache.LastQuery = Ptr; Cache.LineNoOfQuery = LineNo; - return LineNo; + + size_t NewlineOffs = StringRef(BufStart, Ptr-BufStart).find_last_of("\n\r"); + if (NewlineOffs == StringRef::npos) NewlineOffs = ~(size_t)0; + return std::make_pair(LineNo, Ptr-BufStart-NewlineOffs); } void SourceMgr::PrintIncludeStack(SMLoc IncludeLoc, raw_ostream &OS) const { @@ -145,50 +149,59 @@ SMDiagnostic SourceMgr::GetMessage(SMLoc Loc, SourceMgr::DiagKind Kind, ArrayRef<SMRange> Ranges) const { // First thing to do: find the current buffer containing the specified - // location. 
- int CurBuf = FindBufferContainingLoc(Loc); - assert(CurBuf != -1 && "Invalid or unspecified location!"); - - MemoryBuffer *CurMB = getBufferInfo(CurBuf).Buffer; - - // Scan backward to find the start of the line. - const char *LineStart = Loc.getPointer(); - while (LineStart != CurMB->getBufferStart() && - LineStart[-1] != '\n' && LineStart[-1] != '\r') - --LineStart; - - // Get the end of the line. - const char *LineEnd = Loc.getPointer(); - while (LineEnd != CurMB->getBufferEnd() && - LineEnd[0] != '\n' && LineEnd[0] != '\r') - ++LineEnd; - std::string LineStr(LineStart, LineEnd); - - // Convert any ranges to column ranges that only intersect the line of the - // location. + // location to pull out the source line. SmallVector<std::pair<unsigned, unsigned>, 4> ColRanges; - for (unsigned i = 0, e = Ranges.size(); i != e; ++i) { - SMRange R = Ranges[i]; - if (!R.isValid()) continue; - - // If the line doesn't contain any part of the range, then ignore it. - if (R.Start.getPointer() > LineEnd || R.End.getPointer() < LineStart) - continue; - - // Ignore pieces of the range that go onto other lines. - if (R.Start.getPointer() < LineStart) - R.Start = SMLoc::getFromPointer(LineStart); - if (R.End.getPointer() > LineEnd) - R.End = SMLoc::getFromPointer(LineEnd); + std::pair<unsigned, unsigned> LineAndCol; + const char *BufferID = "<unknown>"; + std::string LineStr; + + if (Loc.isValid()) { + int CurBuf = FindBufferContainingLoc(Loc); + assert(CurBuf != -1 && "Invalid or unspecified location!"); + + MemoryBuffer *CurMB = getBufferInfo(CurBuf).Buffer; + BufferID = CurMB->getBufferIdentifier(); - // Translate from SMLoc ranges to column ranges. - ColRanges.push_back(std::make_pair(R.Start.getPointer()-LineStart, - R.End.getPointer()-LineStart)); + // Scan backward to find the start of the line. + const char *LineStart = Loc.getPointer(); + const char *BufStart = CurMB->getBufferStart(); + while (LineStart != BufStart && LineStart[-1] != '\n' && + LineStart[-1] != '\r') + --LineStart; + + // Get the end of the line. + const char *LineEnd = Loc.getPointer(); + const char *BufEnd = CurMB->getBufferEnd(); + while (LineEnd != BufEnd && LineEnd[0] != '\n' && LineEnd[0] != '\r') + ++LineEnd; + LineStr = std::string(LineStart, LineEnd); + + // Convert any ranges to column ranges that only intersect the line of the + // location. + for (unsigned i = 0, e = Ranges.size(); i != e; ++i) { + SMRange R = Ranges[i]; + if (!R.isValid()) continue; + + // If the line doesn't contain any part of the range, then ignore it. + if (R.Start.getPointer() > LineEnd || R.End.getPointer() < LineStart) + continue; + + // Ignore pieces of the range that go onto other lines. + if (R.Start.getPointer() < LineStart) + R.Start = SMLoc::getFromPointer(LineStart); + if (R.End.getPointer() > LineEnd) + R.End = SMLoc::getFromPointer(LineEnd); + + // Translate from SMLoc ranges to column ranges. 
+ ColRanges.push_back(std::make_pair(R.Start.getPointer()-LineStart, + R.End.getPointer()-LineStart)); + } + + LineAndCol = getLineAndColumn(Loc, CurBuf); } - - return SMDiagnostic(*this, Loc, - CurMB->getBufferIdentifier(), FindLineNumber(Loc, CurBuf), - Loc.getPointer()-LineStart, Kind, Msg.str(), + + return SMDiagnostic(*this, Loc, BufferID, LineAndCol.first, + LineAndCol.second-1, Kind, Msg.str(), LineStr, ColRanges); } @@ -205,9 +218,11 @@ void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind, raw_ostream &OS = errs(); - int CurBuf = FindBufferContainingLoc(Loc); - assert(CurBuf != -1 && "Invalid or unspecified location!"); - PrintIncludeStack(getBufferInfo(CurBuf).IncludeLoc, OS); + if (Loc != SMLoc()) { + int CurBuf = FindBufferContainingLoc(Loc); + assert(CurBuf != -1 && "Invalid or unspecified location!"); + PrintIncludeStack(getBufferInfo(CurBuf).IncludeLoc, OS); + } Diagnostic.print(0, OS, ShowColors); } @@ -228,8 +243,8 @@ SMDiagnostic::SMDiagnostic(const SourceMgr &sm, SMLoc L, const std::string &FN, void SMDiagnostic::print(const char *ProgName, raw_ostream &S, bool ShowColors) const { - // Display colors only if OS goes to a tty. - ShowColors &= S.is_displayed(); + // Display colors only if OS supports colors. + ShowColors &= S.has_colors(); if (ShowColors) S.changeColor(raw_ostream::SAVEDCOLOR, true); @@ -343,5 +358,3 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &S, S << '\n'; } - - diff --git a/lib/Support/StreamableMemoryObject.cpp b/lib/Support/StreamableMemoryObject.cpp index c23f07b..fe3752a 100644 --- a/lib/Support/StreamableMemoryObject.cpp +++ b/lib/Support/StreamableMemoryObject.cpp @@ -20,7 +20,7 @@ class RawMemoryObject : public StreamableMemoryObject { public: RawMemoryObject(const unsigned char *Start, const unsigned char *End) : FirstChar(Start), LastChar(End) { - assert(LastChar > FirstChar && "Invalid start/end range"); + assert(LastChar >= FirstChar && "Invalid start/end range"); } virtual uint64_t getBase() const { return 0; } diff --git a/lib/Support/StringMap.cpp b/lib/Support/StringMap.cpp index c131fe0..c2fc261 100644 --- a/lib/Support/StringMap.cpp +++ b/lib/Support/StringMap.cpp @@ -189,7 +189,7 @@ void StringMapImpl::RehashTable() { // grow/rehash the table. if (NumItems*4 > NumBuckets*3) { NewSize = NumBuckets*2; - } else if (NumBuckets-(NumItems+NumTombstones) < NumBuckets/8) { + } else if (NumBuckets-(NumItems+NumTombstones) <= NumBuckets/8) { NewSize = NumBuckets; } else { return; diff --git a/lib/Support/StringRef.cpp b/lib/Support/StringRef.cpp index abe570f..8aab4b2 100644 --- a/lib/Support/StringRef.cpp +++ b/lib/Support/StringRef.cpp @@ -12,6 +12,7 @@ #include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/edit_distance.h" + #include <bitset> using namespace llvm; @@ -230,6 +231,31 @@ StringRef::size_type StringRef::find_last_of(StringRef Chars, return npos; } +/// find_last_not_of - Find the last character in the string that is not +/// \arg C, or npos if not found. +StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const { + for (size_type i = min(From, Length) - 1, e = -1; i != e; --i) + if (Data[i] != C) + return i; + return npos; +} + +/// find_last_not_of - Find the last character in the string that is not in +/// \arg Chars, or npos if not found. 
+/// +/// Note: O(size() + Chars.size()) +StringRef::size_type StringRef::find_last_not_of(StringRef Chars, + size_t From) const { + std::bitset<1 << CHAR_BIT> CharBits; + for (size_type i = 0, e = Chars.size(); i != e; ++i) + CharBits.set((unsigned char)Chars[i]); + + for (size_type i = min(From, Length) - 1, e = -1; i != e; --i) + if (!CharBits.test((unsigned char)Data[i])) + return i; + return npos; +} + void StringRef::split(SmallVectorImpl<StringRef> &A, StringRef Separators, int MaxSplit, bool KeepEmpty) const { @@ -272,14 +298,22 @@ static unsigned GetAutoSenseRadix(StringRef &Str) { if (Str.startswith("0x")) { Str = Str.substr(2); return 16; - } else if (Str.startswith("0b")) { + } + + if (Str.startswith("0b")) { Str = Str.substr(2); return 2; - } else if (Str.startswith("0")) { + } + + if (Str.startswith("0o")) { + Str = Str.substr(2); return 8; - } else { - return 10; } + + if (Str.startswith("0")) + return 8; + + return 10; } @@ -383,7 +417,7 @@ bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { unsigned BitWidth = Log2Radix * Str.size(); if (BitWidth < Result.getBitWidth()) BitWidth = Result.getBitWidth(); // don't shrink the result - else + else if (BitWidth > Result.getBitWidth()) Result = Result.zext(BitWidth); APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix diff --git a/lib/Support/TargetRegistry.cpp b/lib/Support/TargetRegistry.cpp index 53c8d84..9c81327 100644 --- a/lib/Support/TargetRegistry.cpp +++ b/lib/Support/TargetRegistry.cpp @@ -23,6 +23,47 @@ TargetRegistry::iterator TargetRegistry::begin() { return iterator(FirstTarget); } +const Target *TargetRegistry::lookupTarget(const std::string &ArchName, + Triple &TheTriple, + std::string &Error) { + // Allocate target machine. First, check whether the user has explicitly + // specified an architecture to compile for. If so we have to look it up by + // name, because it might be a backend that has no mapping to a target triple. + const Target *TheTarget = 0; + if (!ArchName.empty()) { + for (TargetRegistry::iterator it = TargetRegistry::begin(), + ie = TargetRegistry::end(); it != ie; ++it) { + if (ArchName == it->getName()) { + TheTarget = &*it; + break; + } + } + + if (!TheTarget) { + Error = "error: invalid target '" + ArchName + "'.\n"; + return 0; + } + + // Adjust the triple to match (if known), otherwise stick with the + // given triple. + Triple::ArchType Type = Triple::getArchTypeForLLVMName(ArchName); + if (Type != Triple::UnknownArch) + TheTriple.setArch(Type); + } else { + // Get the target specific parser. + std::string TempError; + TheTarget = TargetRegistry::lookupTarget(TheTriple.getTriple(), TempError); + if (TheTarget == 0) { + Error = ": error: unable to get target for '" + + TheTriple.getTriple() + + "', see --version and --triple.\n"; + return 0; + } + } + + return TheTarget; +} + const Target *TargetRegistry::lookupTarget(const std::string &TT, std::string &Error) { // Provide special warning when no targets are initialized. 
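A minimal usage sketch of the new lookupTarget overload added above, in the spirit of an llc-style driver; MArch and TripleStr are hypothetical variables and error handling is abbreviated:

    // Hedged sketch, not part of the patch.
    Triple TheTriple(TripleStr);
    std::string Error;
    const Target *TheTarget =
        TargetRegistry::lookupTarget(MArch, TheTriple, Error);
    if (TheTarget == 0) {
      errs() << Error;  // the overload formats its own message
      return 1;
    }
    // On success, TheTriple's architecture may have been adjusted to MArch.

Note that the two error strings in the overload are formatted differently ("error: ..." versus ": error: ..."), presumably so a program name can be prepended to the second form.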
diff --git a/lib/Support/ThreadLocal.cpp b/lib/Support/ThreadLocal.cpp index 08b12b6..0587aae 100644 --- a/lib/Support/ThreadLocal.cpp +++ b/lib/Support/ThreadLocal.cpp @@ -25,9 +25,18 @@ namespace llvm { using namespace sys; ThreadLocalImpl::ThreadLocalImpl() { } ThreadLocalImpl::~ThreadLocalImpl() { } -void ThreadLocalImpl::setInstance(const void* d) { data = const_cast<void*>(d);} -const void* ThreadLocalImpl::getInstance() { return data; } -void ThreadLocalImpl::removeInstance() { data = 0; } +void ThreadLocalImpl::setInstance(const void* d) { + typedef int SIZE_TOO_BIG[sizeof(d) <= sizeof(data) ? 1 : -1]; + void **pd = reinterpret_cast<void**>(&data); + *pd = const_cast<void*>(d); +} +const void* ThreadLocalImpl::getInstance() { + void **pd = reinterpret_cast<void**>(&data); + return *pd; +} +void ThreadLocalImpl::removeInstance() { + setInstance(0); +} } #else @@ -40,31 +49,30 @@ void ThreadLocalImpl::removeInstance() { data = 0; } namespace llvm { using namespace sys; -ThreadLocalImpl::ThreadLocalImpl() : data(0) { - pthread_key_t* key = new pthread_key_t; +ThreadLocalImpl::ThreadLocalImpl() : data() { + typedef int SIZE_TOO_BIG[sizeof(pthread_key_t) <= sizeof(data) ? 1 : -1]; + pthread_key_t* key = reinterpret_cast<pthread_key_t*>(&data); int errorcode = pthread_key_create(key, NULL); assert(errorcode == 0); (void) errorcode; - data = (void*)key; } ThreadLocalImpl::~ThreadLocalImpl() { - pthread_key_t* key = static_cast<pthread_key_t*>(data); + pthread_key_t* key = reinterpret_cast<pthread_key_t*>(&data); int errorcode = pthread_key_delete(*key); assert(errorcode == 0); (void) errorcode; - delete key; } void ThreadLocalImpl::setInstance(const void* d) { - pthread_key_t* key = static_cast<pthread_key_t*>(data); + pthread_key_t* key = reinterpret_cast<pthread_key_t*>(&data); int errorcode = pthread_setspecific(*key, d); assert(errorcode == 0); (void) errorcode; } const void* ThreadLocalImpl::getInstance() { - pthread_key_t* key = static_cast<pthread_key_t*>(data); + pthread_key_t* key = reinterpret_cast<pthread_key_t*>(&data); return pthread_getspecific(*key); } diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp index 44a1b38..7b26ea9 100644 --- a/lib/Support/Triple.cpp +++ b/lib/Support/Triple.cpp @@ -38,8 +38,8 @@ const char *Triple::getArchTypeName(ArchType Kind) { case x86_64: return "x86_64"; case xcore: return "xcore"; case mblaze: return "mblaze"; - case ptx32: return "ptx32"; - case ptx64: return "ptx64"; + case nvptx: return "nvptx"; + case nvptx64: return "nvptx64"; case le32: return "le32"; case amdil: return "amdil"; } @@ -62,7 +62,12 @@ const char *Triple::getArchTypePrefix(ArchType Kind) { case mblaze: return "mblaze"; - case hexagon: return "hexagon"; + case mips: + case mipsel: + case mips64: + case mips64el:return "mips"; + + case hexagon: return "hexagon"; case r600: return "r600"; @@ -74,8 +79,8 @@ const char *Triple::getArchTypePrefix(ArchType Kind) { case xcore: return "xcore"; - case ptx32: return "ptx"; - case ptx64: return "ptx"; + case nvptx: return "nvptx"; + case nvptx64: return "nvptx"; case le32: return "le32"; case amdil: return "amdil"; } @@ -160,8 +165,8 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) { .Case("x86", x86) .Case("x86-64", x86_64) .Case("xcore", xcore) - .Case("ptx32", ptx32) - .Case("ptx64", ptx64) + .Case("nvptx", nvptx) + .Case("nvptx64", nvptx64) .Case("le32", le32) .Case("amdil", amdil) .Default(UnknownArch); @@ -192,8 +197,8 @@ Triple::ArchType Triple::getArchTypeForDarwinArchName(StringRef Str) { 
.Cases("arm", "armv4t", "armv5", "armv6", Triple::arm) .Cases("armv7", "armv7f", "armv7k", "armv7s", "xscale", Triple::arm) .Case("r600", Triple::r600) - .Case("ptx32", Triple::ptx32) - .Case("ptx64", Triple::ptx64) + .Case("nvptx", Triple::nvptx) + .Case("nvptx64", Triple::nvptx64) .Case("amdil", Triple::amdil) .Default(Triple::UnknownArch); } @@ -215,8 +220,8 @@ const char *Triple::getArchNameForAssembler() { .Cases("armv6", "thumbv6", "armv6") .Cases("armv7", "thumbv7", "armv7") .Case("r600", "r600") - .Case("ptx32", "ptx32") - .Case("ptx64", "ptx64") + .Case("nvptx", "nvptx") + .Case("nvptx64", "nvptx64") .Case("le32", "le32") .Case("amdil", "amdil") .Default(NULL); @@ -249,8 +254,8 @@ static Triple::ArchType parseArch(StringRef ArchName) { .Case("sparcv9", Triple::sparcv9) .Case("tce", Triple::tce) .Case("xcore", Triple::xcore) - .Case("ptx32", Triple::ptx32) - .Case("ptx64", Triple::ptx64) + .Case("nvptx", Triple::nvptx) + .Case("nvptx64", Triple::nvptx64) .Case("le32", Triple::le32) .Case("amdil", Triple::amdil) .Default(Triple::UnknownArch); @@ -584,6 +589,29 @@ bool Triple::getMacOSXVersion(unsigned &Major, unsigned &Minor, return true; } +void Triple::getiOSVersion(unsigned &Major, unsigned &Minor, + unsigned &Micro) const { + switch (getOS()) { + default: llvm_unreachable("unexpected OS for Darwin triple"); + case Darwin: + case MacOSX: + // Ignore the version from the triple. This is only handled because the + // the clang driver combines OS X and IOS support into a common Darwin + // toolchain that wants to know the iOS version number even when targeting + // OS X. + Major = 3; + Minor = 0; + Micro = 0; + break; + case IOS: + getOSVersion(Major, Minor, Micro); + // Default to 3.0. + if (Major == 0) + Major = 3; + break; + } +} + void Triple::setTriple(const Twine &Str) { *this = Triple(Str); } @@ -652,8 +680,8 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) { case llvm::Triple::mblaze: case llvm::Triple::mips: case llvm::Triple::mipsel: + case llvm::Triple::nvptx: case llvm::Triple::ppc: - case llvm::Triple::ptx32: case llvm::Triple::r600: case llvm::Triple::sparc: case llvm::Triple::tce: @@ -664,8 +692,8 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) { case llvm::Triple::mips64: case llvm::Triple::mips64el: + case llvm::Triple::nvptx64: case llvm::Triple::ppc64: - case llvm::Triple::ptx64: case llvm::Triple::sparcv9: case llvm::Triple::x86_64: return 64; @@ -701,8 +729,8 @@ Triple Triple::get32BitArchVariant() const { case Triple::mblaze: case Triple::mips: case Triple::mipsel: + case Triple::nvptx: case Triple::ppc: - case Triple::ptx32: case Triple::r600: case Triple::sparc: case Triple::tce: @@ -714,8 +742,8 @@ Triple Triple::get32BitArchVariant() const { case Triple::mips64: T.setArch(Triple::mips); break; case Triple::mips64el: T.setArch(Triple::mipsel); break; + case Triple::nvptx64: T.setArch(Triple::nvptx); break; case Triple::ppc64: T.setArch(Triple::ppc); break; - case Triple::ptx64: T.setArch(Triple::ptx32); break; case Triple::sparcv9: T.setArch(Triple::sparc); break; case Triple::x86_64: T.setArch(Triple::x86); break; } @@ -742,8 +770,8 @@ Triple Triple::get64BitArchVariant() const { case Triple::mips64: case Triple::mips64el: + case Triple::nvptx64: case Triple::ppc64: - case Triple::ptx64: case Triple::sparcv9: case Triple::x86_64: // Already 64-bit. 
@@ -751,8 +779,8 @@ Triple Triple::get64BitArchVariant() const { case Triple::mips: T.setArch(Triple::mips64); break; case Triple::mipsel: T.setArch(Triple::mips64el); break; + case Triple::nvptx: T.setArch(Triple::nvptx64); break; case Triple::ppc: T.setArch(Triple::ppc64); break; - case Triple::ptx32: T.setArch(Triple::ptx64); break; case Triple::sparc: T.setArch(Triple::sparcv9); break; case Triple::x86: T.setArch(Triple::x86_64); break; } diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc index ddc1e0f..b41390a 100644 --- a/lib/Support/Unix/Path.inc +++ b/lib/Support/Unix/Path.inc @@ -884,7 +884,8 @@ const char *Path::MapInFilePages(int FD, size_t FileSize, off_t Offset) { } void Path::UnMapFilePages(const char *BasePtr, size_t FileSize) { - ::munmap((void*)BasePtr, FileSize); + const void *Addr = static_cast<const void *>(BasePtr); + ::munmap(const_cast<void *>(Addr), FileSize); } } // end llvm namespace diff --git a/lib/Support/Unix/PathV2.inc b/lib/Support/Unix/PathV2.inc index 7d259a3..93ccd1a 100644 --- a/lib/Support/Unix/PathV2.inc +++ b/lib/Support/Unix/PathV2.inc @@ -17,12 +17,16 @@ //===----------------------------------------------------------------------===// #include "Unix.h" +#include "llvm/Support/Process.h" #if HAVE_SYS_STAT_H #include <sys/stat.h> #endif #if HAVE_FCNTL_H #include <fcntl.h> #endif +#ifdef HAVE_SYS_MMAN_H +#include <sys/mman.h> +#endif #if HAVE_DIRENT_H # include <dirent.h> # define NAMLEN(dirent) strlen((dirent)->d_name) @@ -274,8 +278,7 @@ error_code exists(const Twine &path, bool &result) { SmallString<128> path_storage; StringRef p = path.toNullTerminatedStringRef(path_storage); - struct stat status; - if (::stat(p.begin(), &status) == -1) { + if (::access(p.begin(), F_OK) == -1) { if (errno != errc::no_such_file_or_directory) return error_code(errno, system_category()); result = false; @@ -287,8 +290,8 @@ error_code exists(const Twine &path, bool &result) { bool equivalent(file_status A, file_status B) { assert(status_known(A) && status_known(B)); - return A.st_dev == B.st_dev && - A.st_ino == B.st_ino; + return A.fs_st_dev == B.fs_st_dev && + A.fs_st_ino == B.fs_st_ino; } error_code equivalent(const Twine &A, const Twine &B, bool &result) { @@ -327,30 +330,62 @@ error_code status(const Twine &path, file_status &result) { return ec; } + perms prms = static_cast<perms>(status.st_mode & perms_mask); + if (S_ISDIR(status.st_mode)) - result = file_status(file_type::directory_file); + result = file_status(file_type::directory_file, prms); else if (S_ISREG(status.st_mode)) - result = file_status(file_type::regular_file); + result = file_status(file_type::regular_file, prms); else if (S_ISBLK(status.st_mode)) - result = file_status(file_type::block_file); + result = file_status(file_type::block_file, prms); else if (S_ISCHR(status.st_mode)) - result = file_status(file_type::character_file); + result = file_status(file_type::character_file, prms); else if (S_ISFIFO(status.st_mode)) - result = file_status(file_type::fifo_file); + result = file_status(file_type::fifo_file, prms); else if (S_ISSOCK(status.st_mode)) - result = file_status(file_type::socket_file); + result = file_status(file_type::socket_file, prms); else - result = file_status(file_type::type_unknown); + result = file_status(file_type::type_unknown, prms); + + result.fs_st_dev = status.st_dev; + result.fs_st_ino = status.st_ino; + + return error_code::success(); +} + +// Modifies permissions on a file. 
+error_code permissions(const Twine &path, perms prms) { + if ((prms & add_perms) && (prms & remove_perms)) + llvm_unreachable("add_perms and remove_perms are mutually exclusive"); - result.st_dev = status.st_dev; - result.st_ino = status.st_ino; + // Get current permissions + file_status info; + if (error_code ec = status(path, info)) { + return ec; + } + + // Set updated permissions. + SmallString<128> path_storage; + StringRef p = path.toNullTerminatedStringRef(path_storage); + perms permsToSet; + if (prms & add_perms) { + permsToSet = (info.permissions() | prms) & perms_mask; + } else if (prms & remove_perms) { + permsToSet = (info.permissions() & ~prms) & perms_mask; + } else { + permsToSet = prms & perms_mask; + } + if (::chmod(p.begin(), static_cast<mode_t>(permsToSet))) { + return error_code(errno, system_category()); + } return error_code::success(); } +// Since this is most often used for temporary files, mode defaults to 0600. error_code unique_file(const Twine &model, int &result_fd, - SmallVectorImpl<char> &result_path, - bool makeAbsolute) { + SmallVectorImpl<char> &result_path, + bool makeAbsolute, unsigned mode) { SmallString<128> Model; model.toVector(Model); // Null terminate. @@ -367,37 +402,20 @@ error_code unique_file(const Twine &model, int &result_fd, } } - // Replace '%' with random chars. From here on, DO NOT modify model. It may be - // needed if the randomly chosen path already exists. - SmallString<128> RandomPath; - RandomPath.reserve(Model.size() + 1); - ::srand(::time(NULL)); + // From here on, DO NOT modify model. It may be needed if the randomly chosen + // path already exists. + SmallString<128> RandomPath = Model; retry_random_path: - // This is opened here instead of above to make it easier to track when to - // close it. Collisions should be rare enough for the possible extra syscalls - // not to matter. - FILE *RandomSource = ::fopen("/dev/urandom", "r"); - RandomPath.set_size(0); - for (SmallVectorImpl<char>::const_iterator i = Model.begin(), - e = Model.end(); i != e; ++i) { - if (*i == '%') { - char val = 0; - if (RandomSource) - val = fgetc(RandomSource); - else - val = ::rand(); - RandomPath.push_back("0123456789abcdef"[val & 15]); - } else - RandomPath.push_back(*i); + // Replace '%' with random chars. + for (unsigned i = 0, e = Model.size(); i != e; ++i) { + if (Model[i] == '%') + RandomPath[i] = "0123456789abcdef"[sys::Process::GetRandomNumber() & 15]; } - if (RandomSource) - ::fclose(RandomSource); - // Try to open + create the file. rety_open_create: - int RandomFD = ::open(RandomPath.c_str(), O_RDWR | O_CREAT | O_EXCL, 0600); + int RandomFD = ::open(RandomPath.c_str(), O_RDWR | O_CREAT | O_EXCL, mode); if (RandomFD == -1) { // If the file existed, try again, otherwise, error. if (errno == errc::file_exists) @@ -513,6 +531,36 @@ error_code get_magic(const Twine &path, uint32_t len, return error_code::success(); } +error_code map_file_pages(const Twine &path, off_t file_offset, size_t size, + bool map_writable, void *&result) { + SmallString<128> path_storage; + StringRef name = path.toNullTerminatedStringRef(path_storage); + int oflags = map_writable ? O_RDWR : O_RDONLY; + int ofd = ::open(name.begin(), oflags); + if ( ofd == -1 ) + return error_code(errno, system_category()); + AutoFD fd(ofd); + int flags = map_writable ? MAP_SHARED : MAP_PRIVATE; + int prot = map_writable ? 
(PROT_READ|PROT_WRITE) : PROT_READ; +#ifdef MAP_FILE + flags |= MAP_FILE; +#endif + result = ::mmap(0, size, prot, flags, fd, file_offset); + if (result == MAP_FAILED) { + return error_code(errno, system_category()); + } + + return error_code::success(); +} + +error_code unmap_file_pages(void *base, size_t size) { + if ( ::munmap(base, size) == -1 ) + return error_code(errno, system_category()); + + return error_code::success(); +} + + } // end namespace fs } // end namespace sys } // end namespace llvm diff --git a/lib/Support/Unix/Process.inc b/lib/Support/Unix/Process.inc index f640462..174112e 100644 --- a/lib/Support/Unix/Process.inc +++ b/lib/Support/Unix/Process.inc @@ -12,6 +12,8 @@ //===----------------------------------------------------------------------===// #include "Unix.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/Support/TimeValue.h" #ifdef HAVE_SYS_TIME_H #include <sys/time.h> #endif @@ -247,16 +249,18 @@ static bool terminalHasColors() { return false; } +bool Process::FileDescriptorHasColors(int fd) { + // A file descriptor has colors if it is displayed and the terminal has + // colors. + return FileDescriptorIsDisplayed(fd) && terminalHasColors(); +} + bool Process::StandardOutHasColors() { - if (!StandardOutIsDisplayed()) - return false; - return terminalHasColors(); + return FileDescriptorHasColors(STDOUT_FILENO); } bool Process::StandardErrHasColors() { - if (!StandardErrIsDisplayed()) - return false; - return terminalHasColors(); + return FileDescriptorHasColors(STDERR_FILENO); } bool Process::ColorNeedsFlush() { @@ -297,3 +301,33 @@ const char *Process::OutputReverse() { const char *Process::ResetColor() { return "\033[0m"; } + +#if !defined(HAVE_ARC4RANDOM) +static unsigned GetRandomNumberSeed() { + // Attempt to get the initial seed from /dev/urandom, if possible. + if (FILE *RandomSource = ::fopen("/dev/urandom", "r")) { + unsigned seed; + int count = ::fread((void *)&seed, sizeof(seed), 1, RandomSource); + ::fclose(RandomSource); + + // Return the seed if the read was successful. + if (count == 1) + return seed; + } + + // Otherwise, swizzle the current time and the process ID to form a reasonable + // seed. + TimeValue Now = llvm::TimeValue::now(); + return hash_combine(Now.seconds(), Now.nanoseconds(), ::getpid()); +} +#endif + +unsigned llvm::sys::Process::GetRandomNumber() { +#if defined(HAVE_ARC4RANDOM) + return arc4random(); +#else + static int x = (::srand(GetRandomNumberSeed()), 0); + (void)x; + return ::rand(); +#endif +} diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc index f5454cf..1d667ab 100644 --- a/lib/Support/Unix/Signals.inc +++ b/lib/Support/Unix/Signals.inc @@ -15,6 +15,7 @@ #include "Unix.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/Mutex.h" +#include <string> #include <vector> #include <algorithm> #if HAVE_EXECINFO_H @@ -43,7 +44,7 @@ static SmartMutex<true> SignalsMutex; /// InterruptFunction - The function to call if ctrl-c is pressed. static void (*InterruptFunction)() = 0; -static std::vector<sys::Path> FilesToRemove; +static std::vector<std::string> FilesToRemove; static std::vector<std::pair<void(*)(void*), void*> > CallBacksToRun; // IntSigs - Signals that may interrupt the program at any time. @@ -117,10 +118,20 @@ static void UnregisterHandlers() { /// RemoveFilesToRemove - Process the FilesToRemove list. This function /// should be called with the SignalsMutex lock held. +/// NB: This must be an async signal safe function. It cannot allocate or free +/// memory, even in debug builds. 
static void RemoveFilesToRemove() { - while (!FilesToRemove.empty()) { - FilesToRemove.back().eraseFromDisk(true); - FilesToRemove.pop_back(); + // Note: avoid iterators in case of debug iterators that allocate or release + // memory. + for (unsigned i = 0, e = FilesToRemove.size(); i != e; ++i) { + // Note that we don't want to use any external code here, and we don't care + // about errors. We're going to try as hard as we can, as often as we need + // to, to make these files go away. If these aren't files, too bad. + // + // We do, however, rely on a std::string implementation for which repeated + // calls to 'c_str()' don't allocate memory. We pre-call 'c_str()' on all + // of these strings to try to ensure this is safe. + unlink(FilesToRemove[i].c_str()); } } @@ -178,7 +189,19 @@ void llvm::sys::SetInterruptFunction(void (*IF)()) { bool llvm::sys::RemoveFileOnSignal(const sys::Path &Filename, std::string* ErrMsg) { SignalsMutex.acquire(); - FilesToRemove.push_back(Filename); + std::string *OldPtr = FilesToRemove.empty() ? 0 : &FilesToRemove[0]; + FilesToRemove.push_back(Filename.str()); + + // We want to call 'c_str()' on every std::string in this vector so that if + // the underlying implementation requires a re-allocation, it happens here + // rather than inside of the signal handler. If we see the vector grow, we + // have to call it on every entry. If it remains in place, we only need to + // call it on the latest one. + if (OldPtr == &FilesToRemove[0]) + FilesToRemove.back().c_str(); + else + for (unsigned i = 0, e = FilesToRemove.size(); i != e; ++i) + FilesToRemove[i].c_str(); SignalsMutex.release(); @@ -189,10 +212,19 @@ bool llvm::sys::RemoveFileOnSignal(const sys::Path &Filename, // DontRemoveFileOnSignal - The public API void llvm::sys::DontRemoveFileOnSignal(const sys::Path &Filename) { SignalsMutex.acquire(); - std::vector<sys::Path>::reverse_iterator I = - std::find(FilesToRemove.rbegin(), FilesToRemove.rend(), Filename); - if (I != FilesToRemove.rend()) - FilesToRemove.erase(I.base()-1); + std::vector<std::string>::reverse_iterator RI = + std::find(FilesToRemove.rbegin(), FilesToRemove.rend(), Filename.str()); + std::vector<std::string>::iterator I = FilesToRemove.end(); + if (RI != FilesToRemove.rend()) + I = FilesToRemove.erase(RI.base()-1); + + // We need to call c_str() on every element which would have been moved by + // the erase. In a C++98 implementation, where c_str() may require a + // reallocation on its first call, the c_str() call made on insertion could + // have been invalidated when an element was copied down by the erase.
+ for (std::vector<std::string>::iterator E = FilesToRemove.end(); I != E; ++I) + I->c_str(); + SignalsMutex.release(); } diff --git a/lib/Support/Unix/Unix.h b/lib/Support/Unix/Unix.h index b7be311..361f297 100644 --- a/lib/Support/Unix/Unix.h +++ b/lib/Support/Unix/Unix.h @@ -44,16 +44,10 @@ #include <assert.h> #endif -#ifdef TIME_WITH_SYS_TIME +#ifdef HAVE_SYS_TIME_H # include <sys/time.h> -# include <time.h> -#else -# ifdef HAVE_SYS_TIME_H -# include <sys/time.h> -# else -# include <time.h> -# endif #endif +#include <time.h> #ifdef HAVE_SYS_WAIT_H # include <sys/wait.h> diff --git a/lib/Support/Windows/Path.inc b/lib/Support/Windows/Path.inc index d8dc522..2280b34 100644 --- a/lib/Support/Windows/Path.inc +++ b/lib/Support/Windows/Path.inc @@ -188,8 +188,20 @@ static Path *TempDirectory; Path Path::GetTemporaryDirectory(std::string* ErrMsg) { - if (TempDirectory) + if (TempDirectory) { +#if defined(_MSC_VER) + // Visual Studio gets confused and emits a diagnostic about calling exists, + // even though this is the implementation for PathV1. Temporarily + // disable the deprecated warning message + #pragma warning(push) + #pragma warning(disable:4996) +#endif + assert(TempDirectory->exists() && "Who has removed TempDirectory?"); +#if defined(_MSC_VER) + #pragma warning(pop) +#endif return *TempDirectory; + } char pathname[MAX_PATH]; if (!GetTempPath(MAX_PATH, pathname)) { @@ -201,7 +213,7 @@ Path::GetTemporaryDirectory(std::string* ErrMsg) { Path result; result.set(pathname); - // Append a subdirectory passed on our process id so multiple LLVMs don't + // Append a subdirectory based on our process id so multiple LLVMs don't // step on each other's toes. #ifdef __MINGW32__ // Mingw's Win32 header files are broken. diff --git a/lib/Support/Windows/PathV2.inc b/lib/Support/Windows/PathV2.inc index e9ce5d9..66eeab0 100644 --- a/lib/Support/Windows/PathV2.inc +++ b/lib/Support/Windows/PathV2.inc @@ -301,11 +301,21 @@ error_code rename(const Twine &from, const Twine &to) { if (error_code ec = UTF8ToUTF16(f, wide_from)) return ec; if (error_code ec = UTF8ToUTF16(t, wide_to)) return ec; - if (!::MoveFileExW(wide_from.begin(), wide_to.begin(), - MOVEFILE_COPY_ALLOWED | MOVEFILE_REPLACE_EXISTING)) - return windows_error(::GetLastError()); + error_code ec = error_code::success(); + for (int i = 0; i < 2000; i++) { + if (::MoveFileExW(wide_from.begin(), wide_to.begin(), + MOVEFILE_COPY_ALLOWED | MOVEFILE_REPLACE_EXISTING)) + return error_code::success(); + ec = windows_error(::GetLastError()); + if (ec != windows_error::access_denied) + break; + // Retry MoveFile() at ACCESS_DENIED. + // System scanners (eg. indexer) might open the source file when + // It is written and closed. + ::Sleep(1); + } - return error_code::success(); + return ec; } error_code resize_file(const Twine &path, uint64_t size) { @@ -487,9 +497,46 @@ handle_status_error: return error_code::success(); } + +// Modifies permissions on a file. +error_code permissions(const Twine &path, perms prms) { +#if 0 // verify code below before enabling: + // If the permissions bits are not trying to modify + // "write" permissions, there is nothing to do. 
+ if (!(prms & (owner_write|group_write|others_write))) + return error_code::success(); + + SmallString<128> path_storage; + SmallVector<wchar_t, 128> path_utf16; + + if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage), + path_utf16)) + return ec; + + DWORD attributes = ::GetFileAttributesW(path_utf16.begin()); + + if (prms & add_perms) { + attributes &= ~FILE_ATTRIBUTE_READONLY; + } + else if (prms & remove_perms) { + attributes |= FILE_ATTRIBUTE_READONLY; + } + else { + assert(0 && "neither add_perms or remove_perms is set"); + } + + if ( ! ::SetFileAttributesW(path_utf16.begin(), attributes)) + return windows_error(::GetLastError()); +#endif + return error_code::success(); +} + + +// FIXME: mode should be used here and default to user r/w only, +// it currently comes in as a UNIX mode. error_code unique_file(const Twine &model, int &result_fd, - SmallVectorImpl<char> &result_path, - bool makeAbsolute) { + SmallVectorImpl<char> &result_path, + bool makeAbsolute, unsigned mode) { // Use result_path as temp storage. result_path.set_size(0); StringRef m = model.toStringRef(result_path); @@ -743,6 +790,19 @@ error_code detail::directory_iterator_increment(detail::DirIterState &it) { return error_code::success(); } +error_code map_file_pages(const Twine &path, off_t file_offset, size_t size, + bool map_writable, void *&result) { + assert(0 && "NOT IMPLEMENTED"); + return windows_error::invalid_function; +} + +error_code unmap_file_pages(void *base, size_t size) { + assert(0 && "NOT IMPLEMENTED"); + return windows_error::invalid_function; +} + + + } // end namespace fs } // end namespace sys } // end namespace llvm diff --git a/lib/Support/Windows/Process.inc b/lib/Support/Windows/Process.inc index 9a388b4..e29eb6d 100644 --- a/lib/Support/Windows/Process.inc +++ b/lib/Support/Windows/Process.inc @@ -133,7 +133,7 @@ bool Process::StandardErrIsDisplayed() { } bool Process::FileDescriptorIsDisplayed(int fd) { - DWORD Mode; // Unused + DWORD Mode; // Unused return (GetConsoleMode((HANDLE)_get_osfhandle(fd), &Mode) != 0); } @@ -153,13 +153,17 @@ unsigned Process::StandardErrColumns() { return Columns; } -// It always has colors. -bool Process::StandardErrHasColors() { - return StandardErrIsDisplayed(); +// The terminal always has colors. +bool Process::FileDescriptorHasColors(int fd) { + return FileDescriptorIsDisplayed(fd); } bool Process::StandardOutHasColors() { - return StandardOutIsDisplayed(); + return FileDescriptorHasColors(1); +} + +bool Process::StandardErrHasColors() { + return FileDescriptorHasColors(2); } namespace { diff --git a/lib/Support/Windows/RWMutex.inc b/lib/Support/Windows/RWMutex.inc index 26b9bba..9593923 100644 --- a/lib/Support/Windows/RWMutex.inc +++ b/lib/Support/Windows/RWMutex.inc @@ -67,9 +67,9 @@ static bool loadSRW() { "ReleaseSRWLockShared"); ::FreeLibrary(hLib); - if (fpInitializeSRWLock != NULL) { - sHasSRW = true; - } + if (fpInitializeSRWLock != NULL) { + sHasSRW = true; + } } } return sHasSRW; diff --git a/lib/Support/Windows/ThreadLocal.inc b/lib/Support/Windows/ThreadLocal.inc index 512462d..057deb3 100644 --- a/lib/Support/Windows/ThreadLocal.inc +++ b/lib/Support/Windows/ThreadLocal.inc @@ -22,26 +22,25 @@ namespace llvm { using namespace sys; -ThreadLocalImpl::ThreadLocalImpl() { - DWORD* tls = new DWORD; +ThreadLocalImpl::ThreadLocalImpl() : data() { + typedef int SIZE_TOO_BIG[sizeof(DWORD) <= sizeof(data) ? 
1 : -1]; + DWORD* tls = reinterpret_cast<DWORD*>(&data); *tls = TlsAlloc(); assert(*tls != TLS_OUT_OF_INDEXES); - data = tls; } ThreadLocalImpl::~ThreadLocalImpl() { - DWORD* tls = static_cast<DWORD*>(data); + DWORD* tls = reinterpret_cast<DWORD*>(&data); TlsFree(*tls); - delete tls; } const void* ThreadLocalImpl::getInstance() { - DWORD* tls = static_cast<DWORD*>(data); + DWORD* tls = reinterpret_cast<DWORD*>(&data); return TlsGetValue(*tls); } void ThreadLocalImpl::setInstance(const void* d){ - DWORD* tls = static_cast<DWORD*>(data); + DWORD* tls = reinterpret_cast<DWORD*>(&data); int errorcode = TlsSetValue(*tls, const_cast<void*>(d)); assert(errorcode != 0); (void)errorcode; diff --git a/lib/Support/YAMLParser.cpp b/lib/Support/YAMLParser.cpp index 330519f..7c353c8 100644 --- a/lib/Support/YAMLParser.cpp +++ b/lib/Support/YAMLParser.cpp @@ -27,12 +27,12 @@ using namespace llvm; using namespace yaml; enum UnicodeEncodingForm { - UEF_UTF32_LE, //< UTF-32 Little Endian - UEF_UTF32_BE, //< UTF-32 Big Endian - UEF_UTF16_LE, //< UTF-16 Little Endian - UEF_UTF16_BE, //< UTF-16 Big Endian - UEF_UTF8, //< UTF-8 or ascii. - UEF_Unknown //< Not a valid Unicode encoding. + UEF_UTF32_LE, ///< UTF-32 Little Endian + UEF_UTF32_BE, ///< UTF-32 Big Endian + UEF_UTF16_LE, ///< UTF-16 Little Endian + UEF_UTF16_BE, ///< UTF-16 Big Endian + UEF_UTF8, ///< UTF-8 or ascii. + UEF_Unknown ///< Not a valid Unicode encoding. }; /// EncodingInfo - Holds the encoding type and length of the byte order mark if @@ -489,9 +489,6 @@ private: /// @brief Can the next token be the start of a simple key? bool IsSimpleKeyAllowed; - /// @brief Is the next token required to start a simple key? - bool IsSimpleKeyRequired; - /// @brief True if an error has occurred. bool Failed; @@ -658,7 +655,7 @@ std::string yaml::escape(StringRef Input) { EscapedInput += "\\r"; else if (*i == 0x1B) EscapedInput += "\\e"; - else if (*i >= 0 && *i < 0x20) { // Control characters not handled above. + else if ((unsigned char)*i < 0x20) { // Control characters not handled above. std::string HexStr = utohexstr(*i); EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. @@ -704,7 +701,6 @@ Scanner::Scanner(StringRef Input, SourceMgr &sm) , FlowLevel(0) , IsStartOfStream(true) , IsSimpleKeyAllowed(true) - , IsSimpleKeyRequired(false) , Failed(false) { InputBuffer = MemoryBuffer::getMemBuffer(Input, "YAML"); SM.AddNewSourceBuffer(InputBuffer, SMLoc()); @@ -755,6 +751,8 @@ Token Scanner::getNext() { } StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { + if (Position == End) + return Position; // Check 7 bit c-printable - b-char. if ( *Position == 0x09 || (*Position >= 0x20 && *Position <= 0x7E)) @@ -778,6 +776,8 @@ StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { } StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { + if (Position == End) + return Position; if (*Position == 0x0D) { if (Position + 1 != End && *(Position + 1) == 0x0A) return Position + 2; @@ -1211,7 +1211,9 @@ bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { ++Current; // Repeat until the previous character was not a '\' or was an escaped // backslash. 
- } while (*(Current - 1) == '\\' && wasEscaped(Start + 1, Current)); + } while ( Current != End + && *(Current - 1) == '\\' + && wasEscaped(Start + 1, Current)); } else { skip(1); while (true) { @@ -1624,9 +1626,7 @@ StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { return UnquotedValue; } // Plain or block. - size_t trimtrail = Value.rfind(' '); - return Value.drop_back( - trimtrail == StringRef::npos ? 0 : Value.size() - trimtrail); + return Value.rtrim(" "); } StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue @@ -1733,7 +1733,9 @@ StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue // TODO: Report error. break; unsigned int UnicodeScalarValue; - UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue); + if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) + // TODO: Report error. + UnicodeScalarValue = 0xFFFD; encodeUTF8(UnicodeScalarValue, Storage); UnquotedValue = UnquotedValue.substr(2); break; @@ -1743,7 +1745,9 @@ StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue // TODO: Report error. break; unsigned int UnicodeScalarValue; - UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue); + if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) + // TODO: Report error. + UnicodeScalarValue = 0xFFFD; encodeUTF8(UnicodeScalarValue, Storage); UnquotedValue = UnquotedValue.substr(4); break; @@ -1753,7 +1757,9 @@ StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue // TODO: Report error. break; unsigned int UnicodeScalarValue; - UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue); + if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) + // TODO: Report error. + UnicodeScalarValue = 0xFFFD; encodeUTF8(UnicodeScalarValue, Storage); UnquotedValue = UnquotedValue.substr(8); break; @@ -2113,5 +2119,3 @@ bool Document::expectToken(int TK) { } return true; } - -OwningPtr<Document> document_iterator::NullDoc; diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp index 86cdca1..fa69c2d 100644 --- a/lib/Support/raw_ostream.cpp +++ b/lib/Support/raw_ostream.cpp @@ -528,7 +528,8 @@ void raw_fd_ostream::write_impl(const char *Ptr, size_t Size) { } else { // Use ::writev() where available. 
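
The three escape-decoding hunks above all adopt the same pattern: StringRef::getAsInteger returns true on failure, and a malformed \x, \u, or \U escape now decays to U+FFFD, the Unicode replacement character, instead of encoding whatever happened to be in the output variable. A small sketch of the pattern (the helper name is ours):

#include "llvm/ADT/StringRef.h"

// Decode a fixed-width hex escape payload, substituting the replacement
// character when the digits do not parse; getAsInteger reports failure by
// returning true and leaves no usable value behind.
static unsigned hexOrReplacement(llvm::StringRef Digits) {
  unsigned Value = 0;
  if (Digits.getAsInteger(16, Value))
    return 0xFFFD;  // malformed escape: U+FFFD REPLACEMENT CHARACTER
  return Value;
}
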
#if defined(HAVE_WRITEV) - struct iovec IOV = { (void*) Ptr, Size }; + const void *Addr = static_cast<const void *>(Ptr); + struct iovec IOV = {const_cast<void *>(Addr), Size }; ret = ::writev(FD, &IOV, 1); #else ret = ::write(FD, Ptr, Size); @@ -650,6 +651,10 @@ bool raw_fd_ostream::is_displayed() const { return sys::Process::FileDescriptorIsDisplayed(FD); } +bool raw_fd_ostream::has_colors() const { + return sys::Process::FileDescriptorHasColors(FD); +} + //===----------------------------------------------------------------------===// // outs(), errs(), nulls() //===----------------------------------------------------------------------===// diff --git a/lib/TableGen/CMakeLists.txt b/lib/TableGen/CMakeLists.txt index 82f72b0..ba7bf14 100644 --- a/lib/TableGen/CMakeLists.txt +++ b/lib/TableGen/CMakeLists.txt @@ -6,6 +6,7 @@ add_llvm_library(LLVMTableGen Error.cpp Main.cpp Record.cpp + StringMatcher.cpp TableGenAction.cpp TableGenBackend.cpp TGLexer.cpp diff --git a/lib/TableGen/Main.cpp b/lib/TableGen/Main.cpp index 01bc55e..7aeef56 100644 --- a/lib/TableGen/Main.cpp +++ b/lib/TableGen/Main.cpp @@ -34,7 +34,9 @@ namespace { cl::init("-")); cl::opt<std::string> - DependFilename("d", cl::desc("Dependency filename"), cl::value_desc("filename"), + DependFilename("d", + cl::desc("Dependency filename"), + cl::value_desc("filename"), cl::init("")); cl::opt<std::string> @@ -53,7 +55,8 @@ int TableGenMain(char *argv0, TableGenAction &Action) { try { // Parse the input file. OwningPtr<MemoryBuffer> File; - if (error_code ec = MemoryBuffer::getFileOrSTDIN(InputFilename.c_str(), File)) { + if (error_code ec = + MemoryBuffer::getFileOrSTDIN(InputFilename.c_str(), File)) { errs() << "Could not open input file '" << InputFilename << "': " << ec.message() <<"\n"; return 1; @@ -93,7 +96,7 @@ int TableGenMain(char *argv0, TableGenAction &Action) { DepOut.os() << OutputFilename << ":"; const std::vector<std::string> &Dependencies = Parser.getDependencies(); for (std::vector<std::string>::const_iterator I = Dependencies.begin(), - E = Dependencies.end(); + E = Dependencies.end(); I != E; ++I) { DepOut.os() << " " << (*I); } diff --git a/lib/TableGen/Record.cpp b/lib/TableGen/Record.cpp index 93eed24..99fdc1f 100644 --- a/lib/TableGen/Record.cpp +++ b/lib/TableGen/Record.cpp @@ -1699,7 +1699,7 @@ void Record::checkName() { assert(TypedName && "Record name is not typed!"); RecTy *Type = TypedName->getType(); if (dynamic_cast<StringRecTy *>(Type) == 0) { - throw "Record name is not a string!"; + throw TGError(getLoc(), "Record name is not a string!"); } } diff --git a/lib/TableGen/StringMatcher.cpp b/lib/TableGen/StringMatcher.cpp new file mode 100644 index 0000000..1668170 --- /dev/null +++ b/lib/TableGen/StringMatcher.cpp @@ -0,0 +1,149 @@ +//===- StringMatcher.cpp - Generate a matcher for input strings -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the StringMatcher class. +// +//===----------------------------------------------------------------------===// + +#include "llvm/TableGen/StringMatcher.h" +#include "llvm/Support/raw_ostream.h" +#include <map> +using namespace llvm; + +/// FindFirstNonCommonLetter - Find the first character in the keys of the +/// string pairs that is not shared across the whole set of strings. 
All +/// strings are assumed to have the same length. +static unsigned +FindFirstNonCommonLetter(const std::vector<const + StringMatcher::StringPair*> &Matches) { + assert(!Matches.empty()); + for (unsigned i = 0, e = Matches[0]->first.size(); i != e; ++i) { + // Check to see if letter i is the same across the set. + char Letter = Matches[0]->first[i]; + + for (unsigned str = 0, e = Matches.size(); str != e; ++str) + if (Matches[str]->first[i] != Letter) + return i; + } + + return Matches[0]->first.size(); +} + +/// EmitStringMatcherForChar - Given a set of strings that are known to be the +/// same length and whose characters leading up to CharNo are the same, emit +/// code to verify that CharNo and later are the same. +/// +/// \return - True if control can leave the emitted code fragment. +bool StringMatcher:: +EmitStringMatcherForChar(const std::vector<const StringPair*> &Matches, + unsigned CharNo, unsigned IndentCount) const { + assert(!Matches.empty() && "Must have at least one string to match!"); + std::string Indent(IndentCount*2+4, ' '); + + // If we have verified that the entire string matches, we're done: output the + // matching code. + if (CharNo == Matches[0]->first.size()) { + assert(Matches.size() == 1 && "Had duplicate keys to match on"); + + // If the to-execute code has \n's in it, indent each subsequent line. + StringRef Code = Matches[0]->second; + + std::pair<StringRef, StringRef> Split = Code.split('\n'); + OS << Indent << Split.first << "\t // \"" << Matches[0]->first << "\"\n"; + + Code = Split.second; + while (!Code.empty()) { + Split = Code.split('\n'); + OS << Indent << Split.first << "\n"; + Code = Split.second; + } + return false; + } + + // Bucket the matches by the character we are comparing. + std::map<char, std::vector<const StringPair*> > MatchesByLetter; + + for (unsigned i = 0, e = Matches.size(); i != e; ++i) + MatchesByLetter[Matches[i]->first[CharNo]].push_back(Matches[i]); + + + // If we have exactly one bucket to match, see how many characters are common + // across the whole set and match all of them at once. + if (MatchesByLetter.size() == 1) { + unsigned FirstNonCommonLetter = FindFirstNonCommonLetter(Matches); + unsigned NumChars = FirstNonCommonLetter-CharNo; + + // Emit code to break out if the prefix doesn't match. + if (NumChars == 1) { + // Do the comparison with if (Str[1] != 'f') + // FIXME: Need to escape general characters. + OS << Indent << "if (" << StrVariableName << "[" << CharNo << "] != '" + << Matches[0]->first[CharNo] << "')\n"; + OS << Indent << " break;\n"; + } else { + // Do the comparison with if memcmp(Str.data()+1, "foo", 3). + // FIXME: Need to escape general strings. + OS << Indent << "if (memcmp(" << StrVariableName << ".data()+" << CharNo + << ", \"" << Matches[0]->first.substr(CharNo, NumChars) << "\", " + << NumChars << "))\n"; + OS << Indent << " break;\n"; + } + + return EmitStringMatcherForChar(Matches, FirstNonCommonLetter, IndentCount); + } + + // Otherwise, we have multiple possible things, emit a switch on the + // character. + OS << Indent << "switch (" << StrVariableName << "[" << CharNo << "]) {\n"; + OS << Indent << "default: break;\n"; + + for (std::map<char, std::vector<const StringPair*> >::iterator LI = + MatchesByLetter.begin(), E = MatchesByLetter.end(); LI != E; ++LI) { + // TODO: escape hard stuff (like \n) if we ever care about it. 
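
It helps to see what this recursion actually emits. For two hypothetical keys, "add" mapping to "return 1;" and "and" mapping to "return 2;", with StrVariableName set to "Str", the generated fragment would look roughly like this (hand-derived from the logic above, not captured from a real run; indentation approximate):

switch (Str.size()) {
default: break;
case 3:  // 2 strings to match.
  if (Str[0] != 'a')
    break;
  switch (Str[1]) {
  default: break;
  case 'd':  // 1 string to match.
    if (Str[2] != 'd')
      break;
    return 1;  // "add"
  case 'n':  // 1 string to match.
    if (Str[2] != 'd')
      break;
    return 2;  // "and"
  }
  break;
}

The fragment is meant to be pasted into a caller's function body, which is why every failure path breaks out of the enclosing switch rather than returning: control falling off the end is the "no match" case.
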
+    OS << Indent << "case '" << LI->first << "':\t // "
+       << LI->second.size() << " string";
+    if (LI->second.size() != 1) OS << 's';
+    OS << " to match.\n";
+    if (EmitStringMatcherForChar(LI->second, CharNo+1, IndentCount+1))
+      OS << Indent << "  break;\n";
+  }
+
+  OS << Indent << "}\n";
+  return true;
+}
+
+
+/// Emit - Top level entry point.
+///
+void StringMatcher::Emit(unsigned Indent) const {
+  // If nothing to match, just fall through.
+  if (Matches.empty()) return;
+
+  // First level categorization: group strings by length.
+  std::map<unsigned, std::vector<const StringPair*> > MatchesByLength;
+
+  for (unsigned i = 0, e = Matches.size(); i != e; ++i)
+    MatchesByLength[Matches[i].first.size()].push_back(&Matches[i]);
+
+  // Output a switch statement on length and categorize the elements within each
+  // bin.
+  OS.indent(Indent*2+2) << "switch (" << StrVariableName << ".size()) {\n";
+  OS.indent(Indent*2+2) << "default: break;\n";
+
+  for (std::map<unsigned, std::vector<const StringPair*> >::iterator LI =
+       MatchesByLength.begin(), E = MatchesByLength.end(); LI != E; ++LI) {
+    OS.indent(Indent*2+2) << "case " << LI->first << ":\t // "
+                          << LI->second.size()
+                          << " string" << (LI->second.size() == 1 ? "" : "s") << " to match.\n";
+    if (EmitStringMatcherForChar(LI->second, 0, Indent))
+      OS.indent(Indent*2+4) << "break;\n";
+  }
+
+  OS.indent(Indent*2+2) << "}\n";
+}
diff --git a/lib/TableGen/TGParser.cpp b/lib/TableGen/TGParser.cpp
index 04c4fc1..9424677 100644
--- a/lib/TableGen/TGParser.cpp
+++ b/lib/TableGen/TGParser.cpp
@@ -292,107 +292,78 @@ bool TGParser::AddSubMultiClass(MultiClass *CurMC,
 /// ProcessForeachDefs - Given a record, apply all of the variable
 /// values in all surrounding foreach loops, creating new records for
 /// each combination of values.
-bool TGParser::ProcessForeachDefs(Record *CurRec, MultiClass *CurMultiClass,
-                                  SMLoc Loc) {
+bool TGParser::ProcessForeachDefs(Record *CurRec, SMLoc Loc) {
+  if (Loops.empty())
+    return false;
+
   // We want to instantiate a new copy of CurRec for each combination
   // of nested loop iterator values.  We don't want to instantiate
   // any copies until we have values for each loop iterator.
   IterSet IterVals;
-  for (LoopVector::iterator Loop = Loops.begin(), LoopEnd = Loops.end();
-       Loop != LoopEnd;
-       ++Loop) {
-    // Process this loop.
-    if (ProcessForeachDefs(CurRec, CurMultiClass, Loc,
-                           IterVals, *Loop, Loop+1)) {
-      Error(Loc,
-            "Could not process loops for def " + CurRec->getNameInitAsString());
-      return true;
-    }
-  }
-
-  return false;
+  return ProcessForeachDefs(CurRec, Loc, IterVals);
 }
 
 /// ProcessForeachDefs - Given a record, a loop and a loop iterator,
 /// apply each of the variable values in this loop and then process
 /// subloops.
-bool TGParser::ProcessForeachDefs(Record *CurRec, MultiClass *CurMultiClass,
-                                  SMLoc Loc, IterSet &IterVals,
-                                  ForeachLoop &CurLoop,
-                                  LoopVector::iterator NextLoop) {
-  Init *IterVar = CurLoop.IterVar;
-  ListInit *List = dynamic_cast<ListInit *>(CurLoop.ListValue);
-
-  if (List == 0) {
-    Error(Loc, "Loop list is not a list");
-    return true;
-  }
-
-  // Process each value.
-  for (int64_t i = 0; i < List->getSize(); ++i) {
-    Init *ItemVal = List->resolveListElementReference(*CurRec, 0, i);
-    IterVals.push_back(IterRecord(IterVar, ItemVal));
-
-    if (IterVals.size() == Loops.size()) {
-      // Ok, we have all of the iterator values for this point in the
-      // iteration space.  Instantiate a new record to reflect this
-      // combination of values.
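
The new ProcessForeachDefs pair (the hunk continues below) replaces the old loop-plus-helper structure with a single recursion: each level binds one loop's iterator value, and the base case, reached once every iterator is bound, instantiates the record. The shape, reduced to a standalone sketch over plain vectors rather than TableGen Inits:

#include <functional>
#include <iostream>
#include <vector>

// Visit every combination of values, one element drawn from each loop,
// mirroring the IterVals.size() == Loops.size() base case in the patch.
static void forEachCombination(
    const std::vector<std::vector<int> > &Loops, std::vector<int> &IterVals,
    const std::function<void(const std::vector<int> &)> &Instantiate) {
  if (IterVals.size() == Loops.size()) {  // bottom of the recursion
    Instantiate(IterVals);
    return;
  }
  const std::vector<int> &CurLoop = Loops[IterVals.size()];
  for (unsigned i = 0, e = CurLoop.size(); i != e; ++i) {
    IterVals.push_back(CurLoop[i]);  // bind the next iterator...
    forEachCombination(Loops, IterVals, Instantiate);
    IterVals.pop_back();             // ...and unbind before the next value
  }
}

int main() {
  std::vector<std::vector<int> > Loops(2);
  Loops[0].push_back(0); Loops[0].push_back(1);
  Loops[1].push_back(4); Loops[1].push_back(5);
  std::vector<int> IterVals;
  forEachCombination(Loops, IterVals, [](const std::vector<int> &V) {
    std::cout << V[0] << "," << V[1] << "\n";  // 0,4  0,5  1,4  1,5
  });
  return 0;
}
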
- Record *IterRec = new Record(*CurRec); - - // Set the iterator values now. - for (IterSet::iterator i = IterVals.begin(), iend = IterVals.end(); - i != iend; - ++i) { - VarInit *IterVar = dynamic_cast<VarInit *>(i->IterVar); - if (IterVar == 0) { - Error(Loc, "foreach iterator is unresolved"); - return true; - } - - TypedInit *IVal = dynamic_cast<TypedInit *>(i->IterValue); - if (IVal == 0) { - Error(Loc, "foreach iterator value is untyped"); - return true; - } - - IterRec->addValue(RecordVal(IterVar->getName(), IVal->getType(), false)); +bool TGParser::ProcessForeachDefs(Record *CurRec, SMLoc Loc, IterSet &IterVals){ + // Recursively build a tuple of iterator values. + if (IterVals.size() != Loops.size()) { + assert(IterVals.size() < Loops.size()); + ForeachLoop &CurLoop = Loops[IterVals.size()]; + ListInit *List = dynamic_cast<ListInit *>(CurLoop.ListValue); + if (List == 0) { + Error(Loc, "Loop list is not a list"); + return true; + } - if (SetValue(IterRec, Loc, IterVar->getName(), - std::vector<unsigned>(), IVal)) { - Error(Loc, "when instantiating this def"); - return true; - } + // Process each value. + for (int64_t i = 0; i < List->getSize(); ++i) { + Init *ItemVal = List->resolveListElementReference(*CurRec, 0, i); + IterVals.push_back(IterRecord(CurLoop.IterVar, ItemVal)); + if (ProcessForeachDefs(CurRec, Loc, IterVals)) + return true; + IterVals.pop_back(); + } + return false; + } - // Resolve it next. - IterRec->resolveReferencesTo(IterRec->getValue(IterVar->getName())); + // This is the bottom of the recursion. We have all of the iterator values + // for this point in the iteration space. Instantiate a new record to + // reflect this combination of values. + Record *IterRec = new Record(*CurRec); - // Remove it. - IterRec->removeValue(IterVar->getName()); - } + // Set the iterator values now. + for (unsigned i = 0, e = IterVals.size(); i != e; ++i) { + VarInit *IterVar = IterVals[i].IterVar; + TypedInit *IVal = dynamic_cast<TypedInit *>(IterVals[i].IterValue); + if (IVal == 0) { + Error(Loc, "foreach iterator value is untyped"); + return true; + } - if (Records.getDef(IterRec->getNameInitAsString())) { - Error(Loc, "def already exists: " + IterRec->getNameInitAsString()); - return true; - } + IterRec->addValue(RecordVal(IterVar->getName(), IVal->getType(), false)); - Records.addDef(IterRec); - IterRec->resolveReferences(); + if (SetValue(IterRec, Loc, IterVar->getName(), + std::vector<unsigned>(), IVal)) { + Error(Loc, "when instantiating this def"); + return true; } - if (NextLoop != Loops.end()) { - // Process nested loops. - if (ProcessForeachDefs(CurRec, CurMultiClass, Loc, IterVals, *NextLoop, - NextLoop+1)) { - Error(Loc, - "Could not process loops for def " + - CurRec->getNameInitAsString()); - return true; - } - } + // Resolve it next. + IterRec->resolveReferencesTo(IterRec->getValue(IterVar->getName())); + + // Remove it. + IterRec->removeValue(IterVar->getName()); + } - // We're done with this iterator. - IterVals.pop_back(); + if (Records.getDef(IterRec->getNameInitAsString())) { + Error(Loc, "def already exists: " + IterRec->getNameInitAsString()); + return true; } + + Records.addDef(IterRec); + IterRec->resolveReferences(); return false; } @@ -1726,9 +1697,11 @@ Init *TGParser::ParseDeclaration(Record *CurRec, /// the name of the declared object or a NULL Init on error. Return /// the name of the parsed initializer list through ForeachListName. 
/// -/// ForeachDeclaration ::= ID '=' Value +/// ForeachDeclaration ::= ID '=' '[' ValueList ']' +/// ForeachDeclaration ::= ID '=' '{' RangeList '}' +/// ForeachDeclaration ::= ID '=' RangePiece /// -Init *TGParser::ParseForeachDeclaration(Init *&ForeachListValue) { +VarInit *TGParser::ParseForeachDeclaration(ListInit *&ForeachListValue) { if (Lex.getCode() != tgtok::Id) { TokError("Expected identifier in foreach declaration"); return 0; @@ -1744,26 +1717,59 @@ Init *TGParser::ParseForeachDeclaration(Init *&ForeachListValue) { } Lex.Lex(); // Eat the '=' - // Expect a list initializer. - ForeachListValue = ParseValue(0, 0, ParseForeachMode); + RecTy *IterType = 0; + std::vector<unsigned> Ranges; - TypedInit *TypedList = dynamic_cast<TypedInit *>(ForeachListValue); - if (TypedList == 0) { - TokError("Value list is untyped"); - return 0; + switch (Lex.getCode()) { + default: TokError("Unknown token when expecting a range list"); return 0; + case tgtok::l_square: { // '[' ValueList ']' + Init *List = ParseSimpleValue(0, 0, ParseForeachMode); + ForeachListValue = dynamic_cast<ListInit*>(List); + if (ForeachListValue == 0) { + TokError("Expected a Value list"); + return 0; + } + RecTy *ValueType = ForeachListValue->getType(); + ListRecTy *ListType = dynamic_cast<ListRecTy *>(ValueType); + if (ListType == 0) { + TokError("Value list is not of list type"); + return 0; + } + IterType = ListType->getElementType(); + break; } - RecTy *ValueType = TypedList->getType(); - ListRecTy *ListType = dynamic_cast<ListRecTy *>(ValueType); - if (ListType == 0) { - TokError("Value list is not of list type"); - return 0; + case tgtok::IntVal: { // RangePiece. + if (ParseRangePiece(Ranges)) + return 0; + break; } - RecTy *IterType = ListType->getElementType(); - VarInit *IterVar = VarInit::get(DeclName, IterType); + case tgtok::l_brace: { // '{' RangeList '}' + Lex.Lex(); // eat the '{' + Ranges = ParseRangeList(); + if (Lex.getCode() != tgtok::r_brace) { + TokError("expected '}' at end of bit range list"); + return 0; + } + Lex.Lex(); + break; + } + } + + if (!Ranges.empty()) { + assert(!IterType && "Type already initialized?"); + IterType = IntRecTy::get(); + std::vector<Init*> Values; + for (unsigned i = 0, e = Ranges.size(); i != e; ++i) + Values.push_back(IntInit::get(Ranges[i])); + ForeachListValue = ListInit::get(Values, IterType); + } - return IterVar; + if (!IterType) + return 0; + + return VarInit::get(DeclName, IterType); } /// ParseTemplateArgList - Read a template argument list, which is a non-empty @@ -1932,7 +1938,7 @@ bool TGParser::ParseDef(MultiClass *CurMultiClass) { // Parse ObjectName and make a record for it. Record *CurRec = new Record(ParseObjectName(CurMultiClass), DefLoc, Records); - if (!CurMultiClass) { + if (!CurMultiClass && Loops.empty()) { // Top-level def definition. // Ensure redefinition doesn't happen. @@ -1942,7 +1948,7 @@ bool TGParser::ParseDef(MultiClass *CurMultiClass) { return true; } Records.addDef(CurRec); - } else { + } else if (CurMultiClass) { // Otherwise, a def inside a multiclass, add it to the multiclass. 
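
Stepping back to ParseForeachDeclaration above: with this change a foreach declaration accepts three surface forms, an explicit value list ([0, 1, 2]), a braced range list ({0-3, 7}), and a bare range piece (0-3). The latter two are normalized into the same typed ListInit the first form parses to, so everything downstream sees one representation. Sketched against the Init APIs this hunk itself uses (a reconstruction, not a function from the patch; later LLVM spells these APIs differently):

#include "llvm/TableGen/Record.h"

// Turn an expanded range such as {0, 1, 2, 3} into a ListInit of IntInits,
// exactly the shape a literal [0, 1, 2, 3] would have produced.
static llvm::ListInit *normalizeRange(const std::vector<unsigned> &Ranges) {
  std::vector<llvm::Init*> Values;
  for (unsigned i = 0, e = Ranges.size(); i != e; ++i)
    Values.push_back(llvm::IntInit::get(Ranges[i]));
  return llvm::ListInit::get(Values, llvm::IntRecTy::get());
}
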
for (unsigned i = 0, e = CurMultiClass->DefPrototypes.size(); i != e; ++i) if (CurMultiClass->DefPrototypes[i]->getNameInit() @@ -1978,7 +1984,7 @@ bool TGParser::ParseDef(MultiClass *CurMultiClass) { } } - if (ProcessForeachDefs(CurRec, CurMultiClass, DefLoc)) { + if (ProcessForeachDefs(CurRec, DefLoc)) { Error(DefLoc, "Could not process loops for def" + CurRec->getNameInitAsString()); return true; @@ -1999,8 +2005,8 @@ bool TGParser::ParseForeach(MultiClass *CurMultiClass) { // Make a temporary object to record items associated with the for // loop. - Init *ListValue = 0; - Init *IterName = ParseForeachDeclaration(ListValue); + ListInit *ListValue = 0; + VarInit *IterName = ParseForeachDeclaration(ListValue); if (IterName == 0) return TokError("expected declaration in for"); diff --git a/lib/TableGen/TGParser.h b/lib/TableGen/TGParser.h index b8e7cb1..3d2c72c 100644 --- a/lib/TableGen/TGParser.h +++ b/lib/TableGen/TGParser.h @@ -45,10 +45,11 @@ namespace llvm { /// ForeachLoop - Record the iteration state associated with a for loop. /// This is used to instantiate items in the loop body. struct ForeachLoop { - Init *IterVar; - Init *ListValue; + VarInit *IterVar; + ListInit *ListValue; - ForeachLoop(Init *IVar, Init *LValue) : IterVar(IVar), ListValue(LValue) {} + ForeachLoop(VarInit *IVar, ListInit *LValue) + : IterVar(IVar), ListValue(LValue) {} }; class TGParser { @@ -113,20 +114,17 @@ private: // Semantic analysis methods. // IterRecord: Map an iterator name to a value. struct IterRecord { - Init *IterVar; + VarInit *IterVar; Init *IterValue; - IterRecord(Init *Var, Init *Val) : IterVar(Var), IterValue(Val) {} + IterRecord(VarInit *Var, Init *Val) : IterVar(Var), IterValue(Val) {} }; // IterSet: The set of all iterator values at some point in the // iteration space. typedef std::vector<IterRecord> IterSet; - bool ProcessForeachDefs(Record *CurRec, MultiClass *CurMultiClass, - SMLoc Loc); - bool ProcessForeachDefs(Record *CurRec, MultiClass *CurMultiClass, - SMLoc Loc, IterSet &IterVals, ForeachLoop &CurLoop, - LoopVector::iterator NextLoop); + bool ProcessForeachDefs(Record *CurRec, SMLoc Loc); + bool ProcessForeachDefs(Record *CurRec, SMLoc Loc, IterSet &IterVals); private: // Parser methods. bool ParseObjectList(MultiClass *MC = 0); @@ -160,7 +158,7 @@ private: // Parser methods. 
bool ParseTemplateArgList(Record *CurRec); Init *ParseDeclaration(Record *CurRec, bool ParsingTemplateArgs); - Init *ParseForeachDeclaration(Init *&ForeachListValue); + VarInit *ParseForeachDeclaration(ListInit *&ForeachListValue); SubClassReference ParseSubClassReference(Record *CurRec, bool isDefm); SubMultiClassReference ParseSubMultiClassReference(MultiClass *CurMC); diff --git a/lib/TableGen/TableGenBackend.cpp b/lib/TableGen/TableGenBackend.cpp index 09bcc7a..7c8367a 100644 --- a/lib/TableGen/TableGenBackend.cpp +++ b/lib/TableGen/TableGenBackend.cpp @@ -1,4 +1,4 @@ -//===- TableGenBackend.cpp - Base class for TableGen Backends ---*- C++ -*-===// +//===- TableGenBackend.cpp - Utilities for TableGen Backends ----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -11,17 +11,27 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/Twine.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/TableGenBackend.h" -#include "llvm/TableGen/Record.h" using namespace llvm; -void TableGenBackend::anchor() { } - -void TableGenBackend::EmitSourceFileHeader(StringRef Desc, - raw_ostream &OS) const { - OS << "//===- TableGen'erated file -------------------------------------*-" - " C++ -*-===//\n//\n// " << Desc << "\n//\n// Automatically generate" - "d file, do not edit!\n//\n//===------------------------------------" - "----------------------------------===//\n\n"; +static void printLine(raw_ostream &OS, const Twine &Prefix, char Fill, + StringRef Suffix) { + uint64_t Pos = OS.tell(); + OS << Prefix; + for (unsigned i = OS.tell() - Pos, e = 80 - Suffix.size(); i != e; ++i) + OS << Fill; + OS << Suffix << '\n'; } +void llvm::emitSourceFileHeader(StringRef Desc, raw_ostream &OS) { + printLine(OS, "/*===- TableGen'erated file ", '-', "*- C++ -*-===*\\"); + printLine(OS, "|*", ' ', "*|"); + printLine(OS, "|* " + Desc, ' ', "*|"); + printLine(OS, "|*", ' ', "*|"); + printLine(OS, "|* Automatically generated file, do not edit!", ' ', "*|"); + printLine(OS, "|*", ' ', "*|"); + printLine(OS, "\\*===", '-', "===*/"); + OS << '\n'; +} diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index 9b0cb0c..cd3c0e0 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -141,7 +141,7 @@ def ProcA9 : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9", FeatureAvoidPartialCPSR]>; class ProcNoItin<string Name, list<SubtargetFeature> Features> - : Processor<Name, GenericItineraries, Features>; + : Processor<Name, NoItineraries, Features>; // V4 Processors. def : ProcNoItin<"generic", []>; @@ -204,13 +204,13 @@ def : Processor<"arm1156t2f-s", ARMV6Itineraries, [HasV6T2Ops, FeatureVFP2, FeatureDSPThumb2]>; // V7a Processors. 
-def : Processor<"cortex-a8", CortexA8Itineraries, +def : ProcessorModel<"cortex-a8", CortexA8Model, [ProcA8, HasV7Ops, FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureHasRAS]>; -def : Processor<"cortex-a9", CortexA9Itineraries, +def : ProcessorModel<"cortex-a9", CortexA9Model, [ProcA9, HasV7Ops, FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureHasRAS]>; -def : Processor<"cortex-a9-mp", CortexA9Itineraries, +def : ProcessorModel<"cortex-a9-mp", CortexA9Model, [ProcA9, HasV7Ops, FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureMP, FeatureHasRAS]>; diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp index 410790a..9a1ce06 100644 --- a/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/ARMAsmPrinter.cpp @@ -23,8 +23,8 @@ #include "InstPrinter/ARMInstPrinter.h" #include "MCTargetDesc/ARMAddressingModes.h" #include "MCTargetDesc/ARMMCExpr.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" #include "llvm/Module.h" #include "llvm/Type.h" #include "llvm/Assembly/Writer.h" @@ -283,9 +283,16 @@ void ARMAsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const { } } -void ARMAsmPrinter::EmitFunctionEntryLabel() { - OutStreamer.ForceCodeRegion(); +void ARMAsmPrinter::EmitFunctionBodyEnd() { + // Make sure to terminate any constant pools that were at the end + // of the function. + if (!InConstantPool) + return; + InConstantPool = false; + OutStreamer.EmitDataRegion(MCDR_DataRegionEnd); +} +void ARMAsmPrinter::EmitFunctionEntryLabel() { if (AFI->isThumbFunction()) { OutStreamer.EmitAssemblerFlag(MCAF_Code16); OutStreamer.EmitThumbFunc(CurrentFnSym); @@ -415,7 +422,9 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, if (ExtraCode[1] != 0) return true; // Unknown modifier. switch (ExtraCode[0]) { - default: return true; // Unknown modifier. + default: + // See if this is a generic print operand + return AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O); case 'a': // Print as a memory address. if (MI->getOperand(OpNum).isReg()) { O << "[" @@ -434,15 +443,18 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, printOperand(MI, OpNum, O); return false; case 'y': // Print a VFP single precision register as indexed double. - // This uses the ordering of the alias table to get the first 'd' register - // that overlaps the 's' register. Also, s0 is an odd register, hence the - // odd modulus check below. if (MI->getOperand(OpNum).isReg()) { unsigned Reg = MI->getOperand(OpNum).getReg(); const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); - O << ARMInstPrinter::getRegisterName(TRI->getAliasSet(Reg)[0]) << - (((Reg % 2) == 1) ? "[0]" : "[1]"); - return false; + // Find the 'd' register that has this 's' register as a sub-register, + // and determine the lane number. + for (MCSuperRegIterator SR(Reg, TRI); SR.isValid(); ++SR) { + if (!ARM::DPRRegClass.contains(*SR)) + continue; + bool Lane0 = TRI->getSubReg(*SR, ARM::ssub_0) == Reg; + O << ARMInstPrinter::getRegisterName(*SR) << (Lane0 ? "[0]" : "[1]"); + return false; + } } return true; case 'B': // Bitwise inverse of integer or symbol without a preceding #. @@ -934,13 +946,13 @@ void ARMAsmPrinter::EmitJumpTable(const MachineInstr *MI) { const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id unsigned JTI = MO1.getIndex(); - // Tag the jump table appropriately for precise disassembly. - OutStreamer.EmitJumpTable32Region(); - // Emit a label for the jump table. 
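
The 'y' modifier rewrite above no longer leans on alias-table ordering; it asks the register info directly which D register contains the S register and which lane it occupies. Distilled into a helper (same ARMAsmPrinter.cpp context assumed; the function name is ours):

// Find the D register containing SReg and report which lane SReg occupies:
// lane 0 if SReg is the ssub_0 sub-register, lane 1 otherwise.
static bool findDRegAndLane(unsigned SReg, const TargetRegisterInfo *TRI,
                            unsigned &DReg, unsigned &Lane) {
  for (MCSuperRegIterator SR(SReg, TRI); SR.isValid(); ++SR) {
    if (!ARM::DPRRegClass.contains(*SR))
      continue;                       // skip QPR and other super-registers
    DReg = *SR;
    Lane = (TRI->getSubReg(*SR, ARM::ssub_0) == SReg) ? 0 : 1;
    return true;
  }
  return false;                       // no containing D register found
}
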
MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm()); OutStreamer.EmitLabel(JTISymbol); + // Mark the jump table as data-in-code. + OutStreamer.EmitDataRegion(MCDR_DataRegionJT32); + // Emit each entry of the table. const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); @@ -969,6 +981,8 @@ void ARMAsmPrinter::EmitJumpTable(const MachineInstr *MI) { OutContext); OutStreamer.EmitValue(Expr, 4); } + // Mark the end of jump table data-in-code region. + OutStreamer.EmitDataRegion(MCDR_DataRegionEnd); } void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) { @@ -978,15 +992,6 @@ void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) { const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id unsigned JTI = MO1.getIndex(); - // Emit a label for the jump table. - if (MI->getOpcode() == ARM::t2TBB_JT) { - OutStreamer.EmitJumpTable8Region(); - } else if (MI->getOpcode() == ARM::t2TBH_JT) { - OutStreamer.EmitJumpTable16Region(); - } else { - OutStreamer.EmitJumpTable32Region(); - } - MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm()); OutStreamer.EmitLabel(JTISymbol); @@ -995,10 +1000,15 @@ void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) { const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; unsigned OffsetWidth = 4; - if (MI->getOpcode() == ARM::t2TBB_JT) + if (MI->getOpcode() == ARM::t2TBB_JT) { OffsetWidth = 1; - else if (MI->getOpcode() == ARM::t2TBH_JT) + // Mark the jump table as data-in-code. + OutStreamer.EmitDataRegion(MCDR_DataRegionJT8); + } else if (MI->getOpcode() == ARM::t2TBH_JT) { OffsetWidth = 2; + // Mark the jump table as data-in-code. + OutStreamer.EmitDataRegion(MCDR_DataRegionJT16); + } for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) { MachineBasicBlock *MBB = JTBBs[i]; @@ -1031,6 +1041,11 @@ void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) { OutContext); OutStreamer.EmitValue(Expr, OffsetWidth); } + // Mark the end of jump table data-in-code region. 32-bit offsets use + // actual branch instructions here, so we don't mark those as a data-region + // at all. + if (OffsetWidth != 4) + OutStreamer.EmitDataRegion(MCDR_DataRegionEnd); } void ARMAsmPrinter::PrintDebugValueComment(const MachineInstr *MI, @@ -1208,8 +1223,11 @@ extern cl::opt<bool> EnableARMEHABI; #include "ARMGenMCPseudoLowering.inc" void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { - if (MI->getOpcode() != ARM::CONSTPOOL_ENTRY) - OutStreamer.EmitCodeRegion(); + // If we just ended a constant pool, mark it as such. + if (InConstantPool && MI->getOpcode() != ARM::CONSTPOOL_ENTRY) { + OutStreamer.EmitDataRegion(MCDR_DataRegionEnd); + InConstantPool = false; + } // Emit unwinding stuff for frame-related instructions if (EnableARMEHABI && MI->getFlag(MachineInstr::FrameSetup)) @@ -1565,9 +1583,12 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { unsigned LabelId = (unsigned)MI->getOperand(0).getImm(); unsigned CPIdx = (unsigned)MI->getOperand(1).getIndex(); - // Mark the constant pool entry as data if we're not already in a data - // region. - OutStreamer.EmitDataRegion(); + // If this is the first entry of the pool, mark it. 
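
The InConstantPool flag (declared in the header hunk below) turns the old per-entry EmitDataRegion call into an open/close protocol: one MCDR_DataRegion marker at the first CONSTPOOL_ENTRY of a run, one MCDR_DataRegionEnd when the run ends, whether that is at the next real instruction or at the end of the function body. The protocol in isolation, sketched:

// Per-instruction bookkeeping: open a data region when entering a run of
// constant-pool entries, close it when leaving one.
static void noteInstruction(bool IsConstPoolEntry, MCStreamer &Streamer,
                            bool &InConstantPool) {
  if (IsConstPoolEntry && !InConstantPool) {
    Streamer.EmitDataRegion(MCDR_DataRegion);     // first entry of a run
    InConstantPool = true;
  } else if (!IsConstPoolEntry && InConstantPool) {
    Streamer.EmitDataRegion(MCDR_DataRegionEnd);  // the run just ended
    InConstantPool = false;
  }
}

EmitFunctionBodyEnd, added earlier in this file's diff, covers the remaining case: a pool that runs to the end of the function with no trailing instruction to close it.
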
+ if (!InConstantPool) { + OutStreamer.EmitDataRegion(MCDR_DataRegion); + InConstantPool = true; + } + OutStreamer.EmitLabel(GetCPISymbol(LabelId)); const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPIdx]; diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h index af3f75a..3555e8f 100644 --- a/lib/Target/ARM/ARMAsmPrinter.h +++ b/lib/Target/ARM/ARMAsmPrinter.h @@ -44,9 +44,12 @@ class LLVM_LIBRARY_VISIBILITY ARMAsmPrinter : public AsmPrinter { /// MachineFunction. const MachineConstantPool *MCP; + /// InConstantPool - Maintain state when emitting a sequence of constant + /// pool entries so we can properly mark them as data regions. + bool InConstantPool; public: explicit ARMAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) - : AsmPrinter(TM, Streamer), AFI(NULL), MCP(NULL) { + : AsmPrinter(TM, Streamer), AFI(NULL), MCP(NULL), InConstantPool(false) { Subtarget = &TM.getSubtarget<ARMSubtarget>(); } @@ -70,6 +73,7 @@ public: bool runOnMachineFunction(MachineFunction &F); virtual void EmitConstantPool() {} // we emit constant pools customly! + virtual void EmitFunctionBodyEnd(); virtual void EmitFunctionEntryLabel(); void EmitStartOfAsmFile(Module &M); void EmitEndOfAsmFile(Module &M); diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index c6280f8..714238a 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -51,9 +51,9 @@ WidenVMOVS("widen-vmovs", cl::Hidden, cl::init(true), /// ARM_MLxEntry - Record information about MLA / MLS instructions. struct ARM_MLxEntry { - unsigned MLxOpc; // MLA / MLS opcode - unsigned MulOpc; // Expanded multiplication opcode - unsigned AddSubOpc; // Expanded add / sub opcode + uint16_t MLxOpc; // MLA / MLS opcode + uint16_t MulOpc; // Expanded multiplication opcode + uint16_t AddSubOpc; // Expanded add / sub opcode bool NegAcc; // True if the acc is negated before the add / sub. bool HasLane; // True if instruction has an extra "lane" operand. }; @@ -1531,11 +1531,11 @@ ARMBaseInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { /// This will go away once we can teach tblgen how to set the optional CPSR def /// operand itself. struct AddSubFlagsOpcodePair { - unsigned PseudoOpc; - unsigned MachineOpc; + uint16_t PseudoOpc; + uint16_t MachineOpc; }; -static AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = { +static const AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = { {ARM::ADDSri, ARM::ADDri}, {ARM::ADDSrr, ARM::ADDrr}, {ARM::ADDSrsi, ARM::ADDrsi}, @@ -1563,14 +1563,9 @@ static AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = { }; unsigned llvm::convertAddSubFlagsOpcode(unsigned OldOpc) { - static const int NPairs = - sizeof(AddSubFlagsOpcodeMap) / sizeof(AddSubFlagsOpcodePair); - for (AddSubFlagsOpcodePair *OpcPair = &AddSubFlagsOpcodeMap[0], - *End = &AddSubFlagsOpcodeMap[NPairs]; OpcPair != End; ++OpcPair) { - if (OldOpc == OpcPair->PseudoOpc) { - return OpcPair->MachineOpc; - } - } + for (unsigned i = 0, e = array_lengthof(AddSubFlagsOpcodeMap); i != e; ++i) + if (OldOpc == AddSubFlagsOpcodeMap[i].PseudoOpc) + return AddSubFlagsOpcodeMap[i].MachineOpc; return 0; } @@ -1742,20 +1737,33 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, return Offset == 0; } +/// analyzeCompare - For a comparison instruction, return the source registers +/// in SrcReg and SrcReg2 if having two register operands, and the value it +/// compares against in CmpValue. Return true if the comparison instruction +/// can be analyzed. 
bool ARMBaseInstrInfo:: -AnalyzeCompare(const MachineInstr *MI, unsigned &SrcReg, int &CmpMask, - int &CmpValue) const { +analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2, + int &CmpMask, int &CmpValue) const { switch (MI->getOpcode()) { default: break; case ARM::CMPri: case ARM::t2CMPri: SrcReg = MI->getOperand(0).getReg(); + SrcReg2 = 0; CmpMask = ~0; CmpValue = MI->getOperand(1).getImm(); return true; + case ARM::CMPrr: + case ARM::t2CMPrr: + SrcReg = MI->getOperand(0).getReg(); + SrcReg2 = MI->getOperand(1).getReg(); + CmpMask = ~0; + CmpValue = 0; + return true; case ARM::TSTri: case ARM::t2TSTri: SrcReg = MI->getOperand(0).getReg(); + SrcReg2 = 0; CmpMask = MI->getOperand(1).getImm(); CmpValue = 0; return true; @@ -1793,20 +1801,67 @@ static bool isSuitableForMask(MachineInstr *&MI, unsigned SrcReg, return false; } -/// OptimizeCompareInstr - Convert the instruction supplying the argument to the -/// comparison into one that sets the zero bit in the flags register. -bool ARMBaseInstrInfo:: -OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, int CmpMask, - int CmpValue, const MachineRegisterInfo *MRI) const { - if (CmpValue != 0) - return false; +/// getSwappedCondition - assume the flags are set by MI(a,b), return +/// the condition code if we modify the instructions such that flags are +/// set by MI(b,a). +inline static ARMCC::CondCodes getSwappedCondition(ARMCC::CondCodes CC) { + switch (CC) { + default: return ARMCC::AL; + case ARMCC::EQ: return ARMCC::EQ; + case ARMCC::NE: return ARMCC::NE; + case ARMCC::HS: return ARMCC::LS; + case ARMCC::LO: return ARMCC::HI; + case ARMCC::HI: return ARMCC::LO; + case ARMCC::LS: return ARMCC::HS; + case ARMCC::GE: return ARMCC::LE; + case ARMCC::LT: return ARMCC::GT; + case ARMCC::GT: return ARMCC::LT; + case ARMCC::LE: return ARMCC::GE; + } +} + +/// isRedundantFlagInstr - check whether the first instruction, whose only +/// purpose is to update flags, can be made redundant. +/// CMPrr can be made redundant by SUBrr if the operands are the same. +/// CMPri can be made redundant by SUBri if the operands are the same. +/// This function can be extended later on. +inline static bool isRedundantFlagInstr(MachineInstr *CmpI, unsigned SrcReg, + unsigned SrcReg2, int ImmValue, + MachineInstr *OI) { + if ((CmpI->getOpcode() == ARM::CMPrr || + CmpI->getOpcode() == ARM::t2CMPrr) && + (OI->getOpcode() == ARM::SUBrr || + OI->getOpcode() == ARM::t2SUBrr) && + ((OI->getOperand(1).getReg() == SrcReg && + OI->getOperand(2).getReg() == SrcReg2) || + (OI->getOperand(1).getReg() == SrcReg2 && + OI->getOperand(2).getReg() == SrcReg))) + return true; - MachineRegisterInfo::def_iterator DI = MRI->def_begin(SrcReg); - if (llvm::next(DI) != MRI->def_end()) - // Only support one definition. - return false; + if ((CmpI->getOpcode() == ARM::CMPri || + CmpI->getOpcode() == ARM::t2CMPri) && + (OI->getOpcode() == ARM::SUBri || + OI->getOpcode() == ARM::t2SUBri) && + OI->getOperand(1).getReg() == SrcReg && + OI->getOperand(2).getImm() == ImmValue) + return true; + return false; +} - MachineInstr *MI = &*DI; +/// optimizeCompareInstr - Convert the instruction supplying the argument to the +/// comparison into one that sets the zero bit in the flags register; +/// Remove a redundant Compare instruction if an earlier instruction can set the +/// flags in the same way as Compare. +/// E.g. SUBrr(r1,r2) and CMPrr(r1,r2). 
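
Concretely, with illustrative registers that are not taken from the patch, the straight-operand case reads:

    sub  r0, r1, r2      @ r0 = r1 - r2
    cmp  r1, r2          @ recomputes r1 - r2 just for the flags
    beq  .Ltaken

which becomes:

    subs r0, r1, r2      @ the sub now sets CPSR itself
    beq  .Ltaken         @ the cmp is deleted

In the swapped case the comparison's users must also have their condition codes re-oriented through getSwappedCondition above: for example, a GE that was based on CMPrr(r2,r1) becomes LE once the flags come from SUBrr(r1,r2).
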
We also handle the case where two +/// operands are swapped: SUBrr(r1,r2) and CMPrr(r2,r1), by updating the +/// condition code of instructions which use the flags. +bool ARMBaseInstrInfo:: +optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, + int CmpMask, int CmpValue, + const MachineRegisterInfo *MRI) const { + // Get the unique definition of SrcReg. + MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); + if (!MI) return false; // Masked compares sometimes use the same register as the corresponding 'and'. if (CmpMask != ~0) { @@ -1825,32 +1880,49 @@ OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, int CmpMask, } } - // Conservatively refuse to convert an instruction which isn't in the same BB - // as the comparison. - if (MI->getParent() != CmpInstr->getParent()) - return false; - - // Check that CPSR isn't set between the comparison instruction and the one we - // want to change. - MachineBasicBlock::iterator I = CmpInstr,E = MI, B = MI->getParent()->begin(); + // Get ready to iterate backward from CmpInstr. + MachineBasicBlock::iterator I = CmpInstr, E = MI, + B = CmpInstr->getParent()->begin(); // Early exit if CmpInstr is at the beginning of the BB. if (I == B) return false; + // There are two possible candidates which can be changed to set CPSR: + // One is MI, the other is a SUB instruction. + // For CMPrr(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1). + // For CMPri(r1, CmpValue), we are looking for SUBri(r1, CmpValue). + MachineInstr *Sub = NULL; + if (SrcReg2 != 0) + // MI is not a candidate for CMPrr. + MI = NULL; + else if (MI->getParent() != CmpInstr->getParent() || CmpValue != 0) { + // Conservatively refuse to convert an instruction which isn't in the same + // BB as the comparison. + // For CMPri, we need to check Sub, thus we can't return here. + if (CmpInstr->getOpcode() == ARM::CMPri || + CmpInstr->getOpcode() == ARM::t2CMPri) + MI = NULL; + else + return false; + } + + // Check that CPSR isn't set between the comparison instruction and the one we + // want to change. At the same time, search for Sub. + const TargetRegisterInfo *TRI = &getRegisterInfo(); --I; for (; I != E; --I) { const MachineInstr &Instr = *I; - for (unsigned IO = 0, EO = Instr.getNumOperands(); IO != EO; ++IO) { - const MachineOperand &MO = Instr.getOperand(IO); - if (MO.isRegMask() && MO.clobbersPhysReg(ARM::CPSR)) - return false; - if (!MO.isReg()) continue; - + if (Instr.modifiesRegister(ARM::CPSR, TRI) || + Instr.readsRegister(ARM::CPSR, TRI)) // This instruction modifies or uses CPSR after the one we want to // change. We can't do this transformation. - if (MO.getReg() == ARM::CPSR) - return false; + return false; + + // Check whether CmpInstr can be made redundant by the current instruction. + if (isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, &*I)) { + Sub = &*I; + break; } if (I == B) @@ -1858,7 +1930,13 @@ OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, int CmpMask, return false; } - // Set the "zero" bit in CPSR. + // Return false if no candidates exist. + if (!MI && !Sub) + return false; + + // The single candidate is called MI. + if (!MI) MI = Sub; + switch (MI->getOpcode()) { default: break; case ARM::RSBrr: @@ -1894,13 +1972,17 @@ OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, int CmpMask, case ARM::EORri: case ARM::t2EORrr: case ARM::t2EORri: { - // Scan forward for the use of CPSR, if it's a conditional code requires - // checking of V bit, then this is not safe to do. 
If we can't find the - // CPSR use (i.e. used in another block), then it's not safe to perform - // the optimization. + // Scan forward for the use of CPSR + // When checking against MI: if it's a conditional code requires + // checking of V bit, then this is not safe to do. + // It is safe to remove CmpInstr if CPSR is redefined or killed. + // If we are done with the basic block, we need to check whether CPSR is + // live-out. + SmallVector<std::pair<MachineOperand*, ARMCC::CondCodes>, 4> + OperandsToUpdate; bool isSafe = false; I = CmpInstr; - E = MI->getParent()->end(); + E = CmpInstr->getParent()->end(); while (!isSafe && ++I != E) { const MachineInstr &Instr = *I; for (unsigned IO = 0, EO = Instr.getNumOperands(); @@ -1918,28 +2000,56 @@ OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, int CmpMask, } // Condition code is after the operand before CPSR. ARMCC::CondCodes CC = (ARMCC::CondCodes)Instr.getOperand(IO-1).getImm(); - switch (CC) { - default: - isSafe = true; - break; - case ARMCC::VS: - case ARMCC::VC: - case ARMCC::GE: - case ARMCC::LT: - case ARMCC::GT: - case ARMCC::LE: - return false; + if (Sub) { + ARMCC::CondCodes NewCC = getSwappedCondition(CC); + if (NewCC == ARMCC::AL) + return false; + // If we have SUB(r1, r2) and CMP(r2, r1), the condition code based + // on CMP needs to be updated to be based on SUB. + // Push the condition code operands to OperandsToUpdate. + // If it is safe to remove CmpInstr, the condition code of these + // operands will be modified. + if (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 && + Sub->getOperand(2).getReg() == SrcReg) + OperandsToUpdate.push_back(std::make_pair(&((*I).getOperand(IO-1)), + NewCC)); } + else + switch (CC) { + default: + // CPSR can be used multiple times, we should continue. + break; + case ARMCC::VS: + case ARMCC::VC: + case ARMCC::GE: + case ARMCC::LT: + case ARMCC::GT: + case ARMCC::LE: + return false; + } } } - if (!isSafe) - return false; + // If CPSR is not killed nor re-defined, we should check whether it is + // live-out. If it is live-out, do not optimize. + if (!isSafe) { + MachineBasicBlock *MBB = CmpInstr->getParent(); + for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(), + SE = MBB->succ_end(); SI != SE; ++SI) + if ((*SI)->isLiveIn(ARM::CPSR)) + return false; + } // Toggle the optional operand to CPSR. MI->getOperand(5).setReg(ARM::CPSR); MI->getOperand(5).setIsDef(true); CmpInstr->eraseFromParent(); + + // Modify the condition code of operands in OperandsToUpdate. + // Since we have SUB(r1, r2) and CMP(r2, r1), the condition code needs to + // be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc. + for (unsigned i = 0, e = OperandsToUpdate.size(); i < e; i++) + OperandsToUpdate[i].first->setImm(OperandsToUpdate[i].second); return true; } } @@ -2071,9 +2181,9 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData, const MCInstrDesc &Desc = MI->getDesc(); unsigned Class = Desc.getSchedClass(); - unsigned UOps = ItinData->Itineraries[Class].NumMicroOps; - if (UOps) - return UOps; + int ItinUOps = ItinData->getNumMicroOps(Class); + if (ItinUOps >= 0) + return ItinUOps; unsigned Opc = MI->getOpcode(); switch (Opc) { @@ -2088,7 +2198,7 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData, // // On Cortex-A8, each pair of register loads / stores can be scheduled on the // same cycle. The scheduling for the first load / store must be done - // separately by assuming the the address is not 64-bit aligned. 
+ // separately by assuming the address is not 64-bit aligned. // // On Cortex-A9, the formula is simply (#reg / 2) + (#reg % 2). If the address // is not 64-bit aligned, then AGU would take an extra cycle. For VFP / NEON @@ -2147,19 +2257,19 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData, return 2; // 4 registers would be issued: 2, 2. // 5 registers would be issued: 2, 2, 1. - UOps = (NumRegs / 2); + int A8UOps = (NumRegs / 2); if (NumRegs % 2) - ++UOps; - return UOps; + ++A8UOps; + return A8UOps; } else if (Subtarget.isCortexA9()) { - UOps = (NumRegs / 2); + int A9UOps = (NumRegs / 2); // If there are odd number of registers or if it's not 64-bit aligned, // then it takes an extra AGU (Address Generation Unit) cycle. if ((NumRegs % 2) || !MI->hasOneMemOperand() || (*MI->memoperands_begin())->getAlignment() < 8) - ++UOps; - return UOps; + ++A9UOps; + return A9UOps; } else { // Assume the worst. return NumRegs; @@ -2478,82 +2588,14 @@ static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI, return II; } -int -ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, - const MachineInstr *DefMI, unsigned DefIdx, - const MachineInstr *UseMI, unsigned UseIdx) const { - if (DefMI->isCopyLike() || DefMI->isInsertSubreg() || - DefMI->isRegSequence() || DefMI->isImplicitDef()) - return 1; - - if (!ItinData || ItinData->isEmpty()) - return DefMI->mayLoad() ? 3 : 1; - - const MCInstrDesc *DefMCID = &DefMI->getDesc(); - const MCInstrDesc *UseMCID = &UseMI->getDesc(); - const MachineOperand &DefMO = DefMI->getOperand(DefIdx); - unsigned Reg = DefMO.getReg(); - if (Reg == ARM::CPSR) { - if (DefMI->getOpcode() == ARM::FMSTAT) { - // fpscr -> cpsr stalls over 20 cycles on A8 (and earlier?) - return Subtarget.isCortexA9() ? 1 : 20; - } - - // CPSR set and branch can be paired in the same cycle. - if (UseMI->isBranch()) - return 0; - - // Otherwise it takes the instruction latency (generally one). - int Latency = getInstrLatency(ItinData, DefMI); - - // For Thumb2 and -Os, prefer scheduling CPSR setting instruction close to - // its uses. Instructions which are otherwise scheduled between them may - // incur a code size penalty (not able to use the CPSR setting 16-bit - // instructions). - if (Latency > 0 && Subtarget.isThumb2()) { - const MachineFunction *MF = DefMI->getParent()->getParent(); - if (MF->getFunction()->hasFnAttr(Attribute::OptimizeForSize)) - --Latency; - } - return Latency; - } - - unsigned DefAlign = DefMI->hasOneMemOperand() - ? (*DefMI->memoperands_begin())->getAlignment() : 0; - unsigned UseAlign = UseMI->hasOneMemOperand() - ? 
(*UseMI->memoperands_begin())->getAlignment() : 0; - - unsigned DefAdj = 0; - if (DefMI->isBundle()) { - DefMI = getBundledDefMI(&getRegisterInfo(), DefMI, Reg, DefIdx, DefAdj); - if (DefMI->isCopyLike() || DefMI->isInsertSubreg() || - DefMI->isRegSequence() || DefMI->isImplicitDef()) - return 1; - DefMCID = &DefMI->getDesc(); - } - unsigned UseAdj = 0; - if (UseMI->isBundle()) { - unsigned NewUseIdx; - const MachineInstr *NewUseMI = getBundledUseMI(&getRegisterInfo(), UseMI, - Reg, NewUseIdx, UseAdj); - if (NewUseMI) { - UseMI = NewUseMI; - UseIdx = NewUseIdx; - UseMCID = &UseMI->getDesc(); - } - } - - int Latency = getOperandLatency(ItinData, *DefMCID, DefIdx, DefAlign, - *UseMCID, UseIdx, UseAlign); - int Adj = DefAdj + UseAdj; - if (Adj) { - Latency -= (int)(DefAdj + UseAdj); - if (Latency < 1) - return 1; - } - - if (Latency > 1 && - (Subtarget.isCortexA8() || Subtarget.isCortexA9())) { +/// Return the number of cycles to add to (or subtract from) the static +/// itinerary based on the def opcode and alignment. The caller will ensure that +/// adjusted latency is at least one cycle. +static int adjustDefLatency(const ARMSubtarget &Subtarget, + const MachineInstr *DefMI, + const MCInstrDesc *DefMCID, unsigned DefAlign) { + int Adjust = 0; + if (Subtarget.isCortexA8() || Subtarget.isCortexA9()) { // FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2] // variants are one cycle cheaper. switch (DefMCID->getOpcode()) { @@ -2564,7 +2606,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal); if (ShImm == 0 || (ShImm == 2 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)) - --Latency; + --Adjust; break; } case ARM::t2LDRs: @@ -2574,13 +2616,13 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, // Thumb2 mode: lsl only. unsigned ShAmt = DefMI->getOperand(3).getImm(); if (ShAmt == 0 || ShAmt == 2) - --Latency; + --Adjust; break; } } } - if (DefAlign < 8 && Subtarget.isCortexA9()) + if (DefAlign < 8 && Subtarget.isCortexA9()) { switch (DefMCID->getOpcode()) { default: break; case ARM::VLD1q8: @@ -2689,10 +2731,101 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, case ARM::VLD4LNq32_UPD: // If the address is not 64-bit aligned, the latencies of these // instructions increases by one. - ++Latency; + ++Adjust; break; } + } + return Adjust; +} + + + +int +ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, + unsigned UseIdx) const { + // No operand latency. The caller may fall back to getInstrLatency. 
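
That fallback is a sentinel contract: the body below returns -1 when no per-operand itinerary entry is usable, and callers test for a negative result and degrade to the coarser per-instruction number, the same shape hasHighOperandLatency uses at the end of this file's diff:

  int Latency = getOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx);
  if (Latency < 0)  // no usable per-operand itinerary entry
    Latency = getInstrLatency(ItinData, DefMI);
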
+ if (!ItinData || ItinData->isEmpty()) + return -1; + const MachineOperand &DefMO = DefMI->getOperand(DefIdx); + unsigned Reg = DefMO.getReg(); + const MCInstrDesc *DefMCID = &DefMI->getDesc(); + const MCInstrDesc *UseMCID = &UseMI->getDesc(); + + unsigned DefAdj = 0; + if (DefMI->isBundle()) { + DefMI = getBundledDefMI(&getRegisterInfo(), DefMI, Reg, DefIdx, DefAdj); + DefMCID = &DefMI->getDesc(); + } + if (DefMI->isCopyLike() || DefMI->isInsertSubreg() || + DefMI->isRegSequence() || DefMI->isImplicitDef()) { + return 1; + } + + unsigned UseAdj = 0; + if (UseMI->isBundle()) { + unsigned NewUseIdx; + const MachineInstr *NewUseMI = getBundledUseMI(&getRegisterInfo(), UseMI, + Reg, NewUseIdx, UseAdj); + if (!NewUseMI) + return -1; + + UseMI = NewUseMI; + UseIdx = NewUseIdx; + UseMCID = &UseMI->getDesc(); + } + + if (Reg == ARM::CPSR) { + if (DefMI->getOpcode() == ARM::FMSTAT) { + // fpscr -> cpsr stalls over 20 cycles on A8 (and earlier?) + return Subtarget.isCortexA9() ? 1 : 20; + } + + // CPSR set and branch can be paired in the same cycle. + if (UseMI->isBranch()) + return 0; + + // Otherwise it takes the instruction latency (generally one). + unsigned Latency = getInstrLatency(ItinData, DefMI); + + // For Thumb2 and -Os, prefer scheduling CPSR setting instruction close to + // its uses. Instructions which are otherwise scheduled between them may + // incur a code size penalty (not able to use the CPSR setting 16-bit + // instructions). + if (Latency > 0 && Subtarget.isThumb2()) { + const MachineFunction *MF = DefMI->getParent()->getParent(); + if (MF->getFunction()->hasFnAttr(Attribute::OptimizeForSize)) + --Latency; + } + return Latency; + } + + if (DefMO.isImplicit() || UseMI->getOperand(UseIdx).isImplicit()) + return -1; + + unsigned DefAlign = DefMI->hasOneMemOperand() + ? (*DefMI->memoperands_begin())->getAlignment() : 0; + unsigned UseAlign = UseMI->hasOneMemOperand() + ? (*UseMI->memoperands_begin())->getAlignment() : 0; + + // Get the itinerary's latency if possible, and handle variable_ops. + int Latency = getOperandLatency(ItinData, *DefMCID, DefIdx, DefAlign, + *UseMCID, UseIdx, UseAlign); + // Unable to find operand latency. The caller may resort to getInstrLatency. + if (Latency < 0) + return Latency; + + // Adjust for IT block position. + int Adj = DefAdj + UseAdj; + + // Adjust for dynamic def-side opcode variants not captured by the itinerary. + Adj += adjustDefLatency(Subtarget, DefMI, DefMCID, DefAlign); + if (Adj >= 0 || (int)Latency > -Adj) { + return Latency + Adj; + } + // Return the itinerary latency, which may be zero but not less than zero. return Latency; } @@ -2892,22 +3025,20 @@ ARMBaseInstrInfo::getOutputLatency(const InstrItineraryData *ItinData, return 1; // If the second MI is predicated, then there is an implicit use dependency. - return getOperandLatency(ItinData, DefMI, DefIdx, DepMI, - DepMI->getNumOperands()); + return getInstrLatency(ItinData, DefMI); } -int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, - const MachineInstr *MI, - unsigned *PredCost) const { +unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost) const { if (MI->isCopyLike() || MI->isInsertSubreg() || MI->isRegSequence() || MI->isImplicitDef()) return 1; - if (!ItinData || ItinData->isEmpty()) - return 1; - + // An instruction scheduler typically runs on unbundled instructions, however + // other passes may query the latency of a bundled instruction. 
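
The guard at the end of getOperandLatency above, repeated in getInstrLatency below, reads obliquely but is just saturation: a negative adjustment is applied only when the adjusted value stays strictly positive, otherwise the raw itinerary latency is returned unchanged. Restated as a helper (the worked numbers are ours):

// Equivalent formulation of the "Adj >= 0 || (int)Latency > -Adj" guard.
static unsigned adjustedLatency(unsigned Latency, int Adj) {
  if (Adj >= 0 || (int)Latency > -Adj)
    return Latency + Adj;
  return Latency;  // e.g. Latency = 1, Adj = -2: keep 1 rather than wrap
}
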
if (MI->isBundle()) { - int Latency = 0; + unsigned Latency = 0; MachineBasicBlock::const_instr_iterator I = MI; MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); while (++I != E && I->isInsideBundle()) { @@ -2918,15 +3049,33 @@ int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, } const MCInstrDesc &MCID = MI->getDesc(); - unsigned Class = MCID.getSchedClass(); - unsigned UOps = ItinData->Itineraries[Class].NumMicroOps; - if (PredCost && (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR))) + if (PredCost && (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR))) { // When predicated, CPSR is an additional source operand for CPSR updating // instructions, this apparently increases their latencies. *PredCost = 1; - if (UOps) - return ItinData->getStageLatency(Class); - return getNumMicroOps(ItinData, MI); + } + // Be sure to call getStageLatency for an empty itinerary in case it has a + // valid MinLatency property. + if (!ItinData) + return MI->mayLoad() ? 3 : 1; + + unsigned Class = MCID.getSchedClass(); + + // For instructions with variable uops, use uops as latency. + if (!ItinData->isEmpty() && ItinData->getNumMicroOps(Class) < 0) + return getNumMicroOps(ItinData, MI); + + // For the common case, fall back on the itinerary's latency. + unsigned Latency = ItinData->getStageLatency(Class); + + // Adjust for dynamic def-side opcode variants not captured by the itinerary. + unsigned DefAlign = MI->hasOneMemOperand() + ? (*MI->memoperands_begin())->getAlignment() : 0; + int Adj = adjustDefLatency(Subtarget, MI, &MCID, DefAlign); + if (Adj >= 0 || (int)Latency > -Adj) { + return Latency + Adj; + } + return Latency; } int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, @@ -2960,7 +3109,10 @@ hasHighOperandLatency(const InstrItineraryData *ItinData, return true; // Hoist VFP / NEON instructions with 4 or higher latency. - int Latency = getOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx); + int Latency = computeOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx, + /*FindMin=*/false); + if (Latency < 0) + Latency = getInstrLatency(ItinData, DefMI); if (Latency <= 3) return false; return DDomain == ARMII::DomainVFP || DDomain == ARMII::DomainNEON || diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index 2fe8507..1a10a4a 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -186,16 +186,20 @@ public: return NumCycles == 1; } - /// AnalyzeCompare - For a comparison instruction, return the source register - /// in SrcReg and the value it compares against in CmpValue. Return true if - /// the comparison instruction can be analyzed. - virtual bool AnalyzeCompare(const MachineInstr *MI, unsigned &SrcReg, - int &CmpMask, int &CmpValue) const; - - /// OptimizeCompareInstr - Convert the instruction to set the zero flag so - /// that we can remove a "comparison with zero". - virtual bool OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, - int CmpMask, int CmpValue, + /// analyzeCompare - For a comparison instruction, return the source registers + /// in SrcReg and SrcReg2 if having two register operands, and the value it + /// compares against in CmpValue. Return true if the comparison instruction + /// can be analyzed. 
+ virtual bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, + unsigned &SrcReg2, int &CmpMask, + int &CmpValue) const; + + /// optimizeCompareInstr - Convert the instruction to set the zero flag so + /// that we can remove a "comparison with zero"; Remove a redundant CMP + /// instruction if the flags can be updated in the same way by an earlier + /// instruction such as SUB. + virtual bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, + unsigned SrcReg2, int CmpMask, int CmpValue, const MachineRegisterInfo *MRI) const; /// FoldImmediate - 'Reg' is known to be defined by a move immediate @@ -249,8 +253,9 @@ private: const MCInstrDesc &UseMCID, unsigned UseIdx, unsigned UseAlign) const; - int getInstrLatency(const InstrItineraryData *ItinData, - const MachineInstr *MI, unsigned *PredCost = 0) const; + unsigned getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost = 0) const; int getInstrLatency(const InstrItineraryData *ItinData, SDNode *Node) const; diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 3907f75..231bd26 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -62,12 +62,14 @@ ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii, const uint16_t* ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { - return (STI.isTargetIOS()) ? CSR_iOS_SaveList : CSR_AAPCS_SaveList; + return (STI.isTargetIOS() && !STI.isAAPCS_ABI()) + ? CSR_iOS_SaveList : CSR_AAPCS_SaveList; } const uint32_t* ARMBaseRegisterInfo::getCallPreservedMask(CallingConv::ID) const { - return (STI.isTargetIOS()) ? CSR_iOS_RegMask : CSR_AAPCS_RegMask; + return (STI.isTargetIOS() && !STI.isAAPCS_ABI()) + ? CSR_iOS_RegMask : CSR_AAPCS_RegMask; } BitVector ARMBaseRegisterInfo:: @@ -257,8 +259,9 @@ ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) } const TargetRegisterClass * -ARMBaseRegisterInfo::getPointerRegClass(unsigned Kind) const { - return ARM::GPRRegisterClass; +ARMBaseRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) + const { + return &ARM::GPRRegClass; } const TargetRegisterClass * @@ -369,7 +372,7 @@ ARMBaseRegisterInfo::getRawAllocationOrder(const TargetRegisterClass *RC, }; // We only support even/odd hints for GPR and rGPR. - if (RC != ARM::GPRRegisterClass && RC != ARM::rGPRRegisterClass) + if (RC != &ARM::GPRRegClass && RC != &ARM::rGPRRegClass) return RC->getRawAllocationOrder(MF); if (HintType == ARMRI::RegPairEven) { @@ -712,6 +715,11 @@ requiresRegisterScavenging(const MachineFunction &MF) const { } bool ARMBaseRegisterInfo:: +trackLivenessAfterRegAlloc(const MachineFunction &MF) const { + return true; +} + +bool ARMBaseRegisterInfo:: requiresFrameIndexScavenging(const MachineFunction &MF) const { return true; } @@ -932,7 +940,8 @@ materializeFrameBaseRegister(MachineBasicBlock *MBB, const MCInstrDesc &MCID = TII.get(ADDriOpc); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this)); + const MachineFunction &MF = *MBB->getParent(); + MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this, MF)); MachineInstrBuilder MIB = AddDefaultPred(BuildMI(*MBB, Ins, DL, MCID, BaseReg) .addFrameIndex(FrameIdx).addImm(Offset)); @@ -1110,7 +1119,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // Must be addrmode4/6. 
MI.getOperand(i).ChangeToRegister(FrameReg, false, false, false); else { - ScratchReg = MF.getRegInfo().createVirtualRegister(ARM::GPRRegisterClass); + ScratchReg = MF.getRegInfo().createVirtualRegister(&ARM::GPRRegClass); if (!AFI->isThumbFunction()) emitARMRegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, Pred, PredReg, TII); diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h index af79351..da29f7e 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -109,7 +109,8 @@ public: SmallVectorImpl<unsigned> &SubIndices, unsigned &NewSubIdx) const; - const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const; + const TargetRegisterClass* + getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const; const TargetRegisterClass* getCrossCopyRegClass(const TargetRegisterClass *RC) const; @@ -173,6 +174,8 @@ public: virtual bool requiresRegisterScavenging(const MachineFunction &MF) const; + virtual bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const; + virtual bool requiresFrameIndexScavenging(const MachineFunction &MF) const; virtual bool requiresVirtualBaseRegisters(const MachineFunction &MF) const; diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp index bc681be..b8627a2 100644 --- a/lib/Target/ARM/ARMCodeEmitter.cpp +++ b/lib/Target/ARM/ARMCodeEmitter.cpp @@ -1648,7 +1648,7 @@ void ARMCodeEmitter::emitMiscBranchInstruction(const MachineInstr &MI) { static unsigned encodeVFPRd(const MachineInstr &MI, unsigned OpIdx) { unsigned RegD = MI.getOperand(OpIdx).getReg(); unsigned Binary = 0; - bool isSPVFP = ARM::SPRRegisterClass->contains(RegD); + bool isSPVFP = ARM::SPRRegClass.contains(RegD); RegD = getARMRegisterNumbering(RegD); if (!isSPVFP) { Binary |= (RegD & 0x0F) << ARMII::RegRdShift; @@ -1663,7 +1663,7 @@ static unsigned encodeVFPRd(const MachineInstr &MI, unsigned OpIdx) { static unsigned encodeVFPRn(const MachineInstr &MI, unsigned OpIdx) { unsigned RegN = MI.getOperand(OpIdx).getReg(); unsigned Binary = 0; - bool isSPVFP = ARM::SPRRegisterClass->contains(RegN); + bool isSPVFP = ARM::SPRRegClass.contains(RegN); RegN = getARMRegisterNumbering(RegN); if (!isSPVFP) { Binary |= (RegN & 0x0F) << ARMII::RegRnShift; @@ -1678,7 +1678,7 @@ static unsigned encodeVFPRn(const MachineInstr &MI, unsigned OpIdx) { static unsigned encodeVFPRm(const MachineInstr &MI, unsigned OpIdx) { unsigned RegM = MI.getOperand(OpIdx).getReg(); unsigned Binary = 0; - bool isSPVFP = ARM::SPRRegisterClass->contains(RegM); + bool isSPVFP = ARM::SPRRegClass.contains(RegM); RegM = getARMRegisterNumbering(RegM); if (!isSPVFP) { Binary |= (RegM & 0x0F); diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp index fc35c7c..a953985 100644 --- a/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -69,27 +69,6 @@ static inline unsigned UnknownPadding(unsigned LogAlign, unsigned KnownBits) { return 0; } -/// WorstCaseAlign - Assuming only the low KnownBits bits in Offset are exact, -/// add padding such that: -/// -/// 1. The result is aligned to 1 << LogAlign. -/// -/// 2. No other value of the unknown bits would require more padding. -/// -/// This may add more padding than is required to satisfy just one of the -/// constraints. It is necessary to compute alignment this way to guarantee -/// that we don't underestimate the padding before an aligned block. 
If the -/// real padding before a block is larger than we think, constant pool entries -/// may go out of range. -static inline unsigned WorstCaseAlign(unsigned Offset, unsigned LogAlign, - unsigned KnownBits) { - // Add the worst possible padding that the unknown bits could cause. - Offset += UnknownPadding(LogAlign, KnownBits); - - // Then align the result. - return RoundUpToAlignment(Offset, 1u << LogAlign); -} - namespace { /// ARMConstantIslands - Due to limited PC-relative displacements, ARM /// requires constant pool entries to be scattered among the instructions @@ -109,7 +88,12 @@ namespace { /// Offset - Distance from the beginning of the function to the beginning /// of this basic block. /// - /// The offset is always aligned as required by the basic block. + /// Offsets are computed assuming worst case padding before an aligned + /// block. This means that subtracting basic block offsets always gives a + /// conservative estimate of the real distance which may be smaller. + /// + /// Because worst case padding is used, the computed offset of an aligned + /// block may not actually be aligned. unsigned Offset; /// Size - Size of the basic block in bytes. If the block contains @@ -140,7 +124,12 @@ namespace { /// This number should be used to predict worst case padding when /// splitting the block. unsigned internalKnownBits() const { - return Unalign ? Unalign : KnownBits; + unsigned Bits = Unalign ? Unalign : KnownBits; + // If the block size isn't a multiple of the known bits, assume the + // worst case padding. + if (Size & ((1u << Bits) - 1)) + Bits = CountTrailingZeros_32(Size); + return Bits; } /// Compute the offset immediately following this block. If LogAlign is @@ -152,7 +141,7 @@ namespace { if (!LA) return PO; // Add alignment padding from the terminator. - return WorstCaseAlign(PO, LA, internalKnownBits()); + return PO + UnknownPadding(LA, internalKnownBits()); } /// Compute the number of known low bits of postOffset. If this block @@ -342,9 +331,7 @@ void ARMConstantIslands::verify() { for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); MBBI != E; ++MBBI) { MachineBasicBlock *MBB = MBBI; - unsigned Align = MBB->getAlignment(); unsigned MBBId = MBB->getNumber(); - assert(BBInfo[MBBId].Offset % (1u << Align) == 0); assert(!MBBId || BBInfo[MBBId - 1].postOffset() <= BBInfo[MBBId].Offset); } DEBUG(dbgs() << "Verifying " << CPUsers.size() << " CP users.\n"); @@ -428,7 +415,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { // ARM and Thumb2 functions need to be 4-byte aligned. if (!isThumb1) - MF->EnsureAlignment(2); // 2 = log2(4) + MF->ensureAlignment(2); // 2 = log2(4) // Perform the initial placement of the constant pool entries. To start with, // we put them all at the end of the function. @@ -529,7 +516,7 @@ ARMConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) { // The function needs to be as aligned as the basic blocks. The linker may // move functions around based on their alignment. - MF->EnsureAlignment(BB->getAlignment()); + MF->ensureAlignment(BB->getAlignment()); // Order the entries in BB by descending alignment. That ensures correct // alignment of all entries as long as BB is sufficiently aligned. Keep @@ -828,7 +815,7 @@ void ARMConstantIslands::computeBlockSize(MachineBasicBlock *MBB) { // tBR_JTr contains a .align 2 directive. 
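
A recurring theme in these constant-island hunks: postOffset() no longer rounds offsets up to the alignment boundary, it only adds the worst-case padding implied by the unknown low offset bits, so block offsets become conservative upper bounds, which is why the exact-alignment assertions are dropped from verify() above and, below, from isCPEntryInRange. Only the closing return of UnknownPadding is visible in this diff; a plausible reconstruction of the full definition, offered as an assumption rather than a quotation from the tree:

    // Worst-case padding required to reach a 1 << LogAlign boundary when
    // only the low KnownBits bits of the offset are known to be zero. Once
    // at least LogAlign bits are known, no padding can ever be needed.
    static inline unsigned UnknownPadding(unsigned LogAlign, unsigned KnownBits) {
      if (KnownBits < LogAlign)
        return (1u << LogAlign) - (1u << KnownBits);
      return 0;
    }
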
if (!MBB->empty() && MBB->back().getOpcode() == ARM::tBR_JTr) { BBI.PostAlign = 2; - MBB->getParent()->EnsureAlignment(2); + MBB->getParent()->ensureAlignment(2); } } @@ -1045,7 +1032,6 @@ bool ARMConstantIslands::isCPEntryInRange(MachineInstr *MI, unsigned UserOffset, MachineInstr *CPEMI, unsigned MaxDisp, bool NegOk, bool DoDump) { unsigned CPEOffset = getOffsetOf(CPEMI); - assert(CPEOffset % 4 == 0 && "Misaligned CPE"); if (DoDump) { DEBUG({ @@ -1256,11 +1242,8 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, if (BBHasFallthrough(UserMBB)) { // Size of branch to insert. unsigned Delta = isThumb1 ? 2 : 4; - // End of UserBlock after adding a branch. - unsigned UserBlockEnd = UserBBI.postOffset() + Delta; // Compute the offset where the CPE will begin. - unsigned CPEOffset = WorstCaseAlign(UserBlockEnd, CPELogAlign, - UserBBI.postKnownBits()); + unsigned CPEOffset = UserBBI.postOffset(CPELogAlign) + Delta; if (isOffsetInRange(UserOffset, CPEOffset, U)) { DEBUG(dbgs() << "Split at end of BB#" << UserMBB->getNumber() @@ -1299,20 +1282,16 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, // up the insertion point. // Try to split the block so it's fully aligned. Compute the latest split - // point where we can add a 4-byte branch instruction, and then - // WorstCaseAlign to LogAlign. + // point where we can add a 4-byte branch instruction, and then align to + // LogAlign which is the largest possible alignment in the function. unsigned LogAlign = MF->getAlignment(); assert(LogAlign >= CPELogAlign && "Over-aligned constant pool entry"); unsigned KnownBits = UserBBI.internalKnownBits(); unsigned UPad = UnknownPadding(LogAlign, KnownBits); - unsigned BaseInsertOffset = UserOffset + U.getMaxDisp(); + unsigned BaseInsertOffset = UserOffset + U.getMaxDisp() - UPad; DEBUG(dbgs() << format("Split in middle of big block before %#x", BaseInsertOffset)); - // Account for alignment and unknown padding. - BaseInsertOffset &= ~((1u << LogAlign) - 1); - BaseInsertOffset -= UPad; - // The 4 in the following is for the unconditional branch we'll be inserting // (allows for long branch on Thumb1). Alignment of the island is handled // inside isOffsetInRange. @@ -1327,11 +1306,11 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, // pool entries following this block; only the last one is in the water list. // Back past any possible branches (allow for a conditional and a maximally // long unconditional). - if (BaseInsertOffset >= BBInfo[UserMBB->getNumber()+1].Offset) - BaseInsertOffset = BBInfo[UserMBB->getNumber()+1].Offset - - (isThumb1 ? 6 : 8); - unsigned EndInsertOffset = - WorstCaseAlign(BaseInsertOffset + 4, LogAlign, KnownBits) + + if (BaseInsertOffset + 8 >= UserBBI.postOffset()) { + BaseInsertOffset = UserBBI.postOffset() - UPad - 8; + DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset)); + } + unsigned EndInsertOffset = BaseInsertOffset + 4 + UPad + CPEMI->getOperand(2).getImm(); MachineBasicBlock::iterator MI = UserMI; ++MI; @@ -1342,6 +1321,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, Offset < BaseInsertOffset; Offset += TII->GetInstSizeInBytes(MI), MI = llvm::next(MI)) { + assert(MI != UserMBB->end() && "Fell off end of block"); if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == MI) { CPUser &U = CPUsers[CPUIndex]; if (!isOffsetInRange(Offset, EndInsertOffset, U)) { @@ -1353,9 +1333,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, // reused within the block, but it doesn't matter much. 
Also assume CPEs // are added in order with alignment padding. We may eventually be able // to pack the aligned CPEs better. - EndInsertOffset = RoundUpToAlignment(EndInsertOffset, - 1u << getCPELogAlign(U.CPEMI)) + - U.CPEMI->getOperand(2).getImm(); + EndInsertOffset += U.CPEMI->getOperand(2).getImm(); CPUIndex++; } diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 5fc0360..a242b13 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -459,22 +459,23 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) { MIB.addOperand(MI.getOperand(OpIdx++)); bool SrcIsKill = MI.getOperand(OpIdx).isKill(); + bool SrcIsUndef = MI.getOperand(OpIdx).isUndef(); unsigned SrcReg = MI.getOperand(OpIdx++).getReg(); unsigned D0, D1, D2, D3; GetDSubRegs(SrcReg, RegSpc, TRI, D0, D1, D2, D3); - MIB.addReg(D0); + MIB.addReg(D0, getUndefRegState(SrcIsUndef)); if (NumRegs > 1 && TableEntry->copyAllListRegs) - MIB.addReg(D1); + MIB.addReg(D1, getUndefRegState(SrcIsUndef)); if (NumRegs > 2 && TableEntry->copyAllListRegs) - MIB.addReg(D2); + MIB.addReg(D2, getUndefRegState(SrcIsUndef)); if (NumRegs > 3 && TableEntry->copyAllListRegs) - MIB.addReg(D3); + MIB.addReg(D3, getUndefRegState(SrcIsUndef)); // Copy the predicate operands. MIB.addOperand(MI.getOperand(OpIdx++)); MIB.addOperand(MI.getOperand(OpIdx++)); - if (SrcIsKill) // Add an implicit kill for the super-reg. + if (SrcIsKill && !SrcIsUndef) // Add an implicit kill for the super-reg. MIB->addRegisterKilled(SrcReg, TRI, true); TransferImpOps(MI, MIB, MIB); @@ -925,7 +926,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, if (isARM) { AddDefaultPred(MIB3); if (Opcode == ARM::MOV_ga_pcrel_ldr) - MIB2->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MIB3->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); } TransferImpOps(MI, MIB1, MIB3); MI.eraseFromParent(); diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 2e1eaca..b96395f 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -47,11 +47,6 @@ #include "llvm/Target/TargetOptions.h" using namespace llvm; -static cl::opt<bool> -DisableARMFastISel("disable-arm-fast-isel", - cl::desc("Turn off experimental ARM fast-isel support"), - cl::init(false), cl::Hidden); - extern cl::opt<bool> EnableARMLongCalls; namespace { @@ -182,7 +177,6 @@ class ARMFastISel : public FastISel { bool ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr, unsigned Alignment = 0, bool isZExt = true, bool allocReg = true); - bool ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr, unsigned Alignment = 0); bool ARMComputeAddress(const Value *Obj, Address &Addr); @@ -195,21 +189,25 @@ class ARMFastISel : public FastISel { unsigned ARMMaterializeGV(const GlobalValue *GV, EVT VT); unsigned ARMMoveToFPReg(EVT VT, unsigned SrcReg); unsigned ARMMoveToIntReg(EVT VT, unsigned SrcReg); - unsigned ARMSelectCallOp(const GlobalValue *GV); + unsigned ARMSelectCallOp(bool UseReg); // Call handling routines. 
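
One subtlety in the ExpandVST hunk above: when the source register-list operand is undef, the flag has to be forwarded to every D sub-register that replaces it, and the implicit super-register kill has to be suppressed, because an undef operand carries no live value to kill. The guard, isolated (using the names from the hunk):

    // Only mark the super-register killed when it actually carried a value;
    // a kill of an undef register would misinform liveness analysis.
    if (SrcIsKill && !SrcIsUndef)
      MIB->addRegisterKilled(SrcReg, TRI, true);
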
private: - CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool Return); + CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, + bool Return, + bool isVarArg); bool ProcessCallArgs(SmallVectorImpl<Value*> &Args, SmallVectorImpl<unsigned> &ArgRegs, SmallVectorImpl<MVT> &ArgVTs, SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags, SmallVectorImpl<unsigned> &RegArgs, CallingConv::ID CC, - unsigned &NumBytes); + unsigned &NumBytes, + bool isVarArg); + unsigned getLibcallReg(const Twine &Name); bool FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, const Instruction *I, CallingConv::ID CC, - unsigned &NumBytes); + unsigned &NumBytes, bool isVarArg); bool ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call); // OptionalDef handling routines. @@ -719,7 +717,7 @@ unsigned ARMFastISel::TargetMaterializeAlloca(const AllocaInst *AI) { if (!FuncInfo.StaticAllocaMap.count(AI)) return 0; MVT VT; - if (!isLoadTypeLegal(AI->getType(), VT)) return false; + if (!isLoadTypeLegal(AI->getType(), VT)) return 0; DenseMap<const AllocaInst*, int>::iterator SI = FuncInfo.StaticAllocaMap.find(AI); @@ -910,8 +908,9 @@ void ARMFastISel::ARMSimplifyAddress(Address &Addr, EVT VT, bool useAM3) { // put the alloca address into a register, set the base type back to // register and continue. This should almost never happen. if (needsLowering && Addr.BaseType == Address::FrameIndexBase) { - const TargetRegisterClass *RC = isThumb2 ? ARM::tGPRRegisterClass - : ARM::GPRRegisterClass; + const TargetRegisterClass *RC = isThumb2 ? + (const TargetRegisterClass*)&ARM::tGPRRegClass : + (const TargetRegisterClass*)&ARM::GPRRegClass; unsigned ResultReg = createResultReg(RC); unsigned Opc = isThumb2 ? ARM::t2ADDri : ARM::ADDri; AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, @@ -1005,7 +1004,7 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr, useAM3 = true; } } - RC = ARM::GPRRegisterClass; + RC = &ARM::GPRRegClass; break; case MVT::i16: if (isThumb2) { @@ -1017,7 +1016,7 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr, Opc = isZExt ? ARM::LDRH : ARM::LDRSH; useAM3 = true; } - RC = ARM::GPRRegisterClass; + RC = &ARM::GPRRegClass; break; case MVT::i32: if (isThumb2) { @@ -1028,7 +1027,7 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr, } else { Opc = ARM::LDRi12; } - RC = ARM::GPRRegisterClass; + RC = &ARM::GPRRegClass; break; case MVT::f32: if (!Subtarget->hasVFP2()) return false; @@ -1037,7 +1036,7 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr, needVMOV = true; VT = MVT::i32; Opc = isThumb2 ? ARM::t2LDRi12 : ARM::LDRi12; - RC = ARM::GPRRegisterClass; + RC = &ARM::GPRRegClass; } else { Opc = ARM::VLDRS; RC = TLI.getRegClassFor(VT); @@ -1106,8 +1105,9 @@ bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr, // This is mostly going to be Neon/vector support. default: return false; case MVT::i1: { - unsigned Res = createResultReg(isThumb2 ? ARM::tGPRRegisterClass : - ARM::GPRRegisterClass); + unsigned Res = createResultReg(isThumb2 ? + (const TargetRegisterClass*)&ARM::tGPRRegClass : + (const TargetRegisterClass*)&ARM::GPRRegClass); unsigned Opc = isThumb2 ? ARM::t2ANDri : ARM::ANDri; AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), Res) @@ -1358,7 +1358,7 @@ bool ARMFastISel::SelectIndirectBr(const Instruction *I) { unsigned Opc = isThumb2 ? 
ARM::tBRIND : ARM::BX; AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc)) .addReg(AddrReg)); - return true; + return true; } bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, @@ -1423,12 +1423,12 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, if (!UseImm) CmpOpc = ARM::t2CMPrr; else - CmpOpc = isNegativeImm ? ARM::t2CMNzri : ARM::t2CMPri; + CmpOpc = isNegativeImm ? ARM::t2CMNri : ARM::t2CMPri; } else { if (!UseImm) CmpOpc = ARM::CMPrr; else - CmpOpc = isNegativeImm ? ARM::CMNzri : ARM::CMPri; + CmpOpc = isNegativeImm ? ARM::CMNri : ARM::CMPri; } break; } @@ -1491,8 +1491,9 @@ bool ARMFastISel::SelectCmp(const Instruction *I) { // Now set a register based on the comparison. Explicitly set the predicates // here. unsigned MovCCOpc = isThumb2 ? ARM::t2MOVCCi : ARM::MOVCCi; - const TargetRegisterClass *RC = isThumb2 ? ARM::rGPRRegisterClass - : ARM::GPRRegisterClass; + const TargetRegisterClass *RC = isThumb2 ? + (const TargetRegisterClass*)&ARM::rGPRRegClass : + (const TargetRegisterClass*)&ARM::GPRRegClass; unsigned DestReg = createResultReg(RC); Constant *Zero = ConstantInt::get(Type::getInt32Ty(*Context), 0); unsigned ZeroReg = TargetMaterializeConstant(Zero); @@ -1516,7 +1517,7 @@ bool ARMFastISel::SelectFPExt(const Instruction *I) { unsigned Op = getRegForValue(V); if (Op == 0) return false; - unsigned Result = createResultReg(ARM::DPRRegisterClass); + unsigned Result = createResultReg(&ARM::DPRRegClass); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::VCVTDS), Result) .addReg(Op)); @@ -1535,7 +1536,7 @@ bool ARMFastISel::SelectFPTrunc(const Instruction *I) { unsigned Op = getRegForValue(V); if (Op == 0) return false; - unsigned Result = createResultReg(ARM::SPRRegisterClass); + unsigned Result = createResultReg(&ARM::SPRRegClass); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::VCVTSD), Result) .addReg(Op)); @@ -1736,7 +1737,7 @@ bool ARMFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) { // type and the target independent selector doesn't know how to handle it. if (DestVT != MVT::i16 && DestVT != MVT::i8 && DestVT != MVT::i1) return false; - + unsigned Opc; switch (ISDOpcode) { default: return false; @@ -1809,10 +1810,11 @@ bool ARMFastISel::SelectBinaryFPOp(const Instruction *I, unsigned ISDOpcode) { // Call Handling Code -// This is largely taken directly from CCAssignFnForNode - we don't support -// varargs in FastISel so that part has been removed. +// This is largely taken directly from CCAssignFnForNode // TODO: We may not support all of this. -CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC, bool Return) { +CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC, + bool Return, + bool isVarArg) { switch (CC) { default: llvm_unreachable("Unsupported calling convention"); @@ -1825,14 +1827,17 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC, bool Return) { // Use target triple & subtarget features to do actual dispatch. if (Subtarget->isAAPCS_ABI()) { if (Subtarget->hasVFP2() && - TM.Options.FloatABIType == FloatABI::Hard) + TM.Options.FloatABIType == FloatABI::Hard && !isVarArg) return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP); else return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS); } else return (Return ? RetCC_ARM_APCS: CC_ARM_APCS); case CallingConv::ARM_AAPCS_VFP: - return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP); + if (!isVarArg) + return (Return ? 
RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP); + // Fall through to soft float variant, variadic functions don't + // use hard floating point ABI. case CallingConv::ARM_AAPCS: return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS); case CallingConv::ARM_APCS: @@ -1846,10 +1851,12 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args, SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags, SmallVectorImpl<unsigned> &RegArgs, CallingConv::ID CC, - unsigned &NumBytes) { + unsigned &NumBytes, + bool isVarArg) { SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CC, false, *FuncInfo.MF, TM, ArgLocs, *Context); - CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC, false)); + CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, ArgLocs, *Context); + CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, + CCAssignFnForCall(CC, false, isVarArg)); // Check that we can handle all of the arguments. If we can't, then bail out // now before we add code to the MBB. @@ -1981,7 +1988,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args, bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, const Instruction *I, CallingConv::ID CC, - unsigned &NumBytes) { + unsigned &NumBytes, bool isVarArg) { // Issue CALLSEQ_END unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, @@ -1991,8 +1998,8 @@ bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, // Now the return value. if (RetVT != MVT::isVoid) { SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CC, false, *FuncInfo.MF, TM, RVLocs, *Context); - CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true)); + CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, RVLocs, *Context); + CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, isVarArg)); // Copy all of the result registers out of their specified physreg. if (RVLocs.size() == 2 && RetVT == MVT::f64) { @@ -2041,9 +2048,6 @@ bool ARMFastISel::SelectRet(const Instruction *I) { if (!FuncInfo.CanLowerReturn) return false; - if (F.isVarArg()) - return false; - CallingConv::ID CC = F.getCallingConv(); if (Ret->getNumOperands() > 0) { SmallVector<ISD::OutputArg, 4> Outs; @@ -2053,7 +2057,8 @@ bool ARMFastISel::SelectRet(const Instruction *I) { // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ValLocs; CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs,I->getContext()); - CCInfo.AnalyzeReturn(Outs, CCAssignFnForCall(CC, true /* is Ret */)); + CCInfo.AnalyzeReturn(Outs, CCAssignFnForCall(CC, true /* is Ret */, + F.isVarArg())); const Value *RV = Ret->getOperand(0); unsigned Reg = getRegForValue(RV); @@ -2110,12 +2115,17 @@ bool ARMFastISel::SelectRet(const Instruction *I) { return true; } -unsigned ARMFastISel::ARMSelectCallOp(const GlobalValue *GV) { - if (isThumb2) { - return ARM::tBL; - } else { - return ARM::BL; - } +unsigned ARMFastISel::ARMSelectCallOp(bool UseReg) { + if (UseReg) + return isThumb2 ? ARM::tBLXr : ARM::BLX; + else + return isThumb2 ? 
ARM::tBL : ARM::BL; +} + +unsigned ARMFastISel::getLibcallReg(const Twine &Name) { + GlobalValue *GV = new GlobalVariable(Type::getInt32Ty(*Context), false, + GlobalValue::ExternalLinkage, 0, Name); + return ARMMaterializeGV(GV, TLI.getValueType(GV->getType())); } // A quick function that will emit a call for a named libcall in F with the @@ -2136,8 +2146,14 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { else if (!isTypeLegal(RetTy, RetVT)) return false; - // TODO: For now if we have long calls specified we don't handle the call. - if (EnableARMLongCalls) return false; + // Can't handle non-double multi-reg retvals. + if (RetVT != MVT::isVoid && RetVT != MVT::i32) { + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CC, false, *FuncInfo.MF, TM, RVLocs, *Context); + CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, false)); + if (RVLocs.size() >= 2 && RetVT != MVT::f64) + return false; + } // Set up the argument vectors. SmallVector<Value*, 8> Args; @@ -2170,23 +2186,36 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { // Handle the arguments now that we've gotten them. SmallVector<unsigned, 4> RegArgs; unsigned NumBytes; - if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes)) + if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, + RegArgs, CC, NumBytes, false)) return false; + unsigned CalleeReg = 0; + if (EnableARMLongCalls) { + CalleeReg = getLibcallReg(TLI.getLibcallName(Call)); + if (CalleeReg == 0) return false; + } + // Issue the call. - MachineInstrBuilder MIB; - unsigned CallOpc = ARMSelectCallOp(NULL); - if (isThumb2) - // Explicitly adding the predicate here. - MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(CallOpc))) - .addExternalSymbol(TLI.getLibcallName(Call)); - else + unsigned CallOpc = ARMSelectCallOp(EnableARMLongCalls); + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, + DL, TII.get(CallOpc)); + if (isThumb2) { // Explicitly adding the predicate here. - MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(CallOpc)) - .addExternalSymbol(TLI.getLibcallName(Call))); + AddDefaultPred(MIB); + if (EnableARMLongCalls) + MIB.addReg(CalleeReg); + else + MIB.addExternalSymbol(TLI.getLibcallName(Call)); + } else { + if (EnableARMLongCalls) + MIB.addReg(CalleeReg); + else + MIB.addExternalSymbol(TLI.getLibcallName(Call)); + // Explicitly adding the predicate here. + AddDefaultPred(MIB); + } // Add implicit physical register uses to the call. for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) MIB.addReg(RegArgs[i]); @@ -2197,7 +2226,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { // Finish off the call including any return values. SmallVector<unsigned, 4> UsedRegs; - if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes)) return false; + if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes, false)) return false; // Set all unused physreg defs as dead. static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI); @@ -2213,22 +2242,15 @@ bool ARMFastISel::SelectCall(const Instruction *I, // Can't handle inline asm. if (isa<InlineAsm>(Callee)) return false; - // Only handle global variable Callees. - const GlobalValue *GV = dyn_cast<GlobalValue>(Callee); - if (!GV) - return false; - // Check the calling convention. ImmutableCallSite CS(CI); CallingConv::ID CC = CS.getCallingConv(); // TODO: Avoid some calling conventions? - // Let SDISel handle vararg functions. 
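
The calling-convention plumbing threaded through this file hinges on one rule: the VFP hard-float convention never applies to variadic calls, which is why CCAssignFnForCall above falls through from ARM_AAPCS_VFP to the base AAPCS convention when isVarArg is set. Reduced to a predicate (a sketch of the condition as it appears above, not a function in the patch):

    // Hard-float register passing applies only to non-variadic calls on a
    // VFP2-capable AAPCS target built with -float-abi=hard.
    static bool useHardFloatCC(bool HasVFP2, bool HardFloatABI, bool IsVarArg) {
      return HasVFP2 && HardFloatABI && !IsVarArg;
    }
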
PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType()); FunctionType *FTy = cast<FunctionType>(PT->getElementType()); - if (FTy->isVarArg()) - return false; + bool isVarArg = FTy->isVarArg(); // Handle *simple* calls for now. Type *RetTy = I->getType(); @@ -2239,8 +2261,15 @@ bool ARMFastISel::SelectCall(const Instruction *I, RetVT != MVT::i8 && RetVT != MVT::i1) return false; - // TODO: For now if we have long calls specified we don't handle the call. - if (EnableARMLongCalls) return false; + // Can't handle non-double multi-reg retvals. + if (RetVT != MVT::isVoid && RetVT != MVT::i1 && RetVT != MVT::i8 && + RetVT != MVT::i16 && RetVT != MVT::i32) { + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, RVLocs, *Context); + CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, isVarArg)); + if (RVLocs.size() >= 2 && RetVT != MVT::f64) + return false; + } // Set up the argument vectors. SmallVector<Value*, 8> Args; @@ -2295,33 +2324,49 @@ bool ARMFastISel::SelectCall(const Instruction *I, // Handle the arguments now that we've gotten them. SmallVector<unsigned, 4> RegArgs; unsigned NumBytes; - if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes)) + if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, + RegArgs, CC, NumBytes, isVarArg)) return false; + bool UseReg = false; + const GlobalValue *GV = dyn_cast<GlobalValue>(Callee); + if (!GV || EnableARMLongCalls) UseReg = true; + + unsigned CalleeReg = 0; + if (UseReg) { + if (IntrMemName) + CalleeReg = getLibcallReg(IntrMemName); + else + CalleeReg = getRegForValue(Callee); + + if (CalleeReg == 0) return false; + } + // Issue the call. - MachineInstrBuilder MIB; - unsigned CallOpc = ARMSelectCallOp(GV); - // Explicitly adding the predicate here. + unsigned CallOpc = ARMSelectCallOp(UseReg); + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, + DL, TII.get(CallOpc)); if(isThumb2) { // Explicitly adding the predicate here. - MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(CallOpc))); - if (!IntrMemName) + AddDefaultPred(MIB); + if (UseReg) + MIB.addReg(CalleeReg); + else if (!IntrMemName) MIB.addGlobalAddress(GV, 0, 0); - else + else MIB.addExternalSymbol(IntrMemName, 0); } else { - if (!IntrMemName) - // Explicitly adding the predicate here. - MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(CallOpc)) - .addGlobalAddress(GV, 0, 0)); + if (UseReg) + MIB.addReg(CalleeReg); + else if (!IntrMemName) + MIB.addGlobalAddress(GV, 0, 0); else - MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(CallOpc)) - .addExternalSymbol(IntrMemName, 0)); + MIB.addExternalSymbol(IntrMemName, 0); + + // Explicitly adding the predicate here. + AddDefaultPred(MIB); } - + // Add implicit physical register uses to the call. for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) MIB.addReg(RegArgs[i]); @@ -2332,7 +2377,8 @@ bool ARMFastISel::SelectCall(const Instruction *I, // Finish off the call including any return values. SmallVector<unsigned, 4> UsedRegs; - if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes)) return false; + if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes, isVarArg)) + return false; // Set all unused physreg defs as dead. static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI); @@ -2383,6 +2429,42 @@ bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) { // FIXME: Handle more intrinsics. 
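
Both ARMEmitLibcall and SelectCall above now refuse return values that would be split across two or more locations unless the pair is a soft-float f64, which FinishCall already handles as a register pair (the RVLocs.size() == 2 path earlier in this file). The shared guard, restated as a sketch (assuming LLVM's MVT, names hypothetical):

    // A call result is supported if it lands in a single location, or in
    // exactly the f64 register pair that FinishCall merges back together.
    static bool isSupportedCallResult(MVT RetVT, unsigned NumRetLocs) {
      return NumRetLocs < 2 || RetVT == MVT::f64;
    }
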
switch (I.getIntrinsicID()) { default: return false; + case Intrinsic::frameaddress: { + MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo(); + MFI->setFrameAddressIsTaken(true); + + unsigned LdrOpc; + const TargetRegisterClass *RC; + if (isThumb2) { + LdrOpc = ARM::t2LDRi12; + RC = (const TargetRegisterClass*)&ARM::tGPRRegClass; + } else { + LdrOpc = ARM::LDRi12; + RC = (const TargetRegisterClass*)&ARM::GPRRegClass; + } + + const ARMBaseRegisterInfo *RegInfo = + static_cast<const ARMBaseRegisterInfo*>(TM.getRegisterInfo()); + unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF)); + unsigned SrcReg = FramePtr; + + // Recursively load frame address + // ldr r0 [fp] + // ldr r0 [r0] + // ldr r0 [r0] + // ... + unsigned DestReg; + unsigned Depth = cast<ConstantInt>(I.getOperand(0))->getZExtValue(); + while (Depth--) { + DestReg = createResultReg(RC); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(LdrOpc), DestReg) + .addReg(SrcReg).addImm(0)); + SrcReg = DestReg; + } + UpdateValueMap(&I, SrcReg); + return true; + } case Intrinsic::memcpy: case Intrinsic::memmove: { const MemTransferInst &MTI = cast<MemTransferInst>(I); @@ -2406,10 +2488,10 @@ bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) { return true; } } - + if (!MTI.getLength()->getType()->isIntegerTy(32)) return false; - + if (MTI.getSourceAddressSpace() > 255 || MTI.getDestAddressSpace() > 255) return false; @@ -2421,20 +2503,24 @@ bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) { // Don't handle volatile. if (MSI.isVolatile()) return false; - + if (!MSI.getLength()->getType()->isIntegerTy(32)) return false; - + if (MSI.getDestAddressSpace() > 255) return false; - + return SelectCall(&I, "memset"); } + case Intrinsic::trap: { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::TRAP)); + return true; + } } } bool ARMFastISel::SelectTrunc(const Instruction *I) { - // The high bits for a type smaller than the register size are assumed to be + // The high bits for a type smaller than the register size are assumed to be // undefined. Value *Op = I->getOperand(0); @@ -2625,7 +2711,7 @@ bool ARMFastISel::TryToFoldLoad(MachineInstr *MI, unsigned OpNo, // See if we can handle this address. Address Addr; if (!ARMComputeAddress(LI->getOperand(0), Addr)) return false; - + unsigned ResultReg = MI->getOperand(0).getReg(); if (!ARMEmitLoad(VT, ResultReg, Addr, LI->getAlignment(), isZExt, false)) return false; @@ -2640,8 +2726,7 @@ namespace llvm { // Darwin and thumb1 only for now. const ARMSubtarget *Subtarget = &TM.getSubtarget<ARMSubtarget>(); - if (Subtarget->isTargetIOS() && !Subtarget->isThumb1Only() && - !DisableARMFastISel) + if (Subtarget->isTargetIOS() && !Subtarget->isThumb1Only()) return new ARMFastISel(funcInfo); return 0; } diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index 402ecb0..2629496 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -790,7 +790,7 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB, // The writeback is only needed when emitting two vst1.64 instructions. if (NumAlignedDPRCS2Regs >= 6) { unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, - ARM::QQPRRegisterClass); + &ARM::QQPRRegClass); MBB.addLiveIn(SupReg); AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Qwb_fixed), ARM::R4) @@ -808,7 +808,7 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB, // 16-byte aligned vst1.64 with 4 d-regs, no writeback. 
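
Returning to the Intrinsic::frameaddress case added to ARMFastISel above: the selected code is just a chain of loads, one ldr per requested depth, each following the saved frame pointer stored at the address the current frame pointer points to. In C++ terms the emitted sequence computes roughly the following (an illustration of the idea, assuming the usual ARM frame layout where [FP] holds the caller's frame pointer):

    // Hypothetical illustration, not code from the patch.
    void *frameAddress(void *FP, unsigned Depth) {
      while (Depth--)
        FP = *static_cast<void **>(FP); // ldr rD, [rS, #0]
      return FP;
    }
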
if (NumAlignedDPRCS2Regs >= 4) { unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, - ARM::QQPRRegisterClass); + &ARM::QQPRRegClass); MBB.addLiveIn(SupReg); AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Q)) .addReg(ARM::R4).addImm(16).addReg(NextReg) @@ -820,7 +820,7 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB, // 16-byte aligned vst1.64 with 2 d-regs. if (NumAlignedDPRCS2Regs >= 2) { unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, - ARM::QPRRegisterClass); + &ARM::QPRRegClass); MBB.addLiveIn(SupReg); AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1q64)) .addReg(ARM::R4).addImm(16).addReg(SupReg)); @@ -908,7 +908,7 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB, // 16-byte aligned vld1.64 with 4 d-regs and writeback. if (NumAlignedDPRCS2Regs >= 6) { unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, - ARM::QQPRRegisterClass); + &ARM::QQPRRegClass); AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Qwb_fixed), NextReg) .addReg(ARM::R4, RegState::Define) .addReg(ARM::R4, RegState::Kill).addImm(16) @@ -924,7 +924,7 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB, // 16-byte aligned vld1.64 with 4 d-regs, no writeback. if (NumAlignedDPRCS2Regs >= 4) { unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, - ARM::QQPRRegisterClass); + &ARM::QQPRRegClass); AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Q), NextReg) .addReg(ARM::R4).addImm(16) .addReg(SupReg, RegState::ImplicitDefine)); @@ -935,7 +935,7 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB, // 16-byte aligned vld1.64 with 2 d-regs. if (NumAlignedDPRCS2Regs >= 2) { unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, - ARM::QPRRegisterClass); + &ARM::QPRRegClass); AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1q64), SupReg) .addReg(ARM::R4).addImm(16)); NextReg += 2; @@ -1244,7 +1244,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, CanEliminateFrame = false; } - if (!ARM::GPRRegisterClass->contains(Reg)) + if (!ARM::GPRRegClass.contains(Reg)) continue; if (Spilled) { @@ -1404,7 +1404,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, } else if (!AFI->isThumb1OnlyFunction()) { // note: Thumb1 functions spill to R12, not the stack. Reserve a slot // closest to SP or frame pointer. - const TargetRegisterClass *RC = ARM::GPRRegisterClass; + const TargetRegisterClass *RC = &ARM::GPRRegClass; RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), false)); diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 1eafbbc..1953192 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -210,29 +210,29 @@ private: /// loads of D registers and even subregs and odd subregs of Q registers. /// For NumVecs <= 2, QOpcodes1 is not used. SDNode *SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, - unsigned *DOpcodes, - unsigned *QOpcodes0, unsigned *QOpcodes1); + const uint16_t *DOpcodes, + const uint16_t *QOpcodes0, const uint16_t *QOpcodes1); /// SelectVST - Select NEON store intrinsics. NumVecs should /// be 1, 2, 3 or 4. The opcode arrays specify the instructions used for /// stores of D registers and even subregs and odd subregs of Q registers. /// For NumVecs <= 2, QOpcodes1 is not used. 
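
The bulk of the remaining ISel changes, beginning with the SelectVST declaration just below, mechanically convert the per-call opcode arrays from unsigned to static const uint16_t: instruction opcodes fit comfortably in 16 bits, and the static const form turns a stack array rebuilt on every call into a single table emitted once into read-only data. The shape of the change, with made-up values standing in for real opcodes:

    // Before: initialized on the stack at every call, four bytes per entry.
    //   unsigned Opcodes[] = { 0x1a0, 0x1a1, 0x1a2 };
    // After: one shared read-only table, two bytes per entry.
    static const uint16_t Opcodes[] = { 0x1a0, 0x1a1, 0x1a2 };

One stray spot in the arm_neon_vst2 case further down still reads static uint16_t without const, presumably an oversight: every neighboring table is const, and the Select* parameters take const uint16_t *.
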
SDNode *SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, - unsigned *DOpcodes, - unsigned *QOpcodes0, unsigned *QOpcodes1); + const uint16_t *DOpcodes, + const uint16_t *QOpcodes0, const uint16_t *QOpcodes1); /// SelectVLDSTLane - Select NEON load/store lane intrinsics. NumVecs should /// be 2, 3 or 4. The opcode arrays specify the instructions used for /// load/store of D registers and Q registers. SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating, unsigned NumVecs, - unsigned *DOpcodes, unsigned *QOpcodes); + const uint16_t *DOpcodes, const uint16_t *QOpcodes); /// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs /// should be 2, 3 or 4. The opcode array specifies the instructions used /// for loading D registers. (Q registers are not supported.) SDNode *SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs, - unsigned *Opcodes); + const uint16_t *Opcodes); /// SelectVTBL - Select NEON VTBL and VTBX intrinsics. NumVecs should be 2, /// 3 or 4. These are custom-selected so that a REG_SEQUENCE can be @@ -583,8 +583,6 @@ bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, } - - //----- AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N, @@ -1597,8 +1595,9 @@ static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) { } SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, - unsigned *DOpcodes, unsigned *QOpcodes0, - unsigned *QOpcodes1) { + const uint16_t *DOpcodes, + const uint16_t *QOpcodes0, + const uint16_t *QOpcodes1) { assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range"); DebugLoc dl = N->getDebugLoc(); @@ -1729,8 +1728,9 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, } SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, - unsigned *DOpcodes, unsigned *QOpcodes0, - unsigned *QOpcodes1) { + const uint16_t *DOpcodes, + const uint16_t *QOpcodes0, + const uint16_t *QOpcodes1) { assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range"); DebugLoc dl = N->getDebugLoc(); @@ -1875,8 +1875,8 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating, unsigned NumVecs, - unsigned *DOpcodes, - unsigned *QOpcodes) { + const uint16_t *DOpcodes, + const uint16_t *QOpcodes) { assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range"); DebugLoc dl = N->getDebugLoc(); @@ -1994,7 +1994,8 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, } SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, - unsigned NumVecs, unsigned *Opcodes) { + unsigned NumVecs, + const uint16_t *Opcodes) { assert(NumVecs >=2 && NumVecs <= 4 && "VLDDup NumVecs out-of-range"); DebugLoc dl = N->getDebugLoc(); @@ -2893,176 +2894,199 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { } case ARMISD::VLD2DUP: { - unsigned Opcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16, - ARM::VLD2DUPd32 }; + static const uint16_t Opcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16, + ARM::VLD2DUPd32 }; return SelectVLDDup(N, false, 2, Opcodes); } case ARMISD::VLD3DUP: { - unsigned Opcodes[] = { ARM::VLD3DUPd8Pseudo, ARM::VLD3DUPd16Pseudo, - ARM::VLD3DUPd32Pseudo }; + static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo, + ARM::VLD3DUPd16Pseudo, + ARM::VLD3DUPd32Pseudo }; return SelectVLDDup(N, false, 3, Opcodes); } case ARMISD::VLD4DUP: { - unsigned Opcodes[] = { ARM::VLD4DUPd8Pseudo, ARM::VLD4DUPd16Pseudo, - 
ARM::VLD4DUPd32Pseudo }; + static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo, + ARM::VLD4DUPd16Pseudo, + ARM::VLD4DUPd32Pseudo }; return SelectVLDDup(N, false, 4, Opcodes); } case ARMISD::VLD2DUP_UPD: { - unsigned Opcodes[] = { ARM::VLD2DUPd8wb_fixed, ARM::VLD2DUPd16wb_fixed, - ARM::VLD2DUPd32wb_fixed }; + static const uint16_t Opcodes[] = { ARM::VLD2DUPd8wb_fixed, + ARM::VLD2DUPd16wb_fixed, + ARM::VLD2DUPd32wb_fixed }; return SelectVLDDup(N, true, 2, Opcodes); } case ARMISD::VLD3DUP_UPD: { - unsigned Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD, ARM::VLD3DUPd16Pseudo_UPD, - ARM::VLD3DUPd32Pseudo_UPD }; + static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD, + ARM::VLD3DUPd16Pseudo_UPD, + ARM::VLD3DUPd32Pseudo_UPD }; return SelectVLDDup(N, true, 3, Opcodes); } case ARMISD::VLD4DUP_UPD: { - unsigned Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD, ARM::VLD4DUPd16Pseudo_UPD, - ARM::VLD4DUPd32Pseudo_UPD }; + static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD, + ARM::VLD4DUPd16Pseudo_UPD, + ARM::VLD4DUPd32Pseudo_UPD }; return SelectVLDDup(N, true, 4, Opcodes); } case ARMISD::VLD1_UPD: { - unsigned DOpcodes[] = { ARM::VLD1d8wb_fixed, ARM::VLD1d16wb_fixed, - ARM::VLD1d32wb_fixed, ARM::VLD1d64wb_fixed }; - unsigned QOpcodes[] = { ARM::VLD1q8wb_fixed, - ARM::VLD1q16wb_fixed, - ARM::VLD1q32wb_fixed, - ARM::VLD1q64wb_fixed }; + static const uint16_t DOpcodes[] = { ARM::VLD1d8wb_fixed, + ARM::VLD1d16wb_fixed, + ARM::VLD1d32wb_fixed, + ARM::VLD1d64wb_fixed }; + static const uint16_t QOpcodes[] = { ARM::VLD1q8wb_fixed, + ARM::VLD1q16wb_fixed, + ARM::VLD1q32wb_fixed, + ARM::VLD1q64wb_fixed }; return SelectVLD(N, true, 1, DOpcodes, QOpcodes, 0); } case ARMISD::VLD2_UPD: { - unsigned DOpcodes[] = { ARM::VLD2d8wb_fixed, - ARM::VLD2d16wb_fixed, - ARM::VLD2d32wb_fixed, - ARM::VLD1q64wb_fixed}; - unsigned QOpcodes[] = { ARM::VLD2q8PseudoWB_fixed, - ARM::VLD2q16PseudoWB_fixed, - ARM::VLD2q32PseudoWB_fixed }; + static const uint16_t DOpcodes[] = { ARM::VLD2d8wb_fixed, + ARM::VLD2d16wb_fixed, + ARM::VLD2d32wb_fixed, + ARM::VLD1q64wb_fixed}; + static const uint16_t QOpcodes[] = { ARM::VLD2q8PseudoWB_fixed, + ARM::VLD2q16PseudoWB_fixed, + ARM::VLD2q32PseudoWB_fixed }; return SelectVLD(N, true, 2, DOpcodes, QOpcodes, 0); } case ARMISD::VLD3_UPD: { - unsigned DOpcodes[] = { ARM::VLD3d8Pseudo_UPD, ARM::VLD3d16Pseudo_UPD, - ARM::VLD3d32Pseudo_UPD, ARM::VLD1q64wb_fixed}; - unsigned QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD, - ARM::VLD3q16Pseudo_UPD, - ARM::VLD3q32Pseudo_UPD }; - unsigned QOpcodes1[] = { ARM::VLD3q8oddPseudo_UPD, - ARM::VLD3q16oddPseudo_UPD, - ARM::VLD3q32oddPseudo_UPD }; + static const uint16_t DOpcodes[] = { ARM::VLD3d8Pseudo_UPD, + ARM::VLD3d16Pseudo_UPD, + ARM::VLD3d32Pseudo_UPD, + ARM::VLD1q64wb_fixed}; + static const uint16_t QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD, + ARM::VLD3q16Pseudo_UPD, + ARM::VLD3q32Pseudo_UPD }; + static const uint16_t QOpcodes1[] = { ARM::VLD3q8oddPseudo_UPD, + ARM::VLD3q16oddPseudo_UPD, + ARM::VLD3q32oddPseudo_UPD }; return SelectVLD(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1); } case ARMISD::VLD4_UPD: { - unsigned DOpcodes[] = { ARM::VLD4d8Pseudo_UPD, ARM::VLD4d16Pseudo_UPD, - ARM::VLD4d32Pseudo_UPD, ARM::VLD1q64wb_fixed}; - unsigned QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD, - ARM::VLD4q16Pseudo_UPD, - ARM::VLD4q32Pseudo_UPD }; - unsigned QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD, - ARM::VLD4q16oddPseudo_UPD, - ARM::VLD4q32oddPseudo_UPD }; + static const uint16_t DOpcodes[] = { ARM::VLD4d8Pseudo_UPD, + ARM::VLD4d16Pseudo_UPD, + ARM::VLD4d32Pseudo_UPD, + 
ARM::VLD1q64wb_fixed}; + static const uint16_t QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD, + ARM::VLD4q16Pseudo_UPD, + ARM::VLD4q32Pseudo_UPD }; + static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD, + ARM::VLD4q16oddPseudo_UPD, + ARM::VLD4q32oddPseudo_UPD }; return SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); } case ARMISD::VLD2LN_UPD: { - unsigned DOpcodes[] = { ARM::VLD2LNd8Pseudo_UPD, ARM::VLD2LNd16Pseudo_UPD, - ARM::VLD2LNd32Pseudo_UPD }; - unsigned QOpcodes[] = { ARM::VLD2LNq16Pseudo_UPD, - ARM::VLD2LNq32Pseudo_UPD }; + static const uint16_t DOpcodes[] = { ARM::VLD2LNd8Pseudo_UPD, + ARM::VLD2LNd16Pseudo_UPD, + ARM::VLD2LNd32Pseudo_UPD }; + static const uint16_t QOpcodes[] = { ARM::VLD2LNq16Pseudo_UPD, + ARM::VLD2LNq32Pseudo_UPD }; return SelectVLDSTLane(N, true, true, 2, DOpcodes, QOpcodes); } case ARMISD::VLD3LN_UPD: { - unsigned DOpcodes[] = { ARM::VLD3LNd8Pseudo_UPD, ARM::VLD3LNd16Pseudo_UPD, - ARM::VLD3LNd32Pseudo_UPD }; - unsigned QOpcodes[] = { ARM::VLD3LNq16Pseudo_UPD, - ARM::VLD3LNq32Pseudo_UPD }; + static const uint16_t DOpcodes[] = { ARM::VLD3LNd8Pseudo_UPD, + ARM::VLD3LNd16Pseudo_UPD, + ARM::VLD3LNd32Pseudo_UPD }; + static const uint16_t QOpcodes[] = { ARM::VLD3LNq16Pseudo_UPD, + ARM::VLD3LNq32Pseudo_UPD }; return SelectVLDSTLane(N, true, true, 3, DOpcodes, QOpcodes); } case ARMISD::VLD4LN_UPD: { - unsigned DOpcodes[] = { ARM::VLD4LNd8Pseudo_UPD, ARM::VLD4LNd16Pseudo_UPD, - ARM::VLD4LNd32Pseudo_UPD }; - unsigned QOpcodes[] = { ARM::VLD4LNq16Pseudo_UPD, - ARM::VLD4LNq32Pseudo_UPD }; + static const uint16_t DOpcodes[] = { ARM::VLD4LNd8Pseudo_UPD, + ARM::VLD4LNd16Pseudo_UPD, + ARM::VLD4LNd32Pseudo_UPD }; + static const uint16_t QOpcodes[] = { ARM::VLD4LNq16Pseudo_UPD, + ARM::VLD4LNq32Pseudo_UPD }; return SelectVLDSTLane(N, true, true, 4, DOpcodes, QOpcodes); } case ARMISD::VST1_UPD: { - unsigned DOpcodes[] = { ARM::VST1d8wb_fixed, ARM::VST1d16wb_fixed, - ARM::VST1d32wb_fixed, ARM::VST1d64wb_fixed }; - unsigned QOpcodes[] = { ARM::VST1q8wb_fixed, - ARM::VST1q16wb_fixed, - ARM::VST1q32wb_fixed, - ARM::VST1q64wb_fixed }; + static const uint16_t DOpcodes[] = { ARM::VST1d8wb_fixed, + ARM::VST1d16wb_fixed, + ARM::VST1d32wb_fixed, + ARM::VST1d64wb_fixed }; + static const uint16_t QOpcodes[] = { ARM::VST1q8wb_fixed, + ARM::VST1q16wb_fixed, + ARM::VST1q32wb_fixed, + ARM::VST1q64wb_fixed }; return SelectVST(N, true, 1, DOpcodes, QOpcodes, 0); } case ARMISD::VST2_UPD: { - unsigned DOpcodes[] = { ARM::VST2d8wb_fixed, - ARM::VST2d16wb_fixed, - ARM::VST2d32wb_fixed, - ARM::VST1q64wb_fixed}; - unsigned QOpcodes[] = { ARM::VST2q8PseudoWB_fixed, - ARM::VST2q16PseudoWB_fixed, - ARM::VST2q32PseudoWB_fixed }; + static const uint16_t DOpcodes[] = { ARM::VST2d8wb_fixed, + ARM::VST2d16wb_fixed, + ARM::VST2d32wb_fixed, + ARM::VST1q64wb_fixed}; + static const uint16_t QOpcodes[] = { ARM::VST2q8PseudoWB_fixed, + ARM::VST2q16PseudoWB_fixed, + ARM::VST2q32PseudoWB_fixed }; return SelectVST(N, true, 2, DOpcodes, QOpcodes, 0); } case ARMISD::VST3_UPD: { - unsigned DOpcodes[] = { ARM::VST3d8Pseudo_UPD, ARM::VST3d16Pseudo_UPD, - ARM::VST3d32Pseudo_UPD,ARM::VST1d64TPseudoWB_fixed}; - unsigned QOpcodes0[] = { ARM::VST3q8Pseudo_UPD, - ARM::VST3q16Pseudo_UPD, - ARM::VST3q32Pseudo_UPD }; - unsigned QOpcodes1[] = { ARM::VST3q8oddPseudo_UPD, - ARM::VST3q16oddPseudo_UPD, - ARM::VST3q32oddPseudo_UPD }; + static const uint16_t DOpcodes[] = { ARM::VST3d8Pseudo_UPD, + ARM::VST3d16Pseudo_UPD, + ARM::VST3d32Pseudo_UPD, + ARM::VST1d64TPseudoWB_fixed}; + static const uint16_t QOpcodes0[] = { 
ARM::VST3q8Pseudo_UPD, + ARM::VST3q16Pseudo_UPD, + ARM::VST3q32Pseudo_UPD }; + static const uint16_t QOpcodes1[] = { ARM::VST3q8oddPseudo_UPD, + ARM::VST3q16oddPseudo_UPD, + ARM::VST3q32oddPseudo_UPD }; return SelectVST(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1); } case ARMISD::VST4_UPD: { - unsigned DOpcodes[] = { ARM::VST4d8Pseudo_UPD, ARM::VST4d16Pseudo_UPD, - ARM::VST4d32Pseudo_UPD,ARM::VST1d64QPseudoWB_fixed}; - unsigned QOpcodes0[] = { ARM::VST4q8Pseudo_UPD, - ARM::VST4q16Pseudo_UPD, - ARM::VST4q32Pseudo_UPD }; - unsigned QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD, - ARM::VST4q16oddPseudo_UPD, - ARM::VST4q32oddPseudo_UPD }; + static const uint16_t DOpcodes[] = { ARM::VST4d8Pseudo_UPD, + ARM::VST4d16Pseudo_UPD, + ARM::VST4d32Pseudo_UPD, + ARM::VST1d64QPseudoWB_fixed}; + static const uint16_t QOpcodes0[] = { ARM::VST4q8Pseudo_UPD, + ARM::VST4q16Pseudo_UPD, + ARM::VST4q32Pseudo_UPD }; + static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD, + ARM::VST4q16oddPseudo_UPD, + ARM::VST4q32oddPseudo_UPD }; return SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); } case ARMISD::VST2LN_UPD: { - unsigned DOpcodes[] = { ARM::VST2LNd8Pseudo_UPD, ARM::VST2LNd16Pseudo_UPD, - ARM::VST2LNd32Pseudo_UPD }; - unsigned QOpcodes[] = { ARM::VST2LNq16Pseudo_UPD, - ARM::VST2LNq32Pseudo_UPD }; + static const uint16_t DOpcodes[] = { ARM::VST2LNd8Pseudo_UPD, + ARM::VST2LNd16Pseudo_UPD, + ARM::VST2LNd32Pseudo_UPD }; + static const uint16_t QOpcodes[] = { ARM::VST2LNq16Pseudo_UPD, + ARM::VST2LNq32Pseudo_UPD }; return SelectVLDSTLane(N, false, true, 2, DOpcodes, QOpcodes); } case ARMISD::VST3LN_UPD: { - unsigned DOpcodes[] = { ARM::VST3LNd8Pseudo_UPD, ARM::VST3LNd16Pseudo_UPD, - ARM::VST3LNd32Pseudo_UPD }; - unsigned QOpcodes[] = { ARM::VST3LNq16Pseudo_UPD, - ARM::VST3LNq32Pseudo_UPD }; + static const uint16_t DOpcodes[] = { ARM::VST3LNd8Pseudo_UPD, + ARM::VST3LNd16Pseudo_UPD, + ARM::VST3LNd32Pseudo_UPD }; + static const uint16_t QOpcodes[] = { ARM::VST3LNq16Pseudo_UPD, + ARM::VST3LNq32Pseudo_UPD }; return SelectVLDSTLane(N, false, true, 3, DOpcodes, QOpcodes); } case ARMISD::VST4LN_UPD: { - unsigned DOpcodes[] = { ARM::VST4LNd8Pseudo_UPD, ARM::VST4LNd16Pseudo_UPD, - ARM::VST4LNd32Pseudo_UPD }; - unsigned QOpcodes[] = { ARM::VST4LNq16Pseudo_UPD, - ARM::VST4LNq32Pseudo_UPD }; + static const uint16_t DOpcodes[] = { ARM::VST4LNd8Pseudo_UPD, + ARM::VST4LNd16Pseudo_UPD, + ARM::VST4LNd32Pseudo_UPD }; + static const uint16_t QOpcodes[] = { ARM::VST4LNq16Pseudo_UPD, + ARM::VST4LNq32Pseudo_UPD }; return SelectVLDSTLane(N, false, true, 4, DOpcodes, QOpcodes); } @@ -3179,124 +3203,144 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { } case Intrinsic::arm_neon_vld1: { - unsigned DOpcodes[] = { ARM::VLD1d8, ARM::VLD1d16, - ARM::VLD1d32, ARM::VLD1d64 }; - unsigned QOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16, - ARM::VLD1q32, ARM::VLD1q64}; + static const uint16_t DOpcodes[] = { ARM::VLD1d8, ARM::VLD1d16, + ARM::VLD1d32, ARM::VLD1d64 }; + static const uint16_t QOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16, + ARM::VLD1q32, ARM::VLD1q64}; return SelectVLD(N, false, 1, DOpcodes, QOpcodes, 0); } case Intrinsic::arm_neon_vld2: { - unsigned DOpcodes[] = { ARM::VLD2d8, ARM::VLD2d16, - ARM::VLD2d32, ARM::VLD1q64 }; - unsigned QOpcodes[] = { ARM::VLD2q8Pseudo, ARM::VLD2q16Pseudo, - ARM::VLD2q32Pseudo }; + static const uint16_t DOpcodes[] = { ARM::VLD2d8, ARM::VLD2d16, + ARM::VLD2d32, ARM::VLD1q64 }; + static const uint16_t QOpcodes[] = { ARM::VLD2q8Pseudo, ARM::VLD2q16Pseudo, + ARM::VLD2q32Pseudo }; return SelectVLD(N, false, 2, 
DOpcodes, QOpcodes, 0); } case Intrinsic::arm_neon_vld3: { - unsigned DOpcodes[] = { ARM::VLD3d8Pseudo, ARM::VLD3d16Pseudo, - ARM::VLD3d32Pseudo, ARM::VLD1d64TPseudo }; - unsigned QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD, - ARM::VLD3q16Pseudo_UPD, - ARM::VLD3q32Pseudo_UPD }; - unsigned QOpcodes1[] = { ARM::VLD3q8oddPseudo, - ARM::VLD3q16oddPseudo, - ARM::VLD3q32oddPseudo }; + static const uint16_t DOpcodes[] = { ARM::VLD3d8Pseudo, + ARM::VLD3d16Pseudo, + ARM::VLD3d32Pseudo, + ARM::VLD1d64TPseudo }; + static const uint16_t QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD, + ARM::VLD3q16Pseudo_UPD, + ARM::VLD3q32Pseudo_UPD }; + static const uint16_t QOpcodes1[] = { ARM::VLD3q8oddPseudo, + ARM::VLD3q16oddPseudo, + ARM::VLD3q32oddPseudo }; return SelectVLD(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1); } case Intrinsic::arm_neon_vld4: { - unsigned DOpcodes[] = { ARM::VLD4d8Pseudo, ARM::VLD4d16Pseudo, - ARM::VLD4d32Pseudo, ARM::VLD1d64QPseudo }; - unsigned QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD, - ARM::VLD4q16Pseudo_UPD, - ARM::VLD4q32Pseudo_UPD }; - unsigned QOpcodes1[] = { ARM::VLD4q8oddPseudo, - ARM::VLD4q16oddPseudo, - ARM::VLD4q32oddPseudo }; + static const uint16_t DOpcodes[] = { ARM::VLD4d8Pseudo, + ARM::VLD4d16Pseudo, + ARM::VLD4d32Pseudo, + ARM::VLD1d64QPseudo }; + static const uint16_t QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD, + ARM::VLD4q16Pseudo_UPD, + ARM::VLD4q32Pseudo_UPD }; + static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo, + ARM::VLD4q16oddPseudo, + ARM::VLD4q32oddPseudo }; return SelectVLD(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1); } case Intrinsic::arm_neon_vld2lane: { - unsigned DOpcodes[] = { ARM::VLD2LNd8Pseudo, ARM::VLD2LNd16Pseudo, - ARM::VLD2LNd32Pseudo }; - unsigned QOpcodes[] = { ARM::VLD2LNq16Pseudo, ARM::VLD2LNq32Pseudo }; + static const uint16_t DOpcodes[] = { ARM::VLD2LNd8Pseudo, + ARM::VLD2LNd16Pseudo, + ARM::VLD2LNd32Pseudo }; + static const uint16_t QOpcodes[] = { ARM::VLD2LNq16Pseudo, + ARM::VLD2LNq32Pseudo }; return SelectVLDSTLane(N, true, false, 2, DOpcodes, QOpcodes); } case Intrinsic::arm_neon_vld3lane: { - unsigned DOpcodes[] = { ARM::VLD3LNd8Pseudo, ARM::VLD3LNd16Pseudo, - ARM::VLD3LNd32Pseudo }; - unsigned QOpcodes[] = { ARM::VLD3LNq16Pseudo, ARM::VLD3LNq32Pseudo }; + static const uint16_t DOpcodes[] = { ARM::VLD3LNd8Pseudo, + ARM::VLD3LNd16Pseudo, + ARM::VLD3LNd32Pseudo }; + static const uint16_t QOpcodes[] = { ARM::VLD3LNq16Pseudo, + ARM::VLD3LNq32Pseudo }; return SelectVLDSTLane(N, true, false, 3, DOpcodes, QOpcodes); } case Intrinsic::arm_neon_vld4lane: { - unsigned DOpcodes[] = { ARM::VLD4LNd8Pseudo, ARM::VLD4LNd16Pseudo, - ARM::VLD4LNd32Pseudo }; - unsigned QOpcodes[] = { ARM::VLD4LNq16Pseudo, ARM::VLD4LNq32Pseudo }; + static const uint16_t DOpcodes[] = { ARM::VLD4LNd8Pseudo, + ARM::VLD4LNd16Pseudo, + ARM::VLD4LNd32Pseudo }; + static const uint16_t QOpcodes[] = { ARM::VLD4LNq16Pseudo, + ARM::VLD4LNq32Pseudo }; return SelectVLDSTLane(N, true, false, 4, DOpcodes, QOpcodes); } case Intrinsic::arm_neon_vst1: { - unsigned DOpcodes[] = { ARM::VST1d8, ARM::VST1d16, - ARM::VST1d32, ARM::VST1d64 }; - unsigned QOpcodes[] = { ARM::VST1q8, ARM::VST1q16, - ARM::VST1q32, ARM::VST1q64 }; + static const uint16_t DOpcodes[] = { ARM::VST1d8, ARM::VST1d16, + ARM::VST1d32, ARM::VST1d64 }; + static const uint16_t QOpcodes[] = { ARM::VST1q8, ARM::VST1q16, + ARM::VST1q32, ARM::VST1q64 }; return SelectVST(N, false, 1, DOpcodes, QOpcodes, 0); } case Intrinsic::arm_neon_vst2: { - unsigned DOpcodes[] = { ARM::VST2d8, ARM::VST2d16, - ARM::VST2d32, ARM::VST1q64 }; - unsigned 
QOpcodes[] = { ARM::VST2q8Pseudo, ARM::VST2q16Pseudo,
-                            ARM::VST2q32Pseudo };
+    static const uint16_t DOpcodes[] = { ARM::VST2d8, ARM::VST2d16,
+                                         ARM::VST2d32, ARM::VST1q64 };
+    static const uint16_t QOpcodes[] = { ARM::VST2q8Pseudo, ARM::VST2q16Pseudo,
+                                         ARM::VST2q32Pseudo };
     return SelectVST(N, false, 2, DOpcodes, QOpcodes, 0);
   }
   case Intrinsic::arm_neon_vst3: {
-    unsigned DOpcodes[] = { ARM::VST3d8Pseudo, ARM::VST3d16Pseudo,
-                            ARM::VST3d32Pseudo, ARM::VST1d64TPseudo };
-    unsigned QOpcodes0[] = { ARM::VST3q8Pseudo_UPD,
-                             ARM::VST3q16Pseudo_UPD,
-                             ARM::VST3q32Pseudo_UPD };
-    unsigned QOpcodes1[] = { ARM::VST3q8oddPseudo,
-                             ARM::VST3q16oddPseudo,
-                             ARM::VST3q32oddPseudo };
+    static const uint16_t DOpcodes[] = { ARM::VST3d8Pseudo,
+                                         ARM::VST3d16Pseudo,
+                                         ARM::VST3d32Pseudo,
+                                         ARM::VST1d64TPseudo };
+    static const uint16_t QOpcodes0[] = { ARM::VST3q8Pseudo_UPD,
+                                          ARM::VST3q16Pseudo_UPD,
+                                          ARM::VST3q32Pseudo_UPD };
+    static const uint16_t QOpcodes1[] = { ARM::VST3q8oddPseudo,
+                                          ARM::VST3q16oddPseudo,
+                                          ARM::VST3q32oddPseudo };
     return SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
   }
   case Intrinsic::arm_neon_vst4: {
-    unsigned DOpcodes[] = { ARM::VST4d8Pseudo, ARM::VST4d16Pseudo,
-                            ARM::VST4d32Pseudo, ARM::VST1d64QPseudo };
-    unsigned QOpcodes0[] = { ARM::VST4q8Pseudo_UPD,
-                             ARM::VST4q16Pseudo_UPD,
-                             ARM::VST4q32Pseudo_UPD };
-    unsigned QOpcodes1[] = { ARM::VST4q8oddPseudo,
-                             ARM::VST4q16oddPseudo,
-                             ARM::VST4q32oddPseudo };
+    static const uint16_t DOpcodes[] = { ARM::VST4d8Pseudo,
+                                         ARM::VST4d16Pseudo,
+                                         ARM::VST4d32Pseudo,
+                                         ARM::VST1d64QPseudo };
+    static const uint16_t QOpcodes0[] = { ARM::VST4q8Pseudo_UPD,
+                                          ARM::VST4q16Pseudo_UPD,
+                                          ARM::VST4q32Pseudo_UPD };
+    static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo,
+                                          ARM::VST4q16oddPseudo,
+                                          ARM::VST4q32oddPseudo };
     return SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
   }
   case Intrinsic::arm_neon_vst2lane: {
-    unsigned DOpcodes[] = { ARM::VST2LNd8Pseudo, ARM::VST2LNd16Pseudo,
-                            ARM::VST2LNd32Pseudo };
-    unsigned QOpcodes[] = { ARM::VST2LNq16Pseudo, ARM::VST2LNq32Pseudo };
+    static const uint16_t DOpcodes[] = { ARM::VST2LNd8Pseudo,
+                                         ARM::VST2LNd16Pseudo,
+                                         ARM::VST2LNd32Pseudo };
+    static const uint16_t QOpcodes[] = { ARM::VST2LNq16Pseudo,
+                                         ARM::VST2LNq32Pseudo };
     return SelectVLDSTLane(N, false, false, 2, DOpcodes, QOpcodes);
   }
   case Intrinsic::arm_neon_vst3lane: {
-    unsigned DOpcodes[] = { ARM::VST3LNd8Pseudo, ARM::VST3LNd16Pseudo,
-                            ARM::VST3LNd32Pseudo };
-    unsigned QOpcodes[] = { ARM::VST3LNq16Pseudo, ARM::VST3LNq32Pseudo };
+    static const uint16_t DOpcodes[] = { ARM::VST3LNd8Pseudo,
+                                         ARM::VST3LNd16Pseudo,
+                                         ARM::VST3LNd32Pseudo };
+    static const uint16_t QOpcodes[] = { ARM::VST3LNq16Pseudo,
+                                         ARM::VST3LNq32Pseudo };
     return SelectVLDSTLane(N, false, false, 3, DOpcodes, QOpcodes);
   }
   case Intrinsic::arm_neon_vst4lane: {
-    unsigned DOpcodes[] = { ARM::VST4LNd8Pseudo, ARM::VST4LNd16Pseudo,
-                            ARM::VST4LNd32Pseudo };
-    unsigned QOpcodes[] = { ARM::VST4LNq16Pseudo, ARM::VST4LNq32Pseudo };
+    static const uint16_t DOpcodes[] = { ARM::VST4LNd8Pseudo,
+                                         ARM::VST4LNd16Pseudo,
+                                         ARM::VST4LNd32Pseudo };
+    static const uint16_t QOpcodes[] = { ARM::VST4LNq16Pseudo,
+                                         ARM::VST4LNq32Pseudo };
     return SelectVLDSTLane(N, false, false, 4, DOpcodes, QOpcodes);
   }
   }
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index a103c94..04370c0 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -52,6 +52,7 @@ using namespace llvm;
 STATISTIC(NumTailCalls, "Number of tail
calls"); STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt"); +STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments"); // This option should go away when tail calls fully work. static cl::opt<bool> @@ -153,12 +154,12 @@ void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT, } void ARMTargetLowering::addDRTypeForNEON(EVT VT) { - addRegisterClass(VT, ARM::DPRRegisterClass); + addRegisterClass(VT, &ARM::DPRRegClass); addTypeForNEON(VT, MVT::f64, MVT::v2i32); } void ARMTargetLowering::addQRTypeForNEON(EVT VT) { - addRegisterClass(VT, ARM::QPRRegisterClass); + addRegisterClass(VT, &ARM::QPRRegClass); addTypeForNEON(VT, MVT::v2f64, MVT::v4i32); } @@ -431,14 +432,14 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) } if (Subtarget->isThumb1Only()) - addRegisterClass(MVT::i32, ARM::tGPRRegisterClass); + addRegisterClass(MVT::i32, &ARM::tGPRRegClass); else - addRegisterClass(MVT::i32, ARM::GPRRegisterClass); + addRegisterClass(MVT::i32, &ARM::GPRRegClass); if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) { - addRegisterClass(MVT::f32, ARM::SPRRegisterClass); + addRegisterClass(MVT::f32, &ARM::SPRRegClass); if (!Subtarget->isFPOnlySP()) - addRegisterClass(MVT::f64, ARM::DPRRegisterClass); + addRegisterClass(MVT::f64, &ARM::DPRRegClass); setTruncStoreAction(MVT::f64, MVT::f32, Expand); } @@ -824,6 +825,9 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) benefitFromCodePlacementOpt = true; + // Prefer likely predicted branches to selects on out-of-order cores. + predictableSelectIsExpensive = Subtarget->isCortexA9(); + setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2); } @@ -849,7 +853,7 @@ ARMTargetLowering::findRepresentativeClass(EVT VT) const{ // the cost is 1 for both f32 and f64. case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16: case MVT::v2i32: case MVT::v1i64: case MVT::v2f32: - RRC = ARM::DPRRegisterClass; + RRC = &ARM::DPRRegClass; // When NEON is used for SP, only half of the register file is available // because operations that define both SP and DP results will be constrained // to the VFP2 class (D0-D15). We currently model this constraint prior to @@ -859,15 +863,15 @@ ARMTargetLowering::findRepresentativeClass(EVT VT) const{ break; case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: - RRC = ARM::DPRRegisterClass; + RRC = &ARM::DPRRegClass; Cost = 2; break; case MVT::v4i64: - RRC = ARM::DPRRegisterClass; + RRC = &ARM::DPRRegClass; Cost = 4; break; case MVT::v8i64: - RRC = ARM::DPRRegisterClass; + RRC = &ARM::DPRRegClass; Cost = 8; break; } @@ -891,6 +895,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG"; case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD"; case ARMISD::CMP: return "ARMISD::CMP"; + case ARMISD::CMN: return "ARMISD::CMN"; case ARMISD::CMPZ: return "ARMISD::CMPZ"; case ARMISD::CMPFP: return "ARMISD::CMPFP"; case ARMISD::CMPFPw0: return "ARMISD::CMPFPw0"; @@ -1027,9 +1032,9 @@ const TargetRegisterClass *ARMTargetLowering::getRegClassFor(EVT VT) const { // load / store 4 to 8 consecutive D registers. 
if (Subtarget->hasNEON()) { if (VT == MVT::v4i64) - return ARM::QQPRRegisterClass; - else if (VT == MVT::v8i64) - return ARM::QQQQPRRegisterClass; + return &ARM::QQPRRegClass; + if (VT == MVT::v8i64) + return &ARM::QQQQPRRegClass; } return TargetLowering::getRegClassFor(VT); } @@ -1286,14 +1291,20 @@ void ARMTargetLowering::PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG, /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter /// nodes. SDValue -ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, - bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, +ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + DebugLoc &dl = CLI.DL; + SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; + SmallVector<SDValue, 32> &OutVals = CLI.OutVals; + SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &isTailCall = CLI.IsTailCall; + CallingConv::ID CallConv = CLI.CallConv; + bool doesNotRet = CLI.DoesNotReturn; + bool isVarArg = CLI.IsVarArg; + MachineFunction &MF = DAG.getMachineFunction(); bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); bool IsSibCall = false; @@ -1415,21 +1426,22 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, CCInfo.clearFirstByValReg(); } - unsigned LocMemOffset = VA.getLocMemOffset(); - SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset); - SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, - StkPtrOff); - SDValue SrcOffset = DAG.getIntPtrConstant(4*offset); - SDValue Src = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, SrcOffset); - SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, - MVT::i32); - MemOpChains.push_back(DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, - Flags.getByValAlign(), - /*isVolatile=*/false, - /*AlwaysInline=*/false, - MachinePointerInfo(0), - MachinePointerInfo(0))); - + if (Flags.getByValSize() - 4*offset > 0) { + unsigned LocMemOffset = VA.getLocMemOffset(); + SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset); + SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, + StkPtrOff); + SDValue SrcOffset = DAG.getIntPtrConstant(4*offset); + SDValue Src = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, SrcOffset); + SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, + MVT::i32); + SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), MVT::i32); + + SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode}; + MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, + Ops, array_lengthof(Ops))); + } } else if (!IsSibCall) { assert(VA.isMemLoc()); @@ -2095,12 +2107,13 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext()); Args.push_back(Entry); // FIXME: is there useful debug info available here? 
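An aside on the LowerCall change above: it is an instance of the parameter-object pattern, collapsing roughly a dozen positional arguments into one CallLoweringInfo that each target unpacks. A hedged miniature of the idea in plain, self-contained C++ (field names invented; the real struct also carries SDValue chain, callee, and argument vectors):

    struct CallLoweringSketch {
      bool isVarArg = false;
      bool isTailCall = false;
      bool doesNotReturn = false;
      bool isReturnValueUsed = true;
      // ... chain, callee, outgoing/incoming argument lists, convention.
    };
    // Adding a new call property now means extending this struct, not
    // re-threading another parameter through every target's LowerCall.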
- std::pair<SDValue, SDValue> CallResult = - LowerCallTo(Chain, (Type *) Type::getInt32Ty(*DAG.getContext()), + TargetLowering::CallLoweringInfo CLI(Chain, + (Type *) Type::getInt32Ty(*DAG.getContext()), false, false, false, false, 0, CallingConv::C, /*isTailCall=*/false, /*doesNotRet=*/false, /*isReturnValueUsed=*/true, DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG, dl); + std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); return CallResult.first; } @@ -2108,7 +2121,8 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, // "local exec" model. SDValue ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, - SelectionDAG &DAG) const { + SelectionDAG &DAG, + TLSModel::Model model) const { const GlobalValue *GV = GA->getGlobal(); DebugLoc dl = GA->getDebugLoc(); SDValue Offset; @@ -2117,7 +2131,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, // Get the Thread Pointer SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); - if (GV->isDeclaration()) { + if (model == TLSModel::InitialExec) { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); @@ -2142,6 +2156,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, false, false, false, 0); } else { // local exec model + assert(model == TLSModel::LocalExec); ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF); Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); @@ -2162,12 +2177,18 @@ ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetELF() && "TLS not implemented for non-ELF targets"); GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); - // If the relocation model is PIC, use the "General Dynamic" TLS Model, - // otherwise use the "Local Exec" TLS Model - if (getTargetMachine().getRelocationModel() == Reloc::PIC_) - return LowerToTLSGeneralDynamicModel(GA, DAG); - else - return LowerToTLSExecModels(GA, DAG); + + TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal()); + + switch (model) { + case TLSModel::GeneralDynamic: + case TLSModel::LocalDynamic: + return LowerToTLSGeneralDynamicModel(GA, DAG); + case TLSModel::InitialExec: + case TLSModel::LocalExec: + return LowerToTLSExecModels(GA, DAG, model); + } + llvm_unreachable("bogus TLS model"); } SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, @@ -2457,9 +2478,9 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, const TargetRegisterClass *RC; if (AFI->isThumb1OnlyFunction()) - RC = ARM::tGPRRegisterClass; + RC = &ARM::tGPRRegClass; else - RC = ARM::GPRRegisterClass; + RC = &ARM::GPRRegClass; // Transform the arguments stored in physical registers into virtual ones. 
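For orientation, the TLS lowering above now branches on the model returned by getTLSModel() instead of inferring it from GV->isDeclaration(), which also honors an explicit tls_model attribute on the global. The mapping, sketched as self-contained C++ (the enum mirrors llvm::TLSModel; this is not the actual API):

    enum class TLS { GeneralDynamic, LocalDynamic, InitialExec, LocalExec };

    // Dynamic models call __tls_get_addr; exec models add a known (or
    // GOT-loaded) offset to the thread pointer, as in the code above.
    bool usesThreadPointerPath(TLS m) {
      return m == TLS::InitialExec || m == TLS::LocalExec;
    }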
unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); @@ -2543,9 +2564,9 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, for (; firstRegToSaveIndex < 4; ++firstRegToSaveIndex) { const TargetRegisterClass *RC; if (AFI->isThumb1OnlyFunction()) - RC = ARM::tGPRRegisterClass; + RC = &ARM::tGPRRegClass; else - RC = ARM::GPRRegisterClass; + RC = &ARM::GPRRegClass; unsigned VReg = MF.addLiveIn(GPRArgRegs[firstRegToSaveIndex], RC); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); @@ -2627,14 +2648,15 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, const TargetRegisterClass *RC; if (RegVT == MVT::f32) - RC = ARM::SPRRegisterClass; + RC = &ARM::SPRRegClass; else if (RegVT == MVT::f64) - RC = ARM::DPRRegisterClass; + RC = &ARM::DPRRegClass; else if (RegVT == MVT::v2f64) - RC = ARM::QPRRegisterClass; + RC = &ARM::QPRRegClass; else if (RegVT == MVT::i32) - RC = (AFI->isThumb1OnlyFunction() ? - ARM::tGPRRegisterClass : ARM::GPRRegisterClass); + RC = AFI->isThumb1OnlyFunction() ? + (const TargetRegisterClass*)&ARM::tGPRRegClass : + (const TargetRegisterClass*)&ARM::GPRRegClass; else llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); @@ -4791,7 +4813,9 @@ static SDValue SkipExtension(SDNode *N, SelectionDAG &DAG) { for (unsigned i = 0; i != NumElts; ++i) { ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); const APInt &CInt = C->getAPIntValue(); - Ops.push_back(DAG.getConstant(CInt.trunc(EltSize), TruncVT)); + // Element types smaller than 32 bits are not legal, so use i32 elements. + // The values are implicitly truncated so sext vs. zext doesn't matter. + Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32)); } return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), MVT::getVectorVT(TruncVT, NumElts), Ops.data(), NumElts); @@ -5252,14 +5276,14 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI, bool isThumb2 = Subtarget->isThumb2(); MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - unsigned scratch = - MRI.createVirtualRegister(isThumb2 ? ARM::rGPRRegisterClass - : ARM::GPRRegisterClass); + unsigned scratch = MRI.createVirtualRegister(isThumb2 ? + (const TargetRegisterClass*)&ARM::rGPRRegClass : + (const TargetRegisterClass*)&ARM::GPRRegClass); if (isThumb2) { - MRI.constrainRegClass(dest, ARM::rGPRRegisterClass); - MRI.constrainRegClass(oldval, ARM::rGPRRegisterClass); - MRI.constrainRegClass(newval, ARM::rGPRRegisterClass); + MRI.constrainRegClass(dest, &ARM::rGPRRegClass); + MRI.constrainRegClass(oldval, &ARM::rGPRRegClass); + MRI.constrainRegClass(newval, &ARM::rGPRRegClass); } unsigned ldrOpc, strOpc; @@ -5362,8 +5386,8 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); if (isThumb2) { - MRI.constrainRegClass(dest, ARM::rGPRRegisterClass); - MRI.constrainRegClass(ptr, ARM::rGPRRegisterClass); + MRI.constrainRegClass(dest, &ARM::rGPRRegClass); + MRI.constrainRegClass(ptr, &ARM::rGPRRegClass); } unsigned ldrOpc, strOpc; @@ -5394,8 +5418,9 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, BB->end()); exitMBB->transferSuccessorsAndUpdatePHIs(BB); - const TargetRegisterClass *TRC = - isThumb2 ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass; + const TargetRegisterClass *TRC = isThumb2 ? + (const TargetRegisterClass*)&ARM::tGPRRegClass : + (const TargetRegisterClass*)&ARM::GPRRegClass; unsigned scratch = MRI.createVirtualRegister(TRC); unsigned scratch2 = (!BinOpcode) ? 
incr : MRI.createVirtualRegister(TRC); @@ -5469,8 +5494,8 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI, MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); if (isThumb2) { - MRI.constrainRegClass(dest, ARM::rGPRRegisterClass); - MRI.constrainRegClass(ptr, ARM::rGPRRegisterClass); + MRI.constrainRegClass(dest, &ARM::rGPRRegClass); + MRI.constrainRegClass(ptr, &ARM::rGPRRegClass); } unsigned ldrOpc, strOpc, extendOpc; @@ -5504,8 +5529,9 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI, BB->end()); exitMBB->transferSuccessorsAndUpdatePHIs(BB); - const TargetRegisterClass *TRC = - isThumb2 ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass; + const TargetRegisterClass *TRC = isThumb2 ? + (const TargetRegisterClass*)&ARM::tGPRRegClass : + (const TargetRegisterClass*)&ARM::GPRRegClass; unsigned scratch = MRI.createVirtualRegister(TRC); unsigned scratch2 = MRI.createVirtualRegister(TRC); @@ -5531,7 +5557,7 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI, // Sign extend the value, if necessary. if (signExtend && extendOpc) { - oldval = MRI.createVirtualRegister(ARM::GPRRegisterClass); + oldval = MRI.createVirtualRegister(&ARM::GPRRegClass); AddDefaultPred(BuildMI(BB, dl, TII->get(extendOpc), oldval) .addReg(dest) .addImm(0)); @@ -5586,9 +5612,9 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); if (isThumb2) { - MRI.constrainRegClass(destlo, ARM::rGPRRegisterClass); - MRI.constrainRegClass(desthi, ARM::rGPRRegisterClass); - MRI.constrainRegClass(ptr, ARM::rGPRRegisterClass); + MRI.constrainRegClass(destlo, &ARM::rGPRRegClass); + MRI.constrainRegClass(desthi, &ARM::rGPRRegClass); + MRI.constrainRegClass(ptr, &ARM::rGPRRegClass); } unsigned ldrOpc = isThumb2 ? ARM::t2LDREXD : ARM::LDREXD; @@ -5614,8 +5640,9 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, BB->end()); exitMBB->transferSuccessorsAndUpdatePHIs(BB); - const TargetRegisterClass *TRC = - isThumb2 ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass; + const TargetRegisterClass *TRC = isThumb2 ? + (const TargetRegisterClass*)&ARM::tGPRRegClass : + (const TargetRegisterClass*)&ARM::GPRRegClass; unsigned storesuccess = MRI.createVirtualRegister(TRC); // thisMBB: @@ -5722,8 +5749,9 @@ SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB, ARMConstantPoolMBB::Create(F->getContext(), DispatchBB, PCLabelId, PCAdj); unsigned CPI = MCP->getConstantPoolIndex(CPV, 4); - const TargetRegisterClass *TRC = - isThumb ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass; + const TargetRegisterClass *TRC = isThumb ? + (const TargetRegisterClass*)&ARM::tGPRRegClass : + (const TargetRegisterClass*)&ARM::GPRRegClass; // Grab constant pool and fixed stack memory operands. MachineMemOperand *CPMMO = @@ -5827,8 +5855,9 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { MachineFrameInfo *MFI = MF->getFrameInfo(); int FI = MFI->getFunctionContextIndex(); - const TargetRegisterClass *TRC = - Subtarget->isThumb() ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass; + const TargetRegisterClass *TRC = Subtarget->isThumb() ? + (const TargetRegisterClass*)&ARM::tGPRRegClass : + (const TargetRegisterClass*)&ARM::GPRnopcRegClass; // Get a mapping of the call site numbers to all of the landing pads they're // associated with. 
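As a reading aid for the atomic expansions above: EmitAtomicBinaryMinMax builds an ldrex / cmp / predicated-mov / strex retry loop. This standalone C++ sketch has the same semantics, with std::atomic standing in for the exclusive monitor (illustrative only, not the LLVM code):

    #include <algorithm>
    #include <atomic>
    #include <cstdint>

    int32_t atomicMinSketch(std::atomic<int32_t> &mem, int32_t incr) {
      int32_t old = mem.load();                  // ldrex
      for (;;) {
        int32_t val = std::min(old, incr);       // cmp + predicated mov
        if (mem.compare_exchange_weak(old, val)) // strex; loop on failure
          return old;                            // dest receives the old value
      }
    }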
@@ -6176,14 +6205,12 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
       for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
         unsigned Reg = SavedRegs[i];
         if (Subtarget->isThumb2() &&
-            !ARM::tGPRRegisterClass->contains(Reg) &&
-            !ARM::hGPRRegisterClass->contains(Reg))
+            !ARM::tGPRRegClass.contains(Reg) &&
+            !ARM::hGPRRegClass.contains(Reg))
           continue;
-        else if (Subtarget->isThumb1Only() &&
-                 !ARM::tGPRRegisterClass->contains(Reg))
+        if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
           continue;
-        else if (!Subtarget->isThumb() &&
-                 !ARM::GPRRegisterClass->contains(Reg))
+        if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
           continue;
         if (!DefRegs[Reg])
           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
@@ -6214,6 +6241,304 @@ MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
   llvm_unreachable("Expecting a BB with two successors!");
 }
+MachineBasicBlock *ARMTargetLowering::
+EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
+  // This pseudo instruction has 4 operands: dst, src, size, alignment.
+  // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
+  // Otherwise, we will generate unrolled scalar copies.
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  unsigned dest = MI->getOperand(0).getReg();
+  unsigned src = MI->getOperand(1).getReg();
+  unsigned SizeVal = MI->getOperand(2).getImm();
+  unsigned Align = MI->getOperand(3).getImm();
+  DebugLoc dl = MI->getDebugLoc();
+
+  bool isThumb2 = Subtarget->isThumb2();
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  unsigned ldrOpc, strOpc, UnitSize = 0;
+
+  const TargetRegisterClass *TRC = isThumb2 ?
+    (const TargetRegisterClass*)&ARM::tGPRRegClass :
+    (const TargetRegisterClass*)&ARM::GPRRegClass;
+  const TargetRegisterClass *TRC_Vec = 0;
+
+  if (Align & 1) {
+    ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
+    strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
+    UnitSize = 1;
+  } else if (Align & 2) {
+    ldrOpc = isThumb2 ? ARM::t2LDRH_POST : ARM::LDRH_POST;
+    strOpc = isThumb2 ? ARM::t2STRH_POST : ARM::STRH_POST;
+    UnitSize = 2;
+  } else {
+    // Check whether we can use NEON instructions.
+    if (!MF->getFunction()->hasFnAttr(Attribute::NoImplicitFloat) &&
+        Subtarget->hasNEON()) {
+      if ((Align % 16 == 0) && SizeVal >= 16) {
+        ldrOpc = ARM::VLD1q32wb_fixed;
+        strOpc = ARM::VST1q32wb_fixed;
+        UnitSize = 16;
+        TRC_Vec = (const TargetRegisterClass*)&ARM::DPairRegClass;
+      }
+      else if ((Align % 8 == 0) && SizeVal >= 8) {
+        ldrOpc = ARM::VLD1d32wb_fixed;
+        strOpc = ARM::VST1d32wb_fixed;
+        UnitSize = 8;
+        TRC_Vec = (const TargetRegisterClass*)&ARM::DPRRegClass;
+      }
+    }
+    // Can't use NEON instructions.
+    if (UnitSize == 0) {
+      ldrOpc = isThumb2 ? ARM::t2LDR_POST : ARM::LDR_POST_IMM;
+      strOpc = isThumb2 ? ARM::t2STR_POST : ARM::STR_POST_IMM;
+      UnitSize = 4;
+    }
+  }
+
+  unsigned BytesLeft = SizeVal % UnitSize;
+  unsigned LoopSize = SizeVal - BytesLeft;
+
+  if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
+    // Use LDR and STR to copy.
+    // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
+    // [destOut] = STR_POST(scratch, destIn, UnitSize)
+    unsigned srcIn = src;
+    unsigned destIn = dest;
+    for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
+      unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ?
TRC_Vec:TRC);
+      unsigned srcOut = MRI.createVirtualRegister(TRC);
+      unsigned destOut = MRI.createVirtualRegister(TRC);
+      if (UnitSize >= 8) {
+        AddDefaultPred(BuildMI(*BB, MI, dl,
+          TII->get(ldrOpc), scratch)
+          .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(0));
+
+        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
+          .addReg(destIn).addImm(0).addReg(scratch));
+      } else if (isThumb2) {
+        AddDefaultPred(BuildMI(*BB, MI, dl,
+          TII->get(ldrOpc), scratch)
+          .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(UnitSize));
+
+        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
+          .addReg(scratch).addReg(destIn)
+          .addImm(UnitSize));
+      } else {
+        AddDefaultPred(BuildMI(*BB, MI, dl,
+          TII->get(ldrOpc), scratch)
+          .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0)
+          .addImm(UnitSize));
+
+        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
+          .addReg(scratch).addReg(destIn)
+          .addReg(0).addImm(UnitSize));
+      }
+      srcIn = srcOut;
+      destIn = destOut;
+    }
+
+    // Handle the leftover bytes with LDRB and STRB.
+    // [scratch, srcOut] = LDRB_POST(srcIn, 1)
+    // [destOut] = STRB_POST(scratch, destIn, 1)
+    ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
+    strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
+    for (unsigned i = 0; i < BytesLeft; i++) {
+      unsigned scratch = MRI.createVirtualRegister(TRC);
+      unsigned srcOut = MRI.createVirtualRegister(TRC);
+      unsigned destOut = MRI.createVirtualRegister(TRC);
+      if (isThumb2) {
+        AddDefaultPred(BuildMI(*BB, MI, dl,
+          TII->get(ldrOpc),scratch)
+          .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1));
+
+        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
+          .addReg(scratch).addReg(destIn)
+          .addImm(1));
+      } else {
+        AddDefaultPred(BuildMI(*BB, MI, dl,
+          TII->get(ldrOpc),scratch)
+          .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0).addImm(1));
+
+        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
+          .addReg(scratch).addReg(destIn)
+          .addReg(0).addImm(1));
+      }
+      srcIn = srcOut;
+      destIn = destOut;
+    }
+    MI->eraseFromParent();   // The instruction is gone now.
+    return BB;
+  }
+
+  // Expand the pseudo op to a loop.
+  // thisMBB:
+  //   ...
+  //   movw varEnd, # --> with thumb2
+  //   movt varEnd, #
+  //   ldrcp varEnd, idx --> without thumb2
+  //   fallthrough --> loopMBB
+  // loopMBB:
+  //   PHI varPhi, varEnd, varLoop
+  //   PHI srcPhi, src, srcLoop
+  //   PHI destPhi, dst, destLoop
+  //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
+  //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
+  //   subs varLoop, varPhi, #UnitSize
+  //   bne loopMBB
+  //   fallthrough --> exitMBB
+  // exitMBB:
+  //   epilogue to handle left-over bytes
+  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
+  //   [destOut] = STRB_POST(scratch, destLoop, 1)
+  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MF->insert(It, loopMBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Load an immediate to varEnd.
+  unsigned varEnd = MRI.createVirtualRegister(TRC);
+  if (isThumb2) {
+    unsigned VReg1 = varEnd;
+    if ((LoopSize & 0xFFFF0000) != 0)
+      VReg1 = MRI.createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVi16), VReg1)
+                   .addImm(LoopSize & 0xFFFF));
+
+    if ((LoopSize & 0xFFFF0000) != 0)
+      AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVTi16), varEnd)
+                     .addReg(VReg1)
+                     .addImm(LoopSize >> 16));
+  } else {
+    MachineConstantPool *ConstantPool = MF->getConstantPool();
+    Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
+    const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
+
+    // MachineConstantPool wants an explicit alignment.
+    unsigned Align = getTargetData()->getPrefTypeAlignment(Int32Ty);
+    if (Align == 0)
+      Align = getTargetData()->getTypeAllocSize(C->getType());
+    unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
+
+    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDRcp))
+                   .addReg(varEnd, RegState::Define)
+                   .addConstantPoolIndex(Idx)
+                   .addImm(0));
+  }
+  BB->addSuccessor(loopMBB);
+
+  // Generate the loop body:
+  //   varPhi = PHI(varLoop, varEnd)
+  //   srcPhi = PHI(srcLoop, src)
+  //   destPhi = PHI(destLoop, dst)
+  MachineBasicBlock *entryBB = BB;
+  BB = loopMBB;
+  unsigned varLoop = MRI.createVirtualRegister(TRC);
+  unsigned varPhi = MRI.createVirtualRegister(TRC);
+  unsigned srcLoop = MRI.createVirtualRegister(TRC);
+  unsigned srcPhi = MRI.createVirtualRegister(TRC);
+  unsigned destLoop = MRI.createVirtualRegister(TRC);
+  unsigned destPhi = MRI.createVirtualRegister(TRC);
+
+  BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
+    .addReg(varLoop).addMBB(loopMBB)
+    .addReg(varEnd).addMBB(entryBB);
+  BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
+    .addReg(srcLoop).addMBB(loopMBB)
+    .addReg(src).addMBB(entryBB);
+  BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
+    .addReg(destLoop).addMBB(loopMBB)
+    .addReg(dest).addMBB(entryBB);
+
+  // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
+  // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
+  unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? TRC_Vec:TRC);
+  if (UnitSize >= 8) {
+    AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
+      .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(0));
+
+    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
+      .addReg(destPhi).addImm(0).addReg(scratch));
+  } else if (isThumb2) {
+    AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
+      .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(UnitSize));
+
+    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
+      .addReg(scratch).addReg(destPhi)
+      .addImm(UnitSize));
+  } else {
+    AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
+      .addReg(srcLoop, RegState::Define).addReg(srcPhi).addReg(0)
+      .addImm(UnitSize));
+
+    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
+      .addReg(scratch).addReg(destPhi)
+      .addReg(0).addImm(UnitSize));
+  }
+
+  // Decrement loop variable by UnitSize.
+  MachineInstrBuilder MIB = BuildMI(BB, dl,
+    TII->get(isThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
+  AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize)));
+  MIB->getOperand(5).setReg(ARM::CPSR);
+  MIB->getOperand(5).setIsDef(true);
+
+  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+    .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+
+  // loopMBB can loop back to loopMBB or fall through to exitMBB.
+  BB->addSuccessor(loopMBB);
+  BB->addSuccessor(exitMBB);
+
+  // Add epilogue to handle BytesLeft.
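A side note on the trip-count materialization at the top of this block: on Thumb2 any 32-bit constant is built with at most two instructions, movw for the low halfword and movt for the high one, while plain ARM without those instructions falls back to a constant-pool load (LDRcp). A self-contained C++ sketch of the split (illustrative only, not LLVM code):

    #include <cstdint>

    uint32_t movwMovtSketch(uint32_t v) {
      uint32_t lo = v & 0xFFFF; // t2MOVi16: write the low halfword
      uint32_t hi = v >> 16;    // t2MOVTi16: write the high halfword, keep low
      return (hi << 16) | lo;   // same value, built in two steps
    }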
+ BB = exitMBB; + MachineInstr *StartOfExit = exitMBB->begin(); + ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM; + strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM; + + // [scratch, srcOut] = LDRB_POST(srcLoop, 1) + // [destOut] = STRB_POST(scratch, destLoop, 1) + unsigned srcIn = srcLoop; + unsigned destIn = destLoop; + for (unsigned i = 0; i < BytesLeft; i++) { + unsigned scratch = MRI.createVirtualRegister(TRC); + unsigned srcOut = MRI.createVirtualRegister(TRC); + unsigned destOut = MRI.createVirtualRegister(TRC); + if (isThumb2) { + AddDefaultPred(BuildMI(*BB, StartOfExit, dl, + TII->get(ldrOpc),scratch) + .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1)); + + AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut) + .addReg(scratch).addReg(destIn) + .addImm(1)); + } else { + AddDefaultPred(BuildMI(*BB, StartOfExit, dl, + TII->get(ldrOpc),scratch) + .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0).addImm(1)); + + AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut) + .addReg(scratch).addReg(destIn) + .addReg(0).addImm(1)); + } + srcIn = srcOut; + destIn = destOut; + } + + MI->eraseFromParent(); // The instruction is gone now. + return BB; +} + MachineBasicBlock * ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { @@ -6517,10 +6842,9 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineRegisterInfo &MRI = Fn->getRegInfo(); // In Thumb mode S must not be specified if source register is the SP or // PC and if destination register is the SP, so restrict register class - unsigned NewMovDstReg = MRI.createVirtualRegister( - isThumb2 ? ARM::rGPRRegisterClass : ARM::GPRRegisterClass); - unsigned NewRsbDstReg = MRI.createVirtualRegister( - isThumb2 ? ARM::rGPRRegisterClass : ARM::GPRRegisterClass); + unsigned NewRsbDstReg = MRI.createVirtualRegister(isThumb2 ? + (const TargetRegisterClass*)&ARM::rGPRRegClass : + (const TargetRegisterClass*)&ARM::GPRRegClass); // Transfer the remainder of BB and its successor edges to sinkMBB. SinkBB->splice(SinkBB->begin(), BB, @@ -6534,12 +6858,10 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // fall through to SinkMBB RSBBB->addSuccessor(SinkBB); - // insert a movs at the end of BB - BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2MOVr : ARM::MOVr), - NewMovDstReg) - .addReg(ABSSrcReg, RegState::Kill) - .addImm((unsigned)ARMCC::AL).addReg(0) - .addReg(ARM::CPSR, RegState::Define); + // insert a cmp at the end of BB + AddDefaultPred(BuildMI(BB, dl, + TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) + .addReg(ABSSrcReg).addImm(0)); // insert a bcc with opposite CC to ARMCC::MI at the end of BB BuildMI(BB, dl, @@ -6551,7 +6873,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // by if-conversion pass BuildMI(*RSBBB, RSBBB->begin(), dl, TII->get(isThumb2 ? 
ARM::t2RSBri : ARM::RSBri), NewRsbDstReg) - .addReg(NewMovDstReg, RegState::Kill) + .addReg(ABSSrcReg, RegState::Kill) .addImm(0).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0); // insert PHI in SinkBB, @@ -6559,7 +6881,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, BuildMI(*SinkBB, SinkBB->begin(), dl, TII->get(ARM::PHI), ABSDstReg) .addReg(NewRsbDstReg).addMBB(RSBBB) - .addReg(NewMovDstReg).addMBB(BB); + .addReg(ABSSrcReg).addMBB(BB); // remove ABS instruction MI->eraseFromParent(); @@ -6567,6 +6889,9 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // return last added BB return SinkBB; } + case ARM::COPY_STRUCT_BYVAL_I32: + ++NumLoopByVals; + return EmitStructByval(MI, BB); } } @@ -7353,7 +7678,7 @@ static SDValue PerformSTORECombine(SDNode *N, if (St->isVolatile()) return SDValue(); - // Optimize trunc store (of multiple scalars) to shuffle and store. First, + // Optimize trunc store (of multiple scalars) to shuffle and store. First, // pack all of the elements in one place. Next, store to memory in fewer // chunks. SDValue StVal = St->getValue(); @@ -8721,12 +9046,19 @@ bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const { return Imm >= 0 && Imm <= 255; } -/// isLegalAddImmediate - Return true if the specified immediate is legal -/// add immediate, that is the target has add instructions which can add -/// a register with the immediate without having to materialize the +/// isLegalAddImmediate - Return true if the specified immediate is a legal add +/// *or sub* immediate, that is the target has add or sub instructions which can +/// add a register with the immediate without having to materialize the /// immediate into a register. bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const { - return ARM_AM::getSOImmVal(Imm) != -1; + // Same encoding for add/sub, just flip the sign. + int64_t AbsImm = llvm::abs64(Imm); + if (!Subtarget->isThumb()) + return ARM_AM::getSOImmVal(AbsImm) != -1; + if (Subtarget->isThumb2()) + return ARM_AM::getT2SOImmVal(AbsImm) != -1; + // Thumb1 only has 8-bit unsigned immediate. + return AbsImm >= 0 && AbsImm <= 255; } static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, @@ -9030,39 +9362,38 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, switch (Constraint[0]) { case 'l': // Low regs or general regs. if (Subtarget->isThumb()) - return RCPair(0U, ARM::tGPRRegisterClass); - else - return RCPair(0U, ARM::GPRRegisterClass); + return RCPair(0U, &ARM::tGPRRegClass); + return RCPair(0U, &ARM::GPRRegClass); case 'h': // High regs or no regs. 
if (Subtarget->isThumb()) - return RCPair(0U, ARM::hGPRRegisterClass); + return RCPair(0U, &ARM::hGPRRegClass); break; case 'r': - return RCPair(0U, ARM::GPRRegisterClass); + return RCPair(0U, &ARM::GPRRegClass); case 'w': if (VT == MVT::f32) - return RCPair(0U, ARM::SPRRegisterClass); + return RCPair(0U, &ARM::SPRRegClass); if (VT.getSizeInBits() == 64) - return RCPair(0U, ARM::DPRRegisterClass); + return RCPair(0U, &ARM::DPRRegClass); if (VT.getSizeInBits() == 128) - return RCPair(0U, ARM::QPRRegisterClass); + return RCPair(0U, &ARM::QPRRegClass); break; case 'x': if (VT == MVT::f32) - return RCPair(0U, ARM::SPR_8RegisterClass); + return RCPair(0U, &ARM::SPR_8RegClass); if (VT.getSizeInBits() == 64) - return RCPair(0U, ARM::DPR_8RegisterClass); + return RCPair(0U, &ARM::DPR_8RegClass); if (VT.getSizeInBits() == 128) - return RCPair(0U, ARM::QPR_8RegisterClass); + return RCPair(0U, &ARM::QPR_8RegClass); break; case 't': if (VT == MVT::f32) - return RCPair(0U, ARM::SPRRegisterClass); + return RCPair(0U, &ARM::SPRRegClass); break; } } if (StringRef("{cc}").equals_lower(Constraint)) - return std::make_pair(unsigned(ARM::CPSR), ARM::CCRRegisterClass); + return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass); return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); } diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 352d980..7ad48b9 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -41,6 +41,9 @@ namespace llvm { // PIC mode. WrapperJT, // WrapperJT - A wrapper node for TargetJumpTable + // Add pseudo op to model memcpy for struct byval. + COPY_STRUCT_BYVAL, + CALL, // Function call. CALL_PRED, // Function call that's predicable. CALL_NOLINK, // Function call with branch not branch-and-link. @@ -53,6 +56,7 @@ namespace llvm { PIC_ADD, // Add with a PC operand and a PIC label. CMP, // ARM compare instructions. + CMN, // ARM CMN instructions. CMPZ, // ARM compare that sets only Z flag. CMPFP, // ARM VFP compare instruction, sets FPSCR. CMPFPw0, // ARM VFP compare against zero instruction, sets FPSCR. @@ -422,7 +426,8 @@ namespace llvm { SDValue LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG) const; SDValue LowerToTLSExecModels(GlobalAddressSDNode *GA, - SelectionDAG &DAG) const; + SelectionDAG &DAG, + TLSModel::Model model) const; SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; @@ -462,13 +467,7 @@ namespace llvm { unsigned &VARegSize, unsigned &VARegSaveSize) const; virtual SDValue - LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, - bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const; /// HandleByVal - Target-specific cleanup for ByVal support. 
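Before the header hunk that follows (which declares the EmitStructByval hook defined earlier in this patch), a summary of what that expansion computes: pick a unit size from the alignment, copy the bulk with a post-increment loop, and mop up the remainder a byte at a time. A standalone C++ sketch under the same rules (illustrative only; the NEON 8- and 16-byte unit paths are folded into memcpy here):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    void byvalCopySketch(uint8_t *dst, const uint8_t *src, size_t size,
                         unsigned align) {
      // Same unit selection as the expansion: byte, halfword, or word.
      size_t unit = (align & 1) ? 1 : (align & 2) ? 2 : 4;
      size_t bytesLeft = size % unit;
      size_t loopSize = size - bytesLeft;
      for (size_t i = 0; i < loopSize; i += unit) // LDR_POST/STR_POST loop
        std::memcpy(dst + i, src + i, unit);
      for (size_t i = loopSize; i < size; ++i)    // LDRB/STRB epilogue
        dst[i] = src[i];
    }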
@@ -532,6 +531,9 @@ namespace llvm { MachineBasicBlock *MBB) const; bool RemapAddSubWithFlags(MachineInstr *MI, MachineBasicBlock *BB) const; + + MachineBasicBlock *EmitStructByval(MachineInstr *MI, + MachineBasicBlock *MBB) const; }; enum NEONModImmType { diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td index f04926a..c8966fb 100644 --- a/lib/Target/ARM/ARMInstrFormats.td +++ b/lib/Target/ARM/ARMInstrFormats.td @@ -827,6 +827,8 @@ class AExtI<bits<8> opcod, dag oops, dag iops, InstrItinClass itin, let Inst{7-4} = 0b0111; let Inst{9-8} = 0b00; let Inst{27-20} = opcod; + + let Unpredictable{9-8} = 0b11; } // Misc Arithmetic instructions. @@ -1862,7 +1864,6 @@ class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, string opc, string dt, string asm, string cstr, list<dag> pattern> : N3VCommon<op24, op23, op21_20, op11_8, op6, op4, oops, iops, f, itin, opc, dt, asm, cstr, pattern> { - // Instruction operands. bits<5> Vd; bits<5> Vn; diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp index b8f607e..31b0c41 100644 --- a/lib/Target/ARM/ARMInstrInfo.cpp +++ b/lib/Target/ARM/ARMInstrInfo.cpp @@ -31,7 +31,8 @@ ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI) /// getNoopForMachoTarget - Return the noop instruction to use for a noop. void ARMInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { if (hasNOP()) { - NopInst.setOpcode(ARM::NOP); + NopInst.setOpcode(ARM::HINT); + NopInst.addOperand(MCOperand::CreateImm(0)); NopInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); NopInst.addOperand(MCOperand::CreateReg(0)); } else { diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 1eb561d..1b8fc3f 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -18,6 +18,9 @@ // Type profiles. def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>; def SDT_ARMCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>; +def SDT_ARMStructByVal : SDTypeProfile<0, 4, + [SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>]>; def SDT_ARMSaveCallPC : SDTypeProfile<0, 1, []>; @@ -90,6 +93,10 @@ def ARMcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_ARMCallSeqStart, [SDNPHasChain, SDNPOutGlue]>; def ARMcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_ARMCallSeqEnd, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +def ARMcopystructbyval : SDNode<"ARMISD::COPY_STRUCT_BYVAL" , + SDT_ARMStructByVal, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, + SDNPMayStore, SDNPMayLoad]>; def ARMcall : SDNode<"ARMISD::CALL", SDT_ARMcall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, @@ -121,6 +128,9 @@ def ARMBcci64 : SDNode<"ARMISD::BCC_i64", SDT_ARMBCC_i64, def ARMcmp : SDNode<"ARMISD::CMP", SDT_ARMCmp, [SDNPOutGlue]>; +def ARMcmn : SDNode<"ARMISD::CMN", SDT_ARMCmp, + [SDNPOutGlue]>; + def ARMcmpZ : SDNode<"ARMISD::CMPZ", SDT_ARMCmp, [SDNPOutGlue, SDNPCommutative]>; @@ -161,53 +171,59 @@ def ARMbfi : SDNode<"ARMISD::BFI", SDT_ARMBFI>; // ARM Instruction Predicate Definitions. 
// def HasV4T : Predicate<"Subtarget->hasV4TOps()">, - AssemblerPredicate<"HasV4TOps">; + AssemblerPredicate<"HasV4TOps", "armv4t">; def NoV4T : Predicate<"!Subtarget->hasV4TOps()">; def HasV5T : Predicate<"Subtarget->hasV5TOps()">; def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">, - AssemblerPredicate<"HasV5TEOps">; + AssemblerPredicate<"HasV5TEOps", "armv5te">; def HasV6 : Predicate<"Subtarget->hasV6Ops()">, - AssemblerPredicate<"HasV6Ops">; + AssemblerPredicate<"HasV6Ops", "armv6">; def NoV6 : Predicate<"!Subtarget->hasV6Ops()">; def HasV6T2 : Predicate<"Subtarget->hasV6T2Ops()">, - AssemblerPredicate<"HasV6T2Ops">; + AssemblerPredicate<"HasV6T2Ops", "armv6t2">; def NoV6T2 : Predicate<"!Subtarget->hasV6T2Ops()">; def HasV7 : Predicate<"Subtarget->hasV7Ops()">, - AssemblerPredicate<"HasV7Ops">; + AssemblerPredicate<"HasV7Ops", "armv7">; def NoVFP : Predicate<"!Subtarget->hasVFP2()">; def HasVFP2 : Predicate<"Subtarget->hasVFP2()">, - AssemblerPredicate<"FeatureVFP2">; + AssemblerPredicate<"FeatureVFP2", "VFP2">; def HasVFP3 : Predicate<"Subtarget->hasVFP3()">, - AssemblerPredicate<"FeatureVFP3">; + AssemblerPredicate<"FeatureVFP3", "VFP3">; def HasVFP4 : Predicate<"Subtarget->hasVFP4()">, - AssemblerPredicate<"FeatureVFP4">; + AssemblerPredicate<"FeatureVFP4", "VFP4">; def HasNEON : Predicate<"Subtarget->hasNEON()">, - AssemblerPredicate<"FeatureNEON">; + AssemblerPredicate<"FeatureNEON", "NEON">; def HasFP16 : Predicate<"Subtarget->hasFP16()">, - AssemblerPredicate<"FeatureFP16">; + AssemblerPredicate<"FeatureFP16","half-float">; def HasDivide : Predicate<"Subtarget->hasDivide()">, - AssemblerPredicate<"FeatureHWDiv">; + AssemblerPredicate<"FeatureHWDiv", "divide">; def HasT2ExtractPack : Predicate<"Subtarget->hasT2ExtractPack()">, - AssemblerPredicate<"FeatureT2XtPk">; + AssemblerPredicate<"FeatureT2XtPk", + "pack/extract">; def HasThumb2DSP : Predicate<"Subtarget->hasThumb2DSP()">, - AssemblerPredicate<"FeatureDSPThumb2">; + AssemblerPredicate<"FeatureDSPThumb2", + "thumb2-dsp">; def HasDB : Predicate<"Subtarget->hasDataBarrier()">, - AssemblerPredicate<"FeatureDB">; + AssemblerPredicate<"FeatureDB", + "data-barriers">; def HasMP : Predicate<"Subtarget->hasMPExtension()">, - AssemblerPredicate<"FeatureMP">; + AssemblerPredicate<"FeatureMP", + "mp-extensions">; def UseNEONForFP : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">; def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">; def IsThumb : Predicate<"Subtarget->isThumb()">, - AssemblerPredicate<"ModeThumb">; + AssemblerPredicate<"ModeThumb", "thumb">; def IsThumb1Only : Predicate<"Subtarget->isThumb1Only()">; def IsThumb2 : Predicate<"Subtarget->isThumb2()">, - AssemblerPredicate<"ModeThumb,FeatureThumb2">; + AssemblerPredicate<"ModeThumb,FeatureThumb2", + "thumb2">; def IsMClass : Predicate<"Subtarget->isMClass()">, - AssemblerPredicate<"FeatureMClass">; + AssemblerPredicate<"FeatureMClass", "armv7m">; def IsARClass : Predicate<"!Subtarget->isMClass()">, - AssemblerPredicate<"!FeatureMClass">; + AssemblerPredicate<"!FeatureMClass", + "armv7a/r">; def IsARM : Predicate<"!Subtarget->isThumb()">, - AssemblerPredicate<"!ModeThumb">; + AssemblerPredicate<"!ModeThumb", "arm-mode">; def IsIOS : Predicate<"Subtarget->isTargetIOS()">; def IsNotIOS : Predicate<"!Subtarget->isTargetIOS()">; def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">; @@ -220,7 +236,8 @@ def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">; // Prefer fused MAC for fp mul + add over fp VMLA / VMLS if they are available. 
// But only select them if more precision in FP computation is allowed.
 // Do not use them for Darwin platforms.
-def UseFusedMAC : Predicate<"!TM.Options.NoExcessFPPrecision && "
+def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion =="
+                            " FPOpFusion::Fast) && "
                             "!Subtarget->isTargetDarwin()">;
 def DontUseFusedMAC : Predicate<"!Subtarget->hasVFP4() || "
                                 "Subtarget->isTargetDarwin()">;
@@ -236,9 +253,9 @@ class RegConstraint<string C> {
 // ARM specific transformation functions and pattern fragments.
 //
-// so_imm_neg_XFORM - Return a so_imm value packed into the format described for
-// so_imm_neg def below.
-def so_imm_neg_XFORM : SDNodeXForm<imm, [{
+// imm_neg_XFORM - Return an imm value packed into the format described for
+// imm_neg defs below.
+def imm_neg_XFORM : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(-(int)N->getZExtValue(), MVT::i32);
 }]>;
@@ -257,7 +274,7 @@ def so_imm_neg_asmoperand : AsmOperandClass { let Name = "ARMSOImmNeg"; }
 def so_imm_neg : Operand<i32>, PatLeaf<(imm), [{
     int64_t Value = -(int)N->getZExtValue();
     return Value && ARM_AM::getSOImmVal(Value) != -1;
-  }], so_imm_neg_XFORM> {
+  }], imm_neg_XFORM> {
   let ParserMatchClass = so_imm_neg_asmoperand;
 }
@@ -570,7 +587,10 @@ def imm1_31 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 32; }]> {
 }
 /// imm0_15 predicate - Immediate in the range [0,15].
-def Imm0_15AsmOperand: ImmAsmOperand { let Name = "Imm0_15"; }
+def Imm0_15AsmOperand: ImmAsmOperand {
+  let Name = "Imm0_15";
+  let DiagnosticType = "ImmRange0_15";
+}
 def imm0_15 : Operand<i32>, ImmLeaf<i32, [{
   return Imm >= 0 && Imm < 16;
 }]> {
@@ -615,6 +635,11 @@ def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
   let ParserMatchClass = Imm0_65535AsmOperand;
 }
+// imm0_65535_neg - An immediate whose negative value is in the range [0,65535].
+def imm0_65535_neg : Operand<i32>, ImmLeaf<i32, [{
+  return -Imm >= 0 && -Imm < 65536;
+}]>;
+
 // imm0_65535_expr - For movt/movw - 16-bit immediate that can also reference
 // a relocatable expression.
 //
@@ -940,6 +965,7 @@ include "ARMInstrFormats.td"
 /// AsI1_bin_irs - Defines a set of (op r, {so_imm|r|so_reg}) patterns for a
 /// binop that produces a value.
+let TwoOperandAliasConstraint = "$Rn = $Rd" in
 multiclass AsI1_bin_irs<bits<4> opcod, string opc,
                         InstrItinClass iii, InstrItinClass iir,
                         InstrItinClass iis, PatFrag opnode,
                         string baseOpc, bit Commutable = 0> {
@@ -1003,35 +1029,12 @@ multiclass AsI1_bin_irs<bits<4> opcod, string opc,
     let Inst{4} = 1;
     let Inst{3-0} = shift{3-0};
   }
-
-  // Assembly aliases for optional destination operand when it's the same
-  // as the source operand.
-  def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"),
-       (!cast<Instruction>(!strconcat(baseOpc, "ri")) GPR:$Rdn, GPR:$Rdn,
-        so_imm:$imm, pred:$p,
-        cc_out:$s)>,
-       Requires<[IsARM]>;
-  def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $Rm"),
-       (!cast<Instruction>(!strconcat(baseOpc, "rr")) GPR:$Rdn, GPR:$Rdn,
-        GPR:$Rm, pred:$p,
-        cc_out:$s)>,
-       Requires<[IsARM]>;
-  def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"),
-       (!cast<Instruction>(!strconcat(baseOpc, "rsi")) GPR:$Rdn, GPR:$Rdn,
-        so_reg_imm:$shift, pred:$p,
-        cc_out:$s)>,
-       Requires<[IsARM]>;
-  def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"),
-       (!cast<Instruction>(!strconcat(baseOpc, "rsr")) GPR:$Rdn, GPR:$Rdn,
-        so_reg_reg:$shift, pred:$p,
-        cc_out:$s)>,
-       Requires<[IsARM]>;
-
 }
 /// AsI1_rbin_irs - Same as AsI1_bin_irs except the order of operands are
/// reversed.
The 'rr' form is only defined for the disassembler; for codegen /// it is equivalent to the AsI1_bin_irs counterpart. +let TwoOperandAliasConstraint = "$Rn = $Rd" in multiclass AsI1_rbin_irs<bits<4> opcod, string opc, InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, PatFrag opnode, string baseOpc, bit Commutable = 0> { @@ -1094,30 +1097,6 @@ multiclass AsI1_rbin_irs<bits<4> opcod, string opc, let Inst{4} = 1; let Inst{3-0} = shift{3-0}; } - - // Assembly aliases for optional destination operand when it's the same - // as the source operand. - def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"), - (!cast<Instruction>(!strconcat(baseOpc, "ri")) GPR:$Rdn, GPR:$Rdn, - so_imm:$imm, pred:$p, - cc_out:$s)>, - Requires<[IsARM]>; - def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $Rm"), - (!cast<Instruction>(!strconcat(baseOpc, "rr")) GPR:$Rdn, GPR:$Rdn, - GPR:$Rm, pred:$p, - cc_out:$s)>, - Requires<[IsARM]>; - def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"), - (!cast<Instruction>(!strconcat(baseOpc, "rsi")) GPR:$Rdn, GPR:$Rdn, - so_reg_imm:$shift, pred:$p, - cc_out:$s)>, - Requires<[IsARM]>; - def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"), - (!cast<Instruction>(!strconcat(baseOpc, "rsr")) GPR:$Rdn, GPR:$Rdn, - so_reg_reg:$shift, pred:$p, - cc_out:$s)>, - Requires<[IsARM]>; - } /// AsI1_bin_s_irs - Same as AsI1_bin_irs except it sets the 's' bit by default. @@ -1304,6 +1283,7 @@ class AI_exta_rrot_np<bits<8> opcod, string opc> } /// AI1_adde_sube_irs - Define instructions and patterns for adde and sube. +let TwoOperandAliasConstraint = "$Rn = $Rd" in multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, string baseOpc, bit Commutable = 0> { let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in { @@ -1351,7 +1331,8 @@ multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, def rsr : AsI1<opcod, (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift", - [(set GPRnopc:$Rd, CPSR, (opnode GPRnopc:$Rn, so_reg_reg:$shift, CPSR))]>, + [(set GPRnopc:$Rd, CPSR, + (opnode GPRnopc:$Rn, so_reg_reg:$shift, CPSR))]>, Requires<[IsARM]> { bits<4> Rd; bits<4> Rn; @@ -1366,32 +1347,10 @@ multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, let Inst{3-0} = shift{3-0}; } } - - // Assembly aliases for optional destination operand when it's the same - // as the source operand. 
- def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"), - (!cast<Instruction>(!strconcat(baseOpc, "ri")) GPR:$Rdn, GPR:$Rdn, - so_imm:$imm, pred:$p, - cc_out:$s)>, - Requires<[IsARM]>; - def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $Rm"), - (!cast<Instruction>(!strconcat(baseOpc, "rr")) GPR:$Rdn, GPR:$Rdn, - GPR:$Rm, pred:$p, - cc_out:$s)>, - Requires<[IsARM]>; - def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"), - (!cast<Instruction>(!strconcat(baseOpc, "rsi")) GPR:$Rdn, GPR:$Rdn, - so_reg_imm:$shift, pred:$p, - cc_out:$s)>, - Requires<[IsARM]>; - def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"), - (!cast<Instruction>(!strconcat(baseOpc, "rsr")) GPRnopc:$Rdn, GPRnopc:$Rdn, - so_reg_reg:$shift, pred:$p, - cc_out:$s)>, - Requires<[IsARM]>; } /// AI1_rsc_irs - Define instructions and patterns for rsc +let TwoOperandAliasConstraint = "$Rn = $Rd" in multiclass AI1_rsc_irs<bits<4> opcod, string opc, PatFrag opnode, string baseOpc> { let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in { @@ -1450,29 +1409,6 @@ multiclass AI1_rsc_irs<bits<4> opcod, string opc, PatFrag opnode, let Inst{3-0} = shift{3-0}; } } - - // Assembly aliases for optional destination operand when it's the same - // as the source operand. - def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"), - (!cast<Instruction>(!strconcat(baseOpc, "ri")) GPR:$Rdn, GPR:$Rdn, - so_imm:$imm, pred:$p, - cc_out:$s)>, - Requires<[IsARM]>; - def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $Rm"), - (!cast<Instruction>(!strconcat(baseOpc, "rr")) GPR:$Rdn, GPR:$Rdn, - GPR:$Rm, pred:$p, - cc_out:$s)>, - Requires<[IsARM]>; - def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"), - (!cast<Instruction>(!strconcat(baseOpc, "rsi")) GPR:$Rdn, GPR:$Rdn, - so_reg_imm:$shift, pred:$p, - cc_out:$s)>, - Requires<[IsARM]>; - def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"), - (!cast<Instruction>(!strconcat(baseOpc, "rsr")) GPR:$Rdn, GPR:$Rdn, - so_reg_reg:$shift, pred:$p, - cc_out:$s)>, - Requires<[IsARM]>; } let canFoldAsLoad = 1, isReMaterializable = 1 in { @@ -1511,9 +1447,10 @@ multiclass AI_ldr1nopc<bit isByte, string opc, InstrItinClass iii, // Note: We use the complex addrmode_imm12 rather than just an input // GPR and a constrained immediate so that we can use this to match // frame index references and avoid matching constant pool references. 
- def i12: AI2ldst<0b010, 1, isByte, (outs GPRnopc:$Rt), (ins addrmode_imm12:$addr), + def i12: AI2ldst<0b010, 1, isByte, (outs GPRnopc:$Rt), + (ins addrmode_imm12:$addr), AddrMode_i12, LdFrm, iii, opc, "\t$Rt, $addr", - [(set GPRnopc:$Rt, (opnode addrmode_imm12:$addr))]> { + [(set GPRnopc:$Rt, (opnode addrmode_imm12:$addr))]> { bits<4> Rt; bits<17> addr; let Inst{23} = addr{12}; // U (add = ('U' == 1)) @@ -1521,9 +1458,10 @@ multiclass AI_ldr1nopc<bit isByte, string opc, InstrItinClass iii, let Inst{15-12} = Rt; let Inst{11-0} = addr{11-0}; // imm12 } - def rs : AI2ldst<0b011, 1, isByte, (outs GPRnopc:$Rt), (ins ldst_so_reg:$shift), - AddrModeNone, LdFrm, iir, opc, "\t$Rt, $shift", - [(set GPRnopc:$Rt, (opnode ldst_so_reg:$shift))]> { + def rs : AI2ldst<0b011, 1, isByte, (outs GPRnopc:$Rt), + (ins ldst_so_reg:$shift), + AddrModeNone, LdFrm, iir, opc, "\t$Rt, $shift", + [(set GPRnopc:$Rt, (opnode ldst_so_reg:$shift))]> { bits<4> Rt; bits<17> shift; let shift{4} = 0; // Inst{4} = 0 @@ -1581,9 +1519,10 @@ multiclass AI_str1nopc<bit isByte, string opc, InstrItinClass iii, let Inst{15-12} = Rt; let Inst{11-0} = addr{11-0}; // imm12 } - def rs : AI2ldst<0b011, 0, isByte, (outs), (ins GPRnopc:$Rt, ldst_so_reg:$shift), - AddrModeNone, StFrm, iir, opc, "\t$Rt, $shift", - [(opnode GPRnopc:$Rt, ldst_so_reg:$shift)]> { + def rs : AI2ldst<0b011, 0, isByte, (outs), + (ins GPRnopc:$Rt, ldst_so_reg:$shift), + AddrModeNone, StFrm, iir, opc, "\t$Rt, $shift", + [(opnode GPRnopc:$Rt, ldst_so_reg:$shift)]> { bits<4> Rt; bits<17> shift; let shift{4} = 0; // Inst{4} = 0 @@ -1655,33 +1594,18 @@ def ATOMCMPXCHG6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2), NoItinerary, []>; } -def NOP : AI<(outs), (ins), MiscFrm, NoItinerary, "nop", "", []>, - Requires<[IsARM, HasV6T2]> { - let Inst{27-16} = 0b001100100000; - let Inst{15-8} = 0b11110000; - let Inst{7-0} = 0b00000000; +def HINT : AI<(outs), (ins imm0_255:$imm), MiscFrm, NoItinerary, + "hint", "\t$imm", []>, Requires<[IsARM, HasV6]> { + bits<8> imm; + let Inst{27-8} = 0b00110010000011110000; + let Inst{7-0} = imm; } -def YIELD : AI<(outs), (ins), MiscFrm, NoItinerary, "yield", "", []>, - Requires<[IsARM, HasV6T2]> { - let Inst{27-16} = 0b001100100000; - let Inst{15-8} = 0b11110000; - let Inst{7-0} = 0b00000001; -} - -def WFE : AI<(outs), (ins), MiscFrm, NoItinerary, "wfe", "", []>, - Requires<[IsARM, HasV6T2]> { - let Inst{27-16} = 0b001100100000; - let Inst{15-8} = 0b11110000; - let Inst{7-0} = 0b00000010; -} - -def WFI : AI<(outs), (ins), MiscFrm, NoItinerary, "wfi", "", []>, - Requires<[IsARM, HasV6T2]> { - let Inst{27-16} = 0b001100100000; - let Inst{15-8} = 0b11110000; - let Inst{7-0} = 0b00000011; -} +def : InstAlias<"nop$p", (HINT 0, pred:$p)>, Requires<[IsARM, HasV6T2]>; +def : InstAlias<"yield$p", (HINT 1, pred:$p)>, Requires<[IsARM, HasV6T2]>; +def : InstAlias<"wfe$p", (HINT 2, pred:$p)>, Requires<[IsARM, HasV6T2]>; +def : InstAlias<"wfi$p", (HINT 3, pred:$p)>, Requires<[IsARM, HasV6T2]>; +def : InstAlias<"sev$p", (HINT 4, pred:$p)>, Requires<[IsARM, HasV6T2]>; def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel", "\t$Rd, $Rn, $Rm", []>, Requires<[IsARM, HasV6]> { @@ -1694,16 +1618,10 @@ def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel", let Inst{27-20} = 0b01101000; let Inst{7-4} = 0b1011; let Inst{11-8} = 0b1111; + let Unpredictable{11-8} = 0b1111; } -def SEV : AI<(outs), (ins), MiscFrm, NoItinerary, "sev", "", - []>, Requires<[IsARM, HasV6T2]> { - let Inst{27-16} = 0b001100100000; - let Inst{15-8} 
= 0b11110000; - let Inst{7-0} = 0b00000100; -} - -// The i32imm operand $val can be used by a debugger to store more information +// The 16-bit operand $val can be used by a debugger to store more information // about the breakpoint. def BKPT : AI<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary, "bkpt", "\t$val", []>, Requires<[IsARM]> { @@ -1922,7 +1840,7 @@ let isCall = 1, // at least be a pseudo instruction expanding to the predicated version // at MC lowering time. Defs = [LR], Uses = [SP] in { - def BL : ABXI<0b1011, (outs), (ins bl_target:$func, variable_ops), + def BL : ABXI<0b1011, (outs), (ins bl_target:$func), IIC_Br, "bl\t$func", [(ARMcall tglobaladdr:$func)]>, Requires<[IsARM]> { @@ -1932,7 +1850,7 @@ let isCall = 1, let DecoderMethod = "DecodeBranchImmInstruction"; } - def BL_pred : ABI<0b1011, (outs), (ins bl_target:$func, variable_ops), + def BL_pred : ABI<0b1011, (outs), (ins bl_target:$func), IIC_Br, "bl", "\t$func", [(ARMcall_pred tglobaladdr:$func)]>, Requires<[IsARM]> { @@ -1942,7 +1860,7 @@ let isCall = 1, } // ARMv5T and above - def BLX : AXI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm, + def BLX : AXI<(outs), (ins GPR:$func), BrMiscFrm, IIC_Br, "blx\t$func", [(ARMcall GPR:$func)]>, Requires<[IsARM, HasV5T]> { @@ -1951,7 +1869,7 @@ let isCall = 1, let Inst{3-0} = func; } - def BLX_pred : AI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm, + def BLX_pred : AI<(outs), (ins GPR:$func), BrMiscFrm, IIC_Br, "blx", "\t$func", [(ARMcall_pred GPR:$func)]>, Requires<[IsARM, HasV5T]> { @@ -1962,19 +1880,18 @@ let isCall = 1, // ARMv4T // Note: Restrict $func to the tGPR regclass to prevent it being in LR. - def BX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func, variable_ops), + def BX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func), 8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>, Requires<[IsARM, HasV4T]>; // ARMv4 - def BMOVPCRX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func, variable_ops), + def BMOVPCRX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func), 8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>, Requires<[IsARM, NoV4T]>; // mov lr, pc; b if callee is marked noreturn to avoid confusing the // return stack predictor. - def BMOVPCB_CALL : ARMPseudoInst<(outs), - (ins bl_target:$func, variable_ops), + def BMOVPCB_CALL : ARMPseudoInst<(outs), (ins bl_target:$func), 8, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>, Requires<[IsARM]>; } @@ -2044,18 +1961,16 @@ def BXJ : ABI<0b0001, (outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func", // Tail calls. 
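The `variable_ops` deletions in the BL/BLX definitions above, and in the tail-call pseudos that follow, share one rationale: argument registers are attached to a call as explicit register operands when the call is built, so the static operand list no longer needs an open-ended tail. A minimal sketch of the before/after shape, using local stand-in records so it parses with llvm-tblgen on its own (the real `ins` and `variable_ops` markers are defined in llvm/Target/Target.td):

// Stand-ins for this sketch only.
def ins;
def variable_ops;
def func;   // stand-in for the bl_target operand
class CallSketch<dag iops> { dag InOperandList = iops; }
// Before: any number of extra operands may be appended per instance.
def BL_before : CallSketch<(ins func, variable_ops)>;
// After: only the declared operands are accepted; argument registers are
// added as explicit uses when the call node is created.
def BL_after : CallSketch<(ins func)>;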
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in { - def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst, variable_ops), - IIC_Br, []>; + def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst), IIC_Br, []>; - def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst, variable_ops), - IIC_Br, []>; + def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst), IIC_Br, []>; - def TAILJMPd : ARMPseudoExpand<(outs), (ins br_target:$dst, variable_ops), + def TAILJMPd : ARMPseudoExpand<(outs), (ins br_target:$dst), 4, IIC_Br, [], (Bcc br_target:$dst, (ops 14, zero_reg))>, Requires<[IsARM]>; - def TAILJMPr : ARMPseudoExpand<(outs), (ins tcGPR:$dst, variable_ops), + def TAILJMPr : ARMPseudoExpand<(outs), (ins tcGPR:$dst), 4, IIC_Br, [], (BX GPR:$dst)>, Requires<[IsARM]>; @@ -2509,6 +2424,7 @@ multiclass AI2_stridx<bit isByte, string opc, let Inst{23} = offset{12}; let Inst{19-16} = addr; let Inst{11-0} = offset{11-0}; + let Inst{4} = 0; let DecoderMethod = "DecodeAddrMode2IdxInstruction"; } @@ -2768,7 +2684,7 @@ defm STRHT : AI3strT<0b1011, "strht">; multiclass arm_ldst_mult<string asm, string sfx, bit L_bit, bit P_bit, Format f, InstrItinClass itin, InstrItinClass itin_upd> { // IA is the default, so no need for an explicit suffix on the - // mnemonic here. Without it is the cannonical spelling. + // mnemonic here. Without it is the canonical spelling. def IA : AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), IndexModeNone, f, itin, @@ -3163,6 +3079,11 @@ def : ARMPat<(add GPR:$src, so_imm_neg:$imm), def : ARMPat<(ARMaddc GPR:$src, so_imm_neg:$imm), (SUBSri GPR:$src, so_imm_neg:$imm)>; +def : ARMPat<(add GPR:$src, imm0_65535_neg:$imm), + (SUBrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>; +def : ARMPat<(ARMaddc GPR:$src, imm0_65535_neg:$imm), + (SUBSrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>; + // The with-carry-in form matches bitwise not instead of the negation. // Effectively, the inverse interpretation of the carry flag already accounts // for part of the negation. @@ -3190,7 +3111,7 @@ class AAI<bits<8> op27_20, bits<8> op11_4, string opc, let Inst{19-16} = Rn; let Inst{15-12} = Rd; let Inst{3-0} = Rm; - + let Unpredictable{11-8} = 0b1111; } @@ -3482,27 +3403,28 @@ class AsMul1I64<bits<7> opcod, dag oops, dag iops, InstrItinClass itin, // FIXME: The v5 pseudos are only necessary for the additional Constraint // property. Remove them when it's possible to add those properties -// on an individual MachineInstr, not just an instuction description. -let isCommutable = 1 in { -def MUL : AsMul1I32<0b0000000, (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm), - IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm", - [(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))]>, - Requires<[IsARM, HasV6]> { +// on an individual MachineInstr, not just an instruction description. 
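The `let` block that follows introduces `TwoOperandAliasConstraint`, the hook that makes the hand-written two-operand shift and `mul` aliases deleted later in this patch redundant: naming a source operand as tied to the destination lets the generated assembly matcher also accept `mul r0, r1` as shorthand for `mul r0, r0, r1`. A simplified, self-contained sketch of the field (the real declaration lives on `Instruction` in llvm/Target/Target.td; the class and def names here are made up):

// Stand-in class for illustration only.
class InstructionSketch {
  string AsmString = "";
  string TwoOperandAliasConstraint = "";
}
def MUL_sketch : InstructionSketch {
  let AsmString = "mul\t$Rd, $Rn, $Rm";
  // Tying $Rn to $Rd also admits the two-operand spelling "mul $Rd, $Rm".
  let TwoOperandAliasConstraint = "$Rn = $Rd";
}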
+let isCommutable = 1, TwoOperandAliasConstraint = "$Rn = $Rd" in { +def MUL : AsMul1I32<0b0000000, (outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, GPRnopc:$Rm), + IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm", + [(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))]>, + Requires<[IsARM, HasV6]> { let Inst{15-12} = 0b0000; let Unpredictable{15-12} = 0b1111; } let Constraints = "@earlyclobber $Rd" in def MULv5: ARMPseudoExpand<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm, - pred:$p, cc_out:$s), - 4, IIC_iMUL32, - [(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))], - (MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s)>, - Requires<[IsARM, NoV6]>; + pred:$p, cc_out:$s), + 4, IIC_iMUL32, + [(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))], + (MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s)>, + Requires<[IsARM, NoV6]>; } def MLA : AsMul1I32<0b0000001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), - IIC_iMAC32, "mla", "\t$Rd, $Rn, $Rm, $Ra", + IIC_iMAC32, "mla", "\t$Rd, $Rn, $Rm, $Ra", [(set GPR:$Rd, (add (mul GPR:$Rn, GPR:$Rm), GPR:$Ra))]>, Requires<[IsARM, HasV6]> { bits<4> Ra; @@ -3511,8 +3433,8 @@ def MLA : AsMul1I32<0b0000001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), let Constraints = "@earlyclobber $Rd" in def MLAv5: ARMPseudoExpand<(outs GPR:$Rd), - (ins GPR:$Rn, GPR:$Rm, GPR:$Ra, pred:$p, cc_out:$s), - 4, IIC_iMAC32, + (ins GPR:$Rn, GPR:$Rm, GPR:$Ra, pred:$p, cc_out:$s), + 4, IIC_iMAC32, [(set GPR:$Rd, (add (mul GPR:$Rn, GPR:$Rm), GPR:$Ra))], (MLA GPR:$Rd, GPR:$Rn, GPR:$Rm, GPR:$Ra, pred:$p, cc_out:$s)>, Requires<[IsARM, NoV6]>; @@ -3630,8 +3552,7 @@ def SMMLAR : AMul2Ia <0b0111010, 0b0011, (outs GPR:$Rd), def SMMLS : AMul2Ia <0b0111010, 0b1101, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), - IIC_iMAC32, "smmls", "\t$Rd, $Rn, $Rm, $Ra", - [(set GPR:$Rd, (sub GPR:$Ra, (mulhs GPR:$Rn, GPR:$Rm)))]>, + IIC_iMAC32, "smmls", "\t$Rd, $Rn, $Rm, $Ra", []>, Requires<[IsARM, HasV6]>; def SMMLSR : AMul2Ia <0b0111010, 0b1111, (outs GPR:$Rd), @@ -3912,49 +3833,85 @@ def : ARMPat<(ARMcmpZ GPR:$src, so_reg_imm:$rhs), def : ARMPat<(ARMcmpZ GPR:$src, so_reg_reg:$rhs), (CMPrsr GPR:$src, so_reg_reg:$rhs)>; -// FIXME: We have to be careful when using the CMN instruction and comparison -// with 0. One would expect these two pieces of code should give identical -// results: -// -// rsbs r1, r1, 0 -// cmp r0, r1 -// mov r0, #0 -// it ls -// mov r0, #1 -// -// and: -// -// cmn r0, r1 -// mov r0, #0 -// it ls -// mov r0, #1 -// -// However, the CMN gives the *opposite* result when r1 is 0. This is because -// the carry flag is set in the CMP case but not in the CMN case. In short, the -// CMP instruction doesn't perform a truncate of the (logical) NOT of 0 plus the -// value of r0 and the carry bit (because the "carry bit" parameter to -// AddWithCarry is defined as 1 in this case, the carry flag will always be set -// when r0 >= 0). The CMN instruction doesn't perform a NOT of 0 so there is -// never a "carry" when this AddWithCarry is performed (because the "carry bit" -// parameter to AddWithCarry is defined as 0). -// -// When x is 0 and unsigned: -// -// x = 0 -// ~x = 0xFFFF FFFF -// ~x + 1 = 0x1 0000 0000 -// (-x = 0) != (0x1 0000 0000 = ~x + 1) -// -// Therefore, we should disable CMN when comparing against zero, until we can -// limit when the CMN instruction is used (when we know that the RHS is not 0 or -// when it's a comparison which doesn't look at the 'carry' flag). -// -// (See the ARM docs for the "AddWithCarry" pseudo-code.) 
-// -// This is related to <rdar://problem/7569620>. -// -//defm CMN : AI1_cmp_irs<0b1011, "cmn", -// BinOpFrag<(ARMcmp node:$LHS,(ineg node:$RHS))>>; +// CMN register-integer +let isCompare = 1, Defs = [CPSR] in { +def CMNri : AI1<0b1011, (outs), (ins GPR:$Rn, so_imm:$imm), DPFrm, IIC_iCMPi, + "cmn", "\t$Rn, $imm", + [(ARMcmn GPR:$Rn, so_imm:$imm)]> { + bits<4> Rn; + bits<12> imm; + let Inst{25} = 1; + let Inst{20} = 1; + let Inst{19-16} = Rn; + let Inst{15-12} = 0b0000; + let Inst{11-0} = imm; + + let Unpredictable{15-12} = 0b1111; +} + +// CMN register-register/shift +def CMNzrr : AI1<0b1011, (outs), (ins GPR:$Rn, GPR:$Rm), DPFrm, IIC_iCMPr, + "cmn", "\t$Rn, $Rm", + [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))> + GPR:$Rn, GPR:$Rm)]> { + bits<4> Rn; + bits<4> Rm; + let isCommutable = 1; + let Inst{25} = 0; + let Inst{20} = 1; + let Inst{19-16} = Rn; + let Inst{15-12} = 0b0000; + let Inst{11-4} = 0b00000000; + let Inst{3-0} = Rm; + + let Unpredictable{15-12} = 0b1111; +} + +def CMNzrsi : AI1<0b1011, (outs), + (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, IIC_iCMPsr, + "cmn", "\t$Rn, $shift", + [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))> + GPR:$Rn, so_reg_imm:$shift)]> { + bits<4> Rn; + bits<12> shift; + let Inst{25} = 0; + let Inst{20} = 1; + let Inst{19-16} = Rn; + let Inst{15-12} = 0b0000; + let Inst{11-5} = shift{11-5}; + let Inst{4} = 0; + let Inst{3-0} = shift{3-0}; + + let Unpredictable{15-12} = 0b1111; +} + +def CMNzrsr : AI1<0b1011, (outs), + (ins GPRnopc:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, IIC_iCMPsr, + "cmn", "\t$Rn, $shift", + [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))> + GPRnopc:$Rn, so_reg_reg:$shift)]> { + bits<4> Rn; + bits<12> shift; + let Inst{25} = 0; + let Inst{20} = 1; + let Inst{19-16} = Rn; + let Inst{15-12} = 0b0000; + let Inst{11-8} = shift{11-8}; + let Inst{7} = 0; + let Inst{6-5} = shift{6-5}; + let Inst{4} = 1; + let Inst{3-0} = shift{3-0}; + + let Unpredictable{15-12} = 0b1111; +} + +} + +def : ARMPat<(ARMcmp GPR:$src, so_imm_neg:$imm), + (CMNri GPR:$src, so_imm_neg:$imm)>; + +def : ARMPat<(ARMcmpZ GPR:$src, so_imm_neg:$imm), + (CMNri GPR:$src, so_imm_neg:$imm)>; // Note that TST/TEQ don't set all the same flags that CMP does! defm TST : AI1_cmp_irs<0b1000, "tst", @@ -3964,16 +3921,6 @@ defm TEQ : AI1_cmp_irs<0b1001, "teq", IIC_iTSTi, IIC_iTSTr, IIC_iTSTsr, BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>, 1>; -defm CMNz : AI1_cmp_irs<0b1011, "cmn", - IIC_iCMPi, IIC_iCMPr, IIC_iCMPsr, - BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>>; - -//def : ARMPat<(ARMcmp GPR:$src, so_imm_neg:$imm), -// (CMNri GPR:$src, so_imm_neg:$imm)>; - -def : ARMPat<(ARMcmpZ GPR:$src, so_imm_neg:$imm), - (CMNzri GPR:$src, so_imm_neg:$imm)>; - // Pseudo i64 compares for some floating point compares. let usesCustomInserter = 1, isBranch = 1, isTerminator = 1, Defs = [CPSR] in { @@ -4242,6 +4189,13 @@ let usesCustomInserter = 1 in { } } +let usesCustomInserter = 1 in { + def COPY_STRUCT_BYVAL_I32 : PseudoInst< + (outs), (ins GPR:$dst, GPR:$src, i32imm:$size, i32imm:$alignment), + NoItinerary, + [(ARMcopystructbyval GPR:$dst, GPR:$src, imm:$size, imm:$alignment)]>; +} + let mayLoad = 1 in { def LDREXB : AIldrex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr), NoItinerary, @@ -4280,10 +4234,10 @@ def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex", []>, // SWP/SWPB are deprecated in V6/V7. 
let mayLoad = 1, mayStore = 1 in { -def SWP : AIswp<0, (outs GPRnopc:$Rt), (ins GPRnopc:$Rt2, addr_offset_none:$addr), - "swp", []>; -def SWPB: AIswp<1, (outs GPRnopc:$Rt), (ins GPRnopc:$Rt2, addr_offset_none:$addr), - "swpb", []>; +def SWP : AIswp<0, (outs GPRnopc:$Rt), + (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swp", []>; +def SWPB: AIswp<1, (outs GPRnopc:$Rt), + (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swpb", []>; } //===----------------------------------------------------------------------===// @@ -4609,8 +4563,8 @@ class MovRRCopro<string opc, bit direction, list<dag> pattern = []> } def MCRR : MovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */, - [(int_arm_mcrr imm:$cop, imm:$opc1, GPRnopc:$Rt, GPRnopc:$Rt2, - imm:$CRm)]>; + [(int_arm_mcrr imm:$cop, imm:$opc1, GPRnopc:$Rt, + GPRnopc:$Rt2, imm:$CRm)]>; def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */>; class MovRRCopro2<string opc, bit direction, list<dag> pattern = []> @@ -4637,8 +4591,8 @@ class MovRRCopro2<string opc, bit direction, list<dag> pattern = []> } def MCRR2 : MovRRCopro2<"mcrr2", 0 /* from ARM core register to coprocessor */, - [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPRnopc:$Rt, GPRnopc:$Rt2, - imm:$CRm)]>; + [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPRnopc:$Rt, + GPRnopc:$Rt2, imm:$CRm)]>; def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */>; //===----------------------------------------------------------------------===// @@ -4658,7 +4612,8 @@ def MRS : ABI<0b0001, (outs GPRnopc:$Rd), (ins), NoItinerary, let Unpredictable{11-0} = 0b110100001111; } -def : InstAlias<"mrs${p} $Rd, cpsr", (MRS GPRnopc:$Rd, pred:$p)>, Requires<[IsARM]>; +def : InstAlias<"mrs${p} $Rd, cpsr", (MRS GPRnopc:$Rd, pred:$p)>, + Requires<[IsARM]>; // The MRSsys instruction is the MRS instruction from the ARM ARM, // section B9.3.9, with the R bit set to 1. @@ -5114,7 +5069,7 @@ def : ARMInstAlias<"add${s}${p} $Rd, $imm", (SUBri GPR:$Rd, GPR:$Rd, so_imm_neg:$imm, pred:$p, cc_out:$s)>; // Same for CMP <--> CMN via so_imm_neg def : ARMInstAlias<"cmp${p} $Rd, $imm", - (CMNzri rGPR:$Rd, so_imm_neg:$imm, pred:$p)>; + (CMNri rGPR:$Rd, so_imm_neg:$imm, pred:$p)>; def : ARMInstAlias<"cmn${p} $Rd, $imm", (CMPri rGPR:$Rd, so_imm_neg:$imm, pred:$p)>; @@ -5123,6 +5078,7 @@ def : ARMInstAlias<"cmn${p} $Rd, $imm", // FIXME: We need C++ parser hooks to map the alias to the MOV // encoding. It seems we should be able to do that sort of thing // in tblgen, but it could get ugly. +let TwoOperandAliasConstraint = "$Rm = $Rd" in { def ASRi : ARMAsmPseudo<"asr${s}${p} $Rd, $Rm, $imm", (ins GPR:$Rd, GPR:$Rm, imm0_32:$imm, pred:$p, cc_out:$s)>; @@ -5135,8 +5091,10 @@ def LSLi : ARMAsmPseudo<"lsl${s}${p} $Rd, $Rm, $imm", def RORi : ARMAsmPseudo<"ror${s}${p} $Rd, $Rm, $imm", (ins GPR:$Rd, GPR:$Rm, imm0_31:$imm, pred:$p, cc_out:$s)>; +} def RRXi : ARMAsmPseudo<"rrx${s}${p} $Rd, $Rm", (ins GPRnopc:$Rd, GPRnopc:$Rm, pred:$p, cc_out:$s)>; +let TwoOperandAliasConstraint = "$Rn = $Rd" in { def ASRr : ARMAsmPseudo<"asr${s}${p} $Rd, $Rn, $Rm", (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s)>; @@ -5149,32 +5107,7 @@ def LSLr : ARMAsmPseudo<"lsl${s}${p} $Rd, $Rn, $Rm", def RORr : ARMAsmPseudo<"ror${s}${p} $Rd, $Rn, $Rm", (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s)>; -// shifter instructions also support a two-operand form. 
-def : ARMInstAlias<"asr${s}${p} $Rm, $imm", - (ASRi GPR:$Rm, GPR:$Rm, imm0_32:$imm, pred:$p, cc_out:$s)>; -def : ARMInstAlias<"lsr${s}${p} $Rm, $imm", - (LSRi GPR:$Rm, GPR:$Rm, imm0_32:$imm, pred:$p, cc_out:$s)>; -def : ARMInstAlias<"lsl${s}${p} $Rm, $imm", - (LSLi GPR:$Rm, GPR:$Rm, imm0_31:$imm, pred:$p, cc_out:$s)>; -def : ARMInstAlias<"ror${s}${p} $Rm, $imm", - (RORi GPR:$Rm, GPR:$Rm, imm0_31:$imm, pred:$p, cc_out:$s)>; -def : ARMInstAlias<"asr${s}${p} $Rn, $Rm", - (ASRr GPRnopc:$Rn, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, - cc_out:$s)>; -def : ARMInstAlias<"lsr${s}${p} $Rn, $Rm", - (LSRr GPRnopc:$Rn, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, - cc_out:$s)>; -def : ARMInstAlias<"lsl${s}${p} $Rn, $Rm", - (LSLr GPRnopc:$Rn, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, - cc_out:$s)>; -def : ARMInstAlias<"ror${s}${p} $Rn, $Rm", - (RORr GPRnopc:$Rn, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, - cc_out:$s)>; - - -// 'mul' instruction can be specified with only two operands. -def : ARMInstAlias<"mul${s}${p} $Rn, $Rm", - (MUL rGPR:$Rn, rGPR:$Rm, rGPR:$Rn, pred:$p, cc_out:$s)>; +} // "neg" is and alias for "rsb rd, rn, #0" def : ARMInstAlias<"neg${s}${p} $Rd, $Rm", diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index fd8ac0b..d4afa33 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -1962,7 +1962,7 @@ def VST1LNd16 : VST1LN<0b0100, {?,?,0,?}, "16", v4i16, truncstorei16, let Inst{4} = Rn{5}; } -def VST1LNd32 : VST1LN<0b1000, {?,0,?,?}, "32", v2i32, store, extractelt, +def VST1LNd32 : VST1LN<0b1000, {?,0,?,?}, "32", v2i32, store, extractelt, addrmode6oneL32> { let Inst{7} = lane{0}; let Inst{5-4} = Rn{5-4}; @@ -2300,14 +2300,14 @@ class N2VQ<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, class N2VDInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd), (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "", [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>; class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd), (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "", [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>; @@ -2325,7 +2325,7 @@ class N2VN<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, class N2VNInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType TyD, ValueType TyQ, Intrinsic IntOp> + ValueType TyD, ValueType TyQ, SDPatternOperator IntOp> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$Vd), (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "", [(set DPR:$Vd, (TyD (IntOp (TyQ QPR:$Vm))))]>; @@ -2343,7 +2343,7 @@ class N2VL<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, class N2VLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType TyQ, ValueType TyD, Intrinsic IntOp> + ValueType TyQ, ValueType TyD, SDPatternOperator IntOp> : N2V<op24_23, op21_20, 
op19_18, op17_16, op11_7, op6, op4, (outs QPR:$Vd), (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "", [(set QPR:$Vd, (TyQ (IntOp (TyD DPR:$Vm))))]>; @@ -2368,6 +2368,8 @@ class N3VD<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", [(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]> { + // All of these have a two-operand InstAlias. + let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = Commutable; } // Same as N3VD but no data type. @@ -2379,6 +2381,8 @@ class N3VDX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, "$Vd, $Vn, $Vm", "", [(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>{ + // All of these have a two-operand InstAlias. + let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = Commutable; } @@ -2391,6 +2395,8 @@ class N3VDSL<bits<2> op21_20, bits<4> op11_8, [(set (Ty DPR:$Vd), (Ty (ShOp (Ty DPR:$Vn), (Ty (NEONvduplane (Ty DPR_VFP2:$Vm),imm:$lane)))))]> { + // All of these have a two-operand InstAlias. + let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = 0; } class N3VDSL16<bits<2> op21_20, bits<4> op11_8, @@ -2401,6 +2407,8 @@ class N3VDSL16<bits<2> op21_20, bits<4> op11_8, [(set (Ty DPR:$Vd), (Ty (ShOp (Ty DPR:$Vn), (Ty (NEONvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> { + // All of these have a two-operand InstAlias. + let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = 0; } @@ -2411,6 +2419,8 @@ class N3VQ<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", [(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]> { + // All of these have a two-operand InstAlias. + let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = Commutable; } class N3VQX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, @@ -2420,6 +2430,8 @@ class N3VQX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, itin, OpcodeStr, "$Vd, $Vn, $Vm", "", [(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>{ + // All of these have a two-operand InstAlias. + let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = Commutable; } class N3VQSL<bits<2> op21_20, bits<4> op11_8, @@ -2432,6 +2444,8 @@ class N3VQSL<bits<2> op21_20, bits<4> op11_8, (ResTy (ShOp (ResTy QPR:$Vn), (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm), imm:$lane)))))]> { + // All of these have a two-operand InstAlias. + let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = 0; } class N3VQSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt, @@ -2443,21 +2457,25 @@ class N3VQSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt, (ResTy (ShOp (ResTy QPR:$Vn), (ResTy (NEONvduplane (OpTy DPR_8:$Vm), imm:$lane)))))]> { + // All of these have a two-operand InstAlias. + let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = 0; } // Basic 3-register intrinsics, both double- and quad-register. 
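The recurring parameter change in these NEON intrinsic classes, `Intrinsic IntOp` becoming `SDPatternOperator IntOp` (starting with N3VDInt just below), widens the accepted record type so the same class or multiclass can be instantiated with either a target intrinsic or a generic SelectionDAG node; that is what later allows vclz and vcnt to be selected from the generic `ctlz`/`ctpop` nodes instead of `int_arm_neon_vclz`/`int_arm_neon_vcnt`. The class relationship, sketched and much simplified from TargetSelectionDAG.td and Intrinsics.td:

// SDPatternOperator is the common base, so a parameter of that type can be
// bound to either kind of record (bodies omitted; this still parses alone).
class SDPatternOperator;
class SDNode : SDPatternOperator;      // generic nodes, e.g. ctlz, ctpop
class Intrinsic : SDPatternOperator;   // target intrinsics, e.g. int_arm_neon_vqadds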
class N3VDInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, Format f, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp, bit Commutable> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), f, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]> { + // All of these have a two-operand InstAlias. + let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = Commutable; } class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, - string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp> + string OpcodeStr, string Dt, ValueType Ty, SDPatternOperator IntOp> : N3VLane32<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", @@ -2468,7 +2486,7 @@ class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, let isCommutable = 0; } class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, - string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp> + string OpcodeStr, string Dt, ValueType Ty, SDPatternOperator IntOp> : N3VLane16<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", @@ -2479,26 +2497,29 @@ class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, } class N3VDIntSh<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, Format f, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N3V<op24, op23, op21_20, op11_8, 0, op4, (outs DPR:$Vd), (ins DPR:$Vm, DPR:$Vn), f, itin, OpcodeStr, Dt, "$Vd, $Vm, $Vn", "", [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm), (OpTy DPR:$Vn))))]> { + let TwoOperandAliasConstraint = "$Vm = $Vd"; let isCommutable = 0; } class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, Format f, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp, bit Commutable> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 1, op4, (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), f, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]> { + // All of these have a two-operand InstAlias. 
+ let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = Commutable; } class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N3VLane32<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", @@ -2510,7 +2531,7 @@ class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, } class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N3VLane16<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", @@ -2522,11 +2543,12 @@ class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, } class N3VQIntSh<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, Format f, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N3V<op24, op23, op21_20, op11_8, 1, op4, (outs QPR:$Vd), (ins QPR:$Vm, QPR:$Vn), f, itin, OpcodeStr, Dt, "$Vd, $Vm, $Vn", "", [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm), (OpTy QPR:$Vn))))]> { + let TwoOperandAliasConstraint = "$Vm = $Vd"; let isCommutable = 0; } @@ -2606,7 +2628,7 @@ class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, // Neon Intrinsic-Op instructions (VABA): double- and quad-register. class N3VDIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType Ty, Intrinsic IntOp, SDNode OpNode> + ValueType Ty, SDPatternOperator IntOp, SDNode OpNode> : N3V<op24, op23, op21_20, op11_8, 0, op4, (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", @@ -2614,7 +2636,7 @@ class N3VDIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, (Ty (IntOp (Ty DPR:$Vn), (Ty DPR:$Vm))))))]>; class N3VQIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType Ty, Intrinsic IntOp, SDNode OpNode> + ValueType Ty, SDPatternOperator IntOp, SDNode OpNode> : N3V<op24, op23, op21_20, op11_8, 1, op4, (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", @@ -2625,7 +2647,7 @@ class N3VQIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, // The destination register is also used as the first source operand register. 
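The comment above is realized through the `"$src1 = $Vd"` string these accumulating classes pass as their constraint argument: it ends up in the instruction's `Constraints` field and forces the register allocator to put the accumulator input and the result in the same register, i.e. a read-modify-write operand. A self-contained sketch of the idiom (stand-in class; in the NEON definitions the string is the `cstr` parameter threaded down to the real Instruction record):

class AccSketch<string cstr> { string Constraints = cstr; }
// The result register overwrites the $src1 accumulator in place.
def VMLA_sketch : AccSketch<"$src1 = $Vd">;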
class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N3V<op24, op23, op21_20, op11_8, 0, op4, (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", @@ -2633,7 +2655,7 @@ class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>; class N3VQInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N3V<op24, op23, op21_20, op11_8, 1, op4, (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", @@ -2678,7 +2700,7 @@ class N3VLMulOpSL16<bit op24, bits<2> op21_20, bits<4> op11_8, // Long Intrinsic-Op vector operations with explicit extend (VABAL). class N3VLIntExtOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType TyQ, ValueType TyD, Intrinsic IntOp, SDNode ExtOp, + ValueType TyQ, ValueType TyD, SDPatternOperator IntOp, SDNode ExtOp, SDNode OpNode> : N3V<op24, op23, op21_20, op11_8, 0, op4, (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin, @@ -2691,7 +2713,7 @@ class N3VLIntExtOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, // a quad-register and is also used as the first source operand register. class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType TyQ, ValueType TyD, Intrinsic IntOp> + ValueType TyQ, ValueType TyD, SDPatternOperator IntOp> : N3V<op24, op23, op21_20, op11_8, 0, op4, (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", @@ -2699,7 +2721,7 @@ class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$Vn), (TyD DPR:$Vm))))]>; class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N3VLane32<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane), @@ -2712,7 +2734,7 @@ class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, imm:$lane)))))]>; class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N3VLane16<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), @@ -2727,7 +2749,7 @@ class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8, // Narrowing 3-register intrinsics. 
class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, string OpcodeStr, string Dt, ValueType TyD, ValueType TyQ, - Intrinsic IntOp, bit Commutable> + SDPatternOperator IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, (outs DPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINi4D, OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", @@ -2780,7 +2802,7 @@ class N3VLExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, // Long 3-register intrinsics with explicit extend (VABDL). class N3VLIntExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType TyQ, ValueType TyD, Intrinsic IntOp, SDNode ExtOp, + ValueType TyQ, ValueType TyD, SDPatternOperator IntOp, SDNode ExtOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, @@ -2793,7 +2815,7 @@ class N3VLIntExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, // Long 3-register intrinsics. class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType TyQ, ValueType TyD, Intrinsic IntOp, bit Commutable> + ValueType TyQ, ValueType TyD, SDPatternOperator IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", @@ -2802,7 +2824,7 @@ class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, } class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N3VLane32<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", @@ -2812,7 +2834,7 @@ class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, imm:$lane)))))]>; class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N3VLane16<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", @@ -2830,6 +2852,8 @@ class N3VW<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", [(set QPR:$Vd, (OpNode (TyQ QPR:$Vn), (TyQ (ExtOp (TyD DPR:$Vm)))))]> { + // All of these have a two-operand InstAlias. 
+ let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = Commutable; } @@ -2837,14 +2861,14 @@ class N3VW<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N2VDPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd), (ins DPR:$Vm), IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "", [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>; class N2VQPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd), (ins QPR:$Vm), IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "", [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>; @@ -2855,7 +2879,7 @@ class N2VQPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, class N2VDPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vm), IIC_VPALiD, OpcodeStr, Dt, "$Vd, $Vm", "$src1 = $Vd", @@ -2863,7 +2887,7 @@ class N2VDPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, class N2VQPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vm), IIC_VPALiQ, OpcodeStr, Dt, "$Vd, $Vm", "$src1 = $Vd", @@ -2871,6 +2895,7 @@ class N2VQPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, // Shift by immediate, // both double- and quad-register. +let TwoOperandAliasConstraint = "$Vm = $Vd" in { class N2VDSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, Format f, InstrItinClass itin, Operand ImmTy, string OpcodeStr, string Dt, ValueType Ty, SDNode OpNode> @@ -2885,6 +2910,7 @@ class N2VQSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, (outs QPR:$Vd), (ins QPR:$Vm, ImmTy:$SIMM), f, itin, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", [(set QPR:$Vd, (Ty (OpNode (Ty QPR:$Vm), (i32 imm:$SIMM))))]>; +} // Long shift by immediate. class N2VLSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, @@ -2908,6 +2934,7 @@ class N2VNSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, // Shift right by immediate and accumulate, // both double- and quad-register. +let TwoOperandAliasConstraint = "$Vm = $Vd" in { class N2VDShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, Operand ImmTy, string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> @@ -2924,9 +2951,11 @@ class N2VQShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd", [(set QPR:$Vd, (Ty (add QPR:$src1, (Ty (ShOp QPR:$Vm, (i32 imm:$SIMM))))))]>; +} // Shift by immediate and insert, // both double- and quad-register. 
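The shift-and-insert classes below are wrapped, like the plain shift and shift-accumulate classes above, in a top-level `let TwoOperandAliasConstraint = "$Vm = $Vd" in { ... }` region. A top-level `let` overrides the named field in every record defined inside its braces, which is why a single added line can retire a whole block of hand-written aliases. A minimal parseable sketch with made-up names:

class InstSketch { string TwoOperandAliasConstraint = ""; }
// Every def inside the braces receives the same field override.
let TwoOperandAliasConstraint = "$Vm = $Vd" in {
def VSHR_sketch : InstSketch;
def VSRA_sketch : InstSketch;
}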
+let TwoOperandAliasConstraint = "$Vm = $Vd" in { class N2VDShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, Operand ImmTy, Format f, string OpcodeStr, string Dt, ValueType Ty,SDNode ShOp> @@ -2941,19 +2970,20 @@ class N2VQShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, (ins QPR:$src1, QPR:$Vm, ImmTy:$SIMM), f, IIC_VSHLiQ, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd", [(set QPR:$Vd, (Ty (ShOp QPR:$src1, QPR:$Vm, (i32 imm:$SIMM))))]>; +} // Convert, with fractional bits immediate, // both double- and quad-register. class N2VCvtD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, - Intrinsic IntOp> + SDPatternOperator IntOp> : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$Vd), (ins DPR:$Vm, neon_vcvt_imm32:$SIMM), NVCVTFrm, IIC_VUNAD, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm), (i32 imm:$SIMM))))]>; class N2VCvtQ<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, - Intrinsic IntOp> + SDPatternOperator IntOp> : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$Vd), (ins QPR:$Vm, neon_vcvt_imm32:$SIMM), NVCVTFrm, IIC_VUNAQ, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", @@ -3023,7 +3053,7 @@ multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, multiclass N2VInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, bits<5> op11_7, bit op4, InstrItinClass itinD, InstrItinClass itinQ, - string OpcodeStr, string Dt, Intrinsic IntOp> { + string OpcodeStr, string Dt, SDPatternOperator IntOp> { // 64-bit vector types. def v8i8 : N2VDInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4, itinD, OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>; @@ -3064,7 +3094,7 @@ multiclass N2VN_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, multiclass N2VNInt_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - Intrinsic IntOp> { + SDPatternOperator IntOp> { def v8i8 : N2VNInt<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4, itin, OpcodeStr, !strconcat(Dt, "16"), v8i8, v8i16, IntOp>; @@ -3152,7 +3182,7 @@ multiclass N3VInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, - Intrinsic IntOp, bit Commutable = 0> { + SDPatternOperator IntOp, bit Commutable = 0> { // 64-bit vector types. def v4i16 : N3VDInt<op24, op23, 0b01, op11_8, op4, f, itinD16, OpcodeStr, !strconcat(Dt, "16"), @@ -3173,7 +3203,7 @@ multiclass N3VInt_HSSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, - Intrinsic IntOp> { + SDPatternOperator IntOp> { // 64-bit vector types. 
def v4i16 : N3VDIntSh<op24, op23, 0b01, op11_8, op4, f, itinD16, OpcodeStr, !strconcat(Dt, "16"), @@ -3194,7 +3224,7 @@ multiclass N3VInt_HSSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, multiclass N3VIntSL_HS<bits<4> op11_8, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, - string OpcodeStr, string Dt, Intrinsic IntOp> { + string OpcodeStr, string Dt, SDPatternOperator IntOp> { def v4i16 : N3VDIntSL16<0b01, op11_8, itinD16, OpcodeStr, !strconcat(Dt, "16"), v4i16, IntOp>; def v2i32 : N3VDIntSL<0b10, op11_8, itinD32, @@ -3210,7 +3240,7 @@ multiclass N3VInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, - Intrinsic IntOp, bit Commutable = 0> + SDPatternOperator IntOp, bit Commutable = 0> : N3VInt_HS<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32, OpcodeStr, Dt, IntOp, Commutable> { def v8i8 : N3VDInt<op24, op23, 0b00, op11_8, op4, f, itinD16, @@ -3224,7 +3254,7 @@ multiclass N3VInt_QHSSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, - Intrinsic IntOp> + SDPatternOperator IntOp> : N3VInt_HSSh<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32, OpcodeStr, Dt, IntOp> { def v8i8 : N3VDIntSh<op24, op23, 0b00, op11_8, op4, f, itinD16, @@ -3241,7 +3271,7 @@ multiclass N3VInt_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, - Intrinsic IntOp, bit Commutable = 0> + SDPatternOperator IntOp, bit Commutable = 0> : N3VInt_QHS<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32, OpcodeStr, Dt, IntOp, Commutable> { def v1i64 : N3VDInt<op24, op23, 0b11, op11_8, op4, f, itinD32, @@ -3255,7 +3285,7 @@ multiclass N3VInt_QHSDSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, - Intrinsic IntOp> + SDPatternOperator IntOp> : N3VInt_QHSSh<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32, OpcodeStr, Dt, IntOp> { def v1i64 : N3VDIntSh<op24, op23, 0b11, op11_8, op4, f, itinD32, @@ -3270,7 +3300,7 @@ multiclass N3VInt_QHSDSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, // source operand element sizes of 16, 32 and 64 bits: multiclass N3VNInt_HSD<bit op24, bit op23, bits<4> op11_8, bit op4, string OpcodeStr, string Dt, - Intrinsic IntOp, bit Commutable = 0> { + SDPatternOperator IntOp, bit Commutable = 0> { def v8i8 : N3VNInt<op24, op23, 0b00, op11_8, op4, OpcodeStr, !strconcat(Dt, "16"), v8i8, v8i16, IntOp, Commutable>; @@ -3330,7 +3360,7 @@ multiclass N3VLExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, multiclass N3VLInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, InstrItinClass itin16, InstrItinClass itin32, string OpcodeStr, string Dt, - Intrinsic IntOp, bit Commutable = 0> { + SDPatternOperator IntOp, bit Commutable = 0> { def v4i32 : N3VLInt<op24, op23, 0b01, op11_8, op4, itin16, OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp, Commutable>; @@ -3341,7 +3371,7 @@ multiclass N3VLInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, multiclass N3VLIntSL_HS<bit op24, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, - Intrinsic IntOp> { + 
SDPatternOperator IntOp> { def v4i16 : N3VLIntSL16<op24, 0b01, op11_8, itin, OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>; def v2i32 : N3VLIntSL<op24, 0b10, op11_8, itin, @@ -3352,7 +3382,7 @@ multiclass N3VLIntSL_HS<bit op24, bits<4> op11_8, multiclass N3VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, InstrItinClass itin16, InstrItinClass itin32, string OpcodeStr, string Dt, - Intrinsic IntOp, bit Commutable = 0> + SDPatternOperator IntOp, bit Commutable = 0> : N3VLInt_HS<op24, op23, op11_8, op4, itin16, itin32, OpcodeStr, Dt, IntOp, Commutable> { def v8i16 : N3VLInt<op24, op23, 0b00, op11_8, op4, itin16, @@ -3363,7 +3393,7 @@ multiclass N3VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, // ....with explicit extend (VABDL). multiclass N3VLIntExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - Intrinsic IntOp, SDNode ExtOp, bit Commutable = 0> { + SDPatternOperator IntOp, SDNode ExtOp, bit Commutable = 0> { def v8i16 : N3VLIntExt<op24, op23, 0b00, op11_8, op4, itin, OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, IntOp, ExtOp, Commutable>; @@ -3436,7 +3466,7 @@ multiclass N3VMulOpSL_HS<bits<4> op11_8, // element sizes of 8, 16 and 32 bits: multiclass N3VIntOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, InstrItinClass itinD, InstrItinClass itinQ, - string OpcodeStr, string Dt, Intrinsic IntOp, + string OpcodeStr, string Dt, SDPatternOperator IntOp, SDNode OpNode> { // 64-bit vector types. def v8i8 : N3VDIntOp<op24, op23, 0b00, op11_8, op4, itinD, @@ -3459,7 +3489,7 @@ multiclass N3VIntOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, // element sizes of 8, 16 and 32 bits: multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, InstrItinClass itinD, InstrItinClass itinQ, - string OpcodeStr, string Dt, Intrinsic IntOp> { + string OpcodeStr, string Dt, SDPatternOperator IntOp> { // 64-bit vector types. 
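The `def v8i8 : ...` entries that follow are templates inside a multiclass, stamped out once per element size: a later `defm` line (for example the VQADD family elsewhere in this file) concatenates the `defm` name with each inner `def` name, yielding records such as VQADDv8i8 and VQADDv4i16. A self-contained sketch of that expansion, with made-up names:

class InstSketch<string mnem> { string Mnemonic = mnem; }
multiclass PerSize<string mnem> {
  def v8i8  : InstSketch<mnem # ".8">;
  def v4i16 : InstSketch<mnem # ".16">;
  def v2i32 : InstSketch<mnem # ".32">;
}
// Produces VADDSketchv8i8, VADDSketchv4i16 and VADDSketchv2i32.
defm VADDSketch : PerSize<"vadd">;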
def v8i8 : N3VDInt3<op24, op23, 0b00, op11_8, op4, itinD, OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>; @@ -3506,7 +3536,7 @@ multiclass N3VLMulOpSL_HS<bit op24, bits<4> op11_8, string OpcodeStr, // First with only element sizes of 16 and 32 bits: multiclass N3VLInt3_HS<bit op24, bit op23, bits<4> op11_8, bit op4, InstrItinClass itin16, InstrItinClass itin32, - string OpcodeStr, string Dt, Intrinsic IntOp> { + string OpcodeStr, string Dt, SDPatternOperator IntOp> { def v4i32 : N3VLInt3<op24, op23, 0b01, op11_8, op4, itin16, OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>; def v2i64 : N3VLInt3<op24, op23, 0b10, op11_8, op4, itin32, @@ -3514,7 +3544,7 @@ multiclass N3VLInt3_HS<bit op24, bit op23, bits<4> op11_8, bit op4, } multiclass N3VLInt3SL_HS<bit op24, bits<4> op11_8, - string OpcodeStr, string Dt, Intrinsic IntOp> { + string OpcodeStr, string Dt, SDPatternOperator IntOp> { def v4i16 : N3VLInt3SL16<op24, 0b01, op11_8, IIC_VMACi16D, OpcodeStr, !strconcat(Dt,"16"), v4i32, v4i16, IntOp>; def v2i32 : N3VLInt3SL<op24, 0b10, op11_8, IIC_VMACi32D, @@ -3524,7 +3554,7 @@ multiclass N3VLInt3SL_HS<bit op24, bits<4> op11_8, // ....then also with element size of 8 bits: multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, InstrItinClass itin16, InstrItinClass itin32, - string OpcodeStr, string Dt, Intrinsic IntOp> + string OpcodeStr, string Dt, SDPatternOperator IntOp> : N3VLInt3_HS<op24, op23, op11_8, op4, itin16, itin32, OpcodeStr, Dt, IntOp> { def v8i16 : N3VLInt3<op24, op23, 0b00, op11_8, op4, itin16, OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, IntOp>; @@ -3533,7 +3563,7 @@ multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, // ....with explicit extend (VABAL). multiclass N3VLIntExtOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - Intrinsic IntOp, SDNode ExtOp, SDNode OpNode> { + SDPatternOperator IntOp, SDNode ExtOp, SDNode OpNode> { def v8i16 : N3VLIntExtOp<op24, op23, 0b00, op11_8, op4, itin, OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, IntOp, ExtOp, OpNode>; @@ -3550,7 +3580,7 @@ multiclass N3VLIntExtOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, // element sizes of 8, 16 and 32 bits: multiclass N2VPLInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, bits<5> op11_7, bit op4, - string OpcodeStr, string Dt, Intrinsic IntOp> { + string OpcodeStr, string Dt, SDPatternOperator IntOp> { // 64-bit vector types. def v8i8 : N2VDPLInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4, OpcodeStr, !strconcat(Dt, "8"), v4i16, v8i8, IntOp>; @@ -3573,7 +3603,7 @@ multiclass N2VPLInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, // element sizes of 8, 16 and 32 bits: multiclass N2VPLInt2_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, bits<5> op11_7, bit op4, - string OpcodeStr, string Dt, Intrinsic IntOp> { + string OpcodeStr, string Dt, SDPatternOperator IntOp> { // 64-bit vector types. def v8i8 : N2VDPLInt2<op24_23, op21_20, 0b00, op17_16, op11_7, op4, OpcodeStr, !strconcat(Dt, "8"), v4i16, v8i8, IntOp>; @@ -3668,33 +3698,6 @@ multiclass N2VShR_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, N2RegVShRFrm, itin, shr_imm64, OpcodeStr, !strconcat(Dt, "64"), v2i64, OpNode>; // imm6 = xxxxxx - - // Aliases for two-operand forms (source and dest regs the same). 
- def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "8 $Vdn, $imm"), - (!cast<Instruction>(!strconcat(baseOpc, "v8i8")) - DPR:$Vdn, DPR:$Vdn, shr_imm8:$imm, pred:$p)>; - def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "16 $Vdn, $imm"), - (!cast<Instruction>(!strconcat(baseOpc, "v4i16")) - DPR:$Vdn, DPR:$Vdn, shr_imm16:$imm, pred:$p)>; - def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "32 $Vdn, $imm"), - (!cast<Instruction>(!strconcat(baseOpc, "v2i32")) - DPR:$Vdn, DPR:$Vdn, shr_imm32:$imm, pred:$p)>; - def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "64 $Vdn, $imm"), - (!cast<Instruction>(!strconcat(baseOpc, "v1i64")) - DPR:$Vdn, DPR:$Vdn, shr_imm64:$imm, pred:$p)>; - - def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "8 $Vdn, $imm"), - (!cast<Instruction>(!strconcat(baseOpc, "v16i8")) - QPR:$Vdn, QPR:$Vdn, shr_imm8:$imm, pred:$p)>; - def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "16 $Vdn, $imm"), - (!cast<Instruction>(!strconcat(baseOpc, "v8i16")) - QPR:$Vdn, QPR:$Vdn, shr_imm16:$imm, pred:$p)>; - def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "32 $Vdn, $imm"), - (!cast<Instruction>(!strconcat(baseOpc, "v4i32")) - QPR:$Vdn, QPR:$Vdn, shr_imm32:$imm, pred:$p)>; - def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "64 $Vdn, $imm"), - (!cast<Instruction>(!strconcat(baseOpc, "v2i64")) - QPR:$Vdn, QPR:$Vdn, shr_imm64:$imm, pred:$p)>; } // Neon Shift-Accumulate vector operations, @@ -4133,16 +4136,16 @@ def VFMSfq : N3VQMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACQ, "vfms", "f32", Requires<[HasVFP4,UseFusedMAC]>; // Match @llvm.fma.* intrinsics -def : Pat<(v2f32 (fma DPR:$src1, DPR:$Vn, DPR:$Vm)), +def : Pat<(v2f32 (fma DPR:$Vn, DPR:$Vm, DPR:$src1)), (VFMAfd DPR:$src1, DPR:$Vn, DPR:$Vm)>, Requires<[HasVFP4]>; -def : Pat<(v4f32 (fma QPR:$src1, QPR:$Vn, QPR:$Vm)), +def : Pat<(v4f32 (fma QPR:$Vn, QPR:$Vm, QPR:$src1)), (VFMAfq QPR:$src1, QPR:$Vn, QPR:$Vm)>, Requires<[HasVFP4]>; -def : Pat<(v2f32 (fma (fneg DPR:$src1), DPR:$Vn, DPR:$Vm)), +def : Pat<(v2f32 (fma (fneg DPR:$Vn), DPR:$Vm, DPR:$src1)), (VFMSfd DPR:$src1, DPR:$Vn, DPR:$Vm)>, Requires<[HasVFP4]>; -def : Pat<(v4f32 (fma (fneg QPR:$src1), QPR:$Vn, QPR:$Vm)), +def : Pat<(v4f32 (fma (fneg QPR:$Vn), QPR:$Vm, QPR:$src1)), (VFMSfq QPR:$src1, QPR:$Vn, QPR:$Vm)>, Requires<[HasVFP4]>; @@ -4305,6 +4308,7 @@ def VORRiv4i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 1, 0, 1, // VBIC : Vector Bitwise Bit Clear (AND NOT) +let TwoOperandAliasConstraint = "$Vn = $Vd" in { def VBICd : N3VX<0, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VBINiD, "vbic", "$Vd, $Vn, $Vm", "", @@ -4315,6 +4319,7 @@ def VBICq : N3VX<0, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd), "vbic", "$Vd, $Vn, $Vm", "", [(set QPR:$Vd, (v4i32 (and QPR:$Vn, (vnotq QPR:$Vm))))]>; +} def VBICiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 1, 1, (outs DPR:$Vd), (ins nImmSplatI16:$SIMM, DPR:$src), @@ -4820,14 +4825,14 @@ defm VCLS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01000, 0, // VCLZ : Vector Count Leading Zeros defm VCLZ : N2VInt_QHS<0b11, 0b11, 0b00, 0b01001, 0, IIC_VCNTiD, IIC_VCNTiQ, "vclz", "i", - int_arm_neon_vclz>; + ctlz>; // VCNT : Vector Count One Bits def VCNTd : N2VDInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, IIC_VCNTiD, "vcnt", "8", - v8i8, v8i8, int_arm_neon_vcnt>; + v8i8, v8i8, ctpop>; def VCNTq : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, IIC_VCNTiQ, "vcnt", "8", - v16i8, v16i8, int_arm_neon_vcnt>; + v16i8, v16i8, ctpop>; // Vector Swap def VSWPd : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 0, 0, @@ -5308,6 +5313,9 @@ def : 
AlignedVEXTq<v2f32, v4f32, DSubReg_i32_reg>; // VEXT : Vector Extract + +// All of these have a two-operand InstAlias. +let TwoOperandAliasConstraint = "$Vn = $Vd" in { class VEXTd<string OpcodeStr, string Dt, ValueType Ty, Operand immTy> : N3V<0,1,0b11,{?,?,?,?},0,0, (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm, immTy:$index), NVExtFrm, @@ -5327,6 +5335,7 @@ class VEXTq<string OpcodeStr, string Dt, ValueType Ty, Operand immTy> bits<4> index; let Inst{11-8} = index{3-0}; } +} def VEXTd8 : VEXTd<"vext", "8", v8i8, imm0_7> { let Inst{11-8} = index{3-0}; @@ -5588,47 +5597,51 @@ def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>; // Vector lengthening move with load, matching extending loads. // extload, zextload and sextload for a standard lengthening load. Example: -// Lengthen_Single<"8", "i16", "i8"> = Pat<(v8i16 (extloadvi8 addrmode5:$addr)) -// (VMOVLuv8i16 (VLDRD addrmode5:$addr))>; +// Lengthen_Single<"8", "i16", "i8"> = +// Pat<(v8i16 (extloadvi8 addrmode6oneL32:$addr)) +// (VMOVLuv8i16 (VLD1LNd32 addrmode6oneL32:$addr, +// (f64 (IMPLICIT_DEF)), (i32 0)))>; multiclass Lengthen_Single<string DestLanes, string DestTy, string SrcTy> { def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) - (!cast<PatFrag>("extloadv" # SrcTy) addrmode5:$addr)), + (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)), (!cast<Instruction>("VMOVLuv" # DestLanes # DestTy) - (VLDRD addrmode5:$addr))>; + (VLD1LNd32 addrmode6oneL32:$addr, + (f64 (IMPLICIT_DEF)), (i32 0)))>; def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) - (!cast<PatFrag>("zextloadv" # SrcTy) addrmode5:$addr)), + (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)), (!cast<Instruction>("VMOVLuv" # DestLanes # DestTy) - (VLDRD addrmode5:$addr))>; + (VLD1LNd32 addrmode6oneL32:$addr, + (f64 (IMPLICIT_DEF)), (i32 0)))>; def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) - (!cast<PatFrag>("sextloadv" # SrcTy) addrmode5:$addr)), + (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)), (!cast<Instruction>("VMOVLsv" # DestLanes # DestTy) - (VLDRD addrmode5:$addr))>; + (VLD1LNd32 addrmode6oneL32:$addr, + (f64 (IMPLICIT_DEF)), (i32 0)))>; } // extload, zextload and sextload for a lengthening load which only uses // half the lanes available. 
Example: // Lengthen_HalfSingle<"4", "i16", "8", "i16", "i8"> = -// Pat<(v4i16 (extloadvi8 addrmode5:$addr)) -// (EXTRACT_SUBREG (VMOVLuv8i16 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), -// (VLDRS addrmode5:$addr), -// ssub_0)), +// Pat<(v4i16 (extloadvi8 addrmode6oneL32:$addr)), +// (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd32 addrmode6oneL32:$addr, +// (f64 (IMPLICIT_DEF)), (i32 0))), // dsub_0)>; multiclass Lengthen_HalfSingle<string DestLanes, string DestTy, string SrcTy, string InsnLanes, string InsnTy> { def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) - (!cast<PatFrag>("extloadv" # SrcTy) addrmode5:$addr)), + (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)), (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy) - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), ssub_0)), + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)>; def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) - (!cast<PatFrag>("zextloadv" # SrcTy) addrmode5:$addr)), + (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)), (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy) - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), ssub_0)), + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)>; def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) - (!cast<PatFrag>("sextloadv" # SrcTy) addrmode5:$addr)), + (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)), (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # InsnLanes # InsnTy) - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), ssub_0)), + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)>; } @@ -5637,33 +5650,33 @@ multiclass Lengthen_HalfSingle<string DestLanes, string DestTy, string SrcTy, // // Lengthen_Double<"4", "i32", "i8", "8", "i16", "4", "i32", qsub_0> = // Pat<(v4i32 (extloadvi8 addrmode5:$addr)) -// (EXTRACT_SUBREG (VMOVLuv4i32 -// (EXTRACT_SUBREG (VMOVLuv8i16 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), -// (VLDRS addrmode5:$addr), -// ssub_0)), +// (EXTRACT_SUBREG (VMOVLuv4i32 +// (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd32 addrmode6oneL32:$addr, +// (f64 (IMPLICIT_DEF)), +// (i32 0))), // dsub_0)), -// qsub_0)>; +// dsub_0)>; multiclass Lengthen_Double<string DestLanes, string DestTy, string SrcTy, string Insn1Lanes, string Insn1Ty, string Insn2Lanes, string Insn2Ty> { def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) - (!cast<PatFrag>("extloadv" # SrcTy) addrmode5:$addr)), + (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)), (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty) (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty) - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), - ssub_0)), dsub_0))>; + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), + dsub_0))>; def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) - (!cast<PatFrag>("zextloadv" # SrcTy) addrmode5:$addr)), + (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)), (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty) (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty) - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), - ssub_0)), dsub_0))>; + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), + dsub_0))>; def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) - (!cast<PatFrag>("sextloadv" # SrcTy) addrmode5:$addr)), + (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)), (!cast<Instruction>("VMOVLsv" # 
Insn2Lanes # Insn2Ty) (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty) - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), - ssub_0)), dsub_0))>; + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), + dsub_0))>; } // extload, zextload and sextload for a lengthening load followed by another @@ -5671,36 +5684,35 @@ multiclass Lengthen_Double<string DestLanes, string DestTy, string SrcTy, // requiring half the available lanes (a 64-bit outcome instead of a 128-bit). // // Lengthen_HalfDouble<"2", "i32", "i8", "8", "i16", "4", "i32"> = -// Pat<(v4i32 (extloadvi8 addrmode5:$addr)) -// (EXTRACT_SUBREG (VMOVLuv4i32 -// (EXTRACT_SUBREG (VMOVLuv8i16 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), -// (VLDRS addrmode5:$addr), -// ssub_0)), -// dsub_0)), -// dsub_0)>; +// Pat<(v4i32 (extloadvi8 addrmode5:$addr)) +// (EXTRACT_SUBREG (VMOVLuv4i32 +// (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd32 addrmode6oneL32:$addr, +// (f64 (IMPLICIT_DEF)), (i32 0))), +// dsub_0)), +// dsub_0)>; multiclass Lengthen_HalfDouble<string DestLanes, string DestTy, string SrcTy, string Insn1Lanes, string Insn1Ty, string Insn2Lanes, string Insn2Ty> { def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) - (!cast<PatFrag>("extloadv" # SrcTy) addrmode5:$addr)), + (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)), (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty) (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty) - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), - ssub_0)), dsub_0)), + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), + dsub_0)), dsub_0)>; def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) - (!cast<PatFrag>("zextloadv" # SrcTy) addrmode5:$addr)), + (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)), (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty) (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty) - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), - ssub_0)), dsub_0)), + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), + dsub_0)), dsub_0)>; def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) - (!cast<PatFrag>("sextloadv" # SrcTy) addrmode5:$addr)), + (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)), (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty) (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty) - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), - ssub_0)), dsub_0)), + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), + dsub_0)), dsub_0)>; } @@ -5720,18 +5732,18 @@ defm : Lengthen_HalfDouble<"2", "i32", "i8", "8", "i16", "4", "i32">; defm : Lengthen_Double<"2", "i64", "i16", "4", "i32", "2", "i64">; // Triple lengthening - v2i8 -> v2i16 -> v2i32 -> v2i64 -def : Pat<(v2i64 (extloadvi8 addrmode5:$addr)), +def : Pat<(v2i64 (extloadvi8 addrmode6oneL32:$addr)), (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16 - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), ssub_0)), - dsub_0)), dsub_0))>; -def : Pat<(v2i64 (zextloadvi8 addrmode5:$addr)), + (VLD1LNd32 addrmode6oneL32:$addr, + (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>; +def : Pat<(v2i64 (zextloadvi8 addrmode6oneL32:$addr)), (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16 - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), ssub_0)), - dsub_0)), dsub_0))>; -def : Pat<(v2i64 (sextloadvi8 
addrmode5:$addr)), + (VLD1LNd32 addrmode6oneL32:$addr, + (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>; +def : Pat<(v2i64 (sextloadvi8 addrmode6oneL32:$addr)), (VMOVLsv2i64 (EXTRACT_SUBREG (VMOVLsv4i32 (EXTRACT_SUBREG (VMOVLsv8i16 - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr), ssub_0)), - dsub_0)), dsub_0))>; + (VLD1LNd32 addrmode6oneL32:$addr, + (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>; //===----------------------------------------------------------------------===// // Assembler aliases @@ -5742,69 +5754,6 @@ def : VFP2InstAlias<"fmdhr${p} $Dd, $Rn", def : VFP2InstAlias<"fmdlr${p} $Dd, $Rn", (VSETLNi32 DPR:$Dd, GPR:$Rn, 0, pred:$p)>; - -// VADD two-operand aliases. -def : NEONInstAlias<"vadd${p}.i8 $Vdn, $Vm", - (VADDv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vadd${p}.i16 $Vdn, $Vm", - (VADDv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vadd${p}.i32 $Vdn, $Vm", - (VADDv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vadd${p}.i64 $Vdn, $Vm", - (VADDv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - -def : NEONInstAlias<"vadd${p}.i8 $Vdn, $Vm", - (VADDv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vadd${p}.i16 $Vdn, $Vm", - (VADDv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vadd${p}.i32 $Vdn, $Vm", - (VADDv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vadd${p}.i64 $Vdn, $Vm", - (VADDv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; - -def : NEONInstAlias<"vadd${p}.f32 $Vdn, $Vm", - (VADDfd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vadd${p}.f32 $Vdn, $Vm", - (VADDfq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - -// VSUB two-operand aliases. -def : NEONInstAlias<"vsub${p}.i8 $Vdn, $Vm", - (VSUBv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vsub${p}.i16 $Vdn, $Vm", - (VSUBv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vsub${p}.i32 $Vdn, $Vm", - (VSUBv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vsub${p}.i64 $Vdn, $Vm", - (VSUBv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - -def : NEONInstAlias<"vsub${p}.i8 $Vdn, $Vm", - (VSUBv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vsub${p}.i16 $Vdn, $Vm", - (VSUBv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vsub${p}.i32 $Vdn, $Vm", - (VSUBv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vsub${p}.i64 $Vdn, $Vm", - (VSUBv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; - -def : NEONInstAlias<"vsub${p}.f32 $Vdn, $Vm", - (VSUBfd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vsub${p}.f32 $Vdn, $Vm", - (VSUBfq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - -// VADDW two-operand aliases. -def : NEONInstAlias<"vaddw${p}.s8 $Vdn, $Vm", - (VADDWsv8i16 QPR:$Vdn, QPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vaddw${p}.s16 $Vdn, $Vm", - (VADDWsv4i32 QPR:$Vdn, QPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vaddw${p}.s32 $Vdn, $Vm", - (VADDWsv2i64 QPR:$Vdn, QPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vaddw${p}.u8 $Vdn, $Vm", - (VADDWuv8i16 QPR:$Vdn, QPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vaddw${p}.u16 $Vdn, $Vm", - (VADDWuv4i32 QPR:$Vdn, QPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vaddw${p}.u32 $Vdn, $Vm", - (VADDWuv2i64 QPR:$Vdn, QPR:$Vdn, DPR:$Vm, pred:$p)>; - // VAND/VBIC/VEOR/VORR accept but do not require a type suffix. 
defm : NEONDTAnyInstAlias<"vand${p}", "$Vd, $Vn, $Vm", (VANDd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>; @@ -5823,23 +5772,6 @@ defm : NEONDTAnyInstAlias<"vorr${p}", "$Vd, $Vn, $Vm", defm : NEONDTAnyInstAlias<"vorr${p}", "$Vd, $Vn, $Vm", (VORRq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>; // ... two-operand aliases -def : NEONInstAlias<"vand${p} $Vdn, $Vm", - (VANDd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vand${p} $Vdn, $Vm", - (VANDq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vbic${p} $Vdn, $Vm", - (VBICd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vbic${p} $Vdn, $Vm", - (VBICq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"veor${p} $Vdn, $Vm", - (VEORd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"veor${p} $Vdn, $Vm", - (VEORq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vorr${p} $Vdn, $Vm", - (VORRd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vorr${p} $Vdn, $Vm", - (VORRq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - defm : NEONDTAnyInstAlias<"vand${p}", "$Vdn, $Vm", (VANDd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; defm : NEONDTAnyInstAlias<"vand${p}", "$Vdn, $Vm", @@ -5853,212 +5785,6 @@ defm : NEONDTAnyInstAlias<"vorr${p}", "$Vdn, $Vm", defm : NEONDTAnyInstAlias<"vorr${p}", "$Vdn, $Vm", (VORRq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -// VMUL two-operand aliases. -def : NEONInstAlias<"vmul${p}.p8 $Qdn, $Qm", - (VMULpq QPR:$Qdn, QPR:$Qdn, QPR:$Qm, pred:$p)>; -def : NEONInstAlias<"vmul${p}.i8 $Qdn, $Qm", - (VMULv16i8 QPR:$Qdn, QPR:$Qdn, QPR:$Qm, pred:$p)>; -def : NEONInstAlias<"vmul${p}.i16 $Qdn, $Qm", - (VMULv8i16 QPR:$Qdn, QPR:$Qdn, QPR:$Qm, pred:$p)>; -def : NEONInstAlias<"vmul${p}.i32 $Qdn, $Qm", - (VMULv4i32 QPR:$Qdn, QPR:$Qdn, QPR:$Qm, pred:$p)>; - -def : NEONInstAlias<"vmul${p}.p8 $Ddn, $Dm", - (VMULpd DPR:$Ddn, DPR:$Ddn, DPR:$Dm, pred:$p)>; -def : NEONInstAlias<"vmul${p}.i8 $Ddn, $Dm", - (VMULv8i8 DPR:$Ddn, DPR:$Ddn, DPR:$Dm, pred:$p)>; -def : NEONInstAlias<"vmul${p}.i16 $Ddn, $Dm", - (VMULv4i16 DPR:$Ddn, DPR:$Ddn, DPR:$Dm, pred:$p)>; -def : NEONInstAlias<"vmul${p}.i32 $Ddn, $Dm", - (VMULv2i32 DPR:$Ddn, DPR:$Ddn, DPR:$Dm, pred:$p)>; - -def : NEONInstAlias<"vmul${p}.f32 $Qdn, $Qm", - (VMULfq QPR:$Qdn, QPR:$Qdn, QPR:$Qm, pred:$p)>; -def : NEONInstAlias<"vmul${p}.f32 $Ddn, $Dm", - (VMULfd DPR:$Ddn, DPR:$Ddn, DPR:$Dm, pred:$p)>; - -def : NEONInstAlias<"vmul${p}.i16 $Ddn, $Dm$lane", - (VMULslv4i16 DPR:$Ddn, DPR:$Ddn, DPR_8:$Dm, - VectorIndex16:$lane, pred:$p)>; -def : NEONInstAlias<"vmul${p}.i16 $Qdn, $Dm$lane", - (VMULslv8i16 QPR:$Qdn, QPR:$Qdn, DPR_8:$Dm, - VectorIndex16:$lane, pred:$p)>; - -def : NEONInstAlias<"vmul${p}.i32 $Ddn, $Dm$lane", - (VMULslv2i32 DPR:$Ddn, DPR:$Ddn, DPR_VFP2:$Dm, - VectorIndex32:$lane, pred:$p)>; -def : NEONInstAlias<"vmul${p}.i32 $Qdn, $Dm$lane", - (VMULslv4i32 QPR:$Qdn, QPR:$Qdn, DPR_VFP2:$Dm, - VectorIndex32:$lane, pred:$p)>; - -def : NEONInstAlias<"vmul${p}.f32 $Ddn, $Dm$lane", - (VMULslfd DPR:$Ddn, DPR:$Ddn, DPR_VFP2:$Dm, - VectorIndex32:$lane, pred:$p)>; -def : NEONInstAlias<"vmul${p}.f32 $Qdn, $Dm$lane", - (VMULslfq QPR:$Qdn, QPR:$Qdn, DPR_VFP2:$Dm, - VectorIndex32:$lane, pred:$p)>; - -// VQADD (register) two-operand aliases. 
-def : NEONInstAlias<"vqadd${p}.s8 $Vdn, $Vm", - (VQADDsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqadd${p}.s16 $Vdn, $Vm", - (VQADDsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqadd${p}.s32 $Vdn, $Vm", - (VQADDsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqadd${p}.s64 $Vdn, $Vm", - (VQADDsv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqadd${p}.u8 $Vdn, $Vm", - (VQADDuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqadd${p}.u16 $Vdn, $Vm", - (VQADDuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqadd${p}.u32 $Vdn, $Vm", - (VQADDuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqadd${p}.u64 $Vdn, $Vm", - (VQADDuv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; - -def : NEONInstAlias<"vqadd${p}.s8 $Vdn, $Vm", - (VQADDsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqadd${p}.s16 $Vdn, $Vm", - (VQADDsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqadd${p}.s32 $Vdn, $Vm", - (VQADDsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqadd${p}.s64 $Vdn, $Vm", - (VQADDsv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqadd${p}.u8 $Vdn, $Vm", - (VQADDuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqadd${p}.u16 $Vdn, $Vm", - (VQADDuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqadd${p}.u32 $Vdn, $Vm", - (VQADDuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqadd${p}.u64 $Vdn, $Vm", - (VQADDuv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - -// VSHL (immediate) two-operand aliases. -def : NEONInstAlias<"vshl${p}.i8 $Vdn, $imm", - (VSHLiv8i8 DPR:$Vdn, DPR:$Vdn, imm0_7:$imm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.i16 $Vdn, $imm", - (VSHLiv4i16 DPR:$Vdn, DPR:$Vdn, imm0_15:$imm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.i32 $Vdn, $imm", - (VSHLiv2i32 DPR:$Vdn, DPR:$Vdn, imm0_31:$imm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.i64 $Vdn, $imm", - (VSHLiv1i64 DPR:$Vdn, DPR:$Vdn, imm0_63:$imm, pred:$p)>; - -def : NEONInstAlias<"vshl${p}.i8 $Vdn, $imm", - (VSHLiv16i8 QPR:$Vdn, QPR:$Vdn, imm0_7:$imm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.i16 $Vdn, $imm", - (VSHLiv8i16 QPR:$Vdn, QPR:$Vdn, imm0_15:$imm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.i32 $Vdn, $imm", - (VSHLiv4i32 QPR:$Vdn, QPR:$Vdn, imm0_31:$imm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.i64 $Vdn, $imm", - (VSHLiv2i64 QPR:$Vdn, QPR:$Vdn, imm0_63:$imm, pred:$p)>; - -// VSHL (register) two-operand aliases. 
-def : NEONInstAlias<"vshl${p}.s8 $Vdn, $Vm", - (VSHLsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.s16 $Vdn, $Vm", - (VSHLsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.s32 $Vdn, $Vm", - (VSHLsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.s64 $Vdn, $Vm", - (VSHLsv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.u8 $Vdn, $Vm", - (VSHLuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.u16 $Vdn, $Vm", - (VSHLuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.u32 $Vdn, $Vm", - (VSHLuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.u64 $Vdn, $Vm", - (VSHLuv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; - -def : NEONInstAlias<"vshl${p}.s8 $Vdn, $Vm", - (VSHLsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.s16 $Vdn, $Vm", - (VSHLsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.s32 $Vdn, $Vm", - (VSHLsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.s64 $Vdn, $Vm", - (VSHLsv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.u8 $Vdn, $Vm", - (VSHLuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.u16 $Vdn, $Vm", - (VSHLuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.u32 $Vdn, $Vm", - (VSHLuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vshl${p}.u64 $Vdn, $Vm", - (VSHLuv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - -// VSHR (immediate) two-operand aliases. -def : NEONInstAlias<"vshr${p}.s8 $Vdn, $imm", - (VSHRsv8i8 DPR:$Vdn, DPR:$Vdn, shr_imm8:$imm, pred:$p)>; -def : NEONInstAlias<"vshr${p}.s16 $Vdn, $imm", - (VSHRsv4i16 DPR:$Vdn, DPR:$Vdn, shr_imm16:$imm, pred:$p)>; -def : NEONInstAlias<"vshr${p}.s32 $Vdn, $imm", - (VSHRsv2i32 DPR:$Vdn, DPR:$Vdn, shr_imm32:$imm, pred:$p)>; -def : NEONInstAlias<"vshr${p}.s64 $Vdn, $imm", - (VSHRsv1i64 DPR:$Vdn, DPR:$Vdn, shr_imm64:$imm, pred:$p)>; - -def : NEONInstAlias<"vshr${p}.s8 $Vdn, $imm", - (VSHRsv16i8 QPR:$Vdn, QPR:$Vdn, shr_imm8:$imm, pred:$p)>; -def : NEONInstAlias<"vshr${p}.s16 $Vdn, $imm", - (VSHRsv8i16 QPR:$Vdn, QPR:$Vdn, shr_imm16:$imm, pred:$p)>; -def : NEONInstAlias<"vshr${p}.s32 $Vdn, $imm", - (VSHRsv4i32 QPR:$Vdn, QPR:$Vdn, shr_imm32:$imm, pred:$p)>; -def : NEONInstAlias<"vshr${p}.s64 $Vdn, $imm", - (VSHRsv2i64 QPR:$Vdn, QPR:$Vdn, shr_imm64:$imm, pred:$p)>; - -def : NEONInstAlias<"vshr${p}.u8 $Vdn, $imm", - (VSHRuv8i8 DPR:$Vdn, DPR:$Vdn, shr_imm8:$imm, pred:$p)>; -def : NEONInstAlias<"vshr${p}.u16 $Vdn, $imm", - (VSHRuv4i16 DPR:$Vdn, DPR:$Vdn, shr_imm16:$imm, pred:$p)>; -def : NEONInstAlias<"vshr${p}.u32 $Vdn, $imm", - (VSHRuv2i32 DPR:$Vdn, DPR:$Vdn, shr_imm32:$imm, pred:$p)>; -def : NEONInstAlias<"vshr${p}.u64 $Vdn, $imm", - (VSHRuv1i64 DPR:$Vdn, DPR:$Vdn, shr_imm64:$imm, pred:$p)>; - -def : NEONInstAlias<"vshr${p}.u8 $Vdn, $imm", - (VSHRuv16i8 QPR:$Vdn, QPR:$Vdn, shr_imm8:$imm, pred:$p)>; -def : NEONInstAlias<"vshr${p}.u16 $Vdn, $imm", - (VSHRuv8i16 QPR:$Vdn, QPR:$Vdn, shr_imm16:$imm, pred:$p)>; -def : NEONInstAlias<"vshr${p}.u32 $Vdn, $imm", - (VSHRuv4i32 QPR:$Vdn, QPR:$Vdn, shr_imm32:$imm, pred:$p)>; -def : NEONInstAlias<"vshr${p}.u64 $Vdn, $imm", - (VSHRuv2i64 QPR:$Vdn, QPR:$Vdn, shr_imm64:$imm, pred:$p)>; - -// VRSHL two-operand aliases. 
-def : NEONInstAlias<"vrshl${p}.s8 $Vdn, $Vm", - (VRSHLsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vrshl${p}.s16 $Vdn, $Vm", - (VRSHLsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vrshl${p}.s32 $Vdn, $Vm", - (VRSHLsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vrshl${p}.s64 $Vdn, $Vm", - (VRSHLsv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vrshl${p}.u8 $Vdn, $Vm", - (VRSHLuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vrshl${p}.u16 $Vdn, $Vm", - (VRSHLuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vrshl${p}.u32 $Vdn, $Vm", - (VRSHLuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vrshl${p}.u64 $Vdn, $Vm", - (VRSHLuv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; - -def : NEONInstAlias<"vrshl${p}.s8 $Vdn, $Vm", - (VRSHLsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vrshl${p}.s16 $Vdn, $Vm", - (VRSHLsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vrshl${p}.s32 $Vdn, $Vm", - (VRSHLsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vrshl${p}.s64 $Vdn, $Vm", - (VRSHLsv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vrshl${p}.u8 $Vdn, $Vm", - (VRSHLuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vrshl${p}.u16 $Vdn, $Vm", - (VRSHLuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vrshl${p}.u32 $Vdn, $Vm", - (VRSHLuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vrshl${p}.u64 $Vdn, $Vm", - (VRSHLuv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - // VLD1 single-lane pseudo-instructions. These need special handling for // the lane index that an InstAlias can't handle, so we use these instead. def VLD1LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".8", "$list, $addr", @@ -6223,17 +5949,17 @@ def VST2LNqWB_register_Asm_32 : // VLD3 all-lanes pseudo-instructions. These need special handling for // the lane index that an InstAlias can't handle, so we use these instead. -def VLD3DUPdAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr", +def VLD3DUPdAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr", (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>; -def VLD3DUPdAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr", +def VLD3DUPdAsm_16: NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr", (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>; -def VLD3DUPdAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr", +def VLD3DUPdAsm_32: NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr", (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>; -def VLD3DUPqAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr", +def VLD3DUPqAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr", (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>; -def VLD3DUPqAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr", +def VLD3DUPqAsm_16: NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr", (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>; -def VLD3DUPqAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr", +def VLD3DUPqAsm_32: NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr", (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>; def VLD3DUPdWB_fixed_Asm_8 : @@ -6499,17 +6225,17 @@ def VST3qWB_register_Asm_32 : // VLD4 all-lanes pseudo-instructions. 
These need special handling for // the lane index that an InstAlias can't handle, so we use these instead. -def VLD4DUPdAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr", +def VLD4DUPdAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr", (ins VecListFourDAllLanes:$list, addrmode6:$addr, pred:$p)>; -def VLD4DUPdAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr", +def VLD4DUPdAsm_16: NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr", (ins VecListFourDAllLanes:$list, addrmode6:$addr, pred:$p)>; -def VLD4DUPdAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr", +def VLD4DUPdAsm_32: NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr", (ins VecListFourDAllLanes:$list, addrmode6:$addr, pred:$p)>; -def VLD4DUPqAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr", +def VLD4DUPqAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr", (ins VecListFourQAllLanes:$list, addrmode6:$addr, pred:$p)>; -def VLD4DUPqAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr", +def VLD4DUPqAsm_16: NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr", (ins VecListFourQAllLanes:$list, addrmode6:$addr, pred:$p)>; -def VLD4DUPqAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr", +def VLD4DUPqAsm_32: NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr", (ins VecListFourQAllLanes:$list, addrmode6:$addr, pred:$p)>; def VLD4DUPdWB_fixed_Asm_8 : @@ -6845,277 +6571,6 @@ def : NEONInstAlias<"vclt${p}.u32 $Qd, $Qn, $Qm", def : NEONInstAlias<"vclt${p}.f32 $Qd, $Qn, $Qm", (VCGTfq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; -// Two-operand variants for VEXT -def : NEONInstAlias<"vext${p}.8 $Vdn, $Vm, $imm", - (VEXTd8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, imm0_7:$imm, pred:$p)>; -def : NEONInstAlias<"vext${p}.16 $Vdn, $Vm, $imm", - (VEXTd16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, imm0_3:$imm, pred:$p)>; -def : NEONInstAlias<"vext${p}.32 $Vdn, $Vm, $imm", - (VEXTd32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, imm0_1:$imm, pred:$p)>; - -def : NEONInstAlias<"vext${p}.8 $Vdn, $Vm, $imm", - (VEXTq8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, imm0_15:$imm, pred:$p)>; -def : NEONInstAlias<"vext${p}.16 $Vdn, $Vm, $imm", - (VEXTq16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, imm0_7:$imm, pred:$p)>; -def : NEONInstAlias<"vext${p}.32 $Vdn, $Vm, $imm", - (VEXTq32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, imm0_3:$imm, pred:$p)>; -def : NEONInstAlias<"vext${p}.64 $Vdn, $Vm, $imm", - (VEXTq64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, imm0_1:$imm, pred:$p)>; - -// Two-operand variants for VQDMULH -def : NEONInstAlias<"vqdmulh${p}.s16 $Vdn, $Vm", - (VQDMULHv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqdmulh${p}.s32 $Vdn, $Vm", - (VQDMULHv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; - -def : NEONInstAlias<"vqdmulh${p}.s16 $Vdn, $Vm", - (VQDMULHv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vqdmulh${p}.s32 $Vdn, $Vm", - (VQDMULHv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - -// Two-operand variants for VMAX. 
-def : NEONInstAlias<"vmax${p}.s8 $Vdn, $Vm", - (VMAXsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmax${p}.s16 $Vdn, $Vm", - (VMAXsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmax${p}.s32 $Vdn, $Vm", - (VMAXsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmax${p}.u8 $Vdn, $Vm", - (VMAXuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmax${p}.u16 $Vdn, $Vm", - (VMAXuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmax${p}.u32 $Vdn, $Vm", - (VMAXuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmax${p}.f32 $Vdn, $Vm", - (VMAXfd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; - -def : NEONInstAlias<"vmax${p}.s8 $Vdn, $Vm", - (VMAXsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmax${p}.s16 $Vdn, $Vm", - (VMAXsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmax${p}.s32 $Vdn, $Vm", - (VMAXsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmax${p}.u8 $Vdn, $Vm", - (VMAXuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmax${p}.u16 $Vdn, $Vm", - (VMAXuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmax${p}.u32 $Vdn, $Vm", - (VMAXuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmax${p}.f32 $Vdn, $Vm", - (VMAXfq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - -// Two-operand variants for VMIN. -def : NEONInstAlias<"vmin${p}.s8 $Vdn, $Vm", - (VMINsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmin${p}.s16 $Vdn, $Vm", - (VMINsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmin${p}.s32 $Vdn, $Vm", - (VMINsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmin${p}.u8 $Vdn, $Vm", - (VMINuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmin${p}.u16 $Vdn, $Vm", - (VMINuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmin${p}.u32 $Vdn, $Vm", - (VMINuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmin${p}.f32 $Vdn, $Vm", - (VMINfd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; - -def : NEONInstAlias<"vmin${p}.s8 $Vdn, $Vm", - (VMINsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmin${p}.s16 $Vdn, $Vm", - (VMINsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmin${p}.s32 $Vdn, $Vm", - (VMINsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmin${p}.u8 $Vdn, $Vm", - (VMINuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmin${p}.u16 $Vdn, $Vm", - (VMINuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmin${p}.u32 $Vdn, $Vm", - (VMINuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vmin${p}.f32 $Vdn, $Vm", - (VMINfq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - -// Two-operand variants for VPADD. -def : NEONInstAlias<"vpadd${p}.i8 $Vdn, $Vm", - (VPADDi8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vpadd${p}.i16 $Vdn, $Vm", - (VPADDi16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vpadd${p}.i32 $Vdn, $Vm", - (VPADDi32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vpadd${p}.f32 $Vdn, $Vm", - (VPADDf DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; - -// Two-operand variants for VSRA. - // Signed. 
-def : NEONInstAlias<"vsra${p}.s8 $Vdm, $imm", - (VSRAsv8i8 DPR:$Vdm, DPR:$Vdm, shr_imm8:$imm, pred:$p)>; -def : NEONInstAlias<"vsra${p}.s16 $Vdm, $imm", - (VSRAsv4i16 DPR:$Vdm, DPR:$Vdm, shr_imm16:$imm, pred:$p)>; -def : NEONInstAlias<"vsra${p}.s32 $Vdm, $imm", - (VSRAsv2i32 DPR:$Vdm, DPR:$Vdm, shr_imm32:$imm, pred:$p)>; -def : NEONInstAlias<"vsra${p}.s64 $Vdm, $imm", - (VSRAsv1i64 DPR:$Vdm, DPR:$Vdm, shr_imm64:$imm, pred:$p)>; - -def : NEONInstAlias<"vsra${p}.s8 $Vdm, $imm", - (VSRAsv16i8 QPR:$Vdm, QPR:$Vdm, shr_imm8:$imm, pred:$p)>; -def : NEONInstAlias<"vsra${p}.s16 $Vdm, $imm", - (VSRAsv8i16 QPR:$Vdm, QPR:$Vdm, shr_imm16:$imm, pred:$p)>; -def : NEONInstAlias<"vsra${p}.s32 $Vdm, $imm", - (VSRAsv4i32 QPR:$Vdm, QPR:$Vdm, shr_imm32:$imm, pred:$p)>; -def : NEONInstAlias<"vsra${p}.s64 $Vdm, $imm", - (VSRAsv2i64 QPR:$Vdm, QPR:$Vdm, shr_imm64:$imm, pred:$p)>; - - // Unsigned. -def : NEONInstAlias<"vsra${p}.u8 $Vdm, $imm", - (VSRAuv8i8 DPR:$Vdm, DPR:$Vdm, shr_imm8:$imm, pred:$p)>; -def : NEONInstAlias<"vsra${p}.u16 $Vdm, $imm", - (VSRAuv4i16 DPR:$Vdm, DPR:$Vdm, shr_imm16:$imm, pred:$p)>; -def : NEONInstAlias<"vsra${p}.u32 $Vdm, $imm", - (VSRAuv2i32 DPR:$Vdm, DPR:$Vdm, shr_imm32:$imm, pred:$p)>; -def : NEONInstAlias<"vsra${p}.u64 $Vdm, $imm", - (VSRAuv1i64 DPR:$Vdm, DPR:$Vdm, shr_imm64:$imm, pred:$p)>; - -def : NEONInstAlias<"vsra${p}.u8 $Vdm, $imm", - (VSRAuv16i8 QPR:$Vdm, QPR:$Vdm, shr_imm8:$imm, pred:$p)>; -def : NEONInstAlias<"vsra${p}.u16 $Vdm, $imm", - (VSRAuv8i16 QPR:$Vdm, QPR:$Vdm, shr_imm16:$imm, pred:$p)>; -def : NEONInstAlias<"vsra${p}.u32 $Vdm, $imm", - (VSRAuv4i32 QPR:$Vdm, QPR:$Vdm, shr_imm32:$imm, pred:$p)>; -def : NEONInstAlias<"vsra${p}.u64 $Vdm, $imm", - (VSRAuv2i64 QPR:$Vdm, QPR:$Vdm, shr_imm64:$imm, pred:$p)>; - -// Two-operand variants for VSRI. -def : NEONInstAlias<"vsri${p}.8 $Vdm, $imm", - (VSRIv8i8 DPR:$Vdm, DPR:$Vdm, shr_imm8:$imm, pred:$p)>; -def : NEONInstAlias<"vsri${p}.16 $Vdm, $imm", - (VSRIv4i16 DPR:$Vdm, DPR:$Vdm, shr_imm16:$imm, pred:$p)>; -def : NEONInstAlias<"vsri${p}.32 $Vdm, $imm", - (VSRIv2i32 DPR:$Vdm, DPR:$Vdm, shr_imm32:$imm, pred:$p)>; -def : NEONInstAlias<"vsri${p}.64 $Vdm, $imm", - (VSRIv1i64 DPR:$Vdm, DPR:$Vdm, shr_imm64:$imm, pred:$p)>; - -def : NEONInstAlias<"vsri${p}.8 $Vdm, $imm", - (VSRIv16i8 QPR:$Vdm, QPR:$Vdm, shr_imm8:$imm, pred:$p)>; -def : NEONInstAlias<"vsri${p}.16 $Vdm, $imm", - (VSRIv8i16 QPR:$Vdm, QPR:$Vdm, shr_imm16:$imm, pred:$p)>; -def : NEONInstAlias<"vsri${p}.32 $Vdm, $imm", - (VSRIv4i32 QPR:$Vdm, QPR:$Vdm, shr_imm32:$imm, pred:$p)>; -def : NEONInstAlias<"vsri${p}.64 $Vdm, $imm", - (VSRIv2i64 QPR:$Vdm, QPR:$Vdm, shr_imm64:$imm, pred:$p)>; - -// Two-operand variants for VSLI. 
-def : NEONInstAlias<"vsli${p}.8 $Vdm, $imm", - (VSLIv8i8 DPR:$Vdm, DPR:$Vdm, shr_imm8:$imm, pred:$p)>; -def : NEONInstAlias<"vsli${p}.16 $Vdm, $imm", - (VSLIv4i16 DPR:$Vdm, DPR:$Vdm, shr_imm16:$imm, pred:$p)>; -def : NEONInstAlias<"vsli${p}.32 $Vdm, $imm", - (VSLIv2i32 DPR:$Vdm, DPR:$Vdm, shr_imm32:$imm, pred:$p)>; -def : NEONInstAlias<"vsli${p}.64 $Vdm, $imm", - (VSLIv1i64 DPR:$Vdm, DPR:$Vdm, shr_imm64:$imm, pred:$p)>; - -def : NEONInstAlias<"vsli${p}.8 $Vdm, $imm", - (VSLIv16i8 QPR:$Vdm, QPR:$Vdm, shr_imm8:$imm, pred:$p)>; -def : NEONInstAlias<"vsli${p}.16 $Vdm, $imm", - (VSLIv8i16 QPR:$Vdm, QPR:$Vdm, shr_imm16:$imm, pred:$p)>; -def : NEONInstAlias<"vsli${p}.32 $Vdm, $imm", - (VSLIv4i32 QPR:$Vdm, QPR:$Vdm, shr_imm32:$imm, pred:$p)>; -def : NEONInstAlias<"vsli${p}.64 $Vdm, $imm", - (VSLIv2i64 QPR:$Vdm, QPR:$Vdm, shr_imm64:$imm, pred:$p)>; - -// Two-operand variants for VHSUB. - // Signed. -def : NEONInstAlias<"vhsub${p}.s8 $Vdn, $Vm", - (VHSUBsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhsub${p}.s16 $Vdn, $Vm", - (VHSUBsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhsub${p}.s32 $Vdn, $Vm", - (VHSUBsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; - -def : NEONInstAlias<"vhsub${p}.s8 $Vdn, $Vm", - (VHSUBsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhsub${p}.s16 $Vdn, $Vm", - (VHSUBsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhsub${p}.s32 $Vdn, $Vm", - (VHSUBsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - - // Unsigned. -def : NEONInstAlias<"vhsub${p}.u8 $Vdn, $Vm", - (VHSUBuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhsub${p}.u16 $Vdn, $Vm", - (VHSUBuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhsub${p}.u32 $Vdn, $Vm", - (VHSUBuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; - -def : NEONInstAlias<"vhsub${p}.u8 $Vdn, $Vm", - (VHSUBuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhsub${p}.u16 $Vdn, $Vm", - (VHSUBuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhsub${p}.u32 $Vdn, $Vm", - (VHSUBuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - - -// Two-operand variants for VHADD. - // Signed. -def : NEONInstAlias<"vhadd${p}.s8 $Vdn, $Vm", - (VHADDsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhadd${p}.s16 $Vdn, $Vm", - (VHADDsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhadd${p}.s32 $Vdn, $Vm", - (VHADDsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; - -def : NEONInstAlias<"vhadd${p}.s8 $Vdn, $Vm", - (VHADDsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhadd${p}.s16 $Vdn, $Vm", - (VHADDsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhadd${p}.s32 $Vdn, $Vm", - (VHADDsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - - // Unsigned. -def : NEONInstAlias<"vhadd${p}.u8 $Vdn, $Vm", - (VHADDuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhadd${p}.u16 $Vdn, $Vm", - (VHADDuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhadd${p}.u32 $Vdn, $Vm", - (VHADDuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; - -def : NEONInstAlias<"vhadd${p}.u8 $Vdn, $Vm", - (VHADDuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhadd${p}.u16 $Vdn, $Vm", - (VHADDuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -def : NEONInstAlias<"vhadd${p}.u32 $Vdn, $Vm", - (VHADDuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; - -// Two-operand variants for VRHADD. - // Signed. 
-def : NEONInstAlias<"vrhadd${p}.s8 $Vdn, $Rm", - (VRHADDsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Rm, pred:$p)>; -def : NEONInstAlias<"vrhadd${p}.s16 $Vdn, $Rm", - (VRHADDsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Rm, pred:$p)>; -def : NEONInstAlias<"vrhadd${p}.s32 $Vdn, $Rm", - (VRHADDsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Rm, pred:$p)>; - -def : NEONInstAlias<"vrhadd${p}.s8 $Vdn, $Rm", - (VRHADDsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Rm, pred:$p)>; -def : NEONInstAlias<"vrhadd${p}.s16 $Vdn, $Rm", - (VRHADDsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Rm, pred:$p)>; -def : NEONInstAlias<"vrhadd${p}.s32 $Vdn, $Rm", - (VRHADDsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Rm, pred:$p)>; - - // Unsigned. -def : NEONInstAlias<"vrhadd${p}.u8 $Vdn, $Rm", - (VRHADDuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Rm, pred:$p)>; -def : NEONInstAlias<"vrhadd${p}.u16 $Vdn, $Rm", - (VRHADDuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Rm, pred:$p)>; -def : NEONInstAlias<"vrhadd${p}.u32 $Vdn, $Rm", - (VRHADDuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Rm, pred:$p)>; - -def : NEONInstAlias<"vrhadd${p}.u8 $Vdn, $Rm", - (VRHADDuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Rm, pred:$p)>; -def : NEONInstAlias<"vrhadd${p}.u16 $Vdn, $Rm", - (VRHADDuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Rm, pred:$p)>; -def : NEONInstAlias<"vrhadd${p}.u32 $Vdn, $Rm", - (VRHADDuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Rm, pred:$p)>; - // VSWP allows, but does not require, a type suffix. defm : NEONDTAnyInstAlias<"vswp${p}", "$Vd, $Vm", (VSWPd DPR:$Vd, DPR:$Vm, pred:$p)>; diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index 6335229..554f6d9 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -32,9 +32,6 @@ def imm_sr : Operand<i32>, PatLeaf<(imm), [{ let ParserMatchClass = ThumbSRImmAsmOperand; } -def imm_neg_XFORM : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant(-(int)N->getZExtValue(), MVT::i32); -}]>; def imm_comp_XFORM : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(~((uint32_t)N->getZExtValue()), MVT::i32); }]>; @@ -258,16 +255,20 @@ def tNOP : T1pI<(outs), (ins), NoItinerary, "nop", "", []>, Requires<[IsThumb2]>; def tYIELD : T1pI<(outs), (ins), NoItinerary, "yield", "", []>, - T1SystemEncoding<0x10>; // A8.6.410 + T1SystemEncoding<0x10>, // A8.6.410 + Requires<[IsThumb2]>; def tWFE : T1pI<(outs), (ins), NoItinerary, "wfe", "", []>, - T1SystemEncoding<0x20>; // A8.6.408 + T1SystemEncoding<0x20>, // A8.6.408 + Requires<[IsThumb2]>; def tWFI : T1pI<(outs), (ins), NoItinerary, "wfi", "", []>, - T1SystemEncoding<0x30>; // A8.6.409 + T1SystemEncoding<0x30>, // A8.6.409 + Requires<[IsThumb2]>; def tSEV : T1pI<(outs), (ins), NoItinerary, "sev", "", []>, - T1SystemEncoding<0x40>; // A8.6.157 + T1SystemEncoding<0x40>, // A8.6.157 + Requires<[IsThumb2]>; // The imm operand $val can be used by a debugger to store more information // about the breakpoint. 
@@ -363,8 +364,8 @@ def : tInstAlias<"sub${p} sp, sp, $imm", (tSUBspi SP, t_imm0_508s4:$imm, pred:$p)>; // ADD <Rm>, sp -def tADDrSP : T1pIt<(outs GPR:$Rdn), (ins GPR:$Rn, GPRsp:$sp), IIC_iALUr, - "add", "\t$Rdn, $sp, $Rn", []>, +def tADDrSP : T1pI<(outs GPR:$Rdn), (ins GPRsp:$sp, GPR:$Rn), IIC_iALUr, + "add", "\t$Rdn, $sp, $Rn", []>, T1Special<{0,0,?,?}> { // A8.6.9 Encoding T1 bits<4> Rdn; @@ -419,34 +420,35 @@ let isCall = 1, Defs = [LR], Uses = [SP] in { // Also used for Thumb2 def tBL : TIx2<0b11110, 0b11, 1, - (outs), (ins pred:$p, t_bltarget:$func, variable_ops), IIC_Br, + (outs), (ins pred:$p, t_bltarget:$func), IIC_Br, "bl${p}\t$func", [(ARMtcall tglobaladdr:$func)]>, Requires<[IsThumb]> { - bits<22> func; - let Inst{26} = func{21}; + bits<24> func; + let Inst{26} = func{23}; let Inst{25-16} = func{20-11}; - let Inst{13} = 1; - let Inst{11} = 1; + let Inst{13} = func{22}; + let Inst{11} = func{21}; let Inst{10-0} = func{10-0}; } // ARMv5T and above, also used for Thumb2 def tBLXi : TIx2<0b11110, 0b11, 0, - (outs), (ins pred:$p, t_blxtarget:$func, variable_ops), IIC_Br, + (outs), (ins pred:$p, t_blxtarget:$func), IIC_Br, "blx${p}\t$func", [(ARMcall tglobaladdr:$func)]>, Requires<[IsThumb, HasV5T]> { - bits<21> func; + bits<24> func; + let Inst{26} = func{23}; let Inst{25-16} = func{20-11}; - let Inst{13} = 1; - let Inst{11} = 1; + let Inst{13} = func{22}; + let Inst{11} = func{21}; let Inst{10-1} = func{10-1}; let Inst{0} = 0; // func{0} is assumed zero } // Also used for Thumb2 - def tBLXr : TI<(outs), (ins pred:$p, GPR:$func, variable_ops), IIC_Br, + def tBLXr : TI<(outs), (ins pred:$p, GPR:$func), IIC_Br, "blx${p}\t$func", [(ARMtcall GPR:$func)]>, Requires<[IsThumb, HasV5T]>, @@ -457,7 +459,7 @@ let isCall = 1, } // ARMv4T - def tBX_CALL : tPseudoInst<(outs), (ins tGPR:$func, variable_ops), + def tBX_CALL : tPseudoInst<(outs), (ins tGPR:$func), 4, IIC_Br, [(ARMcall_nolink tGPR:$func)]>, Requires<[IsThumb, IsThumb1Only]>; @@ -504,7 +506,7 @@ let isBranch = 1, isTerminator = 1 in let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { // IOS versions. let Uses = [SP] in { - def tTAILJMPr : tPseudoExpand<(outs), (ins tcGPR:$dst, variable_ops), + def tTAILJMPr : tPseudoExpand<(outs), (ins tcGPR:$dst), 4, IIC_Br, [], (tBX GPR:$dst, (ops 14, zero_reg))>, Requires<[IsThumb]>; @@ -514,7 +516,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { // Non-IOS version: let Uses = [SP] in { def tTAILJMPdND : tPseudoExpand<(outs), - (ins t_brtarget:$dst, pred:$p, variable_ops), + (ins t_brtarget:$dst, pred:$p), 4, IIC_Br, [], (tB t_brtarget:$dst, pred:$p)>, Requires<[IsThumb, IsNotIOS]>; @@ -1398,7 +1400,7 @@ def : InstAlias<"nop", (tMOVr R8, R8, 14, 0)>,Requires<[IsThumb, IsThumb1Only]>; // For round-trip assembly/disassembly, we have to handle a CPS instruction // without any iflags. That's not, strictly speaking, valid syntax, but it's -// a useful extention and assembles to defined behaviour (the insn does +// a useful extension and assembles to defined behaviour (the insn does // nothing). 
def : tInstAlias<"cps$imod", (tCPS imod_op:$imod, 0)>; def : tInstAlias<"cps$imod", (tCPS imod_op:$imod, 0)>; diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index e6fb9d5..d83530a 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -62,6 +62,15 @@ def t2_so_imm_neg_XFORM : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(-((int)N->getZExtValue()), MVT::i32); }]>; +// so_imm_notSext_XFORM - Return a so_imm value packed into the format +// described for so_imm_notSext def below, with sign extension from 16 +// bits. +def t2_so_imm_notSext16_XFORM : SDNodeXForm<imm, [{ + APInt apIntN = N->getAPIntValue(); + unsigned N16bitSignExt = apIntN.trunc(16).sext(32).getZExtValue(); + return CurDAG->getTargetConstant(~N16bitSignExt, MVT::i32); +}]>; + // t2_so_imm - Match a 32-bit immediate operand, which is an // 8-bit immediate rotated by an arbitrary number of bits, or an 8-bit // immediate splatted into multiple bytes of the word. @@ -86,6 +95,17 @@ def t2_so_imm_not : Operand<i32>, PatLeaf<(imm), [{ let ParserMatchClass = t2_so_imm_not_asmoperand; } +// t2_so_imm_notSext - match an immediate that is a complement of a t2_so_imm +// if the upper 16 bits are zero. +def t2_so_imm_notSext : Operand<i32>, PatLeaf<(imm), [{ + APInt apIntN = N->getAPIntValue(); + if (!apIntN.isIntN(16)) return false; + unsigned N16bitSignExt = apIntN.trunc(16).sext(32).getZExtValue(); + return ARM_AM::getT2SOImmVal(~N16bitSignExt) != -1; + }], t2_so_imm_notSext16_XFORM> { + let ParserMatchClass = t2_so_imm_not_asmoperand; +} + // t2_so_imm_neg - Match an immediate that is a negation of a t2_so_imm. def t2_so_imm_neg_asmoperand : AsmOperandClass { let Name = "T2SOImmNeg"; } def t2_so_imm_neg : Operand<i32>, PatLeaf<(imm), [{ @@ -668,16 +688,16 @@ let hasPostISelHook = 1, Defs = [CPSR] in { multiclass T2I_rbin_s_is<PatFrag opnode> { // shifted imm def ri : t2PseudoInst<(outs rGPR:$Rd), - (ins GPRnopc:$Rn, t2_so_imm:$imm, pred:$p), + (ins rGPR:$Rn, t2_so_imm:$imm, pred:$p), 4, IIC_iALUi, [(set rGPR:$Rd, CPSR, (opnode t2_so_imm:$imm, - GPRnopc:$Rn))]>; + rGPR:$Rn))]>; // shifted register def rs : t2PseudoInst<(outs rGPR:$Rd), - (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm, pred:$p), + (ins rGPR:$Rn, t2_so_reg:$ShiftedRm, pred:$p), 4, IIC_iALUsi, [(set rGPR:$Rd, CPSR, (opnode t2_so_reg:$ShiftedRm, - GPRnopc:$Rn))]>; + rGPR:$Rn))]>; } } @@ -1911,11 +1931,16 @@ def : T2Pat<(add GPR:$src, t2_so_imm_neg:$imm), (t2SUBri GPR:$src, t2_so_imm_neg:$imm)>; def : T2Pat<(add GPR:$src, imm0_4095_neg:$imm), (t2SUBri12 GPR:$src, imm0_4095_neg:$imm)>; +def : T2Pat<(add GPR:$src, imm0_65535_neg:$imm), + (t2SUBrr GPR:$src, (t2MOVi16 (imm_neg_XFORM imm:$imm)))>; + let AddedComplexity = 1 in def : T2Pat<(ARMaddc rGPR:$src, imm0_255_neg:$imm), (t2SUBSri rGPR:$src, imm0_255_neg:$imm)>; def : T2Pat<(ARMaddc rGPR:$src, t2_so_imm_neg:$imm), (t2SUBSri rGPR:$src, t2_so_imm_neg:$imm)>; +def : T2Pat<(ARMaddc rGPR:$src, imm0_65535_neg:$imm), + (t2SUBSrr rGPR:$src, (t2MOVi16 (imm_neg_XFORM imm:$imm)))>; // The with-carry-in form matches bitwise not instead of the negation. // Effectively, the inverse interpretation of the carry flag already accounts // for part of the negation. 
@@ -1924,6 +1949,8 @@ def : T2Pat<(ARMadde rGPR:$src, imm0_255_not:$imm, CPSR), (t2SBCri rGPR:$src, imm0_255_not:$imm)>; def : T2Pat<(ARMadde rGPR:$src, t2_so_imm_not:$imm, CPSR), (t2SBCri rGPR:$src, t2_so_imm_not:$imm)>; +def : T2Pat<(ARMadde rGPR:$src, imm0_65535_neg:$imm, CPSR), + (t2SBCrr rGPR:$src, (t2MOVi16 (imm_neg_XFORM imm:$imm)))>; // Select Bytes -- for disassembly only @@ -2134,8 +2161,8 @@ defm t2ROR : T2I_sh_ir<0b11, "ror", imm0_31, BinOpFrag<(rotr node:$LHS, node:$RHS)>, "t2ROR">; // (rotr x, (and y, 0x...1f)) ==> (ROR x, y) -def : Pat<(rotr rGPR:$lhs, (and rGPR:$rhs, lo5AllOne)), - (t2RORrr rGPR:$lhs, rGPR:$rhs)>; +def : T2Pat<(rotr rGPR:$lhs, (and rGPR:$rhs, lo5AllOne)), + (t2RORrr rGPR:$lhs, rGPR:$rhs)>; let Uses = [CPSR] in { def t2RRX : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi, @@ -2332,6 +2359,17 @@ let AddedComplexity = 1 in def : T2Pat<(and rGPR:$src, t2_so_imm_not:$imm), (t2BICri rGPR:$src, t2_so_imm_not:$imm)>; +// top16Zero - answer true if the upper 16 bits of $src are 0, false otherwise +def top16Zero: PatLeaf<(i32 rGPR:$src), [{ + return CurDAG->MaskedValueIsZero(SDValue(N,0), APInt::getHighBitsSet(32, 16)); + }]>; + +// so_imm_notSext is needed instead of so_imm_not, as the value of imm +// will match the extended, not the original bitWidth for $src. +def : T2Pat<(and top16Zero:$src, t2_so_imm_notSext:$imm), + (t2BICri rGPR:$src, t2_so_imm_notSext:$imm)>; + + // FIXME: Disable this pattern on Darwin to workaround an assembler bug. def : T2Pat<(or rGPR:$src, t2_so_imm_not:$imm), (t2ORNri rGPR:$src, t2_so_imm_not:$imm)>, @@ -2849,20 +2887,64 @@ def : T2Pat<(ARMcmpZ GPRnopc:$lhs, rGPR:$rhs), def : T2Pat<(ARMcmpZ GPRnopc:$lhs, t2_so_reg:$rhs), (t2CMPrs GPRnopc:$lhs, t2_so_reg:$rhs)>; -//FIXME: Disable CMN, as CCodes are backwards from compare expectations -// Compare-to-zero still works out, just not the relationals -//defm t2CMN : T2I_cmp_irs<0b1000, "cmn", -// BinOpFrag<(ARMcmp node:$LHS,(ineg node:$RHS))>>; -defm t2CMNz : T2I_cmp_irs<0b1000, "cmn", - IIC_iCMPi, IIC_iCMPr, IIC_iCMPsi, - BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>, - "t2CMNz">; +let isCompare = 1, Defs = [CPSR] in { + // shifted imm + def t2CMNri : T2OneRegCmpImm< + (outs), (ins GPRnopc:$Rn, t2_so_imm:$imm), IIC_iCMPi, + "cmn", ".w\t$Rn, $imm", + [(ARMcmn GPRnopc:$Rn, (ineg t2_so_imm:$imm))]> { + let Inst{31-27} = 0b11110; + let Inst{25} = 0; + let Inst{24-21} = 0b1000; + let Inst{20} = 1; // The S bit. + let Inst{15} = 0; + let Inst{11-8} = 0b1111; // Rd + } + // register + def t2CMNzrr : T2TwoRegCmp< + (outs), (ins GPRnopc:$Rn, rGPR:$Rm), IIC_iCMPr, + "cmn", ".w\t$Rn, $Rm", + [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))> + GPRnopc:$Rn, rGPR:$Rm)]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = 0b1000; + let Inst{20} = 1; // The S bit. + let Inst{14-12} = 0b000; // imm3 + let Inst{11-8} = 0b1111; // Rd + let Inst{7-6} = 0b00; // imm2 + let Inst{5-4} = 0b00; // type + } + // shifted register + def t2CMNzrs : T2OneRegCmpShiftedReg< + (outs), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm), IIC_iCMPsi, + "cmn", ".w\t$Rn, $ShiftedRm", + [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))> + GPRnopc:$Rn, t2_so_reg:$ShiftedRm)]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = 0b1000; + let Inst{20} = 1; // The S bit. + let Inst{11-8} = 0b1111; // Rd + } +} -//def : T2Pat<(ARMcmp GPR:$src, t2_so_imm_neg:$imm), -// (t2CMNri GPR:$src, t2_so_imm_neg:$imm)>; +// Assembler aliases w/o the ".w" suffix. 
+// No alias here for 'rr' version as not all instantiations of this multiclass +// want one (CMP in particular, does not). +def : t2InstAlias<!strconcat("cmn", "${p}", " $Rn, $imm"), + (!cast<Instruction>(!strconcat("t2CMN", "ri")) GPRnopc:$Rn, + t2_so_imm:$imm, pred:$p)>; +def : t2InstAlias<!strconcat("cmn", "${p}", " $Rn, $shift"), + (!cast<Instruction>(!strconcat("t2CMNz", "rs")) GPRnopc:$Rn, + t2_so_reg:$shift, + pred:$p)>; -def : T2Pat<(ARMcmpZ GPRnopc:$src, t2_so_imm_neg:$imm), - (t2CMNzri GPRnopc:$src, t2_so_imm_neg:$imm)>; +def : T2Pat<(ARMcmp GPR:$src, t2_so_imm_neg:$imm), + (t2CMNri GPR:$src, t2_so_imm_neg:$imm)>; + +def : T2Pat<(ARMcmpZ GPRnopc:$src, t2_so_imm_neg:$imm), + (t2CMNri GPRnopc:$src, t2_so_imm_neg:$imm)>; defm t2TST : T2I_cmp_irs<0b0000, "tst", IIC_iTSTi, IIC_iTSTr, IIC_iTSTsi, @@ -3017,7 +3099,7 @@ def t2DSB : AInoP<(outs), (ins memb_opt:$opt), ThumbFrm, NoItinerary, def t2ISB : AInoP<(outs), (ins memb_opt:$opt), ThumbFrm, NoItinerary, "isb", "\t$opt", - []>, Requires<[IsThumb2, HasDB]> { + []>, Requires<[IsThumb, HasDB]> { bits<4> opt; let Inst{31-4} = 0xf3bf8f6; let Inst{3-0} = opt; @@ -3271,7 +3353,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { // IOS version. let Uses = [SP] in def tTAILJMPd: tPseudoExpand<(outs), - (ins uncondbrtarget:$dst, pred:$p, variable_ops), + (ins uncondbrtarget:$dst, pred:$p), 4, IIC_Br, [], (t2B uncondbrtarget:$dst, pred:$p)>, Requires<[IsThumb2, IsIOS]>; @@ -3281,7 +3363,7 @@ let isCall = 1, Defs = [LR], Uses = [SP] in { // mov lr, pc; b if callee is marked noreturn to avoid confusing the // return stack predictor. def t2BMOVPCB_CALL : tPseudoInst<(outs), - (ins t_bltarget:$func, variable_ops), + (ins t_bltarget:$func), 6, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>, Requires<[IsThumb]>; } @@ -3382,21 +3464,18 @@ let imod = 0, iflags = 0, M = 1 in // A6.3.4 Branches and miscellaneous control // Table A6-14 Change Processor State, and hint instructions -class T2I_hint<bits<8> op7_0, string opc, string asm> - : T2I<(outs), (ins), NoItinerary, opc, asm, []> { - let Inst{31-20} = 0xf3a; - let Inst{19-16} = 0b1111; - let Inst{15-14} = 0b10; - let Inst{12} = 0; - let Inst{10-8} = 0b000; - let Inst{7-0} = op7_0; +def t2HINT : T2I<(outs), (ins imm0_255:$imm), NoItinerary, "hint", "\t$imm",[]>{ + bits<8> imm; + let Inst{31-8} = 0b111100111010111110000000; + let Inst{7-0} = imm; } -def t2NOP : T2I_hint<0b00000000, "nop", ".w">; -def t2YIELD : T2I_hint<0b00000001, "yield", ".w">; -def t2WFE : T2I_hint<0b00000010, "wfe", ".w">; -def t2WFI : T2I_hint<0b00000011, "wfi", ".w">; -def t2SEV : T2I_hint<0b00000100, "sev", ".w">; +def : t2InstAlias<"hint$p.w $imm", (t2HINT imm0_255:$imm, pred:$p)>; +def : t2InstAlias<"nop$p.w", (t2HINT 0, pred:$p)>; +def : t2InstAlias<"yield$p.w", (t2HINT 1, pred:$p)>; +def : t2InstAlias<"wfe$p.w", (t2HINT 2, pred:$p)>; +def : t2InstAlias<"wfi$p.w", (t2HINT 3, pred:$p)>; +def : t2InstAlias<"sev$p.w", (t2HINT 4, pred:$p)>; def t2DBG : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "dbg", "\t$opt", []> { bits<4> opt; @@ -3622,8 +3701,8 @@ defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l">; // A/R class MRS. // // A/R class can only move from CPSR or SPSR. 
-def t2MRS_AR : T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, apsr", []>, - Requires<[IsThumb2,IsARClass]> { +def t2MRS_AR : T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, apsr", + []>, Requires<[IsThumb2,IsARClass]> { bits<4> Rd; let Inst{31-12} = 0b11110011111011111000; let Inst{11-8} = Rd; @@ -3632,8 +3711,8 @@ def t2MRS_AR : T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, apsr", []> def : t2InstAlias<"mrs${p} $Rd, cpsr", (t2MRS_AR GPR:$Rd, pred:$p)>; -def t2MRSsys_AR: T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, spsr", []>, - Requires<[IsThumb2,IsARClass]> { +def t2MRSsys_AR: T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, spsr", + []>, Requires<[IsThumb2,IsARClass]> { bits<4> Rd; let Inst{31-12} = 0b11110011111111111000; let Inst{11-8} = Rd; @@ -3646,7 +3725,7 @@ def t2MRSsys_AR: T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, spsr", [ // the A/R class (a full msr_mask). def t2MRS_M : T2I<(outs rGPR:$Rd), (ins msr_mask:$mask), NoItinerary, "mrs", "\t$Rd, $mask", []>, - Requires<[IsThumb2,IsMClass]> { + Requires<[IsThumb,IsMClass]> { bits<4> Rd; bits<8> mask; let Inst{31-12} = 0b11110011111011111000; @@ -3682,14 +3761,14 @@ def t2MSR_AR : T2I<(outs), (ins msr_mask:$mask, rGPR:$Rn), // Move from ARM core register to Special Register def t2MSR_M : T2I<(outs), (ins msr_mask:$SYSm, rGPR:$Rn), NoItinerary, "msr", "\t$SYSm, $Rn", []>, - Requires<[IsThumb2,IsMClass]> { - bits<8> SYSm; + Requires<[IsThumb,IsMClass]> { + bits<12> SYSm; bits<4> Rn; let Inst{31-21} = 0b11110011100; let Inst{20} = 0b0; let Inst{19-16} = Rn; let Inst{15-12} = 0b1000; - let Inst{7-0} = SYSm; + let Inst{11-0} = SYSm; } @@ -3969,6 +4048,17 @@ def : t2InstAlias<"add${s}${p} $Rdn, $imm", def : t2InstAlias<"add${p} $Rdn, $imm", (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095_neg:$imm, pred:$p)>; +def : t2InstAlias<"add${s}${p}.w $Rd, $Rn, $imm", + (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, pred:$p, + cc_out:$s)>; +def : t2InstAlias<"addw${p} $Rd, $Rn, $imm", + (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>; +def : t2InstAlias<"add${s}${p}.w $Rdn, $imm", + (t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm_neg:$imm, pred:$p, + cc_out:$s)>; +def : t2InstAlias<"addw${p} $Rdn, $imm", + (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095_neg:$imm, pred:$p)>; + // Aliases for SUB without the ".w" optional width specifier. def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $imm", @@ -4002,9 +4092,9 @@ def : t2InstAlias<"tst${p} $Rn, $Rm", (t2TSTrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>; // Memory barriers -def : InstAlias<"dmb", (t2DMB 0xf)>, Requires<[IsThumb2, HasDB]>; -def : InstAlias<"dsb", (t2DSB 0xf)>, Requires<[IsThumb2, HasDB]>; -def : InstAlias<"isb", (t2ISB 0xf)>, Requires<[IsThumb2, HasDB]>; +def : InstAlias<"dmb", (t2DMB 0xf)>, Requires<[IsThumb, HasDB]>; +def : InstAlias<"dsb", (t2DSB 0xf)>, Requires<[IsThumb, HasDB]>; +def : InstAlias<"isb", (t2ISB 0xf)>, Requires<[IsThumb, HasDB]>; // Alias for LDR, LDRB, LDRH, LDRSB, and LDRSH without the ".w" optional // width specifier. 
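(The add/addw aliases just above all encode one idea: an add whose immediate is negative, but whose negation is encodable, is accepted and emitted as the corresponding sub; the next hunk does the same for cmp/cmn. Below is a toy sketch of that rewrite with invented names, not the real AsmParser, which reaches the same result through the TableGen-generated alias matcher.)

#include <cstdint>
#include <cstdio>
#include <string>

struct Insn {
  std::string mnemonic;
  int64_t imm;
};

// Fold a negative immediate into the opposite opcode, the way the
// t2InstAlias definitions map "add ... #-N" onto t2SUBri/t2SUBri12.
Insn normalizeAddSub(Insn in) {
  if (in.imm < 0) {
    if (in.mnemonic == "add") return {"sub", -in.imm};
    if (in.mnemonic == "sub") return {"add", -in.imm};
  }
  return in;
}

int main() {
  Insn out = normalizeAddSub({"add", -1024});
  printf("%s #%lld\n", out.mnemonic.c_str(), (long long)out.imm); // sub #1024
  return 0;
}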
@@ -4213,7 +4303,7 @@ def : t2InstAlias<"add${s}${p} $Rd, $imm", pred:$p, cc_out:$s)>; // Same for CMP <--> CMN via t2_so_imm_neg def : t2InstAlias<"cmp${p} $Rd, $imm", - (t2CMNzri rGPR:$Rd, t2_so_imm_neg:$imm, pred:$p)>; + (t2CMNri rGPR:$Rd, t2_so_imm_neg:$imm, pred:$p)>; def : t2InstAlias<"cmn${p} $Rd, $imm", (t2CMPri rGPR:$Rd, t2_so_imm_neg:$imm, pred:$p)>; diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index 3600b88..23c132e 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -221,11 +221,13 @@ defm : VFPDTAnyInstAlias<"vpop${p}", "$r", // FP Binary Operations. // +let TwoOperandAliasConstraint = "$Dn = $Dd" in def VADDD : ADbI<0b11100, 0b11, 0, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpALU64, "vadd", ".f64\t$Dd, $Dn, $Dm", [(set DPR:$Dd, (fadd DPR:$Dn, (f64 DPR:$Dm)))]>; +let TwoOperandAliasConstraint = "$Sn = $Sd" in def VADDS : ASbIn<0b11100, 0b11, 0, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpALU32, "vadd", ".f32\t$Sd, $Sn, $Sm", @@ -235,11 +237,13 @@ def VADDS : ASbIn<0b11100, 0b11, 0, 0, let D = VFPNeonA8Domain; } +let TwoOperandAliasConstraint = "$Dn = $Dd" in def VSUBD : ADbI<0b11100, 0b11, 1, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpALU64, "vsub", ".f64\t$Dd, $Dn, $Dm", [(set DPR:$Dd, (fsub DPR:$Dn, (f64 DPR:$Dm)))]>; +let TwoOperandAliasConstraint = "$Sn = $Sd" in def VSUBS : ASbIn<0b11100, 0b11, 1, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpALU32, "vsub", ".f32\t$Sd, $Sn, $Sm", @@ -249,21 +253,25 @@ def VSUBS : ASbIn<0b11100, 0b11, 1, 0, let D = VFPNeonA8Domain; } +let TwoOperandAliasConstraint = "$Dn = $Dd" in def VDIVD : ADbI<0b11101, 0b00, 0, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpDIV64, "vdiv", ".f64\t$Dd, $Dn, $Dm", [(set DPR:$Dd, (fdiv DPR:$Dn, (f64 DPR:$Dm)))]>; +let TwoOperandAliasConstraint = "$Sn = $Sd" in def VDIVS : ASbI<0b11101, 0b00, 0, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpDIV32, "vdiv", ".f32\t$Sd, $Sn, $Sm", [(set SPR:$Sd, (fdiv SPR:$Sn, SPR:$Sm))]>; +let TwoOperandAliasConstraint = "$Dn = $Dd" in def VMULD : ADbI<0b11100, 0b10, 0, 0, (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), IIC_fpMUL64, "vmul", ".f64\t$Dd, $Dn, $Dm", [(set DPR:$Dd, (fmul DPR:$Dn, (f64 DPR:$Dm)))]>; +let TwoOperandAliasConstraint = "$Sn = $Sd" in def VMULS : ASbIn<0b11100, 0b10, 0, 0, (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), IIC_fpMUL32, "vmul", ".f32\t$Sd, $Sn, $Sm", @@ -559,8 +567,8 @@ def VMOVRRS : AVConv3I<0b11000101, 0b1010, bits<4> Rt2; // Encode instruction operands. - let Inst{3-0} = src1{3-0}; - let Inst{5} = src1{4}; + let Inst{3-0} = src1{4-1}; + let Inst{5} = src1{0}; let Inst{15-12} = Rt; let Inst{19-16} = Rt2; @@ -609,8 +617,8 @@ def VMOVSRR : AVConv5I<0b11000100, 0b1010, bits<4> src2; // Encode instruction operands. 
- let Inst{3-0} = dst1{3-0}; - let Inst{5} = dst1{4}; + let Inst{3-0} = dst1{4-1}; + let Inst{5} = dst1{0}; let Inst{15-12} = src1; let Inst{19-16} = src2; @@ -819,9 +827,9 @@ let Constraints = "$a = $dst" in { // FP to Fixed-Point: // Single Precision register -class AVConv1XInsS_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, bit op5, - dag oops, dag iops, InstrItinClass itin, string opc, string asm, - list<dag> pattern> +class AVConv1XInsS_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, + bit op5, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern> { bits<5> dst; // if dp_operation then UInt(D:Vd) else UInt(Vd:D); @@ -830,9 +838,9 @@ class AVConv1XInsS_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, bi } // Double Precision register -class AVConv1XInsD_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, bit op5, - dag oops, dag iops, InstrItinClass itin, string opc, string asm, - list<dag> pattern> +class AVConv1XInsD_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, + bit op5, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern> { bits<5> dst; // if dp_operation then UInt(D:Vd) else UInt(Vd:D); @@ -1081,10 +1089,11 @@ def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>; // Match @llvm.fma.* intrinsics -def : Pat<(f64 (fma DPR:$Ddin, DPR:$Dn, DPR:$Dm)), +// (fma x, y, z) -> (vfms z, x, y) +def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, DPR:$Ddin)), (VFMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4]>; -def : Pat<(f32 (fma SPR:$Sdin, SPR:$Sn, SPR:$Sm)), +def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, SPR:$Sdin)), (VFMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; @@ -1115,18 +1124,18 @@ def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>; // Match @llvm.fma.* intrinsics -// (fma (fneg x), y, z) -> (vfms x, y, z) -def : Pat<(f64 (fma (fneg DPR:$Ddin), DPR:$Dn, DPR:$Dm)), +// (fma (fneg x), y, z) -> (vfms z, x, y) +def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin)), (VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4]>; -def : Pat<(f32 (fma (fneg SPR:$Sdin), SPR:$Sn, SPR:$Sm)), +def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin)), (VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -// (fneg (fma x, (fneg y), z) -> (vfms x, y, z) -def : Pat<(fneg (f64 (fma DPR:$Ddin, (fneg DPR:$Dn), DPR:$Dm))), +// (fma x, (fneg y), z) -> (vfms z, x, y) +def : Pat<(f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin)), (VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4]>; -def : Pat<(fneg (f32 (fma SPR:$Sdin, (fneg SPR:$Sn), SPR:$Sm))), +def : Pat<(f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin)), (VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; @@ -1157,18 +1166,18 @@ def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin), Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>; // Match @llvm.fma.* intrinsics -// (fneg (fma x, y, z)) -> (vfnma x, y, z) -def : Pat<(fneg (fma (f64 DPR:$Ddin), (f64 DPR:$Dn), (f64 DPR:$Dm))), +// (fneg (fma x, y, z)) -> (vfnma z, x, y) +def : Pat<(fneg (fma (f64 DPR:$Dn), (f64 DPR:$Dm), (f64 DPR:$Ddin))), (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4]>; -def : Pat<(fneg (fma (f32 SPR:$Sdin), (f32 SPR:$Sn), (f32 SPR:$Sm))), +def : Pat<(fneg (fma (f32 SPR:$Sn), (f32 
SPR:$Sm), (f32 SPR:$Sdin))), (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -// (fma (fneg x), y, (fneg z)) -> (vfnma x, y, z) -def : Pat<(f64 (fma (fneg DPR:$Ddin), DPR:$Dn, (fneg DPR:$Dm))), +// (fma (fneg x), y, (fneg z)) -> (vfnma z, x, y) +def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))), (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4]>; -def : Pat<(f32 (fma (fneg SPR:$Sdin), SPR:$Sn, (fneg SPR:$Sm))), +def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, (fneg SPR:$Sdin))), (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; @@ -1198,18 +1207,26 @@ def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin), Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>; // Match @llvm.fma.* intrinsics -// (fneg (fma (fneg x), y, z)) -> (vnfms x, y, z) -def : Pat<(fneg (f64 (fma (fneg DPR:$Ddin), DPR:$Dn, DPR:$Dm))), + +// (fma x, y, (fneg z)) -> (vfnms z, x, y)) +def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, (fneg DPR:$Ddin))), + (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, + Requires<[HasVFP4]>; +def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, (fneg SPR:$Sdin))), + (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, + Requires<[HasVFP4]>; +// (fneg (fma (fneg x), y, z)) -> (vfnms z, x, y) +def : Pat<(fneg (f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))), (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4]>; -def : Pat<(fneg (f32 (fma (fneg SPR:$Sdin), SPR:$Sn, SPR:$Sm))), +def : Pat<(fneg (f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))), (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -// (fma x, (fneg y), z) -> (vnfms x, y, z) -def : Pat<(f64 (fma DPR:$Ddin, (fneg DPR:$Dn), DPR:$Dm)), +// (fneg (fma x, (fneg y), z) -> (vfnms z, x, y) +def : Pat<(fneg (f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin))), (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, Requires<[HasVFP4]>; -def : Pat<(f32 (fma SPR:$Sdin, (fneg SPR:$Sn), SPR:$Sm)), +def : Pat<(fneg (f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin))), (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; @@ -1426,22 +1443,6 @@ def : VFP2InstAlias<"vldr${p}.64 $Dd, $addr", def : VFP2InstAlias<"vstr${p}.64 $Dd, $addr", (VSTRD DPR:$Dd, addrmode5:$addr, pred:$p)>; -// VMUL has a two-operand form (implied destination operand) -def : VFP2InstAlias<"vmul${p}.f64 $Dn, $Dm", - (VMULD DPR:$Dn, DPR:$Dn, DPR:$Dm, pred:$p)>; -def : VFP2InstAlias<"vmul${p}.f32 $Sn, $Sm", - (VMULS SPR:$Sn, SPR:$Sn, SPR:$Sm, pred:$p)>; -// VADD has a two-operand form (implied destination operand) -def : VFP2InstAlias<"vadd${p}.f64 $Dn, $Dm", - (VADDD DPR:$Dn, DPR:$Dn, DPR:$Dm, pred:$p)>; -def : VFP2InstAlias<"vadd${p}.f32 $Sn, $Sm", - (VADDS SPR:$Sn, SPR:$Sn, SPR:$Sm, pred:$p)>; -// VSUB has a two-operand form (implied destination operand) -def : VFP2InstAlias<"vsub${p}.f64 $Dn, $Dm", - (VSUBD DPR:$Dn, DPR:$Dn, DPR:$Dm, pred:$p)>; -def : VFP2InstAlias<"vsub${p}.f32 $Sn, $Sm", - (VSUBS SPR:$Sn, SPR:$Sn, SPR:$Sm, pred:$p)>; - // VMOV can accept optional 32-bit or less data type suffix suffix. 
def : VFP2InstAlias<"vmov${p}.8 $Rt, $Sn", (VMOVRS GPR:$Rt, SPR:$Sn, pred:$p)>; diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 9ef2ace..cb1b2a2 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -1177,8 +1177,6 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, BaseReg, false, BaseUndef, false, OffUndef, Pred, PredReg, TII, isT2); NewBBI = llvm::prior(MBBI); - if (isT2 && NewOpc == ARM::t2LDRi8 && OffImm+4 >= 0) - NewOpc = ARM::t2LDRi12; InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, EvenReg, EvenDeadKill, false, BaseReg, BaseKill, BaseUndef, OffKill, OffUndef, @@ -1326,7 +1324,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { // First advance to the instruction just before the start of the chain. AdvanceRS(MBB, MemOps); // Find a scratch register. - unsigned Scratch = RS->FindUnusedReg(ARM::GPRRegisterClass); + unsigned Scratch = RS->FindUnusedReg(&ARM::GPRRegClass); // Process the load / store instructions. RS->forward(prior(MBBI)); @@ -1739,7 +1737,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, Ops.pop_back(); const MCInstrDesc &MCID = TII->get(NewOpc); - const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI); + const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF); MRI->constrainRegClass(EvenReg, TRC); MRI->constrainRegClass(OddReg, TRC); diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td index 1466e98..3857647 100644 --- a/lib/Target/ARM/ARMRegisterInfo.td +++ b/lib/Target/ARM/ARMRegisterInfo.td @@ -267,21 +267,16 @@ def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64, // Subset of DPR that are accessible with VFP2 (and so that also have // 32-bit SPR subregs). def DPR_VFP2 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64, - (trunc DPR, 16)> { - let SubRegClasses = [(SPR ssub_0, ssub_1)]; -} + (trunc DPR, 16)>; // Subset of DPR which can be used as a source of NEON scalars for 16-bit // operations def DPR_8 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64, - (trunc DPR, 8)> { - let SubRegClasses = [(SPR_8 ssub_0, ssub_1)]; -} + (trunc DPR, 8)>; // Generic 128-bit vector register class. def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128, (sequence "Q%u", 0, 15)> { - let SubRegClasses = [(DPR dsub_0, dsub_1)]; // Allocate non-VFP2 aliases Q8-Q15 first. let AltOrders = [(rotl QPR, 8)]; let AltOrderSelect = [{ return 1; }]; @@ -289,17 +284,11 @@ def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128, // Subset of QPR that have 32-bit SPR subregs. def QPR_VFP2 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - 128, (trunc QPR, 8)> { - let SubRegClasses = [(SPR ssub_0, ssub_1, ssub_2, ssub_3), - (DPR_VFP2 dsub_0, dsub_1)]; -} + 128, (trunc QPR, 8)>; // Subset of QPR that have DPR_8 and SPR_8 subregs. def QPR_8 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - 128, (trunc QPR, 4)> { - let SubRegClasses = [(SPR_8 ssub_0, ssub_1, ssub_2, ssub_3), - (DPR_8 dsub_0, dsub_1)]; -} + 128, (trunc QPR, 4)>; // Pseudo-registers representing odd-even pairs of D registers. The even-odd // pairs are already represented by the Q registers. 
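Note on the @llvm.fma.* pattern rewrites in ARMInstrVFP.td above: the intrinsic fma(a, b, c) computes a*b + c, while VFMAD/VFMAS take the accumulator in the tied destination operand, i.e. VFMAD $Ddin, $Dn, $Dm computes Ddin + Dn*Dm. The old patterns bound the first intrinsic operand to $Ddin, treating a multiplicand as the accumulator; the rewritten patterns bind c to $Ddin and a, b to $Dn, $Dm. A minimal C++ sketch of the two semantics being matched up (illustrative only, not LLVM code):

  #include <cmath>

  // What the intrinsic promises: a*b + c.
  double fma_intrinsic(double a, double b, double c) { return std::fma(a, b, c); }

  // What VFMAD $Ddin, $Dn, $Dm computes: the accumulator is the tied destination.
  double vfmad(double Ddin, double Dn, double Dm) { return Ddin + Dn * Dm; }

  // Hence (fma a, b, c) must lower to vfmad(c, a, b), which is exactly the
  // operand order the corrected patterns use:
  //   (fma DPR:$Dn, DPR:$Dm, DPR:$Ddin) -> (VFMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)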
@@ -338,8 +327,6 @@ def Tuples2Q : RegisterTuples<[qsub_0, qsub_1], [(shl QPR, 0), (shl QPR, 1)]>; // Pseudo 256-bit vector register class to model pairs of Q registers // (4 consecutive D registers). def QQPR : RegisterClass<"ARM", [v4i64], 256, (add Tuples2Q)> { - let SubRegClasses = [(DPR dsub_0, dsub_1, dsub_2, dsub_3), - (QPR qsub_0, qsub_1)]; // Allocate non-VFP2 aliases first. let AltOrders = [(rotl QQPR, 8)]; let AltOrderSelect = [{ return 1; }]; @@ -363,9 +350,6 @@ def Tuples2QQ : RegisterTuples<[qqsub_0, qqsub_1], // Pseudo 512-bit vector register class to model 4 consecutive Q registers // (8 consecutive D registers). def QQQQPR : RegisterClass<"ARM", [v8i64], 256, (add Tuples2QQ)> { - let SubRegClasses = [(DPR dsub_0, dsub_1, dsub_2, dsub_3, - dsub_4, dsub_5, dsub_6, dsub_7), - (QPR qsub_0, qsub_1, qsub_2, qsub_3)]; // Allocate non-VFP2 aliases first. let AltOrders = [(rotl QQQQPR, 8)]; let AltOrderSelect = [{ return 1; }]; diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td index 45486fd..81d2fa3 100644 --- a/lib/Target/ARM/ARMSchedule.td +++ b/lib/Target/ARM/ARMSchedule.td @@ -70,11 +70,11 @@ def IIC_iLoad_bh_siu : InstrItinClass; def IIC_iLoad_d_i : InstrItinClass; def IIC_iLoad_d_r : InstrItinClass; def IIC_iLoad_d_ru : InstrItinClass; -def IIC_iLoad_m : InstrItinClass<0>; // micro-coded -def IIC_iLoad_mu : InstrItinClass<0>; // micro-coded -def IIC_iLoad_mBr : InstrItinClass<0>; // micro-coded -def IIC_iPop : InstrItinClass<0>; // micro-coded -def IIC_iPop_Br : InstrItinClass<0>; // micro-coded +def IIC_iLoad_m : InstrItinClass; +def IIC_iLoad_mu : InstrItinClass; +def IIC_iLoad_mBr : InstrItinClass; +def IIC_iPop : InstrItinClass; +def IIC_iPop_Br : InstrItinClass; def IIC_iLoadiALU : InstrItinClass; def IIC_iStore_i : InstrItinClass; def IIC_iStore_r : InstrItinClass; @@ -91,8 +91,8 @@ def IIC_iStore_bh_siu : InstrItinClass; def IIC_iStore_d_i : InstrItinClass; def IIC_iStore_d_r : InstrItinClass; def IIC_iStore_d_ru : InstrItinClass; -def IIC_iStore_m : InstrItinClass<0>; // micro-coded -def IIC_iStore_mu : InstrItinClass<0>; // micro-coded +def IIC_iStore_m : InstrItinClass; +def IIC_iStore_mu : InstrItinClass; def IIC_Preload : InstrItinClass; def IIC_Br : InstrItinClass; def IIC_fpSTAT : InstrItinClass; @@ -126,12 +126,12 @@ def IIC_fpSQRT32 : InstrItinClass; def IIC_fpSQRT64 : InstrItinClass; def IIC_fpLoad32 : InstrItinClass; def IIC_fpLoad64 : InstrItinClass; -def IIC_fpLoad_m : InstrItinClass<0>; // micro-coded -def IIC_fpLoad_mu : InstrItinClass<0>; // micro-coded +def IIC_fpLoad_m : InstrItinClass; +def IIC_fpLoad_mu : InstrItinClass; def IIC_fpStore32 : InstrItinClass; def IIC_fpStore64 : InstrItinClass; -def IIC_fpStore_m : InstrItinClass<0>; // micro-coded -def IIC_fpStore_mu : InstrItinClass<0>; // micro-coded +def IIC_fpStore_m : InstrItinClass; +def IIC_fpStore_mu : InstrItinClass; def IIC_VLD1 : InstrItinClass; def IIC_VLD1x2 : InstrItinClass; def IIC_VLD1x3 : InstrItinClass; @@ -258,8 +258,6 @@ def IIC_VTBX4 : InstrItinClass; //===----------------------------------------------------------------------===// // Processor instruction itineraries. 
-def GenericItineraries : ProcessorItineraries<[], [], []>; - include "ARMScheduleV6.td" include "ARMScheduleA8.td" include "ARMScheduleA9.td" diff --git a/lib/Target/ARM/ARMScheduleA8.td b/lib/Target/ARM/ARMScheduleA8.td index 8b1fb93..56197d4 100644 --- a/lib/Target/ARM/ARMScheduleA8.td +++ b/lib/Target/ARM/ARMScheduleA8.td @@ -151,28 +151,30 @@ def CortexA8Itineraries : ProcessorItineraries< // Load multiple, def is the 5th operand. Pipeline 0 only. // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers. InstrItinData<IIC_iLoad_m , [InstrStage<2, [A8_Pipe0], 0>, - InstrStage<2, [A8_LSPipe]>], [1, 1, 1, 1, 3]>, + InstrStage<2, [A8_LSPipe]>], + [1, 1, 1, 1, 3], [], -1>, // dynamic uops // // Load multiple + update, defs are the 1st and 5th operands. InstrItinData<IIC_iLoad_mu , [InstrStage<3, [A8_Pipe0], 0>, - InstrStage<3, [A8_LSPipe]>], [2, 1, 1, 1, 3]>, + InstrStage<3, [A8_LSPipe]>], + [2, 1, 1, 1, 3], [], -1>, // dynamic uops // // Load multiple plus branch InstrItinData<IIC_iLoad_mBr, [InstrStage<3, [A8_Pipe0], 0>, InstrStage<3, [A8_LSPipe]>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>], - [1, 2, 1, 1, 3]>, + [1, 2, 1, 1, 3], [], -1>, // dynamic uops // // Pop, def is the 3rd operand. InstrItinData<IIC_iPop , [InstrStage<3, [A8_Pipe0], 0>, - InstrStage<3, [A8_LSPipe]>], [1, 1, 3]>, + InstrStage<3, [A8_LSPipe]>], + [1, 1, 3], [], -1>, // dynamic uops // // Push, def is the 3th operand. InstrItinData<IIC_iPop_Br, [InstrStage<3, [A8_Pipe0], 0>, InstrStage<3, [A8_LSPipe]>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>], - [1, 1, 3]>, - + [1, 1, 3], [], -1>, // dynamic uops // // iLoadi + iALUr for t2LDRpci_pic. InstrItinData<IIC_iLoadiALU, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, @@ -227,12 +229,13 @@ def CortexA8Itineraries : ProcessorItineraries< // Store multiple. Pipeline 0 only. // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers. 
InstrItinData<IIC_iStore_m , [InstrStage<2, [A8_Pipe0], 0>, - InstrStage<2, [A8_LSPipe]>]>, + InstrStage<2, [A8_LSPipe]>], + [], [], -1>, // dynamic uops // // Store multiple + update InstrItinData<IIC_iStore_mu, [InstrStage<2, [A8_Pipe0], 0>, - InstrStage<2, [A8_LSPipe]>], [2]>, - + InstrStage<2, [A8_LSPipe]>], + [2], [], -1>, // dynamic uops // // Preload InstrItinData<IIC_Preload, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, @@ -393,14 +396,16 @@ def CortexA8Itineraries : ProcessorItineraries< InstrStage<1, [A8_NLSPipe], 0>, InstrStage<1, [A8_LSPipe]>, InstrStage<1, [A8_NLSPipe], 0>, - InstrStage<1, [A8_LSPipe]>], [1, 1, 1, 2]>, + InstrStage<1, [A8_LSPipe]>], + [1, 1, 1, 2], [], -1>, // dynamic uops // // FP Load Multiple + update InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NLSPipe], 0>, InstrStage<1, [A8_LSPipe]>, InstrStage<1, [A8_NLSPipe], 0>, - InstrStage<1, [A8_LSPipe]>], [2, 1, 1, 1, 2]>, + InstrStage<1, [A8_LSPipe]>], + [2, 1, 1, 1, 2], [], -1>, // dynamic uops // // Single-precision FP Store InstrItinData<IIC_fpStore32,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, @@ -419,15 +424,16 @@ def CortexA8Itineraries : ProcessorItineraries< InstrStage<1, [A8_NLSPipe], 0>, InstrStage<1, [A8_LSPipe]>, InstrStage<1, [A8_NLSPipe], 0>, - InstrStage<1, [A8_LSPipe]>], [1, 1, 1, 1]>, + InstrStage<1, [A8_LSPipe]>], + [1, 1, 1, 1], [], -1>, // dynamic uops // // FP Store Multiple + update InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NLSPipe], 0>, InstrStage<1, [A8_LSPipe]>, InstrStage<1, [A8_NLSPipe], 0>, - InstrStage<1, [A8_LSPipe]>], [2, 1, 1, 1, 1]>, - + InstrStage<1, [A8_LSPipe]>], + [2, 1, 1, 1, 1], [], -1>, // dynamic uops // NEON // Issue through integer pipeline, and execute in NEON unit. // @@ -1051,3 +1057,18 @@ def CortexA8Itineraries : ProcessorItineraries< InstrStage<1, [A8_NPipe], 0>, InstrStage<2, [A8_NLSPipe]>], [4, 1, 2, 2, 3, 3, 1]> ]>; + +// ===---------------------------------------------------------------------===// +// This following definitions describe the simple machine model which +// will replace itineraries. + +// Cortex-A8 machine model for scheduling and other instruction cost heuristics. +def CortexA8Model : SchedMachineModel { + let IssueWidth = 2; // 2 micro-ops are dispatched per cycle. + let MinLatency = -1; // OperandCycles are interpreted as MinLatency. + let LoadLatency = 2; // Optimistic load latency assuming bypass. + // This is overriden by OperandCycles if the + // Itineraries are queried instead. + + let Itineraries = CortexA8Itineraries; +} diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td index 0d710cc..738974e 100644 --- a/lib/Target/ARM/ARMScheduleA9.td +++ b/lib/Target/ARM/ARMScheduleA9.td @@ -11,6 +11,10 @@ // //===----------------------------------------------------------------------===// +// ===---------------------------------------------------------------------===// +// This section contains legacy support for itineraries. This is +// required until SD and PostRA schedulers are replaced by MachineScheduler. + // // Ad-hoc scheduling information derived from pretty vague "Cortex-A9 Technical // Reference Manual". 
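Note on the itinerary entries above: InstrItinData now carries an optional trailing micro-op count, where -1 ("dynamic uops") marks load/store-multiple and push/pop style instructions whose micro-op count depends on the register list and so cannot be a table constant. A hedged C++ sketch of the convention (the helper below is hypothetical, not the LLVM API):

  // Resolve a micro-op count from an itinerary entry, assuming the table
  // convention above: a non-negative value is a fixed count, -1 means the
  // count is dynamic and must be derived from the instruction itself.
  int resolveNumMicroOps(int ItinUOps, unsigned RegListLen) {
    if (ItinUOps >= 0)
      return ItinUOps;                      // fixed count straight from the table
    // Hypothetical dynamic estimate for an LDM/STM-style instruction:
    // roughly one micro-op per transferred register.
    return static_cast<int>(RegListLen);
  }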
@@ -280,7 +284,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<2, [A9_AGU], 1>, InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1, 3], - [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>, + [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass], + -1>, // dynamic uops // // Load multiple + update, defs are the 1st and 5th operands. InstrItinData<IIC_iLoad_mu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, @@ -288,7 +293,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<2, [A9_AGU], 1>, InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1, 3], - [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>, + [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass], + -1>, // dynamic uops // // Load multiple plus branch InstrItinData<IIC_iLoad_mBr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, @@ -297,7 +303,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<2, [A9_LSUnit]>, InstrStage<1, [A9_Branch]>], [1, 2, 1, 1, 3], - [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>, + [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass], + -1>, // dynamic uops // // Pop, def is the 3rd operand. InstrItinData<IIC_iPop , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, @@ -305,7 +312,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<2, [A9_AGU], 1>, InstrStage<2, [A9_LSUnit]>], [1, 1, 3], - [NoBypass, NoBypass, A9_LdBypass]>, + [NoBypass, NoBypass, A9_LdBypass], + -1>, // dynamic uops // // Pop + branch, def is the 3rd operand. InstrItinData<IIC_iPop_Br, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, @@ -314,8 +322,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<2, [A9_LSUnit]>, InstrStage<1, [A9_Branch]>], [1, 1, 3], - [NoBypass, NoBypass, A9_LdBypass]>, - + [NoBypass, NoBypass, A9_LdBypass], + -1>, // dynamic uops // // iLoadi + iALUr for t2LDRpci_pic. InstrItinData<IIC_iLoadiALU, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, @@ -409,14 +417,15 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_iStore_m , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_AGU], 0>, - InstrStage<2, [A9_LSUnit]>]>, + InstrStage<2, [A9_LSUnit]>], + [], [], -1>, // dynamic uops // // Store multiple + update InstrItinData<IIC_iStore_mu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_AGU], 0>, - InstrStage<2, [A9_LSUnit]>], [2]>, - + InstrStage<2, [A9_LSUnit]>], + [2], [], -1>, // dynamic uops // // Preload InstrItinData<IIC_Preload, [InstrStage<1, [A9_Issue0, A9_Issue1]>], [1, 1]>, @@ -713,7 +722,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, InstrStage<1, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1]>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1], [], -1>, // dynamic uops // // FP Load Multiple + update // FIXME: assumes 2 doubles which requires 2 LS cycles. 
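The SchedMachineModel definitions bracketing these itineraries (CortexA8Model above, CortexA9Model below) make IssueWidth a declared property of the model rather than something computed at runtime from stage-1 units (see the removal of ARMSubtarget::computeIssueWidth below), and they encode different MinLatency conventions: -1 means the itinerary's OperandCycles are interpreted as minimum latencies (the in-order A8), while 0 allows dependent instructions within the same dispatch group (the A9). A small sketch of how a client might read the field (assumed convention per the comments in the definitions, not a real LLVM interface):

  // MinLatency interpretation, following the comments in the model defs.
  int minLatency(int ModelMinLatency, int OperandCycles) {
    if (ModelMinLatency < 0)
      return OperandCycles;     // -1: OperandCycles are the MinLatency
    return ModelMinLatency;     // 0: back-to-back issue within a group
  }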
@@ -722,7 +732,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, InstrStage<1, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1]>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1], [], -1>, // dynamic uops // // Single-precision FP Store InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, @@ -749,7 +760,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, InstrStage<1, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1]>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1], [], -1>, // dynamic uops // // FP Store Multiple + update // FIXME: assumes 2 doubles which requires 2 LS cycles. @@ -758,7 +770,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, InstrStage<1, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1]>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1], [], -1>, // dynamic uops // NEON // VLD1 InstrItinData<IIC_VLD1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, @@ -1861,3 +1874,21 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<2, [A9_NPipe]>], [4, 1, 2, 2, 3, 3, 1]> ]>; + +// ===---------------------------------------------------------------------===// +// This following definitions describe the simple machine model which +// will replace itineraries. + +// Cortex-A9 machine model for scheduling and other instruction cost heuristics. +def CortexA9Model : SchedMachineModel { + let IssueWidth = 2; // 2 micro-ops are dispatched per cycle. + let MinLatency = 0; // Data dependencies are allowed within dispatch groups. + let LoadLatency = 2; // Optimistic load latency assuming bypass. + // This is overriden by OperandCycles if the + // Itineraries are queried instead. + + let Itineraries = CortexA9Itineraries; +} + +// TODO: Add Cortex-A9 processor and scheduler resources. + diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp index e2530d0..31d5d38 100644 --- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -179,8 +179,7 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, Args.push_back(Entry); // Emit __eabi_memset call - std::pair<SDValue,SDValue> CallResult = - TLI.LowerCallTo(Chain, + TargetLowering::CallLoweringInfo CLI(Chain, Type::getVoidTy(*DAG.getContext()), // return type false, // return sign ext false, // return zero ext @@ -193,7 +192,9 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, false, // is return val used DAG.getExternalSymbol(TLI.getLibcallName(RTLIB::MEMSET), TLI.getPointerTy()), // callee - Args, DAG, dl); // arg list, DAG and debug + Args, DAG, dl); + std::pair<SDValue,SDValue> CallResult = + TLI.LowerCallTo(CLI); return CallResult.second; } diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index ca172ed..e067a9f 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -67,6 +67,7 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU, , HasDataBarrier(false) , Pref32BitThumb(false) , AvoidCPSRPartialUpdate(false) + , HasRAS(false) , HasMPExtension(false) , FPOnlySP(false) , AllowsUnalignedMem(false) @@ -82,7 +83,7 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU, // Insert the architecture feature derived from the target triple into the // feature string. 
This is important for setting features that are implied // based on the architecture version. - std::string ArchFS = ARM_MC::ParseARMTriple(TT); + std::string ArchFS = ARM_MC::ParseARMTriple(TT, CPUString); if (!FS.empty()) { if (!ArchFS.empty()) ArchFS = ArchFS + "," + FS; @@ -99,10 +100,7 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU, // Initialize scheduling itinerary for the specified CPU. InstrItins = getInstrItineraryForCPU(CPUString); - // After parsing Itineraries, set ItinData.IssueWidth. - computeIssueWidth(); - - if (TT.find("eabi") != std::string::npos) + if ((TT.find("eabi") != std::string::npos) || (isTargetIOS() && isMClass())) // FIXME: We might want to separate AAPCS and EABI. Some systems, e.g. // Darwin-EABI conforms to AACPS but not the rest of EABI. TargetABI = ARM_ABI_AAPCS; @@ -192,22 +190,6 @@ unsigned ARMSubtarget::getMispredictionPenalty() const { return 10; } -void ARMSubtarget::computeIssueWidth() { - unsigned allStage1Units = 0; - for (const InstrItinerary *itin = InstrItins.Itineraries; - itin->FirstStage != ~0U; ++itin) { - const InstrStage *IS = InstrItins.Stages + itin->FirstStage; - allStage1Units |= IS->getUnits(); - } - InstrItins.IssueWidth = 0; - while (allStage1Units) { - ++InstrItins.IssueWidth; - // clear the lowest bit - allStage1Units ^= allStage1Units & ~(allStage1Units - 1); - } - assert(InstrItins.IssueWidth <= 2 && "itinerary bug, too many stage 1 units"); -} - bool ARMSubtarget::enablePostRAScheduler( CodeGenOpt::Level OptLevel, TargetSubtargetInfo::AntiDepBreakMode& Mode, diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index 047efc2..171c9ad 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -136,22 +136,22 @@ TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) { bool ARMPassConfig::addPreISel() { if (TM->getOptLevel() != CodeGenOpt::None && EnableGlobalMerge) - PM.add(createGlobalMergePass(TM->getTargetLowering())); + addPass(createGlobalMergePass(TM->getTargetLowering())); return false; } bool ARMPassConfig::addInstSelector() { - PM.add(createARMISelDag(getARMTargetMachine(), getOptLevel())); + addPass(createARMISelDag(getARMTargetMachine(), getOptLevel())); return false; } bool ARMPassConfig::addPreRegAlloc() { // FIXME: temporarily disabling load / store optimization pass for Thumb1. if (getOptLevel() != CodeGenOpt::None && !getARMSubtarget().isThumb1Only()) - PM.add(createARMLoadStoreOptimizationPass(true)); + addPass(createARMLoadStoreOptimizationPass(true)); if (getOptLevel() != CodeGenOpt::None && getARMSubtarget().isCortexA9()) - PM.add(createMLxExpansionPass()); + addPass(createMLxExpansionPass()); return true; } @@ -159,23 +159,23 @@ bool ARMPassConfig::addPreSched2() { // FIXME: temporarily disabling load / store optimization pass for Thumb1. if (getOptLevel() != CodeGenOpt::None) { if (!getARMSubtarget().isThumb1Only()) { - PM.add(createARMLoadStoreOptimizationPass()); + addPass(createARMLoadStoreOptimizationPass()); printAndVerify("After ARM load / store optimizer"); } if (getARMSubtarget().hasNEON()) - PM.add(createExecutionDependencyFixPass(&ARM::DPRRegClass)); + addPass(createExecutionDependencyFixPass(&ARM::DPRRegClass)); } // Expand some pseudo instructions into multiple instructions to allow // proper scheduling. 
- PM.add(createARMExpandPseudoPass()); + addPass(createARMExpandPseudoPass()); if (getOptLevel() != CodeGenOpt::None) { if (!getARMSubtarget().isThumb1Only()) - addPass(IfConverterID); + addPass(&IfConverterID); } if (getARMSubtarget().isThumb2()) - PM.add(createThumb2ITBlockPass()); + addPass(createThumb2ITBlockPass()); return true; } @@ -183,13 +183,13 @@ bool ARMPassConfig::addPreSched2() { bool ARMPassConfig::addPreEmitPass() { if (getARMSubtarget().isThumb2()) { if (!getARMSubtarget().prefers32BitThumb()) - PM.add(createThumb2SizeReductionPass()); + addPass(createThumb2SizeReductionPass()); // Constant island pass work on unbundled instructions. - addPass(UnpackMachineBundlesID); + addPass(&UnpackMachineBundlesID); } - PM.add(createARMConstantIslandPass()); + addPass(createARMConstantIslandPass()); return true; } diff --git a/lib/Target/ARM/ARMTargetObjectFile.cpp b/lib/Target/ARM/ARMTargetObjectFile.cpp index a5ea1c2..3d85ca7 100644 --- a/lib/Target/ARM/ARMTargetObjectFile.cpp +++ b/lib/Target/ARM/ARMTargetObjectFile.cpp @@ -24,20 +24,11 @@ using namespace dwarf; void ARMElfTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { + bool isAAPCS_ABI = TM.getSubtarget<ARMSubtarget>().isAAPCS_ABI(); TargetLoweringObjectFileELF::Initialize(Ctx, TM); - isAAPCS_ABI = TM.getSubtarget<ARMSubtarget>().isAAPCS_ABI(); + InitializeELF(isAAPCS_ABI); if (isAAPCS_ABI) { - StaticCtorSection = - getContext().getELFSection(".init_array", ELF::SHT_INIT_ARRAY, - ELF::SHF_WRITE | - ELF::SHF_ALLOC, - SectionKind::getDataRel()); - StaticDtorSection = - getContext().getELFSection(".fini_array", ELF::SHT_FINI_ARRAY, - ELF::SHF_WRITE | - ELF::SHF_ALLOC, - SectionKind::getDataRel()); LSDASection = NULL; } @@ -47,33 +38,3 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx, 0, SectionKind::getMetadata()); } - -const MCSection * -ARMElfTargetObjectFile::getStaticCtorSection(unsigned Priority) const { - if (!isAAPCS_ABI) - return TargetLoweringObjectFileELF::getStaticCtorSection(Priority); - - if (Priority == 65535) - return StaticCtorSection; - - // Emit ctors in priority order. - std::string Name = std::string(".init_array.") + utostr(Priority); - return getContext().getELFSection(Name, ELF::SHT_INIT_ARRAY, - ELF::SHF_ALLOC | ELF::SHF_WRITE, - SectionKind::getDataRel()); -} - -const MCSection * -ARMElfTargetObjectFile::getStaticDtorSection(unsigned Priority) const { - if (!isAAPCS_ABI) - return TargetLoweringObjectFileELF::getStaticDtorSection(Priority); - - if (Priority == 65535) - return StaticDtorSection; - - // Emit dtors in priority order. 
- std::string Name = std::string(".fini_array.") + utostr(Priority); - return getContext().getELFSection(Name, ELF::SHT_FINI_ARRAY, - ELF::SHF_ALLOC | ELF::SHF_WRITE, - SectionKind::getDataRel()); -} diff --git a/lib/Target/ARM/ARMTargetObjectFile.h b/lib/Target/ARM/ARMTargetObjectFile.h index ff21060..c6a7261 100644 --- a/lib/Target/ARM/ARMTargetObjectFile.h +++ b/lib/Target/ARM/ARMTargetObjectFile.h @@ -20,7 +20,6 @@ class TargetMachine; class ARMElfTargetObjectFile : public TargetLoweringObjectFileELF { protected: const MCSection *AttributesSection; - bool isAAPCS_ABI; public: ARMElfTargetObjectFile() : TargetLoweringObjectFileELF(), @@ -32,9 +31,6 @@ public: virtual const MCSection *getAttributesSection() const { return AttributesSection; } - - const MCSection * getStaticCtorSection(unsigned Priority) const; - const MCSection * getStaticDtorSection(unsigned Priority) const; }; } // end namespace llvm diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 2c53e3f..4497720 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -236,7 +236,10 @@ public: Match_RequiresITBlock = FIRST_TARGET_MATCH_RESULT_TY, Match_RequiresNotITBlock, Match_RequiresV6, - Match_RequiresThumb2 + Match_RequiresThumb2, +#define GET_OPERAND_DIAGNOSTIC_TYPES +#include "ARMGenAsmMatcher.inc" + }; ARMAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser) @@ -914,7 +917,9 @@ public: // Immediate offset in range [-255, 255]. if (!Memory.OffsetImm) return true; int64_t Val = Memory.OffsetImm->getValue(); - return Val > -256 && Val < 256; + // The #-0 offset is encoded as INT32_MIN, and we have to check + // for this too. + return (Val > -256 && Val < 256) || Val == INT32_MIN; } bool isAM3Offset() const { if (Kind != k_Immediate && Kind != k_PostIndexRegister) @@ -1446,8 +1451,10 @@ public: assert(isRegShiftedImm() && "addRegShiftedImmOperands() on non RegShiftedImm!"); Inst.addOperand(MCOperand::CreateReg(RegShiftedImm.SrcReg)); + // Shift of #32 is encoded as 0 where permitted + unsigned Imm = (RegShiftedImm.ShiftImm == 32 ? 
0 : RegShiftedImm.ShiftImm); Inst.addOperand(MCOperand::CreateImm( - ARM_AM::getSORegOpc(RegShiftedImm.ShiftTy, RegShiftedImm.ShiftImm))); + ARM_AM::getSORegOpc(RegShiftedImm.ShiftTy, Imm))); } void addShifterImmOperands(MCInst &Inst, unsigned N) const { @@ -2301,7 +2308,7 @@ void ARMOperand::print(raw_ostream &OS) const { OS << "<ccout " << getReg() << ">"; break; case k_ITCondMask: { - static const char *MaskStr[] = { + static const char *const MaskStr[] = { "()", "(t)", "(e)", "(tt)", "(et)", "(te)", "(ee)", "(ttt)", "(ett)", "(tet)", "(eet)", "(tte)", "(ete)", "(tee)", "(eee)" }; @@ -2672,7 +2679,7 @@ parseITCondCode(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { const AsmToken &Tok = Parser.getTok(); if (!Tok.is(AsmToken::Identifier)) return MatchOperand_NoMatch; - unsigned CC = StringSwitch<unsigned>(Tok.getString()) + unsigned CC = StringSwitch<unsigned>(Tok.getString().lower()) .Case("eq", ARMCC::EQ) .Case("ne", ARMCC::NE) .Case("hs", ARMCC::HS) @@ -3249,10 +3256,11 @@ ARMAsmParser::OperandMatchResultTy ARMAsmParser:: parseMemBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { SMLoc S = Parser.getTok().getLoc(); const AsmToken &Tok = Parser.getTok(); - assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); + if (!Tok.is(AsmToken::Identifier)) + return MatchOperand_NoMatch; StringRef OptStr = Tok.getString(); - unsigned Opt = StringSwitch<unsigned>(OptStr.slice(0, OptStr.size())) + unsigned Opt = StringSwitch<unsigned>(OptStr.slice(0, OptStr.size()).lower()) .Case("sy", ARM_MB::SY) .Case("st", ARM_MB::ST) .Case("sh", ARM_MB::ISH) @@ -3280,7 +3288,8 @@ ARMAsmParser::OperandMatchResultTy ARMAsmParser:: parseProcIFlagsOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { SMLoc S = Parser.getTok().getLoc(); const AsmToken &Tok = Parser.getTok(); - assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); + if (!Tok.is(AsmToken::Identifier)) + return MatchOperand_NoMatch; StringRef IFlagsStr = Tok.getString(); // An iflags string of "none" is interpreted to mean that none of the AIF @@ -3320,26 +3329,51 @@ parseMSRMaskOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { // See ARMv6-M 10.1.1 std::string Name = Mask.lower(); unsigned FlagsVal = StringSwitch<unsigned>(Name) - .Case("apsr", 0) - .Case("iapsr", 1) - .Case("eapsr", 2) - .Case("xpsr", 3) - .Case("ipsr", 5) - .Case("epsr", 6) - .Case("iepsr", 7) - .Case("msp", 8) - .Case("psp", 9) - .Case("primask", 16) - .Case("basepri", 17) - .Case("basepri_max", 18) - .Case("faultmask", 19) - .Case("control", 20) + // Note: in the documentation: + // ARM deprecates using MSR APSR without a _<bits> qualifier as an alias + // for MSR APSR_nzcvq. + // but we do make it an alias here. This is so to get the "mask encoding" + // bits correct on MSR APSR writes. + // + // FIXME: Note the 0xc00 "mask encoding" bits version of the registers + // should really only be allowed when writing a special register. Note + // they get dropped in the MRS instruction reading a special register as + // the SYSm field is only 8 bits. + // + // FIXME: the _g and _nzcvqg versions are only allowed if the processor + // includes the DSP extension but that is not checked. 
+ .Case("apsr", 0x800) + .Case("apsr_nzcvq", 0x800) + .Case("apsr_g", 0x400) + .Case("apsr_nzcvqg", 0xc00) + .Case("iapsr", 0x801) + .Case("iapsr_nzcvq", 0x801) + .Case("iapsr_g", 0x401) + .Case("iapsr_nzcvqg", 0xc01) + .Case("eapsr", 0x802) + .Case("eapsr_nzcvq", 0x802) + .Case("eapsr_g", 0x402) + .Case("eapsr_nzcvqg", 0xc02) + .Case("xpsr", 0x803) + .Case("xpsr_nzcvq", 0x803) + .Case("xpsr_g", 0x403) + .Case("xpsr_nzcvqg", 0xc03) + .Case("ipsr", 0x805) + .Case("epsr", 0x806) + .Case("iepsr", 0x807) + .Case("msp", 0x808) + .Case("psp", 0x809) + .Case("primask", 0x810) + .Case("basepri", 0x811) + .Case("basepri_max", 0x812) + .Case("faultmask", 0x813) + .Case("control", 0x814) .Default(~0U); if (FlagsVal == ~0U) return MatchOperand_NoMatch; - if (!hasV7Ops() && FlagsVal >= 17 && FlagsVal <= 19) + if (!hasV7Ops() && FlagsVal >= 0x811 && FlagsVal <= 0x813) // basepri, basepri_max and faultmask only valid for V7m. return MatchOperand_NoMatch; @@ -5315,6 +5349,16 @@ validateInstruction(MCInst &Inst, "registers must be in range r0-r7"); break; } + case ARM::tADDrSP: { + // If the non-SP source operand and the destination operand are not the + // same, we need thumb2 (for the wide encoding), or we have an error. + if (!isThumbTwo() && + Inst.getOperand(0).getReg() != Inst.getOperand(2).getReg()) { + return Error(Operands[4]->getStartLoc(), + "source register must be the same as destination"); + } + break; + } } return false; @@ -6750,8 +6794,8 @@ processInstruction(MCInst &Inst, case ARM_AM::ror: newOpc = ARM::t2RORri; isNarrow = false; break; case ARM_AM::rrx: isNarrow = false; newOpc = ARM::t2RRX; break; } - unsigned Ammount = ARM_AM::getSORegOffset(Inst.getOperand(2).getImm()); - if (Ammount == 32) Ammount = 0; + unsigned Amount = ARM_AM::getSORegOffset(Inst.getOperand(2).getImm()); + if (Amount == 32) Amount = 0; TmpInst.setOpcode(newOpc); TmpInst.addOperand(Inst.getOperand(0)); // Rd if (isNarrow) @@ -6759,7 +6803,7 @@ processInstruction(MCInst &Inst, Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : 0)); TmpInst.addOperand(Inst.getOperand(1)); // Rn if (newOpc != ARM::t2RRX) - TmpInst.addOperand(MCOperand::CreateImm(Ammount)); + TmpInst.addOperand(MCOperand::CreateImm(Amount)); TmpInst.addOperand(Inst.getOperand(3)); // CondCode TmpInst.addOperand(Inst.getOperand(4)); if (!isNarrow) @@ -6809,6 +6853,9 @@ processInstruction(MCInst &Inst, // A shift by zero is a plain MOVr, not a MOVsi. unsigned Amt = Inst.getOperand(2).getImm(); unsigned Opc = Amt == 0 ? ARM::MOVr : ARM::MOVsi; + // A shift by 32 should be encoded as 0 when permitted + if (Amt == 32 && (ShiftTy == ARM_AM::lsr || ShiftTy == ARM_AM::asr)) + Amt = 0; unsigned Shifter = ARM_AM::getSORegOpc(ShiftTy, Amt); MCInst TmpInst; TmpInst.setOpcode(Opc); @@ -6985,6 +7032,16 @@ processInstruction(MCInst &Inst, Inst = TmpInst; return true; } + case ARM::tADDrSP: { + // If the non-SP source operand and the destination operand are not the + // same, we need to use the 32-bit encoding if it's available. + if (Inst.getOperand(0).getReg() != Inst.getOperand(2).getReg()) { + Inst.setOpcode(ARM::t2ADDrr); + Inst.addOperand(MCOperand::CreateReg(0)); // cc_out + return true; + } + break; + } case ARM::tB: // A Thumb conditional branch outside of an IT block is a tBcc. 
if (Inst.getOperand(1).getImm() != ARMCC::AL && !inITBlock()) { @@ -7154,7 +7211,9 @@ processInstruction(MCInst &Inst, } case ARM::MOVsi: { ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(Inst.getOperand(2).getImm()); - if (SOpc == ARM_AM::rrx) return false; + // rrx shifts and asr/lsr of #32 is encoded as 0 + if (SOpc == ARM_AM::rrx || SOpc == ARM_AM::asr || SOpc == ARM_AM::lsr) + return false; if (ARM_AM::getSORegOffset(Inst.getOperand(2).getImm()) == 0) { // Shifting by zero is accepted as a vanilla 'MOVr' MCInst TmpInst; @@ -7188,7 +7247,9 @@ processInstruction(MCInst &Inst, case ARM::ADDrsi: newOpc = ARM::ADDrr; break; } // If the shift is by zero, use the non-shifted instruction definition. - if (ARM_AM::getSORegOffset(Inst.getOperand(3).getImm()) == 0) { + // The exception is for right shifts, where 0 == 32 + if (ARM_AM::getSORegOffset(Inst.getOperand(3).getImm()) == 0 && + !(SOpc == ARM_AM::lsr || SOpc == ARM_AM::asr)) { MCInst TmpInst; TmpInst.setOpcode(newOpc); TmpInst.addOperand(Inst.getOperand(0)); @@ -7207,9 +7268,7 @@ processInstruction(MCInst &Inst, // The mask bits for all but the first condition are represented as // the low bit of the condition code value implies 't'. We currently // always have 1 implies 't', so XOR toggle the bits if the low bit - // of the condition code is zero. The encoding also expects the low - // bit of the condition to be encoded as bit 4 of the mask operand, - // so mask that in if needed + // of the condition code is zero. MCOperand &MO = Inst.getOperand(1); unsigned Mask = MO.getImm(); unsigned OrigMask = Mask; @@ -7218,8 +7277,7 @@ processInstruction(MCInst &Inst, assert(Mask && TZ <= 3 && "illegal IT mask value!"); for (unsigned i = 3; i != TZ; --i) Mask ^= 1 << i; - } else - Mask |= 0x10; + } MO.setImm(Mask); // Set up the IT block state according to the IT instruction we just @@ -7231,6 +7289,86 @@ processInstruction(MCInst &Inst, ITState.FirstCond = true; break; } + case ARM::t2LSLrr: + case ARM::t2LSRrr: + case ARM::t2ASRrr: + case ARM::t2SBCrr: + case ARM::t2RORrr: + case ARM::t2BICrr: + { + // Assemblers should use the narrow encodings of these instructions when permissible. + if ((isARMLowRegister(Inst.getOperand(1).getReg()) && + isARMLowRegister(Inst.getOperand(2).getReg())) && + Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() && + ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) || + (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) && + (!static_cast<ARMOperand*>(Operands[3])->isToken() || + !static_cast<ARMOperand*>(Operands[3])->getToken().equals_lower(".w"))) { + unsigned NewOpc; + switch (Inst.getOpcode()) { + default: llvm_unreachable("unexpected opcode"); + case ARM::t2LSLrr: NewOpc = ARM::tLSLrr; break; + case ARM::t2LSRrr: NewOpc = ARM::tLSRrr; break; + case ARM::t2ASRrr: NewOpc = ARM::tASRrr; break; + case ARM::t2SBCrr: NewOpc = ARM::tSBC; break; + case ARM::t2RORrr: NewOpc = ARM::tROR; break; + case ARM::t2BICrr: NewOpc = ARM::tBIC; break; + } + MCInst TmpInst; + TmpInst.setOpcode(NewOpc); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(5)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(Inst.getOperand(2)); + TmpInst.addOperand(Inst.getOperand(3)); + TmpInst.addOperand(Inst.getOperand(4)); + Inst = TmpInst; + return true; + } + return false; + } + case ARM::t2ANDrr: + case ARM::t2EORrr: + case ARM::t2ADCrr: + case ARM::t2ORRrr: + { + // Assemblers should use the narrow encodings of these instructions when permissible. 
+ // These instructions are special in that they are commutable, so shorter encodings + // are available more often. + if ((isARMLowRegister(Inst.getOperand(1).getReg()) && + isARMLowRegister(Inst.getOperand(2).getReg())) && + (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() || + Inst.getOperand(0).getReg() == Inst.getOperand(2).getReg()) && + ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) || + (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) && + (!static_cast<ARMOperand*>(Operands[3])->isToken() || + !static_cast<ARMOperand*>(Operands[3])->getToken().equals_lower(".w"))) { + unsigned NewOpc; + switch (Inst.getOpcode()) { + default: llvm_unreachable("unexpected opcode"); + case ARM::t2ADCrr: NewOpc = ARM::tADC; break; + case ARM::t2ANDrr: NewOpc = ARM::tAND; break; + case ARM::t2EORrr: NewOpc = ARM::tEOR; break; + case ARM::t2ORRrr: NewOpc = ARM::tORR; break; + } + MCInst TmpInst; + TmpInst.setOpcode(NewOpc); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(5)); + if (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) { + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(Inst.getOperand(2)); + } else { + TmpInst.addOperand(Inst.getOperand(2)); + TmpInst.addOperand(Inst.getOperand(1)); + } + TmpInst.addOperand(Inst.getOperand(3)); + TmpInst.addOperand(Inst.getOperand(4)); + Inst = TmpInst; + return true; + } + return false; + } } return false; } @@ -7277,6 +7415,7 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) { return Match_Success; } +static const char *getSubtargetFeatureName(unsigned Val); bool ARMAsmParser:: MatchAndEmitInstruction(SMLoc IDLoc, SmallVectorImpl<MCParsedAsmOperand*> &Operands, @@ -7317,9 +7456,21 @@ MatchAndEmitInstruction(SMLoc IDLoc, Inst.setLoc(IDLoc); Out.EmitInstruction(Inst); return false; - case Match_MissingFeature: - Error(IDLoc, "instruction requires a CPU feature not currently enabled"); - return true; + case Match_MissingFeature: { + assert(ErrorInfo && "Unknown missing feature!"); + // Special case the error message for the very common case where only + // a single subtarget feature is missing (Thumb vs. ARM, e.g.). + std::string Msg = "instruction requires:"; + unsigned Mask = 1; + for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) { + if (ErrorInfo & Mask) { + Msg += " "; + Msg += getSubtargetFeatureName(ErrorInfo & Mask); + } + Mask <<= 1; + } + return Error(IDLoc, Msg); + } case Match_InvalidOperand: { SMLoc ErrorLoc = IDLoc; if (ErrorInfo != ~0U) { @@ -7336,7 +7487,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, return Error(IDLoc, "invalid instruction", ((ARMOperand*)Operands[0])->getLocRange()); case Match_ConversionFail: - // The converter function will have already emited a diagnostic. + // The converter function will have already emitted a diagnostic. 
return true; case Match_RequiresNotITBlock: return Error(IDLoc, "flag setting instruction only valid outside IT block"); @@ -7346,6 +7497,11 @@ MatchAndEmitInstruction(SMLoc IDLoc, return Error(IDLoc, "instruction variant requires ARMv6 or later"); case Match_RequiresThumb2: return Error(IDLoc, "instruction variant requires Thumb2"); + case Match_ImmRange0_15: { + SMLoc ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getStartLoc(); + if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; + return Error(ErrorLoc, "immediate operand must be in the range [0,15]"); + } } llvm_unreachable("Implement any new match types added!"); @@ -7582,5 +7738,6 @@ extern "C" void LLVMInitializeARMAsmParser() { } #define GET_REGISTER_MATCHER +#define GET_SUBTARGET_FEATURE_NAME #define GET_MATCHER_IMPLEMENTATION #include "ARMGenAsmMatcher.inc" diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt index 9a2aab5..ac916cc 100644 --- a/lib/Target/ARM/CMakeLists.txt +++ b/lib/Target/ARM/CMakeLists.txt @@ -49,6 +49,8 @@ add_llvm_target(ARMCodeGen Thumb2SizeReduction.cpp ) +add_dependencies(LLVMARMCodeGen intrinsics_gen) + # workaround for hanging compilation on MSVC9, 10 if( MSVC_VERSION EQUAL 1600 OR MSVC_VERSION EQUAL 1500 ) set_property( diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 912935d..47cca2a 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -24,12 +24,66 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" +#include <vector> using namespace llvm; typedef MCDisassembler::DecodeStatus DecodeStatus; namespace { + // Handles the condition code status of instructions in IT blocks + class ITStatus + { + public: + // Returns the condition code for instruction in IT block + unsigned getITCC() { + unsigned CC = ARMCC::AL; + if (instrInITBlock()) + CC = ITStates.back(); + return CC; + } + + // Advances the IT block state to the next T or E + void advanceITState() { + ITStates.pop_back(); + } + + // Returns true if the current instruction is in an IT block + bool instrInITBlock() { + return !ITStates.empty(); + } + + // Returns true if current instruction is the last instruction in an IT block + bool instrLastInITBlock() { + return ITStates.size() == 1; + } + + // Called when decoding an IT instruction. Sets the IT state for the following + // instructions that for the IT block. Firstcond and Mask correspond to the + // fields in the IT instruction encoding. + void setITState(char Firstcond, char Mask) { + // (3 - the number of trailing zeros) is the number of then / else. + unsigned CondBit0 = Firstcond & 1; + unsigned NumTZ = CountTrailingZeros_32(Mask); + unsigned char CCBits = static_cast<unsigned char>(Firstcond & 0xf); + assert(NumTZ <= 3 && "Invalid IT mask!"); + // push condition codes onto the stack the correct order for the pops + for (unsigned Pos = NumTZ+1; Pos <= 3; ++Pos) { + bool T = ((Mask >> Pos) & 1) == CondBit0; + if (T) + ITStates.push_back(CCBits); + else + ITStates.push_back(CCBits ^ 1); + } + ITStates.push_back(CCBits); + } + + private: + std::vector<unsigned char> ITStates; + }; +} + +namespace { /// ARMDisassembler - ARM disassembler for all ARM platforms. class ARMDisassembler : public MCDisassembler { public: @@ -78,7 +132,7 @@ public: /// getEDInfo - See MCDisassembler. 
const EDInstInfo *getEDInfo() const; private: - mutable std::vector<unsigned> ITBlock; + mutable ITStatus ITBlock; DecodeStatus AddThumbPredicate(MCInst&) const; void UpdateThumbVFPPredicate(MCInst&) const; }; @@ -549,7 +603,7 @@ static bool tryAddingSymbolicOperand(uint64_t Address, int32_t Value, /// These can often be values in a literal pool near the Address of the /// instruction. The Address of the instruction and its immediate Value are /// used as a possible literal pool entry. The SymbolLookUp call back will -/// return the name of a symbol referenced by the the literal pool's entry if +/// return the name of a symbol referenced by the literal pool's entry if /// the referenced address is that of a symbol. Or it will return a pointer to /// a literal 'C' string if the referenced address of the literal pool's entry /// is an address into a section with 'C' string literals. @@ -612,7 +666,7 @@ ThumbDisassembler::AddThumbPredicate(MCInst &MI) const { case ARM::tSETEND: // Some instructions (mostly conditional branches) are not // allowed in IT blocks. - if (!ITBlock.empty()) + if (ITBlock.instrInITBlock()) S = SoftFail; else return Success; @@ -623,7 +677,7 @@ ThumbDisassembler::AddThumbPredicate(MCInst &MI) const { case ARM::t2TBH: // Some instructions (mostly unconditional branches) can // only appears at the end of, or outside of, an IT. - if (ITBlock.size() > 1) + if (ITBlock.instrInITBlock() && !ITBlock.instrLastInITBlock()) S = SoftFail; break; default: @@ -633,13 +687,11 @@ ThumbDisassembler::AddThumbPredicate(MCInst &MI) const { // If we're in an IT block, base the predicate on that. Otherwise, // assume a predicate of AL. unsigned CC; - if (!ITBlock.empty()) { - CC = ITBlock.back(); - if (CC == 0xF) - CC = ARMCC::AL; - ITBlock.pop_back(); - } else + CC = ITBlock.getITCC(); + if (CC == 0xF) CC = ARMCC::AL; + if (ITBlock.instrInITBlock()) + ITBlock.advanceITState(); const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo; unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands; @@ -674,11 +726,9 @@ ThumbDisassembler::AddThumbPredicate(MCInst &MI) const { // context as a post-pass. void ThumbDisassembler::UpdateThumbVFPPredicate(MCInst &MI) const { unsigned CC; - if (!ITBlock.empty()) { - CC = ITBlock.back(); - ITBlock.pop_back(); - } else - CC = ARMCC::AL; + CC = ITBlock.getITCC(); + if (ITBlock.instrInITBlock()) + ITBlock.advanceITState(); const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo; MCInst::iterator I = MI.begin(); @@ -726,7 +776,7 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, result = decodeThumbSBitInstruction16(MI, insn16, Address, this, STI); if (result) { Size = 2; - bool InITBlock = !ITBlock.empty(); + bool InITBlock = ITBlock.instrInITBlock(); Check(result, AddThumbPredicate(MI)); AddThumb1SBit(MI, InITBlock); return result; @@ -739,7 +789,7 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // Nested IT blocks are UNPREDICTABLE. Must be checked before we add // the Thumb predicate. - if (MI.getOpcode() == ARM::t2IT && !ITBlock.empty()) + if (MI.getOpcode() == ARM::t2IT && ITBlock.instrInITBlock()) result = MCDisassembler::SoftFail; Check(result, AddThumbPredicate(MI)); @@ -749,21 +799,9 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // to the subsequent instructions. if (MI.getOpcode() == ARM::t2IT) { - // (3 - the number of trailing zeros) is the number of then / else. 
- unsigned firstcond = MI.getOperand(0).getImm(); + unsigned Firstcond = MI.getOperand(0).getImm(); unsigned Mask = MI.getOperand(1).getImm(); - unsigned CondBit0 = Mask >> 4 & 1; - unsigned NumTZ = CountTrailingZeros_32(Mask); - assert(NumTZ <= 3 && "Invalid IT mask!"); - for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) { - bool T = ((Mask >> Pos) & 1) == CondBit0; - if (T) - ITBlock.insert(ITBlock.begin(), firstcond); - else - ITBlock.insert(ITBlock.begin(), firstcond ^ 1); - } - - ITBlock.push_back(firstcond); + ITBlock.setITState(Firstcond, Mask); } return result; @@ -783,7 +821,7 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, result = decodeThumbInstruction32(MI, insn32, Address, this, STI); if (result != MCDisassembler::Fail) { Size = 4; - bool InITBlock = ITBlock.size(); + bool InITBlock = ITBlock.instrInITBlock(); Check(result, AddThumbPredicate(MI)); AddThumb1SBit(MI, InITBlock); return result; @@ -1186,8 +1224,8 @@ static DecodeStatus DecodeSPRRegListOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; - unsigned Vd = fieldFromInstruction32(Val, 8, 4); - unsigned regs = Val & 0xFF; + unsigned Vd = fieldFromInstruction32(Val, 8, 5); + unsigned regs = fieldFromInstruction32(Val, 0, 8); if (!Check(S, DecodeSPRRegisterClass(Inst, Vd, Address, Decoder))) return MCDisassembler::Fail; @@ -1203,8 +1241,10 @@ static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; - unsigned Vd = fieldFromInstruction32(Val, 8, 4); - unsigned regs = (Val & 0xFF) / 2; + unsigned Vd = fieldFromInstruction32(Val, 8, 5); + unsigned regs = fieldFromInstruction32(Val, 0, 8); + + regs = regs >> 1; if (!Check(S, DecodeDPRRegisterClass(Inst, Vd, Address, Decoder))) return MCDisassembler::Fail; @@ -2976,7 +3016,7 @@ static DecodeStatus DecodeThumbBROperand(MCInst &Inst, unsigned Val, static DecodeStatus DecodeT2BROperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { - if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<22>(Val<<1) + 4, + if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<21>(Val) + 4, true, 4, Inst, Decoder)) Inst.addOperand(MCOperand::CreateImm(SignExtend32<21>(Val))); return MCDisassembler::Success; @@ -3258,9 +3298,9 @@ static DecodeStatus DecodeThumbAddSPReg(MCInst &Inst, uint16_t Insn, if (!Check(S, DecodeGPRRegisterClass(Inst, Rdm, Address, Decoder))) return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateReg(ARM::SP)); if (!Check(S, DecodeGPRRegisterClass(Inst, Rdm, Address, Decoder))) return MCDisassembler::Fail; - Inst.addOperand(MCOperand::CreateReg(ARM::SP)); } else if (Inst.getOpcode() == ARM::tADDspr) { unsigned Rm = fieldFromInstruction16(Insn, 3, 4); @@ -3299,10 +3339,25 @@ static DecodeStatus DecodePostIdxReg(MCInst &Inst, unsigned Insn, static DecodeStatus DecodeThumbBLXOffset(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { + // Val is passed in as S:J1:J2:imm10H:imm10L:'0' + // Note only one trailing zero not two. Also the J1 and J2 values are from + // the encoded instruction. 
So here change to I1 and I2 values via: + // I1 = NOT(J1 EOR S); + // I2 = NOT(J2 EOR S); + // and build the imm32 with two trailing zeros as documented: + // imm32 = SignExtend(S:I1:I2:imm10H:imm10L:'00', 32); + unsigned S = (Val >> 23) & 1; + unsigned J1 = (Val >> 22) & 1; + unsigned J2 = (Val >> 21) & 1; + unsigned I1 = !(J1 ^ S); + unsigned I2 = !(J2 ^ S); + unsigned tmp = (Val & ~0x600000) | (I1 << 22) | (I2 << 21); + int imm32 = SignExtend32<25>(tmp << 1); + if (!tryAddingSymbolicOperand(Address, - (Address & ~2u) + SignExtend32<22>(Val << 1) + 4, + (Address & ~2u) + imm32 + 4, true, 4, Inst, Decoder)) - Inst.addOperand(MCOperand::CreateImm(SignExtend32<22>(Val << 1))); + Inst.addOperand(MCOperand::CreateImm(imm32)); return MCDisassembler::Success; } @@ -3408,17 +3463,32 @@ static DecodeStatus DecodeT2SOImm(MCInst &Inst, unsigned Val, static DecodeStatus DecodeThumbBCCTargetOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder){ - if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<8>(Val<<1) + 4, + if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<9>(Val<<1) + 4, true, 2, Inst, Decoder)) - Inst.addOperand(MCOperand::CreateImm(SignExtend32<8>(Val << 1))); + Inst.addOperand(MCOperand::CreateImm(SignExtend32<9>(Val << 1))); return MCDisassembler::Success; } static DecodeStatus DecodeThumbBLTargetOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder){ - if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<22>(Val<<1) + 4, + // Val is passed in as S:J1:J2:imm10:imm11 + // Note no trailing zero after imm11. Also the J1 and J2 values are from + // the encoded instruction. So here change to I1 and I2 values via: + // I1 = NOT(J1 EOR S); + // I2 = NOT(J2 EOR S); + // and build the imm32 with one trailing zero as documented: + // imm32 = SignExtend(S:I1:I2:imm10:imm11:'0', 32); + unsigned S = (Val >> 23) & 1; + unsigned J1 = (Val >> 22) & 1; + unsigned J2 = (Val >> 21) & 1; + unsigned I1 = !(J1 ^ S); + unsigned I2 = !(J2 ^ S); + unsigned tmp = (Val & ~0x600000) | (I1 << 22) | (I2 << 21); + int imm32 = SignExtend32<25>(tmp << 1); + + if (!tryAddingSymbolicOperand(Address, Address + imm32 + 4, true, 4, Inst, Decoder)) - Inst.addOperand(MCOperand::CreateImm(SignExtend32<22>(Val << 1))); + Inst.addOperand(MCOperand::CreateImm(imm32)); return MCDisassembler::Success; } @@ -4128,9 +4198,9 @@ static DecodeStatus DecodeVMOVSRR(MCInst &Inst, unsigned Insn, DecodeStatus S = MCDisassembler::Success; unsigned Rt = fieldFromInstruction32(Insn, 12, 4); unsigned Rt2 = fieldFromInstruction32(Insn, 16, 4); - unsigned Rm = fieldFromInstruction32(Insn, 0, 4); + unsigned Rm = fieldFromInstruction32(Insn, 5, 1); unsigned pred = fieldFromInstruction32(Insn, 28, 4); - Rm |= fieldFromInstruction32(Insn, 5, 1) << 4; + Rm |= fieldFromInstruction32(Insn, 0, 4) << 1; if (Rt == 0xF || Rt2 == 0xF || Rm == 0x1F) S = MCDisassembler::SoftFail; @@ -4154,9 +4224,9 @@ static DecodeStatus DecodeVMOVRRS(MCInst &Inst, unsigned Insn, DecodeStatus S = MCDisassembler::Success; unsigned Rt = fieldFromInstruction32(Insn, 12, 4); unsigned Rt2 = fieldFromInstruction32(Insn, 16, 4); - unsigned Rm = fieldFromInstruction32(Insn, 0, 4); + unsigned Rm = fieldFromInstruction32(Insn, 5, 1); unsigned pred = fieldFromInstruction32(Insn, 28, 4); - Rm |= fieldFromInstruction32(Insn, 5, 1) << 4; + Rm |= fieldFromInstruction32(Insn, 0, 4) << 1; if (Rt == 0xF || Rt2 == 0xF || Rm == 0x1F) S = MCDisassembler::SoftFail; @@ -4179,19 +4249,14 @@ static DecodeStatus DecodeIT(MCInst 
&Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned pred = fieldFromInstruction16(Insn, 4, 4); - // The InstPrinter needs to have the low bit of the predicate in - // the mask operand to be able to print it properly. - unsigned mask = fieldFromInstruction16(Insn, 0, 5); + unsigned mask = fieldFromInstruction16(Insn, 0, 4); if (pred == 0xF) { pred = 0xE; S = MCDisassembler::SoftFail; } - if ((mask & 0xF) == 0) { - // Preserve the high bit of the mask, which is the low bit of - // the predicate. - mask &= 0x10; + if (mask == 0x0) { mask |= 0x8; S = MCDisassembler::SoftFail; } diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp index cbd81c1..2f6b1b0 100644 --- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp +++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp @@ -52,6 +52,27 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) { unsigned Opcode = MI->getOpcode(); + // Check for HINT instructions w/ canonical names. + if (Opcode == ARM::HINT || Opcode == ARM::t2HINT) { + switch (MI->getOperand(0).getImm()) { + case 0: O << "\tnop"; break; + case 1: O << "\tyield"; break; + case 2: O << "\twfe"; break; + case 3: O << "\twfi"; break; + case 4: O << "\tsev"; break; + default: + // Anything else should just print normally. + printInstruction(MI, O); + printAnnotation(O, Annot); + return; + } + printPredicateOperand(MI, 1, O); + if (Opcode == ARM::t2HINT) + O << ".w"; + printAnnotation(O, Annot); + return; + } + // Check for MOVs and print canonical forms, instead. if (Opcode == ARM::MOVsr) { // FIXME: Thumb variants? @@ -426,9 +447,13 @@ void ARMInstPrinter::printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op, return; } - if (unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm())) + //If the op is sub we have to print the immediate even if it is 0 + unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm()); + ARM_AM::AddrOpc op = ARM_AM::getAM3Op(MO3.getImm()); + + if (ImmOffs || (op == ARM_AM::sub)) O << ", #" - << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO3.getImm())) + << ARM_AM::getAddrOpcStr(op) << ImmOffs; O << ']'; } @@ -643,22 +668,50 @@ void ARMInstPrinter::printMSRMaskOperand(const MCInst *MI, unsigned OpNum, unsigned Mask = Op.getImm() & 0xf; if (getAvailableFeatures() & ARM::FeatureMClass) { - switch (Op.getImm()) { + unsigned SYSm = Op.getImm(); + unsigned Opcode = MI->getOpcode(); + // For reads of the special registers ignore the "mask encoding" bits + // which are only for writes. 
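// Note: with the parser change above, these immediates are mask:SYSm
// values: bits 11-10 hold the MSR "mask encoding" (0b10 = _nzcvq,
// 0b01 = _g, 0b11 = _nzcvqg) and bits 7-0 hold SYSm. For example,
// "msr apsr_g, r0" parses to 0x400 (mask 0b01, SYSm 0) and
// "msr basepri, r0" to 0x811 (mask 0b10, SYSm 0x11). MRS encodes only
// the 8-bit SYSm field, which is why reads are masked down to it here.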
+ if (Opcode == ARM::t2MRS_M) + SYSm &= 0xff; + switch (SYSm) { default: llvm_unreachable("Unexpected mask value!"); - case 0: O << "apsr"; return; - case 1: O << "iapsr"; return; - case 2: O << "eapsr"; return; - case 3: O << "xpsr"; return; - case 5: O << "ipsr"; return; - case 6: O << "epsr"; return; - case 7: O << "iepsr"; return; - case 8: O << "msp"; return; - case 9: O << "psp"; return; - case 16: O << "primask"; return; - case 17: O << "basepri"; return; - case 18: O << "basepri_max"; return; - case 19: O << "faultmask"; return; - case 20: O << "control"; return; + case 0: + case 0x800: O << "apsr"; return; // with _nzcvq bits is an alias for aspr + case 0x400: O << "apsr_g"; return; + case 0xc00: O << "apsr_nzcvqg"; return; + case 1: + case 0x801: O << "iapsr"; return; // with _nzcvq bits is an alias for iapsr + case 0x401: O << "iapsr_g"; return; + case 0xc01: O << "iapsr_nzcvqg"; return; + case 2: + case 0x802: O << "eapsr"; return; // with _nzcvq bits is an alias for eapsr + case 0x402: O << "eapsr_g"; return; + case 0xc02: O << "eapsr_nzcvqg"; return; + case 3: + case 0x803: O << "xpsr"; return; // with _nzcvq bits is an alias for xpsr + case 0x403: O << "xpsr_g"; return; + case 0xc03: O << "xpsr_nzcvqg"; return; + case 5: + case 0x805: O << "ipsr"; return; + case 6: + case 0x806: O << "epsr"; return; + case 7: + case 0x807: O << "iepsr"; return; + case 8: + case 0x808: O << "msp"; return; + case 9: + case 0x809: O << "psp"; return; + case 0x10: + case 0x810: O << "primask"; return; + case 0x11: + case 0x811: O << "basepri"; return; + case 0x12: + case 0x812: O << "basepri_max"; return; + case 0x13: + case 0x813: O << "faultmask"; return; + case 0x14: + case 0x814: O << "control"; return; } } @@ -754,7 +807,8 @@ void ARMInstPrinter::printThumbITMask(const MCInst *MI, unsigned OpNum, raw_ostream &O) { // (3 - the number of trailing zeros) is the number of then / else. unsigned Mask = MI->getOperand(OpNum).getImm(); - unsigned CondBit0 = Mask >> 4 & 1; + unsigned Firstcond = MI->getOperand(OpNum-1).getImm(); + unsigned CondBit0 = Firstcond & 1; unsigned NumTZ = CountTrailingZeros_32(Mask); assert(NumTZ <= 3 && "Invalid IT mask!"); for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) { diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index d10bfc1..ac6ce64 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -12,6 +12,7 @@ #include "MCTargetDesc/ARMFixupKinds.h" #include "MCTargetDesc/ARMAddressingModes.h" #include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCDirectives.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCExpr.h" @@ -84,7 +85,8 @@ public: { "fixup_arm_thumb_bl", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, { "fixup_arm_thumb_blx", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, { "fixup_arm_thumb_cb", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, -{ "fixup_arm_thumb_cp", 0, 8, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_arm_thumb_cp", 0, 8, MCFixupKindInfo::FKF_IsPCRel | + MCFixupKindInfo::FKF_IsAlignedDownTo32Bits}, { "fixup_arm_thumb_bcc", 0, 8, MCFixupKindInfo::FKF_IsPCRel }, // movw / movt: 16-bits immediate but scattered into two chunks 0 - 12, 16 - 19. 
{ "fixup_arm_movt_hi16", 0, 20, 0 }, @@ -110,32 +112,7 @@ public: void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFixup &Fixup, const MCFragment *DF, MCValue &Target, uint64_t &Value, - bool &IsResolved) { - const MCSymbolRefExpr *A = Target.getSymA(); - // Some fixups to thumb function symbols need the low bit (thumb bit) - // twiddled. - if ((unsigned)Fixup.getKind() != ARM::fixup_arm_ldst_pcrel_12 && - (unsigned)Fixup.getKind() != ARM::fixup_t2_ldst_pcrel_12 && - (unsigned)Fixup.getKind() != ARM::fixup_arm_adr_pcrel_12 && - (unsigned)Fixup.getKind() != ARM::fixup_thumb_adr_pcrel_10 && - (unsigned)Fixup.getKind() != ARM::fixup_t2_adr_pcrel_12 && - (unsigned)Fixup.getKind() != ARM::fixup_arm_thumb_cp) { - if (A) { - const MCSymbol &Sym = A->getSymbol().AliasedSymbol(); - if (Asm.isThumbFunc(&Sym)) - Value |= 1; - } - } - // We must always generate a relocation for BL/BLX instructions if we have - // a symbol to reference, as the linker relies on knowing the destination - // symbol's thumb-ness to get interworking right. - if (A && ((unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_blx || - (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl || - (unsigned)Fixup.getKind() == ARM::fixup_arm_blx || - (unsigned)Fixup.getKind() == ARM::fixup_arm_uncondbl || - (unsigned)Fixup.getKind() == ARM::fixup_arm_condbl)) - IsResolved = false; - } + bool &IsResolved); bool mayNeedRelaxation(const MCInst &Inst) const; @@ -269,7 +246,9 @@ bool ARMAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { return true; } -static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { +static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, + MCContext *Ctx = NULL) { + unsigned Kind = Fixup.getKind(); switch (Kind) { default: llvm_unreachable("Unknown fixup kind!"); @@ -322,7 +301,8 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { Value = -Value; isAdd = false; } - assert ((Value < 4096) && "Out of range pc-relative fixup value!"); + if (Ctx && Value >= 4096) + Ctx->FatalError(Fixup.getLoc(), "out of range pc-relative fixup value"); Value |= isAdd << 23; // Same addressing mode as fixup_arm_pcrel_10, @@ -345,8 +325,8 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { Value = -Value; opc = 2; // 0b0010 } - assert(ARM_AM::getSOImmVal(Value) != -1 && - "Out of range pc-relative fixup value!"); + if (Ctx && ARM_AM::getSOImmVal(Value) == -1) + Ctx->FatalError(Fixup.getLoc(), "out of range pc-relative fixup value"); // Encode the immediate and shift the opcode into place. return ARM_AM::getSOImmVal(Value) | (opc << 21); } @@ -414,39 +394,65 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { return swapped; } case ARM::fixup_arm_thumb_bl: { - // The value doesn't encode the low bit (always zero) and is offset by - // four. The value is encoded into disjoint bit positions in the destination - // opcode. x = unchanged, I = immediate value bit, S = sign extension bit - // - // BL: xxxxxSIIIIIIIIII xxxxxIIIIIIIIIII - // - // Note that the halfwords are stored high first, low second; so we need - // to transpose the fixup value here to map properly. - unsigned isNeg = (int64_t(Value - 4) < 0) ? 1 : 0; - uint32_t Binary = 0; - Value = 0x3fffff & ((Value - 4) >> 1); - Binary = (Value & 0x7ff) << 16; // Low imm11 value. - Binary |= (Value & 0x1ffc00) >> 11; // High imm10 value. - Binary |= isNeg << 10; // Sign bit. 
- return Binary; + // The value doesn't encode the low bit (always zero) and is offset by + // four. The 32-bit immediate value is encoded as + // imm32 = SignExtend(S:I1:I2:imm10:imm11:0) + // where I1 = NOT(J1 ^ S) and I2 = NOT(J2 ^ S). + // The value is encoded into disjoint bit positions in the destination + // opcode. x = unchanged, I = immediate value bit, S = sign extension bit, + // J = either J1 or J2 bit + // + // BL: xxxxxSIIIIIIIIII xxJxJIIIIIIIIIII + // + // Note that the halfwords are stored high first, low second; so we need + // to transpose the fixup value here to map properly. + uint32_t offset = (Value - 4) >> 1; + uint32_t signBit = (offset & 0x800000) >> 23; + uint32_t I1Bit = (offset & 0x400000) >> 22; + uint32_t J1Bit = (I1Bit ^ 0x1) ^ signBit; + uint32_t I2Bit = (offset & 0x200000) >> 21; + uint32_t J2Bit = (I2Bit ^ 0x1) ^ signBit; + uint32_t imm10Bits = (offset & 0x1FF800) >> 11; + uint32_t imm11Bits = (offset & 0x000007FF); + + uint32_t Binary = 0; + uint32_t firstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10Bits); + uint32_t secondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) | + (uint16_t)imm11Bits); + Binary |= secondHalf << 16; + Binary |= firstHalf; + return Binary; + } case ARM::fixup_arm_thumb_blx: { - // The value doesn't encode the low two bits (always zero) and is offset by - // four (see fixup_arm_thumb_cp). The value is encoded into disjoint bit - // positions in the destination opcode. x = unchanged, I = immediate value - // bit, S = sign extension bit, 0 = zero. - // - // BLX: xxxxxSIIIIIIIIII xxxxxIIIIIIIIII0 - // - // Note that the halfwords are stored high first, low second; so we need - // to transpose the fixup value here to map properly. - unsigned isNeg = (int64_t(Value-4) < 0) ? 1 : 0; - uint32_t Binary = 0; - Value = 0xfffff & ((Value - 2) >> 2); - Binary = (Value & 0x3ff) << 17; // Low imm10L value. - Binary |= (Value & 0xffc00) >> 10; // High imm10H value. - Binary |= isNeg << 10; // Sign bit. - return Binary; + // The value doesn't encode the low two bits (always zero) and is offset by + // four (see fixup_arm_thumb_cp). The 32-bit immediate value is encoded as + // imm32 = SignExtend(S:I1:I2:imm10H:imm10L:00) + // where I1 = NOT(J1 ^ S) and I2 = NOT(J2 ^ S). + // The value is encoded into disjoint bit positions in the destination + // opcode. x = unchanged, I = immediate value bit, S = sign extension bit, + // J = either J1 or J2 bit, 0 = zero. + // + // BLX: xxxxxSIIIIIIIIII xxJxJIIIIIIIIII0 + // + // Note that the halfwords are stored high first, low second; so we need + // to transpose the fixup value here to map properly. + uint32_t offset = (Value - 2) >> 2; + uint32_t signBit = (offset & 0x400000) >> 22; + uint32_t I1Bit = (offset & 0x200000) >> 21; + uint32_t J1Bit = (I1Bit ^ 0x1) ^ signBit; + uint32_t I2Bit = (offset & 0x100000) >> 20; + uint32_t J2Bit = (I2Bit ^ 0x1) ^ signBit; + uint32_t imm10HBits = (offset & 0xFFC00) >> 10; + uint32_t imm10LBits = (offset & 0x3FF); + + uint32_t Binary = 0; + uint32_t firstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10HBits); + uint32_t secondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) | + ((uint16_t)imm10LBits) << 1); + Binary |= secondHalf << 16; + Binary |= firstHalf; + return Binary; } case ARM::fixup_arm_thumb_cp: // Offset by 4, and don't encode the low two bits. 
Two bytes of that @@ -473,7 +479,8 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { isAdd = false; } // The value has the low 4 bits encoded in [3:0] and the high 4 in [11:8]. - assert ((Value < 256) && "Out of range pc-relative fixup value!"); + if (Ctx && Value >= 256) + Ctx->FatalError(Fixup.getLoc(), "out of range pc-relative fixup value"); Value = (Value & 0xf) | ((Value & 0xf0) << 4); return Value | (isAdd << 23); } @@ -491,7 +498,8 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { } // These values don't encode the low two bits since they're always zero. Value >>= 2; - assert ((Value < 256) && "Out of range pc-relative fixup value!"); + if (Ctx && Value >= 256) + Ctx->FatalError(Fixup.getLoc(), "out of range pc-relative fixup value"); Value |= isAdd << 23; // Same addressing mode as fixup_arm_pcrel_10, but with 16-bit halfwords @@ -507,6 +515,43 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { } } +void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFixup &Fixup, + const MCFragment *DF, + MCValue &Target, uint64_t &Value, + bool &IsResolved) { + const MCSymbolRefExpr *A = Target.getSymA(); + // Some fixups to thumb function symbols need the low bit (thumb bit) + // twiddled. + if ((unsigned)Fixup.getKind() != ARM::fixup_arm_ldst_pcrel_12 && + (unsigned)Fixup.getKind() != ARM::fixup_t2_ldst_pcrel_12 && + (unsigned)Fixup.getKind() != ARM::fixup_arm_adr_pcrel_12 && + (unsigned)Fixup.getKind() != ARM::fixup_thumb_adr_pcrel_10 && + (unsigned)Fixup.getKind() != ARM::fixup_t2_adr_pcrel_12 && + (unsigned)Fixup.getKind() != ARM::fixup_arm_thumb_cp) { + if (A) { + const MCSymbol &Sym = A->getSymbol().AliasedSymbol(); + if (Asm.isThumbFunc(&Sym)) + Value |= 1; + } + } + // We must always generate a relocation for BL/BLX instructions if we have + // a symbol to reference, as the linker relies on knowing the destination + // symbol's thumb-ness to get interworking right. + if (A && ((unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_blx || + (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl || + (unsigned)Fixup.getKind() == ARM::fixup_arm_blx || + (unsigned)Fixup.getKind() == ARM::fixup_arm_uncondbl || + (unsigned)Fixup.getKind() == ARM::fixup_arm_condbl)) + IsResolved = false; + + // Try to get the encoded value for the fixup as-if we're mapping it into + // the instruction. This allows adjustFixupValue() to issue a diagnostic + // if the value aren't invalid. + (void)adjustFixupValue(Fixup, Value, &Asm.getContext()); +} + namespace { // FIXME: This should be in a separate file. @@ -530,7 +575,7 @@ public: void ELFARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value) const { unsigned NumBytes = 4; // FIXME: 2 for Thumb - Value = adjustFixupValue(Fixup.getKind(), Value); + Value = adjustFixupValue(Fixup, Value); if (!Value) return; // Doesn't change encoding. unsigned Offset = Fixup.getOffset(); @@ -615,7 +660,7 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { void DarwinARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value) const { unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); - Value = adjustFixupValue(Fixup.getKind(), Value); + Value = adjustFixupValue(Fixup, Value); if (!Value) return; // Doesn't change encoding. 
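The S/I1/I2 to J1/J2 transposition implemented above in adjustFixupValue() for the Thumb BL/BLX fixups is the exact inverse of the reconstruction added to DecodeThumbBLTargetOperand() earlier in this patch. Below is a minimal standalone sketch (illustrative names, not part of the patch) that round-trips a PC-adjusted byte offset through both directions; it also shows why DecodeThumbBCCTargetOperand() now needs SignExtend32<9> rather than SignExtend32<8> once the 8-bit field has been shifted left by one.

  #include <cassert>
  #include <cstdint>

  // Mirrors llvm::SignExtend32<B>: sign-extend the low B bits of x.
  template <unsigned B> int32_t signExtend32(uint32_t x) {
    return int32_t(x << (32 - B)) >> (32 - B);
  }

  // Encode an even, PC-adjusted byte offset into the two transposed
  // halfwords, following the fixup_arm_thumb_bl case above.
  uint32_t encodeThumbBL(int32_t imm32) {
    uint32_t offset = uint32_t(imm32) >> 1;
    uint32_t S  = (offset >> 23) & 1;
    uint32_t I1 = (offset >> 22) & 1, I2 = (offset >> 21) & 1;
    uint32_t J1 = (I1 ^ 1) ^ S, J2 = (I2 ^ 1) ^ S;  // J = NOT(I) EOR S
    uint32_t firstHalf  = (S << 10) | ((offset >> 11) & 0x3ff);       // S:imm10
    uint32_t secondHalf = (J1 << 13) | (J2 << 11) | (offset & 0x7ff); // J1:J2:imm11
    return (secondHalf << 16) | firstHalf;  // halfwords stored high first
  }

  // Rebuild S:J1:J2:imm10:imm11 from the halfwords and decode it the way
  // DecodeThumbBLTargetOperand() does: I = NOT(J EOR S).
  int32_t decodeThumbBL(uint32_t Binary) {
    uint32_t firstHalf = Binary & 0xffff, secondHalf = Binary >> 16;
    uint32_t Val = ((firstHalf  & 0x400)  << 13)   // S     -> bit 23
                 | ((secondHalf & 0x2000) << 9)    // J1    -> bit 22
                 | ((secondHalf & 0x800)  << 10)   // J2    -> bit 21
                 | ((firstHalf  & 0x3ff)  << 11)   // imm10 -> bits 20:11
                 |  (secondHalf & 0x7ff);          // imm11 -> bits 10:0
    unsigned S  = (Val >> 23) & 1;
    unsigned I1 = !(((Val >> 22) & 1) ^ S);
    unsigned I2 = !(((Val >> 21) & 1) ^ S);
    uint32_t tmp = (Val & ~0x600000u) | (I1 << 22) | (I2 << 21);
    return signExtend32<25>(tmp << 1);
  }

  int main() {
    const int32_t offsets[] = { 0, 2, -8, 1000000, -1000000 };
    for (unsigned i = 0; i != 5; ++i)
      assert(decodeThumbBL(encodeThumbBL(offsets[i])) == offsets[i]);
    // An 8-bit field shifted left by one occupies 9 significant bits, so
    // SignExtend32<8> would mis-decode the most negative conditional branch:
    assert(signExtend32<9>(0x80 << 1) == -256);
    assert(signExtend32<8>(0x80 << 1) == 0);  // the old, wrong result
    return 0;
  }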
unsigned Offset = Fixup.getOffset(); diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index aa649ba..7d6acbc 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -178,9 +178,8 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, break; } break; - case ARM::fixup_arm_uncondbl: case ARM::fixup_arm_blx: - case ARM::fixup_arm_uncondbranch: + case ARM::fixup_arm_uncondbl: switch (Modifier) { case MCSymbolRefExpr::VK_ARM_PLT: Type = ELF::R_ARM_PLT32; @@ -192,6 +191,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, break; case ARM::fixup_arm_condbl: case ARM::fixup_arm_condbranch: + case ARM::fixup_arm_uncondbranch: Type = ELF::R_ARM_JUMP24; break; case ARM::fixup_arm_movt_hi16: @@ -252,10 +252,8 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, case ARM::fixup_arm_thumb_cp: case ARM::fixup_arm_thumb_br: llvm_unreachable("Unimplemented"); - case ARM::fixup_arm_uncondbranch: - Type = ELF::R_ARM_CALL; - break; case ARM::fixup_arm_condbranch: + case ARM::fixup_arm_uncondbranch: Type = ELF::R_ARM_JUMP24; break; case ARM::fixup_arm_movt_hi16: diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp index 03e8d5f..d32805e 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp @@ -22,40 +22,14 @@ EnableARMEHABI("arm-enable-ehabi", cl::Hidden, cl::init(false)); -static const char *const arm_asm_table[] = { - "{r0}", "r0", - "{r1}", "r1", - "{r2}", "r2", - "{r3}", "r3", - "{r4}", "r4", - "{r5}", "r5", - "{r6}", "r6", - "{r7}", "r7", - "{r8}", "r8", - "{r9}", "r9", - "{r10}", "r10", - "{r11}", "r11", - "{r12}", "r12", - "{r13}", "r13", - "{r14}", "r14", - "{lr}", "lr", - "{sp}", "sp", - "{ip}", "ip", - "{fp}", "fp", - "{sl}", "sl", - "{memory}", "memory", - "{cc}", "cc", - 0,0 -}; - void ARMMCAsmInfoDarwin::anchor() { } ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin() { - AsmTransCBE = arm_asm_table; Data64bitsDirective = 0; CommentString = "@"; Code16Directive = ".code\t16"; Code32Directive = ".code\t32"; + UseDataRegionDirectives = true; SupportsDebugInformation = true; diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp index 10d1c48..1964bcd 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp @@ -336,6 +336,7 @@ public: } // end anonymous namespace MCCodeEmitter *llvm::createARMMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI, MCContext &Ctx) { return new ARMMCCodeEmitter(MCII, STI, Ctx); @@ -861,11 +862,11 @@ ARMMCCodeEmitter::getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx, // Handle :upper16: and :lower16: assembly prefixes. 
const MCExpr *E = MO.getExpr(); + MCFixupKind Kind; if (E->getKind() == MCExpr::Target) { const ARMMCExpr *ARM16Expr = cast<ARMMCExpr>(E); E = ARM16Expr->getSubExpr(); - MCFixupKind Kind; switch (ARM16Expr->getKind()) { default: llvm_unreachable("Unsupported ARMFixup"); case ARMMCExpr::VK_ARM_HI16: @@ -891,9 +892,21 @@ ARMMCCodeEmitter::getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx, } Fixups.push_back(MCFixup::Create(0, E, Kind, MI.getLoc())); return 0; - }; - - llvm_unreachable("Unsupported MCExpr type in MCOperand!"); + } + // If the expression doesn't have :upper16: or :lower16: on it, + // it's just a plain immediate expression, and those evaluate to + // the lower 16 bits of the expression regardless of whether + // we have a movt or a movw. + if (!isTargetDarwin() && EvaluateAsPCRel(E)) + Kind = MCFixupKind(isThumb2() + ? ARM::fixup_t2_movw_lo16_pcrel + : ARM::fixup_arm_movw_lo16_pcrel); + else + Kind = MCFixupKind(isThumb2() + ? ARM::fixup_t2_movw_lo16 + : ARM::fixup_arm_movw_lo16); + Fixups.push_back(MCFixup::Create(0, E, Kind, MI.getLoc())); + return 0; } uint32_t ARMMCCodeEmitter:: @@ -1192,8 +1205,7 @@ getSORegImmOpValue(const MCInst &MI, unsigned OpIdx, // Encode shift_imm bit[11:7]. Binary |= SBits << 4; unsigned Offset = ARM_AM::getSORegOffset(MO1.getImm()); - assert(Offset && "Offset must be in range 1-32!"); - if (Offset == 32) Offset = 0; + assert(Offset < 32 && "Offset must be in range 0-31!"); return Binary | (Offset << 7); } diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp index e3512cd..5df84c8 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -35,7 +35,7 @@ using namespace llvm; -std::string ARM_MC::ParseARMTriple(StringRef TT) { +std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) { // Set the boolean corresponding to the current target triple, or the default // if one cannot be determined, to true. unsigned Len = TT.size(); @@ -51,27 +51,48 @@ std::string ARM_MC::ParseARMTriple(StringRef TT) { Idx = 6; } + bool NoCPU = CPU == "generic" || CPU.empty(); std::string ARMArchFeature; if (Idx) { unsigned SubVer = TT[Idx]; if (SubVer >= '7' && SubVer <= '9') { if (Len >= Idx+2 && TT[Idx+1] == 'm') { - // v7m: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureMClass - ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+mclass"; + if (NoCPU) + // v7m: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureMClass + ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+mclass"; + else + // Use CPU to figure out the exact features. + ARMArchFeature = "+v7"; } else if (Len >= Idx+3 && TT[Idx+1] == 'e'&& TT[Idx+2] == 'm') { - // v7em: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureDSPThumb2, - // FeatureT2XtPk, FeatureMClass - ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+t2dsp,t2xtpk,+mclass"; - } else - // v7a: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureT2XtPk - ARMArchFeature = "+v7,+neon,+db,+t2dsp,+t2xtpk"; + if (NoCPU) + // v7em: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureDSPThumb2, + // FeatureT2XtPk, FeatureMClass + ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+t2dsp,t2xtpk,+mclass"; + else + // Use CPU to figure out the exact features. + ARMArchFeature = "+v7"; + } else { + // v7 CPUs have lots of different feature sets. If no CPU is specified, + // then assume v7a (e.g. cortex-a8) feature set. Otherwise, return + // the "minimum" feature set and use CPU string to figure out the exact + // features. 
+ if (NoCPU) + // v7a: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureT2XtPk + ARMArchFeature = "+v7,+neon,+db,+t2dsp,+t2xtpk"; + else + // Use CPU to figure out the exact features. + ARMArchFeature = "+v7"; + } } else if (SubVer == '6') { if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == '2') ARMArchFeature = "+v6t2"; - else if (Len >= Idx+2 && TT[Idx+1] == 'm') - // v6m: FeatureNoARM, FeatureMClass - ARMArchFeature = "+v6t2,+noarm,+mclass"; - else + else if (Len >= Idx+2 && TT[Idx+1] == 'm') { + if (NoCPU) + // v6m: FeatureNoARM, FeatureMClass + ARMArchFeature = "+v6,+noarm,+mclass"; + else + ARMArchFeature = "+v6"; + } else ARMArchFeature = "+v6"; } else if (SubVer == '5') { if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == 'e') @@ -94,7 +115,7 @@ std::string ARM_MC::ParseARMTriple(StringRef TT) { MCSubtargetInfo *ARM_MC::createARMMCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS) { - std::string ArchFS = ARM_MC::ParseARMTriple(TT); + std::string ArchFS = ARM_MC::ParseARMTriple(TT, CPU); if (!FS.empty()) { if (!ArchFS.empty()) ArchFS = ArchFS + "," + FS.str(); diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h index 88472d7..510302d 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h +++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h @@ -23,6 +23,7 @@ class MCCodeEmitter; class MCContext; class MCInstrInfo; class MCObjectWriter; +class MCRegisterInfo; class MCSubtargetInfo; class StringRef; class Target; @@ -31,7 +32,7 @@ class raw_ostream; extern Target TheARMTarget, TheThumbTarget; namespace ARM_MC { - std::string ParseARMTriple(StringRef TT); + std::string ParseARMTriple(StringRef TT, StringRef CPU); /// createARMMCSubtargetInfo - Create a ARM MCSubtargetInfo instance. /// This is exposed so Asm parser, etc. do not need to go through @@ -41,6 +42,7 @@ namespace ARM_MC { } MCCodeEmitter *createARMMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI, MCContext &Ctx); diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index 8057cb6..78faf59 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -190,7 +190,7 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer, // 0 - arm instructions // 1 - thumb instructions // the other half of the relocated expression is in the following pair - // relocation entry in the the low 16 bits of r_address field. + // relocation entry in the low 16 bits of r_address field. 
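The ParseARMTriple() rework above means the computed feature string now depends on both the triple and the CPU: with no CPU (or "generic") the triple alone selects the full architecture feature set, while an explicit CPU gets only the architecture baseline and is expected to supply the rest from its own feature table. A quick standalone sanity check (a sketch, assuming the patched ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) is linked in; the exact ordering of the expected strings is an assumption read off the code above):

  #include <cassert>
  #include <string>
  #include "llvm/ADT/StringRef.h"
  #include "MCTargetDesc/ARMMCTargetDesc.h"

  int main() {
    using llvm::ARM_MC::ParseARMTriple;
    // No CPU (or "generic"): the triple alone picks the full feature set.
    assert(ParseARMTriple("thumbv7m-none-eabi", "") ==
           "+v7,+noarm,+db,+hwdiv,+mclass");
    assert(ParseARMTriple("thumbv6m-none-eabi", "generic") ==
           "+v6,+noarm,+mclass");
    // With an explicit CPU, only the architecture baseline is emitted and
    // the CPU's own feature table fills in the rest.
    assert(ParseARMTriple("thumbv7m-none-eabi", "cortex-m3") == "+v7");
    assert(ParseARMTriple("thumbv6m-none-eabi", "cortex-m0") == "+v6");
    return 0;
  }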
unsigned ThumbBit = 0; unsigned MovtBit = 0; switch ((unsigned)Fixup.getKind()) { diff --git a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp index 2899836..ad60e32 100644 --- a/lib/Target/ARM/MLxExpansionPass.cpp +++ b/lib/Target/ARM/MLxExpansionPass.cpp @@ -220,7 +220,9 @@ MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI, const MCInstrDesc &MCID1 = TII->get(MulOpc); const MCInstrDesc &MCID2 = TII->get(AddSubOpc); - unsigned TmpReg = MRI->createVirtualRegister(TII->getRegClass(MCID1, 0, TRI)); + const MachineFunction &MF = *MI->getParent()->getParent(); + unsigned TmpReg = MRI->createVirtualRegister( + TII->getRegClass(MCID1, 0, TRI, MF)); MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID1, TmpReg) .addReg(Src1Reg, getKillRegState(Src1Kill)) diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt index 3eddda8..57dc6cb 100644 --- a/lib/Target/ARM/README.txt +++ b/lib/Target/ARM/README.txt @@ -710,3 +710,24 @@ targets, e.g., PPC, that share this behavior, it would be best to implement this in a target-independent way: we should probably fold that (when using "undefined at zero" semantics) to set the "defined at zero" bit and have the code generator expand out the right code. + + +//===---------------------------------------------------------------------===// + +Clean up the test/MC/ARM files to have more robust register choices. + +R0 should not be used as a register operand in the assembler tests as it's then +not possible to distinguish between a correct encoding and a missing operand +encoding, as zero is the default value for the binary encoder. +e.g., + add r0, r0 // bad + add r3, r5 // good + +Register operands should be distinct. That is, when the encoding does not +require two syntactical operands to refer to the same register, two different +registers should be used in the test so as to catch errors where the +operands are swapped in the encoding. 
+e.g., + subs.w r1, r1, r1 // bad + subs.w r1, r2, r3 // good + diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp index e03e758..735b255 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -53,11 +53,11 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - assert((RC == ARM::tGPRRegisterClass || + assert((RC == &ARM::tGPRRegClass || (TargetRegisterInfo::isPhysicalRegister(SrcReg) && isARMLowRegister(SrcReg))) && "Unknown regclass!"); - if (RC == ARM::tGPRRegisterClass || + if (RC == &ARM::tGPRRegClass || (TargetRegisterInfo::isPhysicalRegister(SrcReg) && isARMLowRegister(SrcReg))) { DebugLoc DL; @@ -81,11 +81,11 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - assert((RC == ARM::tGPRRegisterClass || + assert((RC == &ARM::tGPRRegClass || (TargetRegisterInfo::isPhysicalRegister(DestReg) && isARMLowRegister(DestReg))) && "Unknown regclass!"); - if (RC == ARM::tGPRRegisterClass || + if (RC == &ARM::tGPRRegClass || (TargetRegisterInfo::isPhysicalRegister(DestReg) && isARMLowRegister(DestReg))) { DebugLoc DL; diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp index ef77bbd..a39b722 100644 --- a/lib/Target/ARM/Thumb1RegisterInfo.cpp +++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp @@ -49,13 +49,14 @@ const TargetRegisterClass* Thumb1RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const { if (ARM::tGPRRegClass.hasSubClassEq(RC)) - return ARM::tGPRRegisterClass; + return &ARM::tGPRRegClass; return ARMBaseRegisterInfo::getLargestLegalSuperClass(RC); } const TargetRegisterClass * -Thumb1RegisterInfo::getPointerRegClass(unsigned Kind) const { - return ARM::tGPRRegisterClass; +Thumb1RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) + const { + return &ARM::tGPRRegClass; } /// emitLoadConstPool - Emits a load from constpool to materialize the @@ -109,7 +110,7 @@ void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB, unsigned LdReg = DestReg; if (DestReg == ARM::SP) { assert(BaseReg == ARM::SP && "Unexpected!"); - LdReg = MF.getRegInfo().createVirtualRegister(ARM::tGPRRegisterClass); + LdReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass); } if (NumBytes <= 255 && NumBytes >= 0) @@ -693,7 +694,7 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // register. The offset is already handled in the vreg value. 
MI.getOperand(i+1).ChangeToRegister(FrameReg, false, false, false); } else if (MI.mayStore()) { - VReg = MF.getRegInfo().createVirtualRegister(ARM::tGPRRegisterClass); + VReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass); bool UseRR = false; if (Opcode == ARM::tSTRspi) { diff --git a/lib/Target/ARM/Thumb1RegisterInfo.h b/lib/Target/ARM/Thumb1RegisterInfo.h index 6971842..f2e4b08 100644 --- a/lib/Target/ARM/Thumb1RegisterInfo.h +++ b/lib/Target/ARM/Thumb1RegisterInfo.h @@ -30,7 +30,8 @@ public: const TargetRegisterClass* getLargestLegalSuperClass(const TargetRegisterClass *RC) const; - const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const; + const TargetRegisterClass* + getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const; /// emitLoadConstPool - Emits a load from constpool to materialize the /// specified immediate. diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp index ecb4c2f..d54aa93 100644 --- a/lib/Target/ARM/Thumb2ITBlockPass.cpp +++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp @@ -24,8 +24,6 @@ STATISTIC(NumMovedInsts, "Number of predicated instructions moved"); namespace { class Thumb2ITBlockPass : public MachineFunctionPass { - bool PreRegAlloc; - public: static char ID; Thumb2ITBlockPass() : MachineFunctionPass(ID) {} @@ -76,16 +74,14 @@ static void TrackDefUses(MachineInstr *MI, for (unsigned i = 0, e = LocalUses.size(); i != e; ++i) { unsigned Reg = LocalUses[i]; Uses.insert(Reg); - for (const uint16_t *Subreg = TRI->getSubRegisters(Reg); - *Subreg; ++Subreg) + for (MCSubRegIterator Subreg(Reg, TRI); Subreg.isValid(); ++Subreg) Uses.insert(*Subreg); } for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) { unsigned Reg = LocalDefs[i]; Defs.insert(Reg); - for (const uint16_t *Subreg = TRI->getSubRegisters(Reg); - *Subreg; ++Subreg) + for (MCSubRegIterator Subreg(Reg, TRI); Subreg.isValid(); ++Subreg) Defs.insert(*Subreg); if (Reg == ARM::CPSR) continue; diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp index 8ab486b..2097bb9 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -126,9 +126,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass || - RC == ARM::tcGPRRegisterClass || RC == ARM::rGPRRegisterClass || - RC == ARM::GPRnopcRegisterClass) { + if (RC == &ARM::GPRRegClass || RC == &ARM::tGPRRegClass || + RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass || + RC == &ARM::GPRnopcRegClass) { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); @@ -153,9 +153,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass || - RC == ARM::tcGPRRegisterClass || RC == ARM::rGPRRegisterClass || - RC == ARM::GPRnopcRegisterClass) { + if (RC == &ARM::GPRRegClass || RC == &ARM::tGPRRegClass || + RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass || + RC == &ARM::GPRnopcRegClass) { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp index b5a397e..f18f491 100644 --- a/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/lib/Target/ARM/Thumb2SizeReduction.cpp @@ 
-67,6 +67,7 @@ namespace { { ARM::t2BICrr, 0, ARM::tBIC, 0, 0, 0, 1, 0,0, 1,0 }, //FIXME: Disable CMN, as CCodes are backwards from compare expectations //{ ARM::t2CMNrr, ARM::tCMN, 0, 0, 0, 1, 0, 2,0, 0,0 }, + { ARM::t2CMNzrr, ARM::tCMNz, 0, 0, 0, 1, 0, 2,0, 0,0 }, { ARM::t2CMPri, ARM::tCMPi8, 0, 8, 0, 1, 0, 2,0, 0,0 }, { ARM::t2CMPrr, ARM::tCMPhir, 0, 0, 0, 0, 0, 2,0, 0,1 }, { ARM::t2EORrr, 0, ARM::tEOR, 0, 0, 0, 1, 0,0, 1,0 }, diff --git a/lib/Target/CellSPU/CMakeLists.txt b/lib/Target/CellSPU/CMakeLists.txt index cf4f796..1f8ca86 100644 --- a/lib/Target/CellSPU/CMakeLists.txt +++ b/lib/Target/CellSPU/CMakeLists.txt @@ -24,5 +24,7 @@ add_llvm_target(CellSPUCodeGen SPUNopFiller.cpp ) +add_dependencies(LLVMCellSPUCodeGen intrinsics_gen) + add_subdirectory(TargetInfo) add_subdirectory(MCTargetDesc) diff --git a/lib/Target/CellSPU/README.txt b/lib/Target/CellSPU/README.txt index 3e7e0b6..3bce960 100644 --- a/lib/Target/CellSPU/README.txt +++ b/lib/Target/CellSPU/README.txt @@ -37,6 +37,20 @@ to add 'spu' to configure's --enable-targets option, e.g.: --------------------------------------------------------------------------- TODO: +* In commit r142152 vector legalization was set to element promotion per + default. This breaks half vectors (e.g. v2i32) badly as they get element + promoted to much slower types (v2i64). + +* Many CellSPU specific codegen tests only grep & count the number of + instructions, not checking their place with FileCheck. There have also + been some commits that change the CellSPU checks, some of which might + have not been thoroughly scrutinized w.r.t. to the changes they cause in SPU + assembly. (especially since about the time of r142152) + +* Some of the i64 math have huge tablegen rules, which sometime cause + tablegen to run out of memory. See e.g. bug 8850. i64 arithmetics + should probably be done with libraries. + * Create a machine pass for performing dual-pipeline scheduling specifically for CellSPU, and insert branch prediction instructions as needed. diff --git a/lib/Target/CellSPU/SPUAsmPrinter.cpp b/lib/Target/CellSPU/SPUAsmPrinter.cpp index 14021fe..03d5a9a 100644 --- a/lib/Target/CellSPU/SPUAsmPrinter.cpp +++ b/lib/Target/CellSPU/SPUAsmPrinter.cpp @@ -301,7 +301,9 @@ bool SPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, if (ExtraCode[1] != 0) return true; // Unknown modifier. switch (ExtraCode[0]) { - default: return true; // Unknown modifier. + default: + // See if this is a generic print operand + return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); case 'L': // Write second word of DImode reference. // Verify that this operand has two consecutive registers. if (!MI->getOperand(OpNo).isReg() || diff --git a/lib/Target/CellSPU/SPUHazardRecognizers.cpp b/lib/Target/CellSPU/SPUHazardRecognizers.cpp index 403d7ef..67a83f1 100644 --- a/lib/Target/CellSPU/SPUHazardRecognizers.cpp +++ b/lib/Target/CellSPU/SPUHazardRecognizers.cpp @@ -30,12 +30,6 @@ using namespace llvm; // very little right now. //===----------------------------------------------------------------------===// -SPUHazardRecognizer::SPUHazardRecognizer(const TargetInstrInfo &tii) : - TII(tii), - EvenOdd(0) -{ -} - /// Return the pipeline hazard type encountered or generated by this /// instruction. Currently returns NoHazard. 
/// diff --git a/lib/Target/CellSPU/SPUHazardRecognizers.h b/lib/Target/CellSPU/SPUHazardRecognizers.h index 675632c..30acaea 100644 --- a/lib/Target/CellSPU/SPUHazardRecognizers.h +++ b/lib/Target/CellSPU/SPUHazardRecognizers.h @@ -24,12 +24,8 @@ class TargetInstrInfo; /// SPUHazardRecognizer class SPUHazardRecognizer : public ScheduleHazardRecognizer { -private: - const TargetInstrInfo &TII; - int EvenOdd; - public: - SPUHazardRecognizer(const TargetInstrInfo &TII); + SPUHazardRecognizer(const TargetInstrInfo &/*TII*/) {} virtual HazardType getHazardType(SUnit *SU, int Stalls); virtual void EmitInstruction(SUnit *SU); virtual void AdvanceCycle(); diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp index 0623741..4e9fcd1 100644 --- a/lib/Target/CellSPU/SPUISelLowering.cpp +++ b/lib/Target/CellSPU/SPUISelLowering.cpp @@ -77,12 +77,14 @@ namespace { // Splice the libcall in wherever FindInputOutputChains tells us to. Type *RetTy = Op.getNode()->getValueType(0).getTypeForEVT(*DAG.getContext()); - std::pair<SDValue, SDValue> CallInfo = - TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false, + TargetLowering::CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, + false, false, 0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false, - /*doesNotRet=*/false, /*isReturnValueUsed=*/true, + /*doesNotRet=*/false, + /*isReturnValueUsed=*/true, Callee, Args, DAG, Op.getDebugLoc()); + std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); return CallInfo.first; } @@ -100,13 +102,13 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setLibcallName(RTLIB::DIV_F64, "__fast_divdf3"); // Set up the SPU's register classes: - addRegisterClass(MVT::i8, SPU::R8CRegisterClass); - addRegisterClass(MVT::i16, SPU::R16CRegisterClass); - addRegisterClass(MVT::i32, SPU::R32CRegisterClass); - addRegisterClass(MVT::i64, SPU::R64CRegisterClass); - addRegisterClass(MVT::f32, SPU::R32FPRegisterClass); - addRegisterClass(MVT::f64, SPU::R64FPRegisterClass); - addRegisterClass(MVT::i128, SPU::GPRCRegisterClass); + addRegisterClass(MVT::i8, &SPU::R8CRegClass); + addRegisterClass(MVT::i16, &SPU::R16CRegClass); + addRegisterClass(MVT::i32, &SPU::R32CRegClass); + addRegisterClass(MVT::i64, &SPU::R64CRegClass); + addRegisterClass(MVT::f32, &SPU::R32FPRegClass); + addRegisterClass(MVT::f64, &SPU::R64FPRegClass); + addRegisterClass(MVT::i128, &SPU::GPRCRegClass); // SPU has no sign or zero extended loads for i1, i8, i16: setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote); @@ -397,12 +399,12 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) // First set operation action for all vector types to expand. Then we // will selectively turn on ones that can be effectively codegen'd. 
- addRegisterClass(MVT::v16i8, SPU::VECREGRegisterClass); - addRegisterClass(MVT::v8i16, SPU::VECREGRegisterClass); - addRegisterClass(MVT::v4i32, SPU::VECREGRegisterClass); - addRegisterClass(MVT::v2i64, SPU::VECREGRegisterClass); - addRegisterClass(MVT::v4f32, SPU::VECREGRegisterClass); - addRegisterClass(MVT::v2f64, SPU::VECREGRegisterClass); + addRegisterClass(MVT::v16i8, &SPU::VECREGRegClass); + addRegisterClass(MVT::v8i16, &SPU::VECREGRegClass); + addRegisterClass(MVT::v4i32, &SPU::VECREGRegClass); + addRegisterClass(MVT::v2i64, &SPU::VECREGRegClass); + addRegisterClass(MVT::v4f32, &SPU::VECREGRegClass); + addRegisterClass(MVT::v2f64, &SPU::VECREGRegClass); for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) { @@ -1133,7 +1135,7 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain, SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); // FIXME: allow for other calling conventions CCInfo.AnalyzeFormalArguments(Ins, CCC_SPU); @@ -1263,14 +1265,19 @@ static SDNode *isLSAAddress(SDValue Op, SelectionDAG &DAG) { } SDValue -SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, - bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, +SPUTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + DebugLoc &dl = CLI.DL; + SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; + SmallVector<SDValue, 32> &OutVals = CLI.OutVals; + SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &isTailCall = CLI.IsTailCall; + CallingConv::ID CallConv = CLI.CallConv; + bool isVarArg = CLI.IsVarArg; + // CellSPU target does not yet support tail call optimization. 
isTailCall = false; @@ -1280,7 +1287,7 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); // FIXME: allow for other calling conventions CCInfo.AnalyzeCallOperands(Outs, CCC_SPU); @@ -1441,7 +1448,7 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Now handle the return value(s) SmallVector<CCValAssign, 16> RVLocs; CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); CCRetInfo.AnalyzeCallResult(Ins, CCC_SPU); @@ -1468,7 +1475,7 @@ SPUTargetLowering::LowerReturn(SDValue Chain, SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_SPU); // If this is the first return lowered for this function, add the regs to the @@ -3139,16 +3146,16 @@ SPUTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, case 'b': // R1-R31 case 'r': // R0-R31 if (VT == MVT::i64) - return std::make_pair(0U, SPU::R64CRegisterClass); - return std::make_pair(0U, SPU::R32CRegisterClass); + return std::make_pair(0U, &SPU::R64CRegClass); + return std::make_pair(0U, &SPU::R32CRegClass); case 'f': if (VT == MVT::f32) - return std::make_pair(0U, SPU::R32FPRegisterClass); - else if (VT == MVT::f64) - return std::make_pair(0U, SPU::R64FPRegisterClass); + return std::make_pair(0U, &SPU::R32FPRegClass); + if (VT == MVT::f64) + return std::make_pair(0U, &SPU::R64FPRegClass); break; case 'v': - return std::make_pair(0U, SPU::GPRCRegisterClass); + return std::make_pair(0U, &SPU::GPRCRegClass); } } diff --git a/lib/Target/CellSPU/SPUISelLowering.h b/lib/Target/CellSPU/SPUISelLowering.h index e3db7b2..9f1599f 100644 --- a/lib/Target/CellSPU/SPUISelLowering.h +++ b/lib/Target/CellSPU/SPUISelLowering.h @@ -86,7 +86,6 @@ namespace llvm { class SPUTargetLowering : public TargetLowering { - int VarArgsFrameIndex; // FrameIndex for start of varargs area. SPUTargetMachine &SPUTM; public: @@ -159,13 +158,7 @@ namespace llvm { SmallVectorImpl<SDValue> &InVals) const; virtual SDValue - LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, - bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const; virtual SDValue diff --git a/lib/Target/CellSPU/SPUInstrInfo.cpp b/lib/Target/CellSPU/SPUInstrInfo.cpp index 759923d..b25a639 100644 --- a/lib/Target/CellSPU/SPUInstrInfo.cpp +++ b/lib/Target/CellSPU/SPUInstrInfo.cpp @@ -140,29 +140,27 @@ SPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, bool isKill, int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const -{ + const TargetRegisterInfo *TRI) const { unsigned opc; bool isValidFrameIdx = (FrameIdx < SPUFrameLowering::maxFrameOffset()); - if (RC == SPU::GPRCRegisterClass) { - opc = (isValidFrameIdx ? SPU::STQDr128 : SPU::STQXr128); - } else if (RC == SPU::R64CRegisterClass) { - opc = (isValidFrameIdx ? 
SPU::STQDr64 : SPU::STQXr64); - } else if (RC == SPU::R64FPRegisterClass) { - opc = (isValidFrameIdx ? SPU::STQDr64 : SPU::STQXr64); - } else if (RC == SPU::R32CRegisterClass) { - opc = (isValidFrameIdx ? SPU::STQDr32 : SPU::STQXr32); - } else if (RC == SPU::R32FPRegisterClass) { - opc = (isValidFrameIdx ? SPU::STQDr32 : SPU::STQXr32); - } else if (RC == SPU::R16CRegisterClass) { - opc = (isValidFrameIdx ? SPU::STQDr16 : SPU::STQXr16); - } else if (RC == SPU::R8CRegisterClass) { - opc = (isValidFrameIdx ? SPU::STQDr8 : SPU::STQXr8); - } else if (RC == SPU::VECREGRegisterClass) { - opc = (isValidFrameIdx) ? SPU::STQDv16i8 : SPU::STQXv16i8; - } else { + if (RC == &SPU::GPRCRegClass) + opc = isValidFrameIdx ? SPU::STQDr128 : SPU::STQXr128; + else if (RC == &SPU::R64CRegClass) + opc = isValidFrameIdx ? SPU::STQDr64 : SPU::STQXr64; + else if (RC == &SPU::R64FPRegClass) + opc = isValidFrameIdx ? SPU::STQDr64 : SPU::STQXr64; + else if (RC == &SPU::R32CRegClass) + opc = isValidFrameIdx ? SPU::STQDr32 : SPU::STQXr32; + else if (RC == &SPU::R32FPRegClass) + opc = isValidFrameIdx ? SPU::STQDr32 : SPU::STQXr32; + else if (RC == &SPU::R16CRegClass) + opc = isValidFrameIdx ? SPU::STQDr16 : SPU::STQXr16; + else if (RC == &SPU::R8CRegClass) + opc = isValidFrameIdx ? SPU::STQDr8 : SPU::STQXr8; + else if (RC == &SPU::VECREGRegClass) + opc = isValidFrameIdx ? SPU::STQDv16i8 : SPU::STQXv16i8; + else llvm_unreachable("Unknown regclass!"); - } DebugLoc DL; if (MI != MBB.end()) DL = MI->getDebugLoc(); @@ -175,29 +173,27 @@ SPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const -{ + const TargetRegisterInfo *TRI) const { unsigned opc; bool isValidFrameIdx = (FrameIdx < SPUFrameLowering::maxFrameOffset()); - if (RC == SPU::GPRCRegisterClass) { - opc = (isValidFrameIdx ? SPU::LQDr128 : SPU::LQXr128); - } else if (RC == SPU::R64CRegisterClass) { - opc = (isValidFrameIdx ? SPU::LQDr64 : SPU::LQXr64); - } else if (RC == SPU::R64FPRegisterClass) { - opc = (isValidFrameIdx ? SPU::LQDr64 : SPU::LQXr64); - } else if (RC == SPU::R32CRegisterClass) { - opc = (isValidFrameIdx ? SPU::LQDr32 : SPU::LQXr32); - } else if (RC == SPU::R32FPRegisterClass) { - opc = (isValidFrameIdx ? SPU::LQDr32 : SPU::LQXr32); - } else if (RC == SPU::R16CRegisterClass) { - opc = (isValidFrameIdx ? SPU::LQDr16 : SPU::LQXr16); - } else if (RC == SPU::R8CRegisterClass) { - opc = (isValidFrameIdx ? SPU::LQDr8 : SPU::LQXr8); - } else if (RC == SPU::VECREGRegisterClass) { - opc = (isValidFrameIdx) ? SPU::LQDv16i8 : SPU::LQXv16i8; - } else { + if (RC == &SPU::GPRCRegClass) + opc = isValidFrameIdx ? SPU::LQDr128 : SPU::LQXr128; + else if (RC == &SPU::R64CRegClass) + opc = isValidFrameIdx ? SPU::LQDr64 : SPU::LQXr64; + else if (RC == &SPU::R64FPRegClass) + opc = isValidFrameIdx ? SPU::LQDr64 : SPU::LQXr64; + else if (RC == &SPU::R32CRegClass) + opc = isValidFrameIdx ? SPU::LQDr32 : SPU::LQXr32; + else if (RC == &SPU::R32FPRegClass) + opc = isValidFrameIdx ? SPU::LQDr32 : SPU::LQXr32; + else if (RC == &SPU::R16CRegClass) + opc = isValidFrameIdx ? SPU::LQDr16 : SPU::LQXr16; + else if (RC == &SPU::R8CRegClass) + opc = isValidFrameIdx ? SPU::LQDr8 : SPU::LQXr8; + else if (RC == &SPU::VECREGRegClass) + opc = isValidFrameIdx ? 
SPU::LQDv16i8 : SPU::LQXv16i8; + else llvm_unreachable("Unknown regclass in loadRegFromStackSlot!"); - } DebugLoc DL; if (MI != MBB.end()) DL = MI->getDebugLoc(); @@ -340,11 +336,11 @@ SPUInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { static MachineBasicBlock::iterator findHBRPosition(MachineBasicBlock &MBB) { MachineBasicBlock::iterator J = MBB.end(); - for( int i=0; i<8; i++) { - if( J == MBB.begin() ) return J; - J--; - } - return J; + for( int i=0; i<8; i++) { + if( J == MBB.begin() ) return J; + J--; + } + return J; } unsigned @@ -360,7 +356,7 @@ SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineInstrBuilder MIB; //TODO: make a more accurate algorithm. bool haveHBR = MBB.size()>8; - + removeHBR(MBB); MCSymbol *branchLabel = MBB.getParent()->getContext().CreateTempSymbol(); // Add a label just before the branch @@ -382,7 +378,7 @@ SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MIB = BuildMI( MBB, findHBRPosition(MBB), DL, get(SPU::HBRA)); MIB.addSym(branchLabel); MIB.addMBB(TBB); - } + } } else { // Conditional branch MIB = BuildMI(&MBB, DL, get(Cond[0].getImm())); @@ -392,7 +388,7 @@ SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MIB = BuildMI(MBB, findHBRPosition(MBB), DL, get(SPU::HBRA)); MIB.addSym(branchLabel); MIB.addMBB(TBB); - } + } DEBUG(errs() << "Inserted one-way cond branch: "); DEBUG((*MIB).dump()); @@ -410,7 +406,7 @@ SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MIB = BuildMI( MBB, findHBRPosition(MBB), DL, get(SPU::HBRA)); MIB.addSym(branchLabel); MIB.addMBB(FBB); - } + } DEBUG(errs() << "Inserted conditional branch: "); DEBUG((*MIB).dump()); diff --git a/lib/Target/CellSPU/SPUInstrInfo.td b/lib/Target/CellSPU/SPUInstrInfo.td index f76ebd7..117acd7 100644 --- a/lib/Target/CellSPU/SPUInstrInfo.td +++ b/lib/Target/CellSPU/SPUInstrInfo.td @@ -3421,14 +3421,14 @@ let isCall = 1, // Branch relative and set link: Used if we actually know that the target // is within [-32768, 32767] bytes of the target def BRSL: - BranchSetLink<0b011001100, (outs), (ins relcalltarget:$func, variable_ops), + BranchSetLink<0b011001100, (outs), (ins relcalltarget:$func), "brsl\t$$lr, $func", [(SPUcall (SPUpcrel tglobaladdr:$func, 0))]>; // Branch absolute and set link: Used if we actually know that the target // is an absolute address def BRASL: - BranchSetLink<0b011001100, (outs), (ins calltarget:$func, variable_ops), + BranchSetLink<0b011001100, (outs), (ins calltarget:$func), "brasl\t$$lr, $func", [(SPUcall (SPUaform tglobaladdr:$func, 0))]>; diff --git a/lib/Target/CellSPU/SPURegisterInfo.cpp b/lib/Target/CellSPU/SPURegisterInfo.cpp index 1b2da5f..e6c872d 100644 --- a/lib/Target/CellSPU/SPURegisterInfo.cpp +++ b/lib/Target/CellSPU/SPURegisterInfo.cpp @@ -193,7 +193,8 @@ SPURegisterInfo::SPURegisterInfo(const SPUSubtarget &subtarget, /// getPointerRegClass - Return the register class to use to hold pointers. /// This is used for addressing modes. const TargetRegisterClass * -SPURegisterInfo::getPointerRegClass(unsigned Kind) const { +SPURegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) + const { return &SPU::R32CRegClass; } diff --git a/lib/Target/CellSPU/SPURegisterInfo.h b/lib/Target/CellSPU/SPURegisterInfo.h index e5ab224..e9f9aba 100644 --- a/lib/Target/CellSPU/SPURegisterInfo.h +++ b/lib/Target/CellSPU/SPURegisterInfo.h @@ -46,7 +46,7 @@ namespace llvm { /// getPointerRegClass - Return the register class to use to hold pointers. 
/// This is used for addressing modes. virtual const TargetRegisterClass * - getPointerRegClass(unsigned Kind = 0) const; + getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const; /// After allocating this many registers, the allocator should feel /// register pressure. The value is a somewhat random guess, based on the @@ -63,6 +63,11 @@ namespace llvm { virtual bool requiresRegisterScavenging(const MachineFunction &MF) const { return true; } + //! Enable tracking of liveness after register allocation, since register + // scavenging is enabled. + virtual bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const + { return true; } + //! Return the reserved registers BitVector getReservedRegs(const MachineFunction &MF) const; diff --git a/lib/Target/CellSPU/SPUTargetMachine.cpp b/lib/Target/CellSPU/SPUTargetMachine.cpp index 21f6b25..54764f1 100644 --- a/lib/Target/CellSPU/SPUTargetMachine.cpp +++ b/lib/Target/CellSPU/SPUTargetMachine.cpp @@ -72,7 +72,7 @@ TargetPassConfig *SPUTargetMachine::createPassConfig(PassManagerBase &PM) { bool SPUPassConfig::addInstSelector() { // Install an instruction selector. - PM.add(createSPUISelDag(getSPUTargetMachine())); + addPass(createSPUISelDag(getSPUTargetMachine())); return false; } @@ -85,9 +85,9 @@ bool SPUPassConfig::addPreEmitPass() { (BuilderFunc)(intptr_t)sys::DynamicLibrary::SearchForAddressOfSymbol( "createTCESchedulerPass"); if (schedulerCreator != NULL) - PM.add(schedulerCreator("cellspu")); + addPass(schedulerCreator("cellspu")); //align instructions with nops/lnops for dual issue - PM.add(createSPUNopFillerPass(getSPUTargetMachine())); + addPass(createSPUNopFillerPass(getSPUTargetMachine())); return true; } diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp index 69f0ff8..c8e757b 100644 --- a/lib/Target/CppBackend/CPPBackend.cpp +++ b/lib/Target/CppBackend/CPPBackend.cpp @@ -130,6 +130,7 @@ namespace { private: void printLinkageType(GlobalValue::LinkageTypes LT); void printVisibilityType(GlobalValue::VisibilityTypes VisTypes); + void printThreadLocalMode(GlobalVariable::ThreadLocalMode TLM); void printCallingConv(CallingConv::ID cc); void printEscapedString(const std::string& str); void printCFP(const ConstantFP* CFP); @@ -325,6 +326,26 @@ void CppWriter::printVisibilityType(GlobalValue::VisibilityTypes VisType) { } } +void CppWriter::printThreadLocalMode(GlobalVariable::ThreadLocalMode TLM) { + switch (TLM) { + case GlobalVariable::NotThreadLocal: + Out << "GlobalVariable::NotThreadLocal"; + break; + case GlobalVariable::GeneralDynamicTLSModel: + Out << "GlobalVariable::GeneralDynamicTLSModel"; + break; + case GlobalVariable::LocalDynamicTLSModel: + Out << "GlobalVariable::LocalDynamicTLSModel"; + break; + case GlobalVariable::InitialExecTLSModel: + Out << "GlobalVariable::InitialExecTLSModel"; + break; + case GlobalVariable::LocalExecTLSModel: + Out << "GlobalVariable::LocalExecTLSModel"; + break; + } +} + // printEscapedString - Print each character of the specified string, escaping // it if it is not printable or if it is an escape char. 
void CppWriter::printEscapedString(const std::string &Str) { @@ -496,7 +517,7 @@ void CppWriter::printAttributes(const AttrListPtr &PAL, Out << "Attrs.push_back(PAWI);"; nl(Out); } - Out << name << "_PAL = AttrListPtr::get(Attrs.begin(), Attrs.end());"; + Out << name << "_PAL = AttrListPtr::get(Attrs);"; nl(Out); out(); nl(Out); Out << '}'; nl(Out); @@ -996,7 +1017,9 @@ void CppWriter::printVariableHead(const GlobalVariable *GV) { } if (GV->isThreadLocal()) { printCppName(GV); - Out << "->setThreadLocal(true);"; + Out << "->setThreadLocalMode("; + printThreadLocalMode(GV->getThreadLocalMode()); + Out << ");"; nl(Out); } if (is_inline) { @@ -1105,7 +1128,7 @@ void CppWriter::printInstruction(const Instruction *I, nl(Out); for (SwitchInst::ConstCaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) { - const ConstantInt* CaseVal = i.getCaseValue(); + const IntegersSubset CaseVal = i.getCaseValueEx(); const BasicBlock *BB = i.getCaseSuccessor(); Out << iName << "->addCase(" << getOpName(CaseVal) << ", " @@ -2078,7 +2101,9 @@ char CppWriter::ID = 0; bool CPPTargetMachine::addPassesToEmitFile(PassManagerBase &PM, formatted_raw_ostream &o, CodeGenFileType FileType, - bool DisableVerify) { + bool DisableVerify, + AnalysisID StartAfter, + AnalysisID StopAfter) { if (FileType != TargetMachine::CGFT_AssemblyFile) return true; PM.add(new CppWriter(o)); return false; diff --git a/lib/Target/CppBackend/CPPTargetMachine.h b/lib/Target/CppBackend/CPPTargetMachine.h index 92bca6c..9cbe798 100644 --- a/lib/Target/CppBackend/CPPTargetMachine.h +++ b/lib/Target/CppBackend/CPPTargetMachine.h @@ -31,7 +31,9 @@ struct CPPTargetMachine : public TargetMachine { virtual bool addPassesToEmitFile(PassManagerBase &PM, formatted_raw_ostream &Out, CodeGenFileType FileType, - bool DisableVerify); + bool DisableVerify, + AnalysisID StartAfter, + AnalysisID StopAfter); virtual const TargetData *getTargetData() const { return 0; } }; diff --git a/lib/Target/Hexagon/CMakeLists.txt b/lib/Target/Hexagon/CMakeLists.txt index af9e813..1f2d8ac 100644 --- a/lib/Target/Hexagon/CMakeLists.txt +++ b/lib/Target/Hexagon/CMakeLists.txt @@ -28,8 +28,12 @@ add_llvm_target(HexagonCodeGen HexagonSubtarget.cpp HexagonTargetMachine.cpp HexagonTargetObjectFile.cpp + HexagonVLIWPacketizer.cpp + HexagonNewValueJump.cpp ) +add_dependencies(LLVMHexagonCodeGen intrinsics_gen) + add_subdirectory(TargetInfo) add_subdirectory(InstPrinter) add_subdirectory(MCTargetDesc) diff --git a/lib/Target/Hexagon/Hexagon.h b/lib/Target/Hexagon/Hexagon.h index 0808323..45f857b 100644 --- a/lib/Target/Hexagon/Hexagon.h +++ b/lib/Target/Hexagon/Hexagon.h @@ -40,6 +40,9 @@ namespace llvm { FunctionPass *createHexagonHardwareLoops(); FunctionPass *createHexagonPeephole(); FunctionPass *createHexagonFixupHwLoops(); + FunctionPass *createHexagonPacketizer(); + FunctionPass *createHexagonNewValueJump(); + /* TODO: object output. MCCodeEmitter *createHexagonMCCodeEmitter(const Target &, @@ -47,7 +50,8 @@ namespace llvm { MCContext &Ctx); */ /* TODO: assembler input. - TargetAsmBackend *createHexagonAsmBackend(const Target &, const std::string &); + TargetAsmBackend *createHexagonAsmBackend(const Target &, + const std::string &); */ void HexagonLowerToMC(const MachineInstr *MI, MCInst &MCI, HexagonAsmPrinter &AP); @@ -67,7 +71,7 @@ namespace llvm { // Normal instruction size (in bytes). #define HEXAGON_INSTR_SIZE 4 -// Maximum number of words in a packet (in instructions). +// Maximum number of words and instructions in a packet. 
#define HEXAGON_PACKET_SIZE 4 #endif diff --git a/lib/Target/Hexagon/Hexagon.td b/lib/Target/Hexagon/Hexagon.td index 4a50d16..451e562 100644 --- a/lib/Target/Hexagon/Hexagon.td +++ b/lib/Target/Hexagon/Hexagon.td @@ -28,6 +28,8 @@ def ArchV3 : SubtargetFeature<"v3", "HexagonArchVersion", "V3", "Hexagon v3">; def ArchV4 : SubtargetFeature<"v4", "HexagonArchVersion", "V4", "Hexagon v4">; +def ArchV5 : SubtargetFeature<"v5", "HexagonArchVersion", "V5", + "Hexagon v5">; //===----------------------------------------------------------------------===// // Register File, Calling Conv, Instruction Descriptions @@ -45,13 +47,15 @@ def HexagonInstrInfo : InstrInfo; // Hexagon processors supported. //===----------------------------------------------------------------------===// -class Proc<string Name, ProcessorItineraries Itin, +class Proc<string Name, SchedMachineModel Model, list<SubtargetFeature> Features> - : Processor<Name, Itin, Features>; + : ProcessorModel<Name, Model, Features>; + +def : Proc<"hexagonv2", HexagonModel, [ArchV2]>; +def : Proc<"hexagonv3", HexagonModel, [ArchV2, ArchV3]>; +def : Proc<"hexagonv4", HexagonModelV4, [ArchV2, ArchV3, ArchV4]>; +def : Proc<"hexagonv5", HexagonModelV4, [ArchV2, ArchV3, ArchV4, ArchV5]>; -def : Proc<"hexagonv2", HexagonItineraries, [ArchV2]>; -def : Proc<"hexagonv3", HexagonItineraries, [ArchV2, ArchV3]>; -def : Proc<"hexagonv4", HexagonItinerariesV4, [ArchV2, ArchV3, ArchV4]>; // Hexagon Uses the MC printer for assembler output, so make sure the TableGen // AsmWriter bits get associated with the correct class. diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp index 39bf45d..5fa4740 100644 --- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -13,11 +13,11 @@ // //===----------------------------------------------------------------------===// - #define DEBUG_TYPE "asm-printer" #include "Hexagon.h" #include "HexagonAsmPrinter.h" #include "HexagonMachineFunctionInfo.h" +#include "HexagonMCInst.h" #include "HexagonTargetMachine.h" #include "HexagonSubtarget.h" #include "InstPrinter/HexagonInstPrinter.h" @@ -77,8 +77,7 @@ void HexagonAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, const MachineOperand &MO = MI->getOperand(OpNo); switch (MO.getType()) { - default: - assert(0 && "<unknown operand type>"); + default: llvm_unreachable ("<unknown operand type>"); case MachineOperand::MO_Register: O << HexagonInstPrinter::getRegisterName(MO.getReg()); return; @@ -134,7 +133,9 @@ bool HexagonAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, if (ExtraCode[1] != 0) return true; // Unknown modifier. switch (ExtraCode[0]) { - default: return true; // Unknown modifier. + default: + // See if this is a generic print operand + return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, OS); case 'c': // Don't print "$" before a global var name or constant. // Hexagon never has a prefix. printOperand(MI, OpNo, OS); @@ -196,10 +197,45 @@ void HexagonAsmPrinter::printPredicateOperand(const MachineInstr *MI, /// the current output stream. 
/// void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) { - MCInst MCI; - - HexagonLowerToMC(MI, MCI, *this); - OutStreamer.EmitInstruction(MCI); + if (MI->isBundle()) { + std::vector<const MachineInstr*> BundleMIs; + + const MachineBasicBlock *MBB = MI->getParent(); + MachineBasicBlock::const_instr_iterator MII = MI; + ++MII; + unsigned int IgnoreCount = 0; + while (MII != MBB->end() && MII->isInsideBundle()) { + const MachineInstr *MInst = MII; + if (MInst->getOpcode() == TargetOpcode::DBG_VALUE || + MInst->getOpcode() == TargetOpcode::IMPLICIT_DEF) { + IgnoreCount++; + ++MII; + continue; + } + BundleMIs.push_back(MInst); + ++MII; + } + unsigned Size = BundleMIs.size(); + assert((Size+IgnoreCount) == MI->getBundleSize() && "Corrupt Bundle!"); + for (unsigned Index = 0; Index < Size; Index++) { + HexagonMCInst MCI; + MCI.setStartPacket(Index == 0); + MCI.setEndPacket(Index == (Size-1)); + + HexagonLowerToMC(BundleMIs[Index], MCI, *this); + OutStreamer.EmitInstruction(MCI); + } + } + else { + HexagonMCInst MCI; + if (MI->getOpcode() == Hexagon::ENDLOOP0) { + MCI.setStartPacket(true); + MCI.setEndPacket(true); + } + HexagonLowerToMC(MI, MCI, *this); + OutStreamer.EmitInstruction(MCI); + } return; } @@ -241,15 +277,15 @@ void HexagonAsmPrinter::printGlobalOperand(const MachineInstr *MI, int OpNo, void HexagonAsmPrinter::printJumpTable(const MachineInstr *MI, int OpNo, raw_ostream &O) { const MachineOperand &MO = MI->getOperand(OpNo); - assert( (MO.getType() == MachineOperand::MO_JumpTableIndex) && - "Expecting jump table index"); + assert( (MO.getType() == MachineOperand::MO_JumpTableIndex) && + "Expecting jump table index"); // Hexagon_TODO: Do we need name mangling? O << *GetJTISymbol(MO.getIndex()); } void HexagonAsmPrinter::printConstantPool(const MachineInstr *MI, int OpNo, - raw_ostream &O) { + raw_ostream &O) { const MachineOperand &MO = MI->getOperand(OpNo); assert( (MO.getType() == MachineOperand::MO_ConstantPoolIndex) && "Expecting constant pool index"); diff --git a/lib/Target/Hexagon/HexagonCallingConv.td b/lib/Target/Hexagon/HexagonCallingConv.td index bd9608b..e61b2a7 100644 --- a/lib/Target/Hexagon/HexagonCallingConv.td +++ b/lib/Target/Hexagon/HexagonCallingConv.td @@ -17,8 +17,8 @@ // Hexagon 32-bit C return-value convention. def RetCC_Hexagon32 : CallingConv<[ - CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3, R4, R5]>>, - CCIfType<[i64], CCAssignToReg<[D0, D1, D2]>>, + CCIfType<[i32, f32], CCAssignToReg<[R0, R1, R2, R3, R4, R5]>>, + CCIfType<[i64, f64], CCAssignToReg<[D0, D1, D2]>>, // Alternatively, they are assigned to the stack in 4-byte aligned units. CCAssignToStack<4, 4> @@ -27,8 +27,8 @@ def RetCC_Hexagon32 : CallingConv<[ // Hexagon 32-bit C Calling convention. def CC_Hexagon32 : CallingConv<[ // All arguments get passed in integer registers if there is space. - CCIfType<[i32, i16, i8], CCAssignToReg<[R0, R1, R2, R3, R4, R5]>>, - CCIfType<[i64], CCAssignToReg<[D0, D1, D2]>>, + CCIfType<[f32, i32, i16, i8], CCAssignToReg<[R0, R1, R2, R3, R4, R5]>>, + CCIfType<[f64, i64], CCAssignToReg<[D0, D1, D2]>>, // Alternatively, they are assigned to the stack in 4-byte aligned units.
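The bundle-aware EmitInstruction above is the printer half of the new packetizer: DBG_VALUE and IMPLICIT_DEF members are skipped, and the first and last remaining instructions are tagged as packet start and end, which is what produces Hexagon's { ... } packet braces in the assembly. A minimal stand-alone sketch of the same walk, with simplified stand-ins for MachineInstr and HexagonMCInst:

#include <cassert>
#include <cstdio>
#include <vector>

enum Kind { Normal, DbgValue, ImplicitDef };
struct Insn { Kind K; const char *Text; }; // stand-in for MachineInstr

void emitBundle(const std::vector<Insn> &Bundle) {
  // Collect the printable members, counting the skipped ones like the
  // IgnoreCount bookkeeping above.
  std::vector<const Insn *> Real;
  unsigned IgnoreCount = 0;
  for (const Insn &I : Bundle) {
    if (I.K == DbgValue || I.K == ImplicitDef) { ++IgnoreCount; continue; }
    Real.push_back(&I);
  }
  assert(Real.size() + IgnoreCount == Bundle.size() && "Corrupt Bundle!");
  // First member opens the packet, last member closes it.
  for (unsigned Idx = 0, Size = Real.size(); Idx < Size; ++Idx) {
    bool Start = (Idx == 0), End = (Idx == Size - 1);
    std::printf("%s%s%s\n", Start ? "{ " : "  ", Real[Idx]->Text,
                End ? " }" : "");
  }
}

int main() {
  emitBundle({{Normal, "r0 = add(r1, r2)"},
              {DbgValue, "DBG_VALUE"},
              {Normal, "memw(r29+#0) = r0"}});
}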
CCAssignToStack<4, 4> diff --git a/lib/Target/Hexagon/HexagonCallingConvLower.cpp b/lib/Target/Hexagon/HexagonCallingConvLower.cpp index 46c20e9..ba8e679 100644 --- a/lib/Target/Hexagon/HexagonCallingConvLower.cpp +++ b/lib/Target/Hexagon/HexagonCallingConvLower.cpp @@ -56,11 +56,8 @@ void Hexagon_CCState::HandleByVal(unsigned ValNo, EVT ValVT, /// MarkAllocated - Mark a register and all of its aliases as allocated. void Hexagon_CCState::MarkAllocated(unsigned Reg) { - UsedRegs[Reg/32] |= 1 << (Reg&31); - - if (const uint16_t *RegAliases = TRI.getAliasSet(Reg)) - for (; (Reg = *RegAliases); ++RegAliases) - UsedRegs[Reg/32] |= 1 << (Reg&31); + for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI) + UsedRegs[*AI/32] |= 1 << (*AI&31); } /// AnalyzeFormalArguments - Analyze an ISD::FORMAL_ARGUMENTS node, diff --git a/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp b/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp index 2100474..ae2ca37 100644 --- a/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp +++ b/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp @@ -7,9 +7,9 @@ // //===----------------------------------------------------------------------===// // The Hexagon processor has no instructions that load or store predicate -// registers directly. So, when these registers must be spilled a general -// purpose register must be found and the value copied to/from it from/to -// the predicate register. This code currently does not use the register +// registers directly. So, when these registers must be spilled a general +// purpose register must be found and the value copied to/from it from/to +// the predicate register. This code currently does not use the register // scavenger mechanism available in the allocator. There are two registers // reserved to allow spilling/restoring predicate registers. One is used to // hold the predicate value. The other is used when stack frame offsets are @@ -84,7 +84,7 @@ bool HexagonExpandPredSpillCode::runOnMachineFunction(MachineFunction &Fn) { int SrcReg = MI->getOperand(2).getReg(); assert(Hexagon::PredRegsRegClass.contains(SrcReg) && "Not a predicate register"); - if (!TII->isValidOffset(Hexagon::STriw, Offset)) { + if (!TII->isValidOffset(Hexagon::STriw_indexed, Offset)) { if (!TII->isValidOffset(Hexagon::ADD_ri, Offset)) { BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::CONST32_Int_Real), @@ -95,7 +95,7 @@ bool HexagonExpandPredSpillCode::runOnMachineFunction(MachineFunction &Fn) { BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_RsPd), HEXAGON_RESERVED_REG_2).addReg(SrcReg); BuildMI(*MBB, MII, MI->getDebugLoc(), - TII->get(Hexagon::STriw)) + TII->get(Hexagon::STriw_indexed)) .addReg(HEXAGON_RESERVED_REG_1) .addImm(0).addReg(HEXAGON_RESERVED_REG_2); } else { @@ -103,7 +103,8 @@ bool HexagonExpandPredSpillCode::runOnMachineFunction(MachineFunction &Fn) { HEXAGON_RESERVED_REG_1).addReg(FP).addImm(Offset); BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_RsPd), HEXAGON_RESERVED_REG_2).addReg(SrcReg); - BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::STriw)) + BuildMI(*MBB, MII, MI->getDebugLoc(), + TII->get(Hexagon::STriw_indexed)) .addReg(HEXAGON_RESERVED_REG_1) .addImm(0) .addReg(HEXAGON_RESERVED_REG_2); @@ -111,7 +112,8 @@ bool HexagonExpandPredSpillCode::runOnMachineFunction(MachineFunction &Fn) { } else { BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_RsPd), HEXAGON_RESERVED_REG_2).addReg(SrcReg); - BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::STriw)). 
+ BuildMI(*MBB, MII, MI->getDebugLoc(), + TII->get(Hexagon::STriw_indexed)). addReg(FP).addImm(Offset).addReg(HEXAGON_RESERVED_REG_2); } MII = MBB->erase(MI); diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp index e8a6924..cd682df 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -209,6 +209,16 @@ bool HexagonFrameLowering::hasFP(const MachineFunction &MF) const { FuncInfo->hasClobberLR() ); } +static inline +unsigned uniqueSuperReg(unsigned Reg, const TargetRegisterInfo *TRI) { + MCSuperRegIterator SRI(Reg, TRI); + assert(SRI.isValid() && "Expected a superreg"); + unsigned SuperReg = *SRI; + ++SRI; + assert(!SRI.isValid() && "Expected exactly one superreg"); + return SuperReg; +} + bool HexagonFrameLowering::spillCalleeSavedRegisters( MachineBasicBlock &MBB, @@ -235,26 +245,21 @@ HexagonFrameLowering::spillCalleeSavedRegisters( // // Check if we can use a double-word store. // - const uint16_t* SuperReg = TRI->getSuperRegisters(Reg); - - // Assume that there is exactly one superreg. - assert(SuperReg[0] && !SuperReg[1] && "Expected exactly one superreg"); + unsigned SuperReg = uniqueSuperReg(Reg, TRI); bool CanUseDblStore = false; const TargetRegisterClass* SuperRegClass = 0; if (ContiguousRegs && (i < CSI.size()-1)) { - const uint16_t* SuperRegNext = TRI->getSuperRegisters(CSI[i+1].getReg()); - assert(SuperRegNext[0] && !SuperRegNext[1] && - "Expected exactly one superreg"); - SuperRegClass = TRI->getMinimalPhysRegClass(SuperReg[0]); - CanUseDblStore = (SuperRegNext[0] == SuperReg[0]); + unsigned SuperRegNext = uniqueSuperReg(CSI[i+1].getReg(), TRI); + SuperRegClass = TRI->getMinimalPhysRegClass(SuperReg); + CanUseDblStore = (SuperRegNext == SuperReg); } if (CanUseDblStore) { - TII.storeRegToStackSlot(MBB, MI, SuperReg[0], true, + TII.storeRegToStackSlot(MBB, MI, SuperReg, true, CSI[i+1].getFrameIdx(), SuperRegClass, TRI); - MBB.addLiveIn(SuperReg[0]); + MBB.addLiveIn(SuperReg); ++i; } else { // Cannot use a double-word store. @@ -295,25 +300,20 @@ bool HexagonFrameLowering::restoreCalleeSavedRegisters( // // Check if we can use a double-word load. // - const uint16_t* SuperReg = TRI->getSuperRegisters(Reg); + unsigned SuperReg = uniqueSuperReg(Reg, TRI); const TargetRegisterClass* SuperRegClass = 0; - - // Assume that there is exactly one superreg. - assert(SuperReg[0] && !SuperReg[1] && "Expected exactly one superreg"); bool CanUseDblLoad = false; if (ContiguousRegs && (i < CSI.size()-1)) { - const uint16_t* SuperRegNext = TRI->getSuperRegisters(CSI[i+1].getReg()); - assert(SuperRegNext[0] && !SuperRegNext[1] && - "Expected exactly one superreg"); - SuperRegClass = TRI->getMinimalPhysRegClass(SuperReg[0]); - CanUseDblLoad = (SuperRegNext[0] == SuperReg[0]); + unsigned SuperRegNext = uniqueSuperReg(CSI[i+1].getReg(), TRI); + SuperRegClass = TRI->getMinimalPhysRegClass(SuperReg); + CanUseDblLoad = (SuperRegNext == SuperReg); } if (CanUseDblLoad) { - TII.loadRegFromStackSlot(MBB, MI, SuperReg[0], CSI[i+1].getFrameIdx(), + TII.loadRegFromStackSlot(MBB, MI, SuperReg, CSI[i+1].getFrameIdx(), SuperRegClass, TRI); - MBB.addLiveIn(SuperReg[0]); + MBB.addLiveIn(SuperReg); ++i; } else { // Cannot use a double-word load. 
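uniqueSuperReg and the CanUseDblStore/CanUseDblLoad logic above implement one idea: when two adjacent callee-saved registers share their single super-register, one double-word memory operation can spill or restore both. A toy version of the pairing decision, assuming a hypothetical numbering where r2k and r2k+1 map to the super-register dk (roughly Hexagon's R/D pairing):

#include <cstdio>
#include <vector>

// Hypothetical: each 64-bit dK covers the 32-bit pair r2K and r2K+1.
unsigned superRegOf(unsigned R) { return R / 2; }

void spillCalleeSaved(const std::vector<unsigned> &CSI) {
  bool ContiguousRegs = true; // the real pass derives this from the list
  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
    // Pair with the next register only if both halves map to the same
    // super-register; the even/odd check is this sketch's stand-in for the
    // "exactly one super-register" shape that uniqueSuperReg asserts.
    bool CanUseDblStore = ContiguousRegs && i + 1 < e &&
                          superRegOf(CSI[i + 1]) == superRegOf(CSI[i]) &&
                          CSI[i] % 2 == 0;
    if (CanUseDblStore) {
      std::printf("memd(fi) = d%u    ; covers r%u and r%u\n",
                  superRegOf(CSI[i]), CSI[i], CSI[i + 1]);
      ++i; // the pair was handled by one store
    } else {
      std::printf("memw(fi) = r%u\n", CSI[i]);
    }
  }
}

int main() { spillCalleeSaved({16, 17, 18, 20}); } // d8, then r18, then r20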
diff --git a/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/lib/Target/Hexagon/HexagonHardwareLoops.cpp index 57772a5..1357cc5 100644 --- a/lib/Target/Hexagon/HexagonHardwareLoops.cpp +++ b/lib/Target/Hexagon/HexagonHardwareLoops.cpp @@ -491,7 +491,7 @@ bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L) { TII->get(Hexagon::NEG), CountReg).addReg(CountReg1); } - // Add the Loop instruction to the begining of the loop. + // Add the Loop instruction to the beginning of the loop. BuildMI(*Preheader, InsertPos, InsertPos->getDebugLoc(), TII->get(Hexagon::LOOP0_r)).addMBB(LoopStart).addReg(CountReg); } else { @@ -623,7 +623,7 @@ void HexagonFixupHwLoops::convertLoopInstr(MachineFunction &MF, const TargetInstrInfo *TII = MF.getTarget().getInstrInfo(); MachineBasicBlock *MBB = MII->getParent(); DebugLoc DL = MII->getDebugLoc(); - unsigned Scratch = RS.scavengeRegister(Hexagon::IntRegsRegisterClass, MII, 0); + unsigned Scratch = RS.scavengeRegister(&Hexagon::IntRegsRegClass, MII, 0); // First, set the LC0 with the trip count. if (MII->getOperand(1).isReg()) { diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index 9df965e..5499134 100644 --- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -90,7 +90,9 @@ public: SDNode *SelectMul(SDNode *N); SDNode *SelectZeroExtend(SDNode *N); SDNode *SelectIntrinsicWOChain(SDNode *N); + SDNode *SelectIntrinsicWChain(SDNode *N); SDNode *SelectConstant(SDNode *N); + SDNode *SelectConstantFP(SDNode *N); SDNode *SelectAdd(SDNode *N); // Include the pieces autogenerated from the target description. @@ -318,7 +320,7 @@ SDNode *HexagonDAGToDAGISel::SelectBaseOffsetLoad(LoadSDNode *LD, DebugLoc dl) { else if (LoadedVT == MVT::i32) Opcode = Hexagon::LDriw_indexed; else if (LoadedVT == MVT::i16) Opcode = Hexagon::LDrih_indexed; else if (LoadedVT == MVT::i8) Opcode = Hexagon::LDrib_indexed; - else assert (0 && "unknown memory type"); + else llvm_unreachable("unknown memory type"); // Build indexed load. SDValue TargetConstOff = CurDAG->getTargetConstant(Offset, PointerTy); @@ -375,7 +377,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoadSignExtend64(LoadSDNode *LD, }; ReplaceUses(Froms, Tos, 3); return Result_2; - } + } SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32); SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32); SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32, @@ -516,7 +518,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, DebugLoc dl) { else Opcode = zextval ? Hexagon::LDriub : Hexagon::LDrib; } else - assert (0 && "unknown memory type"); + llvm_unreachable("unknown memory type"); // For zero ext i64 loads, we need to add combine instructions. if (LD->getValueType(0) == MVT::i64 && @@ -613,7 +615,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, DebugLoc dl) { else if (StoredVT == MVT::i32) Opcode = Hexagon::POST_STwri; else if (StoredVT == MVT::i16) Opcode = Hexagon::POST_SThri; else if (StoredVT == MVT::i8) Opcode = Hexagon::POST_STbri; - else assert (0 && "unknown memory type"); + else llvm_unreachable("unknown memory type"); // Build post increment store. SDNode* Result = CurDAG->getMachineNode(Opcode, dl, MVT::i32, @@ -636,10 +638,10 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, DebugLoc dl) { // Figure out the opcode. 
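The assert(0 && ...) to llvm_unreachable(...) conversions in these selection routines (including the opcode switch that continues below) are not cosmetic: a plain assert compiles away under -DNDEBUG, after which control falls off the end of the function, which is undefined behaviour. llvm_unreachable still traps in release builds and doubles as an optimizer hint. A self-contained sketch, with a local noreturn helper standing in for the LLVM macro:

#include <cstdio>
#include <cstdlib>

// Stand-in for llvm_unreachable: always aborts, in every build mode.
[[noreturn]] static void unreachable(const char *Msg) {
  std::fprintf(stderr, "UNREACHABLE: %s\n", Msg);
  std::abort();
}

static const char *storeOpcodeFor(unsigned MemBytes) {
  switch (MemBytes) {
  case 8: return "STrid";
  case 4: return "STriw";
  case 2: return "STrih";
  case 1: return "STrib";
  }
  // Was: assert(0 && "unknown memory type"); with NDEBUG that line vanishes
  // and the function would return garbage. This version cannot.
  unreachable("unknown memory type");
}

int main() { std::printf("%s\n", storeOpcodeFor(4)); }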
if (StoredVT == MVT::i64) Opcode = Hexagon::STrid; - else if (StoredVT == MVT::i32) Opcode = Hexagon::STriw; + else if (StoredVT == MVT::i32) Opcode = Hexagon::STriw_indexed; else if (StoredVT == MVT::i16) Opcode = Hexagon::STrih; else if (StoredVT == MVT::i8) Opcode = Hexagon::STrib; - else assert (0 && "unknown memory type"); + else llvm_unreachable("unknown memory type"); // Build regular store. SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32); @@ -693,7 +695,7 @@ SDNode *HexagonDAGToDAGISel::SelectBaseOffsetStore(StoreSDNode *ST, else if (StoredVT == MVT::i32) Opcode = Hexagon::STriw_indexed; else if (StoredVT == MVT::i16) Opcode = Hexagon::STrih_indexed; else if (StoredVT == MVT::i8) Opcode = Hexagon::STrib_indexed; - else assert (0 && "unknown memory type"); + else llvm_unreachable("unknown memory type"); SDValue Ops[] = {SDValue(NewBase,0), CurDAG->getTargetConstant(Offset,PointerTy), @@ -723,7 +725,7 @@ SDNode *HexagonDAGToDAGISel::SelectStore(SDNode *N) { if (AM != ISD::UNINDEXED) { return SelectIndexedStore(ST, dl); } - + return SelectBaseOffsetStore(ST, dl); } @@ -752,7 +754,7 @@ SDNode *HexagonDAGToDAGISel::SelectMul(SDNode *N) { if (MulOp0.getOpcode() == ISD::SIGN_EXTEND) { SDValue Sext0 = MulOp0.getOperand(0); if (Sext0.getNode()->getValueType(0) != MVT::i32) { - SelectCode(N); + return SelectCode(N); } OP0 = Sext0; @@ -761,7 +763,7 @@ SDNode *HexagonDAGToDAGISel::SelectMul(SDNode *N) { if (LD->getMemoryVT() != MVT::i32 || LD->getExtensionType() != ISD::SEXTLOAD || LD->getAddressingMode() != ISD::UNINDEXED) { - SelectCode(N); + return SelectCode(N); } SDValue Chain = LD->getChain(); @@ -1128,12 +1130,12 @@ SDNode *HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) { // For immediates, lower it. for (unsigned i = 1; i < N->getNumOperands(); ++i) { SDNode *Arg = N->getOperand(i).getNode(); - const TargetRegisterClass *RC = TII->getRegClass(MCID, i, TRI); + const TargetRegisterClass *RC = TII->getRegClass(MCID, i, TRI, *MF); - if (RC == Hexagon::IntRegsRegisterClass || - RC == Hexagon::DoubleRegsRegisterClass) { + if (RC == &Hexagon::IntRegsRegClass || + RC == &Hexagon::DoubleRegsRegClass) { Ops.push_back(SDValue(Arg, 0)); - } else if (RC == Hexagon::PredRegsRegisterClass) { + } else if (RC == &Hexagon::PredRegsRegClass) { // Do the transfer. SDNode *PdRs = CurDAG->getMachineNode(Hexagon::TFR_PdRs, dl, MVT::i1, SDValue(Arg, 0)); @@ -1158,6 +1160,25 @@ SDNode *HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) { return SelectCode(N); } +// +// Map floating point constant values. +// +SDNode *HexagonDAGToDAGISel::SelectConstantFP(SDNode *N) { + DebugLoc dl = N->getDebugLoc(); + ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N); + APFloat APF = CN->getValueAPF(); + if (N->getValueType(0) == MVT::f32) { + return CurDAG->getMachineNode(Hexagon::TFRI_f, dl, MVT::f32, + CurDAG->getTargetConstantFP(APF.convertToFloat(), MVT::f32)); + } + else if (N->getValueType(0) == MVT::f64) { + return CurDAG->getMachineNode(Hexagon::CONST64_Float_Real, dl, MVT::f64, + CurDAG->getTargetConstantFP(APF.convertToDouble(), MVT::f64)); + } + + return SelectCode(N); +} + // // Map predicate true (encoded as -1 in LLVM) to a XOR. @@ -1215,7 +1236,7 @@ SDNode *HexagonDAGToDAGISel::SelectAdd(SDNode *N) { // Build Rd = Rd' + asr(Rs, Rt). 
The machine constraints will ensure that // Rd and Rd' are assigned to the same register - SDNode* Result = CurDAG->getMachineNode(Hexagon::ASR_rr_acc, dl, MVT::i32, + SDNode* Result = CurDAG->getMachineNode(Hexagon::ASR_ADD_rr, dl, MVT::i32, N->getOperand(1), Src1->getOperand(0), Src1->getOperand(1)); @@ -1234,6 +1255,9 @@ SDNode *HexagonDAGToDAGISel::Select(SDNode *N) { case ISD::Constant: return SelectConstant(N); + case ISD::ConstantFP: + return SelectConstantFP(N); + case ISD::ADD: return SelectAdd(N); diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index d6da0d0..703a128 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -32,9 +32,11 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/CodeGen/ValueTypes.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" + using namespace llvm; const unsigned Hexagon_MAX_RET_SIZE = 64; @@ -101,12 +103,12 @@ CC_Hexagon_VarArg (unsigned ValNo, MVT ValVT, State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo)); return false; } - if (LocVT == MVT::i32) { + if (LocVT == MVT::i32 || LocVT == MVT::f32) { ofst = State.AllocateStack(4, 4); State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo)); return false; } - if (LocVT == MVT::i64) { + if (LocVT == MVT::i64 || LocVT == MVT::f64) { ofst = State.AllocateStack(8, 8); State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo)); return false; @@ -140,12 +142,12 @@ CC_Hexagon (unsigned ValNo, MVT ValVT, LocInfo = CCValAssign::AExt; } - if (LocVT == MVT::i32) { + if (LocVT == MVT::i32 || LocVT == MVT::f32) { if (!CC_Hexagon32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State)) return false; } - if (LocVT == MVT::i64) { + if (LocVT == MVT::i64 || LocVT == MVT::f64) { if (!CC_Hexagon64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State)) return false; } @@ -215,12 +217,12 @@ static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT, LocInfo = CCValAssign::AExt; } - if (LocVT == MVT::i32) { + if (LocVT == MVT::i32 || LocVT == MVT::f32) { if (!RetCC_Hexagon32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State)) return false; } - if (LocVT == MVT::i64) { + if (LocVT == MVT::i64 || LocVT == MVT::f64) { if (!RetCC_Hexagon64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State)) return false; } @@ -232,7 +234,7 @@ static bool RetCC_Hexagon32(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { - if (LocVT == MVT::i32) { + if (LocVT == MVT::i32 || LocVT == MVT::f32) { if (unsigned Reg = State.AllocateReg(Hexagon::R0)) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return false; @@ -247,7 +249,7 @@ static bool RetCC_Hexagon32(unsigned ValNo, MVT ValVT, static bool RetCC_Hexagon64(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { - if (LocVT == MVT::i64) { + if (LocVT == MVT::i64 || LocVT == MVT::f64) { if (unsigned Reg = State.AllocateReg(Hexagon::D0)) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return false; @@ -297,7 +299,7 @@ HexagonTargetLowering::LowerReturn(SDValue Chain, // CCState - Info about the registers and stack slot. 
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); // Analyze return values of ISD::RET CCInfo.AnalyzeReturn(Outs, RetCC_Hexagon); @@ -349,7 +351,7 @@ HexagonTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_Hexagon); @@ -368,21 +370,25 @@ HexagonTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, /// LowerCall - Functions arguments are copied from virtual regs to /// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted. SDValue -HexagonTargetLowering::LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, - bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, +HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + DebugLoc &dl = CLI.DL; + SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; + SmallVector<SDValue, 32> &OutVals = CLI.OutVals; + SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &isTailCall = CLI.IsTailCall; + CallingConv::ID CallConv = CLI.CallConv; + bool isVarArg = CLI.IsVarArg; bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); // Check for varargs. NumNamedVarArgParams = -1; @@ -502,7 +508,7 @@ HexagonTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Build a sequence of copy-to-reg nodes chained together with token // chain and flag operands which copy the outgoing args into registers. - // The InFlag in necessary since all emited instructions must be + // The InFlag is necessary since all emitted instructions must be // stuck together. SDValue InFlag; if (!isTailCall) { @@ -522,7 +528,7 @@ HexagonTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // than necessary, because it means that each store effectively depends // on every argument instead of just those arguments it would clobber. // - // Do not flag preceeding copytoreg stuff together with the following stuff. + // Do not flag preceding copytoreg stuff together with the following stuff. InFlag = SDValue(); for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, @@ -811,7 +817,7 @@ const { // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeFormalArguments(Ins, CC_Hexagon); @@ -837,14 +843,15 @@ const { // 1. int, long long, ptr args that get allocated in register. // 2. Large struct that gets a register to put its address in.
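The LowerCall rewrite above is a parameter-object refactor: the long positional signature collapses into TargetLowering::CallLoweringInfo, and the body re-derives its old locals as references into the struct, so a later addition of a field (the way doesNotRet was bolted on earlier) no longer has to touch every target's override. The shape of the change, sketched with placeholder types in place of the SelectionDAG ones:

#include <string>
#include <vector>

// Placeholder for TargetLowering::CallLoweringInfo; the field names follow
// the hunk above, the types do not.
struct CallLoweringInfo {
  std::string Callee;
  std::vector<int> Outs;   // outgoing argument stand-ins
  std::vector<int> Ins;    // incoming result stand-ins
  bool IsVarArg = false;
  bool IsTailCall = true;  // callees may clear this, hence the reference below
};

// Before: lowerCall(Chain, Callee, CallConv, isVarArg, doesNotRet,
//                   isTailCall, Outs, OutVals, Ins, dl, DAG, InVals)
// After: one state object plus the output vector.
int lowerCall(CallLoweringInfo &CLI, std::vector<int> &InVals) {
  bool &IsTailCall = CLI.IsTailCall;  // unpack by reference, as above
  if (!CLI.Outs.empty())
    IsTailCall = false;               // e.g. give up on tail-calling
  InVals.assign(CLI.Ins.begin(), CLI.Ins.end());
  return 0;
}

int main() {
  CallLoweringInfo CLI;
  CLI.Callee = "memcpy";
  CLI.Ins = {1, 2};
  std::vector<int> InVals;
  return lowerCall(CLI, InVals);
}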
EVT RegVT = VA.getLocVT(); - if (RegVT == MVT::i8 || RegVT == MVT::i16 || RegVT == MVT::i32) { + if (RegVT == MVT::i8 || RegVT == MVT::i16 || + RegVT == MVT::i32 || RegVT == MVT::f32) { unsigned VReg = - RegInfo.createVirtualRegister(Hexagon::IntRegsRegisterClass); + RegInfo.createVirtualRegister(&Hexagon::IntRegsRegClass); RegInfo.addLiveIn(VA.getLocReg(), VReg); InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT)); } else if (RegVT == MVT::i64) { unsigned VReg = - RegInfo.createVirtualRegister(Hexagon::DoubleRegsRegisterClass); + RegInfo.createVirtualRegister(&Hexagon::DoubleRegsRegClass); RegInfo.addLiveIn(VA.getLocReg(), VReg); InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT)); } else { @@ -916,14 +923,33 @@ HexagonTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { SDValue HexagonTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue CC = Op.getOperand(4); + SDValue TrueVal = Op.getOperand(2); + SDValue FalseVal = Op.getOperand(3); + DebugLoc dl = Op.getDebugLoc(); SDNode* OpNode = Op.getNode(); + EVT SVT = OpNode->getValueType(0); - SDValue Cond = DAG.getNode(ISD::SETCC, Op.getDebugLoc(), MVT::i1, - Op.getOperand(2), Op.getOperand(3), - Op.getOperand(4)); - return DAG.getNode(ISD::SELECT, Op.getDebugLoc(), OpNode->getValueType(0), - Cond, Op.getOperand(0), - Op.getOperand(1)); + SDValue Cond = DAG.getNode(ISD::SETCC, dl, MVT::i1, LHS, RHS, CC); + return DAG.getNode(ISD::SELECT, dl, SVT, Cond, TrueVal, FalseVal); +} + +SDValue +HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { + EVT ValTy = Op.getValueType(); + + DebugLoc dl = Op.getDebugLoc(); + ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); + SDValue Res; + if (CP->isMachineConstantPoolEntry()) + Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), ValTy, + CP->getAlignment()); + else + Res = DAG.getTargetConstantPool(CP->getConstVal(), ValTy, + CP->getAlignment()); + return DAG.getNode(HexagonISD::CONST32, dl, ValTy, Res); } SDValue @@ -1008,11 +1034,18 @@ HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine : TargetLowering(targetmachine, new HexagonTargetObjectFile()), TM(targetmachine) { + const HexagonRegisterInfo* QRI = TM.getRegisterInfo(); + // Set up the register classes. 
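A pattern running through these hunks: f32 values live in the same 32-bit IntRegs registers as i32, and f64 shares DoubleRegs with i64, both in the formal-argument code above and in the addRegisterClass calls that follow. A register class fixes a width, not an interpretation; in portable C++ the equivalent reinterpretation is a memcpy bitcast:

#include <cstdint>
#include <cstdio>
#include <cstring>

// The same 32 bits, viewed as a float or as the integer a GPR would hold.
static uint32_t asBits(float F) {
  uint32_t R;
  std::memcpy(&R, &F, sizeof R);
  return R;
}
static float asFloat(uint32_t Bits) {
  float F;
  std::memcpy(&F, &Bits, sizeof F);
  return F;
}

int main() {
  uint32_t InR0 = asBits(1.5f);          // what the register ends up holding
  std::printf("r0   = 0x%08x\n", InR0);  // 0x3fc00000
  std::printf("back = %g\n", asFloat(InR0));
}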
- addRegisterClass(MVT::i32, Hexagon::IntRegsRegisterClass); - addRegisterClass(MVT::i64, Hexagon::DoubleRegsRegisterClass); + addRegisterClass(MVT::i32, &Hexagon::IntRegsRegClass); + addRegisterClass(MVT::i64, &Hexagon::DoubleRegsRegClass); + + if (QRI->Subtarget.hasV5TOps()) { + addRegisterClass(MVT::f32, &Hexagon::IntRegsRegClass); + addRegisterClass(MVT::f64, &Hexagon::DoubleRegsRegClass); + } - addRegisterClass(MVT::i1, Hexagon::PredRegsRegisterClass); + addRegisterClass(MVT::i1, &Hexagon::PredRegsRegClass); computeRegisterProperties(); @@ -1026,32 +1059,16 @@ HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine // // Library calls for unsupported operations // - setLibcallName(RTLIB::OGT_F64, "__hexagon_gtdf2"); - setLibcallName(RTLIB::SINTTOFP_I64_F64, "__hexagon_floatdidf"); setLibcallName(RTLIB::SINTTOFP_I128_F64, "__hexagon_floattidf"); setLibcallName(RTLIB::SINTTOFP_I128_F32, "__hexagon_floattisf"); - setLibcallName(RTLIB::UINTTOFP_I32_F32, "__hexagon_floatunsisf"); - setLibcallName(RTLIB::UINTTOFP_I64_F32, "__hexagon_floatundisf"); - setLibcallName(RTLIB::SINTTOFP_I64_F32, "__hexagon_floatdisf"); - setLibcallName(RTLIB::UINTTOFP_I64_F64, "__hexagon_floatundidf"); - setLibcallName(RTLIB::FPTOUINT_F32_I32, "__hexagon_fixunssfsi"); - setLibcallName(RTLIB::FPTOUINT_F32_I64, "__hexagon_fixunssfdi"); setLibcallName(RTLIB::FPTOUINT_F32_I128, "__hexagon_fixunssfti"); - - setLibcallName(RTLIB::FPTOUINT_F64_I32, "__hexagon_fixunsdfsi"); - setLibcallName(RTLIB::FPTOUINT_F64_I64, "__hexagon_fixunsdfdi"); setLibcallName(RTLIB::FPTOUINT_F64_I128, "__hexagon_fixunsdfti"); - setLibcallName(RTLIB::UINTTOFP_I32_F64, "__hexagon_floatunsidf"); - setLibcallName(RTLIB::FPTOSINT_F32_I64, "__hexagon_fixsfdi"); setLibcallName(RTLIB::FPTOSINT_F32_I128, "__hexagon_fixsfti"); - setLibcallName(RTLIB::FPTOSINT_F64_I64, "__hexagon_fixdfdi"); setLibcallName(RTLIB::FPTOSINT_F64_I128, "__hexagon_fixdfti"); - setLibcallName(RTLIB::OGT_F64, "__hexagon_gtdf2"); - setLibcallName(RTLIB::SDIV_I32, "__hexagon_divsi3"); setOperationAction(ISD::SDIV, MVT::i32, Expand); setLibcallName(RTLIB::SREM_I32, "__hexagon_umodsi3"); @@ -1080,92 +1097,184 @@ HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine setLibcallName(RTLIB::DIV_F64, "__hexagon_divdf3"); setOperationAction(ISD::FDIV, MVT::f64, Expand); - setLibcallName(RTLIB::FPEXT_F32_F64, "__hexagon_extendsfdf2"); - setOperationAction(ISD::FP_EXTEND, MVT::f32, Expand); + setOperationAction(ISD::FSQRT, MVT::f32, Expand); + setOperationAction(ISD::FSQRT, MVT::f64, Expand); + setOperationAction(ISD::FSIN, MVT::f32, Expand); + setOperationAction(ISD::FSIN, MVT::f64, Expand); + + if (QRI->Subtarget.hasV5TOps()) { + // Hexagon V5 Support. 
+ setOperationAction(ISD::FADD, MVT::f32, Legal); + setOperationAction(ISD::FADD, MVT::f64, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal); + setCondCodeAction(ISD::SETOEQ, MVT::f32, Legal); + setCondCodeAction(ISD::SETOEQ, MVT::f64, Legal); + setCondCodeAction(ISD::SETUEQ, MVT::f32, Legal); + setCondCodeAction(ISD::SETUEQ, MVT::f64, Legal); + + setCondCodeAction(ISD::SETOGE, MVT::f32, Legal); + setCondCodeAction(ISD::SETOGE, MVT::f64, Legal); + setCondCodeAction(ISD::SETUGE, MVT::f32, Legal); + setCondCodeAction(ISD::SETUGE, MVT::f64, Legal); + + setCondCodeAction(ISD::SETOGT, MVT::f32, Legal); + setCondCodeAction(ISD::SETOGT, MVT::f64, Legal); + setCondCodeAction(ISD::SETUGT, MVT::f32, Legal); + setCondCodeAction(ISD::SETUGT, MVT::f64, Legal); + + setCondCodeAction(ISD::SETOLE, MVT::f32, Legal); + setCondCodeAction(ISD::SETOLE, MVT::f64, Legal); + setCondCodeAction(ISD::SETOLT, MVT::f32, Legal); + setCondCodeAction(ISD::SETOLT, MVT::f64, Legal); + + setOperationAction(ISD::ConstantFP, MVT::f32, Legal); + setOperationAction(ISD::ConstantFP, MVT::f64, Legal); + + setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote); + setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote); + + setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote); + setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote); + + setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote); + setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote); + + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal); + + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal); + + setOperationAction(ISD::FABS, MVT::f32, Legal); + setOperationAction(ISD::FABS, MVT::f64, Expand); + + setOperationAction(ISD::FNEG, MVT::f32, Legal); + setOperationAction(ISD::FNEG, MVT::f64, Expand); + } else { - setLibcallName(RTLIB::SINTTOFP_I32_F32, "__hexagon_floatsisf"); - setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand); + // Expand fp<->uint. 
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); - setLibcallName(RTLIB::ADD_F64, "__hexagon_adddf3"); - setOperationAction(ISD::FADD, MVT::f64, Expand); + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand); - setLibcallName(RTLIB::ADD_F32, "__hexagon_addsf3"); - setOperationAction(ISD::FADD, MVT::f32, Expand); + setLibcallName(RTLIB::SINTTOFP_I64_F32, "__hexagon_floatdisf"); + setLibcallName(RTLIB::UINTTOFP_I64_F32, "__hexagon_floatundisf"); - setLibcallName(RTLIB::ADD_F32, "__hexagon_addsf3"); - setOperationAction(ISD::FADD, MVT::f32, Expand); + setLibcallName(RTLIB::UINTTOFP_I32_F32, "__hexagon_floatunsisf"); + setLibcallName(RTLIB::SINTTOFP_I32_F32, "__hexagon_floatsisf"); - setLibcallName(RTLIB::OEQ_F32, "__hexagon_eqsf2"); - setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); + setLibcallName(RTLIB::SINTTOFP_I64_F64, "__hexagon_floatdidf"); + setLibcallName(RTLIB::UINTTOFP_I64_F64, "__hexagon_floatundidf"); - setLibcallName(RTLIB::FPTOSINT_F64_I32, "__hexagon_fixdfsi"); - setOperationAction(ISD::FP_TO_SINT, MVT::f64, Expand); + setLibcallName(RTLIB::UINTTOFP_I32_F64, "__hexagon_floatunsidf"); + setLibcallName(RTLIB::SINTTOFP_I32_F64, "__hexagon_floatsidf"); - setLibcallName(RTLIB::FPTOSINT_F32_I32, "__hexagon_fixsfsi"); - setOperationAction(ISD::FP_TO_SINT, MVT::f32, Expand); + setLibcallName(RTLIB::FPTOUINT_F32_I32, "__hexagon_fixunssfsi"); + setLibcallName(RTLIB::FPTOUINT_F32_I64, "__hexagon_fixunssfdi"); - setLibcallName(RTLIB::SINTTOFP_I32_F64, "__hexagon_floatsidf"); - setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand); + setLibcallName(RTLIB::FPTOSINT_F64_I64, "__hexagon_fixdfdi"); + setLibcallName(RTLIB::FPTOSINT_F32_I64, "__hexagon_fixsfdi"); - setLibcallName(RTLIB::OGE_F64, "__hexagon_gedf2"); - setCondCodeAction(ISD::SETOGE, MVT::f64, Expand); + setLibcallName(RTLIB::FPTOUINT_F64_I32, "__hexagon_fixunsdfsi"); + setLibcallName(RTLIB::FPTOUINT_F64_I64, "__hexagon_fixunsdfdi"); - setLibcallName(RTLIB::OGE_F32, "__hexagon_gesf2"); - setCondCodeAction(ISD::SETOGE, MVT::f32, Expand); + setLibcallName(RTLIB::ADD_F64, "__hexagon_adddf3"); + setOperationAction(ISD::FADD, MVT::f64, Expand); - setLibcallName(RTLIB::OGT_F32, "__hexagon_gtsf2"); - setCondCodeAction(ISD::SETOGT, MVT::f32, Expand); + setLibcallName(RTLIB::ADD_F32, "__hexagon_addsf3"); + setOperationAction(ISD::FADD, MVT::f32, Expand); - setLibcallName(RTLIB::OLE_F64, "__hexagon_ledf2"); - setCondCodeAction(ISD::SETOLE, MVT::f64, Expand); + setLibcallName(RTLIB::FPEXT_F32_F64, "__hexagon_extendsfdf2"); + setOperationAction(ISD::FP_EXTEND, MVT::f32, Expand); - setLibcallName(RTLIB::OLE_F32, "__hexagon_lesf2"); - setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); + setLibcallName(RTLIB::OEQ_F32, "__hexagon_eqsf2"); + setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); - setLibcallName(RTLIB::OLT_F64, "__hexagon_ltdf2"); - setCondCodeAction(ISD::SETOLT, MVT::f64, Expand); + setLibcallName(RTLIB::OEQ_F64, "__hexagon_eqdf2"); + setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); - setLibcallName(RTLIB::OLT_F32, "__hexagon_ltsf2"); - setCondCodeAction(ISD::SETOLT, MVT::f32, Expand); + setLibcallName(RTLIB::OGE_F32, "__hexagon_gesf2"); + setCondCodeAction(ISD::SETOGE, MVT::f32, Expand); - setLibcallName(RTLIB::SREM_I32, "__hexagon_modsi3"); - setOperationAction(ISD::SREM, MVT::i32, Expand); + setLibcallName(RTLIB::OGE_F64, "__hexagon_gedf2"); + setCondCodeAction(ISD::SETOGE, MVT::f64, Expand); + + 
setLibcallName(RTLIB::OGT_F32, "__hexagon_gtsf2"); + setCondCodeAction(ISD::SETOGT, MVT::f32, Expand); + + setLibcallName(RTLIB::OGT_F64, "__hexagon_gtdf2"); + setCondCodeAction(ISD::SETOGT, MVT::f64, Expand); - setLibcallName(RTLIB::MUL_F64, "__hexagon_muldf3"); - setOperationAction(ISD::FMUL, MVT::f64, Expand); + setLibcallName(RTLIB::FPTOSINT_F64_I32, "__hexagon_fixdfsi"); + setOperationAction(ISD::FP_TO_SINT, MVT::f64, Expand); - setLibcallName(RTLIB::MUL_F32, "__hexagon_mulsf3"); - setOperationAction(ISD::MUL, MVT::f32, Expand); + setLibcallName(RTLIB::FPTOSINT_F32_I32, "__hexagon_fixsfsi"); + setOperationAction(ISD::FP_TO_SINT, MVT::f32, Expand); - setLibcallName(RTLIB::UNE_F64, "__hexagon_nedf2"); - setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); + setLibcallName(RTLIB::OLE_F64, "__hexagon_ledf2"); + setCondCodeAction(ISD::SETOLE, MVT::f64, Expand); - setLibcallName(RTLIB::UNE_F32, "__hexagon_nesf2"); + setLibcallName(RTLIB::OLE_F32, "__hexagon_lesf2"); + setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); + setLibcallName(RTLIB::OLT_F64, "__hexagon_ltdf2"); + setCondCodeAction(ISD::SETOLT, MVT::f64, Expand); - setLibcallName(RTLIB::SUB_F64, "__hexagon_subdf3"); - setOperationAction(ISD::SUB, MVT::f64, Expand); + setLibcallName(RTLIB::OLT_F32, "__hexagon_ltsf2"); + setCondCodeAction(ISD::SETOLT, MVT::f32, Expand); - setLibcallName(RTLIB::SUB_F32, "__hexagon_subsf3"); - setOperationAction(ISD::SUB, MVT::f32, Expand); + setLibcallName(RTLIB::MUL_F64, "__hexagon_muldf3"); + setOperationAction(ISD::FMUL, MVT::f64, Expand); - setLibcallName(RTLIB::FPROUND_F64_F32, "__hexagon_truncdfsf2"); - setOperationAction(ISD::FP_ROUND, MVT::f64, Expand); + setLibcallName(RTLIB::MUL_F32, "__hexagon_mulsf3"); + setOperationAction(ISD::MUL, MVT::f32, Expand); - setLibcallName(RTLIB::UO_F64, "__hexagon_unorddf2"); - setCondCodeAction(ISD::SETUO, MVT::f64, Expand); + setLibcallName(RTLIB::UNE_F64, "__hexagon_nedf2"); + setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); - setLibcallName(RTLIB::O_F64, "__hexagon_unorddf2"); - setCondCodeAction(ISD::SETO, MVT::f64, Expand); + setLibcallName(RTLIB::UNE_F32, "__hexagon_nesf2"); - setLibcallName(RTLIB::OEQ_F64, "__hexagon_eqdf2"); - setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); + setLibcallName(RTLIB::SUB_F64, "__hexagon_subdf3"); + setOperationAction(ISD::SUB, MVT::f64, Expand); - setLibcallName(RTLIB::O_F32, "__hexagon_unordsf2"); - setCondCodeAction(ISD::SETO, MVT::f32, Expand); + setLibcallName(RTLIB::SUB_F32, "__hexagon_subsf3"); + setOperationAction(ISD::SUB, MVT::f32, Expand); - setLibcallName(RTLIB::UO_F32, "__hexagon_unordsf2"); - setCondCodeAction(ISD::SETUO, MVT::f32, Expand); + setLibcallName(RTLIB::FPROUND_F64_F32, "__hexagon_truncdfsf2"); + setOperationAction(ISD::FP_ROUND, MVT::f64, Expand); + + setLibcallName(RTLIB::UO_F64, "__hexagon_unorddf2"); + setCondCodeAction(ISD::SETUO, MVT::f64, Expand); + + setLibcallName(RTLIB::O_F64, "__hexagon_unorddf2"); + setCondCodeAction(ISD::SETO, MVT::f64, Expand); + + setLibcallName(RTLIB::O_F32, "__hexagon_unordsf2"); + setCondCodeAction(ISD::SETO, MVT::f32, Expand); + + setLibcallName(RTLIB::UO_F32, "__hexagon_unordsf2"); + setCondCodeAction(ISD::SETUO, MVT::f32, Expand); + + setOperationAction(ISD::FABS, MVT::f32, Expand); + setOperationAction(ISD::FABS, MVT::f64, Expand); + setOperationAction(ISD::FNEG, MVT::f32, Expand); + setOperationAction(ISD::FNEG, MVT::f64, Expand); + } + + setLibcallName(RTLIB::SREM_I32, "__hexagon_modsi3"); + setOperationAction(ISD::SREM, MVT::i32, Expand); 
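Taken together, the constructor's if/else gives the legalizer one of three answers per operation/type pair: Legal where V5 has a real instruction, Promote for the i1/i8/i16 conversions, and Expand, which on pre-V5 parts becomes a call to one of the __hexagon_* soft-float routines named above. What Promote amounts to for a narrow conversion, in plain C++ (the function name is invented):

#include <cstdint>
#include <cstdio>

// FP_TO_UINT on i8 is "Promoted": the conversion is performed at the next
// legal width (i32, which V5 can select directly) and then truncated.
static uint8_t fpToU8Promoted(float F) {
  uint32_t Wide = (uint32_t)F;  // the legal i32 conversion
  return (uint8_t)Wide;         // truncate to the width that was asked for
}

int main() {
  std::printf("%d\n", fpToU8Promoted(200.0f)); // 200
  std::printf("%d\n", fpToU8Promoted(42.9f));  // 42
}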
setIndexedLoadAction(ISD::POST_INC, MVT::i8, Legal); setIndexedLoadAction(ISD::POST_INC, MVT::i16, Legal); @@ -1206,20 +1315,33 @@ HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine setOperationAction(ISD::BSWAP, MVT::i64, Expand); - // Expand fp<->uint. - setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); - setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand); - - // Hexagon has no select or setcc: expand to SELECT_CC. - setOperationAction(ISD::SELECT, MVT::f32, Expand); - setOperationAction(ISD::SELECT, MVT::f64, Expand); - // Lower SELECT_CC to SETCC and SELECT. setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); - // This is a workaround documented in DAGCombiner.cpp:2892 We don't - // support SELECT_CC on every type. - setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); + + if (QRI->Subtarget.hasV5TOps()) { + + // Mark SELECT as Custom for f32/f64 so that we do not fall into + // the infinite legalization cycle of + // select -> setcc -> select_cc -> select. + setOperationAction(ISD::SELECT, MVT::f32, Custom); + setOperationAction(ISD::SELECT, MVT::f64, Custom); + + setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); + + } else { + + // Hexagon has no select or setcc: expand to SELECT_CC. + setOperationAction(ISD::SELECT, MVT::f32, Expand); + setOperationAction(ISD::SELECT, MVT::f64, Expand); + + // This is a workaround documented in DAGCombiner.cpp:2892 We don't + // support SELECT_CC on every type. + setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); + + } setOperationAction(ISD::BR_CC, MVT::Other, Expand); setOperationAction(ISD::BRIND, MVT::Other, Expand); @@ -1305,22 +1427,22 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { switch (Opcode) { default: return 0; - case HexagonISD::CONST32: return "HexagonISD::CONST32"; + case HexagonISD::CONST32: return "HexagonISD::CONST32"; case HexagonISD::ADJDYNALLOC: return "HexagonISD::ADJDYNALLOC"; - case HexagonISD::CMPICC: return "HexagonISD::CMPICC"; - case HexagonISD::CMPFCC: return "HexagonISD::CMPFCC"; - case HexagonISD::BRICC: return "HexagonISD::BRICC"; - case HexagonISD::BRFCC: return "HexagonISD::BRFCC"; - case HexagonISD::SELECT_ICC: return "HexagonISD::SELECT_ICC"; - case HexagonISD::SELECT_FCC: return "HexagonISD::SELECT_FCC"; - case HexagonISD::Hi: return "HexagonISD::Hi"; - case HexagonISD::Lo: return "HexagonISD::Lo"; - case HexagonISD::FTOI: return "HexagonISD::FTOI"; - case HexagonISD::ITOF: return "HexagonISD::ITOF"; - case HexagonISD::CALL: return "HexagonISD::CALL"; - case HexagonISD::RET_FLAG: return "HexagonISD::RET_FLAG"; - case HexagonISD::BR_JT: return "HexagonISD::BR_JT"; - case HexagonISD::TC_RETURN: return "HexagonISD::TC_RETURN"; + case HexagonISD::CMPICC: return "HexagonISD::CMPICC"; + case HexagonISD::CMPFCC: return "HexagonISD::CMPFCC"; + case HexagonISD::BRICC: return "HexagonISD::BRICC"; + case HexagonISD::BRFCC: return "HexagonISD::BRFCC"; + case HexagonISD::SELECT_ICC: return "HexagonISD::SELECT_ICC"; + case HexagonISD::SELECT_FCC: return "HexagonISD::SELECT_FCC"; + case HexagonISD::Hi: return "HexagonISD::Hi"; + case HexagonISD::Lo: return "HexagonISD::Lo"; + case HexagonISD::FTOI: return "HexagonISD::FTOI"; + case HexagonISD::ITOF: return "HexagonISD::ITOF"; + case HexagonISD::CALL: return "HexagonISD::CALL"; + case HexagonISD::RET_FLAG: return
"HexagonISD::RET_FLAG"; + case HexagonISD::BR_JT: return "HexagonISD::BR_JT"; + case HexagonISD::TC_RETURN: return "HexagonISD::TC_RETURN"; } } @@ -1345,9 +1467,10 @@ SDValue HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Should not custom lower this!"); + case ISD::ConstantPool: return LowerConstantPool(Op, DAG); // Frame & Return address. Currently unimplemented. - case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); - case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); + case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); + case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::GlobalTLSAddress: llvm_unreachable("TLS not implemented for Hexagon."); case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, DAG); @@ -1357,9 +1480,10 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::BR_JT: return LowerBR_JT(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); - case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + case ISD::SELECT: return Op; case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); - case ISD::INLINEASM: return LowerINLINEASM(Op, DAG); + case ISD::INLINEASM: return LowerINLINEASM(Op, DAG); } } @@ -1402,9 +1526,11 @@ HexagonTargetLowering::getRegForInlineAsmConstraint(const case MVT::i32: case MVT::i16: case MVT::i8: - return std::make_pair(0U, Hexagon::IntRegsRegisterClass); + case MVT::f32: + return std::make_pair(0U, &Hexagon::IntRegsRegClass); case MVT::i64: - return std::make_pair(0U, Hexagon::DoubleRegsRegisterClass); + case MVT::f64: + return std::make_pair(0U, &Hexagon::DoubleRegsRegClass); } default: llvm_unreachable("Unknown asm register class"); @@ -1414,6 +1540,14 @@ HexagonTargetLowering::getRegForInlineAsmConstraint(const return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); } +/// isFPImmLegal - Returns true if the target can instruction select the +/// specified FP immediate natively. If false, the legalizer will +/// materialize the FP immediate as a load from a constant pool. +bool HexagonTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { + const HexagonRegisterInfo* QRI = TM.getRegisterInfo(); + return QRI->Subtarget.hasV5TOps(); +} + /// isLegalAddressingMode - Return true if the addressing mode represented by /// AM is legal for this target, for a load/store of the specified type. bool HexagonTargetLowering::isLegalAddressingMode(const AddrMode &AM, diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h index 4208bcb..fe6c905 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.h +++ b/lib/Target/Hexagon/HexagonISelLowering.h @@ -27,6 +27,7 @@ namespace llvm { CONST32, CONST32_GP, // For marking data present in GP. + FCONST32, SETCC, ADJDYNALLOC, ARGEXTEND, @@ -48,6 +49,7 @@ namespace llvm { BR_JT, // Jump table. BARRIER, // Memory barrier. 
WrapperJT, + WrapperCP, TC_RETURN }; } @@ -94,13 +96,7 @@ namespace llvm { SmallVectorImpl<SDValue> &InVals) const; SDValue LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, - bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const; SDValue LowerCallResult(SDValue Chain, SDValue InFlag, @@ -128,6 +124,7 @@ namespace llvm { MachineBasicBlock *BB) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; virtual EVT getSetCCResultType(EVT VT) const { return MVT::i1; } @@ -150,6 +147,7 @@ namespace llvm { /// mode is legal for a load/store of any legal type. /// TODO: Handle pre/postinc as well. virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const; + virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const; /// isLegalICmpImmediate - Return true if the specified immediate is legal /// icmp immediate, that is the target has icmp instructions which can diff --git a/lib/Target/Hexagon/HexagonImmediates.td b/lib/Target/Hexagon/HexagonImmediates.td index e78bb79..18692c4 100644 --- a/lib/Target/Hexagon/HexagonImmediates.td +++ b/lib/Target/Hexagon/HexagonImmediates.td @@ -371,7 +371,7 @@ def s4_3ImmPred : PatLeaf<(i32 imm), [{ def u64ImmPred : PatLeaf<(i64 imm), [{ // immS16 predicate - True if the immediate fits in a 16-bit sign extended // field. - // Adding "N ||" to supress gcc unused warning. + // Adding "N ||" to suppress gcc unused warning. return (N || true); }]>; diff --git a/lib/Target/Hexagon/HexagonInstrFormats.td b/lib/Target/Hexagon/HexagonInstrFormats.td index c9f16fb..e472d49 100644 --- a/lib/Target/Hexagon/HexagonInstrFormats.td +++ b/lib/Target/Hexagon/HexagonInstrFormats.td @@ -13,29 +13,48 @@ // *** Must match HexagonBaseInfo.h *** //===----------------------------------------------------------------------===// +class Type<bits<5> t> { + bits<5> Value = t; +} +def TypePSEUDO : Type<0>; +def TypeALU32 : Type<1>; +def TypeCR : Type<2>; +def TypeJR : Type<3>; +def TypeJ : Type<4>; +def TypeLD : Type<5>; +def TypeST : Type<6>; +def TypeSYSTEM : Type<7>; +def TypeXTYPE : Type<8>; +def TypeMARKER : Type<31>; //===----------------------------------------------------------------------===// // Instruction Class Declaration + //===----------------------------------------------------------------------===// class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern, - string cstr, InstrItinClass itin> : Instruction { + string cstr, InstrItinClass itin, Type type> : Instruction { field bits<32> Inst; let Namespace = "Hexagon"; dag OutOperandList = outs; dag InOperandList = ins; - let AsmString = asmstr; + let AsmString = asmstr; let Pattern = pattern; let Constraints = cstr; - let Itinerary = itin; - - // *** The code below must match HexagonBaseInfo.h *** - + let Itinerary = itin; + let Size = 4; + + // *** Must match HexagonBaseInfo.h *** + // Instruction type according to the ISA. + Type HexagonType = type; + let TSFlags{4-0} = HexagonType.Value; + // Solo instructions, i.e., those that cannot be in a packet with others. + bits<1> isHexagonSolo = 0; + let TSFlags{5} = isHexagonSolo; // Predicated instructions.
bits<1> isPredicated = 0; - let TSFlags{1} = isPredicated; + let TSFlags{6} = isPredicated; // *** The code above must match HexagonBaseInfo.h *** } @@ -47,17 +66,25 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern, // LD Instruction Class in V2/V3/V4. // Definition of the instruction class NOT CHANGED. class LDInst<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", LD> { + : InstHexagon<outs, ins, asmstr, pattern, "", LD, TypeLD> { bits<5> rd; bits<5> rs; bits<13> imm13; } +class LDInst2<dag outs, dag ins, string asmstr, list<dag> pattern> + : InstHexagon<outs, ins, asmstr, pattern, "", LD, TypeLD> { + bits<5> rd; + bits<5> rs; + bits<13> imm13; + let mayLoad = 1; +} + // LD Instruction Class in V2/V3/V4. // Definition of the instruction class NOT CHANGED. class LDInstPost<dag outs, dag ins, string asmstr, list<dag> pattern, string cstr> - : InstHexagon<outs, ins, asmstr, pattern, cstr, LD> { + : InstHexagon<outs, ins, asmstr, pattern, cstr, LD, TypeLD> { bits<5> rd; bits<5> rs; bits<5> rt; @@ -68,7 +95,24 @@ class LDInstPost<dag outs, dag ins, string asmstr, list<dag> pattern, // ST Instruction Class in V4 can take SLOT0 & SLOT1. // Definition of the instruction class CHANGED from V2/V3 to V4. class STInst<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", ST> { + : InstHexagon<outs, ins, asmstr, pattern, "", ST, TypeST> { + bits<5> rd; + bits<5> rs; + bits<13> imm13; +} + +class STInst2<dag outs, dag ins, string asmstr, list<dag> pattern> + : InstHexagon<outs, ins, asmstr, pattern, "", ST, TypeST> { + bits<5> rd; + bits<5> rs; + bits<13> imm13; + let mayStore = 1; +} + +// SYSTEM Instruction Class in V4 can take SLOT0 only +// In V2/V3 we used ST for this but in v4 ST can take SLOT0 or SLOT1. +class SYSInst<dag outs, dag ins, string asmstr, list<dag> pattern> + : InstHexagon<outs, ins, asmstr, pattern, "", SYS, TypeSYSTEM> { bits<5> rd; bits<5> rs; bits<13> imm13; @@ -79,7 +123,7 @@ class STInst<dag outs, dag ins, string asmstr, list<dag> pattern> // Definition of the instruction class CHANGED from V2/V3 to V4. class STInstPost<dag outs, dag ins, string asmstr, list<dag> pattern, string cstr> - : InstHexagon<outs, ins, asmstr, pattern, cstr, ST> { + : InstHexagon<outs, ins, asmstr, pattern, cstr, ST, TypeST> { bits<5> rd; bits<5> rs; bits<5> rt; @@ -89,7 +133,7 @@ class STInstPost<dag outs, dag ins, string asmstr, list<dag> pattern, // ALU32 Instruction Class in V2/V3/V4. // Definition of the instruction class NOT CHANGED. class ALU32Type<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", ALU32> { + : InstHexagon<outs, ins, asmstr, pattern, "", ALU32, TypeALU32> { bits<5> rd; bits<5> rs; bits<5> rt; @@ -102,7 +146,17 @@ class ALU32Type<dag outs, dag ins, string asmstr, list<dag> pattern> // Definition of the instruction class NOT CHANGED. // Name of the Instruction Class changed from ALU64 to XTYPE from V2/V3 to V4. 
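The TSFlags packing above (instruction type in bits 4-0, the solo bit at 5, the predicated bit moved from 1 to 6) only works if the C++ reader uses the very same positions, which is what the repeated "Must match HexagonBaseInfo.h" markers insist on. A stand-alone sketch of the matching C++ side, with helper names invented and enum values copied from the Type<> defs above:

#include <cassert>
#include <cstdint>

// Values mirror HexagonInstrFormats.td: TypePSEUDO=0, TypeALU32=1, ...,
// TypeMARKER=31 (five bits' worth).
enum HexagonType : unsigned { TypePSEUDO = 0, TypeALU32 = 1, TypeMARKER = 31 };

constexpr uint64_t packTSFlags(HexagonType T, bool Solo, bool Predicated) {
  return uint64_t(T) | (uint64_t(Solo) << 5) | (uint64_t(Predicated) << 6);
}

constexpr HexagonType typeOf(uint64_t TSFlags) {
  return HexagonType(TSFlags & 0x1F);           // bits 4-0
}

int main() {
  uint64_t F = packTSFlags(TypeALU32, /*Solo=*/false, /*Predicated=*/true);
  assert(typeOf(F) == TypeALU32);
  assert(((F >> 6) & 1) != 0);                  // predicated landed at bit 6
  assert(((F >> 5) & 1) == 0);                  // not solo
  return 0;
}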
class ALU64Type<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", ALU64> { + : InstHexagon<outs, ins, asmstr, pattern, "", ALU64, TypeXTYPE> { + bits<5> rd; + bits<5> rs; + bits<5> rt; + bits<16> imm16; + bits<16> imm16_2; +} + +class ALU64_acc<dag outs, dag ins, string asmstr, list<dag> pattern, + string cstr> + : InstHexagon<outs, ins, asmstr, pattern, cstr, ALU64, TypeXTYPE> { bits<5> rd; bits<5> rs; bits<5> rt; @@ -115,7 +169,7 @@ // Definition of the instruction class NOT CHANGED. // Name of the Instruction Class changed from M to XTYPE from V2/V3 to V4. class MInst<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", M> { + : InstHexagon<outs, ins, asmstr, pattern, "", M, TypeXTYPE> { bits<5> rd; bits<5> rs; bits<5> rt; @@ -126,8 +180,8 @@ class MInst<dag outs, dag ins, string asmstr, list<dag> pattern> // Definition of the instruction class NOT CHANGED. // Name of the Instruction Class changed from M to XTYPE from V2/V3 to V4. class MInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern, - string cstr> - : InstHexagon<outs, ins, asmstr, pattern, cstr, M> { + string cstr> + : InstHexagon<outs, ins, asmstr, pattern, cstr, M, TypeXTYPE> { bits<5> rd; bits<5> rs; bits<5> rt; @@ -138,9 +192,7 @@ class MInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern, // Definition of the instruction class NOT CHANGED. // Name of the Instruction Class changed from S to XTYPE from V2/V3 to V4. class SInst<dag outs, dag ins, string asmstr, list<dag> pattern> -//: InstHexagon<outs, ins, asmstr, pattern, cstr, !if(V4T, XTYPE_V4, M)> { - : InstHexagon<outs, ins, asmstr, pattern, "", S> { -// : InstHexagon<outs, ins, asmstr, pattern, "", S> { + : InstHexagon<outs, ins, asmstr, pattern, "", S, TypeXTYPE> { bits<5> rd; bits<5> rs; bits<5> rt; @@ -151,8 +203,8 @@ class SInst<dag outs, dag ins, string asmstr, list<dag> pattern> // Definition of the instruction class NOT CHANGED. // Name of the Instruction Class changed from S to XTYPE from V2/V3 to V4. class SInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern, - string cstr> - : InstHexagon<outs, ins, asmstr, pattern, cstr, S> { + string cstr> + : InstHexagon<outs, ins, asmstr, pattern, cstr, S, TypeXTYPE> { bits<5> rd; @@ -163,14 +215,14 @@ class SInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern, // J Instruction Class in V2/V3/V4. // Definition of the instruction class NOT CHANGED. class JType<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", J> { + : InstHexagon<outs, ins, asmstr, pattern, "", J, TypeJ> { bits<16> imm16; } // JR Instruction Class in V2/V3/V4. // Definition of the instruction class NOT CHANGED. class JRType<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", JR> { + : InstHexagon<outs, ins, asmstr, pattern, "", JR, TypeJR> { bits<5> rs; bits<5> pu; // Predicate register } @@ -178,15 +230,22 @@ class JRType<dag outs, dag ins, string asmstr, list<dag> pattern> // CR Instruction Class in V2/V3/V4. // Definition of the instruction class NOT CHANGED.
class CRInst<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", CR> { + : InstHexagon<outs, ins, asmstr, pattern, "", CR, TypeCR> { bits<5> rs; bits<10> imm10; } +class Marker<dag outs, dag ins, string asmstr, list<dag> pattern> + : InstHexagon<outs, ins, asmstr, pattern, "", MARKER, TypeMARKER> { + let isCodeGenOnly = 1; + let isPseudo = 1; +} class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", PSEUDO>; - + : InstHexagon<outs, ins, asmstr, pattern, "", PSEUDO, TypePSEUDO> { + let isCodeGenOnly = 1; + let isPseudo = 1; +} //===----------------------------------------------------------------------===// // Instruction Classes Definitions - @@ -222,6 +281,11 @@ class ALU64_rr<dag outs, dag ins, string asmstr, list<dag> pattern> : ALU64Type<outs, ins, asmstr, pattern> { } +class ALU64_ri<dag outs, dag ins, string asmstr, list<dag> pattern> + : ALU64Type<outs, ins, asmstr, pattern> { + let rt{0-4} = 0; +} + // J Type Instructions. class JInst<dag outs, dag ins, string asmstr, list<dag> pattern> : JType<outs, ins, asmstr, pattern> { @@ -234,15 +298,31 @@ class JRInst<dag outs, dag ins, string asmstr, list<dag> pattern> : JRType<outs, ins, asmstr, pattern> { // Post increment ST Instruction. -class STInstPI<dag outs, dag ins, string asmstr, list<dag> pattern, string cstr> +class STInstPI<dag outs, dag ins, string asmstr, list<dag> pattern, + string cstr> + : STInstPost<outs, ins, asmstr, pattern, cstr> { + let rt{0-4} = 0; +} + +class STInst2PI<dag outs, dag ins, string asmstr, list<dag> pattern, + string cstr> : STInstPost<outs, ins, asmstr, pattern, cstr> { let rt{0-4} = 0; + let mayStore = 1; } // Post increment LD Instruction. -class LDInstPI<dag outs, dag ins, string asmstr, list<dag> pattern, string cstr> +class LDInstPI<dag outs, dag ins, string asmstr, list<dag> pattern, + string cstr> + : LDInstPost<outs, ins, asmstr, pattern, cstr> { + let rt{0-4} = 0; +} + +class LDInst2PI<dag outs, dag ins, string asmstr, list<dag> pattern, + string cstr> : LDInstPost<outs, ins, asmstr, pattern, cstr> { let rt{0-4} = 0; + let mayLoad = 1; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonInstrFormatsV4.td b/lib/Target/Hexagon/HexagonInstrFormatsV4.td index bd5e449..49741a3 100644 --- a/lib/Target/Hexagon/HexagonInstrFormatsV4.td +++ b/lib/Target/Hexagon/HexagonInstrFormatsV4.td @@ -11,11 +11,25 @@ // //===----------------------------------------------------------------------===// +//----------------------------------------------------------------------------// +// Hexagon Instruction Flags + +// +// *** Must match BaseInfo.h *** +//----------------------------------------------------------------------------// + +def TypeMEMOP : Type<9>; +def TypeNV : Type<10>; +def TypePREFIX : Type<30>; + +//----------------------------------------------------------------------------// +// Instruction Classes Definitions + +//----------------------------------------------------------------------------// + // // NV type instructions. // class NVInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", NV_V4> { + : InstHexagon<outs, ins, asmstr, pattern, "", NV_V4, TypeNV> { bits<5> rd; bits<5> rs; bits<13> imm13; @@ -24,7 +38,7 @@ class NVInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern> // Definition of Post increment new value store.
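The new LDInst2/STInst2 (and LDInst2PI/STInst2PI) classes exist to state mayLoad/mayStore explicitly: those flags are what reordering passes, the new VLIW packetizer included, fall back on when they cannot prove two memory operations independent. A toy legality check in that spirit; the rule shown is an illustration, not the packetizer's actual test:

#include <cstdio>

struct Insn { const char *Text; bool MayLoad; bool MayStore; };

// Conservative memory-dependence rule: two loads may be reordered freely,
// but once a store is involved the original order must be kept unless a
// real alias analysis proves the accesses independent.
static bool canReorder(const Insn &A, const Insn &B) {
  bool BothTouchMemory =
      (A.MayLoad || A.MayStore) && (B.MayLoad || B.MayStore);
  return !BothTouchMemory || (!A.MayStore && !B.MayStore);
}

int main() {
  Insn Ld1 = {"r0 = memw(r1)", true, false};
  Insn Ld2 = {"r2 = memw(r3)", true, false};
  Insn St  = {"memw(r4) = r5", false, true};
  std::printf("%d %d\n", canReorder(Ld1, Ld2), canReorder(Ld1, St)); // 1 0
}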
class NVInstPost_V4<dag outs, dag ins, string asmstr, list<dag> pattern, string cstr> - : InstHexagon<outs, ins, asmstr, pattern, cstr, NV_V4> { + : InstHexagon<outs, ins, asmstr, pattern, cstr, NV_V4, TypeNV> { bits<5> rd; bits<5> rs; bits<5> rt; @@ -39,8 +53,15 @@ class NVInstPI_V4<dag outs, dag ins, string asmstr, list<dag> pattern, } class MEMInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", MEM_V4> { + : InstHexagon<outs, ins, asmstr, pattern, "", MEM_V4, TypeMEMOP> { bits<5> rd; bits<5> rs; bits<6> imm6; } + +class Immext<dag outs, dag ins, string asmstr, list<dag> pattern> + : InstHexagon<outs, ins, asmstr, pattern, "", PREFIX, TypePREFIX> { + let isCodeGenOnly = 1; + + bits<26> imm26; +} diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp index 77b3663..c8f933d 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -11,10 +11,10 @@ // //===----------------------------------------------------------------------===// -#include "Hexagon.h" #include "HexagonInstrInfo.h" #include "HexagonRegisterInfo.h" #include "HexagonSubtarget.h" +#include "Hexagon.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/DFAPacketizer.h" @@ -34,24 +34,23 @@ using namespace llvm; /// Constants for Hexagon instructions. /// const int Hexagon_MEMW_OFFSET_MAX = 4095; -const int Hexagon_MEMW_OFFSET_MIN = 4096; +const int Hexagon_MEMW_OFFSET_MIN = -4096; const int Hexagon_MEMD_OFFSET_MAX = 8191; -const int Hexagon_MEMD_OFFSET_MIN = 8192; +const int Hexagon_MEMD_OFFSET_MIN = -8192; const int Hexagon_MEMH_OFFSET_MAX = 2047; -const int Hexagon_MEMH_OFFSET_MIN = 2048; +const int Hexagon_MEMH_OFFSET_MIN = -2048; const int Hexagon_MEMB_OFFSET_MAX = 1023; -const int Hexagon_MEMB_OFFSET_MIN = 1024; +const int Hexagon_MEMB_OFFSET_MIN = -1024; const int Hexagon_ADDI_OFFSET_MAX = 32767; -const int Hexagon_ADDI_OFFSET_MIN = 32768; +const int Hexagon_ADDI_OFFSET_MIN = -32768; const int Hexagon_MEMD_AUTOINC_MAX = 56; -const int Hexagon_MEMD_AUTOINC_MIN = 64; +const int Hexagon_MEMD_AUTOINC_MIN = -64; const int Hexagon_MEMW_AUTOINC_MAX = 28; -const int Hexagon_MEMW_AUTOINC_MIN = 32; +const int Hexagon_MEMW_AUTOINC_MIN = -32; const int Hexagon_MEMH_AUTOINC_MAX = 14; -const int Hexagon_MEMH_AUTOINC_MIN = 16; +const int Hexagon_MEMH_AUTOINC_MIN = -16; const int Hexagon_MEMB_AUTOINC_MAX = 7; -const int Hexagon_MEMB_AUTOINC_MIN = 8; - +const int Hexagon_MEMB_AUTOINC_MIN = -8; HexagonInstrInfo::HexagonInstrInfo(HexagonSubtarget &ST) @@ -70,6 +69,7 @@ unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, switch (MI->getOpcode()) { + default: break; case Hexagon::LDriw: case Hexagon::LDrid: case Hexagon::LDrih: @@ -81,11 +81,7 @@ unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, return MI->getOperand(0).getReg(); } break; - - default: - break; } - return 0; } @@ -98,21 +94,18 @@ unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, unsigned HexagonInstrInfo::isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const { switch (MI->getOpcode()) { + default: break; case Hexagon::STriw: case Hexagon::STrid: case Hexagon::STrih: case Hexagon::STrib: if (MI->getOperand(2).isFI() && MI->getOperand(1).isImm() && (MI->getOperand(1).getImm() == 0)) { - FrameIndex = MI->getOperand(2).getIndex(); - return MI->getOperand(0).getReg(); + FrameIndex = MI->getOperand(0).getIndex(); + return 
MI->getOperand(2).getReg(); } break; - - default: - break; } - return 0; } @@ -176,6 +169,7 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const { + TBB = NULL; FBB = NULL; // If the block has no terminators, it just falls into the block after it. @@ -328,7 +322,8 @@ void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB, DestReg).addReg(SrcReg).addReg(SrcReg); return; } - if (Hexagon::DoubleRegsRegClass.contains(DestReg, SrcReg)) { + if (Hexagon::DoubleRegsRegClass.contains(DestReg) && + Hexagon::IntRegsRegClass.contains(SrcReg)) { // We can have an overlap between single and double reg: r1:0 = r0. if(SrcReg == RI.getSubReg(DestReg, Hexagon::subreg_loreg)) { // r1:0 = r0 @@ -343,7 +338,8 @@ void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } return; } - if (Hexagon::CRRegsRegClass.contains(DestReg, SrcReg)) { + if (Hexagon::CRRegsRegClass.contains(DestReg) && + Hexagon::IntRegsRegClass.contains(SrcReg)) { BuildMI(MBB, I, DL, get(Hexagon::TFCR), DestReg).addReg(SrcReg); return; } @@ -370,15 +366,15 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MFI.getObjectSize(FI), Align); - if (Hexagon::IntRegsRegisterClass->hasSubClassEq(RC)) { + if (Hexagon::IntRegsRegClass.hasSubClassEq(RC)) { BuildMI(MBB, I, DL, get(Hexagon::STriw)) .addFrameIndex(FI).addImm(0) .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO); - } else if (Hexagon::DoubleRegsRegisterClass->hasSubClassEq(RC)) { + } else if (Hexagon::DoubleRegsRegClass.hasSubClassEq(RC)) { BuildMI(MBB, I, DL, get(Hexagon::STrid)) .addFrameIndex(FI).addImm(0) .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO); - } else if (Hexagon::PredRegsRegisterClass->hasSubClassEq(RC)) { + } else if (Hexagon::PredRegsRegClass.hasSubClassEq(RC)) { BuildMI(MBB, I, DL, get(Hexagon::STriw_pred)) .addFrameIndex(FI).addImm(0) .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO); @@ -415,14 +411,13 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); - - if (RC == Hexagon::IntRegsRegisterClass) { + if (RC == &Hexagon::IntRegsRegClass) { BuildMI(MBB, I, DL, get(Hexagon::LDriw), DestReg) .addFrameIndex(FI).addImm(0).addMemOperand(MMO); - } else if (RC == Hexagon::DoubleRegsRegisterClass) { + } else if (RC == &Hexagon::DoubleRegsRegClass) { BuildMI(MBB, I, DL, get(Hexagon::LDrid), DestReg) .addFrameIndex(FI).addImm(0).addMemOperand(MMO); - } else if (RC == Hexagon::PredRegsRegisterClass) { + } else if (RC == &Hexagon::PredRegsRegClass) { BuildMI(MBB, I, DL, get(Hexagon::LDriw_pred), DestReg) .addFrameIndex(FI).addImm(0).addMemOperand(MMO); } else { @@ -453,11 +448,11 @@ unsigned HexagonInstrInfo::createVR(MachineFunction* MF, MVT VT) const { MachineRegisterInfo &RegInfo = MF->getRegInfo(); const TargetRegisterClass *TRC; if (VT == MVT::i1) { - TRC = Hexagon::PredRegsRegisterClass; - } else if (VT == MVT::i32) { - TRC = Hexagon::IntRegsRegisterClass; - } else if (VT == MVT::i64) { - TRC = Hexagon::DoubleRegsRegisterClass; + TRC = &Hexagon::PredRegsRegClass; + } else if (VT == MVT::i32 || VT == MVT::f32) { + TRC = &Hexagon::IntRegsRegClass; + } else if (VT == MVT::i64 || VT == MVT::f64) { + TRC = &Hexagon::DoubleRegsRegClass; } else { llvm_unreachable("Cannot handle this register class"); } @@ -466,7 +461,852 @@ unsigned HexagonInstrInfo::createVR(MachineFunction* MF, MVT VT) const { return NewReg; } +bool 
HexagonInstrInfo::isExtendable(const MachineInstr *MI) const { + switch(MI->getOpcode()) { + default: return false; + // JMP_EQri + case Hexagon::JMP_EQriPt_nv_V4: + case Hexagon::JMP_EQriPnt_nv_V4: + case Hexagon::JMP_EQriNotPt_nv_V4: + case Hexagon::JMP_EQriNotPnt_nv_V4: + + // JMP_EQri - with -1 + case Hexagon::JMP_EQriPtneg_nv_V4: + case Hexagon::JMP_EQriPntneg_nv_V4: + case Hexagon::JMP_EQriNotPtneg_nv_V4: + case Hexagon::JMP_EQriNotPntneg_nv_V4: + + // JMP_EQrr + case Hexagon::JMP_EQrrPt_nv_V4: + case Hexagon::JMP_EQrrPnt_nv_V4: + case Hexagon::JMP_EQrrNotPt_nv_V4: + case Hexagon::JMP_EQrrNotPnt_nv_V4: + + // JMP_GTri + case Hexagon::JMP_GTriPt_nv_V4: + case Hexagon::JMP_GTriPnt_nv_V4: + case Hexagon::JMP_GTriNotPt_nv_V4: + case Hexagon::JMP_GTriNotPnt_nv_V4: + + // JMP_GTri - with -1 + case Hexagon::JMP_GTriPtneg_nv_V4: + case Hexagon::JMP_GTriPntneg_nv_V4: + case Hexagon::JMP_GTriNotPtneg_nv_V4: + case Hexagon::JMP_GTriNotPntneg_nv_V4: + // JMP_GTrr + case Hexagon::JMP_GTrrPt_nv_V4: + case Hexagon::JMP_GTrrPnt_nv_V4: + case Hexagon::JMP_GTrrNotPt_nv_V4: + case Hexagon::JMP_GTrrNotPnt_nv_V4: + + // JMP_GTrrdn + case Hexagon::JMP_GTrrdnPt_nv_V4: + case Hexagon::JMP_GTrrdnPnt_nv_V4: + case Hexagon::JMP_GTrrdnNotPt_nv_V4: + case Hexagon::JMP_GTrrdnNotPnt_nv_V4: + + // JMP_GTUri + case Hexagon::JMP_GTUriPt_nv_V4: + case Hexagon::JMP_GTUriPnt_nv_V4: + case Hexagon::JMP_GTUriNotPt_nv_V4: + case Hexagon::JMP_GTUriNotPnt_nv_V4: + + // JMP_GTUrr + case Hexagon::JMP_GTUrrPt_nv_V4: + case Hexagon::JMP_GTUrrPnt_nv_V4: + case Hexagon::JMP_GTUrrNotPt_nv_V4: + case Hexagon::JMP_GTUrrNotPnt_nv_V4: + + // JMP_GTUrrdn + case Hexagon::JMP_GTUrrdnPt_nv_V4: + case Hexagon::JMP_GTUrrdnPnt_nv_V4: + case Hexagon::JMP_GTUrrdnNotPt_nv_V4: + case Hexagon::JMP_GTUrrdnNotPnt_nv_V4: + + // TFR_FI + case Hexagon::TFR_FI: + return true; + } +} + +bool HexagonInstrInfo::isExtended(const MachineInstr *MI) const { + switch(MI->getOpcode()) { + default: return false; + // JMP_EQri + case Hexagon::JMP_EQriPt_ie_nv_V4: + case Hexagon::JMP_EQriPnt_ie_nv_V4: + case Hexagon::JMP_EQriNotPt_ie_nv_V4: + case Hexagon::JMP_EQriNotPnt_ie_nv_V4: + + // JMP_EQri - with -1 + case Hexagon::JMP_EQriPtneg_ie_nv_V4: + case Hexagon::JMP_EQriPntneg_ie_nv_V4: + case Hexagon::JMP_EQriNotPtneg_ie_nv_V4: + case Hexagon::JMP_EQriNotPntneg_ie_nv_V4: + + // JMP_EQrr + case Hexagon::JMP_EQrrPt_ie_nv_V4: + case Hexagon::JMP_EQrrPnt_ie_nv_V4: + case Hexagon::JMP_EQrrNotPt_ie_nv_V4: + case Hexagon::JMP_EQrrNotPnt_ie_nv_V4: + + // JMP_GTri + case Hexagon::JMP_GTriPt_ie_nv_V4: + case Hexagon::JMP_GTriPnt_ie_nv_V4: + case Hexagon::JMP_GTriNotPt_ie_nv_V4: + case Hexagon::JMP_GTriNotPnt_ie_nv_V4: + + // JMP_GTri - with -1 + case Hexagon::JMP_GTriPtneg_ie_nv_V4: + case Hexagon::JMP_GTriPntneg_ie_nv_V4: + case Hexagon::JMP_GTriNotPtneg_ie_nv_V4: + case Hexagon::JMP_GTriNotPntneg_ie_nv_V4: + + // JMP_GTrr + case Hexagon::JMP_GTrrPt_ie_nv_V4: + case Hexagon::JMP_GTrrPnt_ie_nv_V4: + case Hexagon::JMP_GTrrNotPt_ie_nv_V4: + case Hexagon::JMP_GTrrNotPnt_ie_nv_V4: + + // JMP_GTrrdn + case Hexagon::JMP_GTrrdnPt_ie_nv_V4: + case Hexagon::JMP_GTrrdnPnt_ie_nv_V4: + case Hexagon::JMP_GTrrdnNotPt_ie_nv_V4: + case Hexagon::JMP_GTrrdnNotPnt_ie_nv_V4: + + // JMP_GTUri + case Hexagon::JMP_GTUriPt_ie_nv_V4: + case Hexagon::JMP_GTUriPnt_ie_nv_V4: + case Hexagon::JMP_GTUriNotPt_ie_nv_V4: + case Hexagon::JMP_GTUriNotPnt_ie_nv_V4: + + // JMP_GTUrr + case Hexagon::JMP_GTUrrPt_ie_nv_V4: + case Hexagon::JMP_GTUrrPnt_ie_nv_V4: + case Hexagon::JMP_GTUrrNotPt_ie_nv_V4: + case 
Hexagon::JMP_GTUrrNotPnt_ie_nv_V4: + + // JMP_GTUrrdn + case Hexagon::JMP_GTUrrdnPt_ie_nv_V4: + case Hexagon::JMP_GTUrrdnPnt_ie_nv_V4: + case Hexagon::JMP_GTUrrdnNotPt_ie_nv_V4: + case Hexagon::JMP_GTUrrdnNotPnt_ie_nv_V4: + + // V4 absolute set addressing. + case Hexagon::LDrid_abs_setimm_V4: + case Hexagon::LDriw_abs_setimm_V4: + case Hexagon::LDrih_abs_setimm_V4: + case Hexagon::LDrib_abs_setimm_V4: + case Hexagon::LDriuh_abs_setimm_V4: + case Hexagon::LDriub_abs_setimm_V4: + + case Hexagon::STrid_abs_setimm_V4: + case Hexagon::STrib_abs_setimm_V4: + case Hexagon::STrih_abs_setimm_V4: + case Hexagon::STriw_abs_setimm_V4: + + // V4 global address load. + case Hexagon::LDrid_GP_cPt_V4 : + case Hexagon::LDrid_GP_cNotPt_V4 : + case Hexagon::LDrid_GP_cdnPt_V4 : + case Hexagon::LDrid_GP_cdnNotPt_V4 : + case Hexagon::LDrib_GP_cPt_V4 : + case Hexagon::LDrib_GP_cNotPt_V4 : + case Hexagon::LDrib_GP_cdnPt_V4 : + case Hexagon::LDrib_GP_cdnNotPt_V4 : + case Hexagon::LDriub_GP_cPt_V4 : + case Hexagon::LDriub_GP_cNotPt_V4 : + case Hexagon::LDriub_GP_cdnPt_V4 : + case Hexagon::LDriub_GP_cdnNotPt_V4 : + case Hexagon::LDrih_GP_cPt_V4 : + case Hexagon::LDrih_GP_cNotPt_V4 : + case Hexagon::LDrih_GP_cdnPt_V4 : + case Hexagon::LDrih_GP_cdnNotPt_V4 : + case Hexagon::LDriuh_GP_cPt_V4 : + case Hexagon::LDriuh_GP_cNotPt_V4 : + case Hexagon::LDriuh_GP_cdnPt_V4 : + case Hexagon::LDriuh_GP_cdnNotPt_V4 : + case Hexagon::LDriw_GP_cPt_V4 : + case Hexagon::LDriw_GP_cNotPt_V4 : + case Hexagon::LDriw_GP_cdnPt_V4 : + case Hexagon::LDriw_GP_cdnNotPt_V4 : + case Hexagon::LDd_GP_cPt_V4 : + case Hexagon::LDd_GP_cNotPt_V4 : + case Hexagon::LDd_GP_cdnPt_V4 : + case Hexagon::LDd_GP_cdnNotPt_V4 : + case Hexagon::LDb_GP_cPt_V4 : + case Hexagon::LDb_GP_cNotPt_V4 : + case Hexagon::LDb_GP_cdnPt_V4 : + case Hexagon::LDb_GP_cdnNotPt_V4 : + case Hexagon::LDub_GP_cPt_V4 : + case Hexagon::LDub_GP_cNotPt_V4 : + case Hexagon::LDub_GP_cdnPt_V4 : + case Hexagon::LDub_GP_cdnNotPt_V4 : + case Hexagon::LDh_GP_cPt_V4 : + case Hexagon::LDh_GP_cNotPt_V4 : + case Hexagon::LDh_GP_cdnPt_V4 : + case Hexagon::LDh_GP_cdnNotPt_V4 : + case Hexagon::LDuh_GP_cPt_V4 : + case Hexagon::LDuh_GP_cNotPt_V4 : + case Hexagon::LDuh_GP_cdnPt_V4 : + case Hexagon::LDuh_GP_cdnNotPt_V4 : + case Hexagon::LDw_GP_cPt_V4 : + case Hexagon::LDw_GP_cNotPt_V4 : + case Hexagon::LDw_GP_cdnPt_V4 : + case Hexagon::LDw_GP_cdnNotPt_V4 : + + // V4 global address store. 
+ case Hexagon::STrid_GP_cPt_V4 : + case Hexagon::STrid_GP_cNotPt_V4 : + case Hexagon::STrid_GP_cdnPt_V4 : + case Hexagon::STrid_GP_cdnNotPt_V4 : + case Hexagon::STrib_GP_cPt_V4 : + case Hexagon::STrib_GP_cNotPt_V4 : + case Hexagon::STrib_GP_cdnPt_V4 : + case Hexagon::STrib_GP_cdnNotPt_V4 : + case Hexagon::STrih_GP_cPt_V4 : + case Hexagon::STrih_GP_cNotPt_V4 : + case Hexagon::STrih_GP_cdnPt_V4 : + case Hexagon::STrih_GP_cdnNotPt_V4 : + case Hexagon::STriw_GP_cPt_V4 : + case Hexagon::STriw_GP_cNotPt_V4 : + case Hexagon::STriw_GP_cdnPt_V4 : + case Hexagon::STriw_GP_cdnNotPt_V4 : + case Hexagon::STd_GP_cPt_V4 : + case Hexagon::STd_GP_cNotPt_V4 : + case Hexagon::STd_GP_cdnPt_V4 : + case Hexagon::STd_GP_cdnNotPt_V4 : + case Hexagon::STb_GP_cPt_V4 : + case Hexagon::STb_GP_cNotPt_V4 : + case Hexagon::STb_GP_cdnPt_V4 : + case Hexagon::STb_GP_cdnNotPt_V4 : + case Hexagon::STh_GP_cPt_V4 : + case Hexagon::STh_GP_cNotPt_V4 : + case Hexagon::STh_GP_cdnPt_V4 : + case Hexagon::STh_GP_cdnNotPt_V4 : + case Hexagon::STw_GP_cPt_V4 : + case Hexagon::STw_GP_cNotPt_V4 : + case Hexagon::STw_GP_cdnPt_V4 : + case Hexagon::STw_GP_cdnNotPt_V4 : + + // V4 predicated global address new value store. + case Hexagon::STrib_GP_cPt_nv_V4 : + case Hexagon::STrib_GP_cNotPt_nv_V4 : + case Hexagon::STrib_GP_cdnPt_nv_V4 : + case Hexagon::STrib_GP_cdnNotPt_nv_V4 : + case Hexagon::STrih_GP_cPt_nv_V4 : + case Hexagon::STrih_GP_cNotPt_nv_V4 : + case Hexagon::STrih_GP_cdnPt_nv_V4 : + case Hexagon::STrih_GP_cdnNotPt_nv_V4 : + case Hexagon::STriw_GP_cPt_nv_V4 : + case Hexagon::STriw_GP_cNotPt_nv_V4 : + case Hexagon::STriw_GP_cdnPt_nv_V4 : + case Hexagon::STriw_GP_cdnNotPt_nv_V4 : + case Hexagon::STb_GP_cPt_nv_V4 : + case Hexagon::STb_GP_cNotPt_nv_V4 : + case Hexagon::STb_GP_cdnPt_nv_V4 : + case Hexagon::STb_GP_cdnNotPt_nv_V4 : + case Hexagon::STh_GP_cPt_nv_V4 : + case Hexagon::STh_GP_cNotPt_nv_V4 : + case Hexagon::STh_GP_cdnPt_nv_V4 : + case Hexagon::STh_GP_cdnNotPt_nv_V4 : + case Hexagon::STw_GP_cPt_nv_V4 : + case Hexagon::STw_GP_cNotPt_nv_V4 : + case Hexagon::STw_GP_cdnPt_nv_V4 : + case Hexagon::STw_GP_cdnNotPt_nv_V4 : + + // TFR_FI + case Hexagon::TFR_FI_immext_V4: + + // TFRI_F + case Hexagon::TFRI_f: + case Hexagon::TFRI_cPt_f: + case Hexagon::TFRI_cNotPt_f: + case Hexagon::CONST64_Float_Real: + return true; + } +} + +bool HexagonInstrInfo::isNewValueJump(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: return false; + // JMP_EQri + case Hexagon::JMP_EQriPt_nv_V4: + case Hexagon::JMP_EQriPnt_nv_V4: + case Hexagon::JMP_EQriNotPt_nv_V4: + case Hexagon::JMP_EQriNotPnt_nv_V4: + case Hexagon::JMP_EQriPt_ie_nv_V4: + case Hexagon::JMP_EQriPnt_ie_nv_V4: + case Hexagon::JMP_EQriNotPt_ie_nv_V4: + case Hexagon::JMP_EQriNotPnt_ie_nv_V4: + + // JMP_EQri - with -1 + case Hexagon::JMP_EQriPtneg_nv_V4: + case Hexagon::JMP_EQriPntneg_nv_V4: + case Hexagon::JMP_EQriNotPtneg_nv_V4: + case Hexagon::JMP_EQriNotPntneg_nv_V4: + case Hexagon::JMP_EQriPtneg_ie_nv_V4: + case Hexagon::JMP_EQriPntneg_ie_nv_V4: + case Hexagon::JMP_EQriNotPtneg_ie_nv_V4: + case Hexagon::JMP_EQriNotPntneg_ie_nv_V4: + + // JMP_EQrr + case Hexagon::JMP_EQrrPt_nv_V4: + case Hexagon::JMP_EQrrPnt_nv_V4: + case Hexagon::JMP_EQrrNotPt_nv_V4: + case Hexagon::JMP_EQrrNotPnt_nv_V4: + case Hexagon::JMP_EQrrPt_ie_nv_V4: + case Hexagon::JMP_EQrrPnt_ie_nv_V4: + case Hexagon::JMP_EQrrNotPt_ie_nv_V4: + case Hexagon::JMP_EQrrNotPnt_ie_nv_V4: + + // JMP_GTri + case Hexagon::JMP_GTriPt_nv_V4: + case Hexagon::JMP_GTriPnt_nv_V4: + case Hexagon::JMP_GTriNotPt_nv_V4: + case 
Hexagon::JMP_GTriNotPnt_nv_V4: + case Hexagon::JMP_GTriPt_ie_nv_V4: + case Hexagon::JMP_GTriPnt_ie_nv_V4: + case Hexagon::JMP_GTriNotPt_ie_nv_V4: + case Hexagon::JMP_GTriNotPnt_ie_nv_V4: + + // JMP_GTri - with -1 + case Hexagon::JMP_GTriPtneg_nv_V4: + case Hexagon::JMP_GTriPntneg_nv_V4: + case Hexagon::JMP_GTriNotPtneg_nv_V4: + case Hexagon::JMP_GTriNotPntneg_nv_V4: + case Hexagon::JMP_GTriPtneg_ie_nv_V4: + case Hexagon::JMP_GTriPntneg_ie_nv_V4: + case Hexagon::JMP_GTriNotPtneg_ie_nv_V4: + case Hexagon::JMP_GTriNotPntneg_ie_nv_V4: + + // JMP_GTrr + case Hexagon::JMP_GTrrPt_nv_V4: + case Hexagon::JMP_GTrrPnt_nv_V4: + case Hexagon::JMP_GTrrNotPt_nv_V4: + case Hexagon::JMP_GTrrNotPnt_nv_V4: + case Hexagon::JMP_GTrrPt_ie_nv_V4: + case Hexagon::JMP_GTrrPnt_ie_nv_V4: + case Hexagon::JMP_GTrrNotPt_ie_nv_V4: + case Hexagon::JMP_GTrrNotPnt_ie_nv_V4: + + // JMP_GTrrdn + case Hexagon::JMP_GTrrdnPt_nv_V4: + case Hexagon::JMP_GTrrdnPnt_nv_V4: + case Hexagon::JMP_GTrrdnNotPt_nv_V4: + case Hexagon::JMP_GTrrdnNotPnt_nv_V4: + case Hexagon::JMP_GTrrdnPt_ie_nv_V4: + case Hexagon::JMP_GTrrdnPnt_ie_nv_V4: + case Hexagon::JMP_GTrrdnNotPt_ie_nv_V4: + case Hexagon::JMP_GTrrdnNotPnt_ie_nv_V4: + + // JMP_GTUri + case Hexagon::JMP_GTUriPt_nv_V4: + case Hexagon::JMP_GTUriPnt_nv_V4: + case Hexagon::JMP_GTUriNotPt_nv_V4: + case Hexagon::JMP_GTUriNotPnt_nv_V4: + case Hexagon::JMP_GTUriPt_ie_nv_V4: + case Hexagon::JMP_GTUriPnt_ie_nv_V4: + case Hexagon::JMP_GTUriNotPt_ie_nv_V4: + case Hexagon::JMP_GTUriNotPnt_ie_nv_V4: + + // JMP_GTUrr + case Hexagon::JMP_GTUrrPt_nv_V4: + case Hexagon::JMP_GTUrrPnt_nv_V4: + case Hexagon::JMP_GTUrrNotPt_nv_V4: + case Hexagon::JMP_GTUrrNotPnt_nv_V4: + case Hexagon::JMP_GTUrrPt_ie_nv_V4: + case Hexagon::JMP_GTUrrPnt_ie_nv_V4: + case Hexagon::JMP_GTUrrNotPt_ie_nv_V4: + case Hexagon::JMP_GTUrrNotPnt_ie_nv_V4: + + // JMP_GTUrrdn + case Hexagon::JMP_GTUrrdnPt_nv_V4: + case Hexagon::JMP_GTUrrdnPnt_nv_V4: + case Hexagon::JMP_GTUrrdnNotPt_nv_V4: + case Hexagon::JMP_GTUrrdnNotPnt_nv_V4: + case Hexagon::JMP_GTUrrdnPt_ie_nv_V4: + case Hexagon::JMP_GTUrrdnPnt_ie_nv_V4: + case Hexagon::JMP_GTUrrdnNotPt_ie_nv_V4: + case Hexagon::JMP_GTUrrdnNotPnt_ie_nv_V4: + return true; + } +} + +unsigned HexagonInstrInfo::getImmExtForm(const MachineInstr* MI) const { + switch(MI->getOpcode()) { + default: llvm_unreachable("Unknown type of instruction."); + // JMP_EQri + case Hexagon::JMP_EQriPt_nv_V4: + return Hexagon::JMP_EQriPt_ie_nv_V4; + case Hexagon::JMP_EQriNotPt_nv_V4: + return Hexagon::JMP_EQriNotPt_ie_nv_V4; + case Hexagon::JMP_EQriPnt_nv_V4: + return Hexagon::JMP_EQriPnt_ie_nv_V4; + case Hexagon::JMP_EQriNotPnt_nv_V4: + return Hexagon::JMP_EQriNotPnt_ie_nv_V4; + + // JMP_EQri -- with -1 + case Hexagon::JMP_EQriPtneg_nv_V4: + return Hexagon::JMP_EQriPtneg_ie_nv_V4; + case Hexagon::JMP_EQriNotPtneg_nv_V4: + return Hexagon::JMP_EQriNotPtneg_ie_nv_V4; + case Hexagon::JMP_EQriPntneg_nv_V4: + return Hexagon::JMP_EQriPntneg_ie_nv_V4; + case Hexagon::JMP_EQriNotPntneg_nv_V4: + return Hexagon::JMP_EQriNotPntneg_ie_nv_V4; + + // JMP_EQrr + case Hexagon::JMP_EQrrPt_nv_V4: + return Hexagon::JMP_EQrrPt_ie_nv_V4; + case Hexagon::JMP_EQrrNotPt_nv_V4: + return Hexagon::JMP_EQrrNotPt_ie_nv_V4; + case Hexagon::JMP_EQrrPnt_nv_V4: + return Hexagon::JMP_EQrrPnt_ie_nv_V4; + case Hexagon::JMP_EQrrNotPnt_nv_V4: + return Hexagon::JMP_EQrrNotPnt_ie_nv_V4; + + // JMP_GTri + case Hexagon::JMP_GTriPt_nv_V4: + return Hexagon::JMP_GTriPt_ie_nv_V4; + case Hexagon::JMP_GTriNotPt_nv_V4: + return Hexagon::JMP_GTriNotPt_ie_nv_V4; + case 
Hexagon::JMP_GTriPnt_nv_V4: + return Hexagon::JMP_GTriPnt_ie_nv_V4; + case Hexagon::JMP_GTriNotPnt_nv_V4: + return Hexagon::JMP_GTriNotPnt_ie_nv_V4; + + // JMP_GTri -- with -1 + case Hexagon::JMP_GTriPtneg_nv_V4: + return Hexagon::JMP_GTriPtneg_ie_nv_V4; + case Hexagon::JMP_GTriNotPtneg_nv_V4: + return Hexagon::JMP_GTriNotPtneg_ie_nv_V4; + case Hexagon::JMP_GTriPntneg_nv_V4: + return Hexagon::JMP_GTriPntneg_ie_nv_V4; + case Hexagon::JMP_GTriNotPntneg_nv_V4: + return Hexagon::JMP_GTriNotPntneg_ie_nv_V4; + + // JMP_GTrr + case Hexagon::JMP_GTrrPt_nv_V4: + return Hexagon::JMP_GTrrPt_ie_nv_V4; + case Hexagon::JMP_GTrrNotPt_nv_V4: + return Hexagon::JMP_GTrrNotPt_ie_nv_V4; + case Hexagon::JMP_GTrrPnt_nv_V4: + return Hexagon::JMP_GTrrPnt_ie_nv_V4; + case Hexagon::JMP_GTrrNotPnt_nv_V4: + return Hexagon::JMP_GTrrNotPnt_ie_nv_V4; + + // JMP_GTrrdn + case Hexagon::JMP_GTrrdnPt_nv_V4: + return Hexagon::JMP_GTrrdnPt_ie_nv_V4; + case Hexagon::JMP_GTrrdnNotPt_nv_V4: + return Hexagon::JMP_GTrrdnNotPt_ie_nv_V4; + case Hexagon::JMP_GTrrdnPnt_nv_V4: + return Hexagon::JMP_GTrrdnPnt_ie_nv_V4; + case Hexagon::JMP_GTrrdnNotPnt_nv_V4: + return Hexagon::JMP_GTrrdnNotPnt_ie_nv_V4; + + // JMP_GTUri + case Hexagon::JMP_GTUriPt_nv_V4: + return Hexagon::JMP_GTUriPt_ie_nv_V4; + case Hexagon::JMP_GTUriNotPt_nv_V4: + return Hexagon::JMP_GTUriNotPt_ie_nv_V4; + case Hexagon::JMP_GTUriPnt_nv_V4: + return Hexagon::JMP_GTUriPnt_ie_nv_V4; + case Hexagon::JMP_GTUriNotPnt_nv_V4: + return Hexagon::JMP_GTUriNotPnt_ie_nv_V4; + + // JMP_GTUrr + case Hexagon::JMP_GTUrrPt_nv_V4: + return Hexagon::JMP_GTUrrPt_ie_nv_V4; + case Hexagon::JMP_GTUrrNotPt_nv_V4: + return Hexagon::JMP_GTUrrNotPt_ie_nv_V4; + case Hexagon::JMP_GTUrrPnt_nv_V4: + return Hexagon::JMP_GTUrrPnt_ie_nv_V4; + case Hexagon::JMP_GTUrrNotPnt_nv_V4: + return Hexagon::JMP_GTUrrNotPnt_ie_nv_V4; + + // JMP_GTUrrdn + case Hexagon::JMP_GTUrrdnPt_nv_V4: + return Hexagon::JMP_GTUrrdnPt_ie_nv_V4; + case Hexagon::JMP_GTUrrdnNotPt_nv_V4: + return Hexagon::JMP_GTUrrdnNotPt_ie_nv_V4; + case Hexagon::JMP_GTUrrdnPnt_nv_V4: + return Hexagon::JMP_GTUrrdnPnt_ie_nv_V4; + case Hexagon::JMP_GTUrrdnNotPnt_nv_V4: + return Hexagon::JMP_GTUrrdnNotPnt_ie_nv_V4; + + case Hexagon::TFR_FI: + return Hexagon::TFR_FI_immext_V4; + + case Hexagon::MEMw_ADDSUBi_indexed_MEM_V4 : + case Hexagon::MEMw_ADDi_indexed_MEM_V4 : + case Hexagon::MEMw_SUBi_indexed_MEM_V4 : + case Hexagon::MEMw_ADDr_indexed_MEM_V4 : + case Hexagon::MEMw_SUBr_indexed_MEM_V4 : + case Hexagon::MEMw_ANDr_indexed_MEM_V4 : + case Hexagon::MEMw_ORr_indexed_MEM_V4 : + case Hexagon::MEMw_ADDSUBi_MEM_V4 : + case Hexagon::MEMw_ADDi_MEM_V4 : + case Hexagon::MEMw_SUBi_MEM_V4 : + case Hexagon::MEMw_ADDr_MEM_V4 : + case Hexagon::MEMw_SUBr_MEM_V4 : + case Hexagon::MEMw_ANDr_MEM_V4 : + case Hexagon::MEMw_ORr_MEM_V4 : + case Hexagon::MEMh_ADDSUBi_indexed_MEM_V4 : + case Hexagon::MEMh_ADDi_indexed_MEM_V4 : + case Hexagon::MEMh_SUBi_indexed_MEM_V4 : + case Hexagon::MEMh_ADDr_indexed_MEM_V4 : + case Hexagon::MEMh_SUBr_indexed_MEM_V4 : + case Hexagon::MEMh_ANDr_indexed_MEM_V4 : + case Hexagon::MEMh_ORr_indexed_MEM_V4 : + case Hexagon::MEMh_ADDSUBi_MEM_V4 : + case Hexagon::MEMh_ADDi_MEM_V4 : + case Hexagon::MEMh_SUBi_MEM_V4 : + case Hexagon::MEMh_ADDr_MEM_V4 : + case Hexagon::MEMh_SUBr_MEM_V4 : + case Hexagon::MEMh_ANDr_MEM_V4 : + case Hexagon::MEMh_ORr_MEM_V4 : + case Hexagon::MEMb_ADDSUBi_indexed_MEM_V4 : + case Hexagon::MEMb_ADDi_indexed_MEM_V4 : + case Hexagon::MEMb_SUBi_indexed_MEM_V4 : + case Hexagon::MEMb_ADDr_indexed_MEM_V4 : + case 
Hexagon::MEMb_SUBr_indexed_MEM_V4 : + case Hexagon::MEMb_ANDr_indexed_MEM_V4 : + case Hexagon::MEMb_ORr_indexed_MEM_V4 : + case Hexagon::MEMb_ADDSUBi_MEM_V4 : + case Hexagon::MEMb_ADDi_MEM_V4 : + case Hexagon::MEMb_SUBi_MEM_V4 : + case Hexagon::MEMb_ADDr_MEM_V4 : + case Hexagon::MEMb_SUBr_MEM_V4 : + case Hexagon::MEMb_ANDr_MEM_V4 : + case Hexagon::MEMb_ORr_MEM_V4 : + llvm_unreachable("Needs implementing."); + } +} + +unsigned HexagonInstrInfo::getNormalBranchForm(const MachineInstr* MI) const { + switch(MI->getOpcode()) { + default: llvm_unreachable("Unknown type of jump instruction."); + // JMP_EQri + case Hexagon::JMP_EQriPt_ie_nv_V4: + return Hexagon::JMP_EQriPt_nv_V4; + case Hexagon::JMP_EQriNotPt_ie_nv_V4: + return Hexagon::JMP_EQriNotPt_nv_V4; + case Hexagon::JMP_EQriPnt_ie_nv_V4: + return Hexagon::JMP_EQriPnt_nv_V4; + case Hexagon::JMP_EQriNotPnt_ie_nv_V4: + return Hexagon::JMP_EQriNotPnt_nv_V4; + + // JMP_EQri -- with -1 + case Hexagon::JMP_EQriPtneg_ie_nv_V4: + return Hexagon::JMP_EQriPtneg_nv_V4; + case Hexagon::JMP_EQriNotPtneg_ie_nv_V4: + return Hexagon::JMP_EQriNotPtneg_nv_V4; + case Hexagon::JMP_EQriPntneg_ie_nv_V4: + return Hexagon::JMP_EQriPntneg_nv_V4; + case Hexagon::JMP_EQriNotPntneg_ie_nv_V4: + return Hexagon::JMP_EQriNotPntneg_nv_V4; + + // JMP_EQrr + case Hexagon::JMP_EQrrPt_ie_nv_V4: + return Hexagon::JMP_EQrrPt_nv_V4; + case Hexagon::JMP_EQrrNotPt_ie_nv_V4: + return Hexagon::JMP_EQrrNotPt_nv_V4; + case Hexagon::JMP_EQrrPnt_ie_nv_V4: + return Hexagon::JMP_EQrrPnt_nv_V4; + case Hexagon::JMP_EQrrNotPnt_ie_nv_V4: + return Hexagon::JMP_EQrrNotPnt_nv_V4; + + // JMP_GTri + case Hexagon::JMP_GTriPt_ie_nv_V4: + return Hexagon::JMP_GTriPt_nv_V4; + case Hexagon::JMP_GTriNotPt_ie_nv_V4: + return Hexagon::JMP_GTriNotPt_nv_V4; + case Hexagon::JMP_GTriPnt_ie_nv_V4: + return Hexagon::JMP_GTriPnt_nv_V4; + case Hexagon::JMP_GTriNotPnt_ie_nv_V4: + return Hexagon::JMP_GTriNotPnt_nv_V4; + + // JMP_GTri -- with -1 + case Hexagon::JMP_GTriPtneg_ie_nv_V4: + return Hexagon::JMP_GTriPtneg_nv_V4; + case Hexagon::JMP_GTriNotPtneg_ie_nv_V4: + return Hexagon::JMP_GTriNotPtneg_nv_V4; + case Hexagon::JMP_GTriPntneg_ie_nv_V4: + return Hexagon::JMP_GTriPntneg_nv_V4; + case Hexagon::JMP_GTriNotPntneg_ie_nv_V4: + return Hexagon::JMP_GTriNotPntneg_nv_V4; + + // JMP_GTrr + case Hexagon::JMP_GTrrPt_ie_nv_V4: + return Hexagon::JMP_GTrrPt_nv_V4; + case Hexagon::JMP_GTrrNotPt_ie_nv_V4: + return Hexagon::JMP_GTrrNotPt_nv_V4; + case Hexagon::JMP_GTrrPnt_ie_nv_V4: + return Hexagon::JMP_GTrrPnt_nv_V4; + case Hexagon::JMP_GTrrNotPnt_ie_nv_V4: + return Hexagon::JMP_GTrrNotPnt_nv_V4; + + // JMP_GTrrdn + case Hexagon::JMP_GTrrdnPt_ie_nv_V4: + return Hexagon::JMP_GTrrdnPt_nv_V4; + case Hexagon::JMP_GTrrdnNotPt_ie_nv_V4: + return Hexagon::JMP_GTrrdnNotPt_nv_V4; + case Hexagon::JMP_GTrrdnPnt_ie_nv_V4: + return Hexagon::JMP_GTrrdnPnt_nv_V4; + case Hexagon::JMP_GTrrdnNotPnt_ie_nv_V4: + return Hexagon::JMP_GTrrdnNotPnt_nv_V4; + + // JMP_GTUri + case Hexagon::JMP_GTUriPt_ie_nv_V4: + return Hexagon::JMP_GTUriPt_nv_V4; + case Hexagon::JMP_GTUriNotPt_ie_nv_V4: + return Hexagon::JMP_GTUriNotPt_nv_V4; + case Hexagon::JMP_GTUriPnt_ie_nv_V4: + return Hexagon::JMP_GTUriPnt_nv_V4; + case Hexagon::JMP_GTUriNotPnt_ie_nv_V4: + return Hexagon::JMP_GTUriNotPnt_nv_V4; + + // JMP_GTUrr + case Hexagon::JMP_GTUrrPt_ie_nv_V4: + return Hexagon::JMP_GTUrrPt_nv_V4; + case Hexagon::JMP_GTUrrNotPt_ie_nv_V4: + return Hexagon::JMP_GTUrrNotPt_nv_V4; + case Hexagon::JMP_GTUrrPnt_ie_nv_V4: + return Hexagon::JMP_GTUrrPnt_nv_V4; + case 
Hexagon::JMP_GTUrrNotPnt_ie_nv_V4: + return Hexagon::JMP_GTUrrNotPnt_nv_V4; + + // JMP_GTUrrdn + case Hexagon::JMP_GTUrrdnPt_ie_nv_V4: + return Hexagon::JMP_GTUrrdnPt_nv_V4; + case Hexagon::JMP_GTUrrdnNotPt_ie_nv_V4: + return Hexagon::JMP_GTUrrdnNotPt_nv_V4; + case Hexagon::JMP_GTUrrdnPnt_ie_nv_V4: + return Hexagon::JMP_GTUrrdnPnt_nv_V4; + case Hexagon::JMP_GTUrrdnNotPnt_ie_nv_V4: + return Hexagon::JMP_GTUrrdnNotPnt_nv_V4; + } +} + + +bool HexagonInstrInfo::isNewValueStore(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: return false; + // Store Byte + case Hexagon::STrib_nv_V4: + case Hexagon::STrib_indexed_nv_V4: + case Hexagon::STrib_indexed_shl_nv_V4: + case Hexagon::STrib_shl_nv_V4: + case Hexagon::STrib_GP_nv_V4: + case Hexagon::STb_GP_nv_V4: + case Hexagon::POST_STbri_nv_V4: + case Hexagon::STrib_cPt_nv_V4: + case Hexagon::STrib_cdnPt_nv_V4: + case Hexagon::STrib_cNotPt_nv_V4: + case Hexagon::STrib_cdnNotPt_nv_V4: + case Hexagon::STrib_indexed_cPt_nv_V4: + case Hexagon::STrib_indexed_cdnPt_nv_V4: + case Hexagon::STrib_indexed_cNotPt_nv_V4: + case Hexagon::STrib_indexed_cdnNotPt_nv_V4: + case Hexagon::STrib_indexed_shl_cPt_nv_V4: + case Hexagon::STrib_indexed_shl_cdnPt_nv_V4: + case Hexagon::STrib_indexed_shl_cNotPt_nv_V4: + case Hexagon::STrib_indexed_shl_cdnNotPt_nv_V4: + case Hexagon::POST_STbri_cPt_nv_V4: + case Hexagon::POST_STbri_cdnPt_nv_V4: + case Hexagon::POST_STbri_cNotPt_nv_V4: + case Hexagon::POST_STbri_cdnNotPt_nv_V4: + case Hexagon::STb_GP_cPt_nv_V4: + case Hexagon::STb_GP_cNotPt_nv_V4: + case Hexagon::STb_GP_cdnPt_nv_V4: + case Hexagon::STb_GP_cdnNotPt_nv_V4: + case Hexagon::STrib_GP_cPt_nv_V4: + case Hexagon::STrib_GP_cNotPt_nv_V4: + case Hexagon::STrib_GP_cdnPt_nv_V4: + case Hexagon::STrib_GP_cdnNotPt_nv_V4: + case Hexagon::STrib_abs_nv_V4: + case Hexagon::STrib_abs_cPt_nv_V4: + case Hexagon::STrib_abs_cdnPt_nv_V4: + case Hexagon::STrib_abs_cNotPt_nv_V4: + case Hexagon::STrib_abs_cdnNotPt_nv_V4: + case Hexagon::STrib_imm_abs_nv_V4: + case Hexagon::STrib_imm_abs_cPt_nv_V4: + case Hexagon::STrib_imm_abs_cdnPt_nv_V4: + case Hexagon::STrib_imm_abs_cNotPt_nv_V4: + case Hexagon::STrib_imm_abs_cdnNotPt_nv_V4: + + // Store Halfword + case Hexagon::STrih_nv_V4: + case Hexagon::STrih_indexed_nv_V4: + case Hexagon::STrih_indexed_shl_nv_V4: + case Hexagon::STrih_shl_nv_V4: + case Hexagon::STrih_GP_nv_V4: + case Hexagon::STh_GP_nv_V4: + case Hexagon::POST_SThri_nv_V4: + case Hexagon::STrih_cPt_nv_V4: + case Hexagon::STrih_cdnPt_nv_V4: + case Hexagon::STrih_cNotPt_nv_V4: + case Hexagon::STrih_cdnNotPt_nv_V4: + case Hexagon::STrih_indexed_cPt_nv_V4: + case Hexagon::STrih_indexed_cdnPt_nv_V4: + case Hexagon::STrih_indexed_cNotPt_nv_V4: + case Hexagon::STrih_indexed_cdnNotPt_nv_V4: + case Hexagon::STrih_indexed_shl_cPt_nv_V4: + case Hexagon::STrih_indexed_shl_cdnPt_nv_V4: + case Hexagon::STrih_indexed_shl_cNotPt_nv_V4: + case Hexagon::STrih_indexed_shl_cdnNotPt_nv_V4: + case Hexagon::POST_SThri_cPt_nv_V4: + case Hexagon::POST_SThri_cdnPt_nv_V4: + case Hexagon::POST_SThri_cNotPt_nv_V4: + case Hexagon::POST_SThri_cdnNotPt_nv_V4: + case Hexagon::STh_GP_cPt_nv_V4: + case Hexagon::STh_GP_cNotPt_nv_V4: + case Hexagon::STh_GP_cdnPt_nv_V4: + case Hexagon::STh_GP_cdnNotPt_nv_V4: + case Hexagon::STrih_GP_cPt_nv_V4: + case Hexagon::STrih_GP_cNotPt_nv_V4: + case Hexagon::STrih_GP_cdnPt_nv_V4: + case Hexagon::STrih_GP_cdnNotPt_nv_V4: + case Hexagon::STrih_abs_nv_V4: + case Hexagon::STrih_abs_cPt_nv_V4: + case Hexagon::STrih_abs_cdnPt_nv_V4: + case 
Hexagon::STrih_abs_cNotPt_nv_V4: + case Hexagon::STrih_abs_cdnNotPt_nv_V4: + case Hexagon::STrih_imm_abs_nv_V4: + case Hexagon::STrih_imm_abs_cPt_nv_V4: + case Hexagon::STrih_imm_abs_cdnPt_nv_V4: + case Hexagon::STrih_imm_abs_cNotPt_nv_V4: + case Hexagon::STrih_imm_abs_cdnNotPt_nv_V4: + + // Store Word + case Hexagon::STriw_nv_V4: + case Hexagon::STriw_indexed_nv_V4: + case Hexagon::STriw_indexed_shl_nv_V4: + case Hexagon::STriw_shl_nv_V4: + case Hexagon::STriw_GP_nv_V4: + case Hexagon::STw_GP_nv_V4: + case Hexagon::POST_STwri_nv_V4: + case Hexagon::STriw_cPt_nv_V4: + case Hexagon::STriw_cdnPt_nv_V4: + case Hexagon::STriw_cNotPt_nv_V4: + case Hexagon::STriw_cdnNotPt_nv_V4: + case Hexagon::STriw_indexed_cPt_nv_V4: + case Hexagon::STriw_indexed_cdnPt_nv_V4: + case Hexagon::STriw_indexed_cNotPt_nv_V4: + case Hexagon::STriw_indexed_cdnNotPt_nv_V4: + case Hexagon::STriw_indexed_shl_cPt_nv_V4: + case Hexagon::STriw_indexed_shl_cdnPt_nv_V4: + case Hexagon::STriw_indexed_shl_cNotPt_nv_V4: + case Hexagon::STriw_indexed_shl_cdnNotPt_nv_V4: + case Hexagon::POST_STwri_cPt_nv_V4: + case Hexagon::POST_STwri_cdnPt_nv_V4: + case Hexagon::POST_STwri_cNotPt_nv_V4: + case Hexagon::POST_STwri_cdnNotPt_nv_V4: + case Hexagon::STw_GP_cPt_nv_V4: + case Hexagon::STw_GP_cNotPt_nv_V4: + case Hexagon::STw_GP_cdnPt_nv_V4: + case Hexagon::STw_GP_cdnNotPt_nv_V4: + case Hexagon::STriw_GP_cPt_nv_V4: + case Hexagon::STriw_GP_cNotPt_nv_V4: + case Hexagon::STriw_GP_cdnPt_nv_V4: + case Hexagon::STriw_GP_cdnNotPt_nv_V4: + case Hexagon::STriw_abs_nv_V4: + case Hexagon::STriw_abs_cPt_nv_V4: + case Hexagon::STriw_abs_cdnPt_nv_V4: + case Hexagon::STriw_abs_cNotPt_nv_V4: + case Hexagon::STriw_abs_cdnNotPt_nv_V4: + case Hexagon::STriw_imm_abs_nv_V4: + case Hexagon::STriw_imm_abs_cPt_nv_V4: + case Hexagon::STriw_imm_abs_cdnPt_nv_V4: + case Hexagon::STriw_imm_abs_cNotPt_nv_V4: + case Hexagon::STriw_imm_abs_cdnNotPt_nv_V4: + return true; + } +} + +bool HexagonInstrInfo::isPostIncrement (const MachineInstr* MI) const { + switch (MI->getOpcode()) + { + default: return false; + // Load Byte + case Hexagon::POST_LDrib: + case Hexagon::POST_LDrib_cPt: + case Hexagon::POST_LDrib_cNotPt: + case Hexagon::POST_LDrib_cdnPt_V4: + case Hexagon::POST_LDrib_cdnNotPt_V4: + + // Load unsigned byte + case Hexagon::POST_LDriub: + case Hexagon::POST_LDriub_cPt: + case Hexagon::POST_LDriub_cNotPt: + case Hexagon::POST_LDriub_cdnPt_V4: + case Hexagon::POST_LDriub_cdnNotPt_V4: + + // Load halfword + case Hexagon::POST_LDrih: + case Hexagon::POST_LDrih_cPt: + case Hexagon::POST_LDrih_cNotPt: + case Hexagon::POST_LDrih_cdnPt_V4: + case Hexagon::POST_LDrih_cdnNotPt_V4: + + // Load unsigned halfword + case Hexagon::POST_LDriuh: + case Hexagon::POST_LDriuh_cPt: + case Hexagon::POST_LDriuh_cNotPt: + case Hexagon::POST_LDriuh_cdnPt_V4: + case Hexagon::POST_LDriuh_cdnNotPt_V4: + + // Load word + case Hexagon::POST_LDriw: + case Hexagon::POST_LDriw_cPt: + case Hexagon::POST_LDriw_cNotPt: + case Hexagon::POST_LDriw_cdnPt_V4: + case Hexagon::POST_LDriw_cdnNotPt_V4: + + // Load double word + case Hexagon::POST_LDrid: + case Hexagon::POST_LDrid_cPt: + case Hexagon::POST_LDrid_cNotPt: + case Hexagon::POST_LDrid_cdnPt_V4: + case Hexagon::POST_LDrid_cdnNotPt_V4: + + // Store byte + case Hexagon::POST_STbri: + case Hexagon::POST_STbri_cPt: + case Hexagon::POST_STbri_cNotPt: + case Hexagon::POST_STbri_cdnPt_V4: + case Hexagon::POST_STbri_cdnNotPt_V4: + + // Store halfword + case Hexagon::POST_SThri: + case Hexagon::POST_SThri_cPt: + case Hexagon::POST_SThri_cNotPt: + case 
Hexagon::POST_SThri_cdnPt_V4: + case Hexagon::POST_SThri_cdnNotPt_V4: + + // Store word + case Hexagon::POST_STwri: + case Hexagon::POST_STwri_cPt: + case Hexagon::POST_STwri_cNotPt: + case Hexagon::POST_STwri_cdnPt_V4: + case Hexagon::POST_STwri_cdnNotPt_V4: + + // Store double word + case Hexagon::POST_STdri: + case Hexagon::POST_STdri_cPt: + case Hexagon::POST_STdri_cNotPt: + case Hexagon::POST_STdri_cdnPt_V4: + case Hexagon::POST_STdri_cdnNotPt_V4: + return true; + } +} + +bool HexagonInstrInfo::isSaveCalleeSavedRegsCall(const MachineInstr *MI) const { + return MI->getOpcode() == Hexagon::SAVE_REGISTERS_CALL_V4; +} bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const { bool isPred = MI->getDesc().isPredicable(); @@ -548,7 +1388,7 @@ bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const { case Hexagon::SXTH: case Hexagon::ZXTB: case Hexagon::ZXTH: - return Subtarget.getHexagonArchVersion() == HexagonSubtarget::V4; + return Subtarget.hasV4TOps(); case Hexagon::JMPR: return false; @@ -557,8 +1397,27 @@ bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const { return true; } +// This function performs the following inversions: +// +// cPt ---> cNotPt +// cNotPt ---> cPt +// +// However, these inversions are NOT included: +// +// cdnPt -X-> cdnNotPt +// cdnNotPt -X-> cdnPt +// cPt_nv -X-> cNotPt_nv (new value stores) +// cNotPt_nv -X-> cPt_nv (new value stores) +// +// because only the following transformations are allowed: +// +// cNotPt ---> cdnNotPt +// cPt ---> cdnPt +// cNotPt ---> cNotPt_nv +// cPt ---> cPt_nv unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const { switch(Opc) { + default: llvm_unreachable("Unexpected predicated instruction"); case Hexagon::TFR_cPt: return Hexagon::TFR_cNotPt; case Hexagon::TFR_cNotPt: @@ -805,6 +1664,47 @@ unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const { case Hexagon::STrid_indexed_shl_cNotPt_V4: return Hexagon::STrid_indexed_shl_cPt_V4; + // V4 Store to global address. + case Hexagon::STd_GP_cPt_V4: + return Hexagon::STd_GP_cNotPt_V4; + case Hexagon::STd_GP_cNotPt_V4: + return Hexagon::STd_GP_cPt_V4; + + case Hexagon::STb_GP_cPt_V4: + return Hexagon::STb_GP_cNotPt_V4; + case Hexagon::STb_GP_cNotPt_V4: + return Hexagon::STb_GP_cPt_V4; + + case Hexagon::STh_GP_cPt_V4: + return Hexagon::STh_GP_cNotPt_V4; + case Hexagon::STh_GP_cNotPt_V4: + return Hexagon::STh_GP_cPt_V4; + + case Hexagon::STw_GP_cPt_V4: + return Hexagon::STw_GP_cNotPt_V4; + case Hexagon::STw_GP_cNotPt_V4: + return Hexagon::STw_GP_cPt_V4; + + case Hexagon::STrid_GP_cPt_V4: + return Hexagon::STrid_GP_cNotPt_V4; + case Hexagon::STrid_GP_cNotPt_V4: + return Hexagon::STrid_GP_cPt_V4; + + case Hexagon::STrib_GP_cPt_V4: + return Hexagon::STrib_GP_cNotPt_V4; + case Hexagon::STrib_GP_cNotPt_V4: + return Hexagon::STrib_GP_cPt_V4; + + case Hexagon::STrih_GP_cPt_V4: + return Hexagon::STrih_GP_cNotPt_V4; + case Hexagon::STrih_GP_cNotPt_V4: + return Hexagon::STrih_GP_cPt_V4; + + case Hexagon::STriw_GP_cPt_V4: + return Hexagon::STriw_GP_cNotPt_V4; + case Hexagon::STriw_GP_cNotPt_V4: + return Hexagon::STriw_GP_cPt_V4; + // Load.
case Hexagon::LDrid_cPt: return Hexagon::LDrid_cNotPt; @@ -1009,9 +1909,6 @@ unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const { return Hexagon::JMP_GTUrrdnNotPnt_nv_V4; case Hexagon::JMP_GTUrrdnNotPnt_nv_V4: return Hexagon::JMP_GTUrrdnPnt_nv_V4; - - default: - llvm_unreachable("Unexpected predicated instruction"); } } @@ -1022,12 +1919,21 @@ getMatchingCondBranchOpcode(int Opc, bool invertPredicate) const { case Hexagon::TFR: return !invertPredicate ? Hexagon::TFR_cPt : Hexagon::TFR_cNotPt; + case Hexagon::TFRI_f: + return !invertPredicate ? Hexagon::TFRI_cPt_f : + Hexagon::TFRI_cNotPt_f; case Hexagon::TFRI: return !invertPredicate ? Hexagon::TFRI_cPt : Hexagon::TFRI_cNotPt; case Hexagon::JMP: return !invertPredicate ? Hexagon::JMP_c : Hexagon::JMP_cNot; + case Hexagon::JMP_EQrrPt_nv_V4: + return !invertPredicate ? Hexagon::JMP_EQrrPt_nv_V4 : + Hexagon::JMP_EQrrNotPt_nv_V4; + case Hexagon::JMP_EQriPt_nv_V4: + return !invertPredicate ? Hexagon::JMP_EQriPt_nv_V4 : + Hexagon::JMP_EQriNotPt_nv_V4; case Hexagon::ADD_ri: return !invertPredicate ? Hexagon::ADD_ri_cPt : Hexagon::ADD_ri_cNotPt; @@ -1121,6 +2027,46 @@ getMatchingCondBranchOpcode(int Opc, bool invertPredicate) const { case Hexagon::LDriw_indexed_shl_V4: return !invertPredicate ? Hexagon::LDriw_indexed_shl_cPt_V4 : Hexagon::LDriw_indexed_shl_cNotPt_V4; + + // V4 Load from global address + case Hexagon::LDrid_GP_V4: + return !invertPredicate ? Hexagon::LDrid_GP_cPt_V4 : + Hexagon::LDrid_GP_cNotPt_V4; + case Hexagon::LDrib_GP_V4: + return !invertPredicate ? Hexagon::LDrib_GP_cPt_V4 : + Hexagon::LDrib_GP_cNotPt_V4; + case Hexagon::LDriub_GP_V4: + return !invertPredicate ? Hexagon::LDriub_GP_cPt_V4 : + Hexagon::LDriub_GP_cNotPt_V4; + case Hexagon::LDrih_GP_V4: + return !invertPredicate ? Hexagon::LDrih_GP_cPt_V4 : + Hexagon::LDrih_GP_cNotPt_V4; + case Hexagon::LDriuh_GP_V4: + return !invertPredicate ? Hexagon::LDriuh_GP_cPt_V4 : + Hexagon::LDriuh_GP_cNotPt_V4; + case Hexagon::LDriw_GP_V4: + return !invertPredicate ? Hexagon::LDriw_GP_cPt_V4 : + Hexagon::LDriw_GP_cNotPt_V4; + + case Hexagon::LDd_GP_V4: + return !invertPredicate ? Hexagon::LDd_GP_cPt_V4 : + Hexagon::LDd_GP_cNotPt_V4; + case Hexagon::LDb_GP_V4: + return !invertPredicate ? Hexagon::LDb_GP_cPt_V4 : + Hexagon::LDb_GP_cNotPt_V4; + case Hexagon::LDub_GP_V4: + return !invertPredicate ? Hexagon::LDub_GP_cPt_V4 : + Hexagon::LDub_GP_cNotPt_V4; + case Hexagon::LDh_GP_V4: + return !invertPredicate ? Hexagon::LDh_GP_cPt_V4 : + Hexagon::LDh_GP_cNotPt_V4; + case Hexagon::LDuh_GP_V4: + return !invertPredicate ? Hexagon::LDuh_GP_cPt_V4 : + Hexagon::LDuh_GP_cNotPt_V4; + case Hexagon::LDw_GP_V4: + return !invertPredicate ? Hexagon::LDw_GP_cPt_V4 : + Hexagon::LDw_GP_cNotPt_V4; + // Byte. case Hexagon::POST_STbri: return !invertPredicate ? Hexagon::POST_STbri_cPt : @@ -1182,6 +2128,34 @@ getMatchingCondBranchOpcode(int Opc, bool invertPredicate) const { case Hexagon::STrid_indexed_shl_V4: return !invertPredicate ? Hexagon::STrid_indexed_shl_cPt_V4 : Hexagon::STrid_indexed_shl_cNotPt_V4; + + // V4 Store to global address + case Hexagon::STrid_GP_V4: + return !invertPredicate ? Hexagon::STrid_GP_cPt_V4 : + Hexagon::STrid_GP_cNotPt_V4; + case Hexagon::STrib_GP_V4: + return !invertPredicate ? Hexagon::STrib_GP_cPt_V4 : + Hexagon::STrib_GP_cNotPt_V4; + case Hexagon::STrih_GP_V4: + return !invertPredicate ? Hexagon::STrih_GP_cPt_V4 : + Hexagon::STrih_GP_cNotPt_V4; + case Hexagon::STriw_GP_V4: + return !invertPredicate ? 
Hexagon::STriw_GP_cPt_V4 : + Hexagon::STriw_GP_cNotPt_V4; + + case Hexagon::STd_GP_V4: + return !invertPredicate ? Hexagon::STd_GP_cPt_V4 : + Hexagon::STd_GP_cNotPt_V4; + case Hexagon::STb_GP_V4: + return !invertPredicate ? Hexagon::STb_GP_cPt_V4 : + Hexagon::STb_GP_cNotPt_V4; + case Hexagon::STh_GP_V4: + return !invertPredicate ? Hexagon::STh_GP_cPt_V4 : + Hexagon::STh_GP_cNotPt_V4; + case Hexagon::STw_GP_V4: + return !invertPredicate ? Hexagon::STw_GP_cPt_V4 : + Hexagon::STw_GP_cNotPt_V4; + // Load. case Hexagon::LDrid: return !invertPredicate ? Hexagon::LDrid_cPt : @@ -1201,9 +2175,6 @@ getMatchingCondBranchOpcode(int Opc, bool invertPredicate) const { case Hexagon::LDriub: return !invertPredicate ? Hexagon::LDriub_cPt : Hexagon::LDriub_cNotPt; - case Hexagon::LDriubit: - return !invertPredicate ? Hexagon::LDriub_cPt : - Hexagon::LDriub_cNotPt; // Load Indexed. case Hexagon::LDrid_indexed: return !invertPredicate ? Hexagon::LDrid_indexed_cPt : @@ -1297,7 +2268,7 @@ PredicateInstruction(MachineInstr *MI, bool HexagonInstrInfo:: isProfitableToIfCvt(MachineBasicBlock &MBB, - unsigned NumCyles, + unsigned NumCycles, unsigned ExtraPredCycles, const BranchProbability &Probability) const { return true; @@ -1323,7 +2294,6 @@ bool HexagonInstrInfo::isPredicated(const MachineInstr *MI) const { return ((F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask); } - bool HexagonInstrInfo::DefinesPredicate(MachineInstr *MI, std::vector<MachineOperand> &Pred) const { @@ -1331,7 +2301,7 @@ HexagonInstrInfo::DefinesPredicate(MachineInstr *MI, MachineOperand MO = MI->getOperand(oper); if (MO.isReg() && MO.isDef()) { const TargetRegisterClass* RC = RI.getMinimalPhysRegClass(MO.getReg()); - if (RC == Hexagon::PredRegsRegisterClass) { + if (RC == &Hexagon::PredRegsRegClass) { Pred.push_back(MO); return true; } @@ -1373,6 +2343,7 @@ isProfitableToDupForIfCvt(MachineBasicBlock &MBB,unsigned NumInstrs, bool HexagonInstrInfo::isDeallocRet(const MachineInstr *MI) const { switch (MI->getOpcode()) { + default: return false; case Hexagon::DEALLOC_RET_V4 : case Hexagon::DEALLOC_RET_cPt_V4 : case Hexagon::DEALLOC_RET_cNotPt_V4 : @@ -1382,7 +2353,6 @@ bool HexagonInstrInfo::isDeallocRet(const MachineInstr *MI) const { case Hexagon::DEALLOC_RET_cNotdnPt_V4 : return true; } - return false; } @@ -1396,13 +2366,17 @@ isValidOffset(const int Opcode, const int Offset) const { switch(Opcode) { case Hexagon::LDriw: + case Hexagon::LDriw_f: case Hexagon::STriw: + case Hexagon::STriw_f: assert((Offset % 4 == 0) && "Offset has incorrect alignment"); return (Offset >= Hexagon_MEMW_OFFSET_MIN) && (Offset <= Hexagon_MEMW_OFFSET_MAX); case Hexagon::LDrid: + case Hexagon::LDrid_f: case Hexagon::STrid: + case Hexagon::STrid_f: assert((Offset % 8 == 0) && "Offset has incorrect alignment"); return (Offset >= Hexagon_MEMD_OFFSET_MIN) && (Offset <= Hexagon_MEMD_OFFSET_MAX); @@ -1410,7 +2384,6 @@ isValidOffset(const int Opcode, const int Offset) const { case Hexagon::LDrih: case Hexagon::LDriuh: case Hexagon::STrih: - case Hexagon::LDrih_ae: assert((Offset % 2 == 0) && "Offset has incorrect alignment"); return (Offset >= Hexagon_MEMH_OFFSET_MIN) && (Offset <= Hexagon_MEMH_OFFSET_MAX); @@ -1418,9 +2391,6 @@ isValidOffset(const int Opcode, const int Offset) const { case Hexagon::LDrib: case Hexagon::STrib: case Hexagon::LDriub: - case Hexagon::LDriubit: - case Hexagon::LDrib_ae: - case Hexagon::LDriub_ae: return (Offset >= Hexagon_MEMB_OFFSET_MIN) && (Offset <= Hexagon_MEMB_OFFSET_MAX); @@ -1528,6 +2498,7 @@ bool HexagonInstrInfo:: 
isMemOp(const MachineInstr *MI) const { switch (MI->getOpcode()) { + default: return false; case Hexagon::MEMw_ADDSUBi_indexed_MEM_V4 : case Hexagon::MEMw_ADDi_indexed_MEM_V4 : case Hexagon::MEMw_SUBi_indexed_MEM_V4 : @@ -1570,28 +2541,59 @@ isMemOp(const MachineInstr *MI) const { case Hexagon::MEMb_SUBr_MEM_V4 : case Hexagon::MEMb_ANDr_MEM_V4 : case Hexagon::MEMb_ORr_MEM_V4 : - return true; + return true; } - return false; } bool HexagonInstrInfo:: isSpillPredRegOp(const MachineInstr *MI) const { - switch (MI->getOpcode()) - { + switch (MI->getOpcode()) { + default: return false; case Hexagon::STriw_pred : case Hexagon::LDriw_pred : - return true; + return true; + } +} + +bool HexagonInstrInfo::isNewValueJumpCandidate(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: return false; + case Hexagon::CMPEQrr: + case Hexagon::CMPEQri: + case Hexagon::CMPLTrr: + case Hexagon::CMPGTrr: + case Hexagon::CMPGTri: + case Hexagon::CMPLTUrr: + case Hexagon::CMPGTUrr: + case Hexagon::CMPGTUri: + case Hexagon::CMPGEri: + case Hexagon::CMPGEUri: + return true; } - return false; } +bool HexagonInstrInfo:: +isConditionalTransfer (const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: return false; + case Hexagon::TFR_cPt: + case Hexagon::TFR_cNotPt: + case Hexagon::TFRI_cPt: + case Hexagon::TFRI_cNotPt: + case Hexagon::TFR_cdnPt: + case Hexagon::TFR_cdnNotPt: + case Hexagon::TFRI_cdnPt: + case Hexagon::TFRI_cdnNotPt: + return true; + } +} bool HexagonInstrInfo::isConditionalALU32 (const MachineInstr* MI) const { const HexagonRegisterInfo& QRI = getRegisterInfo(); switch (MI->getOpcode()) { + default: return false; case Hexagon::ADD_ri_cPt: case Hexagon::ADD_ri_cNotPt: case Hexagon::ADD_rr_cPt: @@ -1619,19 +2621,16 @@ bool HexagonInstrInfo::isConditionalALU32 (const MachineInstr* MI) const { case Hexagon::ZXTB_cNotPt_V4: case Hexagon::ZXTH_cPt_V4: case Hexagon::ZXTH_cNotPt_V4: - return QRI.Subtarget.getHexagonArchVersion() == HexagonSubtarget::V4; - - default: - return false; + return QRI.Subtarget.hasV4TOps(); } } - bool HexagonInstrInfo:: isConditionalLoad (const MachineInstr* MI) const { const HexagonRegisterInfo& QRI = getRegisterInfo(); switch (MI->getOpcode()) { + default: return false; case Hexagon::LDrid_cPt : case Hexagon::LDrid_cNotPt : case Hexagon::LDrid_indexed_cPt : @@ -1669,7 +2668,7 @@ isConditionalLoad (const MachineInstr* MI) const { case Hexagon::POST_LDriuh_cNotPt : case Hexagon::POST_LDriub_cPt : case Hexagon::POST_LDriub_cNotPt : - return QRI.Subtarget.getHexagonArchVersion() == HexagonSubtarget::V4; + return QRI.Subtarget.hasV4TOps(); case Hexagon::LDrid_indexed_cPt_V4 : case Hexagon::LDrid_indexed_cNotPt_V4 : case Hexagon::LDrid_indexed_shl_cPt_V4 : case Hexagon::LDrid_indexed_shl_cNotPt_V4 : @@ -1694,12 +2693,136 @@ isConditionalLoad (const MachineInstr* MI) const { case Hexagon::LDriw_indexed_cNotPt_V4 : case Hexagon::LDriw_indexed_shl_cPt_V4 : case Hexagon::LDriw_indexed_shl_cNotPt_V4 : - return QRI.Subtarget.getHexagonArchVersion() == HexagonSubtarget::V4; - default: - return false; + return QRI.Subtarget.hasV4TOps(); } }
+// Returns true if an instruction is a conditional store.
+//
+// Note: It doesn't include conditional new-value stores as they can't be
+// converted to .new predicate.
+//
+//          p.new NV store [ if(p0.new)memw(R0+#0)=R2.new ]
+//                ^           ^
+//               /             \ (not OK. it will cause new-value store to be
+//              /               X conditional on p0.new while R2 producer is
+//             /                 \ on p0)
+//            /                   \
+//     p.new store                 p.old NV store
+// [if(p0.new)memw(R0+#0)=R2]  [if(p0)memw(R0+#0)=R2.new]
+//            ^                  ^
+//             \                /
+//              \              /
+//               \            /
+//                 p.old store
+//             [if (p0)memw(R0+#0)=R2]
+//
+// The above diagram shows the steps involved in the conversion of a
+// predicated store instruction to its .new predicated new-value form.
+//
+// The following set of instructions further explains the scenario where a
+// conditional new-value store becomes invalid when promoted to .new predicate
+// form.
+//
+// { 1) if (p0) r0 = add(r1, r2)
+//   2) p0 = cmp.eq(r3, #0) }
+//
+// 3) if (p0) memb(r1+#0) = r0 --> this instruction can't be grouped with
+// the first two instructions because in instr 1, r0 is conditional on the old
+// value of p0, but its use in instr 3 is conditional on p0 modified by
+// instr 2, which is not valid for new-value stores.
+bool HexagonInstrInfo:: +isConditionalStore (const MachineInstr* MI) const { + const HexagonRegisterInfo& QRI = getRegisterInfo(); + switch (MI->getOpcode()) + { + default: return false; + case Hexagon::STrib_imm_cPt_V4 : + case Hexagon::STrib_imm_cNotPt_V4 : + case Hexagon::STrib_indexed_shl_cPt_V4 : + case Hexagon::STrib_indexed_shl_cNotPt_V4 : + case Hexagon::STrib_cPt : + case Hexagon::STrib_cNotPt : + case Hexagon::POST_STbri_cPt : + case Hexagon::POST_STbri_cNotPt : + case Hexagon::STrid_indexed_cPt : + case Hexagon::STrid_indexed_cNotPt : + case Hexagon::STrid_indexed_shl_cPt_V4 : + case Hexagon::POST_STdri_cPt : + case Hexagon::POST_STdri_cNotPt : + case Hexagon::STrih_cPt : + case Hexagon::STrih_cNotPt : + case Hexagon::STrih_indexed_cPt : + case Hexagon::STrih_indexed_cNotPt : + case Hexagon::STrih_imm_cPt_V4 : + case Hexagon::STrih_imm_cNotPt_V4 : + case Hexagon::STrih_indexed_shl_cPt_V4 : + case Hexagon::STrih_indexed_shl_cNotPt_V4 : + case Hexagon::POST_SThri_cPt : + case Hexagon::POST_SThri_cNotPt : + case Hexagon::STriw_cPt : + case Hexagon::STriw_cNotPt : + case Hexagon::STriw_indexed_cPt : + case Hexagon::STriw_indexed_cNotPt : + case Hexagon::STriw_imm_cPt_V4 : + case Hexagon::STriw_imm_cNotPt_V4 : + case Hexagon::STriw_indexed_shl_cPt_V4 : + case Hexagon::STriw_indexed_shl_cNotPt_V4 : + case Hexagon::POST_STwri_cPt : + case Hexagon::POST_STwri_cNotPt : + return QRI.Subtarget.hasV4TOps(); + + // V4 global address store before promoting to dot-new. + case Hexagon::STrid_GP_cPt_V4 : + case Hexagon::STrid_GP_cNotPt_V4 : + case Hexagon::STrib_GP_cPt_V4 : + case Hexagon::STrib_GP_cNotPt_V4 : + case Hexagon::STrih_GP_cPt_V4 : + case Hexagon::STrih_GP_cNotPt_V4 : + case Hexagon::STriw_GP_cPt_V4 : + case Hexagon::STriw_GP_cNotPt_V4 : + case Hexagon::STd_GP_cPt_V4 : + case Hexagon::STd_GP_cNotPt_V4 : + case Hexagon::STb_GP_cPt_V4 : + case Hexagon::STb_GP_cNotPt_V4 : + case Hexagon::STh_GP_cPt_V4 : + case Hexagon::STh_GP_cNotPt_V4 : + case Hexagon::STw_GP_cPt_V4 : + case Hexagon::STw_GP_cNotPt_V4 : + return QRI.Subtarget.hasV4TOps(); + + // Predicated new value stores (i.e. if (p0) memw(..)=r0.new) are excluded + // from the "Conditional Store" list because a predicated new-value store + // would NOT be promoted to a double dot-new store; see the diagram below. + // This function returns true for those stores that are predicated but not + // yet promoted to predicate dot-new instructions.
+  //
+  //                          +---------------------+
+  //                    /-----| if (p0) memw(..)=r0 |---------\
+  //                   ||     +---------------------+        ||
+  //          promote  ||       /\       /\                  ||  promote
+  //                   ||      /||\     /||\                 ||
+  //                  \||/    demote    ||                  \||/
+  //                   \/       ||      ||                   \/
+  //       +-------------------------+  ||  +-------------------------+
+  //       | if (p0.new) memw(..)=r0 |  ||  | if (p0) memw(..)=r0.new |
+  //       +-------------------------+  ||  +-------------------------+
+  //                        ||          ||         ||
+  //                        ||        demote      \||/
+  //                      promote       ||         \/   NOT possible
+  //                        ||          ||        /\
+  //                       \||/         ||       /||\
+  //                        \/          ||        ||
+  //                      +-----------------------------+
+  //                      | if (p0.new) memw(..)=r0.new |
+  //                      +-----------------------------+
+  //                           Double Dot New Store
+  //
+  }
+}
+
+
+
 DFAPacketizer *HexagonInstrInfo:: CreateTargetScheduleState(const TargetMachine *TM, const ScheduleDAG *DAG) const { diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h index 7306870..2bb53f8 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/lib/Target/Hexagon/HexagonInstrInfo.h @@ -112,7 +112,7 @@ public: PredicateInstruction(MachineInstr *MI, const SmallVectorImpl<MachineOperand> &Cond) const; - virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, + virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, unsigned ExtraPredCycles, const BranchProbability &Probability) const; @@ -160,10 +160,21 @@ public: bool isS8_Immediate(const int value) const; bool isS6_Immediate(const int value) const; + bool isSaveCalleeSavedRegsCall(const MachineInstr* MI) const; + bool isConditionalTransfer(const MachineInstr* MI) const; bool isConditionalALU32 (const MachineInstr* MI) const; bool isConditionalLoad (const MachineInstr* MI) const; + bool isConditionalStore(const MachineInstr* MI) const; bool isDeallocRet(const MachineInstr *MI) const; unsigned getInvertedPredicatedOpcode(const int Opc) const; + bool isExtendable(const MachineInstr* MI) const; + bool isExtended(const MachineInstr* MI) const; + bool isPostIncrement(const MachineInstr* MI) const; + bool isNewValueStore(const MachineInstr* MI) const; + bool isNewValueJump(const MachineInstr* MI) const; + bool isNewValueJumpCandidate(const MachineInstr *MI) const; + unsigned getImmExtForm(const MachineInstr* MI) const; + unsigned getNormalBranchForm(const MachineInstr* MI) const; private: int getMatchingCondBranchOpcode(int Opc, bool sense) const; diff --git a/lib/Target/Hexagon/HexagonInstrInfo.td b/lib/Target/Hexagon/HexagonInstrInfo.td index b563ac3..c7be5ce 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.td +++ b/lib/Target/Hexagon/HexagonInstrInfo.td @@ -25,7 +25,10 @@ def HasV3TOnly : Predicate<"Subtarget.hasV3TOpsOnly()">; def NoV3T : Predicate<"!Subtarget.hasV3TOps()">; def HasV4T : Predicate<"Subtarget.hasV4TOps()">; def NoV4T : Predicate<"!Subtarget.hasV4TOps()">; +def HasV5T : Predicate<"Subtarget.hasV5TOps()">; +def NoV5T : Predicate<"!Subtarget.hasV5TOps()">; def UseMEMOP : Predicate<"Subtarget.useMemOps()">; +def IEEERndNearV5T : Predicate<"Subtarget.modeIEEERndNear()">; // Addressing modes.
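The batch of predicates added to HexagonInstrInfo.h above (isExtendable, isExtended, isNewValueStore, isNewValueJump, isNewValueJumpCandidate, isConditionalTransfer, and so on) are classification hooks for later VLIW passes such as the packetizer. A hedged sketch of one way a consumer might combine them follows; the helper name and its policy are assumptions, and only the HexagonInstrInfo queries come from the header hunk above.

    #include "HexagonInstrInfo.h"
    #include "llvm/CodeGen/MachineInstr.h"

    using namespace llvm;

    // Hypothetical helper: could this compare/branch pair be fused into a
    // new-value jump? The policy shown is illustrative only.
    static bool mayFormNewValueJump(const HexagonInstrInfo *TII,
                                    const MachineInstr *Cmp,
                                    const MachineInstr *Br) {
      // The compare must be one of the fusable opcodes (CMPEQrr etc.).
      if (!TII->isNewValueJumpCandidate(Cmp))
        return false;
      // Skip branches that are already immediate-extended new-value jumps;
      // those carry a constant extender and need separate handling.
      if (TII->isNewValueJump(Br) && TII->isExtended(Br))
        return false;
      return true;
    }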
def ADDRrr : ComplexPattern<i32, 2, "SelectADDRrr", [], []>; @@ -84,10 +87,12 @@ def symbolLo32 : Operand<i32> { multiclass ALU32_rr_ri<string OpcStr, SDNode OpNode> { def rr : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c), !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")), - [(set IntRegs:$dst, (OpNode IntRegs:$b, IntRegs:$c))]>; + [(set (i32 IntRegs:$dst), (OpNode (i32 IntRegs:$b), + (i32 IntRegs:$c)))]>; def ri : ALU32_ri<(outs IntRegs:$dst), (ins s10Imm:$b, IntRegs:$c), !strconcat("$dst = ", !strconcat(OpcStr, "(#$b, $c)")), - [(set IntRegs:$dst, (OpNode s10Imm:$b, IntRegs:$c))]>; + [(set (i32 IntRegs:$dst), (OpNode s10Imm:$b, + (i32 IntRegs:$c)))]>; } // Multi-class for compare ops. @@ -95,111 +100,114 @@ let isCompare = 1 in { multiclass CMP64_rr<string OpcStr, PatFrag OpNode> { def rr : ALU64_rr<(outs PredRegs:$dst), (ins DoubleRegs:$b, DoubleRegs:$c), !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")), - [(set PredRegs:$dst, (OpNode DoubleRegs:$b, DoubleRegs:$c))]>; + [(set (i1 PredRegs:$dst), + (OpNode (i64 DoubleRegs:$b), (i64 DoubleRegs:$c)))]>; } multiclass CMP32_rr<string OpcStr, PatFrag OpNode> { def rr : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$b, IntRegs:$c), !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")), - [(set PredRegs:$dst, (OpNode IntRegs:$b, IntRegs:$c))]>; + [(set (i1 PredRegs:$dst), + (OpNode (i32 IntRegs:$b), (i32 IntRegs:$c)))]>; } multiclass CMP32_rr_ri_s10<string OpcStr, PatFrag OpNode> { def rr : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$b, IntRegs:$c), !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")), - [(set PredRegs:$dst, (OpNode IntRegs:$b, IntRegs:$c))]>; + [(set (i1 PredRegs:$dst), + (OpNode (i32 IntRegs:$b), (i32 IntRegs:$c)))]>; def ri : ALU32_ri<(outs PredRegs:$dst), (ins IntRegs:$b, s10Imm:$c), !strconcat("$dst = ", !strconcat(OpcStr, "($b, #$c)")), - [(set PredRegs:$dst, (OpNode IntRegs:$b, s10ImmPred:$c))]>; + [(set (i1 PredRegs:$dst), + (OpNode (i32 IntRegs:$b), s10ImmPred:$c))]>; } multiclass CMP32_rr_ri_u9<string OpcStr, PatFrag OpNode> { def rr : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$b, IntRegs:$c), !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")), - [(set PredRegs:$dst, (OpNode IntRegs:$b, IntRegs:$c))]>; + [(set (i1 PredRegs:$dst), + (OpNode (i32 IntRegs:$b), (i32 IntRegs:$c)))]>; def ri : ALU32_ri<(outs PredRegs:$dst), (ins IntRegs:$b, u9Imm:$c), !strconcat("$dst = ", !strconcat(OpcStr, "($b, #$c)")), - [(set PredRegs:$dst, (OpNode IntRegs:$b, u9ImmPred:$c))]>; + [(set (i1 PredRegs:$dst), + (OpNode (i32 IntRegs:$b), u9ImmPred:$c))]>; } -multiclass CMP32_ri_u9<string OpcStr, PatFrag OpNode> { - def ri : ALU32_ri<(outs PredRegs:$dst), (ins IntRegs:$b, u9Imm:$c), +multiclass CMP32_ri_u8<string OpcStr, PatFrag OpNode> { + def ri : ALU32_ri<(outs PredRegs:$dst), (ins IntRegs:$b, u8Imm:$c), !strconcat("$dst = ", !strconcat(OpcStr, "($b, #$c)")), - [(set PredRegs:$dst, (OpNode IntRegs:$b, u9ImmPred:$c))]>; + [(set (i1 PredRegs:$dst), (OpNode (i32 IntRegs:$b), + u8ImmPred:$c))]>; } multiclass CMP32_ri_s8<string OpcStr, PatFrag OpNode> { def ri : ALU32_ri<(outs PredRegs:$dst), (ins IntRegs:$b, s8Imm:$c), !strconcat("$dst = ", !strconcat(OpcStr, "($b, #$c)")), - [(set PredRegs:$dst, (OpNode IntRegs:$b, s8ImmPred:$c))]>; + [(set (i1 PredRegs:$dst), (OpNode (i32 IntRegs:$b), + s8ImmPred:$c))]>; } } //===----------------------------------------------------------------------===// -// Instructions -//===----------------------------------------------------------------------===// - 
-//===----------------------------------------------------------------------===// -// http://qualnet.qualcomm.com/~erich/v1/htmldocs/index.html -// http://qualnet.qualcomm.com/~erich/v2/htmldocs/index.html -// http://qualnet.qualcomm.com/~erich/v3/htmldocs/index.html -// http://qualnet.qualcomm.com/~erich/v4/htmldocs/index.html -// http://qualnet.qualcomm.com/~erich/v5/htmldocs/index.html -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// // ALU32/ALU + //===----------------------------------------------------------------------===// // Add. -let isPredicable = 1 in +let isCommutable = 1, isPredicable = 1 in def ADD_rr : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = add($src1, $src2)", - [(set IntRegs:$dst, (add IntRegs:$src1, IntRegs:$src2))]>; + [(set (i32 IntRegs:$dst), (add (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))]>; let isPredicable = 1 in def ADD_ri : ALU32_ri<(outs IntRegs:$dst), (ins IntRegs:$src1, s16Imm:$src2), "$dst = add($src1, #$src2)", - [(set IntRegs:$dst, (add IntRegs:$src1, s16ImmPred:$src2))]>; + [(set (i32 IntRegs:$dst), (add (i32 IntRegs:$src1), + s16ImmPred:$src2))]>; // Logical operations. let isPredicable = 1 in def XOR_rr : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = xor($src1, $src2)", - [(set IntRegs:$dst, (xor IntRegs:$src1, IntRegs:$src2))]>; + [(set (i32 IntRegs:$dst), (xor (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))]>; -let isPredicable = 1 in +let isCommutable = 1, isPredicable = 1 in def AND_rr : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = and($src1, $src2)", - [(set IntRegs:$dst, (and IntRegs:$src1, IntRegs:$src2))]>; + [(set (i32 IntRegs:$dst), (and (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))]>; def OR_ri : ALU32_ri<(outs IntRegs:$dst), - (ins IntRegs:$src1, s8Imm:$src2), + (ins IntRegs:$src1, s10Imm:$src2), "$dst = or($src1, #$src2)", - [(set IntRegs:$dst, (or IntRegs:$src1, s8ImmPred:$src2))]>; + [(set (i32 IntRegs:$dst), (or (i32 IntRegs:$src1), + s10ImmPred:$src2))]>; def NOT_rr : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1), "$dst = not($src1)", - [(set IntRegs:$dst, (not IntRegs:$src1))]>; + [(set (i32 IntRegs:$dst), (not (i32 IntRegs:$src1)))]>; def AND_ri : ALU32_ri<(outs IntRegs:$dst), (ins IntRegs:$src1, s10Imm:$src2), "$dst = and($src1, #$src2)", - [(set IntRegs:$dst, (and IntRegs:$src1, s10ImmPred:$src2))]>; + [(set (i32 IntRegs:$dst), (and (i32 IntRegs:$src1), + s10ImmPred:$src2))]>; -let isPredicable = 1 in +let isCommutable = 1, isPredicable = 1 in def OR_rr : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = or($src1, $src2)", - [(set IntRegs:$dst, (or IntRegs:$src1, IntRegs:$src2))]>; + [(set (i32 IntRegs:$dst), (or (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))]>; // Negate. def NEG : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1), "$dst = neg($src1)", - [(set IntRegs:$dst, (ineg IntRegs:$src1))]>; + [(set (i32 IntRegs:$dst), (ineg (i32 IntRegs:$src1)))]>; // Nop. 
let neverHasSideEffects = 1 in def NOP : ALU32_rr<(outs), (ins), @@ -211,13 +219,20 @@ let isPredicable = 1 in def SUB_rr : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = sub($src1, $src2)", - [(set IntRegs:$dst, (sub IntRegs:$src1, IntRegs:$src2))]>; + [(set (i32 IntRegs:$dst), (sub (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))]>; + +// Rd32=sub(#s10,Rs32) +def SUB_ri : ALU32_ri<(outs IntRegs:$dst), + (ins s10Imm:$src1, IntRegs:$src2), + "$dst = sub(#$src1, $src2)", + [(set IntRegs:$dst, (sub s10ImmPred:$src1, IntRegs:$src2))]>; // Transfer immediate. -let isReMaterializable = 1, isPredicable = 1 in +let isMoveImm = 1, isReMaterializable = 1, isPredicable = 1 in def TFRI : ALU32_ri<(outs IntRegs:$dst), (ins s16Imm:$src1), "$dst = #$src1", - [(set IntRegs:$dst, s16ImmPred:$src1)]>; + [(set (i32 IntRegs:$dst), s16ImmPred:$src1)]>; // Transfer register. let neverHasSideEffects = 1, isPredicable = 1 in @@ -225,6 +240,11 @@ def TFR : ALU32_ri<(outs IntRegs:$dst), (ins IntRegs:$src1), "$dst = $src1", []>; +let neverHasSideEffects = 1, isPredicable = 1 in +def TFR64 : ALU32_ri<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1), + "$dst = $src1", + []>; + // Transfer control register. let neverHasSideEffects = 1 in def TFCR : CRInst<(outs CRRegs:$dst), (ins IntRegs:$src1), @@ -246,6 +266,12 @@ def COMBINE_rr : ALU32_rr<(outs DoubleRegs:$dst), "$dst = combine($src1, $src2)", []>; +let neverHasSideEffects = 1 in +def COMBINE_ii : ALU32_ii<(outs DoubleRegs:$dst), + (ins s8Imm:$src1, s8Imm:$src2), + "$dst = combine(#$src1, #$src2)", + []>; + // Mux. def VMUX_prr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins PredRegs:$src1, DoubleRegs:$src2, @@ -256,48 +282,52 @@ def VMUX_prr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins PredRegs:$src1, def MUX_rr : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "$dst = mux($src1, $src2, $src3)", - [(set IntRegs:$dst, (select PredRegs:$src1, IntRegs:$src2, - IntRegs:$src3))]>; + [(set (i32 IntRegs:$dst), (i32 (select (i1 PredRegs:$src1), + (i32 IntRegs:$src2), + (i32 IntRegs:$src3))))]>; def MUX_ir : ALU32_ir<(outs IntRegs:$dst), (ins PredRegs:$src1, s8Imm:$src2, IntRegs:$src3), "$dst = mux($src1, #$src2, $src3)", - [(set IntRegs:$dst, (select PredRegs:$src1, - s8ImmPred:$src2, IntRegs:$src3))]>; + [(set (i32 IntRegs:$dst), (i32 (select (i1 PredRegs:$src1), + s8ImmPred:$src2, + (i32 IntRegs:$src3))))]>; def MUX_ri : ALU32_ri<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, s8Imm:$src3), "$dst = mux($src1, $src2, #$src3)", - [(set IntRegs:$dst, (select PredRegs:$src1, IntRegs:$src2, - s8ImmPred:$src3))]>; + [(set (i32 IntRegs:$dst), (i32 (select (i1 PredRegs:$src1), + (i32 IntRegs:$src2), + s8ImmPred:$src3)))]>; def MUX_ii : ALU32_ii<(outs IntRegs:$dst), (ins PredRegs:$src1, s8Imm:$src2, s8Imm:$src3), "$dst = mux($src1, #$src2, #$src3)", - [(set IntRegs:$dst, (select PredRegs:$src1, s8ImmPred:$src2, - s8ImmPred:$src3))]>; + [(set (i32 IntRegs:$dst), (i32 (select (i1 PredRegs:$src1), + s8ImmPred:$src2, + s8ImmPred:$src3)))]>; // Shift halfword. let isPredicable = 1 in def ASLH : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1), "$dst = aslh($src1)", - [(set IntRegs:$dst, (shl 16, IntRegs:$src1))]>; + [(set (i32 IntRegs:$dst), (shl 16, (i32 IntRegs:$src1)))]>; let isPredicable = 1 in def ASRH : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1), "$dst = asrh($src1)", - [(set IntRegs:$dst, (sra 16, IntRegs:$src1))]>; + [(set (i32 IntRegs:$dst), (sra 16, (i32 IntRegs:$src1)))]>; // Sign extend. 
let isPredicable = 1 in def SXTB : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1), "$dst = sxtb($src1)", - [(set IntRegs:$dst, (sext_inreg IntRegs:$src1, i8))]>; + [(set (i32 IntRegs:$dst), (sext_inreg (i32 IntRegs:$src1), i8))]>; let isPredicable = 1 in def SXTH : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1), "$dst = sxth($src1)", - [(set IntRegs:$dst, (sext_inreg IntRegs:$src1, i16))]>; + [(set (i32 IntRegs:$dst), (sext_inreg (i32 IntRegs:$src1), i16))]>; // Zero extend. let isPredicable = 1, neverHasSideEffects = 1 in @@ -321,25 +351,25 @@ def ZXTH : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1), // Conditional add. let neverHasSideEffects = 1, isPredicated = 1 in def ADD_ri_cPt : ALU32_ri<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, s16Imm:$src3), + (ins PredRegs:$src1, IntRegs:$src2, s8Imm:$src3), "if ($src1) $dst = add($src2, #$src3)", []>; let neverHasSideEffects = 1, isPredicated = 1 in def ADD_ri_cNotPt : ALU32_ri<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, s16Imm:$src3), + (ins PredRegs:$src1, IntRegs:$src2, s8Imm:$src3), "if (!$src1) $dst = add($src2, #$src3)", []>; let neverHasSideEffects = 1, isPredicated = 1 in def ADD_ri_cdnPt : ALU32_ri<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, s16Imm:$src3), + (ins PredRegs:$src1, IntRegs:$src2, s8Imm:$src3), "if ($src1.new) $dst = add($src2, #$src3)", []>; let neverHasSideEffects = 1, isPredicated = 1 in def ADD_ri_cdnNotPt : ALU32_ri<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, s16Imm:$src3), + (ins PredRegs:$src1, IntRegs:$src2, s8Imm:$src3), "if (!$src1.new) $dst = add($src2, #$src3)", []>; @@ -497,7 +527,6 @@ def SUB_rr_cdnNotPt : ALU32_rr<(outs IntRegs:$dst), // Conditional transfer. - let neverHasSideEffects = 1, isPredicated = 1 in def TFR_cPt : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2), "if ($src1) $dst = $src2", @@ -510,6 +539,18 @@ def TFR_cNotPt : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1, []>; let neverHasSideEffects = 1, isPredicated = 1 in +def TFR64_cPt : ALU32_rr<(outs DoubleRegs:$dst), (ins PredRegs:$src1, + DoubleRegs:$src2), + "if ($src1) $dst = $src2", + []>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def TFR64_cNotPt : ALU32_rr<(outs DoubleRegs:$dst), (ins PredRegs:$src1, + DoubleRegs:$src2), + "if (!$src1) $dst = $src2", + []>; + +let neverHasSideEffects = 1, isPredicated = 1 in def TFRI_cPt : ALU32_ri<(outs IntRegs:$dst), (ins PredRegs:$src1, s12Imm:$src2), "if ($src1) $dst = #$src2", []>; @@ -548,25 +589,14 @@ def TFRI_cdnNotPt : ALU32_ri<(outs IntRegs:$dst), (ins PredRegs:$src1, defm CMPGTU : CMP32_rr_ri_u9<"cmp.gtu", setugt>; defm CMPGT : CMP32_rr_ri_s10<"cmp.gt", setgt>; defm CMPLT : CMP32_rr<"cmp.lt", setlt>; +defm CMPLTU : CMP32_rr<"cmp.ltu", setult>; defm CMPEQ : CMP32_rr_ri_s10<"cmp.eq", seteq>; defm CMPGE : CMP32_ri_s8<"cmp.ge", setge>; -defm CMPGEU : CMP32_ri_u9<"cmp.geu", setuge>; +defm CMPGEU : CMP32_ri_u8<"cmp.geu", setuge>; //===----------------------------------------------------------------------===// // ALU32/PRED - //===----------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// -// ALU32/VH + -//===----------------------------------------------------------------------===// -// Vector add halfwords - -// Vector averagehalfwords - -// Vector subtract halfwords -//===----------------------------------------------------------------------===// -// ALU32/VH - 
-//===----------------------------------------------------------------------===// - //===----------------------------------------------------------------------===// // ALU64/ALU + @@ -575,8 +605,8 @@ defm CMPGEU : CMP32_ri_u9<"cmp.geu", setuge>; def ADD64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), "$dst = add($src1, $src2)", - [(set DoubleRegs:$dst, (add DoubleRegs:$src1, - DoubleRegs:$src2))]>; + [(set (i64 DoubleRegs:$dst), (add (i64 DoubleRegs:$src1), + (i64 DoubleRegs:$src2)))]>; // Add halfword. @@ -589,40 +619,93 @@ defm CMPGTU64 : CMP64_rr<"cmp.gtu", setugt>; def AND_rr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), "$dst = and($src1, $src2)", - [(set DoubleRegs:$dst, (and DoubleRegs:$src1, - DoubleRegs:$src2))]>; + [(set (i64 DoubleRegs:$dst), (and (i64 DoubleRegs:$src1), + (i64 DoubleRegs:$src2)))]>; def OR_rr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), "$dst = or($src1, $src2)", - [(set DoubleRegs:$dst, (or DoubleRegs:$src1, DoubleRegs:$src2))]>; + [(set (i64 DoubleRegs:$dst), (or (i64 DoubleRegs:$src1), + (i64 DoubleRegs:$src2)))]>; def XOR_rr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), "$dst = xor($src1, $src2)", - [(set DoubleRegs:$dst, (xor DoubleRegs:$src1, - DoubleRegs:$src2))]>; + [(set (i64 DoubleRegs:$dst), (xor (i64 DoubleRegs:$src1), + (i64 DoubleRegs:$src2)))]>; // Maximum. def MAXw_rr : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = max($src2, $src1)", - [(set IntRegs:$dst, (select (i1 (setlt IntRegs:$src2, - IntRegs:$src1)), - IntRegs:$src1, IntRegs:$src2))]>; + [(set (i32 IntRegs:$dst), + (i32 (select (i1 (setlt (i32 IntRegs:$src2), + (i32 IntRegs:$src1))), + (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>; + +def MAXUw_rr : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + "$dst = maxu($src2, $src1)", + [(set (i32 IntRegs:$dst), + (i32 (select (i1 (setult (i32 IntRegs:$src2), + (i32 IntRegs:$src1))), + (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>; + +def MAXd_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, + DoubleRegs:$src2), + "$dst = max($src2, $src1)", + [(set (i64 DoubleRegs:$dst), + (i64 (select (i1 (setlt (i64 DoubleRegs:$src2), + (i64 DoubleRegs:$src1))), + (i64 DoubleRegs:$src1), + (i64 DoubleRegs:$src2))))]>; + +def MAXUd_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, + DoubleRegs:$src2), + "$dst = maxu($src2, $src1)", + [(set (i64 DoubleRegs:$dst), + (i64 (select (i1 (setult (i64 DoubleRegs:$src2), + (i64 DoubleRegs:$src1))), + (i64 DoubleRegs:$src1), + (i64 DoubleRegs:$src2))))]>; // Minimum. 
def MINw_rr : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = min($src2, $src1)", - [(set IntRegs:$dst, (select (i1 (setgt IntRegs:$src2, - IntRegs:$src1)), - IntRegs:$src1, IntRegs:$src2))]>; + [(set (i32 IntRegs:$dst), + (i32 (select (i1 (setgt (i32 IntRegs:$src2), + (i32 IntRegs:$src1))), + (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>; + +def MINUw_rr : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + "$dst = minu($src2, $src1)", + [(set (i32 IntRegs:$dst), + (i32 (select (i1 (setugt (i32 IntRegs:$src2), + (i32 IntRegs:$src1))), + (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>; + +def MINd_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, + DoubleRegs:$src2), + "$dst = min($src2, $src1)", + [(set (i64 DoubleRegs:$dst), + (i64 (select (i1 (setgt (i64 DoubleRegs:$src2), + (i64 DoubleRegs:$src1))), + (i64 DoubleRegs:$src1), + (i64 DoubleRegs:$src2))))]>; + +def MINUd_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, + DoubleRegs:$src2), + "$dst = minu($src2, $src1)", + [(set (i64 DoubleRegs:$dst), + (i64 (select (i1 (setugt (i64 DoubleRegs:$src2), + (i64 DoubleRegs:$src1))), + (i64 DoubleRegs:$src1), + (i64 DoubleRegs:$src2))))]>; // Subtract. def SUB64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), "$dst = sub($src1, $src2)", - [(set DoubleRegs:$dst, (sub DoubleRegs:$src1, - DoubleRegs:$src2))]>; + [(set (i64 DoubleRegs:$dst), (sub (i64 DoubleRegs:$src1), + (i64 DoubleRegs:$src2)))]>; // Subtract halfword. @@ -652,30 +735,6 @@ def TFR_64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1), //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// ALU64/VB + -//===----------------------------------------------------------------------===// -// -//===----------------------------------------------------------------------===// -// ALU64/VB - -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// ALU64/VH + -//===----------------------------------------------------------------------===// -// -//===----------------------------------------------------------------------===// -// ALU64/VH - -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// ALU64/VW + -//===----------------------------------------------------------------------===// -// -//===----------------------------------------------------------------------===// -// ALU64/VW - -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// // CR + //===----------------------------------------------------------------------===// // Logical reductions on predicates. @@ -687,7 +746,8 @@ def TFR_64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1), // Logical operations on predicates. 
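[The MAX*/MIN* definitions above lower max/min to a compare-plus-select, with the operand order swapped relative to the assembly string. A small C++ sketch of that equivalence, not taken from the patch:]

    #include <cstdint>

    // MAXw_rr's pattern: select(setlt($src2, $src1), $src1, $src2).
    static int32_t max_like_MAXw_rr(int32_t src1, int32_t src2) {
      return (src2 < src1) ? src1 : src2;  // == max(src1, src2)
    }

    // MINw_rr's pattern: select(setgt($src2, $src1), $src1, $src2).
    static int32_t min_like_MINw_rr(int32_t src1, int32_t src2) {
      return (src2 > src1) ? src1 : src2;  // == min(src1, src2)
    }

The unsigned variants (MAXUw_rr, MINUw_rr) and the 64-bit forms are the same shape with setult/setugt and unsigned or wider types.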
def AND_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1, PredRegs:$src2), "$dst = and($src1, $src2)", - [(set PredRegs:$dst, (and PredRegs:$src1, PredRegs:$src2))]>; + [(set (i1 PredRegs:$dst), (and (i1 PredRegs:$src1), + (i1 PredRegs:$src2)))]>; let neverHasSideEffects = 1 in def AND_pnotp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1, @@ -726,15 +786,17 @@ def MASK_p : SInst<(outs DoubleRegs:$dst), (ins PredRegs:$src1), def NOT_p : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1), "$dst = not($src1)", - [(set PredRegs:$dst, (not PredRegs:$src1))]>; + [(set (i1 PredRegs:$dst), (not (i1 PredRegs:$src1)))]>; def OR_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1, PredRegs:$src2), "$dst = or($src1, $src2)", - [(set PredRegs:$dst, (or PredRegs:$src1, PredRegs:$src2))]>; + [(set (i1 PredRegs:$dst), (or (i1 PredRegs:$src1), + (i1 PredRegs:$src2)))]>; def XOR_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1, PredRegs:$src2), "$dst = xor($src1, $src2)", - [(set PredRegs:$dst, (xor PredRegs:$src1, PredRegs:$src2))]>; + [(set (i1 PredRegs:$dst), (xor (i1 PredRegs:$src1), + (i1 PredRegs:$src2)))]>; // User control register transfer. @@ -760,7 +822,7 @@ let isBranch = 1, isTerminator=1, Defs = [PC], def JMP_c : JInst< (outs), (ins PredRegs:$src, brtarget:$offset), "if ($src) jump $offset", - [(brcond PredRegs:$src, bb:$offset)]>; + [(brcond (i1 PredRegs:$src), bb:$offset)]>; } // if (!p0) jump @@ -826,7 +888,7 @@ def retflag : SDNode<"HexagonISD::RET_FLAG", SDTNone, [SDNPHasChain, SDNPOptInGlue]>; // Jump to address from register. -let isReturn = 1, isTerminator = 1, isBarrier = 1, +let isPredicable =1, isReturn = 1, isTerminator = 1, isBarrier = 1, Defs = [PC], Uses = [R31] in { def JMPR: JRInst<(outs), (ins), "jumpr r31", @@ -834,7 +896,7 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1, } // Jump to address from register. -let isReturn = 1, isTerminator = 1, isBarrier = 1, +let isReturn = 1, isTerminator = 1, isBarrier = 1, isPredicated = 1, Defs = [PC], Uses = [R31] in { def JMPR_cPt: JRInst<(outs), (ins PredRegs:$src1), "if ($src1) jumpr r31", @@ -842,7 +904,7 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1, } // Jump to address from register. 
-let isReturn = 1, isTerminator = 1, isBarrier = 1, +let isReturn = 1, isTerminator = 1, isBarrier = 1, isPredicated = 1, Defs = [PC], Uses = [R31] in { def JMPR_cNotPt: JRInst<(outs), (ins PredRegs:$src1), "if (!$src1) jumpr r31", @@ -865,96 +927,99 @@ let isPredicable = 1 in def LDrid : LDInst<(outs DoubleRegs:$dst), (ins MEMri:$addr), "$dst = memd($addr)", - [(set DoubleRegs:$dst, (load ADDRriS11_3:$addr))]>; + [(set (i64 DoubleRegs:$dst), (i64 (load ADDRriS11_3:$addr)))]>; let isPredicable = 1, AddedComplexity = 20 in def LDrid_indexed : LDInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, s11_3Imm:$offset), - "$dst=memd($src1+#$offset)", - [(set DoubleRegs:$dst, (load (add IntRegs:$src1, - s11_3ImmPred:$offset)))]>; + "$dst = memd($src1+#$offset)", + [(set (i64 DoubleRegs:$dst), + (i64 (load (add (i32 IntRegs:$src1), + s11_3ImmPred:$offset))))]>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrid_GP : LDInst<(outs DoubleRegs:$dst), +let neverHasSideEffects = 1 in +def LDrid_GP : LDInst2<(outs DoubleRegs:$dst), (ins globaladdress:$global, u16Imm:$offset), - "$dst=memd(#$global+$offset)", - []>; + "$dst = memd(#$global+$offset)", + []>, + Requires<[NoV4T]>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDd_GP : LDInst<(outs DoubleRegs:$dst), +let neverHasSideEffects = 1 in +def LDd_GP : LDInst2<(outs DoubleRegs:$dst), (ins globaladdress:$global), - "$dst=memd(#$global)", - []>; + "$dst = memd(#$global)", + []>, + Requires<[NoV4T]>; -let isPredicable = 1, mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDrid : LDInstPI<(outs DoubleRegs:$dst, IntRegs:$dst2), +let isPredicable = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDrid : LDInst2PI<(outs DoubleRegs:$dst, IntRegs:$dst2), (ins IntRegs:$src1, s4Imm:$offset), "$dst = memd($src1++#$offset)", [], "$src1 = $dst2">; // Load doubleword conditionally. 
-let mayLoad = 1, neverHasSideEffects = 1 in -def LDrid_cPt : LDInst<(outs DoubleRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrid_cPt : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if ($src1) $dst = memd($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrid_cNotPt : LDInst<(outs DoubleRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrid_cNotPt : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if (!$src1) $dst = memd($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrid_indexed_cPt : LDInst<(outs DoubleRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrid_indexed_cPt : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3), - "if ($src1) $dst=memd($src2+#$src3)", + "if ($src1) $dst = memd($src2+#$src3)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrid_indexed_cNotPt : LDInst<(outs DoubleRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrid_indexed_cNotPt : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3), - "if (!$src1) $dst=memd($src2+#$src3)", + "if (!$src1) $dst = memd($src2+#$src3)", []>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDrid_cPt : LDInstPI<(outs DoubleRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDrid_cPt : LDInst2PI<(outs DoubleRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_3Imm:$src3), "if ($src1) $dst1 = memd($src2++#$src3)", [], "$src2 = $dst2">; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDrid_cNotPt : LDInstPI<(outs DoubleRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDrid_cNotPt : LDInst2PI<(outs DoubleRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_3Imm:$src3), "if (!$src1) $dst1 = memd($src2++#$src3)", [], "$src2 = $dst2">; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrid_cdnPt : LDInst<(outs DoubleRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrid_cdnPt : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if ($src1.new) $dst = memd($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrid_cdnNotPt : LDInst<(outs DoubleRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrid_cdnNotPt : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if (!$src1.new) $dst = memd($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrid_indexed_cdnPt : LDInst<(outs DoubleRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrid_indexed_cdnPt : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3), - "if ($src1.new) $dst=memd($src2+#$src3)", + "if ($src1.new) $dst = memd($src2+#$src3)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrid_indexed_cdnNotPt : LDInst<(outs DoubleRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrid_indexed_cdnNotPt : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3), - "if (!$src1.new) $dst=memd($src2+#$src3)", + "if (!$src1.new) $dst = memd($src2+#$src3)", []>; @@ -963,114 +1028,113 @@ let isPredicable = 1 in def LDrib : LDInst<(outs IntRegs:$dst), (ins MEMri:$addr), "$dst = memb($addr)", - [(set IntRegs:$dst, (sextloadi8 ADDRriS11_0:$addr))]>; + [(set (i32 IntRegs:$dst), (i32 (sextloadi8 ADDRriS11_0:$addr)))]>; 
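[The hunk below drops the dedicated any-extend load instructions (LDrib_ae and friends) in favor of "def : Pat" mappings onto the ordinary extending loads. A short C++ sketch, not from the patch, of why that is sound:]

    #include <cstdint>

    // An any-extending i8 load leaves bits 8..31 unspecified, so the
    // sign-extended value that LDrib produces is one legal realization.
    static int32_t anyext_load_i8(const int8_t *p) {
      return static_cast<int32_t>(*p);  // low 8 bits exact; upper bits free
    }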
-def LDrib_ae : LDInst<(outs IntRegs:$dst), - (ins MEMri:$addr), - "$dst = memb($addr)", - [(set IntRegs:$dst, (extloadi8 ADDRriS11_0:$addr))]>; +// Load byte any-extend. +def : Pat < (i32 (extloadi8 ADDRriS11_0:$addr)), + (i32 (LDrib ADDRriS11_0:$addr)) >; // Indexed load byte. let isPredicable = 1, AddedComplexity = 20 in def LDrib_indexed : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s11_0Imm:$offset), - "$dst=memb($src1+#$offset)", - [(set IntRegs:$dst, (sextloadi8 (add IntRegs:$src1, - s11_0ImmPred:$offset)))]>; - + "$dst = memb($src1+#$offset)", + [(set (i32 IntRegs:$dst), + (i32 (sextloadi8 (add (i32 IntRegs:$src1), + s11_0ImmPred:$offset))))]>; // Indexed load byte any-extend. let AddedComplexity = 20 in -def LDrib_ae_indexed : LDInst<(outs IntRegs:$dst), - (ins IntRegs:$src1, s11_0Imm:$offset), - "$dst=memb($src1+#$offset)", - [(set IntRegs:$dst, (extloadi8 (add IntRegs:$src1, - s11_0ImmPred:$offset)))]>; +def : Pat < (i32 (extloadi8 (add IntRegs:$src1, s11_0ImmPred:$offset))), + (i32 (LDrib_indexed IntRegs:$src1, s11_0ImmPred:$offset)) >; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrib_GP : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1 in +def LDrib_GP : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global, u16Imm:$offset), - "$dst=memb(#$global+$offset)", - []>; + "$dst = memb(#$global+$offset)", + []>, + Requires<[NoV4T]>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDb_GP : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1 in +def LDb_GP : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global), - "$dst=memb(#$global)", - []>; + "$dst = memb(#$global)", + []>, + Requires<[NoV4T]>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDub_GP : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1 in +def LDub_GP : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global), - "$dst=memub(#$global)", - []>; + "$dst = memub(#$global)", + []>, + Requires<[NoV4T]>; -let isPredicable = 1, mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDrib : LDInstPI<(outs IntRegs:$dst, IntRegs:$dst2), +let isPredicable = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDrib : LDInst2PI<(outs IntRegs:$dst, IntRegs:$dst2), (ins IntRegs:$src1, s4Imm:$offset), "$dst = memb($src1++#$offset)", [], "$src1 = $dst2">; // Load byte conditionally. 
-let mayLoad = 1, neverHasSideEffects = 1 in -def LDrib_cPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrib_cPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if ($src1) $dst = memb($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrib_cNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrib_cNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if (!$src1) $dst = memb($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrib_indexed_cPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrib_indexed_cPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3), "if ($src1) $dst = memb($src2+#$src3)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrib_indexed_cNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrib_indexed_cNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3), "if (!$src1) $dst = memb($src2+#$src3)", []>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDrib_cPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDrib_cPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3), "if ($src1) $dst1 = memb($src2++#$src3)", [], "$src2 = $dst2">; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDrib_cNotPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDrib_cNotPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3), "if (!$src1) $dst1 = memb($src2++#$src3)", [], "$src2 = $dst2">; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrib_cdnPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrib_cdnPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if ($src1.new) $dst = memb($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrib_cdnNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrib_cdnNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if (!$src1.new) $dst = memb($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrib_indexed_cdnPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrib_indexed_cdnPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3), "if ($src1.new) $dst = memb($src2+#$src3)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrib_indexed_cdnNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrib_indexed_cdnNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3), "if (!$src1.new) $dst = memb($src2+#$src3)", []>; @@ -1081,112 +1145,110 @@ let isPredicable = 1 in def LDrih : LDInst<(outs IntRegs:$dst), (ins MEMri:$addr), "$dst = memh($addr)", - [(set IntRegs:$dst, (sextloadi16 ADDRriS11_1:$addr))]>; + [(set (i32 IntRegs:$dst), (i32 (sextloadi16 ADDRriS11_1:$addr)))]>; let isPredicable = 1, AddedComplexity = 20 in def LDrih_indexed : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s11_1Imm:$offset), - "$dst=memh($src1+#$offset)", - [(set IntRegs:$dst, (sextloadi16 (add IntRegs:$src1, - 
s11_1ImmPred:$offset)))] >; + "$dst = memh($src1+#$offset)", + [(set (i32 IntRegs:$dst), + (i32 (sextloadi16 (add (i32 IntRegs:$src1), + s11_1ImmPred:$offset))))]>; -def LDrih_ae : LDInst<(outs IntRegs:$dst), - (ins MEMri:$addr), - "$dst = memh($addr)", - [(set IntRegs:$dst, (extloadi16 ADDRriS11_1:$addr))]>; +def : Pat < (i32 (extloadi16 ADDRriS11_1:$addr)), + (i32 (LDrih ADDRriS11_1:$addr))>; let AddedComplexity = 20 in -def LDrih_ae_indexed : LDInst<(outs IntRegs:$dst), - (ins IntRegs:$src1, s11_1Imm:$offset), - "$dst=memh($src1+#$offset)", - [(set IntRegs:$dst, (extloadi16 (add IntRegs:$src1, - s11_1ImmPred:$offset)))] >; +def : Pat < (i32 (extloadi16 (add IntRegs:$src1, s11_1ImmPred:$offset))), + (i32 (LDrih_indexed IntRegs:$src1, s11_1ImmPred:$offset)) >; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrih_GP : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1 in +def LDrih_GP : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global, u16Imm:$offset), - "$dst=memh(#$global+$offset)", - []>; + "$dst = memh(#$global+$offset)", + []>, + Requires<[NoV4T]>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDh_GP : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1 in +def LDh_GP : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global), - "$dst=memh(#$global)", - []>; + "$dst = memh(#$global)", + []>, + Requires<[NoV4T]>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDuh_GP : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1 in +def LDuh_GP : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global), - "$dst=memuh(#$global)", - []>; - + "$dst = memuh(#$global)", + []>, + Requires<[NoV4T]>; -let isPredicable = 1, mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDrih : LDInstPI<(outs IntRegs:$dst, IntRegs:$dst2), +let isPredicable = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDrih : LDInst2PI<(outs IntRegs:$dst, IntRegs:$dst2), (ins IntRegs:$src1, s4Imm:$offset), "$dst = memh($src1++#$offset)", [], "$src1 = $dst2">; // Load halfword conditionally. 
-let mayLoad = 1, neverHasSideEffects = 1 in -def LDrih_cPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrih_cPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if ($src1) $dst = memh($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrih_cNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrih_cNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if (!$src1) $dst = memh($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrih_indexed_cPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrih_indexed_cPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3), "if ($src1) $dst = memh($src2+#$src3)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrih_indexed_cNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrih_indexed_cNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3), "if (!$src1) $dst = memh($src2+#$src3)", []>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDrih_cPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDrih_cPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3), "if ($src1) $dst1 = memh($src2++#$src3)", [], "$src2 = $dst2">; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDrih_cNotPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDrih_cNotPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3), "if (!$src1) $dst1 = memh($src2++#$src3)", [], "$src2 = $dst2">; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrih_cdnPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrih_cdnPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if ($src1.new) $dst = memh($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrih_cdnNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrih_cdnNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if (!$src1.new) $dst = memh($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrih_indexed_cdnPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrih_indexed_cdnPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3), "if ($src1.new) $dst = memh($src2+#$src3)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDrih_indexed_cdnNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrih_indexed_cdnNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3), "if (!$src1.new) $dst = memh($src2+#$src3)", []>; @@ -1196,113 +1258,96 @@ let isPredicable = 1 in def LDriub : LDInst<(outs IntRegs:$dst), (ins MEMri:$addr), "$dst = memub($addr)", - [(set IntRegs:$dst, (zextloadi8 ADDRriS11_0:$addr))]>; + [(set (i32 IntRegs:$dst), (i32 (zextloadi8 ADDRriS11_0:$addr)))]>; -let isPredicable = 1 in -def LDriubit : LDInst<(outs IntRegs:$dst), - (ins MEMri:$addr), - "$dst = memub($addr)", - [(set IntRegs:$dst, (zextloadi1 ADDRriS11_0:$addr))]>; +def : Pat < (i32 (zextloadi1 ADDRriS11_0:$addr)), + (i32 (LDriub 
ADDRriS11_0:$addr))>; let isPredicable = 1, AddedComplexity = 20 in def LDriub_indexed : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s11_0Imm:$offset), - "$dst=memub($src1+#$offset)", - [(set IntRegs:$dst, (zextloadi8 (add IntRegs:$src1, - s11_0ImmPred:$offset)))]>; - -let AddedComplexity = 20 in -def LDriubit_indexed : LDInst<(outs IntRegs:$dst), - (ins IntRegs:$src1, s11_0Imm:$offset), - "$dst=memub($src1+#$offset)", - [(set IntRegs:$dst, (zextloadi1 (add IntRegs:$src1, - s11_0ImmPred:$offset)))]>; - -def LDriub_ae : LDInst<(outs IntRegs:$dst), - (ins MEMri:$addr), - "$dst = memub($addr)", - [(set IntRegs:$dst, (extloadi8 ADDRriS11_0:$addr))]>; - + "$dst = memub($src1+#$offset)", + [(set (i32 IntRegs:$dst), + (i32 (zextloadi8 (add (i32 IntRegs:$src1), + s11_0ImmPred:$offset))))]>; let AddedComplexity = 20 in -def LDriub_ae_indexed : LDInst<(outs IntRegs:$dst), - (ins IntRegs:$src1, s11_0Imm:$offset), - "$dst=memub($src1+#$offset)", - [(set IntRegs:$dst, (extloadi8 (add IntRegs:$src1, - s11_0ImmPred:$offset)))]>; +def : Pat < (i32 (zextloadi1 (add IntRegs:$src1, s11_0ImmPred:$offset))), + (i32 (LDriub_indexed IntRegs:$src1, s11_0ImmPred:$offset))>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriub_GP : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1 in +def LDriub_GP : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global, u16Imm:$offset), - "$dst=memub(#$global+$offset)", - []>; + "$dst = memub(#$global+$offset)", + []>, + Requires<[NoV4T]>; -let isPredicable = 1, mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDriub : LDInstPI<(outs IntRegs:$dst, IntRegs:$dst2), +let isPredicable = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDriub : LDInst2PI<(outs IntRegs:$dst, IntRegs:$dst2), (ins IntRegs:$src1, s4Imm:$offset), "$dst = memub($src1++#$offset)", [], "$src1 = $dst2">; // Load unsigned byte conditionally. 
-let mayLoad = 1, neverHasSideEffects = 1 in -def LDriub_cPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriub_cPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if ($src1) $dst = memub($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriub_cNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriub_cNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if (!$src1) $dst = memub($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriub_indexed_cPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriub_indexed_cPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3), "if ($src1) $dst = memub($src2+#$src3)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriub_indexed_cNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriub_indexed_cNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3), "if (!$src1) $dst = memub($src2+#$src3)", []>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDriub_cPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDriub_cPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3), "if ($src1) $dst1 = memub($src2++#$src3)", [], "$src2 = $dst2">; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDriub_cNotPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDriub_cNotPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3), "if (!$src1) $dst1 = memub($src2++#$src3)", [], "$src2 = $dst2">; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriub_cdnPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriub_cdnPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if ($src1.new) $dst = memub($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriub_cdnNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriub_cdnNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if (!$src1.new) $dst = memub($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriub_indexed_cdnPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriub_indexed_cdnPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3), "if ($src1.new) $dst = memub($src2+#$src3)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriub_indexed_cdnNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriub_indexed_cdnNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3), "if (!$src1.new) $dst = memub($src2+#$src3)", []>; @@ -1312,102 +1357,90 @@ let isPredicable = 1 in def LDriuh : LDInst<(outs IntRegs:$dst), (ins MEMri:$addr), "$dst = memuh($addr)", - [(set IntRegs:$dst, (zextloadi16 ADDRriS11_1:$addr))]>; + [(set (i32 IntRegs:$dst), (i32 (zextloadi16 ADDRriS11_1:$addr)))]>; // Indexed load unsigned halfword. 
let isPredicable = 1, AddedComplexity = 20 in def LDriuh_indexed : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s11_1Imm:$offset), - "$dst=memuh($src1+#$offset)", - [(set IntRegs:$dst, (zextloadi16 (add IntRegs:$src1, - s11_1ImmPred:$offset)))]>; + "$dst = memuh($src1+#$offset)", + [(set (i32 IntRegs:$dst), + (i32 (zextloadi16 (add (i32 IntRegs:$src1), + s11_1ImmPred:$offset))))]>; -def LDriuh_ae : LDInst<(outs IntRegs:$dst), - (ins MEMri:$addr), - "$dst = memuh($addr)", - [(set IntRegs:$dst, (extloadi16 ADDRriS11_1:$addr))]>; - - -// Indexed load unsigned halfword any-extend. -let AddedComplexity = 20 in -def LDriuh_ae_indexed : LDInst<(outs IntRegs:$dst), - (ins IntRegs:$src1, s11_1Imm:$offset), - "$dst=memuh($src1+#$offset)", - [(set IntRegs:$dst, (extloadi16 (add IntRegs:$src1, - s11_1ImmPred:$offset)))] >; - -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriuh_GP : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1 in +def LDriuh_GP : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global, u16Imm:$offset), - "$dst=memuh(#$global+$offset)", - []>; + "$dst = memuh(#$global+$offset)", + []>, + Requires<[NoV4T]>; -let isPredicable = 1, mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDriuh : LDInstPI<(outs IntRegs:$dst, IntRegs:$dst2), +let isPredicable = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDriuh : LDInst2PI<(outs IntRegs:$dst, IntRegs:$dst2), (ins IntRegs:$src1, s4Imm:$offset), "$dst = memuh($src1++#$offset)", [], "$src1 = $dst2">; // Load unsigned halfword conditionally. -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriuh_cPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriuh_cPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if ($src1) $dst = memuh($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriuh_cNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriuh_cNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if (!$src1) $dst = memuh($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriuh_indexed_cPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriuh_indexed_cPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3), "if ($src1) $dst = memuh($src2+#$src3)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriuh_indexed_cNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriuh_indexed_cNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3), "if (!$src1) $dst = memuh($src2+#$src3)", []>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDriuh_cPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDriuh_cPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3), "if ($src1) $dst1 = memuh($src2++#$src3)", [], "$src2 = $dst2">; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDriuh_cNotPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDriuh_cNotPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3), "if (!$src1) $dst1 = memuh($src2++#$src3)", [], "$src2 = $dst2">; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriuh_cdnPt : LDInst<(outs 
IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriuh_cdnPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if ($src1.new) $dst = memuh($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriuh_cdnNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriuh_cdnNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if (!$src1.new) $dst = memuh($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriuh_indexed_cdnPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriuh_indexed_cdnPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3), "if ($src1.new) $dst = memuh($src2+#$src3)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriuh_indexed_cdnNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriuh_indexed_cdnNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3), "if (!$src1.new) $dst = memuh($src2+#$src3)", []>; @@ -1417,10 +1450,10 @@ def LDriuh_indexed_cdnNotPt : LDInst<(outs IntRegs:$dst), let isPredicable = 1 in def LDriw : LDInst<(outs IntRegs:$dst), (ins MEMri:$addr), "$dst = memw($addr)", - [(set IntRegs:$dst, (load ADDRriS11_2:$addr))]>; + [(set IntRegs:$dst, (i32 (load ADDRriS11_2:$addr)))]>; // Load predicate. -let mayLoad = 1, Defs = [R10,R11] in +let Defs = [R10,R11,D5], neverHasSideEffects = 1 in def LDriw_pred : LDInst<(outs PredRegs:$dst), (ins MEMri:$addr), "Error; should not emit", @@ -1430,24 +1463,26 @@ def LDriw_pred : LDInst<(outs PredRegs:$dst), let isPredicable = 1, AddedComplexity = 20 in def LDriw_indexed : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s11_2Imm:$offset), - "$dst=memw($src1+#$offset)", - [(set IntRegs:$dst, (load (add IntRegs:$src1, - s11_2ImmPred:$offset)))]>; + "$dst = memw($src1+#$offset)", + [(set IntRegs:$dst, (i32 (load (add IntRegs:$src1, + s11_2ImmPred:$offset))))]>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriw_GP : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1 in +def LDriw_GP : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global, u16Imm:$offset), - "$dst=memw(#$global+$offset)", - []>; + "$dst = memw(#$global+$offset)", + []>, + Requires<[NoV4T]>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDw_GP : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1 in +def LDw_GP : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global), - "$dst=memw(#$global)", - []>; + "$dst = memw(#$global)", + []>, + Requires<[NoV4T]>; -let isPredicable = 1, mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDriw : LDInstPI<(outs IntRegs:$dst, IntRegs:$dst2), +let isPredicable = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in +def POST_LDriw : LDInst2PI<(outs IntRegs:$dst, IntRegs:$dst2), (ins IntRegs:$src1, s4Imm:$offset), "$dst = memw($src1++#$offset)", [], @@ -1455,71 +1490,71 @@ def POST_LDriw : LDInstPI<(outs IntRegs:$dst, IntRegs:$dst2), // Load word conditionally. 
-let mayLoad = 1, neverHasSideEffects = 1 in -def LDriw_cPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriw_cPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if ($src1) $dst = memw($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriw_cNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriw_cNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if (!$src1) $dst = memw($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriw_indexed_cPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriw_indexed_cPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3), - "if ($src1) $dst=memw($src2+#$src3)", + "if ($src1) $dst = memw($src2+#$src3)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriw_indexed_cNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriw_indexed_cNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3), - "if (!$src1) $dst=memw($src2+#$src3)", + "if (!$src1) $dst = memw($src2+#$src3)", []>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDriw_cPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDriw_cPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_2Imm:$src3), "if ($src1) $dst1 = memw($src2++#$src3)", [], "$src2 = $dst2">; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1 in -def POST_LDriw_cNotPt : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDriw_cNotPt : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_2Imm:$src3), "if (!$src1) $dst1 = memw($src2++#$src3)", [], "$src2 = $dst2">; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriw_cdnPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriw_cdnPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if ($src1.new) $dst = memw($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriw_cdnNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriw_cdnNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, MEMri:$addr), "if (!$src1.new) $dst = memw($addr)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriw_indexed_cdnPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriw_indexed_cdnPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3), - "if ($src1.new) $dst=memw($src2+#$src3)", + "if ($src1.new) $dst = memw($src2+#$src3)", []>; -let mayLoad = 1, neverHasSideEffects = 1 in -def LDriw_indexed_cdnNotPt : LDInst<(outs IntRegs:$dst), +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriw_indexed_cdnNotPt : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3), - "if (!$src1.new) $dst=memw($src2+#$src3)", + "if (!$src1.new) $dst = memw($src2+#$src3)", []>; // Deallocate stack frame. 
let Defs = [R29, R30, R31], Uses = [R29], neverHasSideEffects = 1 in { - def DEALLOCFRAME : LDInst<(outs), (ins i32imm:$amt1), + def DEALLOCFRAME : LDInst2<(outs), (ins i32imm:$amt1), "deallocframe", []>; } @@ -1550,13 +1585,14 @@ let Defs = [R29, R30, R31], Uses = [R29], neverHasSideEffects = 1 in { // Rd=+mpyi(Rs,#u8) def MPYI_riu : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u8Imm:$src2), "$dst =+ mpyi($src1, #$src2)", - [(set IntRegs:$dst, (mul IntRegs:$src1, u8ImmPred:$src2))]>; + [(set (i32 IntRegs:$dst), (mul (i32 IntRegs:$src1), + u8ImmPred:$src2))]>; // Rd=-mpyi(Rs,#u8) def MPYI_rin : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, n8Imm:$src2), "$dst =- mpyi($src1, #$src2)", - [(set IntRegs:$dst, - (mul IntRegs:$src1, n8ImmPred:$src2))]>; + [(set (i32 IntRegs:$dst), (mul (i32 IntRegs:$src1), + n8ImmPred:$src2))]>; // Rd=mpyi(Rs,#m9) // s9 is NOT the same as m9 - but it works.. so far. // Assembler maps to either Rd=+mpyi(Rs,#u8) or Rd=-mpyi(Rs,#u8) // depending on the value of m9. See Arch Spec. def MPYI_ri : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s9Imm:$src2), "$dst = mpyi($src1, #$src2)", - [(set IntRegs:$dst, (mul IntRegs:$src1, s9ImmPred:$src2))]>; + [(set (i32 IntRegs:$dst), (mul (i32 IntRegs:$src1), + s9ImmPred:$src2))]>; // Rd=mpyi(Rs,Rt) def MPYI : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = mpyi($src1, $src2)", - [(set IntRegs:$dst, (mul IntRegs:$src1, IntRegs:$src2))]>; + [(set (i32 IntRegs:$dst), (mul (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))]>; // Rx+=mpyi(Rs,#u8) def MPYI_acc_ri : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, u8Imm:$src3), "$dst += mpyi($src2, #$src3)", - [(set IntRegs:$dst, - (add (mul IntRegs:$src2, u8ImmPred:$src3), IntRegs:$src1))], + [(set (i32 IntRegs:$dst), + (add (mul (i32 IntRegs:$src2), u8ImmPred:$src3), + (i32 IntRegs:$src1)))], "$src1 = $dst">; // Rx+=mpyi(Rs,Rt) def MPYI_acc_rr : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), "$dst += mpyi($src2, $src3)", - [(set IntRegs:$dst, - (add (mul IntRegs:$src2, IntRegs:$src3), IntRegs:$src1))], + [(set (i32 IntRegs:$dst), + (add (mul (i32 IntRegs:$src2), (i32 IntRegs:$src3)), + (i32 IntRegs:$src1)))], "$src1 = $dst">; // Rx-=mpyi(Rs,#u8) def MPYI_sub_ri : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, u8Imm:$src3), "$dst -= mpyi($src2, #$src3)", - [(set IntRegs:$dst, - (sub IntRegs:$src1, (mul IntRegs:$src2, u8ImmPred:$src3)))], + [(set (i32 IntRegs:$dst), + (sub (i32 IntRegs:$src1), (mul (i32 IntRegs:$src2), + u8ImmPred:$src3)))], "$src1 = $dst">; // Multiply and use upper result. @@ -1601,27 +1642,30 @@ def MPYI_sub_ri : MInst_acc<(outs IntRegs:$dst), // Rd=mpy(Rs,Rt) def MPY : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = mpy($src1, $src2)", - [(set IntRegs:$dst, (mulhs IntRegs:$src1, IntRegs:$src2))]>; + [(set (i32 IntRegs:$dst), (mulhs (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))]>; // Rd=mpy(Rs,Rt):rnd // Rd=mpyu(Rs,Rt) def MPYU : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = mpyu($src1, $src2)", - [(set IntRegs:$dst, (mulhu IntRegs:$src1, IntRegs:$src2))]>; + [(set (i32 IntRegs:$dst), (mulhu (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))]>; // Multiply and use full result.
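[A hypothetical C++ sketch, with invented names rather than LLVM API, of the assembler mapping described by the Rd=mpyi(Rs,#m9) comment above: the signed 9-bit immediate is printed as the u8 form of +mpyi or -mpyi, chosen by its sign.]

    #include <cassert>
    #include <string>

    static std::string mpyiAsmForm(int m9) {
      assert(m9 >= -256 && m9 <= 255 && "m9 is a signed 9-bit value");
      if (m9 < 0)  // maps to MPYI_rin: "$dst =- mpyi($src1, #u8)"
        return "$dst =- mpyi($src1, #" + std::to_string(-m9) + ")";
      // maps to MPYI_riu: "$dst =+ mpyi($src1, #u8)"
      return "$dst =+ mpyi($src1, #" + std::to_string(m9) + ")";
    }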
// Rdd=mpyu(Rs,Rt) def MPYU64 : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = mpyu($src1, $src2)", - [(set DoubleRegs:$dst, (mul (i64 (anyext IntRegs:$src1)), - (i64 (anyext IntRegs:$src2))))]>; + [(set (i64 DoubleRegs:$dst), + (mul (i64 (anyext (i32 IntRegs:$src1))), + (i64 (anyext (i32 IntRegs:$src2)))))]>; // Rdd=mpy(Rs,Rt) def MPY64 : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = mpy($src1, $src2)", - [(set DoubleRegs:$dst, (mul (i64 (sext IntRegs:$src1)), - (i64 (sext IntRegs:$src2))))]>; - + [(set (i64 DoubleRegs:$dst), + (mul (i64 (sext (i32 IntRegs:$src1))), + (i64 (sext (i32 IntRegs:$src2)))))]>; // Multiply and accumulate, use full result. // Rxx[+-]=mpy(Rs,Rt) @@ -1629,18 +1673,20 @@ def MPY64 : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), def MPY64_acc : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), "$dst += mpy($src2, $src3)", - [(set DoubleRegs:$dst, - (add (mul (i64 (sext IntRegs:$src2)), (i64 (sext IntRegs:$src3))), - DoubleRegs:$src1))], + [(set (i64 DoubleRegs:$dst), + (add (mul (i64 (sext (i32 IntRegs:$src2))), + (i64 (sext (i32 IntRegs:$src3)))), + (i64 DoubleRegs:$src1)))], "$src1 = $dst">; // Rxx-=mpy(Rs,Rt) def MPY64_sub : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), "$dst -= mpy($src2, $src3)", - [(set DoubleRegs:$dst, - (sub DoubleRegs:$src1, - (mul (i64 (sext IntRegs:$src2)), (i64 (sext IntRegs:$src3)))))], + [(set (i64 DoubleRegs:$dst), + (sub (i64 DoubleRegs:$src1), + (mul (i64 (sext (i32 IntRegs:$src2))), + (i64 (sext (i32 IntRegs:$src3))))))], "$src1 = $dst">; // Rxx[+-]=mpyu(Rs,Rt) @@ -1648,47 +1694,52 @@ def MPY64_sub : MInst_acc<(outs DoubleRegs:$dst), def MPYU64_acc : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), "$dst += mpyu($src2, $src3)", - [(set DoubleRegs:$dst, (add (mul (i64 (anyext IntRegs:$src2)), - (i64 (anyext IntRegs:$src3))), - DoubleRegs:$src1))],"$src1 = $dst">; + [(set (i64 DoubleRegs:$dst), + (add (mul (i64 (anyext (i32 IntRegs:$src2))), + (i64 (anyext (i32 IntRegs:$src3)))), + (i64 DoubleRegs:$src1)))], "$src1 = $dst">; // Rxx-=mpyu(Rs,Rt) def MPYU64_sub : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), "$dst -= mpyu($src2, $src3)", - [(set DoubleRegs:$dst, - (sub DoubleRegs:$src1, - (mul (i64 (anyext IntRegs:$src2)), - (i64 (anyext IntRegs:$src3)))))], + [(set (i64 DoubleRegs:$dst), + (sub (i64 DoubleRegs:$src1), + (mul (i64 (anyext (i32 IntRegs:$src2))), + (i64 (anyext (i32 IntRegs:$src3))))))], "$src1 = $dst">; def ADDrr_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), "$dst += add($src2, $src3)", - [(set IntRegs:$dst, (add (add IntRegs:$src2, IntRegs:$src3), - IntRegs:$src1))], + [(set (i32 IntRegs:$dst), (add (add (i32 IntRegs:$src2), + (i32 IntRegs:$src3)), + (i32 IntRegs:$src1)))], "$src1 = $dst">; def ADDri_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1, IntRegs:$src2, s8Imm:$src3), "$dst += add($src2, #$src3)", - [(set IntRegs:$dst, (add (add IntRegs:$src2, s8ImmPred:$src3), - IntRegs:$src1))], + [(set (i32 IntRegs:$dst), (add (add (i32 IntRegs:$src2), + s8ImmPred:$src3), + (i32 IntRegs:$src1)))], "$src1 = $dst">; def SUBrr_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), "$dst -= add($src2, $src3)", - [(set IntRegs:$dst, (sub IntRegs:$src1, (add IntRegs:$src2, - IntRegs:$src3)))], + [(set (i32
IntRegs:$dst), + (sub (i32 IntRegs:$src1), (add (i32 IntRegs:$src2), + (i32 IntRegs:$src3))))], "$src1 = $dst">; def SUBri_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1, IntRegs:$src2, s8Imm:$src3), "$dst -= add($src2, #$src3)", - [(set IntRegs:$dst, (sub IntRegs:$src1, - (add IntRegs:$src2, s8ImmPred:$src3)))], + [(set (i32 IntRegs:$dst), (sub (i32 IntRegs:$src1), + (add (i32 IntRegs:$src2), + s8ImmPred:$src3)))], "$src1 = $dst">; //===----------------------------------------------------------------------===// @@ -1731,57 +1782,70 @@ let isPredicable = 1 in def STrid : STInst<(outs), (ins MEMri:$addr, DoubleRegs:$src1), "memd($addr) = $src1", - [(store DoubleRegs:$src1, ADDRriS11_3:$addr)]>; + [(store (i64 DoubleRegs:$src1), ADDRriS11_3:$addr)]>; // Indexed store double word. let AddedComplexity = 10, isPredicable = 1 in def STrid_indexed : STInst<(outs), (ins IntRegs:$src1, s11_3Imm:$src2, DoubleRegs:$src3), "memd($src1+#$src2) = $src3", - [(store DoubleRegs:$src3, - (add IntRegs:$src1, s11_3ImmPred:$src2))]>; + [(store (i64 DoubleRegs:$src3), + (add (i32 IntRegs:$src1), s11_3ImmPred:$src2))]>; -let mayStore = 1, neverHasSideEffects = 1 in -def STrid_GP : STInst<(outs), +let neverHasSideEffects = 1 in +def STrid_GP : STInst2<(outs), (ins globaladdress:$global, u16Imm:$offset, DoubleRegs:$src), "memd(#$global+$offset) = $src", - []>; + []>, + Requires<[NoV4T]>; + +let neverHasSideEffects = 1 in +def STd_GP : STInst2<(outs), + (ins globaladdress:$global, DoubleRegs:$src), + "memd(#$global) = $src", + []>, + Requires<[NoV4T]>; let hasCtrlDep = 1, isPredicable = 1 in def POST_STdri : STInstPI<(outs IntRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2, s4Imm:$offset), "memd($src2++#$offset) = $src1", [(set IntRegs:$dst, - (post_store DoubleRegs:$src1, IntRegs:$src2, s4_3ImmPred:$offset))], + (post_store (i64 DoubleRegs:$src1), (i32 IntRegs:$src2), + s4_3ImmPred:$offset))], "$src2 = $dst">; // Store doubleword conditionally. 
// if ([!]Pv) memd(Rs+#u6:3)=Rtt // if (Pv) memd(Rs+#u6:3)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def STrid_cPt : STInst<(outs), +let AddedComplexity = 10, neverHasSideEffects = 1, + isPredicated = 1 in +def STrid_cPt : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, DoubleRegs:$src2), "if ($src1) memd($addr) = $src2", []>; // if (!Pv) memd(Rs+#u6:3)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def STrid_cNotPt : STInst<(outs), +let AddedComplexity = 10, neverHasSideEffects = 1, + isPredicated = 1 in +def STrid_cNotPt : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, DoubleRegs:$src2), "if (!$src1) memd($addr) = $src2", []>; // if (Pv) memd(Rs+#u6:3)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def STrid_indexed_cPt : STInst<(outs), +let AddedComplexity = 10, neverHasSideEffects = 1, + isPredicated = 1 in +def STrid_indexed_cPt : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3, DoubleRegs:$src4), "if ($src1) memd($src2+#$src3) = $src4", []>; // if (!Pv) memd(Rs+#u6:3)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def STrid_indexed_cNotPt : STInst<(outs), +let AddedComplexity = 10, neverHasSideEffects = 1, + isPredicated = 1 in +def STrid_indexed_cNotPt : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3, DoubleRegs:$src4), "if (!$src1) memd($src2+#$src3) = $src4", @@ -1789,8 +1853,9 @@ def STrid_indexed_cNotPt : STInst<(outs), // if ([!]Pv) memd(Rx++#s4:3)=Rtt // if (Pv) memd(Rx++#s4:3)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def POST_STdri_cPt : STInstPI<(outs IntRegs:$dst), +let AddedComplexity = 10, neverHasSideEffects = 1, + isPredicated = 1 in +def POST_STdri_cPt : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, s4_3Imm:$offset), "if ($src1) memd($src3++#$offset) = $src2", @@ -1798,9 +1863,9 @@ def POST_STdri_cPt : STInstPI<(outs IntRegs:$dst), "$src3 = $dst">; // if (!Pv) memd(Rx++#s4:3)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1, +let AddedComplexity = 10, neverHasSideEffects = 1, isPredicated = 1 in -def POST_STdri_cNotPt : STInstPI<(outs IntRegs:$dst), +def POST_STdri_cNotPt : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, s4_3Imm:$offset), "if (!$src1) memd($src3++#$offset) = $src2", @@ -1814,27 +1879,30 @@ let isPredicable = 1 in def STrib : STInst<(outs), (ins MEMri:$addr, IntRegs:$src1), "memb($addr) = $src1", - [(truncstorei8 IntRegs:$src1, ADDRriS11_0:$addr)]>; + [(truncstorei8 (i32 IntRegs:$src1), ADDRriS11_0:$addr)]>; let AddedComplexity = 10, isPredicable = 1 in def STrib_indexed : STInst<(outs), (ins IntRegs:$src1, s11_0Imm:$src2, IntRegs:$src3), "memb($src1+#$src2) = $src3", - [(truncstorei8 IntRegs:$src3, (add IntRegs:$src1, - s11_0ImmPred:$src2))]>; + [(truncstorei8 (i32 IntRegs:$src3), (add (i32 IntRegs:$src1), + s11_0ImmPred:$src2))]>; // memb(gp+#u16:0)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_GP : STInst<(outs), +let neverHasSideEffects = 1 in +def STrib_GP : STInst2<(outs), (ins globaladdress:$global, u16Imm:$offset, IntRegs:$src), "memb(#$global+$offset) = $src", - []>; + []>, + Requires<[NoV4T]>; -let mayStore = 1, neverHasSideEffects = 1 in -def STb_GP : STInst<(outs), +// memb(#global)=Rt +let neverHasSideEffects = 1 in +def STb_GP : STInst2<(outs), (ins globaladdress:$global, IntRegs:$src), "memb(#$global) = $src", - []>; + 
Requires<[NoV4T]>; // memb(Rx++#s4:0)=Rt let hasCtrlDep = 1, isPredicable = 1 in @@ -1843,51 +1911,51 @@ def POST_STbri : STInstPI<(outs IntRegs:$dst), (ins IntRegs:$src1, s4Imm:$offset), "memb($src2++#$offset) = $src1", [(set IntRegs:$dst, - (post_truncsti8 IntRegs:$src1, IntRegs:$src2, + (post_truncsti8 (i32 IntRegs:$src1), (i32 IntRegs:$src2), s4_0ImmPred:$offset))], "$src2 = $dst">; // Store byte conditionally. // if ([!]Pv) memb(Rs+#u6:0)=Rt // if (Pv) memb(Rs+#u6:0)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_cPt : STInst<(outs), +let neverHasSideEffects = 1, isPredicated = 1 in +def STrib_cPt : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if ($src1) memb($addr) = $src2", []>; // if (!Pv) memb(Rs+#u6:0)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_cNotPt : STInst<(outs), +let neverHasSideEffects = 1, isPredicated = 1 in +def STrib_cNotPt : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if (!$src1) memb($addr) = $src2", []>; // if (Pv) memb(Rs+#u6:0)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_indexed_cPt : STInst<(outs), +let neverHasSideEffects = 1, isPredicated = 1 in +def STrib_indexed_cPt : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4), "if ($src1) memb($src2+#$src3) = $src4", []>; // if (!Pv) memb(Rs+#u6:0)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_indexed_cNotPt : STInst<(outs), +let neverHasSideEffects = 1, isPredicated = 1 in +def STrib_indexed_cNotPt : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4), "if (!$src1) memb($src2+#$src3) = $src4", []>; // if ([!]Pv) memb(Rx++#s4:0)=Rt // if (Pv) memb(Rx++#s4:0)=Rt -let mayStore = 1, hasCtrlDep = 1, isPredicated = 1 in -def POST_STbri_cPt : STInstPI<(outs IntRegs:$dst), +let hasCtrlDep = 1, isPredicated = 1 in +def POST_STbri_cPt : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), "if ($src1) memb($src3++#$offset) = $src2", [],"$src3 = $dst">; // if (!Pv) memb(Rx++#s4:0)=Rt -let mayStore = 1, hasCtrlDep = 1, isPredicated = 1 in -def POST_STbri_cNotPt : STInstPI<(outs IntRegs:$dst), +let hasCtrlDep = 1, isPredicated = 1 in +def POST_STbri_cNotPt : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), "if (!$src1) memb($src3++#$offset) = $src2", [],"$src3 = $dst">; @@ -1899,27 +1967,29 @@ let isPredicable = 1 in def STrih : STInst<(outs), (ins MEMri:$addr, IntRegs:$src1), "memh($addr) = $src1", - [(truncstorei16 IntRegs:$src1, ADDRriS11_1:$addr)]>; + [(truncstorei16 (i32 IntRegs:$src1), ADDRriS11_1:$addr)]>; let AddedComplexity = 10, isPredicable = 1 in def STrih_indexed : STInst<(outs), (ins IntRegs:$src1, s11_1Imm:$src2, IntRegs:$src3), "memh($src1+#$src2) = $src3", - [(truncstorei16 IntRegs:$src3, (add IntRegs:$src1, - s11_1ImmPred:$src2))]>; + [(truncstorei16 (i32 IntRegs:$src3), (add (i32 IntRegs:$src1), + s11_1ImmPred:$src2))]>; -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_GP : STInst<(outs), +let neverHasSideEffects = 1 in +def STrih_GP : STInst2<(outs), (ins globaladdress:$global, u16Imm:$offset, IntRegs:$src), "memh(#$global+$offset) = $src", - []>; + []>, + Requires<[NoV4T]>; -let mayStore = 1, neverHasSideEffects = 1 in -def STh_GP : STInst<(outs), +let neverHasSideEffects = 1 in +def STh_GP : STInst2<(outs), (ins globaladdress:$global, IntRegs:$src), "memh(#$global) = $src", - []>; + []>, + Requires<[NoV4T]>; // memh(Rx++#s4:1)=Rt.H // 
memh(Rx++#s4:1)=Rt @@ -1928,51 +1998,51 @@ def POST_SThri : STInstPI<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, s4Imm:$offset), "memh($src2++#$offset) = $src1", [(set IntRegs:$dst, - (post_truncsti16 IntRegs:$src1, IntRegs:$src2, + (post_truncsti16 (i32 IntRegs:$src1), (i32 IntRegs:$src2), s4_1ImmPred:$offset))], "$src2 = $dst">; // Store halfword conditionally. // if ([!]Pv) memh(Rs+#u6:1)=Rt // if (Pv) memh(Rs+#u6:1)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_cPt : STInst<(outs), +let neverHasSideEffects = 1, isPredicated = 1 in +def STrih_cPt : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if ($src1) memh($addr) = $src2", []>; // if (!Pv) memh(Rs+#u6:1)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_cNotPt : STInst<(outs), +let neverHasSideEffects = 1, isPredicated = 1 in +def STrih_cNotPt : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if (!$src1) memh($addr) = $src2", []>; // if (Pv) memh(Rs+#u6:1)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_indexed_cPt : STInst<(outs), +let neverHasSideEffects = 1, isPredicated = 1 in +def STrih_indexed_cPt : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4), "if ($src1) memh($src2+#$src3) = $src4", []>; // if (!Pv) memh(Rs+#u6:1)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_indexed_cNotPt : STInst<(outs), +let neverHasSideEffects = 1, isPredicated = 1 in +def STrih_indexed_cNotPt : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4), "if (!$src1) memh($src2+#$src3) = $src4", []>; // if ([!]Pv) memh(Rx++#s4:1)=Rt // if (Pv) memh(Rx++#s4:1)=Rt -let mayStore = 1, hasCtrlDep = 1, isPredicated = 1 in -def POST_SThri_cPt : STInstPI<(outs IntRegs:$dst), +let hasCtrlDep = 1, isPredicated = 1 in +def POST_SThri_cPt : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), "if ($src1) memh($src3++#$offset) = $src2", [],"$src3 = $dst">; // if (!Pv) memh(Rx++#s4:1)=Rt -let mayStore = 1, hasCtrlDep = 1, isPredicated = 1 in -def POST_SThri_cNotPt : STInstPI<(outs IntRegs:$dst), +let hasCtrlDep = 1, isPredicated = 1 in +def POST_SThri_cNotPt : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), "if (!$src1) memh($src3++#$offset) = $src2", [],"$src3 = $dst">; @@ -1980,8 +2050,8 @@ def POST_SThri_cNotPt : STInstPI<(outs IntRegs:$dst), // Store word. // Store predicate. 
-let Defs = [R10,R11] in -def STriw_pred : STInst<(outs), +let Defs = [R10,R11,D5], neverHasSideEffects = 1 in +def STriw_pred : STInst2<(outs), (ins MEMri:$addr, PredRegs:$src1), "Error; should not emit", []>; @@ -1991,69 +2061,79 @@ let isPredicable = 1 in def STriw : STInst<(outs), (ins MEMri:$addr, IntRegs:$src1), "memw($addr) = $src1", - [(store IntRegs:$src1, ADDRriS11_2:$addr)]>; + [(store (i32 IntRegs:$src1), ADDRriS11_2:$addr)]>; let AddedComplexity = 10, isPredicable = 1 in def STriw_indexed : STInst<(outs), (ins IntRegs:$src1, s11_2Imm:$src2, IntRegs:$src3), "memw($src1+#$src2) = $src3", - [(store IntRegs:$src3, (add IntRegs:$src1, s11_2ImmPred:$src2))]>; + [(store (i32 IntRegs:$src3), + (add (i32 IntRegs:$src1), s11_2ImmPred:$src2))]>; -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_GP : STInst<(outs), +let neverHasSideEffects = 1 in +def STriw_GP : STInst2<(outs), (ins globaladdress:$global, u16Imm:$offset, IntRegs:$src), "memw(#$global+$offset) = $src", - []>; + []>, + Requires<[NoV4T]>; + +let neverHasSideEffects = 1 in +def STw_GP : STInst2<(outs), + (ins globaladdress:$global, IntRegs:$src), + "memw(#$global) = $src", + []>, + Requires<[NoV4T]>; let hasCtrlDep = 1, isPredicable = 1 in def POST_STwri : STInstPI<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, s4Imm:$offset), "memw($src2++#$offset) = $src1", [(set IntRegs:$dst, - (post_store IntRegs:$src1, IntRegs:$src2, s4_2ImmPred:$offset))], + (post_store (i32 IntRegs:$src1), (i32 IntRegs:$src2), + s4_2ImmPred:$offset))], "$src2 = $dst">; // Store word conditionally. // if ([!]Pv) memw(Rs+#u6:2)=Rt // if (Pv) memw(Rs+#u6:2)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_cPt : STInst<(outs), +let neverHasSideEffects = 1, isPredicated = 1 in +def STriw_cPt : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if ($src1) memw($addr) = $src2", []>; // if (!Pv) memw(Rs+#u6:2)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_cNotPt : STInst<(outs), +let neverHasSideEffects = 1, isPredicated = 1 in +def STriw_cNotPt : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if (!$src1) memw($addr) = $src2", []>; // if (Pv) memw(Rs+#u6:2)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_indexed_cPt : STInst<(outs), +let neverHasSideEffects = 1, isPredicated = 1 in +def STriw_indexed_cPt : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4), "if ($src1) memw($src2+#$src3) = $src4", []>; // if (!Pv) memw(Rs+#u6:2)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_indexed_cNotPt : STInst<(outs), +let neverHasSideEffects = 1, isPredicated = 1 in +def STriw_indexed_cNotPt : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4), "if (!$src1) memw($src2+#$src3) = $src4", []>; // if ([!]Pv) memw(Rx++#s4:2)=Rt // if (Pv) memw(Rx++#s4:2)=Rt -let mayStore = 1, hasCtrlDep = 1, isPredicated = 1 in -def POST_STwri_cPt : STInstPI<(outs IntRegs:$dst), +let hasCtrlDep = 1, isPredicated = 1 in +def POST_STwri_cPt : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), "if ($src1) memw($src3++#$offset) = $src2", [],"$src3 = $dst">; // if (!Pv) memw(Rx++#s4:2)=Rt -let mayStore = 1, hasCtrlDep = 1, isPredicated = 1 in -def POST_STwri_cNotPt : STInstPI<(outs IntRegs:$dst), +let hasCtrlDep = 1, isPredicated = 1 in +def POST_STwri_cNotPt : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), "if (!$src1) 
memw($src3++#$offset) = $src2", [],"$src3 = $dst">; @@ -2062,7 +2142,7 @@ def POST_STwri_cNotPt : STInstPI<(outs IntRegs:$dst), // Allocate stack frame. let Defs = [R29, R30], Uses = [R31, R30], neverHasSideEffects = 1 in { - def ALLOCFRAME : STInst<(outs), + def ALLOCFRAME : STInst2<(outs), (ins i32imm:$amt), "allocframe(#$amt)", []>; @@ -2077,13 +2157,13 @@ let Defs = [R29, R30], Uses = [R31, R30], neverHasSideEffects = 1 in { // Logical NOT. def NOT_rr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1), "$dst = not($src1)", - [(set DoubleRegs:$dst, (not DoubleRegs:$src1))]>; + [(set (i64 DoubleRegs:$dst), (not (i64 DoubleRegs:$src1)))]>; // Sign extend word to doubleword. def SXTW : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1), "$dst = sxtw($src1)", - [(set DoubleRegs:$dst, (sext IntRegs:$src1))]>; + [(set (i64 DoubleRegs:$dst), (sext (i32 IntRegs:$src1)))]>; //===----------------------------------------------------------------------===// // STYPE/ALU - //===----------------------------------------------------------------------===// @@ -2091,37 +2171,58 @@ def SXTW : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1), //===----------------------------------------------------------------------===// // STYPE/BIT + //===----------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// -// STYPE/BIT - -//===----------------------------------------------------------------------===// +// clrbit. +def CLRBIT : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2), + "$dst = clrbit($src1, #$src2)", + [(set (i32 IntRegs:$dst), (and (i32 IntRegs:$src1), + (not + (shl 1, u5ImmPred:$src2))))]>; +def CLRBIT_31 : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2), + "$dst = clrbit($src1, #$src2)", + []>; -//===----------------------------------------------------------------------===// -// STYPE/COMPLEX + -//===----------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// -// STYPE/COMPLEX - -//===----------------------------------------------------------------------===// +// Map from r0 = and(r1, 2147483647) to r0 = clrbit(r1, #31). +def : Pat <(and (i32 IntRegs:$src1), 2147483647), + (CLRBIT_31 (i32 IntRegs:$src1), 31)>; -//===----------------------------------------------------------------------===// -// STYPE/PERM + -//===----------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// -// STYPE/PERM - -//===----------------------------------------------------------------------===// +// setbit. +def SETBIT : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2), + "$dst = setbit($src1, #$src2)", + [(set (i32 IntRegs:$dst), (or (i32 IntRegs:$src1), + (shl 1, u5ImmPred:$src2)))]>; + +// Map from r0 = or(r1, -2147483648) to r0 = setbit(r1, #31). +def SETBIT_31 : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2), + "$dst = setbit($src1, #$src2)", + []>; + +def : Pat <(or (i32 IntRegs:$src1), -2147483648), + (SETBIT_31 (i32 IntRegs:$src1), 31)>; + +// togglebit. +def TOGBIT : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2), + "$dst = togglebit($src1, #$src2)", + [(set (i32 IntRegs:$dst), (xor (i32 IntRegs:$src1), + (shl 1, u5ImmPred:$src2)))]>; + +// Map from r0 = xor(r1, -2147483648) to r0 = togglebit(r1, #31). 
+def TOGBIT_31 : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2), + "$dst = togglebit($src1, #$src2)", + []>; + +def : Pat <(xor (i32 IntRegs:$src1), -2147483648), + (TOGBIT_31 (i32 IntRegs:$src1), 31)>; -//===----------------------------------------------------------------------===// -// STYPE/PRED + -//===----------------------------------------------------------------------===// // Predicate transfer. let neverHasSideEffects = 1 in def TFR_RsPd : SInst<(outs IntRegs:$dst), (ins PredRegs:$src1), - "$dst = $src1 // Should almost never emit this", + "$dst = $src1 /* Should almost never emit this. */", []>; def TFR_PdRs : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1), - "$dst = $src1 // Should almost never emit!", - [(set PredRegs:$dst, (trunc IntRegs:$src1))]>; + "$dst = $src1 /* Should almost never emit this. */", + [(set (i1 PredRegs:$dst), (trunc (i32 IntRegs:$src1)))]>; //===----------------------------------------------------------------------===// // STYPE/PRED - //===----------------------------------------------------------------------===// @@ -2132,75 +2233,85 @@ def TFR_PdRs : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1), // Shift by immediate. def ASR_ri : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2), "$dst = asr($src1, #$src2)", - [(set IntRegs:$dst, (sra IntRegs:$src1, u5ImmPred:$src2))]>; + [(set (i32 IntRegs:$dst), (sra (i32 IntRegs:$src1), + u5ImmPred:$src2))]>; def ASRd_ri : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2), "$dst = asr($src1, #$src2)", - [(set DoubleRegs:$dst, (sra DoubleRegs:$src1, u6ImmPred:$src2))]>; + [(set (i64 DoubleRegs:$dst), (sra (i64 DoubleRegs:$src1), + u6ImmPred:$src2))]>; def ASL : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2), "$dst = asl($src1, #$src2)", - [(set IntRegs:$dst, (shl IntRegs:$src1, u5ImmPred:$src2))]>; + [(set (i32 IntRegs:$dst), (shl (i32 IntRegs:$src1), + u5ImmPred:$src2))]>; + +def ASLd_ri : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2), + "$dst = asl($src1, #$src2)", + [(set (i64 DoubleRegs:$dst), (shl (i64 DoubleRegs:$src1), + u6ImmPred:$src2))]>; def LSR_ri : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2), "$dst = lsr($src1, #$src2)", - [(set IntRegs:$dst, (srl IntRegs:$src1, u5ImmPred:$src2))]>; + [(set (i32 IntRegs:$dst), (srl (i32 IntRegs:$src1), + u5ImmPred:$src2))]>; def LSRd_ri : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2), "$dst = lsr($src1, #$src2)", - [(set DoubleRegs:$dst, (srl DoubleRegs:$src1, u6ImmPred:$src2))]>; - -def LSRd_ri_acc : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, - DoubleRegs:$src2, - u6Imm:$src3), - "$dst += lsr($src2, #$src3)", - [(set DoubleRegs:$dst, (add DoubleRegs:$src1, - (srl DoubleRegs:$src2, - u6ImmPred:$src3)))], - "$src1 = $dst">; - -// Shift by immediate and accumulate. -def ASR_rr_acc : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, - IntRegs:$src2, - IntRegs:$src3), - "$dst += asr($src2, $src3)", - [], "$src1 = $dst">; + [(set (i64 DoubleRegs:$dst), (srl (i64 DoubleRegs:$src1), + u6ImmPred:$src2))]>; // Shift by immediate and add. +let AddedComplexity = 100 in def ADDASL : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, u3Imm:$src3), "$dst = addasl($src1, $src2, #$src3)", - [(set IntRegs:$dst, (add IntRegs:$src1, - (shl IntRegs:$src2, - u3ImmPred:$src3)))]>; + [(set (i32 IntRegs:$dst), (add (i32 IntRegs:$src1), + (shl (i32 IntRegs:$src2), + u3ImmPred:$src3)))]>; // Shift by register. 
def ASL_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = asl($src1, $src2)", - [(set IntRegs:$dst, (shl IntRegs:$src1, IntRegs:$src2))]>; + [(set (i32 IntRegs:$dst), (shl (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))]>; def ASR_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = asr($src1, $src2)", - [(set IntRegs:$dst, (sra IntRegs:$src1, IntRegs:$src2))]>; + [(set (i32 IntRegs:$dst), (sra (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))]>; +def LSL_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + "$dst = lsl($src1, $src2)", + [(set (i32 IntRegs:$dst), (shl (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))]>; def LSR_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = lsr($src1, $src2)", - [(set IntRegs:$dst, (srl IntRegs:$src1, IntRegs:$src2))]>; + [(set (i32 IntRegs:$dst), (srl (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))]>; + +def ASLd : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2), + "$dst = asl($src1, $src2)", + [(set (i64 DoubleRegs:$dst), (shl (i64 DoubleRegs:$src1), + (i32 IntRegs:$src2)))]>; def LSLd : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2), "$dst = lsl($src1, $src2)", - [(set DoubleRegs:$dst, (shl DoubleRegs:$src1, IntRegs:$src2))]>; + [(set (i64 DoubleRegs:$dst), (shl (i64 DoubleRegs:$src1), + (i32 IntRegs:$src2)))]>; def ASRd_rr : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2), "$dst = asr($src1, $src2)", - [(set DoubleRegs:$dst, (sra DoubleRegs:$src1, IntRegs:$src2))]>; + [(set (i64 DoubleRegs:$dst), (sra (i64 DoubleRegs:$src1), + (i32 IntRegs:$src2)))]>; def LSRd_rr : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2), "$dst = lsr($src1, $src2)", - [(set DoubleRegs:$dst, (srl DoubleRegs:$src1, IntRegs:$src2))]>; + [(set (i64 DoubleRegs:$dst), (srl (i64 DoubleRegs:$src1), + (i32 IntRegs:$src2)))]>; //===----------------------------------------------------------------------===// // STYPE/SHIFT - @@ -2231,8 +2342,8 @@ def SDHexagonBARRIER: SDTypeProfile<0, 0, []>; def HexagonBARRIER: SDNode<"HexagonISD::BARRIER", SDHexagonBARRIER, [SDNPHasChain]>; -let hasSideEffects = 1 in -def BARRIER : STInst<(outs), (ins), +let hasSideEffects = 1, isHexagonSolo = 1 in +def BARRIER : SYSInst<(outs), (ins), "barrier", [(HexagonBARRIER)]>; @@ -2244,47 +2355,50 @@ def BARRIER : STInst<(outs), (ins), let isReMaterializable = 1 in def TFRI64 : ALU64_rr<(outs DoubleRegs:$dst), (ins s8Imm64:$src1), "$dst = #$src1", - [(set DoubleRegs:$dst, s8Imm64Pred:$src1)]>; + [(set (i64 DoubleRegs:$dst), s8Imm64Pred:$src1)]>; // Pseudo instruction to encode a set of conditional transfers. // This instruction is used instead of a mux and trades-off codesize // for performance. We conduct this transformation optimistically in // the hope that these instructions get promoted to dot-new transfers. 
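// An illustrative sketch of the promotion this hopes for (hand-written, not compiler output): a selected
//   r0 = mux(p0, r1, r2)
// can be emitted as the conditional-transfer pair
//   if (p0) r0 = r1
//   if (!p0) r0 = r2
// which the packetizer may then fold into the packet that produces p0, using dot-new predicates:
//   { p0 = cmp.eq(r3, #0)
//     if (p0.new) r0 = r1
//     if (!p0.new) r0 = r2 }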
-let AddedComplexity = 100 in +let AddedComplexity = 100, isPredicated = 1 in def TFR_condset_rr : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "Error; should not emit", - [(set IntRegs:$dst, (select PredRegs:$src1, IntRegs:$src2, - IntRegs:$src3))]>; - -let AddedComplexity = 100 in + [(set (i32 IntRegs:$dst), + (i32 (select (i1 PredRegs:$src1), + (i32 IntRegs:$src2), + (i32 IntRegs:$src3))))]>; +let AddedComplexity = 100, isPredicated = 1 in def TFR_condset_ri : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, s12Imm:$src3), "Error; should not emit", - [(set IntRegs:$dst, - (select PredRegs:$src1, IntRegs:$src2, s12ImmPred:$src3))]>; + [(set (i32 IntRegs:$dst), + (i32 (select (i1 PredRegs:$src1), (i32 IntRegs:$src2), + s12ImmPred:$src3)))]>; -let AddedComplexity = 100 in +let AddedComplexity = 100, isPredicated = 1 in def TFR_condset_ir : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1, s12Imm:$src2, IntRegs:$src3), "Error; should not emit", - [(set IntRegs:$dst, - (select PredRegs:$src1, s12ImmPred:$src2, IntRegs:$src3))]>; + [(set (i32 IntRegs:$dst), + (i32 (select (i1 PredRegs:$src1), s12ImmPred:$src2, + (i32 IntRegs:$src3))))]>; -let AddedComplexity = 100 in +let AddedComplexity = 100, isPredicated = 1 in def TFR_condset_ii : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1, s12Imm:$src2, s12Imm:$src3), "Error; should not emit", - [(set IntRegs:$dst, (select PredRegs:$src1, - s12ImmPred:$src2, - s12ImmPred:$src3))]>; + [(set (i32 IntRegs:$dst), + (i32 (select (i1 PredRegs:$src1), s12ImmPred:$src2, + s12ImmPred:$src3)))]>; // Generate frameindex addresses. let isReMaterializable = 1 in def TFR_FI : ALU32_ri<(outs IntRegs:$dst), (ins FrameIndex:$src1), "$dst = add($src1)", - [(set IntRegs:$dst, ADDRri:$src1)]>; + [(set (i32 IntRegs:$dst), ADDRri:$src1)]>; // // CR - Type. @@ -2303,69 +2417,116 @@ def LOOP0_r : CRInst<(outs), (ins brtarget:$offset, IntRegs:$src2), let isBranch = 1, isTerminator = 1, neverHasSideEffects = 1, Defs = [PC, LC0], Uses = [SA0, LC0] in { -def ENDLOOP0 : CRInst<(outs), (ins brtarget:$offset), +def ENDLOOP0 : Marker<(outs), (ins brtarget:$offset), ":endloop0", []>; } // Support for generating global address. // Taken from X86InstrInfo.td. 
-def SDTHexagonCONST32 : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, - SDTCisPtrTy<0>]>; +def SDTHexagonCONST32 : SDTypeProfile<1, 1, [ + SDTCisVT<0, i32>, + SDTCisVT<1, i32>, + SDTCisPtrTy<0>]>; def HexagonCONST32 : SDNode<"HexagonISD::CONST32", SDTHexagonCONST32>; def HexagonCONST32_GP : SDNode<"HexagonISD::CONST32_GP", SDTHexagonCONST32>; +// HI/LO Instructions +let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in +def LO : ALU32_ri<(outs IntRegs:$dst), (ins globaladdress:$global), + "$dst.l = #LO($global)", + []>; + +let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in +def HI : ALU32_ri<(outs IntRegs:$dst), (ins globaladdress:$global), + "$dst.h = #HI($global)", + []>; + +let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in +def LOi : ALU32_ri<(outs IntRegs:$dst), (ins i32imm:$imm_value), + "$dst.l = #LO($imm_value)", + []>; + + +let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in +def HIi : ALU32_ri<(outs IntRegs:$dst), (ins i32imm:$imm_value), + "$dst.h = #HI($imm_value)", + []>; + +let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in +def LO_jt : ALU32_ri<(outs IntRegs:$dst), (ins jumptablebase:$jt), + "$dst.l = #LO($jt)", + []>; + +let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in +def HI_jt : ALU32_ri<(outs IntRegs:$dst), (ins jumptablebase:$jt), + "$dst.h = #HI($jt)", + []>; + + +let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in +def LO_label : ALU32_ri<(outs IntRegs:$dst), (ins bblabel:$label), + "$dst.l = #LO($label)", + []>; + +let isReMaterializable = 1, isMoveImm = 1 , neverHasSideEffects = 1 in +def HI_label : ALU32_ri<(outs IntRegs:$dst), (ins bblabel:$label), + "$dst.h = #HI($label)", + []>; + // This pattern is incorrect. When we add small data, we should change // this pattern to use memw(#foo). +// This is for sdata. let isMoveImm = 1 in def CONST32 : LDInst<(outs IntRegs:$dst), (ins globaladdress:$global), "$dst = CONST32(#$global)", - [(set IntRegs:$dst, - (load (HexagonCONST32 tglobaltlsaddr:$global)))]>; + [(set (i32 IntRegs:$dst), + (load (HexagonCONST32 tglobaltlsaddr:$global)))]>; +// This is for non-sdata. 
let isReMaterializable = 1, isMoveImm = 1 in -def CONST32_set : LDInst<(outs IntRegs:$dst), (ins globaladdress:$global), +def CONST32_set : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global), "$dst = CONST32(#$global)", - [(set IntRegs:$dst, - (HexagonCONST32 tglobaladdr:$global))]>; + [(set (i32 IntRegs:$dst), + (HexagonCONST32 tglobaladdr:$global))]>; let isReMaterializable = 1, isMoveImm = 1 in -def CONST32_set_jt : LDInst<(outs IntRegs:$dst), (ins jumptablebase:$jt), +def CONST32_set_jt : LDInst2<(outs IntRegs:$dst), (ins jumptablebase:$jt), "$dst = CONST32(#$jt)", - [(set IntRegs:$dst, - (HexagonCONST32 tjumptable:$jt))]>; + [(set (i32 IntRegs:$dst), + (HexagonCONST32 tjumptable:$jt))]>; let isReMaterializable = 1, isMoveImm = 1 in -def CONST32GP_set : LDInst<(outs IntRegs:$dst), (ins globaladdress:$global), +def CONST32GP_set : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global), "$dst = CONST32(#$global)", - [(set IntRegs:$dst, - (HexagonCONST32_GP tglobaladdr:$global))]>; + [(set (i32 IntRegs:$dst), + (HexagonCONST32_GP tglobaladdr:$global))]>; let isReMaterializable = 1, isMoveImm = 1 in -def CONST32_Int_Real : LDInst<(outs IntRegs:$dst), (ins i32imm:$global), +def CONST32_Int_Real : LDInst2<(outs IntRegs:$dst), (ins i32imm:$global), "$dst = CONST32(#$global)", - [(set IntRegs:$dst, imm:$global) ]>; + [(set (i32 IntRegs:$dst), imm:$global) ]>; let isReMaterializable = 1, isMoveImm = 1 in -def CONST32_Label : LDInst<(outs IntRegs:$dst), (ins bblabel:$label), +def CONST32_Label : LDInst2<(outs IntRegs:$dst), (ins bblabel:$label), "$dst = CONST32($label)", - [(set IntRegs:$dst, (HexagonCONST32 bbl:$label))]>; + [(set (i32 IntRegs:$dst), (HexagonCONST32 bbl:$label))]>; let isReMaterializable = 1, isMoveImm = 1 in -def CONST64_Int_Real : LDInst<(outs DoubleRegs:$dst), (ins i64imm:$global), +def CONST64_Int_Real : LDInst2<(outs DoubleRegs:$dst), (ins i64imm:$global), "$dst = CONST64(#$global)", - [(set DoubleRegs:$dst, imm:$global) ]>; + [(set (i64 DoubleRegs:$dst), imm:$global) ]>; def TFR_PdFalse : SInst<(outs PredRegs:$dst), (ins), "$dst = xor($dst, $dst)", - [(set PredRegs:$dst, 0)]>; + [(set (i1 PredRegs:$dst), 0)]>; def MPY_trsext : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), - "$dst = mpy($src1, $src2)", - [(set IntRegs:$dst, - (trunc (i64 (srl (i64 (mul (i64 (sext IntRegs:$src1)), - (i64 (sext IntRegs:$src2)))), - (i32 32)))))]>; + "$dst = mpy($src1, $src2)", + [(set (i32 IntRegs:$dst), + (trunc (i64 (srl (i64 (mul (i64 (sext (i32 IntRegs:$src1))), + (i64 (sext (i32 IntRegs:$src2))))), + (i32 32)))))]>; // Pseudo instructions. 
def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>; @@ -2405,7 +2566,7 @@ let Defs = [R29, R30, R31], Uses = [R29] in { let isCall = 1, neverHasSideEffects = 1, Defs = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, R22, R23, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in { - def CALL : JInst<(outs), (ins calltarget:$dst, variable_ops), + def CALL : JInst<(outs), (ins calltarget:$dst), "call $dst", []>; } @@ -2413,7 +2574,7 @@ let isCall = 1, neverHasSideEffects = 1, let isCall = 1, neverHasSideEffects = 1, Defs = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, R22, R23, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in { - def CALLR : JRInst<(outs), (ins IntRegs:$dst, variable_ops), + def CALLR : JRInst<(outs), (ins IntRegs:$dst), "callr $dst", []>; } @@ -2422,25 +2583,25 @@ let isCall = 1, neverHasSideEffects = 1, let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1, Defs = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, R22, R23, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in { - def TCRETURNtg : JInst<(outs), (ins calltarget:$dst, variable_ops), + def TCRETURNtg : JInst<(outs), (ins calltarget:$dst), "jump $dst // TAILCALL", []>; } let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1, Defs = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, R22, R23, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in { - def TCRETURNtext : JInst<(outs), (ins calltarget:$dst, variable_ops), + def TCRETURNtext : JInst<(outs), (ins calltarget:$dst), "jump $dst // TAILCALL", []>; } let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1, Defs = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, R22, R23, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in { - def TCRETURNR : JInst<(outs), (ins IntRegs:$dst, variable_ops), + def TCRETURNR : JInst<(outs), (ins IntRegs:$dst), "jumpr $dst // TAILCALL", []>; } // Map call instruction. 
-def : Pat<(call IntRegs:$dst), - (CALLR IntRegs:$dst)>, Requires<[HasV2TOnly]>; +def : Pat<(call (i32 IntRegs:$dst)), + (CALLR (i32 IntRegs:$dst))>, Requires<[HasV2TOnly]>; def : Pat<(call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>, Requires<[HasV2TOnly]>; def : Pat<(call texternalsym:$dst), @@ -2450,309 +2611,516 @@ def : Pat<(HexagonTCRet tglobaladdr:$dst), (TCRETURNtg tglobaladdr:$dst)>; def : Pat<(HexagonTCRet texternalsym:$dst), (TCRETURNtext texternalsym:$dst)>; -def : Pat<(HexagonTCRet IntRegs:$dst), - (TCRETURNR IntRegs:$dst)>; +def : Pat<(HexagonTCRet (i32 IntRegs:$dst)), + (TCRETURNR (i32 IntRegs:$dst))>; + +// Atomic load and store support +// 8 bit atomic load +def : Pat<(atomic_load_8 (HexagonCONST32_GP tglobaladdr:$global)), + (i32 (LDub_GP tglobaladdr:$global))>, + Requires<[NoV4T]>; + +def : Pat<(atomic_load_8 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset)), + (i32 (LDriub_GP tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[NoV4T]>; + +def : Pat<(atomic_load_8 ADDRriS11_0:$src1), + (i32 (LDriub ADDRriS11_0:$src1))>; + +def : Pat<(atomic_load_8 (add (i32 IntRegs:$src1), s11_0ImmPred:$offset)), + (i32 (LDriub_indexed (i32 IntRegs:$src1), s11_0ImmPred:$offset))>; + + + +// 16 bit atomic load +def : Pat<(atomic_load_16 (HexagonCONST32_GP tglobaladdr:$global)), + (i32 (LDuh_GP tglobaladdr:$global))>, + Requires<[NoV4T]>; + +def : Pat<(atomic_load_16 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset)), + (i32 (LDriuh_GP tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[NoV4T]>; + +def : Pat<(atomic_load_16 ADDRriS11_1:$src1), + (i32 (LDriuh ADDRriS11_1:$src1))>; + +def : Pat<(atomic_load_16 (add (i32 IntRegs:$src1), s11_1ImmPred:$offset)), + (i32 (LDriuh_indexed (i32 IntRegs:$src1), s11_1ImmPred:$offset))>; + + + +// 32 bit atomic load +def : Pat<(atomic_load_32 (HexagonCONST32_GP tglobaladdr:$global)), + (i32 (LDw_GP tglobaladdr:$global))>, + Requires<[NoV4T]>; + +def : Pat<(atomic_load_32 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset)), + (i32 (LDriw_GP tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[NoV4T]>; + +def : Pat<(atomic_load_32 ADDRriS11_2:$src1), + (i32 (LDriw ADDRriS11_2:$src1))>; + +def : Pat<(atomic_load_32 (add (i32 IntRegs:$src1), s11_2ImmPred:$offset)), + (i32 (LDriw_indexed (i32 IntRegs:$src1), s11_2ImmPred:$offset))>; + + +// 64 bit atomic load +def : Pat<(atomic_load_64 (HexagonCONST32_GP tglobaladdr:$global)), + (i64 (LDd_GP tglobaladdr:$global))>, + Requires<[NoV4T]>; + +def : Pat<(atomic_load_64 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset)), + (i64 (LDrid_GP tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[NoV4T]>; + +def : Pat<(atomic_load_64 ADDRriS11_3:$src1), + (i64 (LDrid ADDRriS11_3:$src1))>; + +def : Pat<(atomic_load_64 (add (i32 IntRegs:$src1), s11_3ImmPred:$offset)), + (i64 (LDrid_indexed (i32 IntRegs:$src1), s11_3ImmPred:$offset))>; -// Map from r0 = and(r1, 65535) to r0 = zxth(r1). 
-def : Pat <(and IntRegs:$src1, 65535), - (ZXTH IntRegs:$src1)>; + +// 64 bit atomic store +def : Pat<(atomic_store_64 (HexagonCONST32_GP tglobaladdr:$global), + (i64 DoubleRegs:$src1)), + (STd_GP tglobaladdr:$global, (i64 DoubleRegs:$src1))>, + Requires<[NoV4T]>; + +def : Pat<(atomic_store_64 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset), + (i64 DoubleRegs:$src1)), + (STrid_GP tglobaladdr:$global, u16ImmPred:$offset, + (i64 DoubleRegs:$src1))>, Requires<[NoV4T]>; + +// 8 bit atomic store +def : Pat<(atomic_store_8 (HexagonCONST32_GP tglobaladdr:$global), + (i32 IntRegs:$src1)), + (STb_GP tglobaladdr:$global, (i32 IntRegs:$src1))>, + Requires<[NoV4T]>; + +def : Pat<(atomic_store_8 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset), + (i32 IntRegs:$src1)), + (STrib_GP tglobaladdr:$global, u16ImmPred:$offset, + (i32 IntRegs:$src1))>, Requires<[NoV4T]>; + +def : Pat<(atomic_store_8 ADDRriS11_0:$src2, (i32 IntRegs:$src1)), + (STrib ADDRriS11_0:$src2, (i32 IntRegs:$src1))>; + +def : Pat<(atomic_store_8 (add (i32 IntRegs:$src2), s11_0ImmPred:$offset), + (i32 IntRegs:$src1)), + (STrib_indexed (i32 IntRegs:$src2), s11_0ImmPred:$offset, + (i32 IntRegs:$src1))>; + + +// 16 bit atomic store +def : Pat<(atomic_store_16 (HexagonCONST32_GP tglobaladdr:$global), + (i32 IntRegs:$src1)), + (STh_GP tglobaladdr:$global, (i32 IntRegs:$src1))>, + Requires<[NoV4T]>; + +def : Pat<(atomic_store_16 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset), + (i32 IntRegs:$src1)), + (STrih_GP tglobaladdr:$global, u16ImmPred:$offset, + (i32 IntRegs:$src1))>, Requires<[NoV4T]>; + +def : Pat<(atomic_store_16 ADDRriS11_1:$src2, (i32 IntRegs:$src1)), + (STrih ADDRriS11_1:$src2, (i32 IntRegs:$src1))>; + +def : Pat<(atomic_store_16 (add (i32 IntRegs:$src2), s11_1ImmPred:$offset), + (i32 IntRegs:$src1)), + (STrih_indexed (i32 IntRegs:$src2), s11_1ImmPred:$offset, + (i32 IntRegs:$src1))>; + + +// 32 bit atomic store +def : Pat<(atomic_store_32 (HexagonCONST32_GP tglobaladdr:$global), + (i32 IntRegs:$src1)), + (STw_GP tglobaladdr:$global, (i32 IntRegs:$src1))>, + Requires<[NoV4T]>; + +def : Pat<(atomic_store_32 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset), + (i32 IntRegs:$src1)), + (STriw_GP tglobaladdr:$global, u16ImmPred:$offset, + (i32 IntRegs:$src1))>, + Requires<[NoV4T]>; + +def : Pat<(atomic_store_32 ADDRriS11_2:$src2, (i32 IntRegs:$src1)), + (STriw ADDRriS11_2:$src2, (i32 IntRegs:$src1))>; + +def : Pat<(atomic_store_32 (add (i32 IntRegs:$src2), s11_2ImmPred:$offset), + (i32 IntRegs:$src1)), + (STriw_indexed (i32 IntRegs:$src2), s11_2ImmPred:$offset, + (i32 IntRegs:$src1))>; + + + + +def : Pat<(atomic_store_64 ADDRriS11_3:$src2, (i64 DoubleRegs:$src1)), + (STrid ADDRriS11_3:$src2, (i64 DoubleRegs:$src1))>; + +def : Pat<(atomic_store_64 (add (i32 IntRegs:$src2), s11_3ImmPred:$offset), + (i64 DoubleRegs:$src1)), + (STrid_indexed (i32 IntRegs:$src2), s11_3ImmPred:$offset, + (i64 DoubleRegs:$src1))>; + +// Map from r0 = and(r1, 65535) to r0 = zxth(r1). +def : Pat <(and (i32 IntRegs:$src1), 65535), + (ZXTH (i32 IntRegs:$src1))>; // Map from r0 = and(r1, 255) to r0 = zxtb(r1). -def : Pat <(and IntRegs:$src1, 255), - (ZXTB IntRegs:$src1)>; +def : Pat <(and (i32 IntRegs:$src1), 255), + (ZXTB (i32 IntRegs:$src1))>; // Map Add(p1, true) to p1 = not(p1). // Add(p1, false) should never be produced; // if it is, it must be mapped to a NOOP.
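// Worked example of the identity used below (1-bit two's complement): true is the all-ones
// value -1, so add(p1, -1) == p1 + 1 (mod 2) == xor(p1, 1) == not(p1), which is exactly the
// NOT_p mapping.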
-def : Pat <(add PredRegs:$src1, -1), - (NOT_p PredRegs:$src1)>; +def : Pat <(add (i1 PredRegs:$src1), -1), + (NOT_p (i1 PredRegs:$src1))>; // Map from p0 = setlt(r0, r1) r2 = mux(p0, r3, r4) => // p0 = cmp.lt(r0, r1), r0 = mux(p0, r2, r1). -def : Pat <(select (i1 (setlt IntRegs:$src1, IntRegs:$src2)), IntRegs:$src3, - IntRegs:$src4), - (TFR_condset_rr (CMPLTrr IntRegs:$src1, IntRegs:$src2), IntRegs:$src4, - IntRegs:$src3)>, Requires<[HasV2TOnly]>; +def : Pat <(select (i1 (setlt (i32 IntRegs:$src1), (i32 IntRegs:$src2))), + (i32 IntRegs:$src3), + (i32 IntRegs:$src4)), + (i32 (TFR_condset_rr (CMPLTrr (i32 IntRegs:$src1), (i32 IntRegs:$src2)), + (i32 IntRegs:$src4), (i32 IntRegs:$src3)))>, + Requires<[HasV2TOnly]>; // Map from p0 = pnot(p0); r0 = mux(p0, #i, #j) => r0 = mux(p0, #j, #i). -def : Pat <(select (not PredRegs:$src1), s8ImmPred:$src2, s8ImmPred:$src3), - (TFR_condset_ii PredRegs:$src1, s8ImmPred:$src3, s8ImmPred:$src2)>; +def : Pat <(select (not (i1 PredRegs:$src1)), s8ImmPred:$src2, s8ImmPred:$src3), + (i32 (TFR_condset_ii (i1 PredRegs:$src1), s8ImmPred:$src3, + s8ImmPred:$src2))>; + +// Map from p0 = pnot(p0); r0 = select(p0, #i, r1) +// => r0 = TFR_condset_ri(p0, r1, #i) +def : Pat <(select (not (i1 PredRegs:$src1)), s12ImmPred:$src2, + (i32 IntRegs:$src3)), + (i32 (TFR_condset_ri (i1 PredRegs:$src1), (i32 IntRegs:$src3), + s12ImmPred:$src2))>; + +// Map from p0 = pnot(p0); r0 = mux(p0, r1, #i) +// => r0 = TFR_condset_ir(p0, #i, r1) +def : Pat <(select (not PredRegs:$src1), IntRegs:$src2, s12ImmPred:$src3), + (i32 (TFR_condset_ir (i1 PredRegs:$src1), s12ImmPred:$src3, + (i32 IntRegs:$src2)))>; // Map from p0 = pnot(p0); if (p0) jump => if (!p0) jump. def : Pat <(brcond (not PredRegs:$src1), bb:$offset), - (JMP_cNot PredRegs:$src1, bb:$offset)>; + (JMP_cNot (i1 PredRegs:$src1), bb:$offset)>; // Map from p2 = pnot(p2); p1 = and(p0, p2) => p1 = and(p0, !p2). def : Pat <(and PredRegs:$src1, (not PredRegs:$src2)), - (AND_pnotp PredRegs:$src1, PredRegs:$src2)>; + (i1 (AND_pnotp (i1 PredRegs:$src1), (i1 PredRegs:$src2)))>; // Map from store(globaladdress + x) -> memd(#foo + x). let AddedComplexity = 100 in -def : Pat <(store DoubleRegs:$src1, +def : Pat <(store (i64 DoubleRegs:$src1), (add (HexagonCONST32_GP tglobaladdr:$global), u16ImmPred:$offset)), - (STrid_GP tglobaladdr:$global, u16ImmPred:$offset, DoubleRegs:$src1)>; + (STrid_GP tglobaladdr:$global, u16ImmPred:$offset, + (i64 DoubleRegs:$src1))>, Requires<[NoV4T]>; -// Map from store(globaladdress) -> memd(#foo + 0). +// Map from store(globaladdress) -> memd(#foo). let AddedComplexity = 100 in -def : Pat <(store DoubleRegs:$src1, (HexagonCONST32_GP tglobaladdr:$global)), - (STrid_GP tglobaladdr:$global, 0, DoubleRegs:$src1)>; +def : Pat <(store (i64 DoubleRegs:$src1), + (HexagonCONST32_GP tglobaladdr:$global)), + (STd_GP tglobaladdr:$global, (i64 DoubleRegs:$src1))>, + Requires<[NoV4T]>; // Map from store(globaladdress + x) -> memw(#foo + x). let AddedComplexity = 100 in -def : Pat <(store IntRegs:$src1, (add (HexagonCONST32_GP tglobaladdr:$global), +def : Pat <(store (i32 IntRegs:$src1), + (add (HexagonCONST32_GP tglobaladdr:$global), u16ImmPred:$offset)), - (STriw_GP tglobaladdr:$global, u16ImmPred:$offset, IntRegs:$src1)>; + (STriw_GP tglobaladdr:$global, u16ImmPred:$offset, (i32 IntRegs:$src1))>, + Requires<[NoV4T]>; // Map from store(globaladdress) -> memw(#foo + 0). 
let AddedComplexity = 100 in -def : Pat <(store IntRegs:$src1, (HexagonCONST32_GP tglobaladdr:$global)), - (STriw_GP tglobaladdr:$global, 0, IntRegs:$src1)>; +def : Pat <(store (i32 IntRegs:$src1), (HexagonCONST32_GP tglobaladdr:$global)), + (STriw_GP tglobaladdr:$global, 0, (i32 IntRegs:$src1))>; -// Map from store(globaladdress) -> memw(#foo + 0). +// Map from store(globaladdress) -> memw(#foo). let AddedComplexity = 100 in -def : Pat <(store IntRegs:$src1, (HexagonCONST32_GP tglobaladdr:$global)), - (STriw_GP tglobaladdr:$global, 0, IntRegs:$src1)>; +def : Pat <(store (i32 IntRegs:$src1), (HexagonCONST32_GP tglobaladdr:$global)), + (STriw_GP tglobaladdr:$global, 0, (i32 IntRegs:$src1))>, + Requires<[NoV4T]>; // Map from store(globaladdress + x) -> memh(#foo + x). let AddedComplexity = 100 in -def : Pat <(truncstorei16 IntRegs:$src1, +def : Pat <(truncstorei16 (i32 IntRegs:$src1), (add (HexagonCONST32_GP tglobaladdr:$global), u16ImmPred:$offset)), - (STrih_GP tglobaladdr:$global, u16ImmPred:$offset, IntRegs:$src1)>; + (STrih_GP tglobaladdr:$global, u16ImmPred:$offset, (i32 IntRegs:$src1))>, + Requires<[NoV4T]>; // Map from store(globaladdress) -> memh(#foo). let AddedComplexity = 100 in -def : Pat <(truncstorei16 IntRegs:$src1, +def : Pat <(truncstorei16 (i32 IntRegs:$src1), (HexagonCONST32_GP tglobaladdr:$global)), - (STh_GP tglobaladdr:$global, IntRegs:$src1)>; + (STh_GP tglobaladdr:$global, (i32 IntRegs:$src1))>, + Requires<[NoV4T]>; // Map from store(globaladdress + x) -> memb(#foo + x). let AddedComplexity = 100 in -def : Pat <(truncstorei8 IntRegs:$src1, +def : Pat <(truncstorei8 (i32 IntRegs:$src1), (add (HexagonCONST32_GP tglobaladdr:$global), u16ImmPred:$offset)), - (STrib_GP tglobaladdr:$global, u16ImmPred:$offset, IntRegs:$src1)>; + (STrib_GP tglobaladdr:$global, u16ImmPred:$offset, (i32 IntRegs:$src1))>, + Requires<[NoV4T]>; // Map from store(globaladdress) -> memb(#foo). let AddedComplexity = 100 in -def : Pat <(truncstorei8 IntRegs:$src1, +def : Pat <(truncstorei8 (i32 IntRegs:$src1), (HexagonCONST32_GP tglobaladdr:$global)), - (STb_GP tglobaladdr:$global, IntRegs:$src1)>; + (STb_GP tglobaladdr:$global, (i32 IntRegs:$src1))>, + Requires<[NoV4T]>; // Map from load(globaladdress + x) -> memw(#foo + x). let AddedComplexity = 100 in -def : Pat <(load (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset)), - (LDriw_GP tglobaladdr:$global, u16ImmPred:$offset)>; +def : Pat <(i32 (load (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset))), + (i32 (LDriw_GP tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[NoV4T]>; -// Map from load(globaladdress) -> memw(#foo + 0). +// Map from load(globaladdress) -> memw(#foo). let AddedComplexity = 100 in -def : Pat <(load (HexagonCONST32_GP tglobaladdr:$global)), - (LDw_GP tglobaladdr:$global)>; +def : Pat <(i32 (load (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDw_GP tglobaladdr:$global))>, + Requires<[NoV4T]>; // Map from load(globaladdress + x) -> memd(#foo + x). let AddedComplexity = 100 in def : Pat <(i64 (load (add (HexagonCONST32_GP tglobaladdr:$global), u16ImmPred:$offset))), - (LDrid_GP tglobaladdr:$global, u16ImmPred:$offset)>; + (i64 (LDrid_GP tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[NoV4T]>; // Map from load(globaladdress) -> memw(#foo + 0). 
let AddedComplexity = 100 in def : Pat <(i64 (load (HexagonCONST32_GP tglobaladdr:$global))), - (LDd_GP tglobaladdr:$global)>; - + (i64 (LDd_GP tglobaladdr:$global))>, + Requires<[NoV4T]>; -// Map from Pd = load(globaladdress) -> Rd = memb(globaladdress + 0), Pd = Rd. +// Map from Pd = load(globaladdress) -> Rd = memb(globaladdress), Pd = Rd. let AddedComplexity = 100 in def : Pat <(i1 (load (HexagonCONST32_GP tglobaladdr:$global))), - (TFR_PdRs (LDrib_GP tglobaladdr:$global, 0))>; + (i1 (TFR_PdRs (i32 (LDb_GP tglobaladdr:$global))))>, + Requires<[NoV4T]>; // Map from load(globaladdress + x) -> memh(#foo + x). let AddedComplexity = 100 in -def : Pat <(sextloadi16 (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset)), - (LDrih_GP tglobaladdr:$global, u16ImmPred:$offset)>; +def : Pat <(i32 (extloadi16 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset))), + (i32 (LDrih_GP tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[NoV4T]>; -// Map from load(globaladdress) -> memh(#foo + 0). +// Map from load(globaladdress) -> memh(#foo). let AddedComplexity = 100 in -def : Pat <(sextloadi16 (HexagonCONST32_GP tglobaladdr:$global)), - (LDrih_GP tglobaladdr:$global, 0)>; +def : Pat <(i32 (sextloadi16 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDrih_GP tglobaladdr:$global, 0))>, + Requires<[NoV4T]>; // Map from load(globaladdress + x) -> memuh(#foo + x). let AddedComplexity = 100 in -def : Pat <(zextloadi16 (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset)), - (LDriuh_GP tglobaladdr:$global, u16ImmPred:$offset)>; +def : Pat <(i32 (zextloadi16 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset))), + (i32 (LDriuh_GP tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[NoV4T]>; -// Map from load(globaladdress) -> memuh(#foo + 0). +// Map from load(globaladdress) -> memuh(#foo). let AddedComplexity = 100 in -def : Pat <(zextloadi16 (HexagonCONST32_GP tglobaladdr:$global)), - (LDriuh_GP tglobaladdr:$global, 0)>; +def : Pat <(i32 (zextloadi16 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDriuh_GP tglobaladdr:$global, 0))>, + Requires<[NoV4T]>; -// Map from load(globaladdress + x) -> memuh(#foo + x). +// Map from load(globaladdress) -> memh(#foo). let AddedComplexity = 100 in -def : Pat <(extloadi16 (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset)), - (LDriuh_GP tglobaladdr:$global, u16ImmPred:$offset)>; +def : Pat <(i32 (sextloadi16 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDh_GP tglobaladdr:$global))>, + Requires<[NoV4T]>; -// Map from load(globaladdress) -> memuh(#foo + 0). -let AddedComplexity = 100 in -def : Pat <(extloadi16 (HexagonCONST32_GP tglobaladdr:$global)), - (LDriuh_GP tglobaladdr:$global, 0)>; -// Map from load(globaladdress + x) -> memub(#foo + x). +// Map from load(globaladdress) -> memuh(#foo). let AddedComplexity = 100 in -def : Pat <(zextloadi8 (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset)), - (LDriub_GP tglobaladdr:$global, u16ImmPred:$offset)>; +def : Pat <(i32 (zextloadi16 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDuh_GP tglobaladdr:$global))>, + Requires<[NoV4T]>; -// Map from load(globaladdress) -> memuh(#foo + 0). +// Map from load(globaladdress + x) -> memb(#foo + x). 
let AddedComplexity = 100 in -def : Pat <(zextloadi8 (HexagonCONST32_GP tglobaladdr:$global)), - (LDriub_GP tglobaladdr:$global, 0)>; +def : Pat <(i32 (extloadi8 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset))), + (i32 (LDrib_GP tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[NoV4T]>; // Map from load(globaladdress + x) -> memb(#foo + x). let AddedComplexity = 100 in -def : Pat <(sextloadi8 (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset)), - (LDrib_GP tglobaladdr:$global, u16ImmPred:$offset)>; +def : Pat <(i32 (sextloadi8 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset))), + (i32 (LDrib_GP tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[NoV4T]>; + +// Map from load(globaladdress + x) -> memub(#foo + x). +let AddedComplexity = 100 in +def : Pat <(i32 (zextloadi8 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset))), + (i32 (LDriub_GP tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[NoV4T]>; // Map from load(globaladdress) -> memb(#foo). let AddedComplexity = 100 in -def : Pat <(extloadi8 (HexagonCONST32_GP tglobaladdr:$global)), - (LDb_GP tglobaladdr:$global)>; +def : Pat <(i32 (extloadi8 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDb_GP tglobaladdr:$global))>, + Requires<[NoV4T]>; // Map from load(globaladdress) -> memb(#foo). let AddedComplexity = 100 in -def : Pat <(sextloadi8 (HexagonCONST32_GP tglobaladdr:$global)), - (LDb_GP tglobaladdr:$global)>; +def : Pat <(i32 (sextloadi8 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDb_GP tglobaladdr:$global))>, + Requires<[NoV4T]>; // Map from load(globaladdress) -> memub(#foo). let AddedComplexity = 100 in -def : Pat <(zextloadi8 (HexagonCONST32_GP tglobaladdr:$global)), - (LDub_GP tglobaladdr:$global)>; +def : Pat <(i32 (zextloadi8 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDub_GP tglobaladdr:$global))>, + Requires<[NoV4T]>; // When the Interprocedural Global Variable optimizer realizes that a // certain global variable takes only two constant values, it shrinks the // global to a boolean. Catch those loads here in the following 3 patterns. let AddedComplexity = 100 in -def : Pat <(extloadi1 (HexagonCONST32_GP tglobaladdr:$global)), - (LDb_GP tglobaladdr:$global)>; - -let AddedComplexity = 100 in -def : Pat <(sextloadi1 (HexagonCONST32_GP tglobaladdr:$global)), - (LDb_GP tglobaladdr:$global)>; +def : Pat <(i32 (extloadi1 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDb_GP tglobaladdr:$global))>, + Requires<[NoV4T]>; let AddedComplexity = 100 in -def : Pat <(zextloadi1 (HexagonCONST32_GP tglobaladdr:$global)), - (LDub_GP tglobaladdr:$global)>; +def : Pat <(i32 (sextloadi1 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDb_GP tglobaladdr:$global))>, + Requires<[NoV4T]>; -// Map from load(globaladdress) -> memh(#foo). -let AddedComplexity = 100 in -def : Pat <(extloadi16 (HexagonCONST32_GP tglobaladdr:$global)), - (LDh_GP tglobaladdr:$global)>; - -// Map from load(globaladdress) -> memh(#foo). -let AddedComplexity = 100 in -def : Pat <(sextloadi16 (HexagonCONST32_GP tglobaladdr:$global)), - (LDh_GP tglobaladdr:$global)>; - -// Map from load(globaladdress) -> memuh(#foo). let AddedComplexity = 100 in -def : Pat <(zextloadi16 (HexagonCONST32_GP tglobaladdr:$global)), - (LDuh_GP tglobaladdr:$global)>; +def : Pat <(i32 (zextloadi1 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDub_GP tglobaladdr:$global))>, + Requires<[NoV4T]>; // Map from i1 loads to 32 bits. This assumes that the i1* is byte aligned. 
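// For example (illustrative memory contents): if the byte at the i1* holds 0x01, memb
// sign-extends it to 0x00000001 and the and(#1) below leaves 1; even if some other code
// stored a wider nonzero byte such as 0xFF (loaded as 0xFFFFFFFF), the and(#1) still
// reduces it to the single low bit.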
def : Pat <(i32 (zextloadi1 ADDRriS11_0:$addr)), - (AND_rr (LDrib ADDRriS11_0:$addr), (TFRI 0x1))>; + (i32 (AND_rr (i32 (LDrib ADDRriS11_0:$addr)), (TFRI 0x1)))>; // Map from Rdd = sign_extend_inreg(Rss, i32) -> Rdd = SXTW(Rss.lo). -def : Pat <(i64 (sext_inreg DoubleRegs:$src1, i32)), - (i64 (SXTW (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg)))>; +def : Pat <(i64 (sext_inreg (i64 DoubleRegs:$src1), i32)), + (i64 (SXTW (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg))))>; // Map from Rdd = sign_extend_inreg(Rss, i16) -> Rdd = SXTW(SXTH(Rss.lo)). -def : Pat <(i64 (sext_inreg DoubleRegs:$src1, i16)), - (i64 (SXTW (SXTH (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg))))>; +def : Pat <(i64 (sext_inreg (i64 DoubleRegs:$src1), i16)), + (i64 (SXTW (i32 (SXTH (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), + subreg_loreg))))))>; // Map from Rdd = sign_extend_inreg(Rss, i8) -> Rdd = SXTW(SXTB(Rss.lo)). -def : Pat <(i64 (sext_inreg DoubleRegs:$src1, i8)), - (i64 (SXTW (SXTB (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg))))>; +def : Pat <(i64 (sext_inreg (i64 DoubleRegs:$src1), i8)), + (i64 (SXTW (i32 (SXTB (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), + subreg_loreg))))))>; -// We want to prevent emiting pnot's as much as possible. +// We want to prevent emitting pnot's as much as possible. // Map brcond with an unsupported setcc to a JMP_cNot. -def : Pat <(brcond (i1 (setne IntRegs:$src1, IntRegs:$src2)), bb:$offset), - (JMP_cNot (CMPEQrr IntRegs:$src1, IntRegs:$src2), bb:$offset)>; +def : Pat <(brcond (i1 (setne (i32 IntRegs:$src1), (i32 IntRegs:$src2))), + bb:$offset), + (JMP_cNot (CMPEQrr (i32 IntRegs:$src1), (i32 IntRegs:$src2)), + bb:$offset)>; -def : Pat <(brcond (i1 (setne IntRegs:$src1, s10ImmPred:$src2)), bb:$offset), - (JMP_cNot (CMPEQri IntRegs:$src1, s10ImmPred:$src2), bb:$offset)>; +def : Pat <(brcond (i1 (setne (i32 IntRegs:$src1), s10ImmPred:$src2)), + bb:$offset), + (JMP_cNot (CMPEQri (i32 IntRegs:$src1), s10ImmPred:$src2), bb:$offset)>; -def : Pat <(brcond (i1 (setne PredRegs:$src1, (i1 -1))), bb:$offset), - (JMP_cNot PredRegs:$src1, bb:$offset)>; +def : Pat <(brcond (i1 (setne (i1 PredRegs:$src1), (i1 -1))), bb:$offset), + (JMP_cNot (i1 PredRegs:$src1), bb:$offset)>; -def : Pat <(brcond (i1 (setne PredRegs:$src1, (i1 0))), bb:$offset), - (JMP_c PredRegs:$src1, bb:$offset)>; +def : Pat <(brcond (i1 (setne (i1 PredRegs:$src1), (i1 0))), bb:$offset), + (JMP_c (i1 PredRegs:$src1), bb:$offset)>; -def : Pat <(brcond (i1 (setlt IntRegs:$src1, s8ImmPred:$src2)), bb:$offset), - (JMP_cNot (CMPGEri IntRegs:$src1, s8ImmPred:$src2), bb:$offset)>; +def : Pat <(brcond (i1 (setlt (i32 IntRegs:$src1), s8ImmPred:$src2)), + bb:$offset), + (JMP_cNot (CMPGEri (i32 IntRegs:$src1), s8ImmPred:$src2), bb:$offset)>; -def : Pat <(brcond (i1 (setlt IntRegs:$src1, IntRegs:$src2)), bb:$offset), - (JMP_c (CMPLTrr IntRegs:$src1, IntRegs:$src2), bb:$offset)>; +def : Pat <(brcond (i1 (setlt (i32 IntRegs:$src1), (i32 IntRegs:$src2))), + bb:$offset), + (JMP_c (CMPLTrr (i32 IntRegs:$src1), (i32 IntRegs:$src2)), bb:$offset)>; -def : Pat <(brcond (i1 (setuge DoubleRegs:$src1, DoubleRegs:$src2)), +def : Pat <(brcond (i1 (setuge (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))), bb:$offset), - (JMP_cNot (CMPGTU64rr DoubleRegs:$src2, DoubleRegs:$src1), + (JMP_cNot (CMPGTU64rr (i64 DoubleRegs:$src2), (i64 DoubleRegs:$src1)), bb:$offset)>; -def : Pat <(brcond (i1 (setule IntRegs:$src1, IntRegs:$src2)), bb:$offset), - (JMP_cNot (CMPGTUrr IntRegs:$src1, IntRegs:$src2), bb:$offset)>; +def : Pat <(brcond (i1 (setule (i32 
IntRegs:$src1), (i32 IntRegs:$src2))), + bb:$offset), + (JMP_cNot (CMPGTUrr (i32 IntRegs:$src1), (i32 IntRegs:$src2)), + bb:$offset)>; -def : Pat <(brcond (i1 (setule DoubleRegs:$src1, DoubleRegs:$src2)), +def : Pat <(brcond (i1 (setule (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))), bb:$offset), - (JMP_cNot (CMPGTU64rr DoubleRegs:$src1, DoubleRegs:$src2), - bb:$offset)>; + (JMP_cNot (CMPGTU64rr (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2)), + bb:$offset)>; // Map from a 64-bit select to an emulated 64-bit mux. // Hexagon does not support 64-bit MUXes; so emulate with combines. -def : Pat <(select PredRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), - (COMBINE_rr - (MUX_rr PredRegs:$src1, - (EXTRACT_SUBREG DoubleRegs:$src2, subreg_hireg), - (EXTRACT_SUBREG DoubleRegs:$src3, subreg_hireg)), - (MUX_rr PredRegs:$src1, - (EXTRACT_SUBREG DoubleRegs:$src2, subreg_loreg), - (EXTRACT_SUBREG DoubleRegs:$src3, subreg_loreg)))>; +def : Pat <(select (i1 PredRegs:$src1), (i64 DoubleRegs:$src2), + (i64 DoubleRegs:$src3)), + (i64 (COMBINE_rr (i32 (MUX_rr (i1 PredRegs:$src1), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), + subreg_hireg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src3), + subreg_hireg)))), + (i32 (MUX_rr (i1 PredRegs:$src1), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), + subreg_loreg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src3), + subreg_loreg))))))>; // Map from a 1-bit select to logical ops. // From LegalizeDAG.cpp: (B1 ? B2 : B3) <=> (B1 & B2)|(!B1&B3). -def : Pat <(select PredRegs:$src1, PredRegs:$src2, PredRegs:$src3), - (OR_pp (AND_pp PredRegs:$src1, PredRegs:$src2), - (AND_pp (NOT_p PredRegs:$src1), PredRegs:$src3))>; +def : Pat <(select (i1 PredRegs:$src1), (i1 PredRegs:$src2), + (i1 PredRegs:$src3)), + (OR_pp (AND_pp (i1 PredRegs:$src1), (i1 PredRegs:$src2)), + (AND_pp (NOT_p (i1 PredRegs:$src1)), (i1 PredRegs:$src3)))>; // Map Pd = load(addr) -> Rs = load(addr); Pd = Rs. def : Pat<(i1 (load ADDRriS11_2:$addr)), (i1 (TFR_PdRs (i32 (LDrib ADDRriS11_2:$addr))))>; // Map for truncating from 64 immediates to 32 bit immediates. -def : Pat<(i32 (trunc DoubleRegs:$src)), - (i32 (EXTRACT_SUBREG DoubleRegs:$src, subreg_loreg))>; +def : Pat<(i32 (trunc (i64 DoubleRegs:$src))), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src), subreg_loreg))>; // Map for truncating from i64 immediates to i1 bit immediates. -def : Pat<(i1 (trunc DoubleRegs:$src)), - (i1 (TFR_PdRs (i32(EXTRACT_SUBREG DoubleRegs:$src, subreg_loreg))))>; +def : Pat<(i1 (trunc (i64 DoubleRegs:$src))), + (i1 (TFR_PdRs (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src), + subreg_loreg))))>; // Map memb(Rs) = Rdd -> memb(Rs) = Rt. -def : Pat<(truncstorei8 DoubleRegs:$src, ADDRriS11_0:$addr), - (STrib ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG DoubleRegs:$src, +def : Pat<(truncstorei8 (i64 DoubleRegs:$src), ADDRriS11_0:$addr), + (STrib ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src), subreg_loreg)))>; // Map memh(Rs) = Rdd -> memh(Rs) = Rt. -def : Pat<(truncstorei16 DoubleRegs:$src, ADDRriS11_0:$addr), - (STrih ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG DoubleRegs:$src, +def : Pat<(truncstorei16 (i64 DoubleRegs:$src), ADDRriS11_0:$addr), + (STrih ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src), + subreg_loreg)))>; +// Map memw(Rs) = Rdd -> memw(Rs) = Rt +def : Pat<(truncstorei32 (i64 DoubleRegs:$src), ADDRriS11_0:$addr), + (STriw ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src), subreg_loreg)))>; // Map memw(Rs) = Rdd -> memw(Rs) = Rt. 
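[Editorial aside] The truncating-store mappings here all make the same move: storing the low 8/16/32 bits of a 64-bit pair is exactly a store of the loreg subregister, which is why every pattern funnels through (EXTRACT_SUBREG ..., subreg_loreg). A minimal C++ model of the i32 case (the function name is illustrative, not from the source):

    #include <cstdint>
    #include <cstring>

    // "memw(Rs) = Rdd": truncstorei32 keeps only the low 32 bits,
    // i.e. the loreg half of the 64-bit register pair.
    void truncstore_i32(uint64_t rdd, uint32_t *addr) {
      uint32_t lo = (uint32_t)rdd;        // EXTRACT_SUBREG ..., subreg_loreg
      std::memcpy(addr, &lo, sizeof lo);
    }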
-def : Pat<(truncstorei32 DoubleRegs:$src, ADDRriS11_0:$addr), - (STriw ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG DoubleRegs:$src, +def : Pat<(truncstorei32 (i64 DoubleRegs:$src), ADDRriS11_0:$addr), + (STriw ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src), subreg_loreg)))>; // Map from i1 = constant<-1>; memw(addr) = i1 -> r0 = 1; memw(addr) = r0. @@ -2763,118 +3131,134 @@ let AddedComplexity = 100 in // Map from i1 = constant<-1>; memw(CONST32(#foo)) = i1 -> r0 = 1; // memw(#foo) = r0 def : Pat<(store (i1 -1), (HexagonCONST32_GP tglobaladdr:$global)), - (STb_GP tglobaladdr:$global, (TFRI 1))>; - + (STb_GP tglobaladdr:$global, (TFRI 1))>, + Requires<[NoV4T]>; // Map from i1 = constant<-1>; store i1 -> r0 = 1; store r0. def : Pat<(store (i1 -1), ADDRriS11_2:$addr), (STrib ADDRriS11_2:$addr, (TFRI 1))>; // Map from memb(Rs) = Pd -> Rt = mux(Pd, #0, #1); store Rt. -def : Pat<(store PredRegs:$src1, ADDRriS11_2:$addr), - (STrib ADDRriS11_2:$addr, (i32 (MUX_ii PredRegs:$src1, 1, 0)) )>; +def : Pat<(store (i1 PredRegs:$src1), ADDRriS11_2:$addr), + (STrib ADDRriS11_2:$addr, (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0)) )>; // Map Rdd = anyext(Rs) -> Rdd = sxtw(Rs). // Hexagon_TODO: We can probably use combine but that will cost 2 instructions. // Better way to do this? -def : Pat<(i64 (anyext IntRegs:$src1)), - (i64 (SXTW IntRegs:$src1))>; +def : Pat<(i64 (anyext (i32 IntRegs:$src1))), + (i64 (SXTW (i32 IntRegs:$src1)))>; // Map cmple -> cmpgt. // rs <= rt -> !(rs > rt). -def : Pat<(i1 (setle IntRegs:$src1, s10ImmPred:$src2)), - (i1 (NOT_p (CMPGTri IntRegs:$src1, s10ImmPred:$src2)))>; +def : Pat<(i1 (setle (i32 IntRegs:$src1), s10ImmPred:$src2)), + (i1 (NOT_p (CMPGTri (i32 IntRegs:$src1), s10ImmPred:$src2)))>; // rs <= rt -> !(rs > rt). -def : Pat<(i1 (setle IntRegs:$src1, IntRegs:$src2)), - (i1 (NOT_p (CMPGTrr IntRegs:$src1, IntRegs:$src2)))>; +def : Pat<(i1 (setle (i32 IntRegs:$src1), (i32 IntRegs:$src2))), + (i1 (NOT_p (CMPGTrr (i32 IntRegs:$src1), (i32 IntRegs:$src2))))>; // Rss <= Rtt -> !(Rss > Rtt). -def : Pat<(i1 (setle DoubleRegs:$src1, DoubleRegs:$src2)), - (i1 (NOT_p (CMPGT64rr DoubleRegs:$src1, DoubleRegs:$src2)))>; +def : Pat<(i1 (setle (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))), + (i1 (NOT_p (CMPGT64rr (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))))>; // Map cmpne -> cmpeq. // Hexagon_TODO: We should improve on this. // rs != rt -> !(rs == rt). -def : Pat <(i1 (setne IntRegs:$src1, s10ImmPred:$src2)), - (i1 (NOT_p(i1 (CMPEQri IntRegs:$src1, s10ImmPred:$src2))))>; +def : Pat <(i1 (setne (i32 IntRegs:$src1), s10ImmPred:$src2)), + (i1 (NOT_p(i1 (CMPEQri (i32 IntRegs:$src1), s10ImmPred:$src2))))>; // Map cmpne(Rs) -> !cmpeqe(Rs). // rs != rt -> !(rs == rt). -def : Pat <(i1 (setne IntRegs:$src1, IntRegs:$src2)), - (i1 (NOT_p(i1 (CMPEQrr IntRegs:$src1, IntRegs:$src2))))>; +def : Pat <(i1 (setne (i32 IntRegs:$src1), (i32 IntRegs:$src2))), + (i1 (NOT_p (i1 (CMPEQrr (i32 IntRegs:$src1), (i32 IntRegs:$src2)))))>; // Convert setne back to xor for hexagon since we compute w/ pred registers. -def : Pat <(i1 (setne PredRegs:$src1, PredRegs:$src2)), - (i1 (XOR_pp PredRegs:$src1, PredRegs:$src2))>; +def : Pat <(i1 (setne (i1 PredRegs:$src1), (i1 PredRegs:$src2))), + (i1 (XOR_pp (i1 PredRegs:$src1), (i1 PredRegs:$src2)))>; // Map cmpne(Rss) -> !cmpew(Rss). // rs != rt -> !(rs == rt). 
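[Editorial aside] The cmple and cmpne rewrites above all rest on two identities, x <= y iff !(x > y) and x != y iff !(x == y), which is what lets the selector get by with CMPGT*/CMPEQ* plus a NOT_p on the predicate; for i1 operands, setne degenerates to XOR. A quick self-contained check (illustrative only):

    #include <cassert>

    bool le_via_gt(long long x, long long y) { return !(x > y); }   // setle
    bool ne_via_eq(long long x, long long y) { return !(x == y); }  // setne

    int main() {
      for (long long x = -2; x <= 2; ++x)
        for (long long y = -2; y <= 2; ++y) {
          assert(le_via_gt(x, y) == (x <= y));
          assert(ne_via_eq(x, y) == (x != y));
        }
      // setne on i1 is XOR_pp: p1 != p2 iff p1 ^ p2.
      for (int a = 0; a <= 1; ++a)
        for (int b = 0; b <= 1; ++b)
          assert(((a ^ b) != 0) == (a != b));
    }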
-def : Pat <(i1 (setne DoubleRegs:$src1, DoubleRegs:$src2)), - (i1 (NOT_p(i1 (CMPEHexagon4rr DoubleRegs:$src1, DoubleRegs:$src2))))>; +def : Pat <(i1 (setne (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))), + (i1 (NOT_p (i1 (CMPEHexagon4rr (i64 DoubleRegs:$src1), + (i64 DoubleRegs:$src2)))))>; // Map cmpge(Rs, Rt) -> !(cmpgt(Rs, Rt). // rs >= rt -> !(rt > rs). -def : Pat <(i1 (setge IntRegs:$src1, IntRegs:$src2)), - (i1 (NOT_p(i1 (CMPGTrr IntRegs:$src2, IntRegs:$src1))))>; +def : Pat <(i1 (setge (i32 IntRegs:$src1), (i32 IntRegs:$src2))), + (i1 (NOT_p (i1 (CMPGTrr (i32 IntRegs:$src2), (i32 IntRegs:$src1)))))>; -def : Pat <(i1 (setge IntRegs:$src1, s8ImmPred:$src2)), - (i1 (CMPGEri IntRegs:$src1, s8ImmPred:$src2))>; +def : Pat <(i1 (setge (i32 IntRegs:$src1), s8ImmPred:$src2)), + (i1 (CMPGEri (i32 IntRegs:$src1), s8ImmPred:$src2))>; // Map cmpge(Rss, Rtt) -> !cmpgt(Rtt, Rss). // rss >= rtt -> !(rtt > rss). -def : Pat <(i1 (setge DoubleRegs:$src1, DoubleRegs:$src2)), - (i1 (NOT_p(i1 (CMPGT64rr DoubleRegs:$src2, DoubleRegs:$src1))))>; +def : Pat <(i1 (setge (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))), + (i1 (NOT_p (i1 (CMPGT64rr (i64 DoubleRegs:$src2), + (i64 DoubleRegs:$src1)))))>; // Map cmplt(Rs, Imm) -> !cmpge(Rs, Imm). // rs < rt -> !(rs >= rt). -def : Pat <(i1 (setlt IntRegs:$src1, s8ImmPred:$src2)), - (i1 (NOT_p (CMPGEri IntRegs:$src1, s8ImmPred:$src2)))>; +def : Pat <(i1 (setlt (i32 IntRegs:$src1), s8ImmPred:$src2)), + (i1 (NOT_p (CMPGEri (i32 IntRegs:$src1), s8ImmPred:$src2)))>; -// Map cmplt(Rs, Rt) -> cmplt(Rs, Rt). -// rs < rt -> rs < rt. Let assembler map it. -def : Pat <(i1 (setlt IntRegs:$src1, IntRegs:$src2)), - (i1 (CMPLTrr IntRegs:$src2, IntRegs:$src1))>; +// Map cmplt(Rs, Rt) -> cmpgt(Rt, Rs). +// rs < rt -> rt > rs. +// We can let assembler map it, or we can do in the compiler itself. +def : Pat <(i1 (setlt (i32 IntRegs:$src1), (i32 IntRegs:$src2))), + (i1 (CMPGTrr (i32 IntRegs:$src2), (i32 IntRegs:$src1)))>; // Map cmplt(Rss, Rtt) -> cmpgt(Rtt, Rss). // rss < rtt -> (rtt > rss). -def : Pat <(i1 (setlt DoubleRegs:$src1, DoubleRegs:$src2)), - (i1 (CMPGT64rr DoubleRegs:$src2, DoubleRegs:$src1))>; +def : Pat <(i1 (setlt (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))), + (i1 (CMPGT64rr (i64 DoubleRegs:$src2), (i64 DoubleRegs:$src1)))>; -// Map from cmpltu(Rs, Rd) -> !cmpgtu(Rs, Rd - 1). +// Map from cmpltu(Rs, Rd) -> cmpgtu(Rd, Rs) // rs < rt -> rt > rs. -def : Pat <(i1 (setult IntRegs:$src1, IntRegs:$src2)), - (i1 (CMPGTUrr IntRegs:$src2, IntRegs:$src1))>; +// We can let assembler map it, or we can do in the compiler itself. +def : Pat <(i1 (setult (i32 IntRegs:$src1), (i32 IntRegs:$src2))), + (i1 (CMPGTUrr (i32 IntRegs:$src2), (i32 IntRegs:$src1)))>; -// Map from cmpltu(Rss, Rdd) -> !cmpgtu(Rss, Rdd - 1). +// Map from cmpltu(Rss, Rdd) -> cmpgtu(Rdd, Rss). // rs < rt -> rt > rs. -def : Pat <(i1 (setult DoubleRegs:$src1, DoubleRegs:$src2)), - (i1 (CMPGTU64rr DoubleRegs:$src2, DoubleRegs:$src1))>; +def : Pat <(i1 (setult (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))), + (i1 (CMPGTU64rr (i64 DoubleRegs:$src2), (i64 DoubleRegs:$src1)))>; + +// Generate cmpgeu(Rs, #u8) +def : Pat <(i1 (setuge (i32 IntRegs:$src1), u8ImmPred:$src2)), + (i1 (CMPGEUri (i32 IntRegs:$src1), u8ImmPred:$src2))>; + +// Generate cmpgtu(Rs, #u9) +def : Pat <(i1 (setugt (i32 IntRegs:$src1), u9ImmPred:$src2)), + (i1 (CMPGTUri (i32 IntRegs:$src1), u9ImmPred:$src2))>; // Map from Rs >= Rt -> !(Rt > Rs). // rs >= rt -> !(rt > rs). 
-def : Pat <(i1 (setuge IntRegs:$src1, IntRegs:$src2)), - (i1 (NOT_p (CMPGTUrr IntRegs:$src2, IntRegs:$src1)))>; +def : Pat <(i1 (setuge (i32 IntRegs:$src1), (i32 IntRegs:$src2))), + (i1 (NOT_p (CMPGTUrr (i32 IntRegs:$src2), (i32 IntRegs:$src1))))>; // Map from Rs >= Rt -> !(Rt > Rs). // rs >= rt -> !(rt > rs). -def : Pat <(i1 (setuge DoubleRegs:$src1, DoubleRegs:$src2)), - (i1 (NOT_p (CMPGTU64rr DoubleRegs:$src2, DoubleRegs:$src1)))>; +def : Pat <(i1 (setuge (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))), + (i1 (NOT_p (CMPGTU64rr (i64 DoubleRegs:$src2), (i64 DoubleRegs:$src1))))>; // Map from cmpleu(Rs, Rs) -> !cmpgtu(Rs, Rs). // Map from (Rs <= Rt) -> !(Rs > Rt). -def : Pat <(i1 (setule IntRegs:$src1, IntRegs:$src2)), - (i1 (NOT_p (CMPGTUrr IntRegs:$src1, IntRegs:$src2)))>; +def : Pat <(i1 (setule (i32 IntRegs:$src1), (i32 IntRegs:$src2))), + (i1 (NOT_p (CMPGTUrr (i32 IntRegs:$src1), (i32 IntRegs:$src2))))>; // Map from cmpleu(Rss, Rtt) -> !cmpgtu(Rss, Rtt-1). // Map from (Rs <= Rt) -> !(Rs > Rt). -def : Pat <(i1 (setule DoubleRegs:$src1, DoubleRegs:$src2)), - (i1 (NOT_p (CMPGTU64rr DoubleRegs:$src1, DoubleRegs:$src2)))>; +def : Pat <(i1 (setule (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))), + (i1 (NOT_p (CMPGTU64rr (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))))>; // Sign extends. // i1 -> i32 -def : Pat <(i32 (sext PredRegs:$src1)), - (i32 (MUX_ii PredRegs:$src1, -1, 0))>; +def : Pat <(i32 (sext (i1 PredRegs:$src1))), + (i32 (MUX_ii (i1 PredRegs:$src1), -1, 0))>; + +// i1 -> i64 +def : Pat <(i64 (sext (i1 PredRegs:$src1))), + (i64 (COMBINE_rr (TFRI -1), (MUX_ii (i1 PredRegs:$src1), -1, 0)))>; // Convert sign-extended load back to load and sign extend. // i8 -> i64 @@ -2899,16 +3283,16 @@ def: Pat <(i64 (sextloadi32 ADDRriS11_2:$src1)), // Zero extends. // i1 -> i32 -def : Pat <(i32 (zext PredRegs:$src1)), - (i32 (MUX_ii PredRegs:$src1, 1, 0))>; +def : Pat <(i32 (zext (i1 PredRegs:$src1))), + (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0))>; // i1 -> i64 -def : Pat <(i64 (zext PredRegs:$src1)), - (i64 (COMBINE_rr (TFRI 0), (MUX_ii PredRegs:$src1, 1, 0)))>; +def : Pat <(i64 (zext (i1 PredRegs:$src1))), + (i64 (COMBINE_rr (TFRI 0), (MUX_ii (i1 PredRegs:$src1), 1, 0)))>; // i32 -> i64 -def : Pat <(i64 (zext IntRegs:$src1)), - (i64 (COMBINE_rr (TFRI 0), IntRegs:$src1))>; +def : Pat <(i64 (zext (i32 IntRegs:$src1))), + (i64 (COMBINE_rr (TFRI 0), (i32 IntRegs:$src1)))>; // i8 -> i64 def: Pat <(i64 (zextloadi8 ADDRriS11_0:$src1)), @@ -2926,16 +3310,16 @@ def: Pat <(i32 (zextloadi1 ADDRriS11_0:$src1)), (i32 (LDriw ADDRriS11_0:$src1))>; // Map from Rs = Pd to Pd = mux(Pd, #1, #0) -def : Pat <(i32 (zext PredRegs:$src1)), - (i32 (MUX_ii PredRegs:$src1, 1, 0))>; +def : Pat <(i32 (zext (i1 PredRegs:$src1))), + (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0))>; // Map from Rs = Pd to Pd = mux(Pd, #1, #0) -def : Pat <(i32 (anyext PredRegs:$src1)), - (i32 (MUX_ii PredRegs:$src1, 1, 0))>; +def : Pat <(i32 (anyext (i1 PredRegs:$src1))), + (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0))>; // Map from Rss = Pd to Rdd = sxtw (mux(Pd, #1, #0)) -def : Pat <(i64 (anyext PredRegs:$src1)), - (i64 (SXTW (i32 (MUX_ii PredRegs:$src1, 1, 0))))>; +def : Pat <(i64 (anyext (i1 PredRegs:$src1))), + (i64 (SXTW (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0))))>; // Any extended 64-bit load. @@ -2948,75 +3332,103 @@ def: Pat <(i64 (extloadi16 ADDRriS11_2:$src1)), (i64 (COMBINE_rr (TFRI 0), (LDrih ADDRriS11_2:$src1)))>; // Map from Rdd = zxtw(Rs) -> Rdd = combine(0, Rs). 
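[Editorial aside] combine(0, Rs) builds the 64-bit pair from a zero high word and the 32-bit source, which is precisely zero extension; the i1 cases instead materialize -1/0 (sext) or 1/0 (zext) through a mux. The semantics in plain C++ (a sketch, not target code):

    #include <cstdint>

    // zxtw: Rdd = combine(0, Rs) -- high word zero, low word Rs.
    uint64_t zxtw(uint32_t rs) { return ((uint64_t)0 << 32) | rs; }

    // i1 extension via mux: sext yields all-ones for true, zext yields 1.
    int32_t sext_i1(bool p) { return p ? -1 : 0; }   // MUX_ii(p, -1, 0)
    int32_t zext_i1(bool p) { return p ?  1 : 0; }   // MUX_ii(p,  1, 0)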
-def : Pat<(i64 (zext IntRegs:$src1)), - (i64 (COMBINE_rr (TFRI 0), IntRegs:$src1))>; +def : Pat<(i64 (zext (i32 IntRegs:$src1))), + (i64 (COMBINE_rr (TFRI 0), (i32 IntRegs:$src1)))>; // Multiply 64-bit unsigned and use upper result. -def : Pat <(mulhu DoubleRegs:$src1, DoubleRegs:$src2), - (MPYU64_acc(COMBINE_rr (TFRI 0), - (EXTRACT_SUBREG - (LSRd_ri(MPYU64_acc(MPYU64_acc(COMBINE_rr (TFRI 0), - (EXTRACT_SUBREG (LSRd_ri(MPYU64 - (EXTRACT_SUBREG DoubleRegs:$src1, - subreg_loreg), - (EXTRACT_SUBREG DoubleRegs:$src2, - subreg_loreg)), - 32) ,subreg_loreg)), - (EXTRACT_SUBREG DoubleRegs:$src1, - subreg_hireg), - (EXTRACT_SUBREG DoubleRegs:$src2, - subreg_loreg)), - (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg), - (EXTRACT_SUBREG DoubleRegs:$src2, subreg_hireg)), - 32),subreg_loreg)), - (EXTRACT_SUBREG DoubleRegs:$src1, subreg_hireg), - (EXTRACT_SUBREG DoubleRegs:$src2, subreg_hireg) - )>; +def : Pat <(mulhu (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2)), + (i64 + (MPYU64_acc + (i64 + (COMBINE_rr + (TFRI 0), + (i32 + (EXTRACT_SUBREG + (i64 + (LSRd_ri + (i64 + (MPYU64_acc + (i64 + (MPYU64_acc + (i64 + (COMBINE_rr (TFRI 0), + (i32 + (EXTRACT_SUBREG + (i64 + (LSRd_ri + (i64 + (MPYU64 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), + subreg_loreg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), + subreg_loreg)))), 32)), + subreg_loreg)))), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_hireg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_loreg)))), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_hireg)))), + 32)), subreg_loreg)))), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_hireg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_hireg))))>; // Multiply 64-bit signed and use upper result. -def : Pat <(mulhs DoubleRegs:$src1, DoubleRegs:$src2), - (MPY64_acc(COMBINE_rr (TFRI 0), - (EXTRACT_SUBREG - (LSRd_ri(MPY64_acc(MPY64_acc(COMBINE_rr (TFRI 0), - (EXTRACT_SUBREG (LSRd_ri(MPYU64 - (EXTRACT_SUBREG DoubleRegs:$src1, - subreg_loreg), - (EXTRACT_SUBREG DoubleRegs:$src2, - subreg_loreg)), - 32) ,subreg_loreg)), - (EXTRACT_SUBREG DoubleRegs:$src1, - subreg_hireg), - (EXTRACT_SUBREG DoubleRegs:$src2, - subreg_loreg)), - (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg), - (EXTRACT_SUBREG DoubleRegs:$src2, subreg_hireg)), - 32),subreg_loreg)), - (EXTRACT_SUBREG DoubleRegs:$src1, subreg_hireg), - (EXTRACT_SUBREG DoubleRegs:$src2, subreg_hireg) - )>; +def : Pat <(mulhs (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2)), + (i64 + (MPY64_acc + (i64 + (COMBINE_rr (TFRI 0), + (i32 + (EXTRACT_SUBREG + (i64 + (LSRd_ri + (i64 + (MPY64_acc + (i64 + (MPY64_acc + (i64 + (COMBINE_rr (TFRI 0), + (i32 + (EXTRACT_SUBREG + (i64 + (LSRd_ri + (i64 + (MPYU64 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), + subreg_loreg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), + subreg_loreg)))), 32)), + subreg_loreg)))), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_hireg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_loreg)))), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_hireg)))), + 32)), subreg_loreg)))), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_hireg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_hireg))))>; // Hexagon specific ISD nodes. 
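[Editorial aside] Before the target-specific nodes, the mulhu/mulhs expansions above are worth unpacking: they rebuild the upper half of a 128-bit product out of 32-bit partial products (MPYU64 on the lo/hi halves, shifted by 32 and accumulated). Below is a portable model of the same schoolbook decomposition with the carries handled explicitly; it is the standard algorithm, not a literal transcription of the pattern:

    #include <cassert>
    #include <cstdint>

    // High 64 bits of a 64x64 unsigned multiply, from 32-bit halves.
    uint64_t mulhu64(uint64_t a, uint64_t b) {
      uint64_t a0 = (uint32_t)a, a1 = a >> 32;
      uint64_t b0 = (uint32_t)b, b1 = b >> 32;
      uint64_t p00  = a0 * b0;
      uint64_t mid1 = a1 * b0 + (p00 >> 32);     // cannot overflow 64 bits
      uint64_t mid2 = a0 * b1 + (uint32_t)mid1;  // cannot overflow 64 bits
      return a1 * b1 + (mid1 >> 32) + (mid2 >> 32);
    }

    int main() {
      uint64_t a = 0xDEADBEEFCAFEBABEull, b = ~0ull;
      assert(mulhu64(a, b) ==
             (uint64_t)(((unsigned __int128)a * b) >> 64));  // GCC/Clang only
    }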
-def SDTHexagonADJDYNALLOC : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>]>; +//def SDTHexagonADJDYNALLOC : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>]>; +def SDTHexagonADJDYNALLOC : SDTypeProfile<1, 2, + [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; def Hexagon_ADJDYNALLOC : SDNode<"HexagonISD::ADJDYNALLOC", - SDTHexagonADJDYNALLOC>; + SDTHexagonADJDYNALLOC>; // Needed to tag these instructions for stack layout. let usesCustomInserter = 1 in def ADJDYNALLOC : ALU32_ri<(outs IntRegs:$dst), (ins IntRegs:$src1, s16Imm:$src2), "$dst = add($src1, #$src2)", - [(set IntRegs:$dst, (Hexagon_ADJDYNALLOC IntRegs:$src1, - s16ImmPred:$src2))]>; + [(set (i32 IntRegs:$dst), + (Hexagon_ADJDYNALLOC (i32 IntRegs:$src1), + s16ImmPred:$src2))]>; -def SDTHexagonARGEXTEND : SDTypeProfile<1, 1, []>; +def SDTHexagonARGEXTEND : SDTypeProfile<1, 1, [SDTCisVT<0, i32>]>; def Hexagon_ARGEXTEND : SDNode<"HexagonISD::ARGEXTEND", SDTHexagonARGEXTEND>; def ARGEXTEND : ALU32_rr <(outs IntRegs:$dst), (ins IntRegs:$src1), "$dst = $src1", - [(set IntRegs:$dst, (Hexagon_ARGEXTEND IntRegs:$src1))]>; + [(set (i32 IntRegs:$dst), + (Hexagon_ARGEXTEND (i32 IntRegs:$src1)))]>; let AddedComplexity = 100 in -def : Pat<(i32 (sext_inreg (Hexagon_ARGEXTEND IntRegs:$src1), i16)), - (TFR IntRegs:$src1)>; - +def : Pat<(i32 (sext_inreg (Hexagon_ARGEXTEND (i32 IntRegs:$src1)), i16)), + (COPY (i32 IntRegs:$src1))>; def SDHexagonBR_JT: SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>; def HexagonBR_JT: SDNode<"HexagonISD::BR_JT", SDHexagonBR_JT, [SDNPHasChain]>; @@ -3024,12 +3436,91 @@ def HexagonBR_JT: SDNode<"HexagonISD::BR_JT", SDHexagonBR_JT, [SDNPHasChain]>; let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in def BR_JT : JRInst<(outs), (ins IntRegs:$src), "jumpr $src", - [(HexagonBR_JT IntRegs:$src)]>; + [(HexagonBR_JT (i32 IntRegs:$src))]>; + def HexagonWrapperJT: SDNode<"HexagonISD::WrapperJT", SDTIntUnaryOp>; def : Pat<(HexagonWrapperJT tjumptable:$dst), - (CONST32_set_jt tjumptable:$dst)>; + (i32 (CONST32_set_jt tjumptable:$dst))>; + +// XTYPE/SHIFT + +// Multi-class for logical operators : +// Shift by immediate/register and accumulate/logical +multiclass xtype_imm<string OpcStr, SDNode OpNode1, SDNode OpNode2> { + def _ri : SInst_acc<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2, u5Imm:$src3), + !strconcat("$dst ", !strconcat(OpcStr, "($src2, #$src3)")), + [(set (i32 IntRegs:$dst), + (OpNode2 (i32 IntRegs:$src1), + (OpNode1 (i32 IntRegs:$src2), + u5ImmPred:$src3)))], + "$src1 = $dst">; + def d_ri : SInst_acc<(outs DoubleRegs:$dst), + (ins DoubleRegs:$src1, DoubleRegs:$src2, u6Imm:$src3), + !strconcat("$dst ", !strconcat(OpcStr, "($src2, #$src3)")), + [(set (i64 DoubleRegs:$dst), (OpNode2 (i64 DoubleRegs:$src1), + (OpNode1 (i64 DoubleRegs:$src2), u6ImmPred:$src3)))], + "$src1 = $dst">; +} + +// Multi-class for logical operators : +// Shift by register and accumulate/logical (32/64 bits) +multiclass xtype_reg<string OpcStr, SDNode OpNode1, SDNode OpNode2> { + def _rr : SInst_acc<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + !strconcat("$dst ", !strconcat(OpcStr, "($src2, $src3)")), + [(set (i32 IntRegs:$dst), + (OpNode2 (i32 IntRegs:$src1), + (OpNode1 (i32 IntRegs:$src2), + (i32 IntRegs:$src3))))], + "$src1 = $dst">; + + def d_rr : SInst_acc<(outs DoubleRegs:$dst), + (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), + !strconcat("$dst ", !strconcat(OpcStr, "($src2, $src3)")), + [(set (i64 DoubleRegs:$dst), + (OpNode2 (i64 DoubleRegs:$src1), + (OpNode1 (i64 DoubleRegs:$src2), + (i32 
IntRegs:$src3))))], + "$src1 = $dst">; + +} + +multiclass basic_xtype_imm<string OpcStr, SDNode OpNode> { +let AddedComplexity = 100 in + defm _ADD : xtype_imm< !strconcat("+= ", OpcStr), OpNode, add>; + defm _SUB : xtype_imm< !strconcat("-= ", OpcStr), OpNode, sub>; + defm _AND : xtype_imm< !strconcat("&= ", OpcStr), OpNode, and>; + defm _OR : xtype_imm< !strconcat("|= ", OpcStr), OpNode, or>; +} + +multiclass basic_xtype_reg<string OpcStr, SDNode OpNode> { +let AddedComplexity = 100 in + defm _ADD : xtype_reg< !strconcat("+= ", OpcStr), OpNode, add>; + defm _SUB : xtype_reg< !strconcat("-= ", OpcStr), OpNode, sub>; + defm _AND : xtype_reg< !strconcat("&= ", OpcStr), OpNode, and>; + defm _OR : xtype_reg< !strconcat("|= ", OpcStr), OpNode, or>; +} + +multiclass xtype_xor_imm<string OpcStr, SDNode OpNode> { +let AddedComplexity = 100 in + defm _XOR : xtype_imm< !strconcat("^= ", OpcStr), OpNode, xor>; +} + +defm ASL : basic_xtype_imm<"asl", shl>, basic_xtype_reg<"asl", shl>, + xtype_xor_imm<"asl", shl>; + +defm LSR : basic_xtype_imm<"lsr", srl>, basic_xtype_reg<"lsr", srl>, + xtype_xor_imm<"lsr", srl>; + +defm ASR : basic_xtype_imm<"asr", sra>, basic_xtype_reg<"asr", sra>; +defm LSL : basic_xtype_reg<"lsl", shl>; + +// Change the sign of the immediate for Rd=-mpyi(Rs,#u8) +def : Pat <(mul (i32 IntRegs:$src1), (ineg n8ImmPred:$src2)), + (i32 (MPYI_rin (i32 IntRegs:$src1), u8ImmPred:$src2))>; //===----------------------------------------------------------------------===// // V3 Instructions + @@ -3046,3 +3537,19 @@ include "HexagonInstrInfoV3.td" //===----------------------------------------------------------------------===// include "HexagonInstrInfoV4.td" + +//===----------------------------------------------------------------------===// +// V4 Instructions - +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// V5 Instructions + +//===----------------------------------------------------------------------===// + +include "HexagonInstrInfoV5.td" + +//===----------------------------------------------------------------------===// +// V5 Instructions - +//===----------------------------------------------------------------------===// + + diff --git a/lib/Target/Hexagon/HexagonInstrInfoV3.td b/lib/Target/Hexagon/HexagonInstrInfoV3.td index a73897e..157ab3d 100644 --- a/lib/Target/Hexagon/HexagonInstrInfoV3.td +++ b/lib/Target/Hexagon/HexagonInstrInfoV3.td @@ -19,7 +19,7 @@ let isCall = 1, neverHasSideEffects = 1, Defs = [D0, D1, D2, D3, D4, D5, D6, D7, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in { - def CALLv3 : JInst<(outs), (ins calltarget:$dst, variable_ops), + def CALLv3 : JInst<(outs), (ins calltarget:$dst), "call $dst", []>, Requires<[HasV3T]>; } @@ -35,16 +35,17 @@ let isCall = 1, neverHasSideEffects = 1, let isCall = 1, neverHasSideEffects = 1, Defs = [D0, D1, D2, D3, D4, D5, D6, D7, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in { - def CALLRv3 : JRInst<(outs), (ins IntRegs:$dst, variable_ops), + def CALLRv3 : JRInst<(outs), (ins IntRegs:$dst), "callr $dst", []>, Requires<[HasV3TOnly]>; } +// Jump to address from register // if(p?.new) jumpr:t r? 
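[Editorial aside] The renames that follow bring these V3 return-jumps in line with the cdn naming used elsewhere in this change: the cdn*/cdnNot* suffixes mark forms predicated on a dot-new predicate value (the "$src1.new" in the asm strings), and the :t/:nt suffixes carry the taken/not-taken hint. Value-wise the instruction is just a guarded indirect branch; a hedged C++ sketch (names are illustrative):

    #include <cstdint>

    // "if ($src1.new) jumpr:t $src2": transfer control to the register
    // target only when the just-produced predicate holds.
    uintptr_t jumpr_cdnPt(bool p_new, uintptr_t rs2, uintptr_t fallthrough) {
      return p_new ? rs2 : fallthrough;
    }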
let isReturn = 1, isTerminator = 1, isBarrier = 1, Defs = [PC], Uses = [R31] in { - def JMPR_cPnewt: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2), + def JMPR_cdnPt_V3: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2), "if ($src1.new) jumpr:t $src2", []>, Requires<[HasV3T]>; } @@ -52,7 +53,7 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1, // if (!p?.new) jumpr:t r? let isReturn = 1, isTerminator = 1, isBarrier = 1, Defs = [PC], Uses = [R31] in { - def JMPR_cNotPnewt: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2), + def JMPR_cdnNotPt_V3: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2), "if (!$src1.new) jumpr:t $src2", []>, Requires<[HasV3T]>; } @@ -61,7 +62,7 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1, // if(p?.new) jumpr:nt r? let isReturn = 1, isTerminator = 1, isBarrier = 1, Defs = [PC], Uses = [R31] in { - def JMPR_cPnewNt: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2), + def JMPR_cdnPnt: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2), "if ($src1.new) jumpr:nt $src2", []>, Requires<[HasV3T]>; } @@ -69,7 +70,7 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1, // if (!p?.new) jumpr:nt r? let isReturn = 1, isTerminator = 1, isBarrier = 1, Defs = [PC], Uses = [R31] in { - def JMPR_cNotPnewNt: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2), + def JMPR_cdnNotPnt: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2), "if (!$src1.new) jumpr:nt $src2", []>, Requires<[HasV3T]>; } @@ -86,20 +87,22 @@ let AddedComplexity = 200 in def MAXw_dd : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), "$dst = max($src2, $src1)", - [(set DoubleRegs:$dst, (select (i1 (setlt DoubleRegs:$src2, - DoubleRegs:$src1)), - DoubleRegs:$src1, - DoubleRegs:$src2))]>, + [(set (i64 DoubleRegs:$dst), + (i64 (select (i1 (setlt (i64 DoubleRegs:$src2), + (i64 DoubleRegs:$src1))), + (i64 DoubleRegs:$src1), + (i64 DoubleRegs:$src2))))]>, Requires<[HasV3T]>; let AddedComplexity = 200 in def MINw_dd : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), "$dst = min($src2, $src1)", - [(set DoubleRegs:$dst, (select (i1 (setgt DoubleRegs:$src2, - DoubleRegs:$src1)), - DoubleRegs:$src1, - DoubleRegs:$src2))]>, + [(set (i64 DoubleRegs:$dst), + (i64 (select (i1 (setgt (i64 DoubleRegs:$src2), + (i64 DoubleRegs:$src1))), + (i64 DoubleRegs:$src1), + (i64 DoubleRegs:$src2))))]>, Requires<[HasV3T]>; //===----------------------------------------------------------------------===// @@ -109,25 +112,25 @@ Requires<[HasV3T]>; -//def : Pat <(brcond (i1 (seteq IntRegs:$src1, 0)), bb:$offset), -// (JMP_RegEzt IntRegs:$src1, bb:$offset)>, Requires<[HasV3T]>; +//def : Pat <(brcond (i1 (seteq (i32 IntRegs:$src1), 0)), bb:$offset), +// (JMP_RegEzt (i32 IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>; -//def : Pat <(brcond (i1 (setne IntRegs:$src1, 0)), bb:$offset), -// (JMP_RegNzt IntRegs:$src1, bb:$offset)>, Requires<[HasV3T]>; +//def : Pat <(brcond (i1 (setne (i32 IntRegs:$src1), 0)), bb:$offset), +// (JMP_RegNzt (i32 IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>; -//def : Pat <(brcond (i1 (setle IntRegs:$src1, 0)), bb:$offset), -// (JMP_RegLezt IntRegs:$src1, bb:$offset)>, Requires<[HasV3T]>; +//def : Pat <(brcond (i1 (setle (i32 IntRegs:$src1), 0)), bb:$offset), +// (JMP_RegLezt (i32 IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>; -//def : Pat <(brcond (i1 (setge IntRegs:$src1, 0)), bb:$offset), -// (JMP_RegGezt IntRegs:$src1, bb:$offset)>, Requires<[HasV3T]>; +//def : Pat <(brcond (i1 (setge (i32 IntRegs:$src1), 0)), bb:$offset), +// (JMP_RegGezt (i32 
IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>; -//def : Pat <(brcond (i1 (setgt IntRegs:$src1, -1)), bb:$offset), -// (JMP_RegGezt IntRegs:$src1, bb:$offset)>, Requires<[HasV3T]>; +//def : Pat <(brcond (i1 (setgt (i32 IntRegs:$src1), -1)), bb:$offset), +// (JMP_RegGezt (i32 IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>; // Map call instruction -def : Pat<(call IntRegs:$dst), - (CALLRv3 IntRegs:$dst)>, Requires<[HasV3T]>; +def : Pat<(call (i32 IntRegs:$dst)), + (CALLRv3 (i32 IntRegs:$dst))>, Requires<[HasV3T]>; def : Pat<(call tglobaladdr:$dst), (CALLv3 tglobaladdr:$dst)>, Requires<[HasV3T]>; def : Pat<(call texternalsym:$dst), diff --git a/lib/Target/Hexagon/HexagonInstrInfoV4.td b/lib/Target/Hexagon/HexagonInstrInfoV4.td index 9e60cf2..70448fc 100644 --- a/lib/Target/Hexagon/HexagonInstrInfoV4.td +++ b/lib/Target/Hexagon/HexagonInstrInfoV4.td @@ -11,6 +11,12 @@ // //===----------------------------------------------------------------------===// +let neverHasSideEffects = 1 in +def IMMEXT : Immext<(outs), (ins), + "/* immext #... */", + []>, + Requires<[HasV4T]>; + // Hexagon V4 Architecture spec defines 8 instruction classes: // LD ST ALU32 XTYPE J JR MEMOP NV CR SYSTEM(system is not implemented in the // compiler) @@ -250,23 +256,151 @@ def ZXTH_cdnNotPt_V4 : ALU32_rr<(outs IntRegs:$dst), []>, Requires<[HasV4T]>; +// Generate frame index addresses. +let neverHasSideEffects = 1, isReMaterializable = 1 in +def TFR_FI_immext_V4 : ALU32_ri<(outs IntRegs:$dst), + (ins IntRegs:$src1, s32Imm:$offset), + "$dst = add($src1, ##$offset)", + []>, + Requires<[HasV4T]>; + //===----------------------------------------------------------------------===// // ALU32 - //===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// ALU32/PERM + +//===----------------------------------------------------------------------===// + +// Combine +// Rdd=combine(Rs, #s8) +let neverHasSideEffects = 1 in +def COMBINE_ri_V4 : ALU32_ri<(outs DoubleRegs:$dst), + (ins IntRegs:$src1, s8Imm:$src2), + "$dst = combine($src1, #$src2)", + []>, + Requires<[HasV4T]>; +// Rdd=combine(#s8, Rs) +let neverHasSideEffects = 1 in +def COMBINE_ir_V4 : ALU32_ir<(outs DoubleRegs:$dst), + (ins s8Imm:$src1, IntRegs:$src2), + "$dst = combine(#$src1, $src2)", + []>, + Requires<[HasV4T]>; +//===----------------------------------------------------------------------===// +// ALU32/PERM + +//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // LD + //===----------------------------------------------------------------------===// -/// -/// Make sure that in post increment load, the first operand is always the post -/// increment operand. -/// -//// Load doubleword. -// Rdd=memd(Re=#U6) +// +// These absolute set addressing mode instructions accept immediate as +// an operand. We have duplicated these patterns to take global address. 
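[Editorial aside] The asm strings make the contract visible: each of these loads produces two results, the loaded value in $dst1 and the absolute effective address latched into $dst2. A hedged C++ model of "Rd = memw(Re = #addr)" (struct and function names are illustrative):

    #include <cstdint>

    struct AbsSetLoadW {
      uint32_t rd;         // the loaded word
      const uint32_t *re;  // the address captured by the addressing mode
    };

    // "Rd = memw(Re = #addr)": load from an absolute address and also
    // leave that address behind in a register for later use.
    AbsSetLoadW memw_abs_set(const uint32_t *abs_addr) {
      return { *abs_addr, abs_addr };
    }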
+ +let neverHasSideEffects = 1 in +def LDrid_abs_setimm_V4 : LDInst2<(outs DoubleRegs:$dst1, IntRegs:$dst2), + (ins u6Imm:$addr), + "$dst1 = memd($dst2=#$addr)", + []>, + Requires<[HasV4T]>; + +// Rd=memb(Re=#U6) +let neverHasSideEffects = 1 in +def LDrib_abs_setimm_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins u6Imm:$addr), + "$dst1 = memb($dst2=#$addr)", + []>, + Requires<[HasV4T]>; + +// Rd=memh(Re=#U6) +let neverHasSideEffects = 1 in +def LDrih_abs_setimm_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins u6Imm:$addr), + "$dst1 = memh($dst2=#$addr)", + []>, + Requires<[HasV4T]>; + +// Rd=memub(Re=#U6) +let neverHasSideEffects = 1 in +def LDriub_abs_setimm_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins u6Imm:$addr), + "$dst1 = memub($dst2=#$addr)", + []>, + Requires<[HasV4T]>; + +// Rd=memuh(Re=#U6) +let neverHasSideEffects = 1 in +def LDriuh_abs_setimm_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins u6Imm:$addr), + "$dst1 = memuh($dst2=#$addr)", + []>, + Requires<[HasV4T]>; + +// Rd=memw(Re=#U6) +let neverHasSideEffects = 1 in +def LDriw_abs_setimm_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins u6Imm:$addr), + "$dst1 = memw($dst2=#$addr)", + []>, + Requires<[HasV4T]>; + +// Following patterns are defined for absolute set addressing mode +// instruction which take global address as operand. +let neverHasSideEffects = 1 in +def LDrid_abs_set_V4 : LDInst2<(outs DoubleRegs:$dst1, IntRegs:$dst2), + (ins globaladdress:$addr), + "$dst1 = memd($dst2=##$addr)", + []>, + Requires<[HasV4T]>; + +// Rd=memb(Re=#U6) +let neverHasSideEffects = 1 in +def LDrib_abs_set_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins globaladdress:$addr), + "$dst1 = memb($dst2=##$addr)", + []>, + Requires<[HasV4T]>; +// Rd=memh(Re=#U6) +let neverHasSideEffects = 1 in +def LDrih_abs_set_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins globaladdress:$addr), + "$dst1 = memh($dst2=##$addr)", + []>, + Requires<[HasV4T]>; + +// Rd=memub(Re=#U6) +let neverHasSideEffects = 1 in +def LDriub_abs_set_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins globaladdress:$addr), + "$dst1 = memub($dst2=##$addr)", + []>, + Requires<[HasV4T]>; + +// Rd=memuh(Re=#U6) +let neverHasSideEffects = 1 in +def LDriuh_abs_set_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins globaladdress:$addr), + "$dst1 = memuh($dst2=##$addr)", + []>, + Requires<[HasV4T]>; + +// Rd=memw(Re=#U6) +let neverHasSideEffects = 1 in +def LDriw_abs_set_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), + (ins globaladdress:$addr), + "$dst1 = memw($dst2=##$addr)", + []>, + Requires<[HasV4T]>; + +// Load doubleword. +// +// Make sure that in post increment load, the first operand is always the post +// increment operand. +// // Rdd=memd(Rs+Rt<<#u2) // Special case pattern for indexed load without offset which is easier to // match. 
AddedComplexity of this pattern should be lower than base+offset load @@ -276,56 +410,58 @@ let AddedComplexity = 10, isPredicable = 1 in def LDrid_indexed_V4 : LDInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst=memd($src1+$src2<<#0)", - [(set DoubleRegs:$dst, (load (add IntRegs:$src1, - IntRegs:$src2)))]>, + [(set (i64 DoubleRegs:$dst), + (i64 (load (add (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))))]>, Requires<[HasV4T]>; let AddedComplexity = 40, isPredicable = 1 in def LDrid_indexed_shl_V4 : LDInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset), "$dst=memd($src1+$src2<<#$offset)", - [(set DoubleRegs:$dst, (load (add IntRegs:$src1, - (shl IntRegs:$src2, - u2ImmPred:$offset))))]>, + [(set (i64 DoubleRegs:$dst), + (i64 (load (add (i32 IntRegs:$src1), + (shl (i32 IntRegs:$src2), + u2ImmPred:$offset)))))]>, Requires<[HasV4T]>; //// Load doubleword conditionally. // if ([!]Pv[.new]) Rd=memd(Rs+Rt<<#u2) // if (Pv) Rd=memd(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDrid_indexed_cPt_V4 : LDInst<(outs DoubleRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDrid_indexed_cPt_V4 : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if ($src1) $dst=memd($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (Pv.new) Rd=memd(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDrid_indexed_cdnPt_V4 : LDInst<(outs DoubleRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDrid_indexed_cdnPt_V4 : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if ($src1.new) $dst=memd($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (!Pv) Rd=memd(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDrid_indexed_cNotPt_V4 : LDInst<(outs DoubleRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDrid_indexed_cNotPt_V4 : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if (!$src1) $dst=memd($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (!Pv.new) Rd=memd(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDrid_indexed_cdnNotPt_V4 : LDInst<(outs DoubleRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDrid_indexed_cdnNotPt_V4 : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if (!$src1.new) $dst=memd($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (Pv) Rd=memd(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDrid_indexed_shl_cPt_V4 : LDInst<(outs DoubleRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDrid_indexed_shl_cPt_V4 : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if ($src1) $dst=memd($src2+$src3<<#$offset)", @@ -333,8 +469,8 @@ def LDrid_indexed_shl_cPt_V4 : LDInst<(outs DoubleRegs:$dst), Requires<[HasV4T]>; // if (Pv.new) Rd=memd(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDrid_indexed_shl_cdnPt_V4 : LDInst<(outs DoubleRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDrid_indexed_shl_cdnPt_V4 : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if ($src1.new) $dst=memd($src2+$src3<<#$offset)", @@ -342,8 +478,8 @@ def LDrid_indexed_shl_cdnPt_V4 : LDInst<(outs DoubleRegs:$dst), Requires<[HasV4T]>; // if (!Pv) Rd=memd(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, 
isPredicated = 1 in -def LDrid_indexed_shl_cNotPt_V4 : LDInst<(outs DoubleRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDrid_indexed_shl_cNotPt_V4 : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if (!$src1) $dst=memd($src2+$src3<<#$offset)", @@ -351,8 +487,8 @@ def LDrid_indexed_shl_cNotPt_V4 : LDInst<(outs DoubleRegs:$dst), Requires<[HasV4T]>; // if (!Pv.new) Rd=memd(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDrid_indexed_shl_cdnNotPt_V4 : LDInst<(outs DoubleRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDrid_indexed_shl_cdnNotPt_V4 : LDInst2<(outs DoubleRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if (!$src1.new) $dst=memd($src2+$src3<<#$offset)", @@ -362,99 +498,101 @@ def LDrid_indexed_shl_cdnNotPt_V4 : LDInst<(outs DoubleRegs:$dst), // Rdd=memd(Rt<<#u2+#U6) //// Load byte. -// Rd=memb(Re=#U6) - // Rd=memb(Rs+Rt<<#u2) let AddedComplexity = 10, isPredicable = 1 in def LDrib_indexed_V4 : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst=memb($src1+$src2<<#0)", - [(set IntRegs:$dst, (sextloadi8 (add IntRegs:$src1, - IntRegs:$src2)))]>, + [(set (i32 IntRegs:$dst), + (i32 (sextloadi8 (add (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))))]>, Requires<[HasV4T]>; let AddedComplexity = 10, isPredicable = 1 in def LDriub_indexed_V4 : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst=memub($src1+$src2<<#0)", - [(set IntRegs:$dst, (zextloadi8 (add IntRegs:$src1, - IntRegs:$src2)))]>, + [(set (i32 IntRegs:$dst), + (i32 (zextloadi8 (add (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))))]>, Requires<[HasV4T]>; let AddedComplexity = 10, isPredicable = 1 in def LDriub_ae_indexed_V4 : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst=memub($src1+$src2<<#0)", - [(set IntRegs:$dst, (extloadi8 (add IntRegs:$src1, - IntRegs:$src2)))]>, + [(set (i32 IntRegs:$dst), + (i32 (extloadi8 (add (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))))]>, Requires<[HasV4T]>; let AddedComplexity = 40, isPredicable = 1 in def LDrib_indexed_shl_V4 : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset), "$dst=memb($src1+$src2<<#$offset)", - [(set IntRegs:$dst, - (sextloadi8 (add IntRegs:$src1, - (shl IntRegs:$src2, - u2ImmPred:$offset))))]>, + [(set (i32 IntRegs:$dst), + (i32 (sextloadi8 (add (i32 IntRegs:$src1), + (shl (i32 IntRegs:$src2), + u2ImmPred:$offset)))))]>, Requires<[HasV4T]>; let AddedComplexity = 40, isPredicable = 1 in def LDriub_indexed_shl_V4 : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset), "$dst=memub($src1+$src2<<#$offset)", - [(set IntRegs:$dst, - (zextloadi8 (add IntRegs:$src1, - (shl IntRegs:$src2, - u2ImmPred:$offset))))]>, + [(set (i32 IntRegs:$dst), + (i32 (zextloadi8 (add (i32 IntRegs:$src1), + (shl (i32 IntRegs:$src2), + u2ImmPred:$offset)))))]>, Requires<[HasV4T]>; let AddedComplexity = 40, isPredicable = 1 in def LDriub_ae_indexed_shl_V4 : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset), "$dst=memub($src1+$src2<<#$offset)", - [(set IntRegs:$dst, (extloadi8 (add IntRegs:$src1, - (shl IntRegs:$src2, - u2ImmPred:$offset))))]>, + [(set (i32 IntRegs:$dst), + (i32 (extloadi8 (add (i32 IntRegs:$src1), + (shl (i32 IntRegs:$src2), + u2ImmPred:$offset)))))]>, Requires<[HasV4T]>; //// Load byte conditionally. 
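[Editorial aside] Throughout these conditional forms the destination is written only when the predicate (or its dot-new value) holds, so the register keeps its prior contents on a false predicate; note also that this revision drops the explicit mayLoad = 1 and moves the defs from LDInst to LDInst2. The intended semantics, modeled in plain C++ (names illustrative):

    #include <cstdint>

    // "if ($src1) $dst = memb($src2+$src3<<#0)": no write-back when the
    // predicate is false, so the old $dst value survives.
    int32_t ldrib_indexed_cPt(bool pv, int32_t old_dst,
                              const int8_t *rs2, int32_t rs3) {
      return pv ? (int32_t)rs2[rs3] : old_dst;
    }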
// if ([!]Pv[.new]) Rd=memb(Rs+Rt<<#u2) // if (Pv) Rd=memb(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDrib_indexed_cPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDrib_indexed_cPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if ($src1) $dst=memb($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (Pv.new) Rd=memb(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDrib_indexed_cdnPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDrib_indexed_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if ($src1.new) $dst=memb($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (!Pv) Rd=memb(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDrib_indexed_cNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDrib_indexed_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if (!$src1) $dst=memb($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (!Pv.new) Rd=memb(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDrib_indexed_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDrib_indexed_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if (!$src1.new) $dst=memb($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (Pv) Rd=memb(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDrib_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDrib_indexed_shl_cPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if ($src1) $dst=memb($src2+$src3<<#$offset)", @@ -462,8 +600,8 @@ def LDrib_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (Pv.new) Rd=memb(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDrib_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDrib_indexed_shl_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if ($src1.new) $dst=memb($src2+$src3<<#$offset)", @@ -471,8 +609,8 @@ def LDrib_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv) Rd=memb(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDrib_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDrib_indexed_shl_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if (!$src1) $dst=memb($src2+$src3<<#$offset)", @@ -480,8 +618,8 @@ def LDrib_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv.new) Rd=memb(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDrib_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDrib_indexed_shl_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if (!$src1.new) $dst=memb($src2+$src3<<#$offset)", @@ -491,40 +629,40 @@ def LDrib_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), //// Load unsigned byte conditionally. 
// if ([!]Pv[.new]) Rd=memub(Rs+Rt<<#u2) // if (Pv) Rd=memub(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDriub_indexed_cPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDriub_indexed_cPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if ($src1) $dst=memub($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (Pv.new) Rd=memub(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDriub_indexed_cdnPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDriub_indexed_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if ($src1.new) $dst=memub($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (!Pv) Rd=memub(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDriub_indexed_cNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDriub_indexed_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if (!$src1) $dst=memub($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (!Pv.new) Rd=memub(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDriub_indexed_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDriub_indexed_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if (!$src1.new) $dst=memub($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (Pv) Rd=memub(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDriub_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDriub_indexed_shl_cPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if ($src1) $dst=memub($src2+$src3<<#$offset)", @@ -532,8 +670,8 @@ def LDriub_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (Pv.new) Rd=memub(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDriub_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDriub_indexed_shl_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if ($src1.new) $dst=memub($src2+$src3<<#$offset)", @@ -541,8 +679,8 @@ def LDriub_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv) Rd=memub(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDriub_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDriub_indexed_shl_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if (!$src1) $dst=memub($src2+$src3<<#$offset)", @@ -550,8 +688,8 @@ def LDriub_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv.new) Rd=memub(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDriub_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDriub_indexed_shl_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if (!$src1.new) $dst=memub($src2+$src3<<#$offset)", @@ -561,31 +699,32 @@ def LDriub_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), // Rd=memb(Rt<<#u2+#U6) //// Load 
halfword -// Rd=memh(Re=#U6) - // Rd=memh(Rs+Rt<<#u2) let AddedComplexity = 10, isPredicable = 1 in def LDrih_indexed_V4 : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst=memh($src1+$src2<<#0)", - [(set IntRegs:$dst, (sextloadi16 (add IntRegs:$src1, - IntRegs:$src2)))]>, + [(set (i32 IntRegs:$dst), + (i32 (sextloadi16 (add (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))))]>, Requires<[HasV4T]>; let AddedComplexity = 10, isPredicable = 1 in def LDriuh_indexed_V4 : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst=memuh($src1+$src2<<#0)", - [(set IntRegs:$dst, (zextloadi16 (add IntRegs:$src1, - IntRegs:$src2)))]>, + [(set (i32 IntRegs:$dst), + (i32 (zextloadi16 (add (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))))]>, Requires<[HasV4T]>; let AddedComplexity = 10, isPredicable = 1 in def LDriuh_ae_indexed_V4 : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst=memuh($src1+$src2<<#0)", - [(set IntRegs:$dst, (extloadi16 (add IntRegs:$src1, - IntRegs:$src2)))]>, + [(set (i32 IntRegs:$dst), + (i32 (extloadi16 (add (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))))]>, Requires<[HasV4T]>; // Rd=memh(Rs+Rt<<#u2) @@ -593,69 +732,69 @@ let AddedComplexity = 40, isPredicable = 1 in def LDrih_indexed_shl_V4 : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset), "$dst=memh($src1+$src2<<#$offset)", - [(set IntRegs:$dst, - (sextloadi16 (add IntRegs:$src1, - (shl IntRegs:$src2, - u2ImmPred:$offset))))]>, + [(set (i32 IntRegs:$dst), + (i32 (sextloadi16 (add (i32 IntRegs:$src1), + (shl (i32 IntRegs:$src2), + u2ImmPred:$offset)))))]>, Requires<[HasV4T]>; let AddedComplexity = 40, isPredicable = 1 in def LDriuh_indexed_shl_V4 : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset), "$dst=memuh($src1+$src2<<#$offset)", - [(set IntRegs:$dst, - (zextloadi16 (add IntRegs:$src1, - (shl IntRegs:$src2, - u2ImmPred:$offset))))]>, + [(set (i32 IntRegs:$dst), + (i32 (zextloadi16 (add (i32 IntRegs:$src1), + (shl (i32 IntRegs:$src2), + u2ImmPred:$offset)))))]>, Requires<[HasV4T]>; let AddedComplexity = 40, isPredicable = 1 in def LDriuh_ae_indexed_shl_V4 : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset), "$dst=memuh($src1+$src2<<#$offset)", - [(set IntRegs:$dst, - (extloadi16 (add IntRegs:$src1, - (shl IntRegs:$src2, - u2ImmPred:$offset))))]>, + [(set (i32 IntRegs:$dst), + (i32 (extloadi16 (add (i32 IntRegs:$src1), + (shl (i32 IntRegs:$src2), + u2ImmPred:$offset)))))]>, Requires<[HasV4T]>; //// Load halfword conditionally. 
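[Editorial aside] Each halfword access above comes in three pattern flavors: memh matches sextloadi16, memuh matches zextloadi16, and the _ae variant reuses memuh for extloadi16, where the upper bits are unspecified so the zero-extending form is a valid choice. In C++ terms (a sketch):

    #include <cstdint>

    // memh: sign-extending halfword load (sextloadi16).
    int32_t ldrih(const int16_t *p)   { return (int32_t)*p; }

    // memuh: zero-extending halfword load (zextloadi16, also extloadi16).
    int32_t ldriuh(const uint16_t *p) { return (int32_t)*p; }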
// if ([!]Pv[.new]) Rd=memh(Rs+Rt<<#u2) // if (Pv) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDrih_indexed_cPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDrih_indexed_cPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if ($src1) $dst=memh($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (Pv.new) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDrih_indexed_cdnPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDrih_indexed_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if ($src1.new) $dst=memh($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (!Pv) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDrih_indexed_cNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDrih_indexed_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if (!$src1) $dst=memh($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (!Pv.new) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDrih_indexed_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDrih_indexed_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if (!$src1.new) $dst=memh($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (Pv) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDrih_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDrih_indexed_shl_cPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if ($src1) $dst=memh($src2+$src3<<#$offset)", @@ -663,8 +802,8 @@ def LDrih_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (Pv.new) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDrih_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDrih_indexed_shl_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if ($src1.new) $dst=memh($src2+$src3<<#$offset)", @@ -672,8 +811,8 @@ def LDrih_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDrih_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDrih_indexed_shl_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if (!$src1) $dst=memh($src2+$src3<<#$offset)", @@ -681,8 +820,8 @@ def LDrih_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv.new) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDrih_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDrih_indexed_shl_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if (!$src1.new) $dst=memh($src2+$src3<<#$offset)", @@ -692,40 +831,40 @@ def LDrih_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), //// Load unsigned halfword conditionally. 
// if ([!]Pv[.new]) Rd=memuh(Rs+Rt<<#u2) // if (Pv) Rd=memuh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDriuh_indexed_cPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDriuh_indexed_cPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if ($src1) $dst=memuh($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (Pv.new) Rd=memuh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDriuh_indexed_cdnPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDriuh_indexed_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if ($src1.new) $dst=memuh($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (!Pv) Rd=memuh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDriuh_indexed_cNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDriuh_indexed_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if (!$src1) $dst=memuh($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (!Pv.new) Rd=memuh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDriuh_indexed_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDriuh_indexed_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if (!$src1.new) $dst=memuh($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (Pv) Rd=memuh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDriuh_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDriuh_indexed_shl_cPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if ($src1) $dst=memuh($src2+$src3<<#$offset)", @@ -733,8 +872,8 @@ def LDriuh_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (Pv.new) Rd=memuh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDriuh_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDriuh_indexed_shl_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if ($src1.new) $dst=memuh($src2+$src3<<#$offset)", @@ -742,8 +881,8 @@ def LDriuh_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv) Rd=memuh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDriuh_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDriuh_indexed_shl_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if (!$src1) $dst=memuh($src2+$src3<<#$offset)", @@ -751,8 +890,8 @@ def LDriuh_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv.new) Rd=memuh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDriuh_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDriuh_indexed_shl_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if (!$src1.new) $dst=memuh($src2+$src3<<#$offset)", @@ -762,6 +901,14 @@ def LDriuh_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), // Rd=memh(Rt<<#u2+#U6) //// Load word. 
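[Editorial aside] The indexed forms compute the effective address as Rs + (Rt << #u2); the AddedComplexity tiers used throughout this section (10 for the <<#0 case, 40 when a shift is folded in, 15/45 for the predicated variants) steer selection toward the more specific shifted pattern when it applies. The addressing arithmetic in plain C++ (the helper name is hypothetical):

    #include <cstdint>
    #include <cstring>

    // "Rd = memw(Rs + Rt<<#u2)": base register plus a scaled index.
    int32_t ldriw_indexed_shl(const uint8_t *rs, uint32_t rt, unsigned u2) {
      int32_t rd;
      std::memcpy(&rd, rs + ((size_t)rt << u2), sizeof rd);
      return rd;
    }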
+// Load predicate: Fix for bug 5279. +let neverHasSideEffects = 1 in +def LDriw_pred_V4 : LDInst2<(outs PredRegs:$dst), + (ins MEMri:$addr), + "Error; should not emit", + []>, + Requires<[HasV4T]>; + // Rd=memw(Re=#U6) // Rd=memw(Rs+Rt<<#u2) @@ -769,8 +916,9 @@ let AddedComplexity = 10, isPredicable = 1 in def LDriw_indexed_V4 : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst=memw($src1+$src2<<#0)", - [(set IntRegs:$dst, (load (add IntRegs:$src1, - IntRegs:$src2)))]>, + [(set (i32 IntRegs:$dst), + (i32 (load (add (i32 IntRegs:$src1), + (i32 IntRegs:$src2)))))]>, Requires<[HasV4T]>; // Rd=memw(Rs+Rt<<#u2) @@ -778,48 +926,49 @@ let AddedComplexity = 40, isPredicable = 1 in def LDriw_indexed_shl_V4 : LDInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset), "$dst=memw($src1+$src2<<#$offset)", - [(set IntRegs:$dst, (load (add IntRegs:$src1, - (shl IntRegs:$src2, - u2ImmPred:$offset))))]>, + [(set (i32 IntRegs:$dst), + (i32 (load (add (i32 IntRegs:$src1), + (shl (i32 IntRegs:$src2), + u2ImmPred:$offset)))))]>, Requires<[HasV4T]>; //// Load word conditionally. // if ([!]Pv[.new]) Rd=memw(Rs+Rt<<#u2) // if (Pv) Rd=memw(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDriw_indexed_cPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDriw_indexed_cPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if ($src1) $dst=memw($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (Pv.new) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDriw_indexed_cdnPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDriw_indexed_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if ($src1.new) $dst=memw($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (!Pv) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDriw_indexed_cNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDriw_indexed_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if (!$src1) $dst=memw($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (!Pv.new) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 15, isPredicated = 1 in -def LDriw_indexed_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 15, isPredicated = 1 in +def LDriw_indexed_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), "if (!$src1.new) $dst=memw($src2+$src3<<#0)", []>, Requires<[HasV4T]>; // if (Pv) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDriw_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDriw_indexed_shl_cPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if ($src1) $dst=memw($src2+$src3<<#$offset)", @@ -827,8 +976,8 @@ def LDriw_indexed_shl_cPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (Pv.new) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDriw_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDriw_indexed_shl_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if ($src1.new) $dst=memw($src2+$src3<<#$offset)", @@ -836,8 +985,8 @@ def 
LDriw_indexed_shl_cdnPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDriw_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDriw_indexed_shl_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if (!$src1) $dst=memw($src2+$src3<<#$offset)", @@ -845,8 +994,8 @@ def LDriw_indexed_shl_cNotPt_V4 : LDInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv.new) Rd=memh(Rs+Rt<<#u2) -let mayLoad = 1, AddedComplexity = 45, isPredicated = 1 in -def LDriw_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), +let AddedComplexity = 45, isPredicated = 1 in +def LDriw_indexed_shl_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset), "if (!$src1.new) $dst=memw($src2+$src3<<#$offset)", @@ -859,102 +1008,729 @@ def LDriw_indexed_shl_cdnNotPt_V4 : LDInst<(outs IntRegs:$dst), // Post-inc Load, Predicated, Dot new -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in -def POST_LDrid_cdnPt_V4 : LDInstPI<(outs DoubleRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDrid_cdnPt_V4 : LDInst2PI<(outs DoubleRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_3Imm:$src3), "if ($src1.new) $dst1 = memd($src2++#$src3)", [], "$src2 = $dst2">, Requires<[HasV4T]>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in -def POST_LDrid_cdnNotPt_V4 : LDInstPI<(outs DoubleRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDrid_cdnNotPt_V4 : LDInst2PI<(outs DoubleRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_3Imm:$src3), "if (!$src1.new) $dst1 = memd($src2++#$src3)", [], "$src2 = $dst2">, Requires<[HasV4T]>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in -def POST_LDrib_cdnPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDrib_cdnPt_V4 : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3), "if ($src1.new) $dst1 = memb($src2++#$src3)", [], "$src2 = $dst2">, Requires<[HasV4T]>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in -def POST_LDrib_cdnNotPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDrib_cdnNotPt_V4 : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3), "if (!$src1.new) $dst1 = memb($src2++#$src3)", [], "$src2 = $dst2">, Requires<[HasV4T]>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in -def POST_LDrih_cdnPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDrih_cdnPt_V4 : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3), "if ($src1.new) $dst1 = memh($src2++#$src3)", [], "$src2 = $dst2">, Requires<[HasV4T]>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in -def POST_LDrih_cdnNotPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDrih_cdnNotPt_V4 : LDInst2PI<(outs IntRegs:$dst1, 
IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3), "if (!$src1.new) $dst1 = memh($src2++#$src3)", [], "$src2 = $dst2">, Requires<[HasV4T]>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in -def POST_LDriub_cdnPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDriub_cdnPt_V4 : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3), "if ($src1.new) $dst1 = memub($src2++#$src3)", [], "$src2 = $dst2">, Requires<[HasV4T]>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in -def POST_LDriub_cdnNotPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDriub_cdnNotPt_V4 : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_0Imm:$src3), "if (!$src1.new) $dst1 = memub($src2++#$src3)", [], "$src2 = $dst2">, Requires<[HasV4T]>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in -def POST_LDriuh_cdnPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDriuh_cdnPt_V4 : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3), "if ($src1.new) $dst1 = memuh($src2++#$src3)", [], "$src2 = $dst2">, Requires<[HasV4T]>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in -def POST_LDriuh_cdnNotPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDriuh_cdnNotPt_V4 : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_1Imm:$src3), "if (!$src1.new) $dst1 = memuh($src2++#$src3)", [], "$src2 = $dst2">, Requires<[HasV4T]>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in -def POST_LDriw_cdnPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDriw_cdnPt_V4 : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_2Imm:$src3), "if ($src1.new) $dst1 = memw($src2++#$src3)", [], "$src2 = $dst2">, Requires<[HasV4T]>; -let mayLoad = 1, hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in -def POST_LDriw_cdnNotPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), +let hasCtrlDep = 1, neverHasSideEffects = 1, isPredicated = 1 in +def POST_LDriw_cdnNotPt_V4 : LDInst2PI<(outs IntRegs:$dst1, IntRegs:$dst2), (ins PredRegs:$src1, IntRegs:$src2, s4_2Imm:$src3), "if (!$src1.new) $dst1 = memw($src2++#$src3)", [], "$src2 = $dst2">, Requires<[HasV4T]>; +/// Load from global offset + +let isPredicable = 1, neverHasSideEffects = 1 in +def LDrid_GP_V4 : LDInst2<(outs DoubleRegs:$dst), + (ins globaladdress:$global, u16Imm:$offset), + "$dst=memd(#$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrid_GP_cPt_V4 : LDInst2<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if ($src1) $dst=memd(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrid_GP_cNotPt_V4 : LDInst2<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if (!$src1) $dst=memd(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, 
isPredicated = 1 in +def LDrid_GP_cdnPt_V4 : LDInst2<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if ($src1.new) $dst=memd(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrid_GP_cdnNotPt_V4 : LDInst2<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if (!$src1.new) $dst=memd(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let isPredicable = 1, neverHasSideEffects = 1 in +def LDrib_GP_V4 : LDInst2<(outs IntRegs:$dst), + (ins globaladdress:$global, u16Imm:$offset), + "$dst=memb(#$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrib_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if ($src1) $dst=memb(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrib_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if (!$src1) $dst=memb(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrib_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if ($src1.new) $dst=memb(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrib_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if (!$src1.new) $dst=memb(##$global+$offset)", + []>, + Requires<[HasV4T]>; + + +let isPredicable = 1, neverHasSideEffects = 1 in +def LDriub_GP_V4 : LDInst2<(outs IntRegs:$dst), + (ins globaladdress:$global, u16Imm:$offset), + "$dst=memub(#$global+$offset)", + []>, + Requires<[HasV4T]>; + + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriub_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if ($src1) $dst=memub(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriub_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if (!$src1) $dst=memub(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriub_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if ($src1.new) $dst=memub(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriub_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if (!$src1.new) $dst=memub(##$global+$offset)", + []>, + Requires<[HasV4T]>; + + +let isPredicable = 1, neverHasSideEffects = 1 in +def LDrih_GP_V4 : LDInst2<(outs IntRegs:$dst), + (ins globaladdress:$global, u16Imm:$offset), + "$dst=memh(#$global+$offset)", + []>, + Requires<[HasV4T]>; + + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrih_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if ($src1) $dst=memh(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrih_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if (!$src1) $dst=memh(##$global+$offset)", + []>, + 
Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrih_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if ($src1.new) $dst=memh(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDrih_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if (!$src1.new) $dst=memh(##$global+$offset)", + []>, + Requires<[HasV4T]>; + + +let isPredicable = 1, neverHasSideEffects = 1 in +def LDriuh_GP_V4 : LDInst2<(outs IntRegs:$dst), + (ins globaladdress:$global, u16Imm:$offset), + "$dst=memuh(#$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriuh_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if ($src1) $dst=memuh(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriuh_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if (!$src1) $dst=memuh(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriuh_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if ($src1.new) $dst=memuh(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriuh_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if (!$src1.new) $dst=memuh(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let isPredicable = 1, neverHasSideEffects = 1 in +def LDriw_GP_V4 : LDInst2<(outs IntRegs:$dst), + (ins globaladdress:$global, u16Imm:$offset), + "$dst=memw(#$global+$offset)", + []>, + Requires<[HasV4T]>; + + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriw_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if ($src1) $dst=memw(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriw_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if (!$src1) $dst=memw(##$global+$offset)", + []>, + Requires<[HasV4T]>; + + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriw_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if ($src1.new) $dst=memw(##$global+$offset)", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def LDriw_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), + "if (!$src1.new) $dst=memw(##$global+$offset)", + []>, + Requires<[HasV4T]>; + + +let isPredicable = 1, neverHasSideEffects = 1 in +def LDd_GP_V4 : LDInst2<(outs DoubleRegs:$dst), + (ins globaladdress:$global), + "$dst=memd(#$global)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rtt=memd(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDd_GP_cPt_V4 : LDInst2<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if ($src1) $dst=memd(##$global)", + []>, + Requires<[HasV4T]>; + + +// if (!Pv) Rtt=memd(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDd_GP_cNotPt_V4 : LDInst2<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if 
(!$src1) $dst=memd(##$global)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rtt=memd(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDd_GP_cdnPt_V4 : LDInst2<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if ($src1.new) $dst=memd(##$global)", + []>, + Requires<[HasV4T]>; + + +// if (!Pv) Rtt=memd(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDd_GP_cdnNotPt_V4 : LDInst2<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if (!$src1.new) $dst=memd(##$global)", + []>, + Requires<[HasV4T]>; + +let isPredicable = 1, neverHasSideEffects = 1 in +def LDb_GP_V4 : LDInst2<(outs IntRegs:$dst), + (ins globaladdress:$global), + "$dst=memb(#$global)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rt=memb(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDb_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if ($src1) $dst=memb(##$global)", + []>, + Requires<[HasV4T]>; + +// if (!Pv) Rt=memb(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDb_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if (!$src1) $dst=memb(##$global)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rt=memb(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDb_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if ($src1.new) $dst=memb(##$global)", + []>, + Requires<[HasV4T]>; + +// if (!Pv) Rt=memb(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDb_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if (!$src1.new) $dst=memb(##$global)", + []>, + Requires<[HasV4T]>; + +let isPredicable = 1, neverHasSideEffects = 1 in +def LDub_GP_V4 : LDInst2<(outs IntRegs:$dst), + (ins globaladdress:$global), + "$dst=memub(#$global)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rt=memub(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDub_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if ($src1) $dst=memub(##$global)", + []>, + Requires<[HasV4T]>; + + +// if (!Pv) Rt=memub(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDub_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if (!$src1) $dst=memub(##$global)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rt=memub(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDub_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if ($src1.new) $dst=memub(##$global)", + []>, + Requires<[HasV4T]>; + + +// if (!Pv) Rt=memub(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDub_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if (!$src1.new) $dst=memub(##$global)", + []>, + Requires<[HasV4T]>; + +let isPredicable = 1, neverHasSideEffects = 1 in +def LDh_GP_V4 : LDInst2<(outs IntRegs:$dst), + (ins globaladdress:$global), + "$dst=memh(#$global)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rt=memh(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDh_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if ($src1) $dst=memh(##$global)", + []>, + Requires<[HasV4T]>; + +// if (!Pv) Rt=memh(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDh_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, 
globaladdress:$global), + "if (!$src1) $dst=memh(##$global)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rt=memh(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDh_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if ($src1.new) $dst=memh(##$global)", + []>, + Requires<[HasV4T]>; + +// if (!Pv) Rt=memh(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDh_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if (!$src1.new) $dst=memh(##$global)", + []>, + Requires<[HasV4T]>; + +let isPredicable = 1, neverHasSideEffects = 1 in +def LDuh_GP_V4 : LDInst2<(outs IntRegs:$dst), + (ins globaladdress:$global), + "$dst=memuh(#$global)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rt=memuh(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDuh_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if ($src1) $dst=memuh(##$global)", + []>, + Requires<[HasV4T]>; + +// if (!Pv) Rt=memuh(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDuh_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if (!$src1) $dst=memuh(##$global)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rt=memuh(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDuh_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if ($src1.new) $dst=memuh(##$global)", + []>, + Requires<[HasV4T]>; + +// if (!Pv) Rt=memuh(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDuh_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if (!$src1.new) $dst=memuh(##$global)", + []>, + Requires<[HasV4T]>; + +let isPredicable = 1, neverHasSideEffects = 1 in +def LDw_GP_V4 : LDInst2<(outs IntRegs:$dst), + (ins globaladdress:$global), + "$dst=memw(#$global)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rt=memw(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDw_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if ($src1) $dst=memw(##$global)", + []>, + Requires<[HasV4T]>; + + +// if (!Pv) Rt=memw(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDw_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if (!$src1) $dst=memw(##$global)", + []>, + Requires<[HasV4T]>; + +// if (Pv) Rt=memw(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDw_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if ($src1.new) $dst=memw(##$global)", + []>, + Requires<[HasV4T]>; + + +// if (!Pv) Rt=memw(##global) +let neverHasSideEffects = 1, isPredicated = 1 in +def LDw_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$global), + "if (!$src1.new) $dst=memw(##$global)", + []>, + Requires<[HasV4T]>; + + + +def : Pat <(atomic_load_64 (HexagonCONST32_GP tglobaladdr:$global)), + (i64 (LDd_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +def : Pat <(atomic_load_32 (HexagonCONST32_GP tglobaladdr:$global)), + (i32 (LDw_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +def : Pat <(atomic_load_16 (HexagonCONST32_GP tglobaladdr:$global)), + (i32 (LDuh_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +def : Pat <(atomic_load_8 (HexagonCONST32_GP tglobaladdr:$global)), + (i32 (LDub_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +// Map from 
load(globaladdress) -> memw(#foo + 0) +let AddedComplexity = 100 in +def : Pat <(i64 (load (HexagonCONST32_GP tglobaladdr:$global))), + (i64 (LDd_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +// Map from Pd = load(globaladdress) -> Rd = memb(globaladdress), Pd = Rd +let AddedComplexity = 100 in +def : Pat <(i1 (load (HexagonCONST32_GP tglobaladdr:$global))), + (i1 (TFR_PdRs (i32 (LDb_GP_V4 tglobaladdr:$global))))>, + Requires<[HasV4T]>; + +// When the Interprocedural Global Variable optimizer realizes that a certain +// global variable takes only two constant values, it shrinks the global to +// a boolean. Catch those loads here in the following 3 patterns. +let AddedComplexity = 100 in +def : Pat <(i32 (extloadi1 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDb_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +let AddedComplexity = 100 in +def : Pat <(i32 (sextloadi1 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDb_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress) -> memb(#foo) +let AddedComplexity = 100 in +def : Pat <(i32 (extloadi8 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDb_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress) -> memb(#foo) +let AddedComplexity = 100 in +def : Pat <(i32 (sextloadi8 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDb_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +let AddedComplexity = 100 in +def : Pat <(i32 (zextloadi1 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDub_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress) -> memub(#foo) +let AddedComplexity = 100 in +def : Pat <(i32 (zextloadi8 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDub_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress) -> memh(#foo) +let AddedComplexity = 100 in +def : Pat <(i32 (extloadi16 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDh_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress) -> memh(#foo) +let AddedComplexity = 100 in +def : Pat <(i32 (sextloadi16 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDh_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress) -> memuh(#foo) +let AddedComplexity = 100 in +def : Pat <(i32 (zextloadi16 (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDuh_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress) -> memw(#foo) +let AddedComplexity = 100 in +def : Pat <(i32 (load (HexagonCONST32_GP tglobaladdr:$global))), + (i32 (LDw_GP_V4 tglobaladdr:$global))>, + Requires<[HasV4T]>; + +def : Pat <(atomic_load_64 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset)), + (i64 (LDrid_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[HasV4T]>; + +def : Pat <(atomic_load_32 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset)), + (i32 (LDriw_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[HasV4T]>; + +def : Pat <(atomic_load_16 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset)), + (i32 (LDriuh_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[HasV4T]>; + +def : Pat <(atomic_load_8 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset)), + (i32 (LDriub_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress + x) -> memd(#foo + x) +let AddedComplexity = 100 in +def : Pat <(i64 (load (add 
(HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset))), + (i64 (LDrid_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress + x) -> memb(#foo + x) +let AddedComplexity = 100 in +def : Pat <(i32 (extloadi8 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset))), + (i32 (LDrib_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress + x) -> memb(#foo + x) +let AddedComplexity = 100 in +def : Pat <(i32 (sextloadi8 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset))), + (i32 (LDrib_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress + x) -> memub(#foo + x) +let AddedComplexity = 100 in +def : Pat <(i32 (zextloadi8 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset))), + (i32 (LDriub_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress + x) -> memuh(#foo + x) +let AddedComplexity = 100 in +def : Pat <(i32 (extloadi16 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset))), + (i32 (LDrih_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress + x) -> memh(#foo + x) +let AddedComplexity = 100 in +def : Pat <(i32 (sextloadi16 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset))), + (i32 (LDrih_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[HasV4T]>; + + +// Map from load(globaladdress + x) -> memuh(#foo + x) +let AddedComplexity = 100 in +def : Pat <(i32 (zextloadi16 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset))), + (i32 (LDriuh_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress + x) -> memw(#foo + x) +let AddedComplexity = 100 in +def : Pat <(i32 (load (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset))), + (i32 (LDriw_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, + Requires<[HasV4T]>; + //===----------------------------------------------------------------------===// // LD - @@ -971,18 +1747,70 @@ def POST_LDriw_cdnNotPt_V4 : LDInstPI<(outs IntRegs:$dst1, IntRegs:$dst2), /// last operand. /// -// Store doubleword. 
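Stepping back to the load-from-global block that closed the LD section just above: the *_GP_V4 instruction defs all carry empty pattern lists, and instruction selection is supplied separately by the standalone Pat<> records at AddedComplexity = 100 (the bonus makes the direct #global form win whenever it and a generic register+offset pattern both match). Keeping the patterns apart from the defs lets one machine instruction serve several source shapes (plain, sign- and zero-extending loads), while the predicated variants need no patterns at all, since they are produced by later predication passes rather than by ISel. Folded into the def, the plain-word case would look like the sketch below; the name LDw_GP_fused is hypothetical, and the patch deliberately does not take this form:

// Illustrative fused form of LDw_GP_V4 plus its word-load pattern. The patch
// keeps these separate; this is only to make the division of labor concrete.
let isPredicable = 1 in
def LDw_GP_fused : LDInst2<(outs IntRegs:$dst),
      (ins globaladdress:$global),
      "$dst=memw(#$global)",
      [(set (i32 IntRegs:$dst),
            (i32 (load (HexagonCONST32_GP tglobaladdr:$global))))]>,
      Requires<[HasV4T]>;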
// memd(Re=#U6)=Rtt -// TODO: needs to be implemented +def STrid_abs_setimm_V4 : STInst2<(outs IntRegs:$dst1), + (ins DoubleRegs:$src1, u6Imm:$src2), + "memd($dst1=#$src2) = $src1", + []>, + Requires<[HasV4T]>; + +// memb(Re=#U6)=Rs +def STrib_abs_setimm_V4 : STInst2<(outs IntRegs:$dst1), + (ins IntRegs:$src1, u6Imm:$src2), + "memb($dst1=#$src2) = $src1", + []>, + Requires<[HasV4T]>; + +// memh(Re=#U6)=Rs +def STrih_abs_setimm_V4 : STInst2<(outs IntRegs:$dst1), + (ins IntRegs:$src1, u6Imm:$src2), + "memh($dst1=#$src2) = $src1", + []>, + Requires<[HasV4T]>; + +// memw(Re=#U6)=Rs +def STriw_abs_setimm_V4 : STInst2<(outs IntRegs:$dst1), + (ins IntRegs:$src1, u6Imm:$src2), + "memw($dst1=#$src2) = $src1", + []>, + Requires<[HasV4T]>; + +// memd(Re=#U6)=Rtt +def STrid_abs_set_V4 : STInst2<(outs IntRegs:$dst1), + (ins DoubleRegs:$src1, globaladdress:$src2), + "memd($dst1=##$src2) = $src1", + []>, + Requires<[HasV4T]>; + +// memb(Re=#U6)=Rs +def STrib_abs_set_V4 : STInst2<(outs IntRegs:$dst1), + (ins IntRegs:$src1, globaladdress:$src2), + "memb($dst1=##$src2) = $src1", + []>, + Requires<[HasV4T]>; + +// memh(Re=#U6)=Rs +def STrih_abs_set_V4 : STInst2<(outs IntRegs:$dst1), + (ins IntRegs:$src1, globaladdress:$src2), + "memh($dst1=##$src2) = $src1", + []>, + Requires<[HasV4T]>; + +// memw(Re=#U6)=Rs +def STriw_abs_set_V4 : STInst2<(outs IntRegs:$dst1), + (ins IntRegs:$src1, globaladdress:$src2), + "memw($dst1=##$src2) = $src1", + []>, + Requires<[HasV4T]>; -// memd(Rs+#s11:3)=Rtt // memd(Rs+Ru<<#u2)=Rtt let AddedComplexity = 10, isPredicable = 1 in def STrid_indexed_shl_V4 : STInst<(outs), (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$src3, DoubleRegs:$src4), "memd($src1+$src2<<#$src3) = $src4", - [(store DoubleRegs:$src4, (add IntRegs:$src1, - (shl IntRegs:$src2, u2ImmPred:$src3)))]>, + [(store (i64 DoubleRegs:$src4), + (add (i32 IntRegs:$src1), + (shl (i32 IntRegs:$src2), u2ImmPred:$src3)))]>, Requires<[HasV4T]>; // memd(Ru<<#u2+#U6)=Rtt @@ -990,9 +1818,9 @@ let AddedComplexity = 10 in def STrid_shl_V4 : STInst<(outs), (ins IntRegs:$src1, u2Imm:$src2, u6Imm:$src3, DoubleRegs:$src4), "memd($src1<<#$src2+#$src3) = $src4", - [(store DoubleRegs:$src4, (shl IntRegs:$src1, - (add u2ImmPred:$src2, - u6ImmPred:$src3)))]>, + [(store (i64 DoubleRegs:$src4), + (add (shl (i32 IntRegs:$src1), u2ImmPred:$src2), + u6ImmPred:$src3))]>, Requires<[HasV4T]>; // memd(Rx++#s4:3)=Rtt @@ -1009,8 +1837,9 @@ def STrid_shl_V4 : STInst<(outs), // if ([!]Pv[.new]) memd(Rs+#u6:3)=Rtt // if (Pv) memd(Rs+#u6:3)=Rtt // if (Pv.new) memd(Rs+#u6:3)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def STrid_cdnPt_V4 : STInst<(outs), +let AddedComplexity = 10, neverHasSideEffects = 1, + isPredicated = 1 in +def STrid_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, DoubleRegs:$src2), "if ($src1.new) memd($addr) = $src2", []>, @@ -1018,8 +1847,9 @@ def STrid_cdnPt_V4 : STInst<(outs), // if (!Pv) memd(Rs+#u6:3)=Rtt // if (!Pv.new) memd(Rs+#u6:3)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def STrid_cdnNotPt_V4 : STInst<(outs), +let AddedComplexity = 10, neverHasSideEffects = 1, + isPredicated = 1 in +def STrid_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, DoubleRegs:$src2), "if (!$src1.new) memd($addr) = $src2", []>, @@ -1027,8 +1857,9 @@ def STrid_cdnNotPt_V4 : STInst<(outs), // if (Pv) memd(Rs+#u6:3)=Rtt // if (Pv.new) memd(Rs+#u6:3)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def STrid_indexed_cdnPt_V4 : STInst<(outs), +let 
AddedComplexity = 10, neverHasSideEffects = 1, + isPredicated = 1 in +def STrid_indexed_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3, DoubleRegs:$src4), "if ($src1.new) memd($src2+#$src3) = $src4", @@ -1037,8 +1868,9 @@ def STrid_indexed_cdnPt_V4 : STInst<(outs), // if (!Pv) memd(Rs+#u6:3)=Rtt // if (!Pv.new) memd(Rs+#u6:3)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def STrid_indexed_cdnNotPt_V4 : STInst<(outs), +let AddedComplexity = 10, neverHasSideEffects = 1, + isPredicated = 1 in +def STrid_indexed_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_3Imm:$src3, DoubleRegs:$src4), "if (!$src1.new) memd($src2+#$src3) = $src4", @@ -1047,8 +1879,9 @@ def STrid_indexed_cdnNotPt_V4 : STInst<(outs), // if ([!]Pv[.new]) memd(Rs+Ru<<#u2)=Rtt // if (Pv) memd(Rs+Ru<<#u2)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def STrid_indexed_shl_cPt_V4 : STInst<(outs), +let AddedComplexity = 10, neverHasSideEffects = 1, + isPredicated = 1 in +def STrid_indexed_shl_cPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, DoubleRegs:$src5), "if ($src1) memd($src2+$src3<<#$src4) = $src5", @@ -1056,24 +1889,27 @@ def STrid_indexed_shl_cPt_V4 : STInst<(outs), Requires<[HasV4T]>; // if (Pv.new) memd(Rs+Ru<<#u2)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def STrid_indexed_shl_cdnPt_V4 : STInst<(outs), +let AddedComplexity = 10, neverHasSideEffects = 1, + isPredicated = 1 in +def STrid_indexed_shl_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, DoubleRegs:$src5), - "if ($src1) memd($src2+$src3<<#$src4) = $src5", + "if ($src1.new) memd($src2+$src3<<#$src4) = $src5", []>, Requires<[HasV4T]>; // if (!Pv) memd(Rs+Ru<<#u2)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def STrid_indexed_shl_cNotPt_V4 : STInst<(outs), +let AddedComplexity = 10, neverHasSideEffects = 1, + isPredicated = 1 in +def STrid_indexed_shl_cNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, DoubleRegs:$src5), "if (!$src1) memd($src2+$src3<<#$src4) = $src5", []>, Requires<[HasV4T]>; // if (!Pv.new) memd(Rs+Ru<<#u2)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def STrid_indexed_shl_cdnNotPt_V4 : STInst<(outs), +let AddedComplexity = 10, neverHasSideEffects = 1, + isPredicated = 1 in +def STrid_indexed_shl_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, DoubleRegs:$src5), "if (!$src1.new) memd($src2+$src3<<#$src4) = $src5", @@ -1083,8 +1919,9 @@ def STrid_indexed_shl_cdnNotPt_V4 : STInst<(outs), // if ([!]Pv[.new]) memd(Rx++#s4:3)=Rtt // if (Pv) memd(Rx++#s4:3)=Rtt // if (Pv.new) memd(Rx++#s4:3)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def POST_STdri_cdnPt_V4 : STInstPI<(outs IntRegs:$dst), +let AddedComplexity = 10, neverHasSideEffects = 1, + isPredicated = 1 in +def POST_STdri_cdnPt_V4 : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, s4_3Imm:$offset), "if ($src1.new) memd($src3++#$offset) = $src2", @@ -1094,8 +1931,9 @@ def POST_STdri_cdnPt_V4 : STInstPI<(outs IntRegs:$dst), // if (!Pv) memd(Rx++#s4:3)=Rtt // if (!Pv.new) memd(Rx++#s4:3)=Rtt -let AddedComplexity = 10, mayStore = 1, neverHasSideEffects = 1 in -def POST_STdri_cdnNotPt_V4 : STInstPI<(outs IntRegs:$dst), +let AddedComplexity = 10, neverHasSideEffects = 1, + 
isPredicated = 1 in +def POST_STdri_cdnNotPt_V4 : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, s4_3Imm:$offset), "if (!$src1.new) memd($src3++#$offset) = $src2", @@ -1105,15 +1943,12 @@ def POST_STdri_cdnNotPt_V4 : STInstPI<(outs IntRegs:$dst), // Store byte. -// memb(Re=#U6)=Rt -// TODO: needs to be implemented. -// memb(Rs+#s11:0)=Rt // memb(Rs+#u6:0)=#S8 let AddedComplexity = 10, isPredicable = 1 in def STrib_imm_V4 : STInst<(outs), (ins IntRegs:$src1, u6_0Imm:$src2, s8Imm:$src3), "memb($src1+#$src2) = #$src3", - [(truncstorei8 s8ImmPred:$src3, (add IntRegs:$src1, + [(truncstorei8 s8ImmPred:$src3, (add (i32 IntRegs:$src1), u6_0ImmPred:$src2))]>, Requires<[HasV4T]>; @@ -1122,9 +1957,10 @@ let AddedComplexity = 10, isPredicable = 1 in def STrib_indexed_shl_V4 : STInst<(outs), (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$src3, IntRegs:$src4), "memb($src1+$src2<<#$src3) = $src4", - [(truncstorei8 IntRegs:$src4, (add IntRegs:$src1, - (shl IntRegs:$src2, - u2ImmPred:$src3)))]>, + [(truncstorei8 (i32 IntRegs:$src4), + (add (i32 IntRegs:$src1), + (shl (i32 IntRegs:$src2), + u2ImmPred:$src3)))]>, Requires<[HasV4T]>; // memb(Ru<<#u2+#U6)=Rt @@ -1132,9 +1968,9 @@ let AddedComplexity = 10 in def STrib_shl_V4 : STInst<(outs), (ins IntRegs:$src1, u2Imm:$src2, u6Imm:$src3, IntRegs:$src4), "memb($src1<<#$src2+#$src3) = $src4", - [(truncstorei8 IntRegs:$src4, (shl IntRegs:$src1, - (add u2ImmPred:$src2, - u6ImmPred:$src3)))]>, + [(truncstorei8 (i32 IntRegs:$src4), + (add (shl (i32 IntRegs:$src1), u2ImmPred:$src2), + u6ImmPred:$src3))]>, Requires<[HasV4T]>; // memb(Rx++#s4:0:circ(Mu))=Rt @@ -1148,32 +1984,36 @@ def STrib_shl_V4 : STInst<(outs), // if ([!]Pv[.new]) memb(#u6)=Rt // if ([!]Pv[.new]) memb(Rs+#u6:0)=#S6 // if (Pv) memb(Rs+#u6:0)=#S6 -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_imm_cPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrib_imm_cPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, s6Imm:$src4), "if ($src1) memb($src2+#$src3) = #$src4", []>, Requires<[HasV4T]>; // if (Pv.new) memb(Rs+#u6:0)=#S6 -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_imm_cdnPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrib_imm_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, s6Imm:$src4), "if ($src1.new) memb($src2+#$src3) = #$src4", []>, Requires<[HasV4T]>; // if (!Pv) memb(Rs+#u6:0)=#S6 -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_imm_cNotPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrib_imm_cNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, s6Imm:$src4), "if (!$src1) memb($src2+#$src3) = #$src4", []>, Requires<[HasV4T]>; // if (!Pv.new) memb(Rs+#u6:0)=#S6 -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_imm_cdnNotPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrib_imm_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, s6Imm:$src4), "if (!$src1.new) memb($src2+#$src3) = #$src4", []>, @@ -1182,8 +2022,9 @@ def STrib_imm_cdnNotPt_V4 : STInst<(outs), // if ([!]Pv[.new]) memb(Rs+#u6:0)=Rt // if (Pv) memb(Rs+#u6:0)=Rt // if (Pv.new) memb(Rs+#u6:0)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_cdnPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrib_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if ($src1.new) 
memb($addr) = $src2", []>, @@ -1191,8 +2032,9 @@ def STrib_cdnPt_V4 : STInst<(outs), // if (!Pv) memb(Rs+#u6:0)=Rt // if (!Pv.new) memb(Rs+#u6:0)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_cdnNotPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrib_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if (!$src1.new) memb($addr) = $src2", []>, @@ -1201,16 +2043,18 @@ def STrib_cdnNotPt_V4 : STInst<(outs), // if (Pv) memb(Rs+#u6:0)=Rt // if (!Pv) memb(Rs+#u6:0)=Rt // if (Pv.new) memb(Rs+#u6:0)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_indexed_cdnPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrib_indexed_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4), "if ($src1.new) memb($src2+#$src3) = $src4", []>, Requires<[HasV4T]>; // if (!Pv.new) memb(Rs+#u6:0)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_indexed_cdnNotPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrib_indexed_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4), "if (!$src1.new) memb($src2+#$src3) = $src4", []>, @@ -1218,8 +2062,9 @@ def STrib_indexed_cdnNotPt_V4 : STInst<(outs), // if ([!]Pv[.new]) memb(Rs+Ru<<#u2)=Rt // if (Pv) memb(Rs+Ru<<#u2)=Rt -let mayStore = 1, AddedComplexity = 10 in -def STrib_indexed_shl_cPt_V4 : STInst<(outs), +let AddedComplexity = 10, + isPredicated = 1 in +def STrib_indexed_shl_cPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), "if ($src1) memb($src2+$src3<<#$src4) = $src5", @@ -1227,8 +2072,9 @@ def STrib_indexed_shl_cPt_V4 : STInst<(outs), Requires<[HasV4T]>; // if (Pv.new) memb(Rs+Ru<<#u2)=Rt -let mayStore = 1, AddedComplexity = 10 in -def STrib_indexed_shl_cdnPt_V4 : STInst<(outs), +let AddedComplexity = 10, + isPredicated = 1 in +def STrib_indexed_shl_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), "if ($src1.new) memb($src2+$src3<<#$src4) = $src5", @@ -1236,8 +2082,9 @@ def STrib_indexed_shl_cdnPt_V4 : STInst<(outs), Requires<[HasV4T]>; // if (!Pv) memb(Rs+Ru<<#u2)=Rt -let mayStore = 1, AddedComplexity = 10 in -def STrib_indexed_shl_cNotPt_V4 : STInst<(outs), +let AddedComplexity = 10, + isPredicated = 1 in +def STrib_indexed_shl_cNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), "if (!$src1) memb($src2+$src3<<#$src4) = $src5", @@ -1245,8 +2092,9 @@ def STrib_indexed_shl_cNotPt_V4 : STInst<(outs), Requires<[HasV4T]>; // if (!Pv.new) memb(Rs+Ru<<#u2)=Rt -let mayStore = 1, AddedComplexity = 10 in -def STrib_indexed_shl_cdnNotPt_V4 : STInst<(outs), +let AddedComplexity = 10, + isPredicated = 1 in +def STrib_indexed_shl_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), "if (!$src1.new) memb($src2+$src3<<#$src4) = $src5", @@ -1256,8 +2104,9 @@ def STrib_indexed_shl_cdnNotPt_V4 : STInst<(outs), // if ([!]Pv[.new]) memb(Rx++#s4:0)=Rt // if (Pv) memb(Rx++#s4:0)=Rt // if (Pv.new) memb(Rx++#s4:0)=Rt -let mayStore = 1, hasCtrlDep = 1 in -def POST_STbri_cdnPt_V4 : STInstPI<(outs IntRegs:$dst), +let hasCtrlDep = 1, + isPredicated = 1 in +def POST_STbri_cdnPt_V4 : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), "if ($src1.new) memb($src3++#$offset) = $src2", [],"$src3 = 
$dst">, @@ -1265,8 +2114,9 @@ def POST_STbri_cdnPt_V4 : STInstPI<(outs IntRegs:$dst), // if (!Pv) memb(Rx++#s4:0)=Rt // if (!Pv.new) memb(Rx++#s4:0)=Rt -let mayStore = 1, hasCtrlDep = 1 in -def POST_STbri_cdnNotPt_V4 : STInstPI<(outs IntRegs:$dst), +let hasCtrlDep = 1, + isPredicated = 1 in +def POST_STbri_cdnNotPt_V4 : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), "if (!$src1.new) memb($src3++#$offset) = $src2", [],"$src3 = $dst">, @@ -1274,20 +2124,15 @@ def POST_STbri_cdnNotPt_V4 : STInstPI<(outs IntRegs:$dst), // Store halfword. -// memh(Re=#U6)=Rt.H -// TODO: needs to be implemented - -// memh(Re=#U6)=Rt // TODO: needs to be implemented - +// memh(Re=#U6)=Rt.H // memh(Rs+#s11:1)=Rt.H -// memh(Rs+#s11:1)=Rt // memh(Rs+#u6:1)=#S8 let AddedComplexity = 10, isPredicable = 1 in def STrih_imm_V4 : STInst<(outs), (ins IntRegs:$src1, u6_1Imm:$src2, s8Imm:$src3), "memh($src1+#$src2) = #$src3", - [(truncstorei16 s8ImmPred:$src3, (add IntRegs:$src1, + [(truncstorei16 s8ImmPred:$src3, (add (i32 IntRegs:$src1), u6_1ImmPred:$src2))]>, Requires<[HasV4T]>; @@ -1299,9 +2144,10 @@ let AddedComplexity = 10, isPredicable = 1 in def STrih_indexed_shl_V4 : STInst<(outs), (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$src3, IntRegs:$src4), "memh($src1+$src2<<#$src3) = $src4", - [(truncstorei16 IntRegs:$src4, (add IntRegs:$src1, - (shl IntRegs:$src2, - u2ImmPred:$src3)))]>, + [(truncstorei16 (i32 IntRegs:$src4), + (add (i32 IntRegs:$src1), + (shl (i32 IntRegs:$src2), + u2ImmPred:$src3)))]>, Requires<[HasV4T]>; // memh(Ru<<#u2+#U6)=Rt.H @@ -1310,9 +2156,9 @@ let AddedComplexity = 10 in def STrih_shl_V4 : STInst<(outs), (ins IntRegs:$src1, u2Imm:$src2, u6Imm:$src3, IntRegs:$src4), "memh($src1<<#$src2+#$src3) = $src4", - [(truncstorei16 IntRegs:$src4, (shl IntRegs:$src1, - (add u2ImmPred:$src2, - u6ImmPred:$src3)))]>, + [(truncstorei16 (i32 IntRegs:$src4), + (add (shl (i32 IntRegs:$src1), u2ImmPred:$src2), + u6ImmPred:$src3))]>, Requires<[HasV4T]>; // memh(Rx++#s4:1:circ(Mu))=Rt.H @@ -1323,42 +2169,42 @@ def STrih_shl_V4 : STInst<(outs), // memh(Rx++Mu)=Rt // memh(Rx++Mu:brev)=Rt.H // memh(Rx++Mu:brev)=Rt -// memh(gp+#u16:1)=Rt.H // memh(gp+#u16:1)=Rt - - -// Store halfword conditionally. 
// if ([!]Pv[.new]) memh(#u6)=Rt.H // if ([!]Pv[.new]) memh(#u6)=Rt // if ([!]Pv[.new]) memh(Rs+#u6:1)=#S6 // if (Pv) memh(Rs+#u6:1)=#S6 -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_imm_cPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrih_imm_cPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, s6Imm:$src4), "if ($src1) memh($src2+#$src3) = #$src4", []>, Requires<[HasV4T]>; // if (Pv.new) memh(Rs+#u6:1)=#S6 -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_imm_cdnPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrih_imm_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, s6Imm:$src4), "if ($src1.new) memh($src2+#$src3) = #$src4", []>, Requires<[HasV4T]>; // if (!Pv) memh(Rs+#u6:1)=#S6 -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_imm_cNotPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrih_imm_cNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, s6Imm:$src4), "if (!$src1) memh($src2+#$src3) = #$src4", []>, Requires<[HasV4T]>; // if (!Pv.new) memh(Rs+#u6:1)=#S6 -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_imm_cdnNotPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrih_imm_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, s6Imm:$src4), "if (!$src1.new) memh($src2+#$src3) = #$src4", []>, @@ -1370,8 +2216,9 @@ def STrih_imm_cdnNotPt_V4 : STInst<(outs), // if ([!]Pv[.new]) memh(Rs+#u6:1)=Rt // if (Pv) memh(Rs+#u6:1)=Rt // if (Pv.new) memh(Rs+#u6:1)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_cdnPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrih_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if ($src1.new) memh($addr) = $src2", []>, @@ -1379,24 +2226,27 @@ def STrih_cdnPt_V4 : STInst<(outs), // if (!Pv) memh(Rs+#u6:1)=Rt // if (!Pv.new) memh(Rs+#u6:1)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_cdnNotPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrih_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if (!$src1.new) memh($addr) = $src2", []>, Requires<[HasV4T]>; // if (Pv.new) memh(Rs+#u6:1)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_indexed_cdnPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrih_indexed_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4), "if ($src1.new) memh($src2+#$src3) = $src4", []>, Requires<[HasV4T]>; // if (!Pv.new) memh(Rs+#u6:1)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_indexed_cdnNotPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STrih_indexed_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4), "if (!$src1.new) memh($src2+#$src3) = $src4", []>, @@ -1405,8 +2255,9 @@ def STrih_indexed_cdnNotPt_V4 : STInst<(outs), // if ([!]Pv[.new]) memh(Rs+Ru<<#u2)=Rt.H // if ([!]Pv[.new]) memh(Rs+Ru<<#u2)=Rt // if (Pv) memh(Rs+Ru<<#u2)=Rt -let mayStore = 1, AddedComplexity = 10 in -def STrih_indexed_shl_cPt_V4 : STInst<(outs), +let AddedComplexity = 10, + isPredicated = 1 in +def STrih_indexed_shl_cPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), "if ($src1) memh($src2+$src3<<#$src4) = $src5", @@ -1414,7 +2265,9 
@@ def STrih_indexed_shl_cPt_V4 : STInst<(outs), Requires<[HasV4T]>; // if (Pv.new) memh(Rs+Ru<<#u2)=Rt -def STrih_indexed_shl_cdnPt_V4 : STInst<(outs), +let AddedComplexity = 10, + isPredicated = 1 in +def STrih_indexed_shl_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), "if ($src1.new) memh($src2+$src3<<#$src4) = $src5", @@ -1422,8 +2275,9 @@ def STrih_indexed_shl_cdnPt_V4 : STInst<(outs), Requires<[HasV4T]>; // if (!Pv) memh(Rs+Ru<<#u2)=Rt -let mayStore = 1, AddedComplexity = 10 in -def STrih_indexed_shl_cNotPt_V4 : STInst<(outs), +let AddedComplexity = 10, + isPredicated = 1 in +def STrih_indexed_shl_cNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), "if (!$src1) memh($src2+$src3<<#$src4) = $src5", @@ -1431,8 +2285,9 @@ def STrih_indexed_shl_cNotPt_V4 : STInst<(outs), Requires<[HasV4T]>; // if (!Pv.new) memh(Rs+Ru<<#u2)=Rt -let mayStore = 1, AddedComplexity = 10 in -def STrih_indexed_shl_cdnNotPt_V4 : STInst<(outs), +let AddedComplexity = 10, + isPredicated = 1 in +def STrih_indexed_shl_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), "if (!$src1.new) memh($src2+$src3<<#$src4) = $src5", @@ -1445,8 +2300,9 @@ def STrih_indexed_shl_cdnNotPt_V4 : STInst<(outs), // if ([!]Pv[.new]) memh(Rx++#s4:1)=Rt // if (Pv) memh(Rx++#s4:1)=Rt // if (Pv.new) memh(Rx++#s4:1)=Rt -let mayStore = 1, hasCtrlDep = 1 in -def POST_SThri_cdnPt_V4 : STInstPI<(outs IntRegs:$dst), +let hasCtrlDep = 1, + isPredicated = 1 in +def POST_SThri_cdnPt_V4 : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), "if ($src1.new) memh($src3++#$offset) = $src2", [],"$src3 = $dst">, @@ -1454,8 +2310,9 @@ def POST_SThri_cdnPt_V4 : STInstPI<(outs IntRegs:$dst), // if (!Pv) memh(Rx++#s4:1)=Rt // if (!Pv.new) memh(Rx++#s4:1)=Rt -let mayStore = 1, hasCtrlDep = 1 in -def POST_SThri_cdnNotPt_V4 : STInstPI<(outs IntRegs:$dst), +let hasCtrlDep = 1, + isPredicated = 1 in +def POST_SThri_cdnNotPt_V4 : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), "if (!$src1.new) memh($src3++#$offset) = $src2", [],"$src3 = $dst">, @@ -1466,13 +2323,22 @@ def POST_SThri_cdnNotPt_V4 : STInstPI<(outs IntRegs:$dst), // memw(Re=#U6)=Rt // TODO: Needs to be implemented. 
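The "Store predicate" pseudo defined just below, STriw_pred_V4, pairs with LDriw_pred_V4 from the load section above (the "Fix for bug 5279"). Together they give the register allocator a way to spill and reload PredRegs through memory, and the deliberately invalid asm string guards against either pseudo ever reaching the assembly printer; presumably both are expanded into a predicate/register transfer plus an ordinary memw access before emission. Shown side by side for reference, with the same shape as the defs in this patch (only the _sketch names are this note's own):

// The predicate spill/reload pair as this patch defines it (names shortened).
// Both are side-effect free and must be eliminated before instruction
// emission, hence the poison asm string.
let neverHasSideEffects = 1 in {
  def LDriw_pred_sketch : LDInst2<(outs PredRegs:$dst),
        (ins MEMri:$addr),
        "Error; should not emit", []>,
        Requires<[HasV4T]>;
  def STriw_pred_sketch : STInst2<(outs),
        (ins MEMri:$addr, PredRegs:$src1),
        "Error; should not emit", []>,
        Requires<[HasV4T]>;
}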
-// memw(Rs+#s11:2)=Rt +// Store predicate: +let neverHasSideEffects = 1 in +def STriw_pred_V4 : STInst2<(outs), + (ins MEMri:$addr, PredRegs:$src1), + "Error; should not emit", + []>, + Requires<[HasV4T]>; + + // memw(Rs+#u6:2)=#S8 let AddedComplexity = 10, isPredicable = 1 in def STriw_imm_V4 : STInst<(outs), (ins IntRegs:$src1, u6_2Imm:$src2, s8Imm:$src3), "memw($src1+#$src2) = #$src3", - [(store s8ImmPred:$src3, (add IntRegs:$src1, u6_2ImmPred:$src2))]>, + [(store s8ImmPred:$src3, (add (i32 IntRegs:$src1), + u6_2ImmPred:$src2))]>, Requires<[HasV4T]>; // memw(Rs+Ru<<#u2)=Rt @@ -1480,8 +2346,9 @@ let AddedComplexity = 10, isPredicable = 1 in def STriw_indexed_shl_V4 : STInst<(outs), (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$src3, IntRegs:$src4), "memw($src1+$src2<<#$src3) = $src4", - [(store IntRegs:$src4, (add IntRegs:$src1, - (shl IntRegs:$src2, u2ImmPred:$src3)))]>, + [(store (i32 IntRegs:$src4), (add (i32 IntRegs:$src1), + (shl (i32 IntRegs:$src2), + u2ImmPred:$src3)))]>, Requires<[HasV4T]>; // memw(Ru<<#u2+#U6)=Rt @@ -1489,8 +2356,9 @@ let AddedComplexity = 10 in def STriw_shl_V4 : STInst<(outs), (ins IntRegs:$src1, u2Imm:$src2, u6Imm:$src3, IntRegs:$src4), "memw($src1<<#$src2+#$src3) = $src4", - [(store IntRegs:$src4, (shl IntRegs:$src1, - (add u2ImmPred:$src2, u6ImmPred:$src3)))]>, + [(store (i32 IntRegs:$src4), + (add (shl (i32 IntRegs:$src1), u2ImmPred:$src2), + u6ImmPred:$src3))]>, Requires<[HasV4T]>; // memw(Rx++#s4:2)=Rt @@ -1502,37 +2370,39 @@ def STriw_shl_V4 : STInst<(outs), // Store word conditionally. -// if ([!]Pv[.new]) memw(#u6)=Rt -// TODO: Needs to be implemented. // if ([!]Pv[.new]) memw(Rs+#u6:2)=#S6 // if (Pv) memw(Rs+#u6:2)=#S6 -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_imm_cPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STriw_imm_cPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, s6Imm:$src4), "if ($src1) memw($src2+#$src3) = #$src4", []>, Requires<[HasV4T]>; // if (Pv.new) memw(Rs+#u6:2)=#S6 -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_imm_cdnPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STriw_imm_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, s6Imm:$src4), "if ($src1.new) memw($src2+#$src3) = #$src4", []>, Requires<[HasV4T]>; // if (!Pv) memw(Rs+#u6:2)=#S6 -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_imm_cNotPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STriw_imm_cNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, s6Imm:$src4), "if (!$src1) memw($src2+#$src3) = #$src4", []>, Requires<[HasV4T]>; // if (!Pv.new) memw(Rs+#u6:2)=#S6 -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_imm_cdnNotPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STriw_imm_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, s6Imm:$src4), "if (!$src1.new) memw($src2+#$src3) = #$src4", []>, @@ -1541,8 +2411,9 @@ def STriw_imm_cdnNotPt_V4 : STInst<(outs), // if ([!]Pv[.new]) memw(Rs+#u6:2)=Rt // if (Pv) memw(Rs+#u6:2)=Rt // if (Pv.new) memw(Rs+#u6:2)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_cdnPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STriw_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if ($src1.new) memw($addr) = $src2", []>, @@ -1550,8 +2421,9 @@ def STriw_cdnPt_V4 : STInst<(outs), // if (!Pv) memw(Rs+#u6:2)=Rt // if 
(!Pv.new) memw(Rs+#u6:2)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_cdnNotPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STriw_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if (!$src1.new) memw($addr) = $src2", []>, @@ -1560,16 +2432,18 @@ def STriw_cdnNotPt_V4 : STInst<(outs), // if (Pv) memw(Rs+#u6:2)=Rt // if (!Pv) memw(Rs+#u6:2)=Rt // if (Pv.new) memw(Rs+#u6:2)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_indexed_cdnPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STriw_indexed_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4), "if ($src1.new) memw($src2+#$src3) = $src4", []>, Requires<[HasV4T]>; // if (!Pv.new) memw(Rs+#u6:2)=Rt -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_indexed_cdnNotPt_V4 : STInst<(outs), +let neverHasSideEffects = 1, + isPredicated = 1 in +def STriw_indexed_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4), "if (!$src1.new) memw($src2+#$src3) = $src4", []>, @@ -1577,8 +2451,9 @@ def STriw_indexed_cdnNotPt_V4 : STInst<(outs), // if ([!]Pv[.new]) memw(Rs+Ru<<#u2)=Rt // if (Pv) memw(Rs+Ru<<#u2)=Rt -let mayStore = 1, AddedComplexity = 10 in -def STriw_indexed_shl_cPt_V4 : STInst<(outs), +let AddedComplexity = 10, + isPredicated = 1 in +def STriw_indexed_shl_cPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), "if ($src1) memw($src2+$src3<<#$src4) = $src5", @@ -1586,8 +2461,9 @@ def STriw_indexed_shl_cPt_V4 : STInst<(outs), Requires<[HasV4T]>; // if (Pv.new) memw(Rs+Ru<<#u2)=Rt -let mayStore = 1, AddedComplexity = 10 in -def STriw_indexed_shl_cdnPt_V4 : STInst<(outs), +let AddedComplexity = 10, + isPredicated = 1 in +def STriw_indexed_shl_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), "if ($src1.new) memw($src2+$src3<<#$src4) = $src5", @@ -1595,8 +2471,9 @@ def STriw_indexed_shl_cdnPt_V4 : STInst<(outs), Requires<[HasV4T]>; // if (!Pv) memw(Rs+Ru<<#u2)=Rt -let mayStore = 1, AddedComplexity = 10 in -def STriw_indexed_shl_cNotPt_V4 : STInst<(outs), +let AddedComplexity = 10, + isPredicated = 1 in +def STriw_indexed_shl_cNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), "if (!$src1) memw($src2+$src3<<#$src4) = $src5", @@ -1604,8 +2481,9 @@ def STriw_indexed_shl_cNotPt_V4 : STInst<(outs), Requires<[HasV4T]>; // if (!Pv.new) memw(Rs+Ru<<#u2)=Rt -let mayStore = 1, AddedComplexity = 10 in -def STriw_indexed_shl_cdnNotPt_V4 : STInst<(outs), +let AddedComplexity = 10, + isPredicated = 1 in +def STriw_indexed_shl_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), "if (!$src1.new) memw($src2+$src3<<#$src4) = $src5", @@ -1615,8 +2493,9 @@ def STriw_indexed_shl_cdnNotPt_V4 : STInst<(outs), // if ([!]Pv[.new]) memw(Rx++#s4:2)=Rt // if (Pv) memw(Rx++#s4:2)=Rt // if (Pv.new) memw(Rx++#s4:2)=Rt -let mayStore = 1, hasCtrlDep = 1 in -def POST_STwri_cdnPt_V4 : STInstPI<(outs IntRegs:$dst), +let hasCtrlDep = 1, + isPredicated = 1 in +def POST_STwri_cdnPt_V4 : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), "if ($src1.new) memw($src3++#$offset) = $src2", [],"$src3 = $dst">, @@ -1624,14 +2503,456 @@ def POST_STwri_cdnPt_V4 : STInstPI<(outs IntRegs:$dst), // if (!Pv) memw(Rx++#s4:2)=Rt // 
if (!Pv.new) memw(Rx++#s4:2)=Rt -let mayStore = 1, hasCtrlDep = 1 in -def POST_STwri_cdnNotPt_V4 : STInstPI<(outs IntRegs:$dst), +let hasCtrlDep = 1, + isPredicated = 1 in +def POST_STwri_cdnNotPt_V4 : STInst2PI<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), "if (!$src1.new) memw($src3++#$offset) = $src2", [],"$src3 = $dst">, Requires<[HasV4T]>; +/// store to global address + +let isPredicable = 1, neverHasSideEffects = 1 in +def STrid_GP_V4 : STInst2<(outs), + (ins globaladdress:$global, u16Imm:$offset, DoubleRegs:$src), + "memd(#$global+$offset) = $src", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def STrid_GP_cPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + DoubleRegs:$src2), + "if ($src1) memd(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def STrid_GP_cNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + DoubleRegs:$src2), + "if (!$src1) memd(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def STrid_GP_cdnPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + DoubleRegs:$src2), + "if ($src1.new) memd(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def STrid_GP_cdnNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + DoubleRegs:$src2), + "if (!$src1.new) memd(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; + +let isPredicable = 1, neverHasSideEffects = 1 in +def STrib_GP_V4 : STInst2<(outs), + (ins globaladdress:$global, u16Imm:$offset, IntRegs:$src), + "memb(#$global+$offset) = $src", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def STrib_GP_cPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if ($src1) memb(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def STrib_GP_cNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if (!$src1) memb(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def STrib_GP_cdnPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if ($src1.new) memb(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def STrib_GP_cdnNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if (!$src1.new) memb(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; + +let isPredicable = 1, neverHasSideEffects = 1 in +def STrih_GP_V4 : STInst2<(outs), + (ins globaladdress:$global, u16Imm:$offset, IntRegs:$src), + "memh(#$global+$offset) = $src", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def STrih_GP_cPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if ($src1) memh(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; + +let neverHasSideEffects = 1, isPredicated = 1 in +def STrih_GP_cNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if (!$src1) 
memh(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; +
+let neverHasSideEffects = 1, isPredicated = 1 in +def STrih_GP_cdnPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if ($src1.new) memh(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; +
+let neverHasSideEffects = 1, isPredicated = 1 in +def STrih_GP_cdnNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if (!$src1.new) memh(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; +
+let isPredicable = 1, neverHasSideEffects = 1 in +def STriw_GP_V4 : STInst2<(outs), + (ins globaladdress:$global, u16Imm:$offset, IntRegs:$src), + "memw(#$global+$offset) = $src", + []>, + Requires<[HasV4T]>; +
+let neverHasSideEffects = 1, isPredicated = 1 in +def STriw_GP_cPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if ($src1) memw(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; +
+let neverHasSideEffects = 1, isPredicated = 1 in +def STriw_GP_cNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if (!$src1) memw(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; +
+let neverHasSideEffects = 1, isPredicated = 1 in +def STriw_GP_cdnPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if ($src1.new) memw(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; +
+let neverHasSideEffects = 1, isPredicated = 1 in +def STriw_GP_cdnNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if (!$src1.new) memw(##$global+$offset) = $src2", + []>, + Requires<[HasV4T]>; +
+// memd(#global)=Rtt +let isPredicable = 1, neverHasSideEffects = 1 in +def STd_GP_V4 : STInst2<(outs), + (ins globaladdress:$global, DoubleRegs:$src), + "memd(#$global) = $src", + []>, + Requires<[HasV4T]>; +
+// if (Pv) memd(##global) = Rtt +let neverHasSideEffects = 1, isPredicated = 1 in +def STd_GP_cPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, DoubleRegs:$src2), + "if ($src1) memd(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// if (!Pv) memd(##global) = Rtt +let neverHasSideEffects = 1, isPredicated = 1 in +def STd_GP_cNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, DoubleRegs:$src2), + "if (!$src1) memd(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// if (Pv.new) memd(##global) = Rtt +let neverHasSideEffects = 1, isPredicated = 1 in +def STd_GP_cdnPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, DoubleRegs:$src2), + "if ($src1.new) memd(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// if (!Pv.new) memd(##global) = Rtt +let neverHasSideEffects = 1, isPredicated = 1 in +def STd_GP_cdnNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, DoubleRegs:$src2), + "if (!$src1.new) memd(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// memb(#global)=Rt +let isPredicable = 1, neverHasSideEffects = 1 in +def STb_GP_V4 : STInst2<(outs), + (ins globaladdress:$global, IntRegs:$src), + "memb(#$global) = $src", + []>, + Requires<[HasV4T]>; +
+// if (Pv) memb(##global) = Rt +let neverHasSideEffects = 1, isPredicated = 1 in +def STb_GP_cPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if ($src1) memb(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// if (!Pv) memb(##global) = Rt +let neverHasSideEffects = 1, isPredicated = 1 in +def STb_GP_cNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if (!$src1) memb(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// if (Pv.new) memb(##global) = Rt +let neverHasSideEffects = 1, isPredicated = 1 in +def STb_GP_cdnPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if ($src1.new) memb(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// if (!Pv.new) memb(##global) = Rt +let neverHasSideEffects = 1, isPredicated = 1 in +def STb_GP_cdnNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if (!$src1.new) memb(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// memh(#global)=Rt +let isPredicable = 1, neverHasSideEffects = 1 in +def STh_GP_V4 : STInst2<(outs), + (ins globaladdress:$global, IntRegs:$src), + "memh(#$global) = $src", + []>, + Requires<[HasV4T]>; +
+// if (Pv) memh(##global) = Rt +let neverHasSideEffects = 1, isPredicated = 1 in +def STh_GP_cPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if ($src1) memh(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// if (!Pv) memh(##global) = Rt +let neverHasSideEffects = 1, isPredicated = 1 in +def STh_GP_cNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if (!$src1) memh(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// if (Pv.new) memh(##global) = Rt +let neverHasSideEffects = 1, isPredicated = 1 in +def STh_GP_cdnPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if ($src1.new) memh(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// if (!Pv.new) memh(##global) = Rt +let neverHasSideEffects = 1, isPredicated = 1 in +def STh_GP_cdnNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if (!$src1.new) memh(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// memw(#global)=Rt +let isPredicable = 1, neverHasSideEffects = 1 in +def STw_GP_V4 : STInst2<(outs), + (ins globaladdress:$global, IntRegs:$src), + "memw(#$global) = $src", + []>, + Requires<[HasV4T]>; +
+// if (Pv) memw(##global) = Rt +let neverHasSideEffects = 1, isPredicated = 1 in +def STw_GP_cPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if ($src1) memw(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// if (!Pv) memw(##global) = Rt +let neverHasSideEffects = 1, isPredicated = 1 in +def STw_GP_cNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if (!$src1) memw(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// if (Pv.new) memw(##global) = Rt +let neverHasSideEffects = 1, isPredicated = 1 in +def STw_GP_cdnPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if ($src1.new) memw(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// if (!Pv.new) memw(##global) = Rt +let neverHasSideEffects = 1, isPredicated = 1 in +def STw_GP_cdnNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if (!$src1.new) memw(##$global) = $src2", + []>, + Requires<[HasV4T]>; +
+// 64 bit atomic store +def : Pat <(atomic_store_64 (HexagonCONST32_GP tglobaladdr:$global), + (i64 DoubleRegs:$src1)), + (STd_GP_V4 tglobaladdr:$global, (i64 DoubleRegs:$src1))>, + Requires<[HasV4T]>; +
+// Map from store(globaladdress) -> memd(#foo) +let AddedComplexity = 100 in +def : Pat <(store (i64 DoubleRegs:$src1), +
(HexagonCONST32_GP tglobaladdr:$global)), + (STd_GP_V4 tglobaladdr:$global, (i64 DoubleRegs:$src1))>, + Requires<[HasV4T]>; + +// 8 bit atomic store +def : Pat < (atomic_store_8 (HexagonCONST32_GP tglobaladdr:$global), + (i32 IntRegs:$src1)), + (STb_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + +// Map from store(globaladdress) -> memb(#foo) +let AddedComplexity = 100 in +def : Pat<(truncstorei8 (i32 IntRegs:$src1), + (HexagonCONST32_GP tglobaladdr:$global)), + (STb_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + +// Map from "i1 = constant<-1>; memw(CONST32(#foo)) = i1" +// to "r0 = 1; memw(#foo) = r0" +let AddedComplexity = 100 in +def : Pat<(store (i1 -1), (HexagonCONST32_GP tglobaladdr:$global)), + (STb_GP_V4 tglobaladdr:$global, (TFRI 1))>, + Requires<[HasV4T]>; + +def : Pat<(atomic_store_16 (HexagonCONST32_GP tglobaladdr:$global), + (i32 IntRegs:$src1)), + (STh_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + +// Map from store(globaladdress) -> memh(#foo) +let AddedComplexity = 100 in +def : Pat<(truncstorei16 (i32 IntRegs:$src1), + (HexagonCONST32_GP tglobaladdr:$global)), + (STh_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + +// 32 bit atomic store +def : Pat<(atomic_store_32 (HexagonCONST32_GP tglobaladdr:$global), + (i32 IntRegs:$src1)), + (STw_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + +// Map from store(globaladdress) -> memw(#foo) +let AddedComplexity = 100 in +def : Pat<(store (i32 IntRegs:$src1), (HexagonCONST32_GP tglobaladdr:$global)), + (STw_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + +def : Pat<(atomic_store_64 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset), + (i64 DoubleRegs:$src1)), + (STrid_GP_V4 tglobaladdr:$global, u16ImmPred:$offset, + (i64 DoubleRegs:$src1))>, + Requires<[HasV4T]>; + +def : Pat<(atomic_store_32 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset), + (i32 IntRegs:$src1)), + (STriw_GP_V4 tglobaladdr:$global, u16ImmPred:$offset, + (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + +def : Pat<(atomic_store_16 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset), + (i32 IntRegs:$src1)), + (STrih_GP_V4 tglobaladdr:$global, u16ImmPred:$offset, + (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + +def : Pat<(atomic_store_8 (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset), + (i32 IntRegs:$src1)), + (STrib_GP_V4 tglobaladdr:$global, u16ImmPred:$offset, + (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + +// Map from store(globaladdress + x) -> memd(#foo + x) +let AddedComplexity = 100 in +def : Pat<(store (i64 DoubleRegs:$src1), + (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset)), + (STrid_GP_V4 tglobaladdr:$global, u16ImmPred:$offset, + (i64 DoubleRegs:$src1))>, + Requires<[HasV4T]>; + +// Map from store(globaladdress + x) -> memb(#foo + x) +let AddedComplexity = 100 in +def : Pat<(truncstorei8 (i32 IntRegs:$src1), + (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset)), + (STrib_GP_V4 tglobaladdr:$global, u16ImmPred:$offset, + (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + +// Map from store(globaladdress + x) -> memh(#foo + x) +let AddedComplexity = 100 in +def : Pat<(truncstorei16 (i32 IntRegs:$src1), + (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset)), + (STrih_GP_V4 tglobaladdr:$global, u16ImmPred:$offset, + (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + +// Map from 
store(globaladdress + x) -> memw(#foo + x) +let AddedComplexity = 100 in +def : Pat<(store (i32 IntRegs:$src1), + (add (HexagonCONST32_GP tglobaladdr:$global), + u16ImmPred:$offset)), + (STriw_GP_V4 tglobaladdr:$global, u16ImmPred:$offset, + (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + + + //===----------------------------------------------------------------------=== // ST - //===----------------------------------------------------------------------=== @@ -1696,11 +3017,19 @@ def STrib_GP_nv_V4 : NVInst_V4<(outs), []>, Requires<[HasV4T]>; +// memb(#global)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STb_GP_nv_V4 : NVInst_V4<(outs), + (ins globaladdress:$global, IntRegs:$src), + "memb(#$global) = $src.new", + []>, + Requires<[HasV4T]>; // Store new-value byte conditionally. // if ([!]Pv[.new]) memb(#u6)=Nt.new // if (Pv) memb(Rs+#u6:0)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrib_cPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if ($src1) memb($addr) = $src2.new", @@ -1708,7 +3037,8 @@ def STrib_cPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (Pv.new) memb(Rs+#u6:0)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrib_cdnPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if ($src1.new) memb($addr) = $src2.new", @@ -1716,7 +3046,8 @@ def STrib_cdnPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv) memb(Rs+#u6:0)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrib_cNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if (!$src1) memb($addr) = $src2.new", @@ -1724,7 +3055,8 @@ def STrib_cNotPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv.new) memb(Rs+#u6:0)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrib_cdnNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if (!$src1.new) memb($addr) = $src2.new", @@ -1732,7 +3064,8 @@ def STrib_cdnNotPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (Pv) memb(Rs+#u6:0)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrib_indexed_cPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4), "if ($src1) memb($src2+#$src3) = $src4.new", @@ -1740,7 +3073,8 @@ def STrib_indexed_cPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (Pv.new) memb(Rs+#u6:0)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrib_indexed_cdnPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4), "if ($src1.new) memb($src2+#$src3) = $src4.new", @@ -1748,7 +3082,8 @@ def STrib_indexed_cdnPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv) memb(Rs+#u6:0)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrib_indexed_cNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4), "if (!$src1) memb($src2+#$src3) = $src4.new", @@ -1756,7 +3091,8 @@ def STrib_indexed_cNotPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv.new) memb(Rs+#u6:0)=Nt.new -let mayStore 
= 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrib_indexed_cdnNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_0Imm:$src3, IntRegs:$src4), "if (!$src1.new) memb($src2+#$src3) = $src4.new", @@ -1766,7 +3102,8 @@ def STrib_indexed_cdnNotPt_nv_V4 : NVInst_V4<(outs), // if ([!]Pv[.new]) memb(Rs+Ru<<#u2)=Nt.new // if (Pv) memb(Rs+Ru<<#u2)=Nt.new -let mayStore = 1, AddedComplexity = 10 in +let mayStore = 1, AddedComplexity = 10, + isPredicated = 1 in def STrib_indexed_shl_cPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), @@ -1775,7 +3112,8 @@ def STrib_indexed_shl_cPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (Pv.new) memb(Rs+Ru<<#u2)=Nt.new -let mayStore = 1, AddedComplexity = 10 in +let mayStore = 1, AddedComplexity = 10, + isPredicated = 1 in def STrib_indexed_shl_cdnPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), @@ -1784,7 +3122,8 @@ def STrib_indexed_shl_cdnPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv) memb(Rs+Ru<<#u2)=Nt.new -let mayStore = 1, AddedComplexity = 10 in +let mayStore = 1, AddedComplexity = 10, + isPredicated = 1 in def STrib_indexed_shl_cNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), @@ -1793,7 +3132,8 @@ def STrib_indexed_shl_cNotPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv.new) memb(Rs+Ru<<#u2)=Nt.new -let mayStore = 1, AddedComplexity = 10 in +let mayStore = 1, AddedComplexity = 10, + isPredicated = 1 in def STrib_indexed_shl_cdnNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), @@ -1803,7 +3143,8 @@ def STrib_indexed_shl_cdnNotPt_nv_V4 : NVInst_V4<(outs), // if ([!]Pv[.new]) memb(Rx++#s4:0)=Nt.new // if (Pv) memb(Rx++#s4:0)=Nt.new -let mayStore = 1, hasCtrlDep = 1 in +let mayStore = 1, hasCtrlDep = 1, + isPredicated = 1 in def POST_STbri_cPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), "if ($src1) memb($src3++#$offset) = $src2.new", @@ -1811,7 +3152,8 @@ def POST_STbri_cPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (Pv.new) memb(Rx++#s4:0)=Nt.new -let mayStore = 1, hasCtrlDep = 1 in +let mayStore = 1, hasCtrlDep = 1, + isPredicated = 1 in def POST_STbri_cdnPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), "if ($src1.new) memb($src3++#$offset) = $src2.new", @@ -1819,7 +3161,8 @@ def POST_STbri_cdnPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv) memb(Rx++#s4:0)=Nt.new -let mayStore = 1, hasCtrlDep = 1 in +let mayStore = 1, hasCtrlDep = 1, + isPredicated = 1 in def POST_STbri_cNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), "if (!$src1) memb($src3++#$offset) = $src2.new", @@ -1827,7 +3170,8 @@ def POST_STbri_cNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv.new) memb(Rx++#s4:0)=Nt.new -let mayStore = 1, hasCtrlDep = 1 in +let mayStore = 1, hasCtrlDep = 1, + isPredicated = 1 in def POST_STbri_cdnNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), "if (!$src1.new) memb($src3++#$offset) = $src2.new", @@ -1889,6 +3233,14 @@ def STrih_GP_nv_V4 : NVInst_V4<(outs), []>, 
Requires<[HasV4T]>; +// memh(#global)=Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STh_GP_nv_V4 : NVInst_V4<(outs), + (ins globaladdress:$global, IntRegs:$src), + "memh(#$global) = $src.new", + []>, + Requires<[HasV4T]>; + // Store new-value halfword conditionally. @@ -1896,7 +3248,8 @@ def STrih_GP_nv_V4 : NVInst_V4<(outs), // if ([!]Pv[.new]) memh(Rs+#u6:1)=Nt.new // if (Pv) memh(Rs+#u6:1)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrih_cPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if ($src1) memh($addr) = $src2.new", @@ -1904,7 +3257,8 @@ def STrih_cPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (Pv.new) memh(Rs+#u6:1)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrih_cdnPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if ($src1.new) memh($addr) = $src2.new", @@ -1912,7 +3266,8 @@ def STrih_cdnPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv) memh(Rs+#u6:1)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrih_cNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if (!$src1) memh($addr) = $src2.new", @@ -1920,7 +3275,8 @@ def STrih_cNotPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv.new) memh(Rs+#u6:1)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrih_cdnNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if (!$src1.new) memh($addr) = $src2.new", @@ -1928,7 +3284,8 @@ def STrih_cdnNotPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (Pv) memh(Rs+#u6:1)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrih_indexed_cPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4), "if ($src1) memh($src2+#$src3) = $src4.new", @@ -1936,7 +3293,8 @@ def STrih_indexed_cPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (Pv.new) memh(Rs+#u6:1)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrih_indexed_cdnPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4), "if ($src1.new) memh($src2+#$src3) = $src4.new", @@ -1944,7 +3302,8 @@ def STrih_indexed_cdnPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv) memh(Rs+#u6:1)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrih_indexed_cNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4), "if (!$src1) memh($src2+#$src3) = $src4.new", @@ -1952,7 +3311,8 @@ def STrih_indexed_cNotPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv.new) memh(Rs+#u6:1)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STrih_indexed_cdnNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_1Imm:$src3, IntRegs:$src4), "if (!$src1.new) memh($src2+#$src3) = $src4.new", @@ -1961,7 +3321,8 @@ def STrih_indexed_cdnNotPt_nv_V4 : NVInst_V4<(outs), // if ([!]Pv[.new]) memh(Rs+Ru<<#u2)=Nt.new // if (Pv) memh(Rs+Ru<<#u2)=Nt.new -let mayStore = 1, 
AddedComplexity = 10 in +let mayStore = 1, AddedComplexity = 10, + isPredicated = 1 in def STrih_indexed_shl_cPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), @@ -1970,7 +3331,8 @@ def STrih_indexed_shl_cPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (Pv.new) memh(Rs+Ru<<#u2)=Nt.new -let mayStore = 1, AddedComplexity = 10 in +let mayStore = 1, AddedComplexity = 10, + isPredicated = 1 in def STrih_indexed_shl_cdnPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), @@ -1979,7 +3341,8 @@ def STrih_indexed_shl_cdnPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv) memh(Rs+Ru<<#u2)=Nt.new -let mayStore = 1, AddedComplexity = 10 in +let mayStore = 1, AddedComplexity = 10, + isPredicated = 1 in def STrih_indexed_shl_cNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), @@ -1988,7 +3351,8 @@ def STrih_indexed_shl_cNotPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv.new) memh(Rs+Ru<<#u2)=Nt.new -let mayStore = 1, AddedComplexity = 10 in +let mayStore = 1, AddedComplexity = 10, + isPredicated = 1 in def STrih_indexed_shl_cdnNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), @@ -1998,7 +3362,8 @@ def STrih_indexed_shl_cdnNotPt_nv_V4 : NVInst_V4<(outs), // if ([!]Pv[.new]) memh(Rx++#s4:1)=Nt.new // if (Pv) memh(Rx++#s4:1)=Nt.new -let mayStore = 1, hasCtrlDep = 1 in +let mayStore = 1, hasCtrlDep = 1, + isPredicated = 1 in def POST_SThri_cPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), "if ($src1) memh($src3++#$offset) = $src2.new", @@ -2006,7 +3371,8 @@ def POST_SThri_cPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (Pv.new) memh(Rx++#s4:1)=Nt.new -let mayStore = 1, hasCtrlDep = 1 in +let mayStore = 1, hasCtrlDep = 1, + isPredicated = 1 in def POST_SThri_cdnPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), "if ($src1.new) memh($src3++#$offset) = $src2.new", @@ -2014,7 +3380,8 @@ def POST_SThri_cdnPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv) memh(Rx++#s4:1)=Nt.new -let mayStore = 1, hasCtrlDep = 1 in +let mayStore = 1, hasCtrlDep = 1, + isPredicated = 1 in def POST_SThri_cNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), "if (!$src1) memh($src3++#$offset) = $src2.new", @@ -2022,7 +3389,8 @@ def POST_SThri_cNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv.new) memh(Rx++#s4:1)=Nt.new -let mayStore = 1, hasCtrlDep = 1 in +let mayStore = 1, hasCtrlDep = 1, + isPredicated = 1 in def POST_SThri_cdnNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), "if (!$src1.new) memh($src3++#$offset) = $src2.new", @@ -2085,6 +3453,12 @@ def STriw_GP_nv_V4 : NVInst_V4<(outs), []>, Requires<[HasV4T]>; +let mayStore = 1, neverHasSideEffects = 1 in +def STw_GP_nv_V4 : NVInst_V4<(outs), + (ins globaladdress:$global, IntRegs:$src), + "memw(#$global) = $src.new", + []>, + Requires<[HasV4T]>; // Store new-value word conditionally. 
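The change repeated across these hunks is mechanical: every conditional store (and conditional new-value store) exists in four variants, distinguished by the predicate sense (cPt vs. cNotPt) and by whether the predicate register is read as a .new value (cdnPt, cdnNotPt), and each variant now carries isPredicated = 1 so the target-independent predication hooks can recognize it. A minimal sketch of that four-way scheme, assuming a hypothetical multiclass named ST_MEMri_Pred (the file itself, as above, writes every variant out longhand):

// Sketch only: ST_MEMri_Pred is not defined in this patch; it illustrates
// the cPt/cNotPt/cdnPt/cdnNotPt naming and flag scheme used above.
multiclass ST_MEMri_Pred<string MemOp> {
  let neverHasSideEffects = 1, isPredicated = 1 in {
    def _cPt_V4 : STInst2<(outs),
          (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
          !strconcat("if ($src1) ", !strconcat(MemOp, "($addr) = $src2")),
          []>, Requires<[HasV4T]>;
    def _cNotPt_V4 : STInst2<(outs),
          (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
          !strconcat("if (!$src1) ", !strconcat(MemOp, "($addr) = $src2")),
          []>, Requires<[HasV4T]>;
    def _cdnPt_V4 : STInst2<(outs),
          (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
          !strconcat("if ($src1.new) ", !strconcat(MemOp, "($addr) = $src2")),
          []>, Requires<[HasV4T]>;
    def _cdnNotPt_V4 : STInst2<(outs),
          (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2),
          !strconcat("if (!$src1.new) ", !strconcat(MemOp, "($addr) = $src2")),
          []>, Requires<[HasV4T]>;
  }
}
// A hypothetical use: defm STriw_sketch : ST_MEMri_Pred<"memw">; would
// expand to the four predicated forms of a word store.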
@@ -2092,7 +3466,8 @@ def STriw_GP_nv_V4 : NVInst_V4<(outs), // if ([!]Pv[.new]) memw(Rs+#u6:2)=Nt.new // if (Pv) memw(Rs+#u6:2)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STriw_cPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if ($src1) memw($addr) = $src2.new", @@ -2100,7 +3475,8 @@ def STriw_cPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (Pv.new) memw(Rs+#u6:2)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STriw_cdnPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if ($src1.new) memw($addr) = $src2.new", @@ -2108,7 +3484,8 @@ def STriw_cdnPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv) memw(Rs+#u6:2)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STriw_cNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if (!$src1) memw($addr) = $src2.new", @@ -2116,7 +3493,8 @@ def STriw_cNotPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv.new) memw(Rs+#u6:2)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STriw_cdnNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), "if (!$src1.new) memw($addr) = $src2.new", @@ -2124,7 +3502,8 @@ def STriw_cdnNotPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (Pv) memw(Rs+#u6:2)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STriw_indexed_cPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4), "if ($src1) memw($src2+#$src3) = $src4.new", @@ -2132,7 +3511,8 @@ def STriw_indexed_cPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (Pv.new) memw(Rs+#u6:2)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STriw_indexed_cdnPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4), "if ($src1.new) memw($src2+#$src3) = $src4.new", @@ -2140,7 +3520,8 @@ def STriw_indexed_cdnPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv) memw(Rs+#u6:2)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STriw_indexed_cNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4), "if (!$src1) memw($src2+#$src3) = $src4.new", @@ -2148,7 +3529,8 @@ def STriw_indexed_cNotPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv.new) memw(Rs+#u6:2)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, + isPredicated = 1 in def STriw_indexed_cdnNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, u6_2Imm:$src3, IntRegs:$src4), "if (!$src1.new) memw($src2+#$src3) = $src4.new", @@ -2158,7 +3540,8 @@ def STriw_indexed_cdnNotPt_nv_V4 : NVInst_V4<(outs), // if ([!]Pv[.new]) memw(Rs+Ru<<#u2)=Nt.new // if (Pv) memw(Rs+Ru<<#u2)=Nt.new -let mayStore = 1, AddedComplexity = 10 in +let mayStore = 1, AddedComplexity = 10, + isPredicated = 1 in def STriw_indexed_shl_cPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), @@ -2167,7 +3550,8 @@ def STriw_indexed_shl_cPt_nv_V4 : NVInst_V4<(outs), 
Requires<[HasV4T]>; // if (Pv.new) memw(Rs+Ru<<#u2)=Nt.new -let mayStore = 1, AddedComplexity = 10 in +let mayStore = 1, AddedComplexity = 10, + isPredicated = 1 in def STriw_indexed_shl_cdnPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), @@ -2176,7 +3560,8 @@ def STriw_indexed_shl_cdnPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv) memw(Rs+Ru<<#u2)=Nt.new -let mayStore = 1, AddedComplexity = 10 in +let mayStore = 1, AddedComplexity = 10, + isPredicated = 1 in def STriw_indexed_shl_cNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), @@ -2185,7 +3570,8 @@ def STriw_indexed_shl_cNotPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv.new) memw(Rs+Ru<<#u2)=Nt.new -let mayStore = 1, AddedComplexity = 10 in +let mayStore = 1, AddedComplexity = 10, + isPredicated = 1 in def STriw_indexed_shl_cdnNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4, IntRegs:$src5), @@ -2195,7 +3581,8 @@ def STriw_indexed_shl_cdnNotPt_nv_V4 : NVInst_V4<(outs), // if ([!]Pv[.new]) memw(Rx++#s4:2)=Nt.new // if (Pv) memw(Rx++#s4:2)=Nt.new -let mayStore = 1, hasCtrlDep = 1 in +let mayStore = 1, hasCtrlDep = 1, + isPredicated = 1 in def POST_STwri_cPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), "if ($src1) memw($src3++#$offset) = $src2.new", @@ -2203,7 +3590,8 @@ def POST_STwri_cPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (Pv.new) memw(Rx++#s4:2)=Nt.new -let mayStore = 1, hasCtrlDep = 1 in +let mayStore = 1, hasCtrlDep = 1, + isPredicated = 1 in def POST_STwri_cdnPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), "if ($src1.new) memw($src3++#$offset) = $src2.new", @@ -2211,7 +3599,8 @@ def POST_STwri_cdnPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv) memw(Rx++#s4:2)=Nt.new -let mayStore = 1, hasCtrlDep = 1 in +let mayStore = 1, hasCtrlDep = 1, + isPredicated = 1 in def POST_STwri_cNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), "if (!$src1) memw($src3++#$offset) = $src2.new", @@ -2219,7 +3608,8 @@ def POST_STwri_cNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), Requires<[HasV4T]>; // if (!Pv.new) memw(Rx++#s4:2)=Nt.new -let mayStore = 1, hasCtrlDep = 1 in +let mayStore = 1, hasCtrlDep = 1, + isPredicated = 1 in def POST_STwri_cdnNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), "if (!$src1.new) memw($src3++#$offset) = $src2.new", @@ -2227,6 +3617,199 @@ def POST_STwri_cdnNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), Requires<[HasV4T]>; +
+// if (Pv) memb(##global) = Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STb_GP_cPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if ($src1) memb(##$global) = $src2.new", + []>, + Requires<[HasV4T]>; +
+// if (!Pv) memb(##global) = Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STb_GP_cNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if (!$src1) memb(##$global) = $src2.new", + []>, + Requires<[HasV4T]>; +
+// if (Pv.new) memb(##global) = Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STb_GP_cdnPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if ($src1.new) memb(##$global) = $src2.new", + []>, + Requires<[HasV4T]>; +
+// if (!Pv.new) memb(##global) = Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STb_GP_cdnNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if (!$src1.new) memb(##$global) = $src2.new", + []>, + Requires<[HasV4T]>; +
+// if (Pv) memh(##global) = Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STh_GP_cPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if ($src1) memh(##$global) = $src2.new", + []>, + Requires<[HasV4T]>; +
+// if (!Pv) memh(##global) = Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STh_GP_cNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if (!$src1) memh(##$global) = $src2.new", + []>, + Requires<[HasV4T]>; +
+// if (Pv.new) memh(##global) = Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STh_GP_cdnPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if ($src1.new) memh(##$global) = $src2.new", + []>, + Requires<[HasV4T]>; +
+// if (!Pv.new) memh(##global) = Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STh_GP_cdnNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if (!$src1.new) memh(##$global) = $src2.new", + []>, + Requires<[HasV4T]>; +
+// if (Pv) memw(##global) = Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STw_GP_cPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if ($src1) memw(##$global) = $src2.new", + []>, + Requires<[HasV4T]>; +
+// if (!Pv) memw(##global) = Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STw_GP_cNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if (!$src1) memw(##$global) = $src2.new", + []>, + Requires<[HasV4T]>; +
+// if (Pv.new) memw(##global) = Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STw_GP_cdnPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if ($src1.new) memw(##$global) = $src2.new", + []>, + Requires<[HasV4T]>; +
+// if (!Pv.new) memw(##global) = Nt.new +let mayStore = 1, neverHasSideEffects = 1 in +def STw_GP_cdnNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), + "if (!$src1.new) memw(##$global) = $src2.new", + []>, + Requires<[HasV4T]>; +
+let mayStore = 1, neverHasSideEffects = 1 in +def STrib_GP_cPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if ($src1) memb(##$global+$offset) = $src2.new", + []>, + Requires<[HasV4T]>; +
+let mayStore = 1, neverHasSideEffects = 1 in +def STrib_GP_cNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if (!$src1) memb(##$global+$offset) = $src2.new", + []>, + Requires<[HasV4T]>; +
+let mayStore = 1, neverHasSideEffects = 1 in +def STrib_GP_cdnPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if ($src1.new) memb(##$global+$offset) = $src2.new", + []>, + Requires<[HasV4T]>; +
+let mayStore = 1, neverHasSideEffects = 1 in +def STrib_GP_cdnNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if (!$src1.new) memb(##$global+$offset) = $src2.new", + []>, + Requires<[HasV4T]>; +
+let mayStore = 1, neverHasSideEffects = 1 in +def STrih_GP_cPt_nv_V4 :
NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if ($src1) memh(##$global+$offset) = $src2.new", + []>, + Requires<[HasV4T]>; + +let mayStore = 1, neverHasSideEffects = 1 in +def STrih_GP_cNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if (!$src1) memh(##$global+$offset) = $src2.new", + []>, + Requires<[HasV4T]>; + +let mayStore = 1, neverHasSideEffects = 1 in +def STrih_GP_cdnPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if ($src1.new) memh(##$global+$offset) = $src2.new", + []>, + Requires<[HasV4T]>; + +let mayStore = 1, neverHasSideEffects = 1 in +def STrih_GP_cdnNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if (!$src1.new) memh(##$global+$offset) = $src2.new", + []>, + Requires<[HasV4T]>; + +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_GP_cPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if ($src1) memw(##$global+$offset) = $src2.new", + []>, + Requires<[HasV4T]>; + +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_GP_cNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if (!$src1) memw(##$global+$offset) = $src2.new", + []>, + Requires<[HasV4T]>; + +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_GP_cdnPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if ($src1.new) memw(##$global+$offset) = $src2.new", + []>, + Requires<[HasV4T]>; + +let mayStore = 1, neverHasSideEffects = 1 in +def STriw_GP_cdnNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, + IntRegs:$src2), + "if (!$src1.new) memw(##$global+$offset) = $src2.new", + []>, + Requires<[HasV4T]>; + //===----------------------------------------------------------------------===// // NV/ST - //===----------------------------------------------------------------------===// @@ -2253,7 +3836,8 @@ multiclass NVJ_type_basic_reg<string NotStr, string OpcStr, string TakenStr> { Requires<[HasV4T]>; } -multiclass NVJ_type_basic_2ndDotNew<string NotStr, string OpcStr, string TakenStr> { +multiclass NVJ_type_basic_2ndDotNew<string NotStr, string OpcStr, + string TakenStr> { def _ie_nv_V4 : NVInst_V4<(outs), (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset), !strconcat("if (", !strconcat(NotStr, !strconcat(OpcStr, @@ -2307,7 +3891,8 @@ multiclass NVJ_type_basic_neg<string NotStr, string OpcStr, string TakenStr> { Requires<[HasV4T]>; } -multiclass NVJ_type_basic_tstbit<string NotStr, string OpcStr, string TakenStr> { +multiclass NVJ_type_basic_tstbit<string NotStr, string OpcStr, + string TakenStr> { def _ie_nv_V4 : NVInst_V4<(outs), (ins IntRegs:$src1, u1Imm:$src2, brtarget:$offset), !strconcat("if (", !strconcat(NotStr, !strconcat(OpcStr, @@ -2416,16 +4001,18 @@ let isBranch = 1, isTerminator=1, neverHasSideEffects = 1, Defs = [PC] in { def ADDr_ADDri_V4 : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, s6Imm:$src3), "$dst = add($src1, add($src2, #$src3))", - [(set IntRegs:$dst, - (add IntRegs:$src1, (add IntRegs:$src2, s6ImmPred:$src3)))]>, + [(set (i32 IntRegs:$dst), + (add (i32 IntRegs:$src1), (add (i32 IntRegs:$src2), + s6ImmPred:$src3)))]>, Requires<[HasV4T]>; // Rd=add(Rs,sub(#s6,Ru)) def ADDr_SUBri_V4 : MInst<(outs 
IntRegs:$dst), (ins IntRegs:$src1, s6Imm:$src2, IntRegs:$src3), "$dst = add($src1, sub(#$src2, $src3))", - [(set IntRegs:$dst, - (add IntRegs:$src1, (sub s6ImmPred:$src2, IntRegs:$src3)))]>, + [(set (i32 IntRegs:$dst), + (add (i32 IntRegs:$src1), (sub s6ImmPred:$src2, + (i32 IntRegs:$src3))))]>, Requires<[HasV4T]>; // Generates the same instruction as ADDr_SUBri_V4 but matches different @@ -2434,8 +4021,9 @@ def ADDr_SUBri_V4 : MInst<(outs IntRegs:$dst), def ADDri_SUBr_V4 : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s6Imm:$src2, IntRegs:$src3), "$dst = add($src1, sub(#$src2, $src3))", - [(set IntRegs:$dst, - (sub (add IntRegs:$src1, s6ImmPred:$src2), IntRegs:$src3))]>, + [(set (i32 IntRegs:$dst), + (sub (add (i32 IntRegs:$src1), s6ImmPred:$src2), + (i32 IntRegs:$src3)))]>, Requires<[HasV4T]>; @@ -2451,16 +4039,16 @@ def ADDri_SUBr_V4 : MInst<(outs IntRegs:$dst), def ANDd_NOTd_V4 : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), "$dst = and($src1, ~$src2)", - [(set DoubleRegs:$dst, (and DoubleRegs:$src1, - (not DoubleRegs:$src2)))]>, + [(set (i64 DoubleRegs:$dst), (and (i64 DoubleRegs:$src1), + (not (i64 DoubleRegs:$src2))))]>, Requires<[HasV4T]>; // Rdd=or(Rtt,~Rss) def ORd_NOTd_V4 : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), "$dst = or($src1, ~$src2)", - [(set DoubleRegs:$dst, - (or DoubleRegs:$src1, (not DoubleRegs:$src2)))]>, + [(set (i64 DoubleRegs:$dst), + (or (i64 DoubleRegs:$src1), (not (i64 DoubleRegs:$src2))))]>, Requires<[HasV4T]>; @@ -2469,8 +4057,9 @@ def ORd_NOTd_V4 : MInst<(outs DoubleRegs:$dst), def XORd_XORdd: MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), "$dst ^= xor($src2, $src3)", - [(set DoubleRegs:$dst, - (xor DoubleRegs:$src1, (xor DoubleRegs:$src2, DoubleRegs:$src3)))], + [(set (i64 DoubleRegs:$dst), + (xor (i64 DoubleRegs:$src1), (xor (i64 DoubleRegs:$src2), + (i64 DoubleRegs:$src3))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2480,8 +4069,9 @@ def XORd_XORdd: MInst_acc<(outs DoubleRegs:$dst), def ORr_ANDri_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, s10Imm:$src3), "$dst = or($src1, and($src2, #$src3))", - [(set IntRegs:$dst, - (or IntRegs:$src1, (and IntRegs:$src2, s10ImmPred:$src3)))], + [(set (i32 IntRegs:$dst), + (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2), + s10ImmPred:$src3)))], "$src2 = $dst">, Requires<[HasV4T]>; @@ -2490,8 +4080,9 @@ def ORr_ANDri_V4 : MInst_acc<(outs IntRegs:$dst), def ANDr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst &= and($src2, $src3)", - [(set IntRegs:$dst, - (and IntRegs:$src1, (and IntRegs:$src2, IntRegs:$src3)))], + [(set (i32 IntRegs:$dst), + (and (i32 IntRegs:$src1), (and (i32 IntRegs:$src2), + (i32 IntRegs:$src3))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2499,8 +4090,9 @@ def ANDr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst), def ORr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst |= and($src2, $src3)", - [(set IntRegs:$dst, - (or IntRegs:$src1, (and IntRegs:$src2, IntRegs:$src3)))], + [(set (i32 IntRegs:$dst), + (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2), + (i32 IntRegs:$src3))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2508,8 +4100,9 @@ def ORr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst), def XORr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst ^= and($src2, $src3)", - [(set IntRegs:$dst, - (xor IntRegs:$src1, (and 
IntRegs:$src2, IntRegs:$src3)))], + [(set (i32 IntRegs:$dst), + (xor (i32 IntRegs:$src1), (and (i32 IntRegs:$src2), + (i32 IntRegs:$src3))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2518,8 +4111,9 @@ def XORr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst), def ANDr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst &= and($src2, ~$src3)", - [(set IntRegs:$dst, - (and IntRegs:$src1, (and IntRegs:$src2, (not IntRegs:$src3))))], + [(set (i32 IntRegs:$dst), + (and (i32 IntRegs:$src1), (and (i32 IntRegs:$src2), + (not (i32 IntRegs:$src3)))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2527,8 +4121,9 @@ def ANDr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst), def ORr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst |= and($src2, ~$src3)", - [(set IntRegs:$dst, - (or IntRegs:$src1, (and IntRegs:$src2, (not IntRegs:$src3))))], + [(set (i32 IntRegs:$dst), + (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2), + (not (i32 IntRegs:$src3)))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2536,8 +4131,9 @@ def ORr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst), def XORr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst ^= and($src2, ~$src3)", - [(set IntRegs:$dst, - (xor IntRegs:$src1, (and IntRegs:$src2, (not IntRegs:$src3))))], + [(set (i32 IntRegs:$dst), + (xor (i32 IntRegs:$src1), (and (i32 IntRegs:$src2), + (not (i32 IntRegs:$src3)))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2546,8 +4142,9 @@ def XORr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst), def ANDr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst &= or($src2, $src3)", - [(set IntRegs:$dst, - (and IntRegs:$src1, (or IntRegs:$src2, IntRegs:$src3)))], + [(set (i32 IntRegs:$dst), + (and (i32 IntRegs:$src1), (or (i32 IntRegs:$src2), + (i32 IntRegs:$src3))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2555,8 +4152,9 @@ def ANDr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst), def ORr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst |= or($src2, $src3)", - [(set IntRegs:$dst, - (or IntRegs:$src1, (or IntRegs:$src2, IntRegs:$src3)))], + [(set (i32 IntRegs:$dst), + (or (i32 IntRegs:$src1), (or (i32 IntRegs:$src2), + (i32 IntRegs:$src3))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2564,8 +4162,9 @@ def ORr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst), def XORr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst ^= or($src2, $src3)", - [(set IntRegs:$dst, - (xor IntRegs:$src1, (or IntRegs:$src2, IntRegs:$src3)))], + [(set (i32 IntRegs:$dst), + (xor (i32 IntRegs:$src1), (or (i32 IntRegs:$src2), + (i32 IntRegs:$src3))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2574,8 +4173,9 @@ def XORr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst), def ANDr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst &= xor($src2, $src3)", - [(set IntRegs:$dst, - (and IntRegs:$src1, (xor IntRegs:$src2, IntRegs:$src3)))], + [(set (i32 IntRegs:$dst), + (and (i32 IntRegs:$src1), (xor (i32 IntRegs:$src2), + (i32 IntRegs:$src3))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2583,8 +4183,9 @@ def ANDr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst), def ORr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst |= xor($src2, $src3)", - [(set IntRegs:$dst, - (and IntRegs:$src1, (xor IntRegs:$src2, IntRegs:$src3)))], + [(set 
(i32 IntRegs:$dst), + (and (i32 IntRegs:$src1), (xor (i32 IntRegs:$src2), + (i32 IntRegs:$src3))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2592,8 +4193,9 @@ def ORr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst), def XORr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst ^= xor($src2, $src3)", - [(set IntRegs:$dst, - (and IntRegs:$src1, (xor IntRegs:$src2, IntRegs:$src3)))], + [(set (i32 IntRegs:$dst), + (and (i32 IntRegs:$src1), (xor (i32 IntRegs:$src2), + (i32 IntRegs:$src3))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2601,8 +4203,9 @@ def XORr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst), def ORr_ANDri2_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, s10Imm:$src3), "$dst |= and($src2, #$src3)", - [(set IntRegs:$dst, - (or IntRegs:$src1, (and IntRegs:$src2, s10ImmPred:$src3)))], + [(set (i32 IntRegs:$dst), + (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2), + s10ImmPred:$src3)))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2610,8 +4213,9 @@ def ORr_ANDri2_V4 : MInst_acc<(outs IntRegs:$dst), def ORr_ORri_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, s10Imm:$src3), "$dst |= or($src2, #$src3)", - [(set IntRegs:$dst, - (or IntRegs:$src1, (and IntRegs:$src2, s10ImmPred:$src3)))], + [(set (i32 IntRegs:$dst), + (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2), + s10ImmPred:$src3)))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2663,8 +4267,9 @@ def ORr_ORri_V4 : MInst_acc<(outs IntRegs:$dst), def ADDi_MPYri_V4 : MInst<(outs IntRegs:$dst), (ins u6Imm:$src1, IntRegs:$src2, u6Imm:$src3), "$dst = add(#$src1, mpyi($src2, #$src3))", - [(set IntRegs:$dst, - (add (mul IntRegs:$src2, u6ImmPred:$src3), u6ImmPred:$src1))]>, + [(set (i32 IntRegs:$dst), + (add (mul (i32 IntRegs:$src2), u6ImmPred:$src3), + u6ImmPred:$src1))]>, Requires<[HasV4T]>; // Rd=add(#u6,mpyi(Rs,Rt)) @@ -2672,32 +4277,36 @@ def ADDi_MPYri_V4 : MInst<(outs IntRegs:$dst), def ADDi_MPYrr_V4 : MInst<(outs IntRegs:$dst), (ins u6Imm:$src1, IntRegs:$src2, IntRegs:$src3), "$dst = add(#$src1, mpyi($src2, $src3))", - [(set IntRegs:$dst, - (add (mul IntRegs:$src2, IntRegs:$src3), u6ImmPred:$src1))]>, + [(set (i32 IntRegs:$dst), + (add (mul (i32 IntRegs:$src2), (i32 IntRegs:$src3)), + u6ImmPred:$src1))]>, Requires<[HasV4T]>; // Rd=add(Ru,mpyi(#u6:2,Rs)) def ADDr_MPYir_V4 : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u6Imm:$src2, IntRegs:$src3), "$dst = add($src1, mpyi(#$src2, $src3))", - [(set IntRegs:$dst, - (add IntRegs:$src1, (mul IntRegs:$src3, u6_2ImmPred:$src2)))]>, + [(set (i32 IntRegs:$dst), + (add (i32 IntRegs:$src1), (mul (i32 IntRegs:$src3), + u6_2ImmPred:$src2)))]>, Requires<[HasV4T]>; // Rd=add(Ru,mpyi(Rs,#u6)) def ADDr_MPYri_V4 : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, u6Imm:$src3), "$dst = add($src1, mpyi($src2, #$src3))", - [(set IntRegs:$dst, - (add IntRegs:$src1, (mul IntRegs:$src2, u6ImmPred:$src3)))]>, + [(set (i32 IntRegs:$dst), + (add (i32 IntRegs:$src1), (mul (i32 IntRegs:$src2), + u6ImmPred:$src3)))]>, Requires<[HasV4T]>; // Rx=add(Ru,mpyi(Rx,Rs)) def ADDr_MPYrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), "$dst = add($src1, mpyi($src2, $src3))", - [(set IntRegs:$dst, - (add IntRegs:$src1, (mul IntRegs:$src2, IntRegs:$src3)))], + [(set (i32 IntRegs:$dst), + (add (i32 IntRegs:$src1), (mul (i32 IntRegs:$src2), + (i32 IntRegs:$src3))))], "$src2 = $dst">, Requires<[HasV4T]>; @@ -2745,8 +4354,9 @@ def ADDr_MPYrr_V4 : MInst_acc<(outs IntRegs:$dst), def ADDi_ASLri_V4 : 
MInst_acc<(outs IntRegs:$dst), (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), "$dst = add(#$src1, asl($src2, #$src3))", - [(set IntRegs:$dst, - (add (shl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))], + [(set (i32 IntRegs:$dst), + (add (shl (i32 IntRegs:$src2), u5ImmPred:$src3), + u8ImmPred:$src1))], "$src2 = $dst">, Requires<[HasV4T]>; @@ -2754,8 +4364,9 @@ def ADDi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst), def ADDi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst), (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), "$dst = add(#$src1, lsr($src2, #$src3))", - [(set IntRegs:$dst, - (add (srl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))], + [(set (i32 IntRegs:$dst), + (add (srl (i32 IntRegs:$src2), u5ImmPred:$src3), + u8ImmPred:$src1))], "$src2 = $dst">, Requires<[HasV4T]>; @@ -2763,8 +4374,9 @@ def ADDi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst), def SUBi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst), (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), "$dst = sub(#$src1, asl($src2, #$src3))", - [(set IntRegs:$dst, - (sub (shl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))], + [(set (i32 IntRegs:$dst), + (sub (shl (i32 IntRegs:$src2), u5ImmPred:$src3), + u8ImmPred:$src1))], "$src2 = $dst">, Requires<[HasV4T]>; @@ -2772,8 +4384,9 @@ def SUBi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst), def SUBi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst), (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), "$dst = sub(#$src1, lsr($src2, #$src3))", - [(set IntRegs:$dst, - (sub (srl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))], + [(set (i32 IntRegs:$dst), + (sub (srl (i32 IntRegs:$src2), u5ImmPred:$src3), + u8ImmPred:$src1))], "$src2 = $dst">, Requires<[HasV4T]>; @@ -2783,8 +4396,9 @@ def SUBi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst), def ANDi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst), (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), "$dst = and(#$src1, asl($src2, #$src3))", - [(set IntRegs:$dst, - (and (shl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))], + [(set (i32 IntRegs:$dst), + (and (shl (i32 IntRegs:$src2), u5ImmPred:$src3), + u8ImmPred:$src1))], "$src2 = $dst">, Requires<[HasV4T]>; @@ -2792,26 +4406,31 @@ def ANDi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst), def ANDi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst), (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), "$dst = and(#$src1, lsr($src2, #$src3))", - [(set IntRegs:$dst, - (and (srl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))], + [(set (i32 IntRegs:$dst), + (and (srl (i32 IntRegs:$src2), u5ImmPred:$src3), + u8ImmPred:$src1))], "$src2 = $dst">, Requires<[HasV4T]>; //Rx=or(#u8,asl(Rx,#U5)) +let AddedComplexity = 30 in def ORi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst), (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), "$dst = or(#$src1, asl($src2, #$src3))", - [(set IntRegs:$dst, - (or (shl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))], + [(set (i32 IntRegs:$dst), + (or (shl (i32 IntRegs:$src2), u5ImmPred:$src3), + u8ImmPred:$src1))], "$src2 = $dst">, Requires<[HasV4T]>; //Rx=or(#u8,lsr(Rx,#U5)) +let AddedComplexity = 30 in def ORi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst), (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), "$dst = or(#$src1, lsr($src2, #$src3))", - [(set IntRegs:$dst, - (or (srl IntRegs:$src2, u5ImmPred:$src3), u8ImmPred:$src1))], + [(set (i32 IntRegs:$dst), + (or (srl (i32 IntRegs:$src2), u5ImmPred:$src3), + u8ImmPred:$src1))], "$src2 = $dst">, Requires<[HasV4T]>; @@ -2820,7 +4439,8 @@ def ORi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst), //Rd=lsl(#s6,Rt) def LSLi_V4 : MInst<(outs IntRegs:$dst), (ins s6Imm:$src1, IntRegs:$src2), "$dst = lsl(#$src1, 
$src2)", - [(set IntRegs:$dst, (shl s6ImmPred:$src1, IntRegs:$src2))]>, + [(set (i32 IntRegs:$dst), (shl s6ImmPred:$src1, + (i32 IntRegs:$src2)))]>, Requires<[HasV4T]>; @@ -2829,8 +4449,9 @@ def LSLi_V4 : MInst<(outs IntRegs:$dst), (ins s6Imm:$src1, IntRegs:$src2), def ASLd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), "$dst ^= asl($src2, $src3)", - [(set DoubleRegs:$dst, - (xor DoubleRegs:$src1, (shl DoubleRegs:$src2, IntRegs:$src3)))], + [(set (i64 DoubleRegs:$dst), + (xor (i64 DoubleRegs:$src1), (shl (i64 DoubleRegs:$src2), + (i32 IntRegs:$src3))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2838,8 +4459,9 @@ def ASLd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst), def ASRd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), "$dst ^= asr($src2, $src3)", - [(set DoubleRegs:$dst, - (xor DoubleRegs:$src1, (sra DoubleRegs:$src2, IntRegs:$src3)))], + [(set (i64 DoubleRegs:$dst), + (xor (i64 DoubleRegs:$src1), (sra (i64 DoubleRegs:$src2), + (i32 IntRegs:$src3))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2847,8 +4469,9 @@ def ASRd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst), def LSLd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), "$dst ^= lsl($src2, $src3)", - [(set DoubleRegs:$dst, - (xor DoubleRegs:$src1, (shl DoubleRegs:$src2, IntRegs:$src3)))], + [(set (i64 DoubleRegs:$dst), (xor (i64 DoubleRegs:$src1), + (shl (i64 DoubleRegs:$src2), + (i32 IntRegs:$src3))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2856,8 +4479,9 @@ def LSLd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst), def LSRd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), "$dst ^= lsr($src2, $src3)", - [(set DoubleRegs:$dst, - (xor DoubleRegs:$src1, (srl DoubleRegs:$src2, IntRegs:$src3)))], + [(set (i64 DoubleRegs:$dst), + (xor (i64 DoubleRegs:$src1), (srl (i64 DoubleRegs:$src2), + (i32 IntRegs:$src3))))], "$src1 = $dst">, Requires<[HasV4T]>; @@ -2903,16 +4527,16 @@ let AddedComplexity = 30 in def MEMw_ADDSUBi_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_2Imm:$offset, m6Imm:$addend), "Error; should not emit", - [(store (add (load (add IntRegs:$base, u6_2ImmPred:$offset)), -m6ImmPred:$addend), - (add IntRegs:$base, u6_2ImmPred:$offset))]>, + [(store (add (load (add (i32 IntRegs:$base), u6_2ImmPred:$offset)), + m6ImmPred:$addend), + (add (i32 IntRegs:$base), u6_2ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; // memw(Rs+#u6:2) += #U5 let AddedComplexity = 30 in def MEMw_ADDi_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_2Imm:$offset, u5Imm:$addend), - "memw($base+#$offset) += $addend", + "memw($base+#$offset) += #$addend", []>, Requires<[HasV4T, UseMEMOP]>; @@ -2920,7 +4544,7 @@ def MEMw_ADDi_indexed_MEM_V4 : MEMInst_V4<(outs), let AddedComplexity = 30 in def MEMw_SUBi_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_2Imm:$offset, u5Imm:$subend), - "memw($base+#$offset) -= $subend", + "memw($base+#$offset) -= #$subend", []>, Requires<[HasV4T, UseMEMOP]>; @@ -2929,9 +4553,9 @@ let AddedComplexity = 30 in def MEMw_ADDr_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_2Imm:$offset, IntRegs:$addend), "memw($base+#$offset) += $addend", - [(store (add (load (add IntRegs:$base, u6_2ImmPred:$offset)), -IntRegs:$addend), - (add IntRegs:$base, u6_2ImmPred:$offset))]>, + [(store (add (load (add (i32 IntRegs:$base), u6_2ImmPred:$offset)), + (i32 IntRegs:$addend)), + (add (i32 
IntRegs:$base), u6_2ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; // memw(Rs+#u6:2) -= Rt @@ -2939,19 +4563,19 @@ let AddedComplexity = 30 in def MEMw_SUBr_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_2Imm:$offset, IntRegs:$subend), "memw($base+#$offset) -= $subend", - [(store (sub (load (add IntRegs:$base, u6_2ImmPred:$offset)), -IntRegs:$subend), - (add IntRegs:$base, u6_2ImmPred:$offset))]>, + [(store (sub (load (add (i32 IntRegs:$base), u6_2ImmPred:$offset)), + (i32 IntRegs:$subend)), + (add (i32 IntRegs:$base), u6_2ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; // memw(Rs+#u6:2) &= Rt let AddedComplexity = 30 in def MEMw_ANDr_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_2Imm:$offset, IntRegs:$andend), - "memw($base+#$offset) += $andend", - [(store (and (load (add IntRegs:$base, u6_2ImmPred:$offset)), -IntRegs:$andend), - (add IntRegs:$base, u6_2ImmPred:$offset))]>, + "memw($base+#$offset) &= $andend", + [(store (and (load (add (i32 IntRegs:$base), u6_2ImmPred:$offset)), + (i32 IntRegs:$andend)), + (add (i32 IntRegs:$base), u6_2ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; // memw(Rs+#u6:2) |= Rt @@ -2959,9 +4583,9 @@ let AddedComplexity = 30 in def MEMw_ORr_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_2Imm:$offset, IntRegs:$orend), "memw($base+#$offset) |= $orend", - [(store (or (load (add IntRegs:$base, u6_2ImmPred:$offset)), - IntRegs:$orend), - (add IntRegs:$base, u6_2ImmPred:$offset))]>, + [(store (or (load (add (i32 IntRegs:$base), u6_2ImmPred:$offset)), + (i32 IntRegs:$orend)), + (add (i32 IntRegs:$base), u6_2ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; // MEMw_ADDSUBi_V4: @@ -2996,7 +4620,7 @@ let AddedComplexity = 30 in def MEMw_ADDr_MEM_V4 : MEMInst_V4<(outs), (ins MEMri:$addr, IntRegs:$addend), "memw($addr) += $addend", - [(store (add (load ADDRriU6_2:$addr), IntRegs:$addend), + [(store (add (load ADDRriU6_2:$addr), (i32 IntRegs:$addend)), ADDRriU6_2:$addr)]>, Requires<[HasV4T, UseMEMOP]>; @@ -3005,7 +4629,7 @@ let AddedComplexity = 30 in def MEMw_SUBr_MEM_V4 : MEMInst_V4<(outs), (ins MEMri:$addr, IntRegs:$subend), "memw($addr) -= $subend", - [(store (sub (load ADDRriU6_2:$addr), IntRegs:$subend), + [(store (sub (load ADDRriU6_2:$addr), (i32 IntRegs:$subend)), ADDRriU6_2:$addr)]>, Requires<[HasV4T, UseMEMOP]>; @@ -3014,7 +4638,7 @@ let AddedComplexity = 30 in def MEMw_ANDr_MEM_V4 : MEMInst_V4<(outs), (ins MEMri:$addr, IntRegs:$andend), "memw($addr) &= $andend", - [(store (and (load ADDRriU6_2:$addr), IntRegs:$andend), + [(store (and (load ADDRriU6_2:$addr), (i32 IntRegs:$andend)), ADDRriU6_2:$addr)]>, Requires<[HasV4T, UseMEMOP]>; @@ -3023,8 +4647,8 @@ let AddedComplexity = 30 in def MEMw_ORr_MEM_V4 : MEMInst_V4<(outs), (ins MEMri:$addr, IntRegs:$orend), "memw($addr) |= $orend", - [(store (or (load ADDRriU6_2:$addr), IntRegs:$orend), -ADDRriU6_2:$addr)]>, + [(store (or (load ADDRriU6_2:$addr), (i32 IntRegs:$orend)), + ADDRriU6_2:$addr)]>, Requires<[HasV4T, UseMEMOP]>; //===----------------------------------------------------------------------===// @@ -3060,10 +4684,10 @@ let AddedComplexity = 30 in def MEMh_ADDSUBi_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_1Imm:$offset, m6Imm:$addend), "Error; should not emit", - [(truncstorei16 (add (sextloadi16 (add IntRegs:$base, + [(truncstorei16 (add (sextloadi16 (add (i32 IntRegs:$base), u6_1ImmPred:$offset)), m6ImmPred:$addend), - (add IntRegs:$base, u6_1ImmPred:$offset))]>, + (add (i32 IntRegs:$base), u6_1ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; 
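// Note on the memop patterns above: each one folds a load, an ALU op, and a
// store of the same address into a single memop instruction. A minimal C
// sketch of what, e.g., MEMw_ADDr_indexed_MEM_V4 selects for (hypothetical
// function, assuming a word-aligned, in-range offset):
//   void f(int *base, int addend) {
//     base[2] += addend;   // becomes: memw(r0+#8) += r1
//   }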
// memh(Rs+#u6:1) += #U5 @@ -3087,10 +4711,10 @@ let AddedComplexity = 30 in def MEMh_ADDr_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_1Imm:$offset, IntRegs:$addend), "memh($base+#$offset) += $addend", - [(truncstorei16 (add (sextloadi16 (add IntRegs:$base, + [(truncstorei16 (add (sextloadi16 (add (i32 IntRegs:$base), u6_1ImmPred:$offset)), - IntRegs:$addend), - (add IntRegs:$base, u6_1ImmPred:$offset))]>, + (i32 IntRegs:$addend)), + (add (i32 IntRegs:$base), u6_1ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; // memh(Rs+#u6:1) -= Rt @@ -3098,10 +4722,10 @@ let AddedComplexity = 30 in def MEMh_SUBr_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_1Imm:$offset, IntRegs:$subend), "memh($base+#$offset) -= $subend", - [(truncstorei16 (sub (sextloadi16 (add IntRegs:$base, + [(truncstorei16 (sub (sextloadi16 (add (i32 IntRegs:$base), u6_1ImmPred:$offset)), - IntRegs:$subend), - (add IntRegs:$base, u6_1ImmPred:$offset))]>, + (i32 IntRegs:$subend)), + (add (i32 IntRegs:$base), u6_1ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; // memh(Rs+#u6:1) &= Rt @@ -3109,10 +4733,10 @@ let AddedComplexity = 30 in def MEMh_ANDr_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_1Imm:$offset, IntRegs:$andend), "memh($base+#$offset) += $andend", - [(truncstorei16 (and (sextloadi16 (add IntRegs:$base, + [(truncstorei16 (and (sextloadi16 (add (i32 IntRegs:$base), u6_1ImmPred:$offset)), - IntRegs:$andend), - (add IntRegs:$base, u6_1ImmPred:$offset))]>, + (i32 IntRegs:$andend)), + (add (i32 IntRegs:$base), u6_1ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; // memh(Rs+#u6:1) |= Rt @@ -3120,10 +4744,10 @@ let AddedComplexity = 30 in def MEMh_ORr_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_1Imm:$offset, IntRegs:$orend), "memh($base+#$offset) |= $orend", - [(truncstorei16 (or (sextloadi16 (add IntRegs:$base, + [(truncstorei16 (or (sextloadi16 (add (i32 IntRegs:$base), u6_1ImmPred:$offset)), - IntRegs:$orend), - (add IntRegs:$base, u6_1ImmPred:$offset))]>, + (i32 IntRegs:$orend)), + (add (i32 IntRegs:$base), u6_1ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; // MEMh_ADDSUBi_V4: @@ -3159,7 +4783,7 @@ def MEMh_ADDr_MEM_V4 : MEMInst_V4<(outs), (ins MEMri:$addr, IntRegs:$addend), "memh($addr) += $addend", [(truncstorei16 (add (sextloadi16 ADDRriU6_1:$addr), - IntRegs:$addend), ADDRriU6_1:$addr)]>, + (i32 IntRegs:$addend)), ADDRriU6_1:$addr)]>, Requires<[HasV4T, UseMEMOP]>; // memh(Rs+#u6:1) -= Rt @@ -3168,7 +4792,7 @@ def MEMh_SUBr_MEM_V4 : MEMInst_V4<(outs), (ins MEMri:$addr, IntRegs:$subend), "memh($addr) -= $subend", [(truncstorei16 (sub (sextloadi16 ADDRriU6_1:$addr), - IntRegs:$subend), ADDRriU6_1:$addr)]>, + (i32 IntRegs:$subend)), ADDRriU6_1:$addr)]>, Requires<[HasV4T, UseMEMOP]>; // memh(Rs+#u6:1) &= Rt @@ -3177,7 +4801,7 @@ def MEMh_ANDr_MEM_V4 : MEMInst_V4<(outs), (ins MEMri:$addr, IntRegs:$andend), "memh($addr) &= $andend", [(truncstorei16 (and (sextloadi16 ADDRriU6_1:$addr), - IntRegs:$andend), ADDRriU6_1:$addr)]>, + (i32 IntRegs:$andend)), ADDRriU6_1:$addr)]>, Requires<[HasV4T, UseMEMOP]>; // memh(Rs+#u6:1) |= Rt @@ -3186,7 +4810,7 @@ def MEMh_ORr_MEM_V4 : MEMInst_V4<(outs), (ins MEMri:$addr, IntRegs:$orend), "memh($addr) |= $orend", [(truncstorei16 (or (sextloadi16 ADDRriU6_1:$addr), - IntRegs:$orend), ADDRriU6_1:$addr)]>, + (i32 IntRegs:$orend)), ADDRriU6_1:$addr)]>, Requires<[HasV4T, UseMEMOP]>; @@ -3223,10 +4847,10 @@ let AddedComplexity = 30 in def MEMb_ADDSUBi_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_0Imm:$offset, 
m6Imm:$addend), "Error; should not emit", - [(truncstorei8 (add (sextloadi8 (add IntRegs:$base, + [(truncstorei8 (add (sextloadi8 (add (i32 IntRegs:$base), u6_0ImmPred:$offset)), m6ImmPred:$addend), - (add IntRegs:$base, u6_0ImmPred:$offset))]>, + (add (i32 IntRegs:$base), u6_0ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; // memb(Rs+#u6:0) += #U5 @@ -3250,10 +4874,10 @@ let AddedComplexity = 30 in def MEMb_ADDr_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_0Imm:$offset, IntRegs:$addend), "memb($base+#$offset) += $addend", - [(truncstorei8 (add (sextloadi8 (add IntRegs:$base, + [(truncstorei8 (add (sextloadi8 (add (i32 IntRegs:$base), u6_0ImmPred:$offset)), - IntRegs:$addend), - (add IntRegs:$base, u6_0ImmPred:$offset))]>, + (i32 IntRegs:$addend)), + (add (i32 IntRegs:$base), u6_0ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; // memb(Rs+#u6:0) -= Rt @@ -3261,10 +4885,10 @@ let AddedComplexity = 30 in def MEMb_SUBr_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_0Imm:$offset, IntRegs:$subend), "memb($base+#$offset) -= $subend", - [(truncstorei8 (sub (sextloadi8 (add IntRegs:$base, + [(truncstorei8 (sub (sextloadi8 (add (i32 IntRegs:$base), u6_0ImmPred:$offset)), - IntRegs:$subend), - (add IntRegs:$base, u6_0ImmPred:$offset))]>, + (i32 IntRegs:$subend)), + (add (i32 IntRegs:$base), u6_0ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; // memb(Rs+#u6:0) &= Rt @@ -3272,10 +4896,10 @@ let AddedComplexity = 30 in def MEMb_ANDr_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_0Imm:$offset, IntRegs:$andend), "memb($base+#$offset) += $andend", - [(truncstorei8 (and (sextloadi8 (add IntRegs:$base, + [(truncstorei8 (and (sextloadi8 (add (i32 IntRegs:$base), u6_0ImmPred:$offset)), - IntRegs:$andend), - (add IntRegs:$base, u6_0ImmPred:$offset))]>, + (i32 IntRegs:$andend)), + (add (i32 IntRegs:$base), u6_0ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; // memb(Rs+#u6:0) |= Rt @@ -3283,10 +4907,10 @@ let AddedComplexity = 30 in def MEMb_ORr_indexed_MEM_V4 : MEMInst_V4<(outs), (ins IntRegs:$base, u6_0Imm:$offset, IntRegs:$orend), "memb($base+#$offset) |= $orend", - [(truncstorei8 (or (sextloadi8 (add IntRegs:$base, + [(truncstorei8 (or (sextloadi8 (add (i32 IntRegs:$base), u6_0ImmPred:$offset)), - IntRegs:$orend), - (add IntRegs:$base, u6_0ImmPred:$offset))]>, + (i32 IntRegs:$orend)), + (add (i32 IntRegs:$base), u6_0ImmPred:$offset))]>, Requires<[HasV4T, UseMEMOP]>; // MEMb_ADDSUBi_V4: @@ -3322,7 +4946,7 @@ def MEMb_ADDr_MEM_V4 : MEMInst_V4<(outs), (ins MEMri:$addr, IntRegs:$addend), "memb($addr) += $addend", [(truncstorei8 (add (sextloadi8 ADDRriU6_0:$addr), - IntRegs:$addend), ADDRriU6_0:$addr)]>, + (i32 IntRegs:$addend)), ADDRriU6_0:$addr)]>, Requires<[HasV4T, UseMEMOP]>; // memb(Rs+#u6:0) -= Rt @@ -3331,7 +4955,7 @@ def MEMb_SUBr_MEM_V4 : MEMInst_V4<(outs), (ins MEMri:$addr, IntRegs:$subend), "memb($addr) -= $subend", [(truncstorei8 (sub (sextloadi8 ADDRriU6_0:$addr), - IntRegs:$subend), ADDRriU6_0:$addr)]>, + (i32 IntRegs:$subend)), ADDRriU6_0:$addr)]>, Requires<[HasV4T, UseMEMOP]>; // memb(Rs+#u6:0) &= Rt @@ -3340,7 +4964,7 @@ def MEMb_ANDr_MEM_V4 : MEMInst_V4<(outs), (ins MEMri:$addr, IntRegs:$andend), "memb($addr) &= $andend", [(truncstorei8 (and (sextloadi8 ADDRriU6_0:$addr), - IntRegs:$andend), ADDRriU6_0:$addr)]>, + (i32 IntRegs:$andend)), ADDRriU6_0:$addr)]>, Requires<[HasV4T, UseMEMOP]>; // memb(Rs+#u6:0) |= Rt @@ -3349,7 +4973,7 @@ def MEMb_ORr_MEM_V4 : MEMInst_V4<(outs), (ins MEMri:$addr, IntRegs:$orend), "memb($addr) |= $orend", [(truncstorei8 
(or (sextloadi8 ADDRriU6_0:$addr), - IntRegs:$orend), ADDRriU6_0:$addr)]>, + (i32 IntRegs:$orend)), ADDRriU6_0:$addr)]>, Requires<[HasV4T, UseMEMOP]>; @@ -3364,13 +4988,16 @@ def MEMb_ORr_MEM_V4 : MEMInst_V4<(outs), // The implemented patterns are: EQ/GT/GTU. // Missing patterns are: GE/GEU/LT/LTU/LE/LEU. +// The following instruction is not extended, as extending it results in +// incorrect code for negative numbers. // Pd=cmpb.eq(Rs,#u8) + let isCompare = 1 in def CMPbEQri_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u8Imm:$src2), "$dst = cmpb.eq($src1, #$src2)", - [(set PredRegs:$dst, (seteq (and IntRegs:$src1, 255), - u8ImmPred:$src2))]>, + [(set (i1 PredRegs:$dst), + (seteq (and (i32 IntRegs:$src1), 255), u8ImmPred:$src2))]>, Requires<[HasV4T]>; // Pd=cmpb.eq(Rs,Rt) @@ -3378,10 +5005,9 @@ let isCompare = 1 in def CMPbEQrr_ubub_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = cmpb.eq($src1, $src2)", - [(set PredRegs:$dst, (seteq (and (xor IntRegs:$src1, - IntRegs:$src2), - 255), - 0))]>, + [(set (i1 PredRegs:$dst), + (seteq (and (xor (i32 IntRegs:$src1), + (i32 IntRegs:$src2)), 255), 0))]>, Requires<[HasV4T]>; // Pd=cmpb.eq(Rs,Rt) @@ -3389,17 +5015,9 @@ let isCompare = 1 in def CMPbEQrr_sbsb_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = cmpb.eq($src1, $src2)", - [(set PredRegs:$dst, (seteq (shl IntRegs:$src1, (i32 24)), - (shl IntRegs:$src2, (i32 24))))]>, - Requires<[HasV4T]>; - -// Pd=cmpb.gt(Rs,#s8) -let isCompare = 1 in -def CMPbGTri_V4 : MInst<(outs PredRegs:$dst), - (ins IntRegs:$src1, s32Imm:$src2), - "$dst = cmpb.gt($src1, #$src2)", - [(set PredRegs:$dst, (setgt (shl IntRegs:$src1, (i32 24)), - s32_24ImmPred:$src2))]>, + [(set (i1 PredRegs:$dst), + (seteq (shl (i32 IntRegs:$src1), (i32 24)), + (shl (i32 IntRegs:$src2), (i32 24))))]>, Requires<[HasV4T]>; // Pd=cmpb.gt(Rs,Rt) @@ -3407,8 +5025,9 @@ let isCompare = 1 in def CMPbGTrr_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = cmpb.gt($src1, $src2)", - [(set PredRegs:$dst, (setgt (shl IntRegs:$src1, (i32 24)), - (shl IntRegs:$src2, (i32 24))))]>, + [(set (i1 PredRegs:$dst), + (setgt (shl (i32 IntRegs:$src1), (i32 24)), + (shl (i32 IntRegs:$src2), (i32 24))))]>, Requires<[HasV4T]>; // Pd=cmpb.gtu(Rs,#u7) @@ -3416,8 +5035,8 @@ let isCompare = 1 in def CMPbGTUri_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u7Imm:$src2), "$dst = cmpb.gtu($src1, #$src2)", - [(set PredRegs:$dst, (setugt (and IntRegs:$src1, 255), - u7ImmPred:$src2))]>, + [(set (i1 PredRegs:$dst), (setugt (and (i32 IntRegs:$src1), 255), + u7ImmPred:$src2))]>, Requires<[HasV4T]>; // Pd=cmpb.gtu(Rs,Rt) @@ -3425,18 +5044,21 @@ let isCompare = 1 in def CMPbGTUrr_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = cmpb.gtu($src1, $src2)", - [(set PredRegs:$dst, (setugt (and IntRegs:$src1, 255), - (and IntRegs:$src2, 255)))]>, + [(set (i1 PredRegs:$dst), (setugt (and (i32 IntRegs:$src1), 255), + (and (i32 IntRegs:$src2), 255)))]>, Requires<[HasV4T]>; +// The following instruction is not extended, as extending it results in +// incorrect code for negative numbers. + // Signed half compare(.eq) ri. 
// Pd=cmph.eq(Rs,#s8) let isCompare = 1 in def CMPhEQri_V4 : MInst<(outs PredRegs:$dst), - (ins IntRegs:$src1, u16Imm:$src2), + (ins IntRegs:$src1, s8Imm:$src2), "$dst = cmph.eq($src1, #$src2)", - [(set PredRegs:$dst, (seteq (and IntRegs:$src1, 65535), - u16_s8ImmPred:$src2))]>, + [(set (i1 PredRegs:$dst), (seteq (and (i32 IntRegs:$src1), 65535), + s8ImmPred:$src2))]>, Requires<[HasV4T]>; // Signed half compare(.eq) rr. @@ -3449,10 +5071,9 @@ let isCompare = 1 in def CMPhEQrr_xor_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = cmph.eq($src1, $src2)", - [(set PredRegs:$dst, (seteq (and (xor IntRegs:$src1, - IntRegs:$src2), - 65535), - 0))]>, + [(set (i1 PredRegs:$dst), (seteq (and (xor (i32 IntRegs:$src1), + (i32 IntRegs:$src2)), + 65535), 0))]>, Requires<[HasV4T]>; // Signed half compare(.eq) rr. @@ -3465,19 +5086,25 @@ let isCompare = 1 in def CMPhEQrr_shl_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = cmph.eq($src1, $src2)", - [(set PredRegs:$dst, (seteq (shl IntRegs:$src1, (i32 16)), - (shl IntRegs:$src2, (i32 16))))]>, + [(set (i1 PredRegs:$dst), + (seteq (shl (i32 IntRegs:$src1), (i32 16)), + (shl (i32 IntRegs:$src2), (i32 16))))]>, Requires<[HasV4T]>; +/* Incorrect Pattern -- immediate should be right shifted before being +used in the cmph.gt instruction. // Signed half compare(.gt) ri. // Pd=cmph.gt(Rs,#s8) + let isCompare = 1 in def CMPhGTri_V4 : MInst<(outs PredRegs:$dst), - (ins IntRegs:$src1, s32Imm:$src2), + (ins IntRegs:$src1, s8Imm:$src2), "$dst = cmph.gt($src1, #$src2)", - [(set PredRegs:$dst, (setgt (shl IntRegs:$src1, (i32 16)), - s32_16s8ImmPred:$src2))]>, + [(set (i1 PredRegs:$dst), + (setgt (shl (i32 IntRegs:$src1), (i32 16)), + s8ImmPred:$src2))]>, Requires<[HasV4T]>; +*/ // Signed half compare(.gt) rr. // Pd=cmph.gt(Rs,Rt) @@ -3485,8 +5112,9 @@ let isCompare = 1 in def CMPhGTrr_shl_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = cmph.gt($src1, $src2)", - [(set PredRegs:$dst, (setgt (shl IntRegs:$src1, (i32 16)), - (shl IntRegs:$src2, (i32 16))))]>, + [(set (i1 PredRegs:$dst), + (setgt (shl (i32 IntRegs:$src1), (i32 16)), + (shl (i32 IntRegs:$src2), (i32 16))))]>, Requires<[HasV4T]>; // Unsigned half compare rr (.gtu). @@ -3495,8 +5123,9 @@ let isCompare = 1 in def CMPhGTUrr_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = cmph.gtu($src1, $src2)", - [(set PredRegs:$dst, (setugt (and IntRegs:$src1, 65535), - (and IntRegs:$src2, 65535)))]>, + [(set (i1 PredRegs:$dst), + (setugt (and (i32 IntRegs:$src1), 65535), + (and (i32 IntRegs:$src2), 65535)))]>, Requires<[HasV4T]>; // Unsigned half compare ri (.gtu). @@ -3505,8 +5134,8 @@ let isCompare = 1 in def CMPhGTUri_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u7Imm:$src2), "$dst = cmph.gtu($src1, #$src2)", - [(set PredRegs:$dst, (setugt (and IntRegs:$src1, 65535), - u7ImmPred:$src2))]>, + [(set (i1 PredRegs:$dst), (setugt (and (i32 IntRegs:$src1), 65535), + u7ImmPred:$src2))]>, Requires<[HasV4T]>; //===----------------------------------------------------------------------===// @@ -3523,10 +5152,42 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1, isPredicable = 1, Requires<[HasV4T]>; } +// Restore registers and dealloc return function call. 
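+// (These pseudos, and SAVE_REGISTERS_CALL_V4 below, model calls into common
+// register save/restore runtime routines that frame lowering can emit, hence
+// the unusual mix of isCall/isReturn/isBarrier flags; a description of the
+// intent, not of any particular encoding.)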
+let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1, + Defs = [R29, R30, R31, PC] in { + def RESTORE_DEALLOC_RET_JMP_V4 : JInst<(outs), + (ins calltarget:$dst), + "jump $dst // Restore_and_dealloc_return", + []>, + Requires<[HasV4T]>; +} + +// Restore registers and dealloc frame before a tail call. +let isCall = 1, isBarrier = 1, + Defs = [R29, R30, R31, PC] in { + def RESTORE_DEALLOC_BEFORE_TAILCALL_V4 : JInst<(outs), + (ins calltarget:$dst), + "call $dst // Restore_and_dealloc_before_tailcall", + []>, + Requires<[HasV4T]>; +} + +// Save registers function call. +let isCall = 1, isBarrier = 1, + Uses = [R29, R31] in { + def SAVE_REGISTERS_CALL_V4 : JInst<(outs), + (ins calltarget:$dst), + "call $dst // Save_callee_saved_registers", + []>, + Requires<[HasV4T]>; +} + // if (Ps) dealloc_return let isReturn = 1, isTerminator = 1, - Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1 in { - def DEALLOC_RET_cPt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, i32imm:$amt1), + Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1, + isPredicated = 1 in { + def DEALLOC_RET_cPt_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, i32imm:$amt1), "if ($src1) dealloc_return", []>, Requires<[HasV4T]>; @@ -3534,7 +5195,8 @@ let isReturn = 1, isTerminator = 1, // if (!Ps) dealloc_return let isReturn = 1, isTerminator = 1, - Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1 in { + Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1, + isPredicated = 1 in { def DEALLOC_RET_cNotPt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, i32imm:$amt1), "if (!$src1) dealloc_return", @@ -3544,7 +5206,8 @@ let isReturn = 1, isTerminator = 1, // if (Ps.new) dealloc_return:nt let isReturn = 1, isTerminator = 1, - Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1 in { + Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1, + isPredicated = 1 in { def DEALLOC_RET_cdnPnt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, i32imm:$amt1), "if ($src1.new) dealloc_return:nt", @@ -3554,7 +5217,8 @@ let isReturn = 1, isTerminator = 1, // if (!Ps.new) dealloc_return:nt let isReturn = 1, isTerminator = 1, - Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1 in { + Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1, + isPredicated = 1 in { def DEALLOC_RET_cNotdnPnt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, i32imm:$amt1), "if (!$src1.new) dealloc_return:nt", @@ -3564,7 +5228,8 @@ let isReturn = 1, isTerminator = 1, // if (Ps.new) dealloc_return:t let isReturn = 1, isTerminator = 1, - Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1 in { + Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1, + isPredicated = 1 in { def DEALLOC_RET_cdnPt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, i32imm:$amt1), "if ($src1.new) dealloc_return:t", @@ -3574,10 +5239,539 @@ let isReturn = 1, isTerminator = 1, // if (!Ps.new) dealloc_return:nt let isReturn = 1, isTerminator = 1, - Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1 in { + Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1, + isPredicated = 1 in { def DEALLOC_RET_cNotdnPt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, i32imm:$amt1), "if (!$src1.new) dealloc_return:t", []>, Requires<[HasV4T]>; } + + +// Load/Store with absolute addressing mode +// memw(#u6)=Rt + +multiclass ST_abs<string OpcStr> { + let isPredicable = 1 in + def _abs_V4 : STInst2<(outs), + (ins 
globaladdress:$absaddr, IntRegs:$src), + !strconcat(OpcStr, "(##$absaddr) = $src"), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2), + !strconcat("if ($src1)", + !strconcat(OpcStr, "(##$absaddr) = $src2")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2), + !strconcat("if (!$src1)", + !strconcat(OpcStr, "(##$absaddr) = $src2")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cdnPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2), + !strconcat("if ($src1.new)", + !strconcat(OpcStr, "(##$absaddr) = $src2")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cdnNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2), + !strconcat("if (!$src1.new)", + !strconcat(OpcStr, "(##$absaddr) = $src2")), + []>, + Requires<[HasV4T]>; + + def _abs_nv_V4 : STInst2<(outs), + (ins globaladdress:$absaddr, IntRegs:$src), + !strconcat(OpcStr, "(##$absaddr) = $src.new"), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cPt_nv_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2), + !strconcat("if ($src1)", + !strconcat(OpcStr, "(##$absaddr) = $src2.new")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cNotPt_nv_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2), + !strconcat("if (!$src1)", + !strconcat(OpcStr, "(##$absaddr) = $src2.new")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cdnPt_nv_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2), + !strconcat("if ($src1.new)", + !strconcat(OpcStr, "(##$absaddr) = $src2.new")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cdnNotPt_nv_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2), + !strconcat("if (!$src1.new)", + !strconcat(OpcStr, "(##$absaddr) = $src2.new")), + []>, + Requires<[HasV4T]>; +} + +let AddedComplexity = 30, isPredicable = 1 in +def STrid_abs_V4 : STInst<(outs), + (ins globaladdress:$absaddr, DoubleRegs:$src), + "memd(##$absaddr) = $src", + [(store (i64 DoubleRegs:$src), + (HexagonCONST32 tglobaladdr:$absaddr))]>, + Requires<[HasV4T]>; + +let AddedComplexity = 30, isPredicated = 1 in +def STrid_abs_cPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$absaddr, DoubleRegs:$src2), + "if ($src1) memd(##$absaddr) = $src2", + []>, + Requires<[HasV4T]>; + +let AddedComplexity = 30, isPredicated = 1 in +def STrid_abs_cNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$absaddr, DoubleRegs:$src2), + "if (!$src1) memd(##$absaddr) = $src2", + []>, + Requires<[HasV4T]>; + +let AddedComplexity = 30, isPredicated = 1 in +def STrid_abs_cdnPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$absaddr, DoubleRegs:$src2), + "if ($src1.new) memd(##$absaddr) = $src2", + []>, + Requires<[HasV4T]>; + +let AddedComplexity = 30, isPredicated = 1 in +def STrid_abs_cdnNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdress:$absaddr, DoubleRegs:$src2), + "if (!$src1.new) memd(##$absaddr) = $src2", + []>, + Requires<[HasV4T]>; + +defm STrib : ST_abs<"memb">; +defm STrih : ST_abs<"memh">; +defm STriw : ST_abs<"memw">; + +let Predicates = [HasV4T], AddedComplexity = 30 in +def : Pat<(truncstorei8 (i32 
IntRegs:$src1), + (HexagonCONST32 tglobaladdr:$absaddr)), + (STrib_abs_V4 tglobaladdr: $absaddr, IntRegs: $src1)>; + +let Predicates = [HasV4T], AddedComplexity = 30 in +def : Pat<(truncstorei16 (i32 IntRegs:$src1), + (HexagonCONST32 tglobaladdr:$absaddr)), + (STrih_abs_V4 tglobaladdr: $absaddr, IntRegs: $src1)>; + +let Predicates = [HasV4T], AddedComplexity = 30 in +def : Pat<(store (i32 IntRegs:$src1), (HexagonCONST32 tglobaladdr:$absaddr)), + (STriw_abs_V4 tglobaladdr: $absaddr, IntRegs: $src1)>; + + +multiclass LD_abs<string OpcStr> { + let isPredicable = 1 in + def _abs_V4 : LDInst2<(outs IntRegs:$dst), + (ins globaladdress:$absaddr), + !strconcat("$dst = ", !strconcat(OpcStr, "(##$absaddr)")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$absaddr), + !strconcat("if ($src1) $dst = ", + !strconcat(OpcStr, "(##$absaddr)")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$absaddr), + !strconcat("if (!$src1) $dst = ", + !strconcat(OpcStr, "(##$absaddr)")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$absaddr), + !strconcat("if ($src1.new) $dst = ", + !strconcat(OpcStr, "(##$absaddr)")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$absaddr), + !strconcat("if (!$src1.new) $dst = ", + !strconcat(OpcStr, "(##$absaddr)")), + []>, + Requires<[HasV4T]>; +} + +let AddedComplexity = 30 in +def LDrid_abs_V4 : LDInst<(outs DoubleRegs:$dst), + (ins globaladdress:$absaddr), + "$dst = memd(##$absaddr)", + [(set (i64 DoubleRegs:$dst), + (load (HexagonCONST32 tglobaladdr:$absaddr)))]>, + Requires<[HasV4T]>; + +let AddedComplexity = 30, isPredicated = 1 in +def LDrid_abs_cPt_V4 : LDInst2<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, globaladdress:$absaddr), + "if ($src1) $dst = memd(##$absaddr)", + []>, + Requires<[HasV4T]>; + +let AddedComplexity = 30, isPredicated = 1 in +def LDrid_abs_cNotPt_V4 : LDInst2<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, globaladdress:$absaddr), + "if (!$src1) $dst = memd(##$absaddr)", + []>, + Requires<[HasV4T]>; + +let AddedComplexity = 30, isPredicated = 1 in +def LDrid_abs_cdnPt_V4 : LDInst2<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, globaladdress:$absaddr), + "if ($src1.new) $dst = memd(##$absaddr)", + []>, + Requires<[HasV4T]>; + +let AddedComplexity = 30, isPredicated = 1 in +def LDrid_abs_cdnNotPt_V4 : LDInst2<(outs DoubleRegs:$dst), + (ins PredRegs:$src1, globaladdress:$absaddr), + "if (!$src1.new) $dst = memd(##$absaddr)", + []>, + Requires<[HasV4T]>; + +defm LDrib : LD_abs<"memb">; +defm LDriub : LD_abs<"memub">; +defm LDrih : LD_abs<"memh">; +defm LDriuh : LD_abs<"memuh">; +defm LDriw : LD_abs<"memw">; + + +let Predicates = [HasV4T], AddedComplexity = 30 in +def : Pat<(i32 (load (HexagonCONST32 tglobaladdr:$absaddr))), + (LDriw_abs_V4 tglobaladdr: $absaddr)>; + +let Predicates = [HasV4T], AddedComplexity=30 in +def : Pat<(i32 (sextloadi8 (HexagonCONST32 tglobaladdr:$absaddr))), + (LDrib_abs_V4 tglobaladdr:$absaddr)>; + +let Predicates = [HasV4T], AddedComplexity=30 in +def : Pat<(i32 (zextloadi8 (HexagonCONST32 tglobaladdr:$absaddr))), + (LDriub_abs_V4 tglobaladdr:$absaddr)>; + +let Predicates = [HasV4T], AddedComplexity=30 in +def : Pat<(i32 (sextloadi16 
(HexagonCONST32 tglobaladdr:$absaddr))), + (LDrih_abs_V4 tglobaladdr:$absaddr)>; + +let Predicates = [HasV4T], AddedComplexity=30 in +def : Pat<(i32 (zextloadi16 (HexagonCONST32 tglobaladdr:$absaddr))), + (LDriuh_abs_V4 tglobaladdr:$absaddr)>; + +// Transfer global address into a register +let AddedComplexity=50, isMoveImm = 1, isReMaterializable = 1 in +def TFRI_V4 : ALU32_ri<(outs IntRegs:$dst), (ins globaladdress:$src1), + "$dst = ##$src1", + [(set IntRegs:$dst, (HexagonCONST32 tglobaladdr:$src1))]>, + Requires<[HasV4T]>; + +let AddedComplexity=50, neverHasSideEffects = 1, isPredicated = 1 in +def TFRI_cPt_V4 : ALU32_ri<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$src2), + "if($src1) $dst = ##$src2", + []>, + Requires<[HasV4T]>; + +let AddedComplexity=50, neverHasSideEffects = 1, isPredicated = 1 in +def TFRI_cNotPt_V4 : ALU32_ri<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$src2), + "if(!$src1) $dst = ##$src2", + []>, + Requires<[HasV4T]>; + +let AddedComplexity=50, neverHasSideEffects = 1, isPredicated = 1 in +def TFRI_cdnPt_V4 : ALU32_ri<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$src2), + "if($src1.new) $dst = ##$src2", + []>, + Requires<[HasV4T]>; + +let AddedComplexity=50, neverHasSideEffects = 1, isPredicated = 1 in +def TFRI_cdnNotPt_V4 : ALU32_ri<(outs IntRegs:$dst), + (ins PredRegs:$src1, globaladdress:$src2), + "if(!$src1.new) $dst = ##$src2", + []>, + Requires<[HasV4T]>; + +let AddedComplexity = 50, Predicates = [HasV4T] in +def : Pat<(HexagonCONST32_GP tglobaladdr:$src1), + (TFRI_V4 tglobaladdr:$src1)>; + + +// Load - Indirect with long offset: These instructions take global address +// as an operand +let AddedComplexity = 10 in +def LDrid_ind_lo_V4 : LDInst<(outs DoubleRegs:$dst), + (ins IntRegs:$src1, u2Imm:$src2, globaladdress:$offset), + "$dst=memd($src1<<#$src2+##$offset)", + [(set (i64 DoubleRegs:$dst), + (load (add (shl IntRegs:$src1, u2ImmPred:$src2), + (HexagonCONST32 tglobaladdr:$offset))))]>, + Requires<[HasV4T]>; + +let AddedComplexity = 10 in +multiclass LD_indirect_lo<string OpcStr, PatFrag OpNode> { + def _lo_V4 : LDInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, u2Imm:$src2, globaladdress:$offset), + !strconcat("$dst = ", + !strconcat(OpcStr, "($src1<<#$src2+##$offset)")), + [(set IntRegs:$dst, + (i32 (OpNode (add (shl IntRegs:$src1, u2ImmPred:$src2), + (HexagonCONST32 tglobaladdr:$offset)))))]>, + Requires<[HasV4T]>; +} + +defm LDrib_ind : LD_indirect_lo<"memb", sextloadi8>; +defm LDriub_ind : LD_indirect_lo<"memub", zextloadi8>; +defm LDrih_ind : LD_indirect_lo<"memh", sextloadi16>; +defm LDriuh_ind : LD_indirect_lo<"memuh", zextloadi16>; +defm LDriw_ind : LD_indirect_lo<"memw", load>; + +// Store - Indirect with long offset: These instructions take global address +// as an operand +let AddedComplexity = 10 in +def STrid_ind_lo_V4 : STInst<(outs), + (ins IntRegs:$src1, u2Imm:$src2, globaladdress:$src3, + DoubleRegs:$src4), + "memd($src1<<#$src2+#$src3) = $src4", + [(store (i64 DoubleRegs:$src4), + (add (shl IntRegs:$src1, u2ImmPred:$src2), + (HexagonCONST32 tglobaladdr:$src3)))]>, + Requires<[HasV4T]>; + +let AddedComplexity = 10 in +multiclass ST_indirect_lo<string OpcStr, PatFrag OpNode> { + def _lo_V4 : STInst<(outs), + (ins IntRegs:$src1, u2Imm:$src2, globaladdress:$src3, + IntRegs:$src4), + !strconcat(OpcStr, "($src1<<#$src2+##$src3) = $src4"), + [(OpNode (i32 IntRegs:$src4), + (add (shl IntRegs:$src1, u2ImmPred:$src2), + (HexagonCONST32 tglobaladdr:$src3)))]>, + Requires<[HasV4T]>; +} + +defm STrib_ind : 
ST_indirect_lo<"memb", truncstorei8>; +defm STrih_ind : ST_indirect_lo<"memh", truncstorei16>; +defm STriw_ind : ST_indirect_lo<"memw", store>; + +// Store - absolute addressing mode: These instruction take constant +// value as the extended operand +multiclass ST_absimm<string OpcStr> { + let isPredicable = 1 in + def _abs_V4 : STInst2<(outs), + (ins u6Imm:$src1, IntRegs:$src2), + !strconcat(OpcStr, "(#$src1) = $src2"), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3), + !strconcat("if ($src1)", !strconcat(OpcStr, "(#$src2) = $src3")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3), + !strconcat("if (!$src1)", !strconcat(OpcStr, "(#$src2) = $src3")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cdnPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3), + !strconcat("if ($src1.new)", + !strconcat(OpcStr, "(#$src2) = $src3")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cdnNotPt_V4 : STInst2<(outs), + (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3), + !strconcat("if (!$src1.new)", + !strconcat(OpcStr, "(#$src2) = $src3")), + []>, + Requires<[HasV4T]>; + + def _abs_nv_V4 : STInst2<(outs), + (ins u6Imm:$src1, IntRegs:$src2), + !strconcat(OpcStr, "(#$src1) = $src2.new"), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cPt_nv_V4 : STInst2<(outs), + (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3), + !strconcat("if ($src1)", + !strconcat(OpcStr, "(#$src2) = $src3.new")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cNotPt_nv_V4 : STInst2<(outs), + (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3), + !strconcat("if (!$src1)", + !strconcat(OpcStr, "(#$src2) = $src3.new")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cdnPt_nv_V4 : STInst2<(outs), + (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3), + !strconcat("if ($src1.new)", + !strconcat(OpcStr, "(#$src2) = $src3.new")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cdnNotPt_nv_V4 : STInst2<(outs), + (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3), + !strconcat("if (!$src1.new)", + !strconcat(OpcStr, "(#$src2) = $src3.new")), + []>, + Requires<[HasV4T]>; +} + +defm STrib_imm : ST_absimm<"memb">; +defm STrih_imm : ST_absimm<"memh">; +defm STriw_imm : ST_absimm<"memw">; + +let Predicates = [HasV4T], AddedComplexity = 30 in +def : Pat<(truncstorei8 (i32 IntRegs:$src1), u6ImmPred:$src2), + (STrib_imm_abs_V4 u6ImmPred:$src2, IntRegs: $src1)>; + +let Predicates = [HasV4T], AddedComplexity = 30 in +def : Pat<(truncstorei16 (i32 IntRegs:$src1), u6ImmPred:$src2), + (STrih_imm_abs_V4 u6ImmPred:$src2, IntRegs: $src1)>; + +let Predicates = [HasV4T], AddedComplexity = 30 in +def : Pat<(store (i32 IntRegs:$src1), u6ImmPred:$src2), + (STriw_imm_abs_V4 u6ImmPred:$src2, IntRegs: $src1)>; + + +// Load - absolute addressing mode: These instruction take constant +// value as the extended operand + +multiclass LD_absimm<string OpcStr> { + let isPredicable = 1 in + def _abs_V4 : LDInst2<(outs IntRegs:$dst), + (ins u6Imm:$src), + !strconcat("$dst = ", + !strconcat(OpcStr, "(#$src)")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, u6Imm:$src2), + !strconcat("if ($src1) $dst = ", + !strconcat(OpcStr, "(#$src2)")), + []>, + Requires<[HasV4T]>; 
+ + let isPredicated = 1 in + def _abs_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, u6Imm:$src2), + !strconcat("if (!$src1) $dst = ", + !strconcat(OpcStr, "(#$src2)")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, u6Imm:$src2), + !strconcat("if ($src1.new) $dst = ", + !strconcat(OpcStr, "(#$src2)")), + []>, + Requires<[HasV4T]>; + + let isPredicated = 1 in + def _abs_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), + (ins PredRegs:$src1, u6Imm:$src2), + !strconcat("if (!$src1.new) $dst = ", + !strconcat(OpcStr, "(#$src2)")), + []>, + Requires<[HasV4T]>; +} + +defm LDrib_imm : LD_absimm<"memb">; +defm LDriub_imm : LD_absimm<"memub">; +defm LDrih_imm : LD_absimm<"memh">; +defm LDriuh_imm : LD_absimm<"memuh">; +defm LDriw_imm : LD_absimm<"memw">; + +let Predicates = [HasV4T], AddedComplexity = 30 in +def : Pat<(i32 (load u6ImmPred:$src)), + (LDriw_imm_abs_V4 u6ImmPred:$src)>; + +let Predicates = [HasV4T], AddedComplexity=30 in +def : Pat<(i32 (sextloadi8 u6ImmPred:$src)), + (LDrib_imm_abs_V4 u6ImmPred:$src)>; + +let Predicates = [HasV4T], AddedComplexity=30 in +def : Pat<(i32 (zextloadi8 u6ImmPred:$src)), + (LDriub_imm_abs_V4 u6ImmPred:$src)>; + +let Predicates = [HasV4T], AddedComplexity=30 in +def : Pat<(i32 (sextloadi16 u6ImmPred:$src)), + (LDrih_imm_abs_V4 u6ImmPred:$src)>; + +let Predicates = [HasV4T], AddedComplexity=30 in +def : Pat<(i32 (zextloadi16 u6ImmPred:$src)), + (LDriuh_imm_abs_V4 u6ImmPred:$src)>; + + +// Indexed store word - global address. +// memw(Rs+#u6:2)=##global +let AddedComplexity = 10 in +def STriw_offset_ext_V4 : STInst<(outs), + (ins IntRegs:$src1, u6_2Imm:$src2, globaladdress:$src3), + "memw($src1+#$src2) = ##$src3", + [(store (HexagonCONST32 tglobaladdr:$src3), + (add IntRegs:$src1, u6_2ImmPred:$src2))]>, + Requires<[HasV4T]>; + + +// Indexed store half word - global address. +// memh(Rs+#u6:1)=##global +let AddedComplexity = 10 in +def STrih_offset_ext_V4 : STInst<(outs), + (ins IntRegs:$src1, u6_1Imm:$src2, globaladdress:$src3), + "memh($src1+#$src2) = ##$src3", + [(truncstorei16 (HexagonCONST32 tglobaladdr:$src3), + (add IntRegs:$src1, u6_1ImmPred:$src2))]>, + Requires<[HasV4T]>; diff --git a/lib/Target/Hexagon/HexagonInstrInfoV5.td b/lib/Target/Hexagon/HexagonInstrInfoV5.td new file mode 100644 index 0000000..92d098c --- /dev/null +++ b/lib/Target/Hexagon/HexagonInstrInfoV5.td @@ -0,0 +1,626 @@ +def SDTHexagonFCONST32 : SDTypeProfile<1, 1, [ + SDTCisVT<0, f32>, + SDTCisPtrTy<1>]>; +def HexagonFCONST32 : SDNode<"HexagonISD::FCONST32", SDTHexagonFCONST32>; + +let isReMaterializable = 1, isMoveImm = 1 in +def FCONST32_nsdata : LDInst<(outs IntRegs:$dst), (ins globaladdress:$global), + "$dst = CONST32(#$global)", + [(set (f32 IntRegs:$dst), + (HexagonFCONST32 tglobaladdr:$global))]>, + Requires<[HasV5T]>; + +let isReMaterializable = 1, isMoveImm = 1 in +def CONST64_Float_Real : LDInst<(outs DoubleRegs:$dst), (ins f64imm:$src1), + "$dst = CONST64(#$src1)", + [(set DoubleRegs:$dst, fpimm:$src1)]>, + Requires<[HasV5T]>; + +let isReMaterializable = 1, isMoveImm = 1 in +def CONST32_Float_Real : LDInst<(outs IntRegs:$dst), (ins f32imm:$src1), + "$dst = CONST32(#$src1)", + [(set IntRegs:$dst, fpimm:$src1)]>, + Requires<[HasV5T]>; + +// Transfer immediate float. +// Only works with single-precision fp values. +// For double precision, use CONST64_Float_Real, as a 64-bit transfer +// can only hold 40-bit values: 32 from the constant extender plus an +// 8-bit immediate. 
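+// (Worked out: 32 bits of constant-extender payload plus the 8-bit
+// immediate field gives the 40 bits mentioned above, so any f32 bit
+// pattern fits, while an arbitrary f64 needs all 64 bits and has to go
+// through CONST64_Float_Real instead.)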
+let isMoveImm = 1, isReMaterializable = 1, isPredicable = 1 in +def TFRI_f : ALU32_ri<(outs IntRegs:$dst), (ins f32imm:$src1), + "$dst = ##$src1", + [(set IntRegs:$dst, fpimm:$src1)]>, + Requires<[HasV5T]>; + +def TFRI_cPt_f : ALU32_ri<(outs IntRegs:$dst), + (ins PredRegs:$src1, f32imm:$src2), + "if ($src1) $dst = ##$src2", + []>, + Requires<[HasV5T]>; + +let isPredicated = 1 in +def TFRI_cNotPt_f : ALU32_ri<(outs IntRegs:$dst), + (ins PredRegs:$src1, f32imm:$src2), + "if (!$src1) $dst = ##$src2", + []>, + Requires<[HasV5T]>; + +// Convert single precision to double precision and vice-versa. +def CONVERT_sf2df : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src), + "$dst = convert_sf2df($src)", + [(set DoubleRegs:$dst, (fextend IntRegs:$src))]>, + Requires<[HasV5T]>; + +def CONVERT_df2sf : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src), + "$dst = convert_df2sf($src)", + [(set IntRegs:$dst, (fround DoubleRegs:$src))]>, + Requires<[HasV5T]>; + + +// Load. +def LDrid_f : LDInst<(outs DoubleRegs:$dst), + (ins MEMri:$addr), + "$dst = memd($addr)", + [(set DoubleRegs:$dst, (f64 (load ADDRriS11_3:$addr)))]>, + Requires<[HasV5T]>; + + +let AddedComplexity = 20 in +def LDrid_indexed_f : LDInst<(outs DoubleRegs:$dst), + (ins IntRegs:$src1, s11_3Imm:$offset), + "$dst = memd($src1+#$offset)", + [(set DoubleRegs:$dst, (f64 (load (add IntRegs:$src1, + s11_3ImmPred:$offset))))]>, + Requires<[HasV5T]>; + +def LDriw_f : LDInst<(outs IntRegs:$dst), + (ins MEMri:$addr), "$dst = memw($addr)", + [(set IntRegs:$dst, (f32 (load ADDRriS11_2:$addr)))]>, + Requires<[HasV5T]>; + + +let AddedComplexity = 20 in +def LDriw_indexed_f : LDInst<(outs IntRegs:$dst), + (ins IntRegs:$src1, s11_2Imm:$offset), + "$dst = memw($src1+#$offset)", + [(set IntRegs:$dst, (f32 (load (add IntRegs:$src1, + s11_2ImmPred:$offset))))]>, + Requires<[HasV5T]>; + +// Store. +def STriw_f : STInst<(outs), + (ins MEMri:$addr, IntRegs:$src1), + "memw($addr) = $src1", + [(store (f32 IntRegs:$src1), ADDRriS11_2:$addr)]>, + Requires<[HasV5T]>; + +let AddedComplexity = 10 in +def STriw_indexed_f : STInst<(outs), + (ins IntRegs:$src1, s11_2Imm:$src2, IntRegs:$src3), + "memw($src1+#$src2) = $src3", + [(store (f32 IntRegs:$src3), + (add IntRegs:$src1, s11_2ImmPred:$src2))]>, + Requires<[HasV5T]>; + +def STrid_f : STInst<(outs), + (ins MEMri:$addr, DoubleRegs:$src1), + "memd($addr) = $src1", + [(store (f64 DoubleRegs:$src1), ADDRriS11_2:$addr)]>, + Requires<[HasV5T]>; + +// Indexed store double word. 
+let AddedComplexity = 10 in +def STrid_indexed_f : STInst<(outs), + (ins IntRegs:$src1, s11_3Imm:$src2, DoubleRegs:$src3), + "memd($src1+#$src2) = $src3", + [(store (f64 DoubleRegs:$src3), + (add IntRegs:$src1, s11_3ImmPred:$src2))]>, + Requires<[HasV5T]>; + + +// Add +let isCommutable = 1 in +def fADD_rr : ALU64_rr<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst = sfadd($src1, $src2)", + [(set IntRegs:$dst, (fadd IntRegs:$src1, IntRegs:$src2))]>, + Requires<[HasV5T]>; + +let isCommutable = 1 in +def fADD64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, + DoubleRegs:$src2), + "$dst = dfadd($src1, $src2)", + [(set DoubleRegs:$dst, (fadd DoubleRegs:$src1, + DoubleRegs:$src2))]>, + Requires<[HasV5T]>; + +def fSUB_rr : ALU64_rr<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst = sfsub($src1, $src2)", + [(set IntRegs:$dst, (fsub IntRegs:$src1, IntRegs:$src2))]>, + Requires<[HasV5T]>; + +def fSUB64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, + DoubleRegs:$src2), + "$dst = dfsub($src1, $src2)", + [(set DoubleRegs:$dst, (fsub DoubleRegs:$src1, + DoubleRegs:$src2))]>, + Requires<[HasV5T]>; + +let isCommutable = 1 in +def fMUL_rr : ALU64_rr<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst = sfmpy($src1, $src2)", + [(set IntRegs:$dst, (fmul IntRegs:$src1, IntRegs:$src2))]>, + Requires<[HasV5T]>; + +let isCommutable = 1 in +def fMUL64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, + DoubleRegs:$src2), + "$dst = dfmpy($src1, $src2)", + [(set DoubleRegs:$dst, (fmul DoubleRegs:$src1, + DoubleRegs:$src2))]>, + Requires<[HasV5T]>; + +// Compare. +let isCompare = 1 in { +multiclass FCMP64_rr<string OpcStr, PatFrag OpNode> { + def _rr : ALU64_rr<(outs PredRegs:$dst), (ins DoubleRegs:$b, DoubleRegs:$c), + !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")), + [(set PredRegs:$dst, + (OpNode (f64 DoubleRegs:$b), (f64 DoubleRegs:$c)))]>, + Requires<[HasV5T]>; +} + +multiclass FCMP32_rr<string OpcStr, PatFrag OpNode> { + def _rr : ALU64_rr<(outs PredRegs:$dst), (ins IntRegs:$b, IntRegs:$c), + !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")), + [(set PredRegs:$dst, + (OpNode (f32 IntRegs:$b), (f32 IntRegs:$c)))]>, + Requires<[HasV5T]>; +} +} + +defm FCMPOEQ64 : FCMP64_rr<"dfcmp.eq", setoeq>; +defm FCMPUEQ64 : FCMP64_rr<"dfcmp.eq", setueq>; +defm FCMPOGT64 : FCMP64_rr<"dfcmp.gt", setogt>; +defm FCMPUGT64 : FCMP64_rr<"dfcmp.gt", setugt>; +defm FCMPOGE64 : FCMP64_rr<"dfcmp.ge", setoge>; +defm FCMPUGE64 : FCMP64_rr<"dfcmp.ge", setuge>; + +defm FCMPOEQ32 : FCMP32_rr<"sfcmp.eq", setoeq>; +defm FCMPUEQ32 : FCMP32_rr<"sfcmp.eq", setueq>; +defm FCMPOGT32 : FCMP32_rr<"sfcmp.gt", setogt>; +defm FCMPUGT32 : FCMP32_rr<"sfcmp.gt", setugt>; +defm FCMPOGE32 : FCMP32_rr<"sfcmp.ge", setoge>; +defm FCMPUGE32 : FCMP32_rr<"sfcmp.ge", setuge>; + +// olt. +def : Pat <(i1 (setolt (f32 IntRegs:$src1), (f32 IntRegs:$src2))), + (i1 (FCMPOGT32_rr IntRegs:$src2, IntRegs:$src1))>, + Requires<[HasV5T]>; + +def : Pat <(i1 (setolt (f32 IntRegs:$src1), (fpimm:$src2))), + (i1 (FCMPOGT32_rr (f32 (TFRI_f fpimm:$src2)), (f32 IntRegs:$src1)))>, + Requires<[HasV5T]>; + +def : Pat <(i1 (setolt (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))), + (i1 (FCMPOGT64_rr DoubleRegs:$src2, DoubleRegs:$src1))>, + Requires<[HasV5T]>; + +def : Pat <(i1 (setolt (f64 DoubleRegs:$src1), (fpimm:$src2))), + (i1 (FCMPOGT64_rr (f64 (CONST64_Float_Real fpimm:$src2)), + (f64 DoubleRegs:$src1)))>, + Requires<[HasV5T]>; + +// gt. 
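+// (Only the immediate forms need explicit patterns in this section; the
+// register-register forms are matched by the FCMP*_rr definitions above.
+// Note also the trick used throughout: the ISA has no lt/le compares, so
+// the lt/le patterns swap their operands into the gt/ge forms, e.g.
+// setolt(a, b) is selected as dfcmp.gt(b, a).)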
+def : Pat <(i1 (setugt (f64 DoubleRegs:$src1), (fpimm:$src2))), + (i1 (FCMPUGT64_rr (f64 DoubleRegs:$src1), + (f64 (CONST64_Float_Real fpimm:$src2))))>, + Requires<[HasV5T]>; + +def : Pat <(i1 (setugt (f32 IntRegs:$src1), (fpimm:$src2))), + (i1 (FCMPUGT32_rr (f32 IntRegs:$src1), (f32 (TFRI_f fpimm:$src2))))>, + Requires<[HasV5T]>; + +// ult. +def : Pat <(i1 (setult (f32 IntRegs:$src1), (f32 IntRegs:$src2))), + (i1 (FCMPUGT32_rr IntRegs:$src2, IntRegs:$src1))>, + Requires<[HasV5T]>; + +def : Pat <(i1 (setult (f32 IntRegs:$src1), (fpimm:$src2))), + (i1 (FCMPUGT32_rr (f32 (TFRI_f fpimm:$src2)), (f32 IntRegs:$src1)))>, + Requires<[HasV5T]>; + +def : Pat <(i1 (setult (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))), + (i1 (FCMPUGT64_rr DoubleRegs:$src2, DoubleRegs:$src1))>, + Requires<[HasV5T]>; + +def : Pat <(i1 (setult (f64 DoubleRegs:$src1), (fpimm:$src2))), + (i1 (FCMPUGT64_rr (f64 (CONST64_Float_Real fpimm:$src2)), + (f64 DoubleRegs:$src1)))>, + Requires<[HasV5T]>; + +// le. +// rs <= rt -> rt >= rs. +def : Pat<(i1 (setole (f32 IntRegs:$src1), (f32 IntRegs:$src2))), + (i1 (FCMPOGE32_rr IntRegs:$src2, IntRegs:$src1))>, + Requires<[HasV5T]>; + +def : Pat<(i1 (setole (f32 IntRegs:$src1), (fpimm:$src2))), + (i1 (FCMPOGE32_rr (f32 (TFRI_f fpimm:$src2)), IntRegs:$src1))>, + Requires<[HasV5T]>; + + +// Rss <= Rtt -> Rtt >= Rss. +def : Pat<(i1 (setole (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))), + (i1 (FCMPOGE64_rr DoubleRegs:$src2, DoubleRegs:$src1))>, + Requires<[HasV5T]>; + +def : Pat<(i1 (setole (f64 DoubleRegs:$src1), (fpimm:$src2))), + (i1 (FCMPOGE64_rr (f64 (CONST64_Float_Real fpimm:$src2)), + DoubleRegs:$src1))>, + Requires<[HasV5T]>; + +// rs <= rt -> rt >= rs. +def : Pat<(i1 (setule (f32 IntRegs:$src1), (f32 IntRegs:$src2))), + (i1 (FCMPUGE32_rr IntRegs:$src2, IntRegs:$src1))>, + Requires<[HasV5T]>; + +def : Pat<(i1 (setule (f32 IntRegs:$src1), (fpimm:$src2))), + (i1 (FCMPUGE32_rr (f32 (TFRI_f fpimm:$src2)), IntRegs:$src1))>, + Requires<[HasV5T]>; + +// Rss <= Rtt -> Rtt >= Rss. +def : Pat<(i1 (setule (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))), + (i1 (FCMPUGE64_rr DoubleRegs:$src2, DoubleRegs:$src1))>, + Requires<[HasV5T]>; + +def : Pat<(i1 (setule (f64 DoubleRegs:$src1), (fpimm:$src2))), + (i1 (FCMPUGE64_rr (f64 (CONST64_Float_Real fpimm:$src2)), + DoubleRegs:$src1))>, + Requires<[HasV5T]>; + +// ne. 
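+// (There is likewise no fcmp.ne instruction: setone/setune are synthesized
+// below by negating the corresponding eq compare with NOT_p.)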
+def : Pat<(i1 (setone (f32 IntRegs:$src1), (f32 IntRegs:$src2))), + (i1 (NOT_p (FCMPOEQ32_rr IntRegs:$src1, IntRegs:$src2)))>, + Requires<[HasV5T]>; + +def : Pat<(i1 (setone (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))), + (i1 (NOT_p (FCMPOEQ64_rr DoubleRegs:$src1, DoubleRegs:$src2)))>, + Requires<[HasV5T]>; + +def : Pat<(i1 (setune (f32 IntRegs:$src1), (f32 IntRegs:$src2))), + (i1 (NOT_p (FCMPUEQ32_rr IntRegs:$src1, IntRegs:$src2)))>, + Requires<[HasV5T]>; + +def : Pat<(i1 (setune (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))), + (i1 (NOT_p (FCMPUEQ64_rr DoubleRegs:$src1, DoubleRegs:$src2)))>, + Requires<[HasV5T]>; + +def : Pat<(i1 (setone (f32 IntRegs:$src1), (fpimm:$src2))), + (i1 (NOT_p (FCMPOEQ32_rr IntRegs:$src1, (f32 (TFRI_f fpimm:$src2)))))>, + Requires<[HasV5T]>; + +def : Pat<(i1 (setone (f64 DoubleRegs:$src1), (fpimm:$src2))), + (i1 (NOT_p (FCMPOEQ64_rr DoubleRegs:$src1, + (f64 (CONST64_Float_Real fpimm:$src2)))))>, + Requires<[HasV5T]>; + +def : Pat<(i1 (setune (f32 IntRegs:$src1), (fpimm:$src2))), + (i1 (NOT_p (FCMPUEQ32_rr IntRegs:$src1, (f32 (TFRI_f fpimm:$src2)))))>, + Requires<[HasV5T]>; + +def : Pat<(i1 (setune (f64 DoubleRegs:$src1), (fpimm:$src2))), + (i1 (NOT_p (FCMPUEQ64_rr DoubleRegs:$src1, + (f64 (CONST64_Float_Real fpimm:$src2)))))>, + Requires<[HasV5T]>; + +// Convert Integer to Floating Point. +def CONVERT_d2sf : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src), + "$dst = convert_d2sf($src)", + [(set (f32 IntRegs:$dst), (sint_to_fp (i64 DoubleRegs:$src)))]>, + Requires<[HasV5T]>; + +def CONVERT_ud2sf : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src), + "$dst = convert_ud2sf($src)", + [(set (f32 IntRegs:$dst), (uint_to_fp (i64 DoubleRegs:$src)))]>, + Requires<[HasV5T]>; + +def CONVERT_uw2sf : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src), + "$dst = convert_uw2sf($src)", + [(set (f32 IntRegs:$dst), (uint_to_fp (i32 IntRegs:$src)))]>, + Requires<[HasV5T]>; + +def CONVERT_w2sf : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src), + "$dst = convert_w2sf($src)", + [(set (f32 IntRegs:$dst), (sint_to_fp (i32 IntRegs:$src)))]>, + Requires<[HasV5T]>; + +def CONVERT_d2df : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src), + "$dst = convert_d2df($src)", + [(set (f64 DoubleRegs:$dst), (sint_to_fp (i64 DoubleRegs:$src)))]>, + Requires<[HasV5T]>; + +def CONVERT_ud2df : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src), + "$dst = convert_ud2df($src)", + [(set (f64 DoubleRegs:$dst), (uint_to_fp (i64 DoubleRegs:$src)))]>, + Requires<[HasV5T]>; + +def CONVERT_uw2df : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src), + "$dst = convert_uw2df($src)", + [(set (f64 DoubleRegs:$dst), (uint_to_fp (i32 IntRegs:$src)))]>, + Requires<[HasV5T]>; + +def CONVERT_w2df : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src), + "$dst = convert_w2df($src)", + [(set (f64 DoubleRegs:$dst), (sint_to_fp (i32 IntRegs:$src)))]>, + Requires<[HasV5T]>; + +// Convert Floating Point to Integer - default. 
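+// (The :chop variants truncate toward zero, which matches the semantics of
+// fp_to_sint/fp_to_uint -- in C terms, (int)-3.7 == -3. The suffix-less
+// variants further down round in the current IEEE rounding mode and are
+// therefore only selected under the IEEERndNearV5T predicate.)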
+def CONVERT_df2uw : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src), + "$dst = convert_df2uw($src):chop", + [(set (i32 IntRegs:$dst), (fp_to_uint (f64 DoubleRegs:$src)))]>, + Requires<[HasV5T]>; + +def CONVERT_df2w : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src), + "$dst = convert_df2w($src):chop", + [(set (i32 IntRegs:$dst), (fp_to_sint (f64 DoubleRegs:$src)))]>, + Requires<[HasV5T]>; + +def CONVERT_sf2uw : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src), + "$dst = convert_sf2uw($src):chop", + [(set (i32 IntRegs:$dst), (fp_to_uint (f32 IntRegs:$src)))]>, + Requires<[HasV5T]>; + +def CONVERT_sf2w : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src), + "$dst = convert_sf2w($src):chop", + [(set (i32 IntRegs:$dst), (fp_to_sint (f32 IntRegs:$src)))]>, + Requires<[HasV5T]>; + +def CONVERT_df2d : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src), + "$dst = convert_df2d($src):chop", + [(set (i64 DoubleRegs:$dst), (fp_to_sint (f64 DoubleRegs:$src)))]>, + Requires<[HasV5T]>; + +def CONVERT_df2ud : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src), + "$dst = convert_df2ud($src):chop", + [(set (i64 DoubleRegs:$dst), (fp_to_uint (f64 DoubleRegs:$src)))]>, + Requires<[HasV5T]>; + +def CONVERT_sf2d : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src), + "$dst = convert_sf2d($src):chop", + [(set (i64 DoubleRegs:$dst), (fp_to_sint (f32 IntRegs:$src)))]>, + Requires<[HasV5T]>; + +def CONVERT_sf2ud : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src), + "$dst = convert_sf2ud($src):chop", + [(set (i64 DoubleRegs:$dst), (fp_to_uint (f32 IntRegs:$src)))]>, + Requires<[HasV5T]>; + +// Convert Floating Point to Integer: non-chopped. +let AddedComplexity = 20 in +def CONVERT_df2uw_nchop : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src), + "$dst = convert_df2uw($src)", + [(set (i32 IntRegs:$dst), (fp_to_uint (f64 DoubleRegs:$src)))]>, + Requires<[HasV5T, IEEERndNearV5T]>; + +let AddedComplexity = 20 in +def CONVERT_df2w_nchop : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src), + "$dst = convert_df2w($src)", + [(set (i32 IntRegs:$dst), (fp_to_sint (f64 DoubleRegs:$src)))]>, + Requires<[HasV5T, IEEERndNearV5T]>; + +let AddedComplexity = 20 in +def CONVERT_sf2uw_nchop : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src), + "$dst = convert_sf2uw($src)", + [(set (i32 IntRegs:$dst), (fp_to_uint (f32 IntRegs:$src)))]>, + Requires<[HasV5T, IEEERndNearV5T]>; + +let AddedComplexity = 20 in +def CONVERT_sf2w_nchop : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src), + "$dst = convert_sf2w($src)", + [(set (i32 IntRegs:$dst), (fp_to_sint (f32 IntRegs:$src)))]>, + Requires<[HasV5T, IEEERndNearV5T]>; + +let AddedComplexity = 20 in +def CONVERT_df2d_nchop : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src), + "$dst = convert_df2d($src)", + [(set (i64 DoubleRegs:$dst), (fp_to_sint (f64 DoubleRegs:$src)))]>, + Requires<[HasV5T, IEEERndNearV5T]>; + +let AddedComplexity = 20 in +def CONVERT_df2ud_nchop : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src), + "$dst = convert_df2ud($src)", + [(set (i64 DoubleRegs:$dst), (fp_to_uint (f64 DoubleRegs:$src)))]>, + Requires<[HasV5T, IEEERndNearV5T]>; + +let AddedComplexity = 20 in +def CONVERT_sf2d_nchop : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src), + "$dst = convert_sf2d($src)", + [(set (i64 DoubleRegs:$dst), (fp_to_sint (f32 IntRegs:$src)))]>, + Requires<[HasV5T, IEEERndNearV5T]>; + +let AddedComplexity = 20 in +def CONVERT_sf2ud_nchop : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src), + "$dst = convert_sf2ud($src)", + [(set (i64 
DoubleRegs:$dst), (fp_to_uint (f32 IntRegs:$src)))]>, + Requires<[HasV5T, IEEERndNearV5T]>; + + + +// Bitcast is different from [fp|sint|uint]_to_[sint|uint|fp]. +def : Pat <(i32 (bitconvert (f32 IntRegs:$src))), + (i32 (TFR IntRegs:$src))>, + Requires<[HasV5T]>; + +def : Pat <(f32 (bitconvert (i32 IntRegs:$src))), + (f32 (TFR IntRegs:$src))>, + Requires<[HasV5T]>; + +def : Pat <(i64 (bitconvert (f64 DoubleRegs:$src))), + (i64 (TFR64 DoubleRegs:$src))>, + Requires<[HasV5T]>; + +def : Pat <(f64 (bitconvert (i64 DoubleRegs:$src))), + (f64 (TFR64 DoubleRegs:$src))>, + Requires<[HasV5T]>; + +// Floating point fused multiply-add. +def FMADD_dp : ALU64_acc<(outs DoubleRegs:$dst), + (ins DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), + "$dst += dfmpy($src2, $src3)", + [(set (f64 DoubleRegs:$dst), + (fma DoubleRegs:$src2, DoubleRegs:$src3, DoubleRegs:$src1))], + "$src1 = $dst">, + Requires<[HasV5T]>; + +def FMADD_sp : ALU64_acc<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), + "$dst += sfmpy($src2, $src3)", + [(set (f32 IntRegs:$dst), + (fma IntRegs:$src2, IntRegs:$src3, IntRegs:$src1))], + "$src1 = $dst">, + Requires<[HasV5T]>; + + +// Floating point max/min. +let AddedComplexity = 100 in +def FMAX_dp : ALU64_rr<(outs DoubleRegs:$dst), + (ins DoubleRegs:$src1, DoubleRegs:$src2), + "$dst = dfmax($src1, $src2)", + [(set DoubleRegs:$dst, (f64 (select (i1 (setolt DoubleRegs:$src2, + DoubleRegs:$src1)), + DoubleRegs:$src1, + DoubleRegs:$src2)))]>, + Requires<[HasV5T]>; + +let AddedComplexity = 100 in +def FMAX_sp : ALU64_rr<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst = sfmax($src1, $src2)", + [(set IntRegs:$dst, (f32 (select (i1 (setolt IntRegs:$src2, + IntRegs:$src1)), + IntRegs:$src1, + IntRegs:$src2)))]>, + Requires<[HasV5T]>; + +let AddedComplexity = 100 in +def FMIN_dp : ALU64_rr<(outs DoubleRegs:$dst), + (ins DoubleRegs:$src1, DoubleRegs:$src2), + "$dst = dfmin($src1, $src2)", + [(set DoubleRegs:$dst, (f64 (select (i1 (setogt DoubleRegs:$src2, + DoubleRegs:$src1)), + DoubleRegs:$src1, + DoubleRegs:$src2)))]>, + Requires<[HasV5T]>; + +let AddedComplexity = 100 in +def FMIN_sp : ALU64_rr<(outs IntRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst = sfmin($src1, $src2)", + [(set IntRegs:$dst, (f32 (select (i1 (setogt IntRegs:$src2, + IntRegs:$src1)), + IntRegs:$src1, + IntRegs:$src2)))]>, + Requires<[HasV5T]>; + +// Pseudo instruction to encode a set of conditional transfers. +// This instruction is used instead of a mux and trades off code size +// for performance. We conduct this transformation optimistically in +// the hope that these instructions get promoted to dot-new transfers. 
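+// (For example, r0 = mux(p0, r1, r2) can instead be emitted as the pair
+// "if (p0) r0 = r1" / "if (!p0) r0 = r2", which can packetize with the
+// compare that defines p0 once promoted to dot-new form; a sketch of the
+// intent, not of the exact expansion.)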
+let AddedComplexity = 100, isPredicated = 1 in +def TFR_condset_rr_f : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1, + IntRegs:$src2, + IntRegs:$src3), + "Error; should not emit", + [(set IntRegs:$dst, (f32 (select PredRegs:$src1, + IntRegs:$src2, + IntRegs:$src3)))]>, + Requires<[HasV5T]>; + +let AddedComplexity = 100, isPredicated = 1 in +def TFR_condset_rr64_f : ALU32_rr<(outs DoubleRegs:$dst), (ins PredRegs:$src1, + DoubleRegs:$src2, + DoubleRegs:$src3), + "Error; should not emit", + [(set DoubleRegs:$dst, (f64 (select PredRegs:$src1, + DoubleRegs:$src2, + DoubleRegs:$src3)))]>, + Requires<[HasV5T]>; + + + +let AddedComplexity = 100, isPredicated = 1 in +def TFR_condset_ri_f : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, f32imm:$src3), + "Error; should not emit", + [(set IntRegs:$dst, + (f32 (select PredRegs:$src1, IntRegs:$src2, fpimm:$src3)))]>, + Requires<[HasV5T]>; + +let AddedComplexity = 100, isPredicated = 1 in +def TFR_condset_ir_f : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, f32imm:$src2, IntRegs:$src3), + "Error; should not emit", + [(set IntRegs:$dst, + (f32 (select PredRegs:$src1, fpimm:$src2, IntRegs:$src3)))]>, + Requires<[HasV5T]>; + +let AddedComplexity = 100, isPredicated = 1 in +def TFR_condset_ii_f : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$src1, f32imm:$src2, f32imm:$src3), + "Error; should not emit", + [(set IntRegs:$dst, (f32 (select PredRegs:$src1, + fpimm:$src2, + fpimm:$src3)))]>, + Requires<[HasV5T]>; + + +def : Pat <(select (i1 (setult (f32 IntRegs:$src1), (f32 IntRegs:$src2))), + (f32 IntRegs:$src3), + (f32 IntRegs:$src4)), + (TFR_condset_rr_f (FCMPUGT32_rr IntRegs:$src2, IntRegs:$src1), IntRegs:$src4, + IntRegs:$src3)>, Requires<[HasV5T]>; + +def : Pat <(select (i1 (setult (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))), + (f64 DoubleRegs:$src3), + (f64 DoubleRegs:$src4)), + (TFR_condset_rr64_f (FCMPUGT64_rr DoubleRegs:$src2, DoubleRegs:$src1), + DoubleRegs:$src4, DoubleRegs:$src3)>, Requires<[HasV5T]>; + +// Map from p0 = pnot(p0); r0 = mux(p0, #i, #j) => r0 = mux(p0, #j, #i). 
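+// (Swapping the two mux inputs absorbs the predicate negation, so the
+// pnot never needs to be materialized.)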
+def : Pat <(select (not PredRegs:$src1), fpimm:$src2, fpimm:$src3), + (TFR_condset_ii_f PredRegs:$src1, fpimm:$src3, fpimm:$src2)>; + +// Map from p0 = pnot(p0); r0 = select(p0, #i, r1) +// => r0 = TFR_condset_ri(p0, r1, #i) +def : Pat <(select (not PredRegs:$src1), fpimm:$src2, IntRegs:$src3), + (TFR_condset_ri_f PredRegs:$src1, IntRegs:$src3, fpimm:$src2)>; + +// Map from p0 = pnot(p0); r0 = mux(p0, r1, #i) +// => r0 = TFR_condset_ir(p0, #i, r1) +def : Pat <(select (not PredRegs:$src1), IntRegs:$src2, fpimm:$src3), + (TFR_condset_ir_f PredRegs:$src1, fpimm:$src3, IntRegs:$src2)>; + +def : Pat <(i32 (fp_to_sint (f64 DoubleRegs:$src1))), + (i32 (EXTRACT_SUBREG (i64 (CONVERT_df2d (f64 DoubleRegs:$src1))), subreg_loreg))>, + Requires<[HasV5T]>; + +def : Pat <(fabs (f32 IntRegs:$src1)), + (CLRBIT_31 (f32 IntRegs:$src1), 31)>, + Requires<[HasV5T]>; + +def : Pat <(fneg (f32 IntRegs:$src1)), + (TOGBIT_31 (f32 IntRegs:$src1), 31)>, + Requires<[HasV5T]>; + +/* +def : Pat <(fabs (f64 DoubleRegs:$src1)), + (CLRBIT_31 (f32 (EXTRACT_SUBREG DoubleRegs:$src1, subreg_hireg)), 31)>, + Requires<[HasV5T]>; + +def : Pat <(fabs (f64 DoubleRegs:$src1)), + (CLRBIT_31 (f32 (EXTRACT_SUBREG DoubleRegs:$src1, subreg_hireg)), 31)>, + Requires<[HasV5T]>; + */ diff --git a/lib/Target/Hexagon/HexagonIntrinsics.td b/lib/Target/Hexagon/HexagonIntrinsics.td index b15e293..99f59d5 100644 --- a/lib/Target/Hexagon/HexagonIntrinsics.td +++ b/lib/Target/Hexagon/HexagonIntrinsics.td @@ -551,13 +551,6 @@ class di_SInst_diu6u6<string opc, Intrinsic IntID> [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2, imm:$src3))]>; -class di_SInst_didisi<string opc, Intrinsic IntID> - : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2, - IntRegs:$src3), - !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, $src3)")), - [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2, - IntRegs:$src3))]>; - class di_SInst_didiqi<string opc, Intrinsic IntID> : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3), @@ -818,6 +811,11 @@ class di_MInst_s8s8<string opc, Intrinsic IntID> !strconcat("$dst = ", !strconcat(opc , "(#$src1, #$src2)")), [(set DoubleRegs:$dst, (IntID imm:$src1, imm:$src2))]>; +class si_MInst_sis9<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s9Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>; + class si_MInst_sisi<string opc, Intrinsic IntID> : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), @@ -952,6 +950,17 @@ class si_SInst_sisi_sat<string opc, Intrinsic IntID> !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")), [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; +class si_SInst_didi_sat<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")), + [(set IntRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>; + +class si_SInst_disi_s1_rnd_sat<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , + "($src1, $src2):<<1:rnd:sat")), + [(set IntRegs:$dst, (IntID DoubleRegs:$src1, IntRegs:$src2))]>; + class si_MInst_sisi_s1_rnd_sat<string opc, Intrinsic IntID> : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), !strconcat("$dst = ", !strconcat(opc 
, @@ -1612,6 +1621,18 @@ class di_MInst_dididi_acc_rnd_sat<string opc, Intrinsic IntID> DoubleRegs:$src2))], "$dst2 = $dst">; +class di_MInst_dididi_acc_s1<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, + DoubleRegs:$src1, + DoubleRegs:$src2), + !strconcat("$dst += ", + !strconcat(opc , "($src1, $src2):<<1")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, + DoubleRegs:$src1, + DoubleRegs:$src2))], + "$dst2 = $dst">; + + class di_MInst_dididi_acc_s1_sat<string opc, Intrinsic IntID> : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1, @@ -1822,53 +1843,63 @@ class si_MInst_didi<string opc, Intrinsic IntID> !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), [(set IntRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>; +// +// LDInst classes. +// +let mayLoad = 1, neverHasSideEffects = 1 in +class di_LDInstPI_diu4<string opc, Intrinsic IntID> + : LDInstPI<(outs IntRegs:$dst, DoubleRegs:$dst2), + (ins IntRegs:$src1, IntRegs:$src2, CRRegs:$src3, s4Imm:$offset), + "$dst2 = memd($src1++#$offset:circ($src3))", + [], + "$src1 = $dst">; /******************************************************************** * ALU32/ALU * *********************************************************************/ // ALU32 / ALU / Add. -def Hexagon_A2_add: +def HEXAGON_A2_add: si_ALU32_sisi <"add", int_hexagon_A2_add>; -def Hexagon_A2_addi: +def HEXAGON_A2_addi: si_ALU32_sis16 <"add", int_hexagon_A2_addi>; // ALU32 / ALU / Logical operations. -def Hexagon_A2_and: +def HEXAGON_A2_and: si_ALU32_sisi <"and", int_hexagon_A2_and>; -def Hexagon_A2_andir: +def HEXAGON_A2_andir: si_ALU32_sis10 <"and", int_hexagon_A2_andir>; -def Hexagon_A2_not: +def HEXAGON_A2_not: si_ALU32_si <"not", int_hexagon_A2_not>; -def Hexagon_A2_or: +def HEXAGON_A2_or: si_ALU32_sisi <"or", int_hexagon_A2_or>; -def Hexagon_A2_orir: +def HEXAGON_A2_orir: si_ALU32_sis10 <"or", int_hexagon_A2_orir>; -def Hexagon_A2_xor: +def HEXAGON_A2_xor: si_ALU32_sisi <"xor", int_hexagon_A2_xor>; // ALU32 / ALU / Negate. -def Hexagon_A2_neg: +def HEXAGON_A2_neg: si_ALU32_si <"neg", int_hexagon_A2_neg>; // ALU32 / ALU / Subtract. -def Hexagon_A2_sub: +def HEXAGON_A2_sub: si_ALU32_sisi <"sub", int_hexagon_A2_sub>; -def Hexagon_A2_subri: +def HEXAGON_A2_subri: si_ALU32_s10si <"sub", int_hexagon_A2_subri>; // ALU32 / ALU / Transfer Immediate. -def Hexagon_A2_tfril: +def HEXAGON_A2_tfril: si_lo_ALU32_siu16 <"", int_hexagon_A2_tfril>; -def Hexagon_A2_tfrih: +def HEXAGON_A2_tfrih: si_hi_ALU32_siu16 <"", int_hexagon_A2_tfrih>; -def Hexagon_A2_tfrsi: +def HEXAGON_A2_tfrsi: si_ALU32_s16 <"", int_hexagon_A2_tfrsi>; -def Hexagon_A2_tfrpi: +def HEXAGON_A2_tfrpi: di_ALU32_s8 <"", int_hexagon_A2_tfrpi>; // ALU32 / ALU / Transfer Register. -def Hexagon_A2_tfr: +def HEXAGON_A2_tfr: si_ALU32_si_tfr <"", int_hexagon_A2_tfr>; /******************************************************************** @@ -1876,45 +1907,45 @@ def Hexagon_A2_tfr: *********************************************************************/ // ALU32 / PERM / Combine. 
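// Sketch of the combine semantics assumed by the defs that follow: the
// first source supplies the high 32 bits of the 64-bit register pair and
// the second the low 32 bits (hedged reading of "Rdd = combine(Rs, Rt)";
// illustrative only).
#include <cstdint>
static inline uint64_t combinew(uint32_t rs, uint32_t rt) {
  return (uint64_t(rs) << 32) | rt;   // Rs -> high word, Rt -> low word
}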
-def Hexagon_A2_combinew: +def HEXAGON_A2_combinew: di_ALU32_sisi <"combine", int_hexagon_A2_combinew>; -def Hexagon_A2_combine_hh: +def HEXAGON_A2_combine_hh: si_MInst_sisi_hh <"combine", int_hexagon_A2_combine_hh>; -def Hexagon_A2_combine_lh: +def HEXAGON_A2_combine_lh: si_MInst_sisi_lh <"combine", int_hexagon_A2_combine_lh>; -def Hexagon_A2_combine_hl: +def HEXAGON_A2_combine_hl: si_MInst_sisi_hl <"combine", int_hexagon_A2_combine_hl>; -def Hexagon_A2_combine_ll: +def HEXAGON_A2_combine_ll: si_MInst_sisi_ll <"combine", int_hexagon_A2_combine_ll>; -def Hexagon_A2_combineii: +def HEXAGON_A2_combineii: di_MInst_s8s8 <"combine", int_hexagon_A2_combineii>; // ALU32 / PERM / Mux. -def Hexagon_C2_mux: +def HEXAGON_C2_mux: si_ALU32_qisisi <"mux", int_hexagon_C2_mux>; -def Hexagon_C2_muxri: +def HEXAGON_C2_muxri: si_ALU32_qis8si <"mux", int_hexagon_C2_muxri>; -def Hexagon_C2_muxir: +def HEXAGON_C2_muxir: si_ALU32_qisis8 <"mux", int_hexagon_C2_muxir>; -def Hexagon_C2_muxii: +def HEXAGON_C2_muxii: si_ALU32_qis8s8 <"mux", int_hexagon_C2_muxii>; // ALU32 / PERM / Shift halfword. -def Hexagon_A2_aslh: +def HEXAGON_A2_aslh: si_ALU32_si <"aslh", int_hexagon_A2_aslh>; -def Hexagon_A2_asrh: +def HEXAGON_A2_asrh: si_ALU32_si <"asrh", int_hexagon_A2_asrh>; def SI_to_SXTHI_asrh: si_ALU32_si <"asrh", int_hexagon_SI_to_SXTHI_asrh>; // ALU32 / PERM / Sign/zero extend. -def Hexagon_A2_sxth: +def HEXAGON_A2_sxth: si_ALU32_si <"sxth", int_hexagon_A2_sxth>; -def Hexagon_A2_sxtb: +def HEXAGON_A2_sxtb: si_ALU32_si <"sxtb", int_hexagon_A2_sxtb>; -def Hexagon_A2_zxth: +def HEXAGON_A2_zxth: si_ALU32_si <"zxth", int_hexagon_A2_zxth>; -def Hexagon_A2_zxtb: +def HEXAGON_A2_zxtb: si_ALU32_si <"zxtb", int_hexagon_A2_zxtb>; /******************************************************************** @@ -1922,25 +1953,25 @@ def Hexagon_A2_zxtb: *********************************************************************/ // ALU32 / PRED / Compare. -def Hexagon_C2_cmpeq: +def HEXAGON_C2_cmpeq: qi_ALU32_sisi <"cmp.eq", int_hexagon_C2_cmpeq>; -def Hexagon_C2_cmpeqi: +def HEXAGON_C2_cmpeqi: qi_ALU32_sis10 <"cmp.eq", int_hexagon_C2_cmpeqi>; -def Hexagon_C2_cmpgei: +def HEXAGON_C2_cmpgei: qi_ALU32_sis8 <"cmp.ge", int_hexagon_C2_cmpgei>; -def Hexagon_C2_cmpgeui: +def HEXAGON_C2_cmpgeui: qi_ALU32_siu8 <"cmp.geu", int_hexagon_C2_cmpgeui>; -def Hexagon_C2_cmpgt: +def HEXAGON_C2_cmpgt: qi_ALU32_sisi <"cmp.gt", int_hexagon_C2_cmpgt>; -def Hexagon_C2_cmpgti: +def HEXAGON_C2_cmpgti: qi_ALU32_sis10 <"cmp.gt", int_hexagon_C2_cmpgti>; -def Hexagon_C2_cmpgtu: +def HEXAGON_C2_cmpgtu: qi_ALU32_sisi <"cmp.gtu", int_hexagon_C2_cmpgtu>; -def Hexagon_C2_cmpgtui: +def HEXAGON_C2_cmpgtui: qi_ALU32_siu9 <"cmp.gtu", int_hexagon_C2_cmpgtui>; -def Hexagon_C2_cmplt: +def HEXAGON_C2_cmplt: qi_ALU32_sisi <"cmp.lt", int_hexagon_C2_cmplt>; -def Hexagon_C2_cmpltu: +def HEXAGON_C2_cmpltu: qi_ALU32_sisi <"cmp.ltu", int_hexagon_C2_cmpltu>; /******************************************************************** @@ -1949,27 +1980,27 @@ def Hexagon_C2_cmpltu: // ALU32 / VH / Vector add halfwords. // Rd32=vadd[u]h(Rs32,Rt32:sat] -def Hexagon_A2_svaddh: +def HEXAGON_A2_svaddh: si_ALU32_sisi <"vaddh", int_hexagon_A2_svaddh>; -def Hexagon_A2_svaddhs: +def HEXAGON_A2_svaddhs: si_ALU32_sisi_sat <"vaddh", int_hexagon_A2_svaddhs>; -def Hexagon_A2_svadduhs: +def HEXAGON_A2_svadduhs: si_ALU32_sisi_sat <"vadduh", int_hexagon_A2_svadduhs>; // ALU32 / VH / Vector average halfwords. 
-def Hexagon_A2_svavgh: +def HEXAGON_A2_svavgh: si_ALU32_sisi <"vavgh", int_hexagon_A2_svavgh>; -def Hexagon_A2_svavghs: +def HEXAGON_A2_svavghs: si_ALU32_sisi_rnd <"vavgh", int_hexagon_A2_svavghs>; -def Hexagon_A2_svnavgh: +def HEXAGON_A2_svnavgh: si_ALU32_sisi <"vnavgh", int_hexagon_A2_svnavgh>; // ALU32 / VH / Vector subtract halfwords. -def Hexagon_A2_svsubh: +def HEXAGON_A2_svsubh: si_ALU32_sisi <"vsubh", int_hexagon_A2_svsubh>; -def Hexagon_A2_svsubhs: +def HEXAGON_A2_svsubhs: si_ALU32_sisi_sat <"vsubh", int_hexagon_A2_svsubhs>; -def Hexagon_A2_svsubuhs: +def HEXAGON_A2_svsubuhs: si_ALU32_sisi_sat <"vsubuh", int_hexagon_A2_svsubuhs>; /******************************************************************** @@ -1977,109 +2008,109 @@ def Hexagon_A2_svsubuhs: *********************************************************************/ // ALU64 / ALU / Add. -def Hexagon_A2_addp: +def HEXAGON_A2_addp: di_ALU64_didi <"add", int_hexagon_A2_addp>; -def Hexagon_A2_addsat: +def HEXAGON_A2_addsat: si_ALU64_sisi_sat <"add", int_hexagon_A2_addsat>; // ALU64 / ALU / Add halfword. // Even though the definition says hl, it should be lh - //so DON'T change the class " si_ALU64_sisi_l16_lh " it inherits. -def Hexagon_A2_addh_l16_hl: +def HEXAGON_A2_addh_l16_hl: si_ALU64_sisi_l16_lh <"add", int_hexagon_A2_addh_l16_hl>; -def Hexagon_A2_addh_l16_ll: +def HEXAGON_A2_addh_l16_ll: si_ALU64_sisi_l16_ll <"add", int_hexagon_A2_addh_l16_ll>; -def Hexagon_A2_addh_l16_sat_hl: +def HEXAGON_A2_addh_l16_sat_hl: si_ALU64_sisi_l16_sat_lh <"add", int_hexagon_A2_addh_l16_sat_hl>; -def Hexagon_A2_addh_l16_sat_ll: +def HEXAGON_A2_addh_l16_sat_ll: si_ALU64_sisi_l16_sat_ll <"add", int_hexagon_A2_addh_l16_sat_ll>; -def Hexagon_A2_addh_h16_hh: +def HEXAGON_A2_addh_h16_hh: si_ALU64_sisi_h16_hh <"add", int_hexagon_A2_addh_h16_hh>; -def Hexagon_A2_addh_h16_hl: +def HEXAGON_A2_addh_h16_hl: si_ALU64_sisi_h16_hl <"add", int_hexagon_A2_addh_h16_hl>; -def Hexagon_A2_addh_h16_lh: +def HEXAGON_A2_addh_h16_lh: si_ALU64_sisi_h16_lh <"add", int_hexagon_A2_addh_h16_lh>; -def Hexagon_A2_addh_h16_ll: +def HEXAGON_A2_addh_h16_ll: si_ALU64_sisi_h16_ll <"add", int_hexagon_A2_addh_h16_ll>; -def Hexagon_A2_addh_h16_sat_hh: +def HEXAGON_A2_addh_h16_sat_hh: si_ALU64_sisi_h16_sat_hh <"add", int_hexagon_A2_addh_h16_sat_hh>; -def Hexagon_A2_addh_h16_sat_hl: +def HEXAGON_A2_addh_h16_sat_hl: si_ALU64_sisi_h16_sat_hl <"add", int_hexagon_A2_addh_h16_sat_hl>; -def Hexagon_A2_addh_h16_sat_lh: +def HEXAGON_A2_addh_h16_sat_lh: si_ALU64_sisi_h16_sat_lh <"add", int_hexagon_A2_addh_h16_sat_lh>; -def Hexagon_A2_addh_h16_sat_ll: +def HEXAGON_A2_addh_h16_sat_ll: si_ALU64_sisi_h16_sat_ll <"add", int_hexagon_A2_addh_h16_sat_ll>; // ALU64 / ALU / Compare. -def Hexagon_C2_cmpeqp: +def HEXAGON_C2_cmpeqp: qi_ALU64_didi <"cmp.eq", int_hexagon_C2_cmpeqp>; -def Hexagon_C2_cmpgtp: +def HEXAGON_C2_cmpgtp: qi_ALU64_didi <"cmp.gt", int_hexagon_C2_cmpgtp>; -def Hexagon_C2_cmpgtup: +def HEXAGON_C2_cmpgtup: qi_ALU64_didi <"cmp.gtu", int_hexagon_C2_cmpgtup>; // ALU64 / ALU / Logical operations. -def Hexagon_A2_andp: +def HEXAGON_A2_andp: di_ALU64_didi <"and", int_hexagon_A2_andp>; -def Hexagon_A2_orp: +def HEXAGON_A2_orp: di_ALU64_didi <"or", int_hexagon_A2_orp>; -def Hexagon_A2_xorp: +def HEXAGON_A2_xorp: di_ALU64_didi <"xor", int_hexagon_A2_xorp>; // ALU64 / ALU / Maximum. -def Hexagon_A2_max: +def HEXAGON_A2_max: si_ALU64_sisi <"max", int_hexagon_A2_max>; -def Hexagon_A2_maxu: +def HEXAGON_A2_maxu: si_ALU64_sisi <"maxu", int_hexagon_A2_maxu>; // ALU64 / ALU / Minimum. 
-def Hexagon_A2_min: +def HEXAGON_A2_min: si_ALU64_sisi <"min", int_hexagon_A2_min>; -def Hexagon_A2_minu: +def HEXAGON_A2_minu: si_ALU64_sisi <"minu", int_hexagon_A2_minu>; // ALU64 / ALU / Subtract. -def Hexagon_A2_subp: +def HEXAGON_A2_subp: di_ALU64_didi <"sub", int_hexagon_A2_subp>; -def Hexagon_A2_subsat: +def HEXAGON_A2_subsat: si_ALU64_sisi_sat <"sub", int_hexagon_A2_subsat>; // ALU64 / ALU / Subtract halfword. // Even though the definition says hl, it should be lh - //so DON'T change the class " si_ALU64_sisi_l16_lh " it inherits. -def Hexagon_A2_subh_l16_hl: +def HEXAGON_A2_subh_l16_hl: si_ALU64_sisi_l16_lh <"sub", int_hexagon_A2_subh_l16_hl>; -def Hexagon_A2_subh_l16_ll: +def HEXAGON_A2_subh_l16_ll: si_ALU64_sisi_l16_ll <"sub", int_hexagon_A2_subh_l16_ll>; -def Hexagon_A2_subh_l16_sat_hl: +def HEXAGON_A2_subh_l16_sat_hl: si_ALU64_sisi_l16_sat_lh <"sub", int_hexagon_A2_subh_l16_sat_hl>; -def Hexagon_A2_subh_l16_sat_ll: +def HEXAGON_A2_subh_l16_sat_ll: si_ALU64_sisi_l16_sat_ll <"sub", int_hexagon_A2_subh_l16_sat_ll>; -def Hexagon_A2_subh_h16_hh: +def HEXAGON_A2_subh_h16_hh: si_ALU64_sisi_h16_hh <"sub", int_hexagon_A2_subh_h16_hh>; -def Hexagon_A2_subh_h16_hl: +def HEXAGON_A2_subh_h16_hl: si_ALU64_sisi_h16_hl <"sub", int_hexagon_A2_subh_h16_hl>; -def Hexagon_A2_subh_h16_lh: +def HEXAGON_A2_subh_h16_lh: si_ALU64_sisi_h16_lh <"sub", int_hexagon_A2_subh_h16_lh>; -def Hexagon_A2_subh_h16_ll: +def HEXAGON_A2_subh_h16_ll: si_ALU64_sisi_h16_ll <"sub", int_hexagon_A2_subh_h16_ll>; -def Hexagon_A2_subh_h16_sat_hh: +def HEXAGON_A2_subh_h16_sat_hh: si_ALU64_sisi_h16_sat_hh <"sub", int_hexagon_A2_subh_h16_sat_hh>; -def Hexagon_A2_subh_h16_sat_hl: +def HEXAGON_A2_subh_h16_sat_hl: si_ALU64_sisi_h16_sat_hl <"sub", int_hexagon_A2_subh_h16_sat_hl>; -def Hexagon_A2_subh_h16_sat_lh: +def HEXAGON_A2_subh_h16_sat_lh: si_ALU64_sisi_h16_sat_lh <"sub", int_hexagon_A2_subh_h16_sat_lh>; -def Hexagon_A2_subh_h16_sat_ll: +def HEXAGON_A2_subh_h16_sat_ll: si_ALU64_sisi_h16_sat_ll <"sub", int_hexagon_A2_subh_h16_sat_ll>; // ALU64 / ALU / Transfer register. -def Hexagon_A2_tfrp: +def HEXAGON_A2_tfrp: di_ALU64_di <"", int_hexagon_A2_tfrp>; /******************************************************************** @@ -2087,7 +2118,7 @@ def Hexagon_A2_tfrp: *********************************************************************/ // ALU64 / BIT / Masked parity. -def Hexagon_S2_parityp: +def HEXAGON_S2_parityp: si_ALU64_didi <"parity", int_hexagon_S2_parityp>; /******************************************************************** @@ -2095,7 +2126,7 @@ def Hexagon_S2_parityp: *********************************************************************/ // ALU64 / PERM / Vector pack high and low halfwords. -def Hexagon_S2_packhl: +def HEXAGON_S2_packhl: di_ALU64_sisi <"packhl", int_hexagon_S2_packhl>; /******************************************************************** @@ -2103,37 +2134,37 @@ def Hexagon_S2_packhl: *********************************************************************/ // ALU64 / VB / Vector add unsigned bytes. -def Hexagon_A2_vaddub: +def HEXAGON_A2_vaddub: di_ALU64_didi <"vaddub", int_hexagon_A2_vaddub>; -def Hexagon_A2_vaddubs: +def HEXAGON_A2_vaddubs: di_ALU64_didi_sat <"vaddub", int_hexagon_A2_vaddubs>; // ALU64 / VB / Vector average unsigned bytes. -def Hexagon_A2_vavgub: +def HEXAGON_A2_vavgub: di_ALU64_didi <"vavgub", int_hexagon_A2_vavgub>; -def Hexagon_A2_vavgubr: +def HEXAGON_A2_vavgubr: di_ALU64_didi_rnd <"vavgub", int_hexagon_A2_vavgubr>; // ALU64 / VB / Vector compare unsigned bytes. 
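// A hedged C++ model of the byte-wise compares defined next: each of the
// eight byte lanes of the two 64-bit pairs yields one predicate bit
// (lane ordering assumed; illustrative only, not from this patch).
#include <cstdint>
static inline uint8_t vcmpb_eq(uint64_t rss, uint64_t rtt) {
  uint8_t pred = 0;
  for (int i = 0; i < 8; ++i) {
    uint8_t a = uint8_t(rss >> (8 * i));
    uint8_t b = uint8_t(rtt >> (8 * i));
    if (a == b)
      pred |= uint8_t(1u << i);       // one predicate bit per byte lane
  }
  return pred;
}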
-def Hexagon_A2_vcmpbeq: +def HEXAGON_A2_vcmpbeq: qi_ALU64_didi <"vcmpb.eq", int_hexagon_A2_vcmpbeq>; -def Hexagon_A2_vcmpbgtu: +def HEXAGON_A2_vcmpbgtu: qi_ALU64_didi <"vcmpb.gtu",int_hexagon_A2_vcmpbgtu>; // ALU64 / VB / Vector maximum/minimum unsigned bytes. -def Hexagon_A2_vmaxub: +def HEXAGON_A2_vmaxub: di_ALU64_didi <"vmaxub", int_hexagon_A2_vmaxub>; -def Hexagon_A2_vminub: +def HEXAGON_A2_vminub: di_ALU64_didi <"vminub", int_hexagon_A2_vminub>; // ALU64 / VB / Vector subtract unsigned bytes. -def Hexagon_A2_vsubub: +def HEXAGON_A2_vsubub: di_ALU64_didi <"vsubub", int_hexagon_A2_vsubub>; -def Hexagon_A2_vsububs: +def HEXAGON_A2_vsububs: di_ALU64_didi_sat <"vsubub", int_hexagon_A2_vsububs>; // ALU64 / VB / Vector mux. -def Hexagon_C2_vmux: +def HEXAGON_C2_vmux: di_ALU64_qididi <"vmux", int_hexagon_C2_vmux>; @@ -2143,58 +2174,58 @@ def Hexagon_C2_vmux: // ALU64 / VH / Vector add halfwords. // Rdd64=vadd[u]h(Rss64,Rtt64:sat] -def Hexagon_A2_vaddh: +def HEXAGON_A2_vaddh: di_ALU64_didi <"vaddh", int_hexagon_A2_vaddh>; -def Hexagon_A2_vaddhs: +def HEXAGON_A2_vaddhs: di_ALU64_didi_sat <"vaddh", int_hexagon_A2_vaddhs>; -def Hexagon_A2_vadduhs: +def HEXAGON_A2_vadduhs: di_ALU64_didi_sat <"vadduh", int_hexagon_A2_vadduhs>; // ALU64 / VH / Vector average halfwords. // Rdd64=v[n]avg[u]h(Rss64,Rtt64:rnd/:crnd][:sat] -def Hexagon_A2_vavgh: +def HEXAGON_A2_vavgh: di_ALU64_didi <"vavgh", int_hexagon_A2_vavgh>; -def Hexagon_A2_vavghcr: +def HEXAGON_A2_vavghcr: di_ALU64_didi_crnd <"vavgh", int_hexagon_A2_vavghcr>; -def Hexagon_A2_vavghr: +def HEXAGON_A2_vavghr: di_ALU64_didi_rnd <"vavgh", int_hexagon_A2_vavghr>; -def Hexagon_A2_vavguh: +def HEXAGON_A2_vavguh: di_ALU64_didi <"vavguh", int_hexagon_A2_vavguh>; -def Hexagon_A2_vavguhr: +def HEXAGON_A2_vavguhr: di_ALU64_didi_rnd <"vavguh", int_hexagon_A2_vavguhr>; -def Hexagon_A2_vnavgh: +def HEXAGON_A2_vnavgh: di_ALU64_didi <"vnavgh", int_hexagon_A2_vnavgh>; -def Hexagon_A2_vnavghcr: +def HEXAGON_A2_vnavghcr: di_ALU64_didi_crnd_sat <"vnavgh", int_hexagon_A2_vnavghcr>; -def Hexagon_A2_vnavghr: +def HEXAGON_A2_vnavghr: di_ALU64_didi_rnd_sat <"vnavgh", int_hexagon_A2_vnavghr>; // ALU64 / VH / Vector compare halfwords. -def Hexagon_A2_vcmpheq: +def HEXAGON_A2_vcmpheq: qi_ALU64_didi <"vcmph.eq", int_hexagon_A2_vcmpheq>; -def Hexagon_A2_vcmphgt: +def HEXAGON_A2_vcmphgt: qi_ALU64_didi <"vcmph.gt", int_hexagon_A2_vcmphgt>; -def Hexagon_A2_vcmphgtu: +def HEXAGON_A2_vcmphgtu: qi_ALU64_didi <"vcmph.gtu",int_hexagon_A2_vcmphgtu>; // ALU64 / VH / Vector maximum halfwords. -def Hexagon_A2_vmaxh: +def HEXAGON_A2_vmaxh: di_ALU64_didi <"vmaxh", int_hexagon_A2_vmaxh>; -def Hexagon_A2_vmaxuh: +def HEXAGON_A2_vmaxuh: di_ALU64_didi <"vmaxuh", int_hexagon_A2_vmaxuh>; // ALU64 / VH / Vector minimum halfwords. -def Hexagon_A2_vminh: +def HEXAGON_A2_vminh: di_ALU64_didi <"vminh", int_hexagon_A2_vminh>; -def Hexagon_A2_vminuh: +def HEXAGON_A2_vminuh: di_ALU64_didi <"vminuh", int_hexagon_A2_vminuh>; // ALU64 / VH / Vector subtract halfwords. -def Hexagon_A2_vsubh: +def HEXAGON_A2_vsubh: di_ALU64_didi <"vsubh", int_hexagon_A2_vsubh>; -def Hexagon_A2_vsubhs: +def HEXAGON_A2_vsubhs: di_ALU64_didi_sat <"vsubh", int_hexagon_A2_vsubhs>; -def Hexagon_A2_vsubuhs: +def HEXAGON_A2_vsubuhs: di_ALU64_didi_sat <"vsubuh", int_hexagon_A2_vsubuhs>; @@ -2204,53 +2235,53 @@ def Hexagon_A2_vsubuhs: // ALU64 / VW / Vector add words. 
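// The ":sat" variants below clamp each lane instead of wrapping. A minimal
// sketch of one saturating 32-bit lane add, assuming the usual signed
// saturation to [INT32_MIN, INT32_MAX] (illustrative, not from this patch):
#include <cstdint>
static inline int32_t sat_add32(int32_t a, int32_t b) {
  int64_t s = int64_t(a) + int64_t(b);   // widen, then clamp
  if (s > INT32_MAX) s = INT32_MAX;
  if (s < INT32_MIN) s = INT32_MIN;
  return int32_t(s);
}
// vaddw applies the plain add, and vaddw:sat this clamped add, to each of
// the two 32-bit lanes of the 64-bit register pair.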
// Rdd32=vaddw(Rss32,Rtt32)[:sat] -def Hexagon_A2_vaddw: +def HEXAGON_A2_vaddw: di_ALU64_didi <"vaddw", int_hexagon_A2_vaddw>; -def Hexagon_A2_vaddws: +def HEXAGON_A2_vaddws: di_ALU64_didi_sat <"vaddw", int_hexagon_A2_vaddws>; // ALU64 / VW / Vector average words. -def Hexagon_A2_vavguw: +def HEXAGON_A2_vavguw: di_ALU64_didi <"vavguw", int_hexagon_A2_vavguw>; -def Hexagon_A2_vavguwr: +def HEXAGON_A2_vavguwr: di_ALU64_didi_rnd <"vavguw", int_hexagon_A2_vavguwr>; -def Hexagon_A2_vavgw: +def HEXAGON_A2_vavgw: di_ALU64_didi <"vavgw", int_hexagon_A2_vavgw>; -def Hexagon_A2_vavgwcr: +def HEXAGON_A2_vavgwcr: di_ALU64_didi_crnd <"vavgw", int_hexagon_A2_vavgwcr>; -def Hexagon_A2_vavgwr: +def HEXAGON_A2_vavgwr: di_ALU64_didi_rnd <"vavgw", int_hexagon_A2_vavgwr>; -def Hexagon_A2_vnavgw: +def HEXAGON_A2_vnavgw: di_ALU64_didi <"vnavgw", int_hexagon_A2_vnavgw>; -def Hexagon_A2_vnavgwcr: +def HEXAGON_A2_vnavgwcr: di_ALU64_didi_crnd_sat <"vnavgw", int_hexagon_A2_vnavgwcr>; -def Hexagon_A2_vnavgwr: +def HEXAGON_A2_vnavgwr: di_ALU64_didi_rnd_sat <"vnavgw", int_hexagon_A2_vnavgwr>; // ALU64 / VW / Vector compare words. -def Hexagon_A2_vcmpweq: +def HEXAGON_A2_vcmpweq: qi_ALU64_didi <"vcmpw.eq", int_hexagon_A2_vcmpweq>; -def Hexagon_A2_vcmpwgt: +def HEXAGON_A2_vcmpwgt: qi_ALU64_didi <"vcmpw.gt", int_hexagon_A2_vcmpwgt>; -def Hexagon_A2_vcmpwgtu: +def HEXAGON_A2_vcmpwgtu: qi_ALU64_didi <"vcmpw.gtu",int_hexagon_A2_vcmpwgtu>; // ALU64 / VW / Vector maximum words. -def Hexagon_A2_vmaxw: +def HEXAGON_A2_vmaxw: di_ALU64_didi <"vmaxw", int_hexagon_A2_vmaxw>; -def Hexagon_A2_vmaxuw: +def HEXAGON_A2_vmaxuw: di_ALU64_didi <"vmaxuw", int_hexagon_A2_vmaxuw>; // ALU64 / VW / Vector minimum words. -def Hexagon_A2_vminw: +def HEXAGON_A2_vminw: di_ALU64_didi <"vminw", int_hexagon_A2_vminw>; -def Hexagon_A2_vminuw: +def HEXAGON_A2_vminuw: di_ALU64_didi <"vminuw", int_hexagon_A2_vminuw>; // ALU64 / VW / Vector subtract words. -def Hexagon_A2_vsubw: +def HEXAGON_A2_vsubw: di_ALU64_didi <"vsubw", int_hexagon_A2_vsubw>; -def Hexagon_A2_vsubws: +def HEXAGON_A2_vsubws: di_ALU64_didi_sat <"vsubw", int_hexagon_A2_vsubws>; @@ -2259,25 +2290,25 @@ def Hexagon_A2_vsubws: *********************************************************************/ // CR / Logical reductions on predicates. -def Hexagon_C2_all8: +def HEXAGON_C2_all8: qi_SInst_qi <"all8", int_hexagon_C2_all8>; -def Hexagon_C2_any8: +def HEXAGON_C2_any8: qi_SInst_qi <"any8", int_hexagon_C2_any8>; // CR / Logical operations on predicates. -def Hexagon_C2_pxfer_map: +def HEXAGON_C2_pxfer_map: qi_SInst_qi_pxfer <"", int_hexagon_C2_pxfer_map>; -def Hexagon_C2_and: +def HEXAGON_C2_and: qi_SInst_qiqi <"and", int_hexagon_C2_and>; -def Hexagon_C2_andn: +def HEXAGON_C2_andn: qi_SInst_qiqi_neg <"and", int_hexagon_C2_andn>; -def Hexagon_C2_not: +def HEXAGON_C2_not: qi_SInst_qi <"not", int_hexagon_C2_not>; -def Hexagon_C2_or: +def HEXAGON_C2_or: qi_SInst_qiqi <"or", int_hexagon_C2_or>; -def Hexagon_C2_orn: +def HEXAGON_C2_orn: qi_SInst_qiqi_neg <"or", int_hexagon_C2_orn>; -def Hexagon_C2_xor: +def HEXAGON_C2_xor: qi_SInst_qiqi <"xor", int_hexagon_C2_xor>; @@ -2286,27 +2317,27 @@ def Hexagon_C2_xor: *********************************************************************/ // MTYPE / ALU / Add and accumulate. 
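// In the class names that follow, _acc and _nac denote accumulate and
// negative-accumulate: Rx += op(...) and Rx -= op(...) respectively. A
// two-line C++ sketch of the convention (assumed; illustrative only):
static inline int acc(int rx, int rs, int rt) { return rx + (rs + rt); }
static inline int nac(int rx, int rs, int rt) { return rx - (rs + rt); }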
-def Hexagon_M2_acci: +def HEXAGON_M2_acci: si_MInst_sisisi_acc <"add", int_hexagon_M2_acci>; -def Hexagon_M2_accii: +def HEXAGON_M2_accii: si_MInst_sisis8_acc <"add", int_hexagon_M2_accii>; -def Hexagon_M2_nacci: +def HEXAGON_M2_nacci: si_MInst_sisisi_nac <"add", int_hexagon_M2_nacci>; -def Hexagon_M2_naccii: +def HEXAGON_M2_naccii: si_MInst_sisis8_nac <"add", int_hexagon_M2_naccii>; // MTYPE / ALU / Subtract and accumulate. -def Hexagon_M2_subacc: +def HEXAGON_M2_subacc: si_MInst_sisisi_acc <"sub", int_hexagon_M2_subacc>; // MTYPE / ALU / Vector absolute difference. -def Hexagon_M2_vabsdiffh: +def HEXAGON_M2_vabsdiffh: di_MInst_didi <"vabsdiffh",int_hexagon_M2_vabsdiffh>; -def Hexagon_M2_vabsdiffw: +def HEXAGON_M2_vabsdiffw: di_MInst_didi <"vabsdiffw",int_hexagon_M2_vabsdiffw>; // MTYPE / ALU / XOR and xor with destination. -def Hexagon_M2_xor_xacc: +def HEXAGON_M2_xor_xacc: si_MInst_sisisi_xacc <"xor", int_hexagon_M2_xor_xacc>; @@ -2316,91 +2347,91 @@ def Hexagon_M2_xor_xacc: // MTYPE / COMPLEX / Complex multiply. // Rdd[-+]=cmpy(Rs, Rt:<<1]:sat -def Hexagon_M2_cmpys_s1: +def HEXAGON_M2_cmpys_s1: di_MInst_sisi_s1_sat <"cmpy", int_hexagon_M2_cmpys_s1>; -def Hexagon_M2_cmpys_s0: +def HEXAGON_M2_cmpys_s0: di_MInst_sisi_sat <"cmpy", int_hexagon_M2_cmpys_s0>; -def Hexagon_M2_cmpysc_s1: +def HEXAGON_M2_cmpysc_s1: di_MInst_sisi_s1_sat_conj <"cmpy", int_hexagon_M2_cmpysc_s1>; -def Hexagon_M2_cmpysc_s0: +def HEXAGON_M2_cmpysc_s0: di_MInst_sisi_sat_conj <"cmpy", int_hexagon_M2_cmpysc_s0>; -def Hexagon_M2_cmacs_s1: +def HEXAGON_M2_cmacs_s1: di_MInst_disisi_acc_s1_sat <"cmpy", int_hexagon_M2_cmacs_s1>; -def Hexagon_M2_cmacs_s0: +def HEXAGON_M2_cmacs_s0: di_MInst_disisi_acc_sat <"cmpy", int_hexagon_M2_cmacs_s0>; -def Hexagon_M2_cmacsc_s1: +def HEXAGON_M2_cmacsc_s1: di_MInst_disisi_acc_s1_sat_conj <"cmpy", int_hexagon_M2_cmacsc_s1>; -def Hexagon_M2_cmacsc_s0: +def HEXAGON_M2_cmacsc_s0: di_MInst_disisi_acc_sat_conj <"cmpy", int_hexagon_M2_cmacsc_s0>; -def Hexagon_M2_cnacs_s1: +def HEXAGON_M2_cnacs_s1: di_MInst_disisi_nac_s1_sat <"cmpy", int_hexagon_M2_cnacs_s1>; -def Hexagon_M2_cnacs_s0: +def HEXAGON_M2_cnacs_s0: di_MInst_disisi_nac_sat <"cmpy", int_hexagon_M2_cnacs_s0>; -def Hexagon_M2_cnacsc_s1: +def HEXAGON_M2_cnacsc_s1: di_MInst_disisi_nac_s1_sat_conj <"cmpy", int_hexagon_M2_cnacsc_s1>; -def Hexagon_M2_cnacsc_s0: +def HEXAGON_M2_cnacsc_s0: di_MInst_disisi_nac_sat_conj <"cmpy", int_hexagon_M2_cnacsc_s0>; // MTYPE / COMPLEX / Complex multiply real or imaginary. -def Hexagon_M2_cmpyr_s0: +def HEXAGON_M2_cmpyr_s0: di_MInst_sisi <"cmpyr", int_hexagon_M2_cmpyr_s0>; -def Hexagon_M2_cmacr_s0: +def HEXAGON_M2_cmacr_s0: di_MInst_disisi_acc <"cmpyr", int_hexagon_M2_cmacr_s0>; -def Hexagon_M2_cmpyi_s0: +def HEXAGON_M2_cmpyi_s0: di_MInst_sisi <"cmpyi", int_hexagon_M2_cmpyi_s0>; -def Hexagon_M2_cmaci_s0: +def HEXAGON_M2_cmaci_s0: di_MInst_disisi_acc <"cmpyi", int_hexagon_M2_cmaci_s0>; // MTYPE / COMPLEX / Complex multiply with round and pack. // Rxx32+=cmpy(Rs32,[*]Rt32:<<1]:rnd:sat -def Hexagon_M2_cmpyrs_s0: +def HEXAGON_M2_cmpyrs_s0: si_MInst_sisi_rnd_sat <"cmpy", int_hexagon_M2_cmpyrs_s0>; -def Hexagon_M2_cmpyrs_s1: +def HEXAGON_M2_cmpyrs_s1: si_MInst_sisi_s1_rnd_sat <"cmpy", int_hexagon_M2_cmpyrs_s1>; -def Hexagon_M2_cmpyrsc_s0: +def HEXAGON_M2_cmpyrsc_s0: si_MInst_sisi_rnd_sat_conj <"cmpy", int_hexagon_M2_cmpyrsc_s0>; -def Hexagon_M2_cmpyrsc_s1: +def HEXAGON_M2_cmpyrsc_s1: si_MInst_sisi_s1_rnd_sat_conj <"cmpy", int_hexagon_M2_cmpyrsc_s1>; //MTYPE / COMPLEX / Vector complex multiply real or imaginary. 
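// A hedged sketch of the per-lane complex product behind the vcmpyr/vcmpyi
// defs below: for (a + bi)(c + di), the real part is a*c - b*d and the
// imaginary part a*d + b*c. Layout assumption (illustrative, not from this
// patch): real in bits 15:0, imaginary in bits 31:16 of each 32-bit lane.
#include <cstdint>
static inline int64_t cmpy_real(int32_t rs, int32_t rt) {
  int16_t a = int16_t(rs), b = int16_t(rs >> 16);
  int16_t c = int16_t(rt), d = int16_t(rt >> 16);
  return int64_t(a) * c - int64_t(b) * d;   // real part
}
static inline int64_t cmpy_imag(int32_t rs, int32_t rt) {
  int16_t a = int16_t(rs), b = int16_t(rs >> 16);
  int16_t c = int16_t(rt), d = int16_t(rt >> 16);
  return int64_t(a) * d + int64_t(b) * c;   // imaginary part
}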
-def Hexagon_M2_vcmpy_s0_sat_i: +def HEXAGON_M2_vcmpy_s0_sat_i: di_MInst_didi_sat <"vcmpyi", int_hexagon_M2_vcmpy_s0_sat_i>; -def Hexagon_M2_vcmpy_s1_sat_i: +def HEXAGON_M2_vcmpy_s1_sat_i: di_MInst_didi_s1_sat <"vcmpyi", int_hexagon_M2_vcmpy_s1_sat_i>; -def Hexagon_M2_vcmpy_s0_sat_r: +def HEXAGON_M2_vcmpy_s0_sat_r: di_MInst_didi_sat <"vcmpyr", int_hexagon_M2_vcmpy_s0_sat_r>; -def Hexagon_M2_vcmpy_s1_sat_r: +def HEXAGON_M2_vcmpy_s1_sat_r: di_MInst_didi_s1_sat <"vcmpyr", int_hexagon_M2_vcmpy_s1_sat_r>; -def Hexagon_M2_vcmac_s0_sat_i: +def HEXAGON_M2_vcmac_s0_sat_i: di_MInst_dididi_acc_sat <"vcmpyi", int_hexagon_M2_vcmac_s0_sat_i>; -def Hexagon_M2_vcmac_s0_sat_r: +def HEXAGON_M2_vcmac_s0_sat_r: di_MInst_dididi_acc_sat <"vcmpyr", int_hexagon_M2_vcmac_s0_sat_r>; //MTYPE / COMPLEX / Vector reduce complex multiply real or imaginary. -def Hexagon_M2_vrcmpyi_s0: +def HEXAGON_M2_vrcmpyi_s0: di_MInst_didi <"vrcmpyi", int_hexagon_M2_vrcmpyi_s0>; -def Hexagon_M2_vrcmpyr_s0: +def HEXAGON_M2_vrcmpyr_s0: di_MInst_didi <"vrcmpyr", int_hexagon_M2_vrcmpyr_s0>; -def Hexagon_M2_vrcmpyi_s0c: +def HEXAGON_M2_vrcmpyi_s0c: di_MInst_didi_conj <"vrcmpyi", int_hexagon_M2_vrcmpyi_s0c>; -def Hexagon_M2_vrcmpyr_s0c: +def HEXAGON_M2_vrcmpyr_s0c: di_MInst_didi_conj <"vrcmpyr", int_hexagon_M2_vrcmpyr_s0c>; -def Hexagon_M2_vrcmaci_s0: +def HEXAGON_M2_vrcmaci_s0: di_MInst_dididi_acc <"vrcmpyi", int_hexagon_M2_vrcmaci_s0>; -def Hexagon_M2_vrcmacr_s0: +def HEXAGON_M2_vrcmacr_s0: di_MInst_dididi_acc <"vrcmpyr", int_hexagon_M2_vrcmacr_s0>; -def Hexagon_M2_vrcmaci_s0c: +def HEXAGON_M2_vrcmaci_s0c: di_MInst_dididi_acc_conj <"vrcmpyi", int_hexagon_M2_vrcmaci_s0c>; -def Hexagon_M2_vrcmacr_s0c: +def HEXAGON_M2_vrcmacr_s0c: di_MInst_dididi_acc_conj <"vrcmpyr", int_hexagon_M2_vrcmacr_s0c>; @@ -2409,115 +2440,120 @@ def Hexagon_M2_vrcmacr_s0c: *********************************************************************/ // MTYPE / MPYH / Multiply and use lower result. -//def Hexagon_M2_mpysmi: +//FIXME: Hexagon_M2_mpysmi should really be of the type si_MInst_sim9, +// not si_MInst_sis9 - but for now, we will use s9. +// def Hexagon_M2_mpysmi: // si_MInst_sim9 <"mpyi", int_hexagon_M2_mpysmi>; -def Hexagon_M2_mpyi: +def Hexagon_M2_mpysmi: + si_MInst_sis9 <"mpyi", int_hexagon_M2_mpysmi>; +def HEXAGON_M2_mpyi: si_MInst_sisi <"mpyi", int_hexagon_M2_mpyi>; -def Hexagon_M2_mpyui: +def HEXAGON_M2_mpyui: si_MInst_sisi <"mpyui", int_hexagon_M2_mpyui>; -def Hexagon_M2_macsip: +def HEXAGON_M2_macsip: si_MInst_sisiu8_acc <"mpyi", int_hexagon_M2_macsip>; -def Hexagon_M2_maci: +def HEXAGON_M2_maci: si_MInst_sisisi_acc <"mpyi", int_hexagon_M2_maci>; -def Hexagon_M2_macsin: +def HEXAGON_M2_macsin: si_MInst_sisiu8_nac <"mpyi", int_hexagon_M2_macsin>; // MTYPE / MPYH / Multiply word by half (32x16).
//Rdd[+]=vmpywoh(Rss,Rtt)[:<<1][:rnd][:sat] //Rdd[+]=vmpyweh(Rss,Rtt)[:<<1][:rnd][:sat] -def Hexagon_M2_mmpyl_rs1: +def HEXAGON_M2_mmpyl_rs1: di_MInst_didi_s1_rnd_sat <"vmpyweh", int_hexagon_M2_mmpyl_rs1>; -def Hexagon_M2_mmpyl_s1: +def HEXAGON_M2_mmpyl_s1: di_MInst_didi_s1_sat <"vmpyweh", int_hexagon_M2_mmpyl_s1>; -def Hexagon_M2_mmpyl_rs0: +def HEXAGON_M2_mmpyl_rs0: di_MInst_didi_rnd_sat <"vmpyweh", int_hexagon_M2_mmpyl_rs0>; -def Hexagon_M2_mmpyl_s0: +def HEXAGON_M2_mmpyl_s0: di_MInst_didi_sat <"vmpyweh", int_hexagon_M2_mmpyl_s0>; -def Hexagon_M2_mmpyh_rs1: +def HEXAGON_M2_mmpyh_rs1: di_MInst_didi_s1_rnd_sat <"vmpywoh", int_hexagon_M2_mmpyh_rs1>; -def Hexagon_M2_mmpyh_s1: +def HEXAGON_M2_mmpyh_s1: di_MInst_didi_s1_sat <"vmpywoh", int_hexagon_M2_mmpyh_s1>; -def Hexagon_M2_mmpyh_rs0: +def HEXAGON_M2_mmpyh_rs0: di_MInst_didi_rnd_sat <"vmpywoh", int_hexagon_M2_mmpyh_rs0>; -def Hexagon_M2_mmpyh_s0: +def HEXAGON_M2_mmpyh_s0: di_MInst_didi_sat <"vmpywoh", int_hexagon_M2_mmpyh_s0>; -def Hexagon_M2_mmacls_rs1: +def HEXAGON_M2_mmacls_rs1: di_MInst_dididi_acc_s1_rnd_sat <"vmpyweh", int_hexagon_M2_mmacls_rs1>; -def Hexagon_M2_mmacls_s1: +def HEXAGON_M2_mmacls_s1: di_MInst_dididi_acc_s1_sat <"vmpyweh", int_hexagon_M2_mmacls_s1>; -def Hexagon_M2_mmacls_rs0: +def HEXAGON_M2_mmacls_rs0: di_MInst_dididi_acc_rnd_sat <"vmpyweh", int_hexagon_M2_mmacls_rs0>; -def Hexagon_M2_mmacls_s0: +def HEXAGON_M2_mmacls_s0: di_MInst_dididi_acc_sat <"vmpyweh", int_hexagon_M2_mmacls_s0>; -def Hexagon_M2_mmachs_rs1: +def HEXAGON_M2_mmachs_rs1: di_MInst_dididi_acc_s1_rnd_sat <"vmpywoh", int_hexagon_M2_mmachs_rs1>; -def Hexagon_M2_mmachs_s1: +def HEXAGON_M2_mmachs_s1: di_MInst_dididi_acc_s1_sat <"vmpywoh", int_hexagon_M2_mmachs_s1>; -def Hexagon_M2_mmachs_rs0: +def HEXAGON_M2_mmachs_rs0: di_MInst_dididi_acc_rnd_sat <"vmpywoh", int_hexagon_M2_mmachs_rs0>; -def Hexagon_M2_mmachs_s0: +def HEXAGON_M2_mmachs_s0: di_MInst_dididi_acc_sat <"vmpywoh", int_hexagon_M2_mmachs_s0>; // MTYPE / MPYH / Multiply word by unsigned half (32x16). 
//Rdd[+]=vmpywouh(Rss,Rtt)[:<<1][:rnd][:sat] //Rdd[+]=vmpyweuh(Rss,Rtt)[:<<1][:rnd][:sat] -def Hexagon_M2_mmpyul_rs1: +def HEXAGON_M2_mmpyul_rs1: di_MInst_didi_s1_rnd_sat <"vmpyweuh", int_hexagon_M2_mmpyul_rs1>; -def Hexagon_M2_mmpyul_s1: +def HEXAGON_M2_mmpyul_s1: di_MInst_didi_s1_sat <"vmpyweuh", int_hexagon_M2_mmpyul_s1>; -def Hexagon_M2_mmpyul_rs0: +def HEXAGON_M2_mmpyul_rs0: di_MInst_didi_rnd_sat <"vmpyweuh", int_hexagon_M2_mmpyul_rs0>; -def Hexagon_M2_mmpyul_s0: +def HEXAGON_M2_mmpyul_s0: di_MInst_didi_sat <"vmpyweuh", int_hexagon_M2_mmpyul_s0>; -def Hexagon_M2_mmpyuh_rs1: +def HEXAGON_M2_mmpyuh_rs1: di_MInst_didi_s1_rnd_sat <"vmpywouh", int_hexagon_M2_mmpyuh_rs1>; -def Hexagon_M2_mmpyuh_s1: +def HEXAGON_M2_mmpyuh_s1: di_MInst_didi_s1_sat <"vmpywouh", int_hexagon_M2_mmpyuh_s1>; -def Hexagon_M2_mmpyuh_rs0: +def HEXAGON_M2_mmpyuh_rs0: di_MInst_didi_rnd_sat <"vmpywouh", int_hexagon_M2_mmpyuh_rs0>; -def Hexagon_M2_mmpyuh_s0: +def HEXAGON_M2_mmpyuh_s0: di_MInst_didi_sat <"vmpywouh", int_hexagon_M2_mmpyuh_s0>; -def Hexagon_M2_mmaculs_rs1: +def HEXAGON_M2_mmaculs_rs1: di_MInst_dididi_acc_s1_rnd_sat <"vmpyweuh", int_hexagon_M2_mmaculs_rs1>; -def Hexagon_M2_mmaculs_s1: +def HEXAGON_M2_mmaculs_s1: di_MInst_dididi_acc_s1_sat <"vmpyweuh", int_hexagon_M2_mmaculs_s1>; -def Hexagon_M2_mmaculs_rs0: +def HEXAGON_M2_mmaculs_rs0: di_MInst_dididi_acc_rnd_sat <"vmpyweuh", int_hexagon_M2_mmaculs_rs0>; -def Hexagon_M2_mmaculs_s0: +def HEXAGON_M2_mmaculs_s0: di_MInst_dididi_acc_sat <"vmpyweuh", int_hexagon_M2_mmaculs_s0>; -def Hexagon_M2_mmacuhs_rs1: +def HEXAGON_M2_mmacuhs_rs1: di_MInst_dididi_acc_s1_rnd_sat <"vmpywouh", int_hexagon_M2_mmacuhs_rs1>; -def Hexagon_M2_mmacuhs_s1: +def HEXAGON_M2_mmacuhs_s1: di_MInst_dididi_acc_s1_sat <"vmpywouh", int_hexagon_M2_mmacuhs_s1>; -def Hexagon_M2_mmacuhs_rs0: +def HEXAGON_M2_mmacuhs_rs0: di_MInst_dididi_acc_rnd_sat <"vmpywouh", int_hexagon_M2_mmacuhs_rs0>; -def Hexagon_M2_mmacuhs_s0: +def HEXAGON_M2_mmacuhs_s0: di_MInst_dididi_acc_sat <"vmpywouh", int_hexagon_M2_mmacuhs_s0>; // MTYPE / MPYH / Multiply and use upper result. -def Hexagon_M2_hmmpyh_rs1: +def HEXAGON_M2_hmmpyh_rs1: si_MInst_sisi_h_s1_rnd_sat <"mpy", int_hexagon_M2_hmmpyh_rs1>; -def Hexagon_M2_hmmpyl_rs1: +def HEXAGON_M2_hmmpyl_rs1: si_MInst_sisi_l_s1_rnd_sat <"mpy", int_hexagon_M2_hmmpyl_rs1>; -def Hexagon_M2_mpy_up: +def HEXAGON_M2_mpy_up: si_MInst_sisi <"mpy", int_hexagon_M2_mpy_up>; -def Hexagon_M2_dpmpyss_rnd_s0: +def HEXAGON_M2_dpmpyss_rnd_s0: si_MInst_sisi_rnd <"mpy", int_hexagon_M2_dpmpyss_rnd_s0>; -def Hexagon_M2_mpyu_up: +def HEXAGON_M2_mpyu_up: si_MInst_sisi <"mpyu", int_hexagon_M2_mpyu_up>; // MTYPE / MPYH / Multiply and use full result. -def Hexagon_M2_dpmpyuu_s0: +def HEXAGON_M2_dpmpyuu_s0: di_MInst_sisi <"mpyu", int_hexagon_M2_dpmpyuu_s0>; -def Hexagon_M2_dpmpyuu_acc_s0: +def HEXAGON_M2_dpmpyuu_acc_s0: di_MInst_disisi_acc <"mpyu", int_hexagon_M2_dpmpyuu_acc_s0>; -def Hexagon_M2_dpmpyuu_nac_s0: +def HEXAGON_M2_dpmpyuu_nac_s0: di_MInst_disisi_nac <"mpyu", int_hexagon_M2_dpmpyuu_nac_s0>; -def Hexagon_M2_dpmpyss_s0: +def HEXAGON_M2_dpmpyss_s0: di_MInst_sisi <"mpy", int_hexagon_M2_dpmpyss_s0>; -def Hexagon_M2_dpmpyss_acc_s0: +def HEXAGON_M2_dpmpyss_acc_s0: di_MInst_disisi_acc <"mpy", int_hexagon_M2_dpmpyss_acc_s0>; -def Hexagon_M2_dpmpyss_nac_s0: +def HEXAGON_M2_dpmpyss_nac_s0: di_MInst_disisi_nac <"mpy", int_hexagon_M2_dpmpyss_nac_s0>; @@ -2528,334 +2564,334 @@ def Hexagon_M2_dpmpyss_nac_s0: // MTYPE / MPYS / Scalar 16x16 multiply signed. 
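// The hh/hl/lh/ll suffixes in the defs below pick a signed 16-bit half of
// each source ("h" = bits 31:16, "l" = bits 15:0); :<<1 shifts the product
// left by one, :rnd rounds it, and :sat clamps to the signed 32-bit range.
// A sketch of the base operation (assumed mapping; illustrative, not from
// this patch):
#include <cstdint>
static inline int16_t half(int32_t r, bool hi) {
  return int16_t(hi ? (r >> 16) : r);
}
static inline int32_t mpy(int32_t rs, bool rs_hi, int32_t rt, bool rt_hi) {
  return int32_t(half(rs, rs_hi)) * int32_t(half(rt, rt_hi)); // 16x16 -> 32
}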
//Rd=mpy(Rs.[H|L],Rt.[H|L:<<0|:<<1]| // [:<<0[:rnd|:sat|:rnd:sat]|:<<1[:rnd|:sat|:rnd:sat]]] -def Hexagon_M2_mpy_hh_s0: +def HEXAGON_M2_mpy_hh_s0: si_MInst_sisi_hh <"mpy", int_hexagon_M2_mpy_hh_s0>; -def Hexagon_M2_mpy_hh_s1: +def HEXAGON_M2_mpy_hh_s1: si_MInst_sisi_hh_s1 <"mpy", int_hexagon_M2_mpy_hh_s1>; -def Hexagon_M2_mpy_rnd_hh_s1: +def HEXAGON_M2_mpy_rnd_hh_s1: si_MInst_sisi_rnd_hh_s1 <"mpy", int_hexagon_M2_mpy_rnd_hh_s1>; -def Hexagon_M2_mpy_sat_rnd_hh_s1: +def HEXAGON_M2_mpy_sat_rnd_hh_s1: si_MInst_sisi_sat_rnd_hh_s1 <"mpy", int_hexagon_M2_mpy_sat_rnd_hh_s1>; -def Hexagon_M2_mpy_sat_hh_s1: +def HEXAGON_M2_mpy_sat_hh_s1: si_MInst_sisi_sat_hh_s1 <"mpy", int_hexagon_M2_mpy_sat_hh_s1>; -def Hexagon_M2_mpy_rnd_hh_s0: +def HEXAGON_M2_mpy_rnd_hh_s0: si_MInst_sisi_rnd_hh <"mpy", int_hexagon_M2_mpy_rnd_hh_s0>; -def Hexagon_M2_mpy_sat_rnd_hh_s0: +def HEXAGON_M2_mpy_sat_rnd_hh_s0: si_MInst_sisi_sat_rnd_hh <"mpy", int_hexagon_M2_mpy_sat_rnd_hh_s0>; -def Hexagon_M2_mpy_sat_hh_s0: +def HEXAGON_M2_mpy_sat_hh_s0: si_MInst_sisi_sat_hh <"mpy", int_hexagon_M2_mpy_sat_hh_s0>; -def Hexagon_M2_mpy_hl_s0: +def HEXAGON_M2_mpy_hl_s0: si_MInst_sisi_hl <"mpy", int_hexagon_M2_mpy_hl_s0>; -def Hexagon_M2_mpy_hl_s1: +def HEXAGON_M2_mpy_hl_s1: si_MInst_sisi_hl_s1 <"mpy", int_hexagon_M2_mpy_hl_s1>; -def Hexagon_M2_mpy_rnd_hl_s1: +def HEXAGON_M2_mpy_rnd_hl_s1: si_MInst_sisi_rnd_hl_s1 <"mpy", int_hexagon_M2_mpy_rnd_hl_s1>; -def Hexagon_M2_mpy_sat_rnd_hl_s1: +def HEXAGON_M2_mpy_sat_rnd_hl_s1: si_MInst_sisi_sat_rnd_hl_s1 <"mpy", int_hexagon_M2_mpy_sat_rnd_hl_s1>; -def Hexagon_M2_mpy_sat_hl_s1: +def HEXAGON_M2_mpy_sat_hl_s1: si_MInst_sisi_sat_hl_s1 <"mpy", int_hexagon_M2_mpy_sat_hl_s1>; -def Hexagon_M2_mpy_rnd_hl_s0: +def HEXAGON_M2_mpy_rnd_hl_s0: si_MInst_sisi_rnd_hl <"mpy", int_hexagon_M2_mpy_rnd_hl_s0>; -def Hexagon_M2_mpy_sat_rnd_hl_s0: +def HEXAGON_M2_mpy_sat_rnd_hl_s0: si_MInst_sisi_sat_rnd_hl <"mpy", int_hexagon_M2_mpy_sat_rnd_hl_s0>; -def Hexagon_M2_mpy_sat_hl_s0: +def HEXAGON_M2_mpy_sat_hl_s0: si_MInst_sisi_sat_hl <"mpy", int_hexagon_M2_mpy_sat_hl_s0>; -def Hexagon_M2_mpy_lh_s0: +def HEXAGON_M2_mpy_lh_s0: si_MInst_sisi_lh <"mpy", int_hexagon_M2_mpy_lh_s0>; -def Hexagon_M2_mpy_lh_s1: +def HEXAGON_M2_mpy_lh_s1: si_MInst_sisi_lh_s1 <"mpy", int_hexagon_M2_mpy_lh_s1>; -def Hexagon_M2_mpy_rnd_lh_s1: +def HEXAGON_M2_mpy_rnd_lh_s1: si_MInst_sisi_rnd_lh_s1 <"mpy", int_hexagon_M2_mpy_rnd_lh_s1>; -def Hexagon_M2_mpy_sat_rnd_lh_s1: +def HEXAGON_M2_mpy_sat_rnd_lh_s1: si_MInst_sisi_sat_rnd_lh_s1 <"mpy", int_hexagon_M2_mpy_sat_rnd_lh_s1>; -def Hexagon_M2_mpy_sat_lh_s1: +def HEXAGON_M2_mpy_sat_lh_s1: si_MInst_sisi_sat_lh_s1 <"mpy", int_hexagon_M2_mpy_sat_lh_s1>; -def Hexagon_M2_mpy_rnd_lh_s0: +def HEXAGON_M2_mpy_rnd_lh_s0: si_MInst_sisi_rnd_lh <"mpy", int_hexagon_M2_mpy_rnd_lh_s0>; -def Hexagon_M2_mpy_sat_rnd_lh_s0: +def HEXAGON_M2_mpy_sat_rnd_lh_s0: si_MInst_sisi_sat_rnd_lh <"mpy", int_hexagon_M2_mpy_sat_rnd_lh_s0>; -def Hexagon_M2_mpy_sat_lh_s0: +def HEXAGON_M2_mpy_sat_lh_s0: si_MInst_sisi_sat_lh <"mpy", int_hexagon_M2_mpy_sat_lh_s0>; -def Hexagon_M2_mpy_ll_s0: +def HEXAGON_M2_mpy_ll_s0: si_MInst_sisi_ll <"mpy", int_hexagon_M2_mpy_ll_s0>; -def Hexagon_M2_mpy_ll_s1: +def HEXAGON_M2_mpy_ll_s1: si_MInst_sisi_ll_s1 <"mpy", int_hexagon_M2_mpy_ll_s1>; -def Hexagon_M2_mpy_rnd_ll_s1: +def HEXAGON_M2_mpy_rnd_ll_s1: si_MInst_sisi_rnd_ll_s1 <"mpy", int_hexagon_M2_mpy_rnd_ll_s1>; -def Hexagon_M2_mpy_sat_rnd_ll_s1: +def HEXAGON_M2_mpy_sat_rnd_ll_s1: si_MInst_sisi_sat_rnd_ll_s1 <"mpy", int_hexagon_M2_mpy_sat_rnd_ll_s1>; -def 
Hexagon_M2_mpy_sat_ll_s1: +def HEXAGON_M2_mpy_sat_ll_s1: si_MInst_sisi_sat_ll_s1 <"mpy", int_hexagon_M2_mpy_sat_ll_s1>; -def Hexagon_M2_mpy_rnd_ll_s0: +def HEXAGON_M2_mpy_rnd_ll_s0: si_MInst_sisi_rnd_ll <"mpy", int_hexagon_M2_mpy_rnd_ll_s0>; -def Hexagon_M2_mpy_sat_rnd_ll_s0: +def HEXAGON_M2_mpy_sat_rnd_ll_s0: si_MInst_sisi_sat_rnd_ll <"mpy", int_hexagon_M2_mpy_sat_rnd_ll_s0>; -def Hexagon_M2_mpy_sat_ll_s0: +def HEXAGON_M2_mpy_sat_ll_s0: si_MInst_sisi_sat_ll <"mpy", int_hexagon_M2_mpy_sat_ll_s0>; //Rdd=mpy(Rs.[H|L],Rt.[H|L])[[:<<0|:<<1]|[:<<0:rnd|:<<1:rnd]] -def Hexagon_M2_mpyd_hh_s0: +def HEXAGON_M2_mpyd_hh_s0: di_MInst_sisi_hh <"mpy", int_hexagon_M2_mpyd_hh_s0>; -def Hexagon_M2_mpyd_hh_s1: +def HEXAGON_M2_mpyd_hh_s1: di_MInst_sisi_hh_s1 <"mpy", int_hexagon_M2_mpyd_hh_s1>; -def Hexagon_M2_mpyd_rnd_hh_s1: +def HEXAGON_M2_mpyd_rnd_hh_s1: di_MInst_sisi_rnd_hh_s1 <"mpy", int_hexagon_M2_mpyd_rnd_hh_s1>; -def Hexagon_M2_mpyd_rnd_hh_s0: +def HEXAGON_M2_mpyd_rnd_hh_s0: di_MInst_sisi_rnd_hh <"mpy", int_hexagon_M2_mpyd_rnd_hh_s0>; -def Hexagon_M2_mpyd_hl_s0: +def HEXAGON_M2_mpyd_hl_s0: di_MInst_sisi_hl <"mpy", int_hexagon_M2_mpyd_hl_s0>; -def Hexagon_M2_mpyd_hl_s1: +def HEXAGON_M2_mpyd_hl_s1: di_MInst_sisi_hl_s1 <"mpy", int_hexagon_M2_mpyd_hl_s1>; -def Hexagon_M2_mpyd_rnd_hl_s1: +def HEXAGON_M2_mpyd_rnd_hl_s1: di_MInst_sisi_rnd_hl_s1 <"mpy", int_hexagon_M2_mpyd_rnd_hl_s1>; -def Hexagon_M2_mpyd_rnd_hl_s0: +def HEXAGON_M2_mpyd_rnd_hl_s0: di_MInst_sisi_rnd_hl <"mpy", int_hexagon_M2_mpyd_rnd_hl_s0>; -def Hexagon_M2_mpyd_lh_s0: +def HEXAGON_M2_mpyd_lh_s0: di_MInst_sisi_lh <"mpy", int_hexagon_M2_mpyd_lh_s0>; -def Hexagon_M2_mpyd_lh_s1: +def HEXAGON_M2_mpyd_lh_s1: di_MInst_sisi_lh_s1 <"mpy", int_hexagon_M2_mpyd_lh_s1>; -def Hexagon_M2_mpyd_rnd_lh_s1: +def HEXAGON_M2_mpyd_rnd_lh_s1: di_MInst_sisi_rnd_lh_s1 <"mpy", int_hexagon_M2_mpyd_rnd_lh_s1>; -def Hexagon_M2_mpyd_rnd_lh_s0: +def HEXAGON_M2_mpyd_rnd_lh_s0: di_MInst_sisi_rnd_lh <"mpy", int_hexagon_M2_mpyd_rnd_lh_s0>; -def Hexagon_M2_mpyd_ll_s0: +def HEXAGON_M2_mpyd_ll_s0: di_MInst_sisi_ll <"mpy", int_hexagon_M2_mpyd_ll_s0>; -def Hexagon_M2_mpyd_ll_s1: +def HEXAGON_M2_mpyd_ll_s1: di_MInst_sisi_ll_s1 <"mpy", int_hexagon_M2_mpyd_ll_s1>; -def Hexagon_M2_mpyd_rnd_ll_s1: +def HEXAGON_M2_mpyd_rnd_ll_s1: di_MInst_sisi_rnd_ll_s1 <"mpy", int_hexagon_M2_mpyd_rnd_ll_s1>; -def Hexagon_M2_mpyd_rnd_ll_s0: +def HEXAGON_M2_mpyd_rnd_ll_s0: di_MInst_sisi_rnd_ll <"mpy", int_hexagon_M2_mpyd_rnd_ll_s0>; //Rx+=mpy(Rs.[H|L],Rt.[H|L])[[[:<<0|:<<1]|[:<<0:sat|:<<1:sat]] -def Hexagon_M2_mpy_acc_hh_s0: +def HEXAGON_M2_mpy_acc_hh_s0: si_MInst_sisisi_acc_hh <"mpy", int_hexagon_M2_mpy_acc_hh_s0>; -def Hexagon_M2_mpy_acc_hh_s1: +def HEXAGON_M2_mpy_acc_hh_s1: si_MInst_sisisi_acc_hh_s1 <"mpy", int_hexagon_M2_mpy_acc_hh_s1>; -def Hexagon_M2_mpy_acc_sat_hh_s1: +def HEXAGON_M2_mpy_acc_sat_hh_s1: si_MInst_sisisi_acc_sat_hh_s1 <"mpy", int_hexagon_M2_mpy_acc_sat_hh_s1>; -def Hexagon_M2_mpy_acc_sat_hh_s0: +def HEXAGON_M2_mpy_acc_sat_hh_s0: si_MInst_sisisi_acc_sat_hh <"mpy", int_hexagon_M2_mpy_acc_sat_hh_s0>; -def Hexagon_M2_mpy_acc_hl_s0: +def HEXAGON_M2_mpy_acc_hl_s0: si_MInst_sisisi_acc_hl <"mpy", int_hexagon_M2_mpy_acc_hl_s0>; -def Hexagon_M2_mpy_acc_hl_s1: +def HEXAGON_M2_mpy_acc_hl_s1: si_MInst_sisisi_acc_hl_s1 <"mpy", int_hexagon_M2_mpy_acc_hl_s1>; -def Hexagon_M2_mpy_acc_sat_hl_s1: +def HEXAGON_M2_mpy_acc_sat_hl_s1: si_MInst_sisisi_acc_sat_hl_s1 <"mpy", int_hexagon_M2_mpy_acc_sat_hl_s1>; -def Hexagon_M2_mpy_acc_sat_hl_s0: +def HEXAGON_M2_mpy_acc_sat_hl_s0: si_MInst_sisisi_acc_sat_hl 
<"mpy", int_hexagon_M2_mpy_acc_sat_hl_s0>; -def Hexagon_M2_mpy_acc_lh_s0: +def HEXAGON_M2_mpy_acc_lh_s0: si_MInst_sisisi_acc_lh <"mpy", int_hexagon_M2_mpy_acc_lh_s0>; -def Hexagon_M2_mpy_acc_lh_s1: +def HEXAGON_M2_mpy_acc_lh_s1: si_MInst_sisisi_acc_lh_s1 <"mpy", int_hexagon_M2_mpy_acc_lh_s1>; -def Hexagon_M2_mpy_acc_sat_lh_s1: +def HEXAGON_M2_mpy_acc_sat_lh_s1: si_MInst_sisisi_acc_sat_lh_s1 <"mpy", int_hexagon_M2_mpy_acc_sat_lh_s1>; -def Hexagon_M2_mpy_acc_sat_lh_s0: +def HEXAGON_M2_mpy_acc_sat_lh_s0: si_MInst_sisisi_acc_sat_lh <"mpy", int_hexagon_M2_mpy_acc_sat_lh_s0>; -def Hexagon_M2_mpy_acc_ll_s0: +def HEXAGON_M2_mpy_acc_ll_s0: si_MInst_sisisi_acc_ll <"mpy", int_hexagon_M2_mpy_acc_ll_s0>; -def Hexagon_M2_mpy_acc_ll_s1: +def HEXAGON_M2_mpy_acc_ll_s1: si_MInst_sisisi_acc_ll_s1 <"mpy", int_hexagon_M2_mpy_acc_ll_s1>; -def Hexagon_M2_mpy_acc_sat_ll_s1: +def HEXAGON_M2_mpy_acc_sat_ll_s1: si_MInst_sisisi_acc_sat_ll_s1 <"mpy", int_hexagon_M2_mpy_acc_sat_ll_s1>; -def Hexagon_M2_mpy_acc_sat_ll_s0: +def HEXAGON_M2_mpy_acc_sat_ll_s0: si_MInst_sisisi_acc_sat_ll <"mpy", int_hexagon_M2_mpy_acc_sat_ll_s0>; //Rx-=mpy(Rs.[H|L],Rt.[H|L])[[[:<<0|:<<1]|[:<<0:sat|:<<1:sat]] -def Hexagon_M2_mpy_nac_hh_s0: +def HEXAGON_M2_mpy_nac_hh_s0: si_MInst_sisisi_nac_hh <"mpy", int_hexagon_M2_mpy_nac_hh_s0>; -def Hexagon_M2_mpy_nac_hh_s1: +def HEXAGON_M2_mpy_nac_hh_s1: si_MInst_sisisi_nac_hh_s1 <"mpy", int_hexagon_M2_mpy_nac_hh_s1>; -def Hexagon_M2_mpy_nac_sat_hh_s1: +def HEXAGON_M2_mpy_nac_sat_hh_s1: si_MInst_sisisi_nac_sat_hh_s1 <"mpy", int_hexagon_M2_mpy_nac_sat_hh_s1>; -def Hexagon_M2_mpy_nac_sat_hh_s0: +def HEXAGON_M2_mpy_nac_sat_hh_s0: si_MInst_sisisi_nac_sat_hh <"mpy", int_hexagon_M2_mpy_nac_sat_hh_s0>; -def Hexagon_M2_mpy_nac_hl_s0: +def HEXAGON_M2_mpy_nac_hl_s0: si_MInst_sisisi_nac_hl <"mpy", int_hexagon_M2_mpy_nac_hl_s0>; -def Hexagon_M2_mpy_nac_hl_s1: +def HEXAGON_M2_mpy_nac_hl_s1: si_MInst_sisisi_nac_hl_s1 <"mpy", int_hexagon_M2_mpy_nac_hl_s1>; -def Hexagon_M2_mpy_nac_sat_hl_s1: +def HEXAGON_M2_mpy_nac_sat_hl_s1: si_MInst_sisisi_nac_sat_hl_s1 <"mpy", int_hexagon_M2_mpy_nac_sat_hl_s1>; -def Hexagon_M2_mpy_nac_sat_hl_s0: +def HEXAGON_M2_mpy_nac_sat_hl_s0: si_MInst_sisisi_nac_sat_hl <"mpy", int_hexagon_M2_mpy_nac_sat_hl_s0>; -def Hexagon_M2_mpy_nac_lh_s0: +def HEXAGON_M2_mpy_nac_lh_s0: si_MInst_sisisi_nac_lh <"mpy", int_hexagon_M2_mpy_nac_lh_s0>; -def Hexagon_M2_mpy_nac_lh_s1: +def HEXAGON_M2_mpy_nac_lh_s1: si_MInst_sisisi_nac_lh_s1 <"mpy", int_hexagon_M2_mpy_nac_lh_s1>; -def Hexagon_M2_mpy_nac_sat_lh_s1: +def HEXAGON_M2_mpy_nac_sat_lh_s1: si_MInst_sisisi_nac_sat_lh_s1 <"mpy", int_hexagon_M2_mpy_nac_sat_lh_s1>; -def Hexagon_M2_mpy_nac_sat_lh_s0: +def HEXAGON_M2_mpy_nac_sat_lh_s0: si_MInst_sisisi_nac_sat_lh <"mpy", int_hexagon_M2_mpy_nac_sat_lh_s0>; -def Hexagon_M2_mpy_nac_ll_s0: +def HEXAGON_M2_mpy_nac_ll_s0: si_MInst_sisisi_nac_ll <"mpy", int_hexagon_M2_mpy_nac_ll_s0>; -def Hexagon_M2_mpy_nac_ll_s1: +def HEXAGON_M2_mpy_nac_ll_s1: si_MInst_sisisi_nac_ll_s1 <"mpy", int_hexagon_M2_mpy_nac_ll_s1>; -def Hexagon_M2_mpy_nac_sat_ll_s1: +def HEXAGON_M2_mpy_nac_sat_ll_s1: si_MInst_sisisi_nac_sat_ll_s1 <"mpy", int_hexagon_M2_mpy_nac_sat_ll_s1>; -def Hexagon_M2_mpy_nac_sat_ll_s0: +def HEXAGON_M2_mpy_nac_sat_ll_s0: si_MInst_sisisi_nac_sat_ll <"mpy", int_hexagon_M2_mpy_nac_sat_ll_s0>; //Rx+=mpy(Rs.[H|L],Rt.[H|L:<<0|:<<1] -def Hexagon_M2_mpyd_acc_hh_s0: +def HEXAGON_M2_mpyd_acc_hh_s0: di_MInst_disisi_acc_hh <"mpy", int_hexagon_M2_mpyd_acc_hh_s0>; -def Hexagon_M2_mpyd_acc_hh_s1: +def HEXAGON_M2_mpyd_acc_hh_s1: 
di_MInst_disisi_acc_hh_s1 <"mpy", int_hexagon_M2_mpyd_acc_hh_s1>; -def Hexagon_M2_mpyd_acc_hl_s0: +def HEXAGON_M2_mpyd_acc_hl_s0: di_MInst_disisi_acc_hl <"mpy", int_hexagon_M2_mpyd_acc_hl_s0>; -def Hexagon_M2_mpyd_acc_hl_s1: +def HEXAGON_M2_mpyd_acc_hl_s1: di_MInst_disisi_acc_hl_s1 <"mpy", int_hexagon_M2_mpyd_acc_hl_s1>; -def Hexagon_M2_mpyd_acc_lh_s0: +def HEXAGON_M2_mpyd_acc_lh_s0: di_MInst_disisi_acc_lh <"mpy", int_hexagon_M2_mpyd_acc_lh_s0>; -def Hexagon_M2_mpyd_acc_lh_s1: +def HEXAGON_M2_mpyd_acc_lh_s1: di_MInst_disisi_acc_lh_s1 <"mpy", int_hexagon_M2_mpyd_acc_lh_s1>; -def Hexagon_M2_mpyd_acc_ll_s0: +def HEXAGON_M2_mpyd_acc_ll_s0: di_MInst_disisi_acc_ll <"mpy", int_hexagon_M2_mpyd_acc_ll_s0>; -def Hexagon_M2_mpyd_acc_ll_s1: +def HEXAGON_M2_mpyd_acc_ll_s1: di_MInst_disisi_acc_ll_s1 <"mpy", int_hexagon_M2_mpyd_acc_ll_s1>; //Rx-=mpy(Rs.[H|L],Rt.[H|L:<<0|:<<1] -def Hexagon_M2_mpyd_nac_hh_s0: +def HEXAGON_M2_mpyd_nac_hh_s0: di_MInst_disisi_nac_hh <"mpy", int_hexagon_M2_mpyd_nac_hh_s0>; -def Hexagon_M2_mpyd_nac_hh_s1: +def HEXAGON_M2_mpyd_nac_hh_s1: di_MInst_disisi_nac_hh_s1 <"mpy", int_hexagon_M2_mpyd_nac_hh_s1>; -def Hexagon_M2_mpyd_nac_hl_s0: +def HEXAGON_M2_mpyd_nac_hl_s0: di_MInst_disisi_nac_hl <"mpy", int_hexagon_M2_mpyd_nac_hl_s0>; -def Hexagon_M2_mpyd_nac_hl_s1: +def HEXAGON_M2_mpyd_nac_hl_s1: di_MInst_disisi_nac_hl_s1 <"mpy", int_hexagon_M2_mpyd_nac_hl_s1>; -def Hexagon_M2_mpyd_nac_lh_s0: +def HEXAGON_M2_mpyd_nac_lh_s0: di_MInst_disisi_nac_lh <"mpy", int_hexagon_M2_mpyd_nac_lh_s0>; -def Hexagon_M2_mpyd_nac_lh_s1: +def HEXAGON_M2_mpyd_nac_lh_s1: di_MInst_disisi_nac_lh_s1 <"mpy", int_hexagon_M2_mpyd_nac_lh_s1>; -def Hexagon_M2_mpyd_nac_ll_s0: +def HEXAGON_M2_mpyd_nac_ll_s0: di_MInst_disisi_nac_ll <"mpy", int_hexagon_M2_mpyd_nac_ll_s0>; -def Hexagon_M2_mpyd_nac_ll_s1: +def HEXAGON_M2_mpyd_nac_ll_s1: di_MInst_disisi_nac_ll_s1 <"mpy", int_hexagon_M2_mpyd_nac_ll_s1>; // MTYPE / MPYS / Scalar 16x16 multiply unsigned. 
//Rd=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1] -def Hexagon_M2_mpyu_hh_s0: +def HEXAGON_M2_mpyu_hh_s0: si_MInst_sisi_hh <"mpyu", int_hexagon_M2_mpyu_hh_s0>; -def Hexagon_M2_mpyu_hh_s1: +def HEXAGON_M2_mpyu_hh_s1: si_MInst_sisi_hh_s1 <"mpyu", int_hexagon_M2_mpyu_hh_s1>; -def Hexagon_M2_mpyu_hl_s0: +def HEXAGON_M2_mpyu_hl_s0: si_MInst_sisi_hl <"mpyu", int_hexagon_M2_mpyu_hl_s0>; -def Hexagon_M2_mpyu_hl_s1: +def HEXAGON_M2_mpyu_hl_s1: si_MInst_sisi_hl_s1 <"mpyu", int_hexagon_M2_mpyu_hl_s1>; -def Hexagon_M2_mpyu_lh_s0: +def HEXAGON_M2_mpyu_lh_s0: si_MInst_sisi_lh <"mpyu", int_hexagon_M2_mpyu_lh_s0>; -def Hexagon_M2_mpyu_lh_s1: +def HEXAGON_M2_mpyu_lh_s1: si_MInst_sisi_lh_s1 <"mpyu", int_hexagon_M2_mpyu_lh_s1>; -def Hexagon_M2_mpyu_ll_s0: +def HEXAGON_M2_mpyu_ll_s0: si_MInst_sisi_ll <"mpyu", int_hexagon_M2_mpyu_ll_s0>; -def Hexagon_M2_mpyu_ll_s1: +def HEXAGON_M2_mpyu_ll_s1: si_MInst_sisi_ll_s1 <"mpyu", int_hexagon_M2_mpyu_ll_s1>; //Rdd=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1] -def Hexagon_M2_mpyud_hh_s0: +def HEXAGON_M2_mpyud_hh_s0: di_MInst_sisi_hh <"mpyu", int_hexagon_M2_mpyud_hh_s0>; -def Hexagon_M2_mpyud_hh_s1: +def HEXAGON_M2_mpyud_hh_s1: di_MInst_sisi_hh_s1 <"mpyu", int_hexagon_M2_mpyud_hh_s1>; -def Hexagon_M2_mpyud_hl_s0: +def HEXAGON_M2_mpyud_hl_s0: di_MInst_sisi_hl <"mpyu", int_hexagon_M2_mpyud_hl_s0>; -def Hexagon_M2_mpyud_hl_s1: +def HEXAGON_M2_mpyud_hl_s1: di_MInst_sisi_hl_s1 <"mpyu", int_hexagon_M2_mpyud_hl_s1>; -def Hexagon_M2_mpyud_lh_s0: +def HEXAGON_M2_mpyud_lh_s0: di_MInst_sisi_lh <"mpyu", int_hexagon_M2_mpyud_lh_s0>; -def Hexagon_M2_mpyud_lh_s1: +def HEXAGON_M2_mpyud_lh_s1: di_MInst_sisi_lh_s1 <"mpyu", int_hexagon_M2_mpyud_lh_s1>; -def Hexagon_M2_mpyud_ll_s0: +def HEXAGON_M2_mpyud_ll_s0: di_MInst_sisi_ll <"mpyu", int_hexagon_M2_mpyud_ll_s0>; -def Hexagon_M2_mpyud_ll_s1: +def HEXAGON_M2_mpyud_ll_s1: di_MInst_sisi_ll_s1 <"mpyu", int_hexagon_M2_mpyud_ll_s1>; //Rd+=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1] -def Hexagon_M2_mpyu_acc_hh_s0: +def HEXAGON_M2_mpyu_acc_hh_s0: si_MInst_sisisi_acc_hh <"mpyu", int_hexagon_M2_mpyu_acc_hh_s0>; -def Hexagon_M2_mpyu_acc_hh_s1: +def HEXAGON_M2_mpyu_acc_hh_s1: si_MInst_sisisi_acc_hh_s1 <"mpyu", int_hexagon_M2_mpyu_acc_hh_s1>; -def Hexagon_M2_mpyu_acc_hl_s0: +def HEXAGON_M2_mpyu_acc_hl_s0: si_MInst_sisisi_acc_hl <"mpyu", int_hexagon_M2_mpyu_acc_hl_s0>; -def Hexagon_M2_mpyu_acc_hl_s1: +def HEXAGON_M2_mpyu_acc_hl_s1: si_MInst_sisisi_acc_hl_s1 <"mpyu", int_hexagon_M2_mpyu_acc_hl_s1>; -def Hexagon_M2_mpyu_acc_lh_s0: +def HEXAGON_M2_mpyu_acc_lh_s0: si_MInst_sisisi_acc_lh <"mpyu", int_hexagon_M2_mpyu_acc_lh_s0>; -def Hexagon_M2_mpyu_acc_lh_s1: +def HEXAGON_M2_mpyu_acc_lh_s1: si_MInst_sisisi_acc_lh_s1 <"mpyu", int_hexagon_M2_mpyu_acc_lh_s1>; -def Hexagon_M2_mpyu_acc_ll_s0: +def HEXAGON_M2_mpyu_acc_ll_s0: si_MInst_sisisi_acc_ll <"mpyu", int_hexagon_M2_mpyu_acc_ll_s0>; -def Hexagon_M2_mpyu_acc_ll_s1: +def HEXAGON_M2_mpyu_acc_ll_s1: si_MInst_sisisi_acc_ll_s1 <"mpyu", int_hexagon_M2_mpyu_acc_ll_s1>; //Rd-=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1] -def Hexagon_M2_mpyu_nac_hh_s0: +def HEXAGON_M2_mpyu_nac_hh_s0: si_MInst_sisisi_nac_hh <"mpyu", int_hexagon_M2_mpyu_nac_hh_s0>; -def Hexagon_M2_mpyu_nac_hh_s1: +def HEXAGON_M2_mpyu_nac_hh_s1: si_MInst_sisisi_nac_hh_s1 <"mpyu", int_hexagon_M2_mpyu_nac_hh_s1>; -def Hexagon_M2_mpyu_nac_hl_s0: +def HEXAGON_M2_mpyu_nac_hl_s0: si_MInst_sisisi_nac_hl <"mpyu", int_hexagon_M2_mpyu_nac_hl_s0>; -def Hexagon_M2_mpyu_nac_hl_s1: +def HEXAGON_M2_mpyu_nac_hl_s1: si_MInst_sisisi_nac_hl_s1 <"mpyu", int_hexagon_M2_mpyu_nac_hl_s1>; -def 
Hexagon_M2_mpyu_nac_lh_s0: +def HEXAGON_M2_mpyu_nac_lh_s0: si_MInst_sisisi_nac_lh <"mpyu", int_hexagon_M2_mpyu_nac_lh_s0>; -def Hexagon_M2_mpyu_nac_lh_s1: +def HEXAGON_M2_mpyu_nac_lh_s1: si_MInst_sisisi_nac_lh_s1 <"mpyu", int_hexagon_M2_mpyu_nac_lh_s1>; -def Hexagon_M2_mpyu_nac_ll_s0: +def HEXAGON_M2_mpyu_nac_ll_s0: si_MInst_sisisi_nac_ll <"mpyu", int_hexagon_M2_mpyu_nac_ll_s0>; -def Hexagon_M2_mpyu_nac_ll_s1: +def HEXAGON_M2_mpyu_nac_ll_s1: si_MInst_sisisi_nac_ll_s1 <"mpyu", int_hexagon_M2_mpyu_nac_ll_s1>; //Rdd+=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1] -def Hexagon_M2_mpyud_acc_hh_s0: +def HEXAGON_M2_mpyud_acc_hh_s0: di_MInst_disisi_acc_hh <"mpyu", int_hexagon_M2_mpyud_acc_hh_s0>; -def Hexagon_M2_mpyud_acc_hh_s1: +def HEXAGON_M2_mpyud_acc_hh_s1: di_MInst_disisi_acc_hh_s1 <"mpyu", int_hexagon_M2_mpyud_acc_hh_s1>; -def Hexagon_M2_mpyud_acc_hl_s0: +def HEXAGON_M2_mpyud_acc_hl_s0: di_MInst_disisi_acc_hl <"mpyu", int_hexagon_M2_mpyud_acc_hl_s0>; -def Hexagon_M2_mpyud_acc_hl_s1: +def HEXAGON_M2_mpyud_acc_hl_s1: di_MInst_disisi_acc_hl_s1 <"mpyu", int_hexagon_M2_mpyud_acc_hl_s1>; -def Hexagon_M2_mpyud_acc_lh_s0: +def HEXAGON_M2_mpyud_acc_lh_s0: di_MInst_disisi_acc_lh <"mpyu", int_hexagon_M2_mpyud_acc_lh_s0>; -def Hexagon_M2_mpyud_acc_lh_s1: +def HEXAGON_M2_mpyud_acc_lh_s1: di_MInst_disisi_acc_lh_s1 <"mpyu", int_hexagon_M2_mpyud_acc_lh_s1>; -def Hexagon_M2_mpyud_acc_ll_s0: +def HEXAGON_M2_mpyud_acc_ll_s0: di_MInst_disisi_acc_ll <"mpyu", int_hexagon_M2_mpyud_acc_ll_s0>; -def Hexagon_M2_mpyud_acc_ll_s1: +def HEXAGON_M2_mpyud_acc_ll_s1: di_MInst_disisi_acc_ll_s1 <"mpyu", int_hexagon_M2_mpyud_acc_ll_s1>; //Rdd-=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1] -def Hexagon_M2_mpyud_nac_hh_s0: +def HEXAGON_M2_mpyud_nac_hh_s0: di_MInst_disisi_nac_hh <"mpyu", int_hexagon_M2_mpyud_nac_hh_s0>; -def Hexagon_M2_mpyud_nac_hh_s1: +def HEXAGON_M2_mpyud_nac_hh_s1: di_MInst_disisi_nac_hh_s1 <"mpyu", int_hexagon_M2_mpyud_nac_hh_s1>; -def Hexagon_M2_mpyud_nac_hl_s0: +def HEXAGON_M2_mpyud_nac_hl_s0: di_MInst_disisi_nac_hl <"mpyu", int_hexagon_M2_mpyud_nac_hl_s0>; -def Hexagon_M2_mpyud_nac_hl_s1: +def HEXAGON_M2_mpyud_nac_hl_s1: di_MInst_disisi_nac_hl_s1 <"mpyu", int_hexagon_M2_mpyud_nac_hl_s1>; -def Hexagon_M2_mpyud_nac_lh_s0: +def HEXAGON_M2_mpyud_nac_lh_s0: di_MInst_disisi_nac_lh <"mpyu", int_hexagon_M2_mpyud_nac_lh_s0>; -def Hexagon_M2_mpyud_nac_lh_s1: +def HEXAGON_M2_mpyud_nac_lh_s1: di_MInst_disisi_nac_lh_s1 <"mpyu", int_hexagon_M2_mpyud_nac_lh_s1>; -def Hexagon_M2_mpyud_nac_ll_s0: +def HEXAGON_M2_mpyud_nac_ll_s0: di_MInst_disisi_nac_ll <"mpyu", int_hexagon_M2_mpyud_nac_ll_s0>; -def Hexagon_M2_mpyud_nac_ll_s1: +def HEXAGON_M2_mpyud_nac_ll_s1: di_MInst_disisi_nac_ll_s1 <"mpyu", int_hexagon_M2_mpyud_nac_ll_s1>; @@ -2864,15 +2900,15 @@ def Hexagon_M2_mpyud_nac_ll_s1: *********************************************************************/ // MTYPE / VB / Vector reduce add unsigned bytes. -def Hexagon_A2_vraddub: +def HEXAGON_A2_vraddub: di_MInst_didi <"vraddub", int_hexagon_A2_vraddub>; -def Hexagon_A2_vraddub_acc: +def HEXAGON_A2_vraddub_acc: di_MInst_dididi_acc <"vraddub", int_hexagon_A2_vraddub_acc>; // MTYPE / VB / Vector sum of absolute differences unsigned bytes. 
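// vrsadub below is the classic sum-of-absolute-differences kernel over the
// eight unsigned byte lanes; a simplified scalar sketch (the real
// instruction splits the reduction across the two words of the destination
// pair -- an assumption here; illustrative only):
#include <cstdint>
#include <cstdlib>
static inline uint32_t sad8(uint64_t rss, uint64_t rtt) {
  uint32_t sum = 0;
  for (int i = 0; i < 8; ++i) {
    int a = uint8_t(rss >> (8 * i));
    int b = uint8_t(rtt >> (8 * i));
    sum += uint32_t(std::abs(a - b));   // |byte difference|, accumulated
  }
  return sum;   // the _acc form adds this into the destination
}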
-def Hexagon_A2_vrsadub: +def HEXAGON_A2_vrsadub: di_MInst_didi <"vrsadub", int_hexagon_A2_vrsadub>; -def Hexagon_A2_vrsadub_acc: +def HEXAGON_A2_vrsadub_acc: di_MInst_dididi_acc <"vrsadub", int_hexagon_A2_vrsadub_acc>; /******************************************************************** @@ -2880,56 +2916,56 @@ def Hexagon_A2_vrsadub_acc: *********************************************************************/ // MTYPE / VH / Vector dual multiply. -def Hexagon_M2_vdmpys_s1: +def HEXAGON_M2_vdmpys_s1: di_MInst_didi_s1_sat <"vdmpy", int_hexagon_M2_vdmpys_s1>; -def Hexagon_M2_vdmpys_s0: +def HEXAGON_M2_vdmpys_s0: di_MInst_didi_sat <"vdmpy", int_hexagon_M2_vdmpys_s0>; -def Hexagon_M2_vdmacs_s1: +def HEXAGON_M2_vdmacs_s1: di_MInst_dididi_acc_s1_sat <"vdmpy", int_hexagon_M2_vdmacs_s1>; -def Hexagon_M2_vdmacs_s0: +def HEXAGON_M2_vdmacs_s0: di_MInst_dididi_acc_sat <"vdmpy", int_hexagon_M2_vdmacs_s0>; // MTYPE / VH / Vector dual multiply with round and pack. -def Hexagon_M2_vdmpyrs_s0: +def HEXAGON_M2_vdmpyrs_s0: si_MInst_didi_rnd_sat <"vdmpy", int_hexagon_M2_vdmpyrs_s0>; -def Hexagon_M2_vdmpyrs_s1: +def HEXAGON_M2_vdmpyrs_s1: si_MInst_didi_s1_rnd_sat <"vdmpy", int_hexagon_M2_vdmpyrs_s1>; // MTYPE / VH / Vector multiply even halfwords. -def Hexagon_M2_vmpy2es_s1: +def HEXAGON_M2_vmpy2es_s1: di_MInst_didi_s1_sat <"vmpyeh", int_hexagon_M2_vmpy2es_s1>; -def Hexagon_M2_vmpy2es_s0: +def HEXAGON_M2_vmpy2es_s0: di_MInst_didi_sat <"vmpyeh", int_hexagon_M2_vmpy2es_s0>; -def Hexagon_M2_vmac2es: +def HEXAGON_M2_vmac2es: di_MInst_dididi_acc <"vmpyeh", int_hexagon_M2_vmac2es>; -def Hexagon_M2_vmac2es_s1: +def HEXAGON_M2_vmac2es_s1: di_MInst_dididi_acc_s1_sat <"vmpyeh", int_hexagon_M2_vmac2es_s1>; -def Hexagon_M2_vmac2es_s0: +def HEXAGON_M2_vmac2es_s0: di_MInst_dididi_acc_sat <"vmpyeh", int_hexagon_M2_vmac2es_s0>; // MTYPE / VH / Vector multiply halfwords. -def Hexagon_M2_vmpy2s_s0: +def HEXAGON_M2_vmpy2s_s0: di_MInst_sisi_sat <"vmpyh", int_hexagon_M2_vmpy2s_s0>; -def Hexagon_M2_vmpy2s_s1: +def HEXAGON_M2_vmpy2s_s1: di_MInst_sisi_s1_sat <"vmpyh", int_hexagon_M2_vmpy2s_s1>; -def Hexagon_M2_vmac2: +def HEXAGON_M2_vmac2: di_MInst_disisi_acc <"vmpyh", int_hexagon_M2_vmac2>; -def Hexagon_M2_vmac2s_s0: +def HEXAGON_M2_vmac2s_s0: di_MInst_disisi_acc_sat <"vmpyh", int_hexagon_M2_vmac2s_s0>; -def Hexagon_M2_vmac2s_s1: +def HEXAGON_M2_vmac2s_s1: di_MInst_disisi_acc_s1_sat <"vmpyh", int_hexagon_M2_vmac2s_s1>; // MTYPE / VH / Vector multiply halfwords with round and pack. -def Hexagon_M2_vmpy2s_s0pack: +def HEXAGON_M2_vmpy2s_s0pack: si_MInst_sisi_rnd_sat <"vmpyh", int_hexagon_M2_vmpy2s_s0pack>; -def Hexagon_M2_vmpy2s_s1pack: +def HEXAGON_M2_vmpy2s_s1pack: si_MInst_sisi_s1_rnd_sat <"vmpyh", int_hexagon_M2_vmpy2s_s1pack>; // MTYPE / VH / Vector reduce multiply halfwords. // Rxx32+=vrmpyh(Rss32,Rtt32) -def Hexagon_M2_vrmpy_s0: +def HEXAGON_M2_vrmpy_s0: di_MInst_didi <"vrmpyh", int_hexagon_M2_vrmpy_s0>; -def Hexagon_M2_vrmac_s0: +def HEXAGON_M2_vrmac_s0: di_MInst_dididi_acc <"vrmpyh", int_hexagon_M2_vrmac_s0>; @@ -2938,25 +2974,25 @@ def Hexagon_M2_vrmac_s0: *********************************************************************/ // STYPE / ALU / Absolute value. -def Hexagon_A2_abs: +def HEXAGON_A2_abs: si_SInst_si <"abs", int_hexagon_A2_abs>; -def Hexagon_A2_absp: +def HEXAGON_A2_absp: di_SInst_di <"abs", int_hexagon_A2_absp>; -def Hexagon_A2_abssat: +def HEXAGON_A2_abssat: si_SInst_si_sat <"abs", int_hexagon_A2_abssat>; // STYPE / ALU / Negate. 
-def Hexagon_A2_negp: +def HEXAGON_A2_negp: di_SInst_di <"neg", int_hexagon_A2_negp>; -def Hexagon_A2_negsat: +def HEXAGON_A2_negsat: si_SInst_si_sat <"neg", int_hexagon_A2_negsat>; // STYPE / ALU / Logical Not. -def Hexagon_A2_notp: +def HEXAGON_A2_notp: di_SInst_di <"not", int_hexagon_A2_notp>; // STYPE / ALU / Sign extend word to doubleword. -def Hexagon_A2_sxtw: +def HEXAGON_A2_sxtw: di_SInst_si <"sxtw", int_hexagon_A2_sxtw>; @@ -2965,88 +3001,88 @@ def Hexagon_A2_sxtw: *********************************************************************/ // STYPE / BIT / Count leading. -def Hexagon_S2_cl0: +def HEXAGON_S2_cl0: si_SInst_si <"cl0", int_hexagon_S2_cl0>; -def Hexagon_S2_cl0p: +def HEXAGON_S2_cl0p: si_SInst_di <"cl0", int_hexagon_S2_cl0p>; -def Hexagon_S2_cl1: +def HEXAGON_S2_cl1: si_SInst_si <"cl1", int_hexagon_S2_cl1>; -def Hexagon_S2_cl1p: +def HEXAGON_S2_cl1p: si_SInst_di <"cl1", int_hexagon_S2_cl1p>; -def Hexagon_S2_clb: +def HEXAGON_S2_clb: si_SInst_si <"clb", int_hexagon_S2_clb>; -def Hexagon_S2_clbp: +def HEXAGON_S2_clbp: si_SInst_di <"clb", int_hexagon_S2_clbp>; -def Hexagon_S2_clbnorm: +def HEXAGON_S2_clbnorm: si_SInst_si <"normamt", int_hexagon_S2_clbnorm>; // STYPE / BIT / Count trailing. -def Hexagon_S2_ct0: +def HEXAGON_S2_ct0: si_SInst_si <"ct0", int_hexagon_S2_ct0>; -def Hexagon_S2_ct1: +def HEXAGON_S2_ct1: si_SInst_si <"ct1", int_hexagon_S2_ct1>; // STYPE / BIT / Compare bit mask. -def HEXAGON_C2_bitsclr: +def Hexagon_C2_bitsclr: qi_SInst_sisi <"bitsclr", int_hexagon_C2_bitsclr>; -def HEXAGON_C2_bitsclri: +def Hexagon_C2_bitsclri: qi_SInst_siu6 <"bitsclr", int_hexagon_C2_bitsclri>; -def HEXAGON_C2_bitsset: +def Hexagon_C2_bitsset: qi_SInst_sisi <"bitsset", int_hexagon_C2_bitsset>; // STYPE / BIT / Extract unsigned. // Rd[d][32/64]=extractu(Rs[s],Rt[t],[imm]) -def Hexagon_S2_extractu: +def HEXAGON_S2_extractu: si_SInst_siu5u5 <"extractu",int_hexagon_S2_extractu>; -def Hexagon_S2_extractu_rp: +def HEXAGON_S2_extractu_rp: si_SInst_sidi <"extractu",int_hexagon_S2_extractu_rp>; -def Hexagon_S2_extractup: +def HEXAGON_S2_extractup: di_SInst_diu6u6 <"extractu",int_hexagon_S2_extractup>; -def Hexagon_S2_extractup_rp: +def HEXAGON_S2_extractup_rp: di_SInst_didi <"extractu",int_hexagon_S2_extractup_rp>; // STYPE / BIT / Insert bitfield. -def HEXAGON_S2_insert: +def Hexagon_S2_insert: si_SInst_sisiu5u5 <"insert", int_hexagon_S2_insert>; -def HEXAGON_S2_insert_rp: +def Hexagon_S2_insert_rp: si_SInst_sisidi <"insert", int_hexagon_S2_insert_rp>; -def HEXAGON_S2_insertp: +def Hexagon_S2_insertp: di_SInst_didiu6u6 <"insert", int_hexagon_S2_insertp>; -def HEXAGON_S2_insertp_rp: +def Hexagon_S2_insertp_rp: di_SInst_dididi <"insert", int_hexagon_S2_insertp_rp>; // STYPE / BIT / Innterleave/deinterleave. -def HEXAGON_S2_interleave: +def Hexagon_S2_interleave: di_SInst_di <"interleave", int_hexagon_S2_interleave>; -def HEXAGON_S2_deinterleave: +def Hexagon_S2_deinterleave: di_SInst_di <"deinterleave", int_hexagon_S2_deinterleave>; // STYPE / BIT / Linear feedback-shift Iteration. -def HEXAGON_S2_lfsp: +def Hexagon_S2_lfsp: di_SInst_didi <"lfs", int_hexagon_S2_lfsp>; // STYPE / BIT / Bit reverse. -def HEXAGON_S2_brev: +def Hexagon_S2_brev: si_SInst_si <"brev", int_hexagon_S2_brev>; // STYPE / BIT / Set/Clear/Toggle Bit. 
-def Hexagon_S2_setbit_i: +def HEXAGON_S2_setbit_i: si_SInst_siu5 <"setbit", int_hexagon_S2_setbit_i>; -def Hexagon_S2_togglebit_i: +def HEXAGON_S2_togglebit_i: si_SInst_siu5 <"togglebit", int_hexagon_S2_togglebit_i>; -def Hexagon_S2_clrbit_i: +def HEXAGON_S2_clrbit_i: si_SInst_siu5 <"clrbit", int_hexagon_S2_clrbit_i>; -def Hexagon_S2_setbit_r: +def HEXAGON_S2_setbit_r: si_SInst_sisi <"setbit", int_hexagon_S2_setbit_r>; -def Hexagon_S2_togglebit_r: +def HEXAGON_S2_togglebit_r: si_SInst_sisi <"togglebit", int_hexagon_S2_togglebit_r>; -def Hexagon_S2_clrbit_r: +def HEXAGON_S2_clrbit_r: si_SInst_sisi <"clrbit", int_hexagon_S2_clrbit_r>; // STYPE / BIT / Test Bit. -def Hexagon_S2_tstbit_i: +def HEXAGON_S2_tstbit_i: qi_SInst_siu5 <"tstbit", int_hexagon_S2_tstbit_i>; -def Hexagon_S2_tstbit_r: +def HEXAGON_S2_tstbit_r: qi_SInst_sisi <"tstbit", int_hexagon_S2_tstbit_r>; @@ -3055,11 +3091,11 @@ def Hexagon_S2_tstbit_r: *********************************************************************/ // STYPE / COMPLEX / Vector Complex conjugate. -def Hexagon_A2_vconj: +def HEXAGON_A2_vconj: di_SInst_di_sat <"vconj", int_hexagon_A2_vconj>; // STYPE / COMPLEX / Vector Complex rotate. -def Hexagon_S2_vcrotate: +def HEXAGON_S2_vcrotate: di_SInst_disi <"vcrotate",int_hexagon_S2_vcrotate>; @@ -3068,102 +3104,102 @@ def Hexagon_S2_vcrotate: *********************************************************************/ // STYPE / PERM / Saturate. -def Hexagon_A2_sat: +def HEXAGON_A2_sat: si_SInst_di <"sat", int_hexagon_A2_sat>; -def Hexagon_A2_satb: +def HEXAGON_A2_satb: si_SInst_si <"satb", int_hexagon_A2_satb>; -def Hexagon_A2_sath: +def HEXAGON_A2_sath: si_SInst_si <"sath", int_hexagon_A2_sath>; -def Hexagon_A2_satub: +def HEXAGON_A2_satub: si_SInst_si <"satub", int_hexagon_A2_satub>; -def Hexagon_A2_satuh: +def HEXAGON_A2_satuh: si_SInst_si <"satuh", int_hexagon_A2_satuh>; // STYPE / PERM / Swizzle bytes. -def Hexagon_A2_swiz: +def HEXAGON_A2_swiz: si_SInst_si <"swiz", int_hexagon_A2_swiz>; // STYPE / PERM / Vector align. // Need custom lowering -def Hexagon_S2_valignib: +def HEXAGON_S2_valignib: di_SInst_didiu3 <"valignb", int_hexagon_S2_valignib>; -def Hexagon_S2_valignrb: +def HEXAGON_S2_valignrb: di_SInst_didiqi <"valignb", int_hexagon_S2_valignrb>; // STYPE / PERM / Vector round and pack. -def Hexagon_S2_vrndpackwh: +def HEXAGON_S2_vrndpackwh: si_SInst_di <"vrndwh", int_hexagon_S2_vrndpackwh>; -def Hexagon_S2_vrndpackwhs: +def HEXAGON_S2_vrndpackwhs: si_SInst_di_sat <"vrndwh", int_hexagon_S2_vrndpackwhs>; // STYPE / PERM / Vector saturate and pack. -def Hexagon_S2_svsathb: +def HEXAGON_S2_svsathb: si_SInst_si <"vsathb", int_hexagon_S2_svsathb>; -def Hexagon_S2_vsathb: +def HEXAGON_S2_vsathb: si_SInst_di <"vsathb", int_hexagon_S2_vsathb>; -def Hexagon_S2_svsathub: +def HEXAGON_S2_svsathub: si_SInst_si <"vsathub", int_hexagon_S2_svsathub>; -def Hexagon_S2_vsathub: +def HEXAGON_S2_vsathub: si_SInst_di <"vsathub", int_hexagon_S2_vsathub>; -def Hexagon_S2_vsatwh: +def HEXAGON_S2_vsatwh: si_SInst_di <"vsatwh", int_hexagon_S2_vsatwh>; -def Hexagon_S2_vsatwuh: +def HEXAGON_S2_vsatwuh: si_SInst_di <"vsatwuh", int_hexagon_S2_vsatwuh>; // STYPE / PERM / Vector saturate without pack. 
-def Hexagon_S2_vsathb_nopack: +def HEXAGON_S2_vsathb_nopack: di_SInst_di <"vsathb", int_hexagon_S2_vsathb_nopack>; -def Hexagon_S2_vsathub_nopack: +def HEXAGON_S2_vsathub_nopack: di_SInst_di <"vsathub", int_hexagon_S2_vsathub_nopack>; -def Hexagon_S2_vsatwh_nopack: +def HEXAGON_S2_vsatwh_nopack: di_SInst_di <"vsatwh", int_hexagon_S2_vsatwh_nopack>; -def Hexagon_S2_vsatwuh_nopack: +def HEXAGON_S2_vsatwuh_nopack: di_SInst_di <"vsatwuh", int_hexagon_S2_vsatwuh_nopack>; // STYPE / PERM / Vector shuffle. -def Hexagon_S2_shuffeb: +def HEXAGON_S2_shuffeb: di_SInst_didi <"shuffeb", int_hexagon_S2_shuffeb>; -def Hexagon_S2_shuffeh: +def HEXAGON_S2_shuffeh: di_SInst_didi <"shuffeh", int_hexagon_S2_shuffeh>; -def Hexagon_S2_shuffob: +def HEXAGON_S2_shuffob: di_SInst_didi <"shuffob", int_hexagon_S2_shuffob>; -def Hexagon_S2_shuffoh: +def HEXAGON_S2_shuffoh: di_SInst_didi <"shuffoh", int_hexagon_S2_shuffoh>; // STYPE / PERM / Vector splat bytes. -def Hexagon_S2_vsplatrb: +def HEXAGON_S2_vsplatrb: si_SInst_si <"vsplatb", int_hexagon_S2_vsplatrb>; // STYPE / PERM / Vector splat halfwords. -def Hexagon_S2_vsplatrh: +def HEXAGON_S2_vsplatrh: di_SInst_si <"vsplath", int_hexagon_S2_vsplatrh>; // STYPE / PERM / Vector splice. -def HEXAGON_S2_vsplicerb: +def Hexagon_S2_vsplicerb: di_SInst_didiqi <"vspliceb",int_hexagon_S2_vsplicerb>; -def HEXAGON_S2_vspliceib: +def Hexagon_S2_vspliceib: di_SInst_didiu3 <"vspliceb",int_hexagon_S2_vspliceib>; // STYPE / PERM / Sign extend. -def Hexagon_S2_vsxtbh: +def HEXAGON_S2_vsxtbh: di_SInst_si <"vsxtbh", int_hexagon_S2_vsxtbh>; -def Hexagon_S2_vsxthw: +def HEXAGON_S2_vsxthw: di_SInst_si <"vsxthw", int_hexagon_S2_vsxthw>; // STYPE / PERM / Truncate. -def Hexagon_S2_vtrunehb: +def HEXAGON_S2_vtrunehb: si_SInst_di <"vtrunehb",int_hexagon_S2_vtrunehb>; -def Hexagon_S2_vtrunohb: +def HEXAGON_S2_vtrunohb: si_SInst_di <"vtrunohb",int_hexagon_S2_vtrunohb>; -def Hexagon_S2_vtrunewh: +def HEXAGON_S2_vtrunewh: di_SInst_didi <"vtrunewh",int_hexagon_S2_vtrunewh>; -def Hexagon_S2_vtrunowh: +def HEXAGON_S2_vtrunowh: di_SInst_didi <"vtrunowh",int_hexagon_S2_vtrunowh>; // STYPE / PERM / Zero extend. -def Hexagon_S2_vzxtbh: +def HEXAGON_S2_vzxtbh: di_SInst_si <"vzxtbh", int_hexagon_S2_vzxtbh>; -def Hexagon_S2_vzxthw: +def HEXAGON_S2_vzxthw: di_SInst_si <"vzxthw", int_hexagon_S2_vzxthw>; @@ -3172,17 +3208,17 @@ def Hexagon_S2_vzxthw: *********************************************************************/ // STYPE / PRED / Mask generate from predicate. -def Hexagon_C2_mask: +def HEXAGON_C2_mask: di_SInst_qi <"mask", int_hexagon_C2_mask>; // STYPE / PRED / Predicate transfer. -def Hexagon_C2_tfrpr: +def HEXAGON_C2_tfrpr: si_SInst_qi <"", int_hexagon_C2_tfrpr>; -def Hexagon_C2_tfrrp: +def HEXAGON_C2_tfrrp: qi_SInst_si <"", int_hexagon_C2_tfrrp>; // STYPE / PRED / Viterbi pack even and odd predicate bits. -def Hexagon_C2_vitpack: +def HEXAGON_C2_vitpack: si_SInst_qiqi <"vitpack",int_hexagon_C2_vitpack>; @@ -3191,202 +3227,202 @@ def Hexagon_C2_vitpack: *********************************************************************/ // STYPE / SHIFT / Shift by immediate. 
-def Hexagon_S2_asl_i_r: +def HEXAGON_S2_asl_i_r: si_SInst_siu5 <"asl", int_hexagon_S2_asl_i_r>; -def Hexagon_S2_asr_i_r: +def HEXAGON_S2_asr_i_r: si_SInst_siu5 <"asr", int_hexagon_S2_asr_i_r>; -def Hexagon_S2_lsr_i_r: +def HEXAGON_S2_lsr_i_r: si_SInst_siu5 <"lsr", int_hexagon_S2_lsr_i_r>; -def Hexagon_S2_asl_i_p: +def HEXAGON_S2_asl_i_p: di_SInst_diu6 <"asl", int_hexagon_S2_asl_i_p>; -def Hexagon_S2_asr_i_p: +def HEXAGON_S2_asr_i_p: di_SInst_diu6 <"asr", int_hexagon_S2_asr_i_p>; -def Hexagon_S2_lsr_i_p: +def HEXAGON_S2_lsr_i_p: di_SInst_diu6 <"lsr", int_hexagon_S2_lsr_i_p>; // STYPE / SHIFT / Shift by immediate and accumulate. -def Hexagon_S2_asl_i_r_acc: +def HEXAGON_S2_asl_i_r_acc: si_SInst_sisiu5_acc <"asl", int_hexagon_S2_asl_i_r_acc>; -def Hexagon_S2_asr_i_r_acc: +def HEXAGON_S2_asr_i_r_acc: si_SInst_sisiu5_acc <"asr", int_hexagon_S2_asr_i_r_acc>; -def Hexagon_S2_lsr_i_r_acc: +def HEXAGON_S2_lsr_i_r_acc: si_SInst_sisiu5_acc <"lsr", int_hexagon_S2_lsr_i_r_acc>; -def Hexagon_S2_asl_i_r_nac: +def HEXAGON_S2_asl_i_r_nac: si_SInst_sisiu5_nac <"asl", int_hexagon_S2_asl_i_r_nac>; -def Hexagon_S2_asr_i_r_nac: +def HEXAGON_S2_asr_i_r_nac: si_SInst_sisiu5_nac <"asr", int_hexagon_S2_asr_i_r_nac>; -def Hexagon_S2_lsr_i_r_nac: +def HEXAGON_S2_lsr_i_r_nac: si_SInst_sisiu5_nac <"lsr", int_hexagon_S2_lsr_i_r_nac>; -def Hexagon_S2_asl_i_p_acc: +def HEXAGON_S2_asl_i_p_acc: di_SInst_didiu6_acc <"asl", int_hexagon_S2_asl_i_p_acc>; -def Hexagon_S2_asr_i_p_acc: +def HEXAGON_S2_asr_i_p_acc: di_SInst_didiu6_acc <"asr", int_hexagon_S2_asr_i_p_acc>; -def Hexagon_S2_lsr_i_p_acc: +def HEXAGON_S2_lsr_i_p_acc: di_SInst_didiu6_acc <"lsr", int_hexagon_S2_lsr_i_p_acc>; -def Hexagon_S2_asl_i_p_nac: +def HEXAGON_S2_asl_i_p_nac: di_SInst_didiu6_nac <"asl", int_hexagon_S2_asl_i_p_nac>; -def Hexagon_S2_asr_i_p_nac: +def HEXAGON_S2_asr_i_p_nac: di_SInst_didiu6_nac <"asr", int_hexagon_S2_asr_i_p_nac>; -def Hexagon_S2_lsr_i_p_nac: +def HEXAGON_S2_lsr_i_p_nac: di_SInst_didiu6_nac <"lsr", int_hexagon_S2_lsr_i_p_nac>; // STYPE / SHIFT / Shift by immediate and add. -def Hexagon_S2_addasl_rrri: +def HEXAGON_S2_addasl_rrri: si_SInst_sisiu3 <"addasl", int_hexagon_S2_addasl_rrri>; // STYPE / SHIFT / Shift by immediate and logical. 
-def Hexagon_S2_asl_i_r_and: +def HEXAGON_S2_asl_i_r_and: si_SInst_sisiu5_and <"asl", int_hexagon_S2_asl_i_r_and>; -def Hexagon_S2_asr_i_r_and: +def HEXAGON_S2_asr_i_r_and: si_SInst_sisiu5_and <"asr", int_hexagon_S2_asr_i_r_and>; -def Hexagon_S2_lsr_i_r_and: +def HEXAGON_S2_lsr_i_r_and: si_SInst_sisiu5_and <"lsr", int_hexagon_S2_lsr_i_r_and>; -def Hexagon_S2_asl_i_r_xacc: +def HEXAGON_S2_asl_i_r_xacc: si_SInst_sisiu5_xor <"asl", int_hexagon_S2_asl_i_r_xacc>; -def Hexagon_S2_lsr_i_r_xacc: +def HEXAGON_S2_lsr_i_r_xacc: si_SInst_sisiu5_xor <"lsr", int_hexagon_S2_lsr_i_r_xacc>; -def Hexagon_S2_asl_i_r_or: +def HEXAGON_S2_asl_i_r_or: si_SInst_sisiu5_or <"asl", int_hexagon_S2_asl_i_r_or>; -def Hexagon_S2_asr_i_r_or: +def HEXAGON_S2_asr_i_r_or: si_SInst_sisiu5_or <"asr", int_hexagon_S2_asr_i_r_or>; -def Hexagon_S2_lsr_i_r_or: +def HEXAGON_S2_lsr_i_r_or: si_SInst_sisiu5_or <"lsr", int_hexagon_S2_lsr_i_r_or>; -def Hexagon_S2_asl_i_p_and: +def HEXAGON_S2_asl_i_p_and: di_SInst_didiu6_and <"asl", int_hexagon_S2_asl_i_p_and>; -def Hexagon_S2_asr_i_p_and: +def HEXAGON_S2_asr_i_p_and: di_SInst_didiu6_and <"asr", int_hexagon_S2_asr_i_p_and>; -def Hexagon_S2_lsr_i_p_and: +def HEXAGON_S2_lsr_i_p_and: di_SInst_didiu6_and <"lsr", int_hexagon_S2_lsr_i_p_and>; -def Hexagon_S2_asl_i_p_xacc: +def HEXAGON_S2_asl_i_p_xacc: di_SInst_didiu6_xor <"asl", int_hexagon_S2_asl_i_p_xacc>; -def Hexagon_S2_lsr_i_p_xacc: +def HEXAGON_S2_lsr_i_p_xacc: di_SInst_didiu6_xor <"lsr", int_hexagon_S2_lsr_i_p_xacc>; -def Hexagon_S2_asl_i_p_or: +def HEXAGON_S2_asl_i_p_or: di_SInst_didiu6_or <"asl", int_hexagon_S2_asl_i_p_or>; -def Hexagon_S2_asr_i_p_or: +def HEXAGON_S2_asr_i_p_or: di_SInst_didiu6_or <"asr", int_hexagon_S2_asr_i_p_or>; -def Hexagon_S2_lsr_i_p_or: +def HEXAGON_S2_lsr_i_p_or: di_SInst_didiu6_or <"lsr", int_hexagon_S2_lsr_i_p_or>; // STYPE / SHIFT / Shift right by immediate with rounding. -def Hexagon_S2_asr_i_r_rnd: +def HEXAGON_S2_asr_i_r_rnd: si_SInst_siu5_rnd <"asr", int_hexagon_S2_asr_i_r_rnd>; -def Hexagon_S2_asr_i_r_rnd_goodsyntax: +def HEXAGON_S2_asr_i_r_rnd_goodsyntax: si_SInst_siu5 <"asrrnd", int_hexagon_S2_asr_i_r_rnd_goodsyntax>; // STYPE / SHIFT / Shift left by immediate with saturation. -def Hexagon_S2_asl_i_r_sat: +def HEXAGON_S2_asl_i_r_sat: si_SInst_sisi_sat <"asl", int_hexagon_S2_asl_i_r_sat>; // STYPE / SHIFT / Shift by register. -def Hexagon_S2_asl_r_r: +def HEXAGON_S2_asl_r_r: si_SInst_sisi <"asl", int_hexagon_S2_asl_r_r>; -def Hexagon_S2_asr_r_r: +def HEXAGON_S2_asr_r_r: si_SInst_sisi <"asr", int_hexagon_S2_asr_r_r>; -def Hexagon_S2_lsl_r_r: +def HEXAGON_S2_lsl_r_r: si_SInst_sisi <"lsl", int_hexagon_S2_lsl_r_r>; -def Hexagon_S2_lsr_r_r: +def HEXAGON_S2_lsr_r_r: si_SInst_sisi <"lsr", int_hexagon_S2_lsr_r_r>; -def Hexagon_S2_asl_r_p: +def HEXAGON_S2_asl_r_p: di_SInst_disi <"asl", int_hexagon_S2_asl_r_p>; -def Hexagon_S2_asr_r_p: +def HEXAGON_S2_asr_r_p: di_SInst_disi <"asr", int_hexagon_S2_asr_r_p>; -def Hexagon_S2_lsl_r_p: +def HEXAGON_S2_lsl_r_p: di_SInst_disi <"lsl", int_hexagon_S2_lsl_r_p>; -def Hexagon_S2_lsr_r_p: +def HEXAGON_S2_lsr_r_p: di_SInst_disi <"lsr", int_hexagon_S2_lsr_r_p>; // STYPE / SHIFT / Shift by register and accumulate. 
-def Hexagon_S2_asl_r_r_acc: +def HEXAGON_S2_asl_r_r_acc: si_SInst_sisisi_acc <"asl", int_hexagon_S2_asl_r_r_acc>; -def Hexagon_S2_asr_r_r_acc: +def HEXAGON_S2_asr_r_r_acc: si_SInst_sisisi_acc <"asr", int_hexagon_S2_asr_r_r_acc>; -def Hexagon_S2_lsl_r_r_acc: +def HEXAGON_S2_lsl_r_r_acc: si_SInst_sisisi_acc <"lsl", int_hexagon_S2_lsl_r_r_acc>; -def Hexagon_S2_lsr_r_r_acc: +def HEXAGON_S2_lsr_r_r_acc: si_SInst_sisisi_acc <"lsr", int_hexagon_S2_lsr_r_r_acc>; -def Hexagon_S2_asl_r_p_acc: +def HEXAGON_S2_asl_r_p_acc: di_SInst_didisi_acc <"asl", int_hexagon_S2_asl_r_p_acc>; -def Hexagon_S2_asr_r_p_acc: +def HEXAGON_S2_asr_r_p_acc: di_SInst_didisi_acc <"asr", int_hexagon_S2_asr_r_p_acc>; -def Hexagon_S2_lsl_r_p_acc: +def HEXAGON_S2_lsl_r_p_acc: di_SInst_didisi_acc <"lsl", int_hexagon_S2_lsl_r_p_acc>; -def Hexagon_S2_lsr_r_p_acc: +def HEXAGON_S2_lsr_r_p_acc: di_SInst_didisi_acc <"lsr", int_hexagon_S2_lsr_r_p_acc>; -def Hexagon_S2_asl_r_r_nac: +def HEXAGON_S2_asl_r_r_nac: si_SInst_sisisi_nac <"asl", int_hexagon_S2_asl_r_r_nac>; -def Hexagon_S2_asr_r_r_nac: +def HEXAGON_S2_asr_r_r_nac: si_SInst_sisisi_nac <"asr", int_hexagon_S2_asr_r_r_nac>; -def Hexagon_S2_lsl_r_r_nac: +def HEXAGON_S2_lsl_r_r_nac: si_SInst_sisisi_nac <"lsl", int_hexagon_S2_lsl_r_r_nac>; -def Hexagon_S2_lsr_r_r_nac: +def HEXAGON_S2_lsr_r_r_nac: si_SInst_sisisi_nac <"lsr", int_hexagon_S2_lsr_r_r_nac>; -def Hexagon_S2_asl_r_p_nac: +def HEXAGON_S2_asl_r_p_nac: di_SInst_didisi_nac <"asl", int_hexagon_S2_asl_r_p_nac>; -def Hexagon_S2_asr_r_p_nac: +def HEXAGON_S2_asr_r_p_nac: di_SInst_didisi_nac <"asr", int_hexagon_S2_asr_r_p_nac>; -def Hexagon_S2_lsl_r_p_nac: +def HEXAGON_S2_lsl_r_p_nac: di_SInst_didisi_nac <"lsl", int_hexagon_S2_lsl_r_p_nac>; -def Hexagon_S2_lsr_r_p_nac: +def HEXAGON_S2_lsr_r_p_nac: di_SInst_didisi_nac <"lsr", int_hexagon_S2_lsr_r_p_nac>; // STYPE / SHIFT / Shift by register and logical. 
-def Hexagon_S2_asl_r_r_and: +def HEXAGON_S2_asl_r_r_and: si_SInst_sisisi_and <"asl", int_hexagon_S2_asl_r_r_and>; -def Hexagon_S2_asr_r_r_and: +def HEXAGON_S2_asr_r_r_and: si_SInst_sisisi_and <"asr", int_hexagon_S2_asr_r_r_and>; -def Hexagon_S2_lsl_r_r_and: +def HEXAGON_S2_lsl_r_r_and: si_SInst_sisisi_and <"lsl", int_hexagon_S2_lsl_r_r_and>; -def Hexagon_S2_lsr_r_r_and: +def HEXAGON_S2_lsr_r_r_and: si_SInst_sisisi_and <"lsr", int_hexagon_S2_lsr_r_r_and>; -def Hexagon_S2_asl_r_r_or: +def HEXAGON_S2_asl_r_r_or: si_SInst_sisisi_or <"asl", int_hexagon_S2_asl_r_r_or>; -def Hexagon_S2_asr_r_r_or: +def HEXAGON_S2_asr_r_r_or: si_SInst_sisisi_or <"asr", int_hexagon_S2_asr_r_r_or>; -def Hexagon_S2_lsl_r_r_or: +def HEXAGON_S2_lsl_r_r_or: si_SInst_sisisi_or <"lsl", int_hexagon_S2_lsl_r_r_or>; -def Hexagon_S2_lsr_r_r_or: +def HEXAGON_S2_lsr_r_r_or: si_SInst_sisisi_or <"lsr", int_hexagon_S2_lsr_r_r_or>; -def Hexagon_S2_asl_r_p_and: +def HEXAGON_S2_asl_r_p_and: di_SInst_didisi_and <"asl", int_hexagon_S2_asl_r_p_and>; -def Hexagon_S2_asr_r_p_and: +def HEXAGON_S2_asr_r_p_and: di_SInst_didisi_and <"asr", int_hexagon_S2_asr_r_p_and>; -def Hexagon_S2_lsl_r_p_and: +def HEXAGON_S2_lsl_r_p_and: di_SInst_didisi_and <"lsl", int_hexagon_S2_lsl_r_p_and>; -def Hexagon_S2_lsr_r_p_and: +def HEXAGON_S2_lsr_r_p_and: di_SInst_didisi_and <"lsr", int_hexagon_S2_lsr_r_p_and>; -def Hexagon_S2_asl_r_p_or: +def HEXAGON_S2_asl_r_p_or: di_SInst_didisi_or <"asl", int_hexagon_S2_asl_r_p_or>; -def Hexagon_S2_asr_r_p_or: +def HEXAGON_S2_asr_r_p_or: di_SInst_didisi_or <"asr", int_hexagon_S2_asr_r_p_or>; -def Hexagon_S2_lsl_r_p_or: +def HEXAGON_S2_lsl_r_p_or: di_SInst_didisi_or <"lsl", int_hexagon_S2_lsl_r_p_or>; -def Hexagon_S2_lsr_r_p_or: +def HEXAGON_S2_lsr_r_p_or: di_SInst_didisi_or <"lsr", int_hexagon_S2_lsr_r_p_or>; // STYPE / SHIFT / Shift by register with saturation. -def Hexagon_S2_asl_r_r_sat: +def HEXAGON_S2_asl_r_r_sat: si_SInst_sisi_sat <"asl", int_hexagon_S2_asl_r_r_sat>; -def Hexagon_S2_asr_r_r_sat: +def HEXAGON_S2_asr_r_r_sat: si_SInst_sisi_sat <"asr", int_hexagon_S2_asr_r_r_sat>; // STYPE / SHIFT / Table Index. -def HEXAGON_S2_tableidxb_goodsyntax: +def Hexagon_S2_tableidxb_goodsyntax: si_MInst_sisiu4u5 <"tableidxb",int_hexagon_S2_tableidxb_goodsyntax>; -def HEXAGON_S2_tableidxd_goodsyntax: +def Hexagon_S2_tableidxd_goodsyntax: si_MInst_sisiu4u5 <"tableidxd",int_hexagon_S2_tableidxd_goodsyntax>; -def HEXAGON_S2_tableidxh_goodsyntax: +def Hexagon_S2_tableidxh_goodsyntax: si_MInst_sisiu4u5 <"tableidxh",int_hexagon_S2_tableidxh_goodsyntax>; -def HEXAGON_S2_tableidxw_goodsyntax: +def Hexagon_S2_tableidxw_goodsyntax: si_MInst_sisiu4u5 <"tableidxw",int_hexagon_S2_tableidxw_goodsyntax>; @@ -3396,29 +3432,29 @@ def HEXAGON_S2_tableidxw_goodsyntax: // STYPE / VH / Vector absolute value halfwords. // Rdd64=vabsh(Rss64) -def Hexagon_A2_vabsh: +def HEXAGON_A2_vabsh: di_SInst_di <"vabsh", int_hexagon_A2_vabsh>; -def Hexagon_A2_vabshsat: +def HEXAGON_A2_vabshsat: di_SInst_di_sat <"vabsh", int_hexagon_A2_vabshsat>; // STYPE / VH / Vector shift halfwords by immediate. // Rdd64=v[asl/asr/lsr]h(Rss64,Rt32) -def Hexagon_S2_asl_i_vh: +def HEXAGON_S2_asl_i_vh: di_SInst_disi <"vaslh", int_hexagon_S2_asl_i_vh>; -def Hexagon_S2_asr_i_vh: +def HEXAGON_S2_asr_i_vh: di_SInst_disi <"vasrh", int_hexagon_S2_asr_i_vh>; -def Hexagon_S2_lsr_i_vh: +def HEXAGON_S2_lsr_i_vh: di_SInst_disi <"vlsrh", int_hexagon_S2_lsr_i_vh>; // STYPE / VH / Vector shift halfwords by register. 
// Rdd64=v[asl/asr/lsl/lsr]w(Rss64,Rt32) -def Hexagon_S2_asl_r_vh: +def HEXAGON_S2_asl_r_vh: di_SInst_disi <"vaslh", int_hexagon_S2_asl_r_vh>; -def Hexagon_S2_asr_r_vh: +def HEXAGON_S2_asr_r_vh: di_SInst_disi <"vasrh", int_hexagon_S2_asr_r_vh>; -def Hexagon_S2_lsl_r_vh: +def HEXAGON_S2_lsl_r_vh: di_SInst_disi <"vlslh", int_hexagon_S2_lsl_r_vh>; -def Hexagon_S2_lsr_r_vh: +def HEXAGON_S2_lsr_r_vh: di_SInst_disi <"vlsrh", int_hexagon_S2_lsr_r_vh>; @@ -3427,36 +3463,41 @@ def Hexagon_S2_lsr_r_vh: *********************************************************************/ // STYPE / VW / Vector absolute value words. -def Hexagon_A2_vabsw: +def HEXAGON_A2_vabsw: di_SInst_di <"vabsw", int_hexagon_A2_vabsw>; -def Hexagon_A2_vabswsat: +def HEXAGON_A2_vabswsat: di_SInst_di_sat <"vabsw", int_hexagon_A2_vabswsat>; // STYPE / VW / Vector shift words by immediate. // Rdd64=v[asl/vsl]w(Rss64,Rt32) -def Hexagon_S2_asl_i_vw: +def HEXAGON_S2_asl_i_vw: di_SInst_disi <"vaslw", int_hexagon_S2_asl_i_vw>; -def Hexagon_S2_asr_i_vw: +def HEXAGON_S2_asr_i_vw: di_SInst_disi <"vasrw", int_hexagon_S2_asr_i_vw>; -def Hexagon_S2_lsr_i_vw: +def HEXAGON_S2_lsr_i_vw: di_SInst_disi <"vlsrw", int_hexagon_S2_lsr_i_vw>; // STYPE / VW / Vector shift words by register. // Rdd64=v[asl/vsl]w(Rss64,Rt32) -def Hexagon_S2_asl_r_vw: +def HEXAGON_S2_asl_r_vw: di_SInst_disi <"vaslw", int_hexagon_S2_asl_r_vw>; -def Hexagon_S2_asr_r_vw: +def HEXAGON_S2_asr_r_vw: di_SInst_disi <"vasrw", int_hexagon_S2_asr_r_vw>; -def Hexagon_S2_lsl_r_vw: +def HEXAGON_S2_lsl_r_vw: di_SInst_disi <"vlslw", int_hexagon_S2_lsl_r_vw>; -def Hexagon_S2_lsr_r_vw: +def HEXAGON_S2_lsr_r_vw: di_SInst_disi <"vlsrw", int_hexagon_S2_lsr_r_vw>; // STYPE / VW / Vector shift words with truncate and pack. -def Hexagon_S2_asr_r_svw_trun: +def HEXAGON_S2_asr_r_svw_trun: si_SInst_disi <"vasrw", int_hexagon_S2_asr_r_svw_trun>; -def Hexagon_S2_asr_i_svw_trun: +def HEXAGON_S2_asr_i_svw_trun: si_SInst_diu5 <"vasrw", int_hexagon_S2_asr_i_svw_trun>; +// LD / Circular loads. 
+def HEXAGON_circ_ldd: + di_LDInstPI_diu4 <"circ_ldd", int_hexagon_circ_ldd>; + include "HexagonIntrinsicsV3.td" include "HexagonIntrinsicsV4.td" +include "HexagonIntrinsicsV5.td" diff --git a/lib/Target/Hexagon/HexagonIntrinsicsDerived.td b/lib/Target/Hexagon/HexagonIntrinsicsDerived.td index 68eaf68..2788101 100644 --- a/lib/Target/Hexagon/HexagonIntrinsicsDerived.td +++ b/lib/Target/Hexagon/HexagonIntrinsicsDerived.td @@ -12,18 +12,28 @@ // Optimized with intrinisics accumulates // def : Pat <(mul DoubleRegs:$src1, DoubleRegs:$src2), - (COMBINE_rr - (Hexagon_M2_maci - (Hexagon_M2_maci (EXTRACT_SUBREG (MPYU64 (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg), - (EXTRACT_SUBREG DoubleRegs:$src2, subreg_loreg)), - subreg_hireg), - (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg), - (EXTRACT_SUBREG DoubleRegs:$src2, subreg_hireg)), - (EXTRACT_SUBREG DoubleRegs:$src2, subreg_loreg), - (EXTRACT_SUBREG DoubleRegs:$src1, subreg_hireg)), - (EXTRACT_SUBREG (MPYU64 (EXTRACT_SUBREG DoubleRegs:$src1, subreg_loreg), - (EXTRACT_SUBREG DoubleRegs:$src2, subreg_loreg)), - subreg_loreg))>; + (i64 + (COMBINE_rr + (HEXAGON_M2_maci + (HEXAGON_M2_maci + (i32 + (EXTRACT_SUBREG + (i64 + (MPYU64 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), + subreg_loreg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), + subreg_loreg)))), + subreg_hireg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_hireg))), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_loreg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_hireg))), + (i32 + (EXTRACT_SUBREG + (i64 + (MPYU64 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg)), + (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), + subreg_loreg)))), subreg_loreg))))>; diff --git a/lib/Target/Hexagon/HexagonIntrinsicsV5.td b/lib/Target/Hexagon/HexagonIntrinsicsV5.td new file mode 100644 index 0000000..1d44b52 --- /dev/null +++ b/lib/Target/Hexagon/HexagonIntrinsicsV5.td @@ -0,0 +1,395 @@ +class sf_SInst_sf<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1), + !strconcat("$dst = ", !strconcat(opc , "($src1)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1))]>; + +class si_SInst_sf<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1), + !strconcat("$dst = ", !strconcat(opc , "($src1)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1))]>; + +class sf_SInst_si<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1), + !strconcat("$dst = ", !strconcat(opc , "($src1)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1))]>; + +class sf_SInst_di<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1), + !strconcat("$dst = ", !strconcat(opc , "($src1)")), + [(set IntRegs:$dst, (IntID DoubleRegs:$src1))]>; + +class sf_SInst_df<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1), + !strconcat("$dst = ", !strconcat(opc , "($src1)")), + [(set IntRegs:$dst, (IntID DoubleRegs:$src1))]>; + +class si_SInst_df<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1), + !strconcat("$dst = ", !strconcat(opc , "($src1)")), + [(set IntRegs:$dst, (IntID DoubleRegs:$src1))]>; + +class df_SInst_sf<string opc, Intrinsic IntID> + : SInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1), + !strconcat("$dst = ", !strconcat(opc , "($src1)")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1))]>; + +class di_SInst_sf<string opc, Intrinsic IntID> + : SInst<(outs DoubleRegs:$dst), (ins 
IntRegs:$src1), + !strconcat("$dst = ", !strconcat(opc , "($src1)")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1))]>; + +class df_SInst_si<string opc, Intrinsic IntID> + : SInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1), + !strconcat("$dst = ", !strconcat(opc , "($src1)")), + [(set DoubleRegs:$dst, (IntID IntRegs:$src1))]>; + +class df_SInst_df<string opc, Intrinsic IntID> + : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1), + !strconcat("$dst = ", !strconcat(opc , "($src1)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1))]>; + +class di_SInst_df<string opc, Intrinsic IntID> + : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1), + !strconcat("$dst = ", !strconcat(opc , "($src1)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1))]>; + + +class df_SInst_di<string opc, Intrinsic IntID> + : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1), + !strconcat("$dst = ", !strconcat(opc , "($src1)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1))]>; + +class sf_MInst_sfsf<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class df_MInst_dfdf<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>; + +class qi_ALU64_dfdf<string opc, Intrinsic IntID> + : ALU64_rr<(outs PredRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set PredRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>; + +class qi_ALU64_dfu5<string opc, Intrinsic IntID> + : ALU64_ri<(outs PredRegs:$dst), (ins DoubleRegs:$src1, u5Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")), + [(set PredRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>; + + +class sf_MInst_sfsfsf_acc<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, + IntRegs:$dst2), + !strconcat("$dst += ", !strconcat(opc , + "($src1, $src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, + IntRegs:$src2, IntRegs:$dst2))], + "$dst2 = $dst">; + +class sf_MInst_sfsfsf_nac<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, + IntRegs:$dst2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1, $src2)")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, + IntRegs:$src2, IntRegs:$dst2))], + "$dst2 = $dst">; + + +class sf_MInst_sfsfsfsi_sc<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2, IntRegs:$src3), + !strconcat("$dst += ", !strconcat(opc , + "($src1, $src2, $src3):scale")), + [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1, + IntRegs:$src2, IntRegs:$src3))], + "$dst2 = $dst">; + +class sf_MInst_sfsfsf_acc_lib<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, + IntRegs:$dst2), + !strconcat("$dst += ", !strconcat(opc , + "($src1, $src2):lib")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, + IntRegs:$src2, IntRegs:$dst2))], + "$dst2 = $dst">; + +class sf_MInst_sfsfsf_nac_lib<string opc, Intrinsic IntID> + : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, + IntRegs:$dst2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1, $src2):lib")), + [(set IntRegs:$dst, (IntID IntRegs:$src1, + 
IntRegs:$src2, IntRegs:$dst2))], + "$dst2 = $dst">; + +class df_MInst_dfdfdf_acc<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2, + DoubleRegs:$dst2), + !strconcat("$dst += ", !strconcat(opc , + "($src1, $src2)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, + DoubleRegs:$src2, DoubleRegs:$dst2))], + "$dst2 = $dst">; + +class df_MInst_dfdfdf_nac<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2, + DoubleRegs:$dst2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1, $src2)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, + DoubleRegs:$src2, DoubleRegs:$dst2))], + "$dst2 = $dst">; + + +class df_MInst_dfdfdfsi_sc<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1, + DoubleRegs:$src2, IntRegs:$src3), + !strconcat("$dst += ", !strconcat(opc , + "($src1, $src2, $src3):scale")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1, + DoubleRegs:$src2, IntRegs:$src3))], + "$dst2 = $dst">; + +class df_MInst_dfdfdf_acc_lib<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2, + DoubleRegs:$dst2), + !strconcat("$dst += ", !strconcat(opc , + "($src1, $src2):lib")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, + DoubleRegs:$src2, DoubleRegs:$dst2))], + "$dst2 = $dst">; + +class df_MInst_dfdfdf_nac_lib<string opc, Intrinsic IntID> + : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2, + DoubleRegs:$dst2), + !strconcat("$dst -= ", !strconcat(opc , + "($src1, $src2):lib")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, + DoubleRegs:$src2, DoubleRegs:$dst2))], + "$dst2 = $dst">; + +class qi_SInst_sfsf<string opc, Intrinsic IntID> + : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")), + [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>; + +class qi_SInst_sfu5<string opc, Intrinsic IntID> + : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")), + [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>; + +class sf_ALU64_u10_pos<string opc, Intrinsic IntID> + : ALU64_ri<(outs IntRegs:$dst), (ins u10Imm:$src1), + !strconcat("$dst = ", !strconcat(opc , "#$src1):pos")), + [(set IntRegs:$dst, (IntID imm:$src1))]>; + +class sf_ALU64_u10_neg<string opc, Intrinsic IntID> + : ALU64_ri<(outs IntRegs:$dst), (ins u10Imm:$src1), + !strconcat("$dst = ", !strconcat(opc , "#$src1):neg")), + [(set IntRegs:$dst, (IntID imm:$src1))]>; + +class df_ALU64_u10_pos<string opc, Intrinsic IntID> + : ALU64_ri<(outs DoubleRegs:$dst), (ins u10Imm:$src1), + !strconcat("$dst = ", !strconcat(opc , "#$src1):pos")), + [(set DoubleRegs:$dst, (IntID imm:$src1))]>; + +class df_ALU64_u10_neg<string opc, Intrinsic IntID> + : ALU64_ri<(outs DoubleRegs:$dst), (ins u10Imm:$src1), + !strconcat("$dst = ", !strconcat(opc , "#$src1):neg")), + [(set DoubleRegs:$dst, (IntID imm:$src1))]>; + +class di_MInst_diu6<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>; + +class di_MInst_diu4_rnd<string opc, Intrinsic IntID> + : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u4Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, 
#$src2):rnd")), + [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>; + +class si_MInst_diu4_rnd_sat<string opc, Intrinsic IntID> + : MInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, u4Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2):rnd:sat")), + [(set IntRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>; + +class si_SInst_diu4_sat<string opc, Intrinsic IntID> + : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, u4Imm:$src2), + !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2):sat")), + [(set IntRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>; + + +def HEXAGON_C4_fastcorner9: + qi_SInst_qiqi <"fastcorner9", int_hexagon_C4_fastcorner9>; +def HEXAGON_C4_fastcorner9_not: + qi_SInst_qiqi <"!fastcorner9", int_hexagon_C4_fastcorner9_not>; +def HEXAGON_M5_vrmpybuu: + di_MInst_didi <"vrmpybu", int_hexagon_M5_vrmpybuu>; +def HEXAGON_M5_vrmacbuu: + di_MInst_dididi_acc <"vrmpybu", int_hexagon_M5_vrmacbuu>; +def HEXAGON_M5_vrmpybsu: + di_MInst_didi <"vrmpybsu", int_hexagon_M5_vrmpybsu>; +def HEXAGON_M5_vrmacbsu: + di_MInst_dididi_acc <"vrmpybsu", int_hexagon_M5_vrmacbsu>; +def HEXAGON_M5_vmpybuu: + di_MInst_sisi <"vmpybu", int_hexagon_M5_vmpybuu>; +def HEXAGON_M5_vmpybsu: + di_MInst_sisi <"vmpybsu", int_hexagon_M5_vmpybsu>; +def HEXAGON_M5_vmacbuu: + di_MInst_disisi_acc <"vmpybu", int_hexagon_M5_vmacbuu>; +def HEXAGON_M5_vmacbsu: + di_MInst_disisi_acc <"vmpybsu", int_hexagon_M5_vmacbsu>; +def HEXAGON_M5_vdmpybsu: + di_MInst_didi_sat <"vdmpybsu", int_hexagon_M5_vdmpybsu>; +def HEXAGON_M5_vdmacbsu: + di_MInst_dididi_acc_sat <"vdmpybsu", int_hexagon_M5_vdmacbsu>; +def HEXAGON_A5_vaddhubs: + si_SInst_didi_sat <"vaddhub", int_hexagon_A5_vaddhubs>; +def HEXAGON_S5_popcountp: + si_SInst_di <"popcount", int_hexagon_S5_popcountp>; +def HEXAGON_S5_asrhub_rnd_sat_goodsyntax: + si_MInst_diu4_rnd_sat <"vasrhub", int_hexagon_S5_asrhub_rnd_sat_goodsyntax>; +def HEXAGON_S5_asrhub_sat: + si_SInst_diu4_sat <"vasrhub", int_hexagon_S5_asrhub_sat>; +def HEXAGON_S5_vasrhrnd_goodsyntax: + di_MInst_diu4_rnd <"vasrh", int_hexagon_S5_vasrhrnd_goodsyntax>; +def HEXAGON_S2_asr_i_p_rnd: + di_SInst_diu6 <"asr", int_hexagon_S2_asr_i_p_rnd>; +def HEXAGON_S2_asr_i_p_rnd_goodsyntax: + di_MInst_diu6 <"asrrnd", int_hexagon_S2_asr_i_p_rnd_goodsyntax>; +def HEXAGON_F2_sfadd: + sf_MInst_sfsf <"sfadd", int_hexagon_F2_sfadd>; +def HEXAGON_F2_sfsub: + sf_MInst_sfsf <"sfsub", int_hexagon_F2_sfsub>; +def HEXAGON_F2_sfmpy: + sf_MInst_sfsf <"sfmpy", int_hexagon_F2_sfmpy>; +def HEXAGON_F2_sffma: + sf_MInst_sfsfsf_acc <"sfmpy", int_hexagon_F2_sffma>; +def HEXAGON_F2_sffma_sc: + sf_MInst_sfsfsfsi_sc <"sfmpy", int_hexagon_F2_sffma_sc>; +def HEXAGON_F2_sffms: + sf_MInst_sfsfsf_nac <"sfmpy", int_hexagon_F2_sffms>; +def HEXAGON_F2_sffma_lib: + sf_MInst_sfsfsf_acc_lib <"sfmpy", int_hexagon_F2_sffma_lib>; +def HEXAGON_F2_sffms_lib: + sf_MInst_sfsfsf_nac_lib <"sfmpy", int_hexagon_F2_sffms_lib>; +def HEXAGON_F2_sfcmpeq: + qi_SInst_sfsf <"sfcmp.eq", int_hexagon_F2_sfcmpeq>; +def HEXAGON_F2_sfcmpgt: + qi_SInst_sfsf <"sfcmp.gt", int_hexagon_F2_sfcmpgt>; +def HEXAGON_F2_sfcmpge: + qi_SInst_sfsf <"sfcmp.ge", int_hexagon_F2_sfcmpge>; +def HEXAGON_F2_sfcmpuo: + qi_SInst_sfsf <"sfcmp.uo", int_hexagon_F2_sfcmpuo>; +def HEXAGON_F2_sfmax: + sf_MInst_sfsf <"sfmax", int_hexagon_F2_sfmax>; +def HEXAGON_F2_sfmin: + sf_MInst_sfsf <"sfmin", int_hexagon_F2_sfmin>; +def HEXAGON_F2_sfclass: + qi_SInst_sfu5 <"sfclass", int_hexagon_F2_sfclass>; +def HEXAGON_F2_sfimm_p: + sf_ALU64_u10_pos <"sfmake", int_hexagon_F2_sfimm_p>; +def 
HEXAGON_F2_sfimm_n: + sf_ALU64_u10_neg <"sfmake", int_hexagon_F2_sfimm_n>; +def HEXAGON_F2_sffixupn: + sf_MInst_sfsf <"sffixupn", int_hexagon_F2_sffixupn>; +def HEXAGON_F2_sffixupd: + sf_MInst_sfsf <"sffixupd", int_hexagon_F2_sffixupd>; +def HEXAGON_F2_sffixupr: + sf_SInst_sf <"sffixupr", int_hexagon_F2_sffixupr>; +def HEXAGON_F2_dfadd: + df_MInst_dfdf <"dfadd", int_hexagon_F2_dfadd>; +def HEXAGON_F2_dfsub: + df_MInst_dfdf <"dfsub", int_hexagon_F2_dfsub>; +def HEXAGON_F2_dfmpy: + df_MInst_dfdf <"dfmpy", int_hexagon_F2_dfmpy>; +def HEXAGON_F2_dffma: + df_MInst_dfdfdf_acc <"dfmpy", int_hexagon_F2_dffma>; +def HEXAGON_F2_dffms: + df_MInst_dfdfdf_nac <"dfmpy", int_hexagon_F2_dffms>; +def HEXAGON_F2_dffma_lib: + df_MInst_dfdfdf_acc_lib <"dfmpy", int_hexagon_F2_dffma_lib>; +def HEXAGON_F2_dffms_lib: + df_MInst_dfdfdf_nac_lib <"dfmpy", int_hexagon_F2_dffms_lib>; +def HEXAGON_F2_dffma_sc: + df_MInst_dfdfdfsi_sc <"dfmpy", int_hexagon_F2_dffma_sc>; +def HEXAGON_F2_dfmax: + df_MInst_dfdf <"dfmax", int_hexagon_F2_dfmax>; +def HEXAGON_F2_dfmin: + df_MInst_dfdf <"dfmin", int_hexagon_F2_dfmin>; +def HEXAGON_F2_dfcmpeq: + qi_ALU64_dfdf <"dfcmp.eq", int_hexagon_F2_dfcmpeq>; +def HEXAGON_F2_dfcmpgt: + qi_ALU64_dfdf <"dfcmp.gt", int_hexagon_F2_dfcmpgt>; +def HEXAGON_F2_dfcmpge: + qi_ALU64_dfdf <"dfcmp.ge", int_hexagon_F2_dfcmpge>; +def HEXAGON_F2_dfcmpuo: + qi_ALU64_dfdf <"dfcmp.uo", int_hexagon_F2_dfcmpuo>; +def HEXAGON_F2_dfclass: + qi_ALU64_dfu5 <"dfclass", int_hexagon_F2_dfclass>; +def HEXAGON_F2_dfimm_p: + df_ALU64_u10_pos <"dfmake", int_hexagon_F2_dfimm_p>; +def HEXAGON_F2_dfimm_n: + df_ALU64_u10_neg <"dfmake", int_hexagon_F2_dfimm_n>; +def HEXAGON_F2_dffixupn: + df_MInst_dfdf <"dffixupn", int_hexagon_F2_dffixupn>; +def HEXAGON_F2_dffixupd: + df_MInst_dfdf <"dffixupd", int_hexagon_F2_dffixupd>; +def HEXAGON_F2_dffixupr: + df_SInst_df <"dffixupr", int_hexagon_F2_dffixupr>; +def HEXAGON_F2_conv_sf2df: + df_SInst_sf <"convert_sf2df", int_hexagon_F2_conv_sf2df>; +def HEXAGON_F2_conv_df2sf: + sf_SInst_df <"convert_df2sf", int_hexagon_F2_conv_df2sf>; +def HEXAGON_F2_conv_uw2sf: + sf_SInst_si <"convert_uw2sf", int_hexagon_F2_conv_uw2sf>; +def HEXAGON_F2_conv_uw2df: + df_SInst_si <"convert_uw2df", int_hexagon_F2_conv_uw2df>; +def HEXAGON_F2_conv_w2sf: + sf_SInst_si <"convert_w2sf", int_hexagon_F2_conv_w2sf>; +def HEXAGON_F2_conv_w2df: + df_SInst_si <"convert_w2df", int_hexagon_F2_conv_w2df>; +def HEXAGON_F2_conv_ud2sf: + sf_SInst_di <"convert_ud2sf", int_hexagon_F2_conv_ud2sf>; +def HEXAGON_F2_conv_ud2df: + df_SInst_di <"convert_ud2df", int_hexagon_F2_conv_ud2df>; +def HEXAGON_F2_conv_d2sf: + sf_SInst_di <"convert_d2sf", int_hexagon_F2_conv_d2sf>; +def HEXAGON_F2_conv_d2df: + df_SInst_di <"convert_d2df", int_hexagon_F2_conv_d2df>; +def HEXAGON_F2_conv_sf2uw: + si_SInst_sf <"convert_sf2uw", int_hexagon_F2_conv_sf2uw>; +def HEXAGON_F2_conv_sf2w: + si_SInst_sf <"convert_sf2w", int_hexagon_F2_conv_sf2w>; +def HEXAGON_F2_conv_sf2ud: + di_SInst_sf <"convert_sf2ud", int_hexagon_F2_conv_sf2ud>; +def HEXAGON_F2_conv_sf2d: + di_SInst_sf <"convert_sf2d", int_hexagon_F2_conv_sf2d>; +def HEXAGON_F2_conv_df2uw: + si_SInst_df <"convert_df2uw", int_hexagon_F2_conv_df2uw>; +def HEXAGON_F2_conv_df2w: + si_SInst_df <"convert_df2w", int_hexagon_F2_conv_df2w>; +def HEXAGON_F2_conv_df2ud: + di_SInst_df <"convert_df2ud", int_hexagon_F2_conv_df2ud>; +def HEXAGON_F2_conv_df2d: + di_SInst_df <"convert_df2d", int_hexagon_F2_conv_df2d>; +def HEXAGON_F2_conv_sf2uw_chop: + si_SInst_sf <"convert_sf2uw", int_hexagon_F2_conv_sf2uw_chop>; +def 
HEXAGON_F2_conv_sf2w_chop: + si_SInst_sf <"convert_sf2w", int_hexagon_F2_conv_sf2w_chop>; +def HEXAGON_F2_conv_sf2ud_chop: + di_SInst_sf <"convert_sf2ud", int_hexagon_F2_conv_sf2ud_chop>; +def HEXAGON_F2_conv_sf2d_chop: + di_SInst_sf <"convert_sf2d", int_hexagon_F2_conv_sf2d_chop>; +def HEXAGON_F2_conv_df2uw_chop: + si_SInst_df <"convert_df2uw", int_hexagon_F2_conv_df2uw_chop>; +def HEXAGON_F2_conv_df2w_chop: + si_SInst_df <"convert_df2w", int_hexagon_F2_conv_df2w_chop>; +def HEXAGON_F2_conv_df2ud_chop: + di_SInst_df <"convert_df2ud", int_hexagon_F2_conv_df2ud_chop>; +def HEXAGON_F2_conv_df2d_chop: + di_SInst_df <"convert_df2d", int_hexagon_F2_conv_df2d_chop>; diff --git a/lib/Target/Hexagon/HexagonMCInst.h b/lib/Target/Hexagon/HexagonMCInst.h new file mode 100644 index 0000000..7a16c24 --- /dev/null +++ b/lib/Target/Hexagon/HexagonMCInst.h @@ -0,0 +1,41 @@ +//===- HexagonMCInst.h - Hexagon sub-class of MCInst ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class extends MCInst to allow some VLIW annotation. +// +//===----------------------------------------------------------------------===// + +#ifndef HEXAGONMCINST_H +#define HEXAGONMCINST_H + +#include "llvm/MC/MCInst.h" +#include "llvm/CodeGen/MachineInstr.h" + +namespace llvm { + class HexagonMCInst: public MCInst { + // Packet start and end markers + unsigned startPacket: 1, endPacket: 1; + const MachineInstr *MachineI; + public: + explicit HexagonMCInst(): MCInst(), + startPacket(0), endPacket(0) {} + + const MachineInstr* getMI() const { return MachineI; } + + void setMI(const MachineInstr *MI) { MachineI = MI; } + + bool isStartPacket() const { return (startPacket); } + bool isEndPacket() const { return (endPacket); } + + void setStartPacket(bool yes) { startPacket = yes; } + void setEndPacket(bool yes) { endPacket = yes; } + }; +} + +#endif diff --git a/lib/Target/Hexagon/HexagonMCInstLower.cpp b/lib/Target/Hexagon/HexagonMCInstLower.cpp index fbb331b..70bddcc 100644 --- a/lib/Target/Hexagon/HexagonMCInstLower.cpp +++ b/lib/Target/Hexagon/HexagonMCInstLower.cpp @@ -49,7 +49,7 @@ void llvm::HexagonLowerToMC(const MachineInstr* MI, MCInst& MCI, switch (MO.getType()) { default: MI->dump(); - assert(0 && "unknown operand type"); + llvm_unreachable("unknown operand type"); case MachineOperand::MO_Register: // Ignore all implicit register operands. if (MO.isImplicit()) continue; diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp new file mode 100644 index 0000000..7ece408 --- /dev/null +++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp @@ -0,0 +1,647 @@ +//===----- HexagonNewValueJump.cpp - Hexagon Backend New Value Jump -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This implements NewValueJump pass in Hexagon. +// Ideally, we should merge this as a Peephole pass prior to register +// allocation, but because we have a spill in between the feeder and new value +// jump instructions, we are forced to write after register allocation. +// Having said that, we should re-attempt to pull this earlier at some point +// in future. 
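+//
+// A minimal sketch of how this pass is expected to be scheduled (the exact
+// pass-manager hook is an assumption; createHexagonNewValueJump() is the
+// factory defined at the bottom of this file):
+//
+//   PM.add(createHexagonNewValueJump());  // after register allocation
+//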
+
+// The basic approach looks for a sequence of a predicated jump, the compare
+// instruction that generates the predicate, and the feeder to the compare.
+// Once it finds all of them, it collapses the compare and jump instructions
+// into a new-value jump instruction.
+//
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "hexagon-nvj"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonMachineFunctionInfo.h"
+
+#include <map>
+
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+STATISTIC(NumNVJGenerated, "Number of New Value Jump Instructions created");
+
+static cl::opt<int>
+DbgNVJCount("nvj-count", cl::init(-1), cl::Hidden, cl::desc(
+  "Maximum number of predicated jumps to be converted to New Value Jump"));
+
+static cl::opt<bool> DisableNewValueJumps("disable-nvjump", cl::Hidden,
+                                          cl::ZeroOrMore, cl::init(false),
+                                          cl::desc("Disable New Value Jumps"));
+
+namespace {
+  struct HexagonNewValueJump : public MachineFunctionPass {
+    const HexagonInstrInfo *QII;
+    const HexagonRegisterInfo *QRI;
+
+  public:
+    static char ID;
+
+    HexagonNewValueJump() : MachineFunctionPass(ID) { }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    const char *getPassName() const {
+      return "Hexagon NewValueJump";
+    }
+
+    virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+  private:
+
+  };
+
+} // end of anonymous namespace
+
+char HexagonNewValueJump::ID = 0;
+
+// We have identified that this II could be a feeder to NVJ;
+// verify that it can be.
+static bool canBeFeederToNewValueJump(const HexagonInstrInfo *QII,
+                                      const TargetRegisterInfo *TRI,
+                                      MachineBasicBlock::iterator II,
+                                      MachineBasicBlock::iterator end,
+                                      MachineBasicBlock::iterator skip,
+                                      MachineFunction &MF) {
+
+  // A predicated instruction cannot be a feeder to NVJ.
+  if (QII->isPredicated(II))
+    return false;
+
+  // Bail out if feederReg is a paired register (double regs in
+  // our case). One would think that we can check to see if a given
+  // register cmpReg1 or cmpReg2 is a sub register of feederReg
+  // using -- if (QRI->isSubRegister(feederReg, cmpReg1)) -- logic
+  // before the call site of this function, but we cannot, as the code
+  // comes in the following fashion:
+  // %D0<def> = Hexagon_S2_lsr_r_p %D0<kill>, %R2<kill>
+  // %R0<def> = KILL %R0, %D0<imp-use,kill>
+  // %P0<def> = CMPEQri %R0<kill>, 0
+  // Hence, we need to check if it's a KILL instruction.
+  if (II->getOpcode() == TargetOpcode::KILL)
+    return false;
+
+
+  // Make sure there is no 'def' or 'use' of any of the uses of the
+  // feeder insn between its definition, this MI, and the jump, jmpInst,
+  // skipping the compare, cmpInst.
+  // Here's the example.
+  // r21=memub(r22+r24<<#0)
+  // p0 = cmp.eq(r21, #0)
+  // r4=memub(r3+r21<<#0)
+  // if (p0.new) jump:t .LBB29_45
+  // Without this check, it will be converted into
+  // r4=memub(r3+r21<<#0)
+  // r21=memub(r22+r24<<#0)
+  // p0 = cmp.eq(r21, #0)
+  // if (p0.new) jump:t .LBB29_45
+  // and would result in WAR hazards if converted to a New Value Jump.
+
+  for (unsigned i = 0; i < II->getNumOperands(); ++i) {
+    if (II->getOperand(i).isReg() &&
+        (II->getOperand(i).isUse() || II->getOperand(i).isDef())) {
+      MachineBasicBlock::iterator localII = II;
+      ++localII;
+      unsigned Reg = II->getOperand(i).getReg();
+      for (MachineBasicBlock::iterator localBegin = localII;
+           localBegin != end; ++localBegin) {
+        if (localBegin == skip) continue;
+        // Check for subregisters too.
+        if (localBegin->modifiesRegister(Reg, TRI) ||
+            localBegin->readsRegister(Reg, TRI))
+          return false;
+      }
+    }
+  }
+  return true;
+}
+
+// These are the common checks that need to be performed
+// to determine if
+// 1. the compare instruction can be moved before the jump.
+// 2. the feeder to the compare instruction can be moved before the jump.
+static bool commonChecksToProhibitNewValueJump(bool afterRA,
+                                       MachineBasicBlock::iterator MII) {
+
+  // If a store is in the path, bail out.
+  if (MII->getDesc().mayStore())
+    return false;
+
+  // If a call is in the path, bail out.
+  if (MII->getOpcode() == Hexagon::CALLv3)
+    return false;
+
+  // If NVJ is running prior to RA, do the following checks.
+  if (!afterRA) {
+    // The following Target Opcode instructions are spurious
+    // to new value jump. If they are in the path, bail out.
+    // KILL sets the kill flag on the opcode. It also sets up a
+    // single register, out of a pair.
+    // %D0<def> = Hexagon_S2_lsr_r_p %D0<kill>, %R2<kill>
+    // %R0<def> = KILL %R0, %D0<imp-use,kill>
+    // %P0<def> = CMPEQri %R0<kill>, 0
+    // PHI can be anything after RA.
+    // COPY can rematerialize things in between the feeder, compare and nvj.
+    if (MII->getOpcode() == TargetOpcode::KILL ||
+        MII->getOpcode() == TargetOpcode::PHI ||
+        MII->getOpcode() == TargetOpcode::COPY)
+      return false;
+
+    // The following pseudo Hexagon instructions set the "use" and "def"
+    // of registers by individual passes in the backend. At this time,
+    // we don't know the scope of usage and definitions of these
+    // instructions.
+    if (MII->getOpcode() == Hexagon::TFR_condset_rr ||
+        MII->getOpcode() == Hexagon::TFR_condset_ii ||
+        MII->getOpcode() == Hexagon::TFR_condset_ri ||
+        MII->getOpcode() == Hexagon::TFR_condset_ir ||
+        MII->getOpcode() == Hexagon::LDriw_pred ||
+        MII->getOpcode() == Hexagon::STriw_pred)
+      return false;
+  }
+
+  return true;
+}
+
+static bool canCompareBeNewValueJump(const HexagonInstrInfo *QII,
+                                     const TargetRegisterInfo *TRI,
+                                     MachineBasicBlock::iterator II,
+                                     unsigned pReg,
+                                     bool secondReg,
+                                     bool optLocation,
+                                     MachineBasicBlock::iterator end,
+                                     MachineFunction &MF) {
+
+  MachineInstr *MI = II;
+
+  // If the second operand of the compare is an imm, make sure it's in the
+  // range specified by the arch: an unsigned 5-bit value (0..31), with -1
+  // also accepted for the EQ/GT/GE register-immediate forms. CMPGEri and
+  // positive CMPGEUri are decremented first so they can reuse the GT
+  // encodings.
+  if (!secondReg) {
+    int64_t v = MI->getOperand(2).getImm();
+    if (MI->getOpcode() == Hexagon::CMPGEri ||
+        (MI->getOpcode() == Hexagon::CMPGEUri && v > 0))
+      --v;
+
+    if (!(isUInt<5>(v) ||
+          ((MI->getOpcode() == Hexagon::CMPEQri ||
+            MI->getOpcode() == Hexagon::CMPGTri ||
+            MI->getOpcode() == Hexagon::CMPGEri) &&
+           (v == -1))))
+      return false;
+  }
+
+  unsigned cmpReg1, cmpOp2 = 0; // cmpOp2 assignment silences a compiler warning.
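+
+  // What follows records the compare operands (rejecting a second operand
+  // that is just a COPY when running before RA) and then scans the window
+  // between the compare and the jump for anything that would make the
+  // transformation unsafe.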
+  cmpReg1 = MI->getOperand(1).getReg();
+
+  if (secondReg) {
+    cmpOp2 = MI->getOperand(2).getReg();
+
+    // Make sure that the second register is not from a COPY.
+    // At the machine code level, we don't need this, but if we decide
+    // to move new value jump prior to RA, we would need this.
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+    if (secondReg && !TargetRegisterInfo::isPhysicalRegister(cmpOp2)) {
+      MachineInstr *def = MRI.getVRegDef(cmpOp2);
+      if (def->getOpcode() == TargetOpcode::COPY)
+        return false;
+    }
+  }
+
+  // Walk the instructions after the compare (predicate def) to the jump,
+  // and satisfy the following conditions.
+  ++II;
+  for (MachineBasicBlock::iterator localII = II; localII != end;
+       ++localII) {
+
+    // Check 1.
+    // If the "common" checks fail, bail out.
+    if (!commonChecksToProhibitNewValueJump(optLocation, localII))
+      return false;
+
+    // Check 2.
+    // If there is a def or use of the predicate (the result of the compare),
+    // bail out.
+    if (localII->modifiesRegister(pReg, TRI) ||
+        localII->readsRegister(pReg, TRI))
+      return false;
+
+    // Check 3.
+    // If there is a def of any of the uses of the compare (the operands of
+    // the compare), bail out.
+    // E.g.
+    // p0 = cmp.eq(r2, r0)
+    // r2 = r4
+    // if (p0.new) jump:t .LBB28_3
+    if (localII->modifiesRegister(cmpReg1, TRI) ||
+        (secondReg && localII->modifiesRegister(cmpOp2, TRI)))
+      return false;
+  }
+  return true;
+}
+
+// Given a compare operator, return a matching New Value Jump
+// compare operator. Make sure that MI here is included in
+// HexagonInstrInfo.cpp::isNewValueJumpCandidate.
+static unsigned getNewValueJumpOpcode(const MachineInstr *MI, int reg,
+                                      bool secondRegNewified) {
+  switch (MI->getOpcode()) {
+    case Hexagon::CMPEQrr:
+      return Hexagon::JMP_EQrrPt_nv_V4;
+
+    case Hexagon::CMPEQri: {
+      if (reg >= 0)
+        return Hexagon::JMP_EQriPt_nv_V4;
+      else
+        return Hexagon::JMP_EQriPtneg_nv_V4;
+    }
+
+    case Hexagon::CMPLTrr:
+    case Hexagon::CMPGTrr: {
+      if (secondRegNewified)
+        return Hexagon::JMP_GTrrdnPt_nv_V4;
+      else
+        return Hexagon::JMP_GTrrPt_nv_V4;
+    }
+
+    case Hexagon::CMPGEri: {
+      if (reg >= 1)
+        return Hexagon::JMP_GTriPt_nv_V4;
+      else
+        return Hexagon::JMP_GTriPtneg_nv_V4;
+    }
+
+    case Hexagon::CMPGTri: {
+      if (reg >= 0)
+        return Hexagon::JMP_GTriPt_nv_V4;
+      else
+        return Hexagon::JMP_GTriPtneg_nv_V4;
+    }
+
+    case Hexagon::CMPLTUrr:
+    case Hexagon::CMPGTUrr: {
+      if (secondRegNewified)
+        return Hexagon::JMP_GTUrrdnPt_nv_V4;
+      else
+        return Hexagon::JMP_GTUrrPt_nv_V4;
+    }
+
+    case Hexagon::CMPGTUri:
+      return Hexagon::JMP_GTUriPt_nv_V4;
+
+    case Hexagon::CMPGEUri: {
+      if (reg == 0)
+        return Hexagon::JMP_EQrrPt_nv_V4;
+      else
+        return Hexagon::JMP_GTUriPt_nv_V4;
+    }
+
+    default:
+      llvm_unreachable("Could not find matching New Value Jump instruction.");
+  }
+  // return *some value* to avoid compiler warning
+  return 0;
+}
+
+bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
+
+  DEBUG(dbgs() << "********** Hexagon New Value Jump **********\n"
+               << "********** Function: "
+               << MF.getFunction()->getName() << "\n");
+
+#if 0
+  // For now disable this; if we move NewValueJump before register
+  // allocation we will need this information.
+  LiveVariables &LVs = getAnalysis<LiveVariables>();
+#endif
+
+  QII = static_cast<const HexagonInstrInfo *>(MF.getTarget().getInstrInfo());
+  QRI =
+    static_cast<const HexagonRegisterInfo *>(MF.getTarget().getRegisterInfo());
+
+  if (!QRI->Subtarget.hasV4TOps() ||
+      DisableNewValueJumps) {
+    return false;
+  }
+
+  int nvjCount = DbgNVJCount;
+  int nvjGenerated = 0;
+
+  // Loop through all the bb's of the function.
+  for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end();
+       MBBb != MBBe; ++MBBb) {
+    MachineBasicBlock* MBB = MBBb;
+
+    DEBUG(dbgs() << "** dumping bb ** "
+                 << MBB->getNumber() << "\n");
+    DEBUG(MBB->dump());
+    DEBUG(dbgs() << "\n" << "********** dumping instr bottom up **********\n");
+    bool foundJump = false;
+    bool foundCompare = false;
+    bool invertPredicate = false;
+    unsigned predReg = 0; // predicate reg of the jump.
+    unsigned cmpReg1 = 0;
+    int cmpOp2 = 0;
+    bool MO1IsKill = false;
+    bool MO2IsKill = false;
+    MachineBasicBlock::iterator jmpPos;
+    MachineBasicBlock::iterator cmpPos;
+    MachineInstr *cmpInstr = NULL, *jmpInstr = NULL;
+    MachineBasicBlock *jmpTarget = NULL;
+    bool afterRA = false;
+    bool isSecondOpReg = false;
+    bool isSecondOpNewified = false;
+    // Traverse the basic block - bottom up.
+    for (MachineBasicBlock::iterator MII = MBB->end(), E = MBB->begin();
+         MII != E;) {
+      MachineInstr *MI = --MII;
+      if (MI->isDebugValue()) {
+        continue;
+      }
+
+      if ((nvjCount == 0) || (nvjCount > -1 && nvjCount <= nvjGenerated))
+        break;
+
+      DEBUG(dbgs() << "Instr: "; MI->dump(); dbgs() << "\n");
+
+      if (!foundJump &&
+         (MI->getOpcode() == Hexagon::JMP_c ||
+          MI->getOpcode() == Hexagon::JMP_cNot ||
+          MI->getOpcode() == Hexagon::JMP_cdnPt ||
+          MI->getOpcode() == Hexagon::JMP_cdnPnt ||
+          MI->getOpcode() == Hexagon::JMP_cdnNotPt ||
+          MI->getOpcode() == Hexagon::JMP_cdnNotPnt)) {
+        // This is where you would insert your compare and
+        // the instr that feeds the compare.
+        jmpPos = MII;
+        jmpInstr = MI;
+        predReg = MI->getOperand(0).getReg();
+        afterRA = TargetRegisterInfo::isPhysicalRegister(predReg);
+
+        // If the ifconverter had not messed up the kill flags of the
+        // operands, the following check on the kill flag would suffice.
+        // if(!jmpInstr->getOperand(0).isKill()) break;
+
+        // Checking whether this predicate register is live out of the BB
+        // would only work if we could actually use Live
+        // variable analysis on phys regs - but LLVM does not
+        // provide LV analysis on phys regs.
+        //if(LVs.isLiveOut(predReg, *MBB)) break;
+
+        // Get all the successors of this block - which will always
+        // be 2. Check if the predicate register is live in to those
+        // successors. If yes, we cannot delete the predicate -
+        // I am doing this only because LLVM does not provide LiveOut
+        // at the BB level.
+        bool predLive = false;
+        for (MachineBasicBlock::const_succ_iterator SI = MBB->succ_begin(),
+             SIE = MBB->succ_end(); SI != SIE; ++SI) {
+          MachineBasicBlock* succMBB = *SI;
+          if (succMBB->isLiveIn(predReg)) {
+            predLive = true;
+          }
+        }
+        if (predLive)
+          break;
+
+        jmpTarget = MI->getOperand(1).getMBB();
+        foundJump = true;
+        if (MI->getOpcode() == Hexagon::JMP_cNot ||
+            MI->getOpcode() == Hexagon::JMP_cdnNotPt ||
+            MI->getOpcode() == Hexagon::JMP_cdnNotPnt) {
+          invertPredicate = true;
+        }
+        continue;
+      }
+
+      // No new value jump if there is a barrier. A barrier has to be in its
+      // own packet. A barrier has zero operands. We conservatively bail out
+      // here if we see any instruction with zero operands.
+      if (foundJump && MI->getNumOperands() == 0)
+        break;
+
+      if (foundJump &&
+          !foundCompare &&
+          MI->getOperand(0).isReg() &&
+          MI->getOperand(0).getReg() == predReg) {
+
+        // Not all compares can become new value compares. Arch Spec: 7.6.1.1
+        if (QII->isNewValueJumpCandidate(MI)) {
+
+          assert((MI->getDesc().isCompare()) &&
+                 "Only compare instruction can be collapsed into New Value Jump");
+          isSecondOpReg = MI->getOperand(2).isReg();
+
+          if (!canCompareBeNewValueJump(QII, QRI, MII, predReg, isSecondOpReg,
+                                        afterRA, jmpPos, MF))
+            break;
+
+          cmpInstr = MI;
+          cmpPos = MII;
+          foundCompare = true;
+
+          // We need cmpReg1 and cmpOp2 (imm or reg) while building the
+          // new value jump instruction.
+          cmpReg1 = MI->getOperand(1).getReg();
+          if (MI->getOperand(1).isKill())
+            MO1IsKill = true;
+
+          if (isSecondOpReg) {
+            cmpOp2 = MI->getOperand(2).getReg();
+            if (MI->getOperand(2).isKill())
+              MO2IsKill = true;
+          } else
+            cmpOp2 = MI->getOperand(2).getImm();
+          continue;
+        }
+      }
+
+      if (foundCompare && foundJump) {
+
+        // If the "common" checks fail, bail out on this BB.
+        if (!commonChecksToProhibitNewValueJump(afterRA, MII))
+          break;
+
+        bool foundFeeder = false;
+        MachineBasicBlock::iterator feederPos = MII;
+        if (MI->getOperand(0).isReg() &&
+            MI->getOperand(0).isDef() &&
+            (MI->getOperand(0).getReg() == cmpReg1 ||
+             (isSecondOpReg &&
+              MI->getOperand(0).getReg() == (unsigned) cmpOp2))) {
+
+          unsigned feederReg = MI->getOperand(0).getReg();
+
+          // First try to see if we can get the feeder from the first
+          // operand of the compare. If we cannot, and if secondOpReg is
+          // true (the second operand of the compare is also a register),
+          // try that one.
+          // TODO: Try to come up with a heuristic to figure out which
+          // feeder would be more profitable.
+
+          if (feederReg == cmpReg1) {
+            if (!canBeFeederToNewValueJump(QII, QRI, MII, jmpPos, cmpPos, MF)) {
+              if (!isSecondOpReg)
+                break;
+              else
+                continue;
+            } else
+              foundFeeder = true;
+          }
+
+          if (!foundFeeder &&
+              isSecondOpReg &&
+              feederReg == (unsigned) cmpOp2)
+            if (!canBeFeederToNewValueJump(QII, QRI, MII, jmpPos, cmpPos, MF))
+              break;
+
+          if (isSecondOpReg) {
+            // In the case of CMPLT, CMPLTU, or EQ with the second register
+            // to newify, swap the operands.
+            if (cmpInstr->getOpcode() == Hexagon::CMPLTrr ||
+                cmpInstr->getOpcode() == Hexagon::CMPLTUrr ||
+                (cmpInstr->getOpcode() == Hexagon::CMPEQrr &&
+                 feederReg == (unsigned) cmpOp2)) {
+              unsigned tmp = cmpReg1;
+              bool tmpIsKill = MO1IsKill;
+              cmpReg1 = cmpOp2;
+              MO1IsKill = MO2IsKill;
+              cmpOp2 = tmp;
+              MO2IsKill = tmpIsKill;
+            }
+
+            // Now that we have swapped the operands, all we need to check
+            // is whether the second operand (after the swap) is the
+            // feeder; if it is, make a note of it.
+            if (feederReg == (unsigned)cmpOp2)
+              isSecondOpNewified = true;
+          }
+
+          // Now that we are moving the feeder close to the jump, make sure
+          // we respect the kill values of the feeder's operands.
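+          // Illustrative example (hypothetical registers, not from the
+          // original patch):
+          //     r1 = memw(r0)        // feeder, uses r0
+          //     r2 = add(r0, #1)     // currently carries the kill of r0
+          //     p0 = cmp.eq(r1, #0)
+          //     if (p0.new) jump:t ...
+          // Once the feeder is spliced down next to the jump, the add comes
+          // first, so the kill of r0 must migrate from the add to the
+          // feeder's own use of r0. The loop below performs that transfer.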
+
+          bool updatedIsKill = false;
+          for (unsigned i = 0; i < MI->getNumOperands(); i++) {
+            MachineOperand &MO = MI->getOperand(i);
+            if (MO.isReg() && MO.isUse()) {
+              unsigned feederReg = MO.getReg();
+              for (MachineBasicBlock::iterator localII = feederPos,
+                   end = jmpPos; localII != end; localII++) {
+                MachineInstr *localMI = localII;
+                for (unsigned j = 0; j < localMI->getNumOperands(); j++) {
+                  MachineOperand &localMO = localMI->getOperand(j);
+                  if (localMO.isReg() && localMO.isUse() &&
+                      localMO.isKill() && feederReg == localMO.getReg()) {
+                    // A later use of one of the feeder's source registers
+                    // carries the kill. Clear that kill flag and set it on
+                    // the feeder's operand instead.
+                    localMO.setIsKill(false);
+                    MO.setIsKill();
+                    updatedIsKill = true;
+                    break;
+                  }
+                }
+                if (updatedIsKill) break;
+              }
+            }
+            if (updatedIsKill) break;
+          }
+
+          MBB->splice(jmpPos, MI->getParent(), MI);
+          MBB->splice(jmpPos, MI->getParent(), cmpInstr);
+          DebugLoc dl = MI->getDebugLoc();
+          MachineInstr *NewMI;
+
+          assert((QII->isNewValueJumpCandidate(cmpInstr)) &&
+                 "This compare is not a New Value Jump candidate.");
+          unsigned opc = getNewValueJumpOpcode(cmpInstr, cmpOp2,
+                                               isSecondOpNewified);
+          if (invertPredicate)
+            opc = QII->getInvertedPredicatedOpcode(opc);
+
+          // Manage the conversions from CMPGEUri to either CMPEQrr or
+          // CMPGTUri properly. See the Arch spec for CMPGEUri instructions.
+          // This has to come after the getNewValueJumpOpcode call, as the
+          // second operand of the compare could be modified by this logic.
+          if (cmpInstr->getOpcode() == Hexagon::CMPGEUri) {
+            if (cmpOp2 == 0) {
+              cmpOp2 = cmpReg1;
+              MO2IsKill = MO1IsKill;
+              isSecondOpReg = true;
+            } else
+              --cmpOp2;
+          }
+
+          // Manage the conversion from CMPGEri to CMPGTri properly.
+          // See the Arch spec for CMPGEri instructions.
+          if (cmpInstr->getOpcode() == Hexagon::CMPGEri)
+            --cmpOp2;
+
+          if (isSecondOpReg) {
+            NewMI = BuildMI(*MBB, jmpPos, dl,
+                            QII->get(opc))
+                      .addReg(cmpReg1, getKillRegState(MO1IsKill))
+                      .addReg(cmpOp2, getKillRegState(MO2IsKill))
+                      .addMBB(jmpTarget);
+          }
+          else {
+            NewMI = BuildMI(*MBB, jmpPos, dl,
+                            QII->get(opc))
+                      .addReg(cmpReg1, getKillRegState(MO1IsKill))
+                      .addImm(cmpOp2)
+                      .addMBB(jmpTarget);
+          }
+
+          assert(NewMI && "New Value Jump Instruction Not created!");
+          if (cmpInstr->getOperand(0).isReg() &&
+              cmpInstr->getOperand(0).isKill())
+            cmpInstr->getOperand(0).setIsKill(false);
+          if (cmpInstr->getOperand(1).isReg() &&
+              cmpInstr->getOperand(1).isKill())
+            cmpInstr->getOperand(1).setIsKill(false);
+          cmpInstr->eraseFromParent();
+          jmpInstr->eraseFromParent();
+          ++nvjGenerated;
+          ++NumNVJGenerated;
+          break;
+        }
+      }
+    }
+  }
+
+  return true;
+
+}
+
+FunctionPass *llvm::createHexagonNewValueJump() {
+  return new HexagonNewValueJump();
+}
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index 2a9de92..2c23674 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -63,6 +63,7 @@ const uint16_t* HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction
     return CalleeSavedRegsV2;
   case HexagonSubtarget::V3:
   case HexagonSubtarget::V4:
+  case HexagonSubtarget::V5:
     return CalleeSavedRegsV3;
   }
   llvm_unreachable("Callee saved registers requested for unknown architecture "
@@ -109,6 +110,7 @@ HexagonRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
     return CalleeSavedRegClassesV2;
   case HexagonSubtarget::V3:
   case HexagonSubtarget::V4:
+  case HexagonSubtarget::V5:
     return CalleeSavedRegClassesV3;
   }
   llvm_unreachable("Callee saved register classes
requested for unknown " @@ -179,13 +181,15 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // r0 = add(r30, #10000) // r0 = memw(r0) if ( (MI.getOpcode() == Hexagon::LDriw) || - (MI.getOpcode() == Hexagon::LDrid) || - (MI.getOpcode() == Hexagon::LDrih) || - (MI.getOpcode() == Hexagon::LDriuh) || - (MI.getOpcode() == Hexagon::LDrib) || - (MI.getOpcode() == Hexagon::LDriub) ) { + (MI.getOpcode() == Hexagon::LDrid) || + (MI.getOpcode() == Hexagon::LDrih) || + (MI.getOpcode() == Hexagon::LDriuh) || + (MI.getOpcode() == Hexagon::LDrib) || + (MI.getOpcode() == Hexagon::LDriub) || + (MI.getOpcode() == Hexagon::LDriw_f) || + (MI.getOpcode() == Hexagon::LDrid_f)) { unsigned dstReg = (MI.getOpcode() == Hexagon::LDrid) ? - *getSubRegisters(MI.getOperand(0).getReg()) : + getSubReg(MI.getOperand(0).getReg(), Hexagon::subreg_loreg) : MI.getOperand(0).getReg(); // Check if offset can fit in addi. @@ -203,10 +207,13 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI.getOperand(i).ChangeToRegister(dstReg, false, false, true); MI.getOperand(i+1).ChangeToImmediate(0); - } else if ((MI.getOpcode() == Hexagon::STriw) || + } else if ((MI.getOpcode() == Hexagon::STriw_indexed) || + (MI.getOpcode() == Hexagon::STriw) || (MI.getOpcode() == Hexagon::STrid) || (MI.getOpcode() == Hexagon::STrih) || - (MI.getOpcode() == Hexagon::STrib)) { + (MI.getOpcode() == Hexagon::STrib) || + (MI.getOpcode() == Hexagon::STrid_f) || + (MI.getOpcode() == Hexagon::STriw_f)) { // For stores, we need a reserved register. Change // memw(r30 + #10000) = r0 to: // diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.h b/lib/Target/Hexagon/HexagonRegisterInfo.h index 6cf727b..85355ae 100644 --- a/lib/Target/Hexagon/HexagonRegisterInfo.h +++ b/lib/Target/Hexagon/HexagonRegisterInfo.h @@ -73,6 +73,10 @@ struct HexagonRegisterInfo : public HexagonGenRegisterInfo { return true; } + bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const { + return true; + } + // Debug information queries. unsigned getRARegister() const; unsigned getFrameRegister(const MachineFunction &MF) const; diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.td b/lib/Target/Hexagon/HexagonRegisterInfo.td index d44eae3..fe41fc3 100644 --- a/lib/Target/Hexagon/HexagonRegisterInfo.td +++ b/lib/Target/Hexagon/HexagonRegisterInfo.td @@ -131,6 +131,9 @@ let Namespace = "Hexagon" in { def SA1 : Rc<2, "sa1">, DwarfRegNum<[69]>; def LC1 : Rc<3, "lc1">, DwarfRegNum<[70]>; + def M0 : Rc<6, "m0">, DwarfRegNum<[71]>; + def M1 : Rc<7, "m1">, DwarfRegNum<[72]>; + def PC : Rc<9, "pc">, DwarfRegNum<[32]>; // is the Dwarf number correct? def GP : Rc<11, "gp">, DwarfRegNum<[33]>; // is the Dwarf number correct? } @@ -140,19 +143,15 @@ let Namespace = "Hexagon" in { // FIXME: the register order should be defined in terms of the preferred // allocation order... 
// -def IntRegs : RegisterClass<"Hexagon", [i32], 32, +def IntRegs : RegisterClass<"Hexagon", [i32,f32], 32, (add (sequence "R%u", 0, 9), (sequence "R%u", 12, 28), R10, R11, R29, R30, R31)> { } - - -def DoubleRegs : RegisterClass<"Hexagon", [i64], 64, +def DoubleRegs : RegisterClass<"Hexagon", [i64,f64], 64, (add (sequence "D%u", 0, 4), - (sequence "D%u", 6, 13), D5, D14, D15)> { - let SubRegClasses = [(IntRegs subreg_loreg, subreg_hireg)]; -} + (sequence "D%u", 6, 13), D5, D14, D15)>; def PredRegs : RegisterClass<"Hexagon", [i1], 32, (add (sequence "P%u", 0, 3))> @@ -162,6 +161,7 @@ def PredRegs : RegisterClass<"Hexagon", [i1], 32, (add (sequence "P%u", 0, 3))> def CRRegs : RegisterClass<"Hexagon", [i32], 32, (add (sequence "LC%u", 0, 1), - (sequence "SA%u", 0, 1), PC, GP)> { + (sequence "SA%u", 0, 1), + (sequence "M%u", 0, 1), PC, GP)> { let Size = 32; } diff --git a/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp b/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp index 66a00e1..2468f0b 100644 --- a/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp +++ b/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp @@ -1,4 +1,4 @@ -//===- HexagonRemoveExtendArgs.cpp - Remove unecessary argument sign extends =// +//===- HexagonRemoveExtendArgs.cpp - Remove unnecessary argument sign extends // // // The LLVM Compiler Infrastructure // diff --git a/lib/Target/Hexagon/HexagonSchedule.td b/lib/Target/Hexagon/HexagonSchedule.td index fbea445..d1076b8 100644 --- a/lib/Target/Hexagon/HexagonSchedule.td +++ b/lib/Target/Hexagon/HexagonSchedule.td @@ -13,7 +13,6 @@ def LSUNIT : FuncUnit; def MUNIT : FuncUnit; def SUNIT : FuncUnit; - // Itinerary classes def ALU32 : InstrItinClass; def ALU64 : InstrItinClass; @@ -24,23 +23,31 @@ def LD : InstrItinClass; def M : InstrItinClass; def ST : InstrItinClass; def S : InstrItinClass; +def SYS : InstrItinClass; +def MARKER : InstrItinClass; def PSEUDO : InstrItinClass; - def HexagonItineraries : - ProcessorItineraries<[LUNIT, LSUNIT, MUNIT, SUNIT], [], [ - InstrItinData<ALU32 , [InstrStage<1, [LUNIT, LSUNIT, MUNIT, SUNIT]>]>, - InstrItinData<ALU64 , [InstrStage<1, [MUNIT, SUNIT]>]>, - InstrItinData<CR , [InstrStage<1, [SUNIT]>]>, - InstrItinData<J , [InstrStage<1, [SUNIT, MUNIT]>]>, - InstrItinData<JR , [InstrStage<1, [MUNIT]>]>, - InstrItinData<LD , [InstrStage<1, [LUNIT, LSUNIT]>]>, - InstrItinData<M , [InstrStage<1, [MUNIT, SUNIT]>]>, - InstrItinData<ST , [InstrStage<1, [LSUNIT]>]>, - InstrItinData<S , [InstrStage<1, [SUNIT, MUNIT]>]>, - InstrItinData<PSEUDO , [InstrStage<1, [LUNIT, LSUNIT, MUNIT, SUNIT]>]> -]>; - + ProcessorItineraries<[LUNIT, LSUNIT, MUNIT, SUNIT], [], [ + InstrItinData<ALU32 , [InstrStage<1, [LUNIT, LSUNIT, MUNIT, SUNIT]>]>, + InstrItinData<ALU64 , [InstrStage<1, [MUNIT, SUNIT]>]>, + InstrItinData<CR , [InstrStage<1, [SUNIT]>]>, + InstrItinData<J , [InstrStage<1, [SUNIT, MUNIT]>]>, + InstrItinData<JR , [InstrStage<1, [MUNIT]>]>, + InstrItinData<LD , [InstrStage<1, [LUNIT, LSUNIT]>]>, + InstrItinData<M , [InstrStage<1, [MUNIT, SUNIT]>]>, + InstrItinData<ST , [InstrStage<1, [LSUNIT]>]>, + InstrItinData<S , [InstrStage<1, [SUNIT, MUNIT]>]>, + InstrItinData<SYS , [InstrStage<1, [LSUNIT]>]>, + InstrItinData<MARKER , [InstrStage<1, [LUNIT, LSUNIT, MUNIT, SUNIT]>]>, + InstrItinData<PSEUDO , [InstrStage<1, [LUNIT, LSUNIT, MUNIT, SUNIT]>]> + ]>; + +def HexagonModel : SchedMachineModel { + // Max issue per cycle == bundle width. 
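+  // (Four matches the four functional units modeled above: a Hexagon
+  // packet issues at most one instruction per slot.)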
+ let IssueWidth = 4; + let Itineraries = HexagonItineraries; +} //===----------------------------------------------------------------------===// // V4 Machine Info + diff --git a/lib/Target/Hexagon/HexagonScheduleV4.td b/lib/Target/Hexagon/HexagonScheduleV4.td index 4cf66fe..9b41126 100644 --- a/lib/Target/Hexagon/HexagonScheduleV4.td +++ b/lib/Target/Hexagon/HexagonScheduleV4.td @@ -23,7 +23,6 @@ // | SLOT3 | XTYPE ALU32 J CR | // |===========|==================================================| - // Functional Units. def SLOT0 : FuncUnit; def SLOT1 : FuncUnit; @@ -34,22 +33,32 @@ def SLOT3 : FuncUnit; def NV_V4 : InstrItinClass; def MEM_V4 : InstrItinClass; // ALU64/M/S Instruction classes of V2 are collectively knownn as XTYPE in V4. +def PREFIX : InstrItinClass; + +def HexagonItinerariesV4 : + ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3], [], [ + InstrItinData<ALU32 , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<ALU64 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<CR , [InstrStage<1, [SLOT3]>]>, + InstrItinData<J , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<JR , [InstrStage<1, [SLOT2]>]>, + InstrItinData<LD , [InstrStage<1, [SLOT0, SLOT1]>]>, + InstrItinData<M , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<ST , [InstrStage<1, [SLOT0, SLOT1]>]>, + InstrItinData<S , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<SYS , [InstrStage<1, [SLOT0]>]>, + InstrItinData<NV_V4 , [InstrStage<1, [SLOT0]>]>, + InstrItinData<MEM_V4 , [InstrStage<1, [SLOT0]>]>, + InstrItinData<MARKER , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<PREFIX , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]> + ]>; -def HexagonItinerariesV4 : ProcessorItineraries< - [SLOT0, SLOT1, SLOT2, SLOT3], [], [ - InstrItinData<LD , [InstrStage<1, [SLOT0, SLOT1]>]>, - InstrItinData<ST , [InstrStage<1, [SLOT0, SLOT1]>]>, - InstrItinData<ALU32 , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, - InstrItinData<NV_V4 , [InstrStage<1, [SLOT0]>]>, - InstrItinData<MEM_V4 , [InstrStage<1, [SLOT0]>]>, - InstrItinData<J , [InstrStage<1, [SLOT2, SLOT3]>]>, - InstrItinData<JR , [InstrStage<1, [SLOT2]>]>, - InstrItinData<CR , [InstrStage<1, [SLOT3]>]>, - InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, - InstrItinData<ALU64 , [InstrStage<1, [SLOT2, SLOT3]>]>, - InstrItinData<M , [InstrStage<1, [SLOT2, SLOT3]>]>, - InstrItinData<S , [InstrStage<1, [SLOT2, SLOT3]>]> -]>; +def HexagonModelV4 : SchedMachineModel { + // Max issue per cycle == bundle width. + let IssueWidth = 4; + let Itineraries = HexagonItinerariesV4; +} //===----------------------------------------------------------------------===// // Hexagon V4 Resource Definitions - diff --git a/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp b/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp index d10c9f2..a81cd91 100644 --- a/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp +++ b/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp @@ -14,7 +14,7 @@ // {p0 = cmp.eq(r0,r1)} // {r3 = mux(p0,#1,#3)} // -// This requires two packets. If we use .new predicated immediate transfers, +// This requires two packets. 
If we use .new predicated immediate transfers,
 // then we can do this in a single packet, e.g.:
 //
 // {p0 = cmp.eq(r0,r1)
@@ -81,40 +81,126 @@ bool HexagonSplitTFRCondSets::runOnMachineFunction(MachineFunction &Fn) {
     for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end();
          ++MII) {
       MachineInstr *MI = MII;
-      int Opc = MI->getOpcode();
-      if (Opc == Hexagon::TFR_condset_rr) {
-
-        int DestReg = MI->getOperand(0).getReg();
-        int SrcReg1 = MI->getOperand(2).getReg();
-        int SrcReg2 = MI->getOperand(3).getReg();
-
-        // Minor optimization: do not emit the predicated copy if the source and
-        // the destination is the same register
-        if (DestReg != SrcReg1) {
-          BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_cPt),
-                  DestReg).addReg(MI->getOperand(1).getReg()).addReg(SrcReg1);
+      int Opc1, Opc2;
+      switch (MI->getOpcode()) {
+        case Hexagon::TFR_condset_rr:
+        case Hexagon::TFR_condset_rr_f:
+        case Hexagon::TFR_condset_rr64_f: {
+          int DestReg = MI->getOperand(0).getReg();
+          int SrcReg1 = MI->getOperand(2).getReg();
+          int SrcReg2 = MI->getOperand(3).getReg();
+
+          if (MI->getOpcode() == Hexagon::TFR_condset_rr ||
+              MI->getOpcode() == Hexagon::TFR_condset_rr_f) {
+            Opc1 = Hexagon::TFR_cPt;
+            Opc2 = Hexagon::TFR_cNotPt;
+          }
+          else if (MI->getOpcode() == Hexagon::TFR_condset_rr64_f) {
+            Opc1 = Hexagon::TFR64_cPt;
+            Opc2 = Hexagon::TFR64_cNotPt;
+          }
+
+          // Minor optimization: do not emit the predicated copy if the
+          // source and the destination are the same register.
+          if (DestReg != SrcReg1) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Opc1),
+                    DestReg).addReg(MI->getOperand(1).getReg()).addReg(SrcReg1);
+          }
+          if (DestReg != SrcReg2) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Opc2),
+                    DestReg).addReg(MI->getOperand(1).getReg()).addReg(SrcReg2);
+          }
+          MII = MBB->erase(MI);
+          --MII;
+          break;
         }
-        if (DestReg != SrcReg2) {
-          BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_cNotPt),
-                  DestReg).addReg(MI->getOperand(1).getReg()).addReg(SrcReg2);
+        case Hexagon::TFR_condset_ri:
+        case Hexagon::TFR_condset_ri_f: {
+          int DestReg = MI->getOperand(0).getReg();
+          int SrcReg1 = MI->getOperand(2).getReg();
+
+          // Do not emit the predicated copy if the source and the
+          // destination are the same register.
+          if (DestReg != SrcReg1) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::TFR_cPt), DestReg).
+              addReg(MI->getOperand(1).getReg()).addReg(SrcReg1);
+          }
+          if (MI->getOpcode() == Hexagon::TFR_condset_ri) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::TFRI_cNotPt), DestReg).
+              addReg(MI->getOperand(1).getReg()).
+              addImm(MI->getOperand(3).getImm());
+          } else if (MI->getOpcode() == Hexagon::TFR_condset_ri_f) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::TFRI_cNotPt_f), DestReg).
+              addReg(MI->getOperand(1).getReg()).
+              addFPImm(MI->getOperand(3).getFPImm());
+          }
+
+          MII = MBB->erase(MI);
+          --MII;
+          break;
+        }
+        case Hexagon::TFR_condset_ir:
+        case Hexagon::TFR_condset_ir_f: {
+          int DestReg = MI->getOperand(0).getReg();
+          int SrcReg2 = MI->getOperand(3).getReg();
+
+          if (MI->getOpcode() == Hexagon::TFR_condset_ir) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::TFRI_cPt), DestReg).
+              addReg(MI->getOperand(1).getReg()).
+              addImm(MI->getOperand(2).getImm());
+          } else if (MI->getOpcode() == Hexagon::TFR_condset_ir_f) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::TFRI_cPt_f), DestReg).
+              addReg(MI->getOperand(1).getReg()).
+              addFPImm(MI->getOperand(2).getFPImm());
+          }
+
+          // Do not emit the predicated copy if the source and the
+          // destination are the same register.
+          if (DestReg != SrcReg2) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::TFR_cNotPt), DestReg).
+              addReg(MI->getOperand(1).getReg()).addReg(SrcReg2);
+          }
+          MII = MBB->erase(MI);
+          --MII;
+          break;
+        }
+        case Hexagon::TFR_condset_ii:
+        case Hexagon::TFR_condset_ii_f: {
+          int DestReg = MI->getOperand(0).getReg();
+          int SrcReg1 = MI->getOperand(1).getReg();
+
+          if (MI->getOpcode() == Hexagon::TFR_condset_ii) {
+            int Immed1 = MI->getOperand(2).getImm();
+            int Immed2 = MI->getOperand(3).getImm();
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::TFRI_cPt),
+                    DestReg).addReg(SrcReg1).addImm(Immed1);
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::TFRI_cNotPt),
+                    DestReg).addReg(SrcReg1).addImm(Immed2);
+          } else if (MI->getOpcode() == Hexagon::TFR_condset_ii_f) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::TFRI_cPt_f), DestReg).
+              addReg(SrcReg1).
+              addFPImm(MI->getOperand(2).getFPImm());
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::TFRI_cNotPt_f), DestReg).
+              addReg(SrcReg1).
+              addFPImm(MI->getOperand(3).getFPImm());
+          }
+          MII = MBB->erase(MI);
+          --MII;
+          break;
        }
-        MII = MBB->erase(MI);
-        --MII;
-      } else if (Opc == Hexagon::TFR_condset_ii) {
-        int DestReg = MI->getOperand(0).getReg();
-        int SrcReg1 = MI->getOperand(1).getReg();
-        int Immed1 = MI->getOperand(2).getImm();
-        int Immed2 = MI->getOperand(3).getImm();
-        BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFRI_cPt),
-                DestReg).addReg(SrcReg1).addImm(Immed1);
-        BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFRI_cNotPt),
-                DestReg).addReg(SrcReg1).addImm(Immed2);
-        MII = MBB->erase(MI);
-        --MII;
-      }
     }
   }
-
   return true;
 }
diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp
index 654d336..5d087db 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -13,6 +13,7 @@
 
 #include "HexagonSubtarget.h"
 #include "Hexagon.h"
+#include "HexagonRegisterInfo.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 using namespace llvm;
@@ -29,11 +30,17 @@ static cl::opt<bool> EnableMemOps(
   "enable-hexagon-memops",
   cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed,
-  cl::desc("Generate V4 MEMOP in code generation for Hexagon target"));
+  cl::desc("Generate V4 memop instructions."));
+
+static cl::opt<bool>
+EnableIEEERndNear(
+  "enable-hexagon-ieee-rnd-near",
+  cl::Hidden, cl::ZeroOrMore, cl::init(false),
+  cl::desc("Generate non-chopped conversion from fp to int."));
 
 HexagonSubtarget::HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS):
   HexagonGenSubtargetInfo(TT, CPU, FS),
-  HexagonArchVersion(V1),
+  HexagonArchVersion(V2),
   CPUString(CPU.str()) {
   ParseSubtargetFeatures(CPU, FS);
 
@@ -45,18 +52,27 @@ HexagonSubtarget::HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS):
     break;
   case HexagonSubtarget::V4:
     break;
+  case HexagonSubtarget::V5:
+    break;
   default:
-    llvm_unreachable("Unknown Architecture Version.");
+    // If the programmer has not specified a Hexagon version, default
+    // to -mv4.
+    CPUString = "hexagonv4";
+    HexagonArchVersion = HexagonSubtarget::V4;
+    break;
   }
 
   // Initialize scheduling itinerary for the specified CPU.
   InstrItins = getInstrItineraryForCPU(CPUString);
 
-  // Max issue per cycle == bundle width.
- InstrItins.IssueWidth = 4; - if (EnableMemOps) UseMemOps = true; else UseMemOps = false; + + if (EnableIEEERndNear) + ModeIEEERndNear = true; + else + ModeIEEERndNear = false; } + diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h index 3079086..5d9d6d8 100644 --- a/lib/Target/Hexagon/HexagonSubtarget.h +++ b/lib/Target/Hexagon/HexagonSubtarget.h @@ -22,16 +22,18 @@ #include "HexagonGenSubtargetInfo.inc" #define Hexagon_SMALL_DATA_THRESHOLD 8 +#define Hexagon_SLOTS 4 namespace llvm { class HexagonSubtarget : public HexagonGenSubtargetInfo { bool UseMemOps; + bool ModeIEEERndNear; public: enum HexagonArchEnum { - V1, V2, V3, V4 + V1, V2, V3, V4, V5 }; HexagonArchEnum HexagonArchVersion; @@ -55,7 +57,11 @@ public: bool hasV3TOps () const { return HexagonArchVersion >= V3; } bool hasV3TOpsOnly () const { return HexagonArchVersion == V3; } bool hasV4TOps () const { return HexagonArchVersion >= V4; } + bool hasV4TOpsOnly () const { return HexagonArchVersion == V4; } bool useMemOps () const { return HexagonArchVersion >= V4 && UseMemOps; } + bool hasV5TOps () const { return HexagonArchVersion >= V5; } + bool hasV5TOpsOnly () const { return HexagonArchVersion == V5; } + bool modeIEEERndNear () const { return ModeIEEERndNear; } bool isSubtargetV2() const { return HexagonArchVersion == V2;} const std::string &getCPUString () const { return CPUString; } diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp index b9e6894..a7b291f 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -55,7 +55,9 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, StringRef TT, CodeModel::Model CM, CodeGenOpt::Level OL) : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - DataLayout("e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-a0:0") , + DataLayout("e-p:32:32:32-" + "i64:64:64-i32:32:32-i16:16:16-i1:32:32-" + "f64:64:64-f32:32:32-a0:0-n32") , Subtarget(TT, CPU, FS), InstrInfo(Subtarget), TLInfo(*this), TSInfo(*this), FrameLowering(Subtarget), @@ -100,43 +102,47 @@ TargetPassConfig *HexagonTargetMachine::createPassConfig(PassManagerBase &PM) { } bool HexagonPassConfig::addInstSelector() { - PM.add(createHexagonRemoveExtendOps(getHexagonTargetMachine())); - PM.add(createHexagonISelDag(getHexagonTargetMachine())); - PM.add(createHexagonPeephole()); + addPass(createHexagonRemoveExtendOps(getHexagonTargetMachine())); + addPass(createHexagonISelDag(getHexagonTargetMachine())); + addPass(createHexagonPeephole()); return false; } bool HexagonPassConfig::addPreRegAlloc() { if (!DisableHardwareLoops) { - PM.add(createHexagonHardwareLoops()); + addPass(createHexagonHardwareLoops()); } - return false; } bool HexagonPassConfig::addPostRegAlloc() { - PM.add(createHexagonCFGOptimizer(getHexagonTargetMachine())); + addPass(createHexagonCFGOptimizer(getHexagonTargetMachine())); return true; } bool HexagonPassConfig::addPreSched2() { - addPass(IfConverterID); + addPass(&IfConverterID); return true; } bool HexagonPassConfig::addPreEmitPass() { if (!DisableHardwareLoops) { - PM.add(createHexagonFixupHwLoops()); + addPass(createHexagonFixupHwLoops()); } + addPass(createHexagonNewValueJump()); + // Expand Spill code for predicate registers. - PM.add(createHexagonExpandPredSpillCode(getHexagonTargetMachine())); + addPass(createHexagonExpandPredSpillCode(getHexagonTargetMachine())); // Split up TFRcondsets into conditional transfers. 
-  PM.add(createHexagonSplitTFRCondSets(getHexagonTargetMachine()));
+  addPass(createHexagonSplitTFRCondSets(getHexagonTargetMachine()));
+
+  // Create Packets.
+  addPass(createHexagonPacketizer());
 
   return false;
 }
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
new file mode 100644
index 0000000..a03ed03
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -0,0 +1,3646 @@
+//===----- HexagonPacketizer.cpp - VLIW packetizer ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a simple VLIW packetizer using a DFA. The packetizer works
+// on machine basic blocks. For each instruction I in a BB, the packetizer
+// consults the DFA to see if machine resources are available to execute I.
+// If so, the packetizer checks whether I depends on any instruction J in the
+// current packet. If no dependency is found, I is added to the current
+// packet and the machine resources are marked as taken. If any dependency is
+// found, a target API call is made to prune the dependence.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "packets"
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/LatencyPriorityQueue.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonMachineFunctionInfo.h"
+
+#include <map>
+
+using namespace llvm;
+
+namespace {
+  class HexagonPacketizer : public MachineFunctionPass {
+
+  public:
+    static char ID;
+    HexagonPacketizer() : MachineFunctionPass(ID) {}
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<MachineDominatorTree>();
+      AU.addPreserved<MachineDominatorTree>();
+      AU.addRequired<MachineLoopInfo>();
+      AU.addPreserved<MachineLoopInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    const char *getPassName() const {
+      return "Hexagon Packetizer";
+    }
+
+    bool runOnMachineFunction(MachineFunction &Fn);
+  };
+  char HexagonPacketizer::ID = 0;
+
+  class HexagonPacketizerList : public VLIWPacketizerList {
+
+  private:
+
+    // Has the instruction been promoted to a dot-new instruction?
+    bool PromotedToDotNew;
+
+    // Has the instruction been glued to an allocframe?
+    bool GlueAllocframeStore;
+
+    // Has the feeder instruction been glued to a new value jump?
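+    // (The "feeder" is the instruction that defines the register a new
+    // value jump consumes; see HexagonNewValueJump.cpp earlier in this
+    // patch.)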
+    bool GlueToNewValueJump;
+
+    // Check if there is a dependence between some instruction already in
+    // this packet and this instruction.
+    bool Dependence;
+
+    // Only check for dependence if there are resources available to
+    // schedule this instruction.
+    bool FoundSequentialDependence;
+
+  public:
+    // Ctor.
+    HexagonPacketizerList(MachineFunction &MF, MachineLoopInfo &MLI,
+                          MachineDominatorTree &MDT);
+
+    // initPacketizerState - initialize some internal flags.
+    void initPacketizerState();
+
+    // ignorePseudoInstruction - Ignore bundling of pseudo instructions.
+    bool ignorePseudoInstruction(MachineInstr *MI, MachineBasicBlock *MBB);
+
+    // isSoloInstruction - return true if instruction MI cannot be packetized
+    // with any other instruction, which means that MI itself is a packet.
+    bool isSoloInstruction(MachineInstr *MI);
+
+    // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ
+    // together?
+    bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ);
+
+    // isLegalToPruneDependencies - Is it legal to prune the dependence
+    // between SUI and SUJ?
+    bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ);
+
+    MachineBasicBlock::iterator addToPacket(MachineInstr *MI);
+  private:
+    bool IsCallDependent(MachineInstr* MI, SDep::Kind DepType, unsigned DepReg);
+    bool PromoteToDotNew(MachineInstr* MI, SDep::Kind DepType,
+                         MachineBasicBlock::iterator &MII,
+                         const TargetRegisterClass* RC);
+    bool CanPromoteToDotNew(MachineInstr* MI, SUnit* PacketSU,
+                            unsigned DepReg,
+                            std::map <MachineInstr*, SUnit*> MIToSUnit,
+                            MachineBasicBlock::iterator &MII,
+                            const TargetRegisterClass* RC);
+    bool CanPromoteToNewValue(MachineInstr* MI, SUnit* PacketSU,
+                              unsigned DepReg,
+                              std::map <MachineInstr*, SUnit*> MIToSUnit,
+                              MachineBasicBlock::iterator &MII);
+    bool CanPromoteToNewValueStore(MachineInstr* MI, MachineInstr* PacketMI,
+                                   unsigned DepReg,
+                                   std::map <MachineInstr*, SUnit*> MIToSUnit);
+    bool DemoteToDotOld(MachineInstr* MI);
+    bool ArePredicatesComplements(MachineInstr* MI1, MachineInstr* MI2,
+                                  std::map <MachineInstr*, SUnit*> MIToSUnit);
+    bool RestrictingDepExistInPacket(MachineInstr*,
+                                     unsigned, std::map <MachineInstr*, SUnit*>);
+    bool isNewifiable(MachineInstr* MI);
+    bool isCondInst(MachineInstr* MI);
+    bool IsNewifyStore(MachineInstr* MI);
+    bool tryAllocateResourcesForConstExt(MachineInstr* MI);
+    bool canReserveResourcesForConstExt(MachineInstr *MI);
+    void reserveResourcesForConstExt(MachineInstr* MI);
+    bool isNewValueInst(MachineInstr* MI);
+    bool isDotNewInst(MachineInstr* MI);
+  };
+}
+
+// HexagonPacketizerList Ctor.
+HexagonPacketizerList::HexagonPacketizerList(
+    MachineFunction &MF, MachineLoopInfo &MLI, MachineDominatorTree &MDT)
+  : VLIWPacketizerList(MF, MLI, MDT, true) {
+}
+
+bool HexagonPacketizer::runOnMachineFunction(MachineFunction &Fn) {
+  const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo();
+  MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+  MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
+
+  // Instantiate the packetizer.
+  HexagonPacketizerList Packetizer(Fn, MLI, MDT);
+
+  // The DFA state table should not be empty.
+  assert(Packetizer.getResourceTracker() && "Empty DFA table!");
+
+  //
+  // Loop over all basic blocks and remove KILL pseudo-instructions.
+  // These instructions confuse the dependence analysis. Consider:
+  //   D0 = ...   (Insn 0)
+  //   R0 = KILL R0, D0 (Insn 1)
+  //   R0 = ... (Insn 2)
+  // Here, Insn 1 will result in the dependence graph not emitting an output
+  // dependence between Insn 0 and Insn 2. This can lead to incorrect
+  // packetization.
+  //
+  for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
+       MBB != MBBe; ++MBB) {
+    MachineBasicBlock::iterator End = MBB->end();
+    MachineBasicBlock::iterator MI = MBB->begin();
+    while (MI != End) {
+      if (MI->isKill()) {
+        MachineBasicBlock::iterator DeleteMI = MI;
+        ++MI;
+        MBB->erase(DeleteMI);
+        End = MBB->end();
+        continue;
+      }
+      ++MI;
+    }
+  }
+
+  // Loop over all of the basic blocks.
+  for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
+       MBB != MBBe; ++MBB) {
+    // Find scheduling regions and schedule / packetize each region.
+    unsigned RemainingCount = MBB->size();
+    for (MachineBasicBlock::iterator RegionEnd = MBB->end();
+         RegionEnd != MBB->begin();) {
+      // The next region starts above the previous region. Look backward in
+      // the instruction stream until we find the nearest boundary.
+      MachineBasicBlock::iterator I = RegionEnd;
+      for (; I != MBB->begin(); --I, --RemainingCount) {
+        if (TII->isSchedulingBoundary(llvm::prior(I), MBB, Fn))
+          break;
+      }
+      I = MBB->begin();
+
+      // Skip empty scheduling regions.
+      if (I == RegionEnd) {
+        RegionEnd = llvm::prior(RegionEnd);
+        --RemainingCount;
+        continue;
+      }
+      // Skip regions with one instruction.
+      if (I == llvm::prior(RegionEnd)) {
+        RegionEnd = llvm::prior(RegionEnd);
+        continue;
+      }
+
+      Packetizer.PacketizeMIs(MBB, I, RegionEnd);
+      RegionEnd = I;
+    }
+  }
+
+  return true;
+}
+
+
+static bool IsIndirectCall(MachineInstr* MI) {
+  return ((MI->getOpcode() == Hexagon::CALLR) ||
+          (MI->getOpcode() == Hexagon::CALLRv3));
+}
+
+// Reserve resources for a constant extender. Trigger an assertion if the
+// reservation fails.
+void HexagonPacketizerList::reserveResourcesForConstExt(MachineInstr* MI) {
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  MachineInstr *PseudoMI = MI->getParent()->getParent()->CreateMachineInstr(
+                                 QII->get(Hexagon::IMMEXT), MI->getDebugLoc());
+
+  if (ResourceTracker->canReserveResources(PseudoMI)) {
+    ResourceTracker->reserveResources(PseudoMI);
+    MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI);
+  } else {
+    MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI);
+    llvm_unreachable("can not reserve resources for constant extender.");
+  }
+  return;
+}
+
+bool HexagonPacketizerList::canReserveResourcesForConstExt(MachineInstr *MI) {
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  assert(QII->isExtended(MI) &&
+         "Should only be called for constant extended instructions");
+  MachineFunction *MF = MI->getParent()->getParent();
+  MachineInstr *PseudoMI = MF->CreateMachineInstr(QII->get(Hexagon::IMMEXT),
+                                                  MI->getDebugLoc());
+  bool CanReserve = ResourceTracker->canReserveResources(PseudoMI);
+  MF->DeleteMachineInstr(PseudoMI);
+  return CanReserve;
+}
+
+// Allocate resources (i.e. 4 bytes) for a constant extender. Return true if
+// the allocation succeeds; otherwise return false.
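+// (Unlike reserveResourcesForConstExt above, a failure here is reported to
+// the caller rather than aborting compilation.)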
+bool HexagonPacketizerList::tryAllocateResourcesForConstExt(MachineInstr* MI) { + const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; + MachineInstr *PseudoMI = MI->getParent()->getParent()->CreateMachineInstr( + QII->get(Hexagon::IMMEXT), MI->getDebugLoc()); + + if (ResourceTracker->canReserveResources(PseudoMI)) { + ResourceTracker->reserveResources(PseudoMI); + MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI); + return true; + } else { + MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI); + return false; + } +} + + +bool HexagonPacketizerList::IsCallDependent(MachineInstr* MI, + SDep::Kind DepType, + unsigned DepReg) { + + const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; + const HexagonRegisterInfo* QRI = + (const HexagonRegisterInfo *) TM.getRegisterInfo(); + + // Check for lr dependence + if (DepReg == QRI->getRARegister()) { + return true; + } + + if (QII->isDeallocRet(MI)) { + if (DepReg == QRI->getFrameRegister() || + DepReg == QRI->getStackRegister()) + return true; + } + + // Check if this is a predicate dependence + const TargetRegisterClass* RC = QRI->getMinimalPhysRegClass(DepReg); + if (RC == &Hexagon::PredRegsRegClass) { + return true; + } + + // + // Lastly check for an operand used in an indirect call + // If we had an attribute for checking if an instruction is an indirect call, + // then we could have avoided this relatively brittle implementation of + // IsIndirectCall() + // + // Assumes that the first operand of the CALLr is the function address + // + if (IsIndirectCall(MI) && (DepType == SDep::Data)) { + MachineOperand MO = MI->getOperand(0); + if (MO.isReg() && MO.isUse() && (MO.getReg() == DepReg)) { + return true; + } + } + + return false; +} + +static bool IsRegDependence(const SDep::Kind DepType) { + return (DepType == SDep::Data || DepType == SDep::Anti || + DepType == SDep::Output); +} + +static bool IsDirectJump(MachineInstr* MI) { + return (MI->getOpcode() == Hexagon::JMP); +} + +static bool IsSchedBarrier(MachineInstr* MI) { + switch (MI->getOpcode()) { + case Hexagon::BARRIER: + return true; + } + return false; +} + +static bool IsControlFlow(MachineInstr* MI) { + return (MI->getDesc().isTerminator() || MI->getDesc().isCall()); +} + +bool HexagonPacketizerList::isNewValueInst(MachineInstr* MI) { + const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; + if (QII->isNewValueJump(MI)) + return true; + + if (QII->isNewValueStore(MI)) + return true; + + return false; +} + +// Function returns true if an instruction can be promoted to the new-value +// store. It will always return false for v2 and v3. +// It lists all the conditional and unconditional stores that can be promoted +// to the new-value stores. 
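+// (Note that every opcode listed below ultimately returns
+// Subtarget.hasV4TOps(), which is what makes this always false for V2/V3.)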
+ +bool HexagonPacketizerList::IsNewifyStore (MachineInstr* MI) { + const HexagonRegisterInfo* QRI = + (const HexagonRegisterInfo *) TM.getRegisterInfo(); + switch (MI->getOpcode()) + { + // store byte + case Hexagon::STrib: + case Hexagon::STrib_indexed: + case Hexagon::STrib_indexed_shl_V4: + case Hexagon::STrib_shl_V4: + case Hexagon::STrib_GP_V4: + case Hexagon::STb_GP_V4: + case Hexagon::POST_STbri: + case Hexagon::STrib_cPt: + case Hexagon::STrib_cdnPt_V4: + case Hexagon::STrib_cNotPt: + case Hexagon::STrib_cdnNotPt_V4: + case Hexagon::STrib_indexed_cPt: + case Hexagon::STrib_indexed_cdnPt_V4: + case Hexagon::STrib_indexed_cNotPt: + case Hexagon::STrib_indexed_cdnNotPt_V4: + case Hexagon::STrib_indexed_shl_cPt_V4: + case Hexagon::STrib_indexed_shl_cdnPt_V4: + case Hexagon::STrib_indexed_shl_cNotPt_V4: + case Hexagon::STrib_indexed_shl_cdnNotPt_V4: + case Hexagon::POST_STbri_cPt: + case Hexagon::POST_STbri_cdnPt_V4: + case Hexagon::POST_STbri_cNotPt: + case Hexagon::POST_STbri_cdnNotPt_V4: + case Hexagon::STb_GP_cPt_V4: + case Hexagon::STb_GP_cNotPt_V4: + case Hexagon::STb_GP_cdnPt_V4: + case Hexagon::STb_GP_cdnNotPt_V4: + case Hexagon::STrib_GP_cPt_V4: + case Hexagon::STrib_GP_cNotPt_V4: + case Hexagon::STrib_GP_cdnPt_V4: + case Hexagon::STrib_GP_cdnNotPt_V4: + + // store halfword + case Hexagon::STrih: + case Hexagon::STrih_indexed: + case Hexagon::STrih_indexed_shl_V4: + case Hexagon::STrih_shl_V4: + case Hexagon::STrih_GP_V4: + case Hexagon::STh_GP_V4: + case Hexagon::POST_SThri: + case Hexagon::STrih_cPt: + case Hexagon::STrih_cdnPt_V4: + case Hexagon::STrih_cNotPt: + case Hexagon::STrih_cdnNotPt_V4: + case Hexagon::STrih_indexed_cPt: + case Hexagon::STrih_indexed_cdnPt_V4: + case Hexagon::STrih_indexed_cNotPt: + case Hexagon::STrih_indexed_cdnNotPt_V4: + case Hexagon::STrih_indexed_shl_cPt_V4: + case Hexagon::STrih_indexed_shl_cdnPt_V4: + case Hexagon::STrih_indexed_shl_cNotPt_V4: + case Hexagon::STrih_indexed_shl_cdnNotPt_V4: + case Hexagon::POST_SThri_cPt: + case Hexagon::POST_SThri_cdnPt_V4: + case Hexagon::POST_SThri_cNotPt: + case Hexagon::POST_SThri_cdnNotPt_V4: + case Hexagon::STh_GP_cPt_V4: + case Hexagon::STh_GP_cNotPt_V4: + case Hexagon::STh_GP_cdnPt_V4: + case Hexagon::STh_GP_cdnNotPt_V4: + case Hexagon::STrih_GP_cPt_V4: + case Hexagon::STrih_GP_cNotPt_V4: + case Hexagon::STrih_GP_cdnPt_V4: + case Hexagon::STrih_GP_cdnNotPt_V4: + + // store word + case Hexagon::STriw: + case Hexagon::STriw_indexed: + case Hexagon::STriw_indexed_shl_V4: + case Hexagon::STriw_shl_V4: + case Hexagon::STriw_GP_V4: + case Hexagon::STw_GP_V4: + case Hexagon::POST_STwri: + case Hexagon::STriw_cPt: + case Hexagon::STriw_cdnPt_V4: + case Hexagon::STriw_cNotPt: + case Hexagon::STriw_cdnNotPt_V4: + case Hexagon::STriw_indexed_cPt: + case Hexagon::STriw_indexed_cdnPt_V4: + case Hexagon::STriw_indexed_cNotPt: + case Hexagon::STriw_indexed_cdnNotPt_V4: + case Hexagon::STriw_indexed_shl_cPt_V4: + case Hexagon::STriw_indexed_shl_cdnPt_V4: + case Hexagon::STriw_indexed_shl_cNotPt_V4: + case Hexagon::STriw_indexed_shl_cdnNotPt_V4: + case Hexagon::POST_STwri_cPt: + case Hexagon::POST_STwri_cdnPt_V4: + case Hexagon::POST_STwri_cNotPt: + case Hexagon::POST_STwri_cdnNotPt_V4: + case Hexagon::STw_GP_cPt_V4: + case Hexagon::STw_GP_cNotPt_V4: + case Hexagon::STw_GP_cdnPt_V4: + case Hexagon::STw_GP_cdnNotPt_V4: + case Hexagon::STriw_GP_cPt_V4: + case Hexagon::STriw_GP_cNotPt_V4: + case Hexagon::STriw_GP_cdnPt_V4: + case Hexagon::STriw_GP_cdnNotPt_V4: + return QRI->Subtarget.hasV4TOps(); + } + return false; +} 
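+// Illustrative sketch of how the pieces above are meant to combine (an
+// editor's assumption, not code from this patch): once IsNewifyStore accepts
+// a store and the feeder is known to land in the same packet, the promotion
+// can be as simple as swapping in the .new opcode:
+//
+//   if (IsNewifyStore(MI)) {
+//     int NVOpcode = GetDotNewOp(MI->getOpcode()); // e.g. STriw -> STriw_nv_V4
+//     MI->setDesc(QII->get(NVOpcode));             // operands stay in place
+//   }
+//
+// See PromoteToDotNew and CanPromoteToNewValueStore for the checks that make
+// such a rewrite safe.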
+ +static bool IsLoopN(MachineInstr *MI) { + return (MI->getOpcode() == Hexagon::LOOP0_i || + MI->getOpcode() == Hexagon::LOOP0_r); +} + +/// DoesModifyCalleeSavedReg - Returns true if the instruction modifies a +/// callee-saved register. +static bool DoesModifyCalleeSavedReg(MachineInstr *MI, + const TargetRegisterInfo *TRI) { + for (const uint16_t *CSR = TRI->getCalleeSavedRegs(); *CSR; ++CSR) { + unsigned CalleeSavedReg = *CSR; + if (MI->modifiesRegister(CalleeSavedReg, TRI)) + return true; + } + return false; +} + +// Return the new value instruction for a given store. +static int GetDotNewOp(const int opc) { + switch (opc) { + default: llvm_unreachable("Unknown .new type"); + // store new value byte + case Hexagon::STrib: + return Hexagon::STrib_nv_V4; + + case Hexagon::STrib_indexed: + return Hexagon::STrib_indexed_nv_V4; + + case Hexagon::STrib_indexed_shl_V4: + return Hexagon::STrib_indexed_shl_nv_V4; + + case Hexagon::STrib_shl_V4: + return Hexagon::STrib_shl_nv_V4; + + case Hexagon::STrib_GP_V4: + return Hexagon::STrib_GP_nv_V4; + + case Hexagon::STb_GP_V4: + return Hexagon::STb_GP_nv_V4; + + case Hexagon::POST_STbri: + return Hexagon::POST_STbri_nv_V4; + + case Hexagon::STrib_cPt: + return Hexagon::STrib_cPt_nv_V4; + + case Hexagon::STrib_cdnPt_V4: + return Hexagon::STrib_cdnPt_nv_V4; + + case Hexagon::STrib_cNotPt: + return Hexagon::STrib_cNotPt_nv_V4; + + case Hexagon::STrib_cdnNotPt_V4: + return Hexagon::STrib_cdnNotPt_nv_V4; + + case Hexagon::STrib_indexed_cPt: + return Hexagon::STrib_indexed_cPt_nv_V4; + + case Hexagon::STrib_indexed_cdnPt_V4: + return Hexagon::STrib_indexed_cdnPt_nv_V4; + + case Hexagon::STrib_indexed_cNotPt: + return Hexagon::STrib_indexed_cNotPt_nv_V4; + + case Hexagon::STrib_indexed_cdnNotPt_V4: + return Hexagon::STrib_indexed_cdnNotPt_nv_V4; + + case Hexagon::STrib_indexed_shl_cPt_V4: + return Hexagon::STrib_indexed_shl_cPt_nv_V4; + + case Hexagon::STrib_indexed_shl_cdnPt_V4: + return Hexagon::STrib_indexed_shl_cdnPt_nv_V4; + + case Hexagon::STrib_indexed_shl_cNotPt_V4: + return Hexagon::STrib_indexed_shl_cNotPt_nv_V4; + + case Hexagon::STrib_indexed_shl_cdnNotPt_V4: + return Hexagon::STrib_indexed_shl_cdnNotPt_nv_V4; + + case Hexagon::POST_STbri_cPt: + return Hexagon::POST_STbri_cPt_nv_V4; + + case Hexagon::POST_STbri_cdnPt_V4: + return Hexagon::POST_STbri_cdnPt_nv_V4; + + case Hexagon::POST_STbri_cNotPt: + return Hexagon::POST_STbri_cNotPt_nv_V4; + + case Hexagon::POST_STbri_cdnNotPt_V4: + return Hexagon::POST_STbri_cdnNotPt_nv_V4; + + case Hexagon::STb_GP_cPt_V4: + return Hexagon::STb_GP_cPt_nv_V4; + + case Hexagon::STb_GP_cNotPt_V4: + return Hexagon::STb_GP_cNotPt_nv_V4; + + case Hexagon::STb_GP_cdnPt_V4: + return Hexagon::STb_GP_cdnPt_nv_V4; + + case Hexagon::STb_GP_cdnNotPt_V4: + return Hexagon::STb_GP_cdnNotPt_nv_V4; + + case Hexagon::STrib_GP_cPt_V4: + return Hexagon::STrib_GP_cPt_nv_V4; + + case Hexagon::STrib_GP_cNotPt_V4: + return Hexagon::STrib_GP_cNotPt_nv_V4; + + case Hexagon::STrib_GP_cdnPt_V4: + return Hexagon::STrib_GP_cdnPt_nv_V4; + + case Hexagon::STrib_GP_cdnNotPt_V4: + return Hexagon::STrib_GP_cdnNotPt_nv_V4; + + // store new value halfword + case Hexagon::STrih: + return Hexagon::STrih_nv_V4; + + case Hexagon::STrih_indexed: + return Hexagon::STrih_indexed_nv_V4; + + case Hexagon::STrih_indexed_shl_V4: + return Hexagon::STrih_indexed_shl_nv_V4; + + case Hexagon::STrih_shl_V4: + return Hexagon::STrih_shl_nv_V4; + + case Hexagon::STrih_GP_V4: + return Hexagon::STrih_GP_nv_V4; + + case Hexagon::STh_GP_V4: + return 
Hexagon::STh_GP_nv_V4; + + case Hexagon::POST_SThri: + return Hexagon::POST_SThri_nv_V4; + + case Hexagon::STrih_cPt: + return Hexagon::STrih_cPt_nv_V4; + + case Hexagon::STrih_cdnPt_V4: + return Hexagon::STrih_cdnPt_nv_V4; + + case Hexagon::STrih_cNotPt: + return Hexagon::STrih_cNotPt_nv_V4; + + case Hexagon::STrih_cdnNotPt_V4: + return Hexagon::STrih_cdnNotPt_nv_V4; + + case Hexagon::STrih_indexed_cPt: + return Hexagon::STrih_indexed_cPt_nv_V4; + + case Hexagon::STrih_indexed_cdnPt_V4: + return Hexagon::STrih_indexed_cdnPt_nv_V4; + + case Hexagon::STrih_indexed_cNotPt: + return Hexagon::STrih_indexed_cNotPt_nv_V4; + + case Hexagon::STrih_indexed_cdnNotPt_V4: + return Hexagon::STrih_indexed_cdnNotPt_nv_V4; + + case Hexagon::STrih_indexed_shl_cPt_V4: + return Hexagon::STrih_indexed_shl_cPt_nv_V4; + + case Hexagon::STrih_indexed_shl_cdnPt_V4: + return Hexagon::STrih_indexed_shl_cdnPt_nv_V4; + + case Hexagon::STrih_indexed_shl_cNotPt_V4: + return Hexagon::STrih_indexed_shl_cNotPt_nv_V4; + + case Hexagon::STrih_indexed_shl_cdnNotPt_V4: + return Hexagon::STrih_indexed_shl_cdnNotPt_nv_V4; + + case Hexagon::POST_SThri_cPt: + return Hexagon::POST_SThri_cPt_nv_V4; + + case Hexagon::POST_SThri_cdnPt_V4: + return Hexagon::POST_SThri_cdnPt_nv_V4; + + case Hexagon::POST_SThri_cNotPt: + return Hexagon::POST_SThri_cNotPt_nv_V4; + + case Hexagon::POST_SThri_cdnNotPt_V4: + return Hexagon::POST_SThri_cdnNotPt_nv_V4; + + case Hexagon::STh_GP_cPt_V4: + return Hexagon::STh_GP_cPt_nv_V4; + + case Hexagon::STh_GP_cNotPt_V4: + return Hexagon::STh_GP_cNotPt_nv_V4; + + case Hexagon::STh_GP_cdnPt_V4: + return Hexagon::STh_GP_cdnPt_nv_V4; + + case Hexagon::STh_GP_cdnNotPt_V4: + return Hexagon::STh_GP_cdnNotPt_nv_V4; + + case Hexagon::STrih_GP_cPt_V4: + return Hexagon::STrih_GP_cPt_nv_V4; + + case Hexagon::STrih_GP_cNotPt_V4: + return Hexagon::STrih_GP_cNotPt_nv_V4; + + case Hexagon::STrih_GP_cdnPt_V4: + return Hexagon::STrih_GP_cdnPt_nv_V4; + + case Hexagon::STrih_GP_cdnNotPt_V4: + return Hexagon::STrih_GP_cdnNotPt_nv_V4; + + // store new value word + case Hexagon::STriw: + return Hexagon::STriw_nv_V4; + + case Hexagon::STriw_indexed: + return Hexagon::STriw_indexed_nv_V4; + + case Hexagon::STriw_indexed_shl_V4: + return Hexagon::STriw_indexed_shl_nv_V4; + + case Hexagon::STriw_shl_V4: + return Hexagon::STriw_shl_nv_V4; + + case Hexagon::STriw_GP_V4: + return Hexagon::STriw_GP_nv_V4; + + case Hexagon::STw_GP_V4: + return Hexagon::STw_GP_nv_V4; + + case Hexagon::POST_STwri: + return Hexagon::POST_STwri_nv_V4; + + case Hexagon::STriw_cPt: + return Hexagon::STriw_cPt_nv_V4; + + case Hexagon::STriw_cdnPt_V4: + return Hexagon::STriw_cdnPt_nv_V4; + + case Hexagon::STriw_cNotPt: + return Hexagon::STriw_cNotPt_nv_V4; + + case Hexagon::STriw_cdnNotPt_V4: + return Hexagon::STriw_cdnNotPt_nv_V4; + + case Hexagon::STriw_indexed_cPt: + return Hexagon::STriw_indexed_cPt_nv_V4; + + case Hexagon::STriw_indexed_cdnPt_V4: + return Hexagon::STriw_indexed_cdnPt_nv_V4; + + case Hexagon::STriw_indexed_cNotPt: + return Hexagon::STriw_indexed_cNotPt_nv_V4; + + case Hexagon::STriw_indexed_cdnNotPt_V4: + return Hexagon::STriw_indexed_cdnNotPt_nv_V4; + + case Hexagon::STriw_indexed_shl_cPt_V4: + return Hexagon::STriw_indexed_shl_cPt_nv_V4; + + case Hexagon::STriw_indexed_shl_cdnPt_V4: + return Hexagon::STriw_indexed_shl_cdnPt_nv_V4; + + case Hexagon::STriw_indexed_shl_cNotPt_V4: + return Hexagon::STriw_indexed_shl_cNotPt_nv_V4; + + case Hexagon::STriw_indexed_shl_cdnNotPt_V4: + return Hexagon::STriw_indexed_shl_cdnNotPt_nv_V4; + + case 
Hexagon::POST_STwri_cPt: + return Hexagon::POST_STwri_cPt_nv_V4; + + case Hexagon::POST_STwri_cdnPt_V4: + return Hexagon::POST_STwri_cdnPt_nv_V4; + + case Hexagon::POST_STwri_cNotPt: + return Hexagon::POST_STwri_cNotPt_nv_V4; + + case Hexagon::POST_STwri_cdnNotPt_V4: + return Hexagon::POST_STwri_cdnNotPt_nv_V4; + + case Hexagon::STw_GP_cPt_V4: + return Hexagon::STw_GP_cPt_nv_V4; + + case Hexagon::STw_GP_cNotPt_V4: + return Hexagon::STw_GP_cNotPt_nv_V4; + + case Hexagon::STw_GP_cdnPt_V4: + return Hexagon::STw_GP_cdnPt_nv_V4; + + case Hexagon::STw_GP_cdnNotPt_V4: + return Hexagon::STw_GP_cdnNotPt_nv_V4; + + case Hexagon::STriw_GP_cPt_V4: + return Hexagon::STriw_GP_cPt_nv_V4; + + case Hexagon::STriw_GP_cNotPt_V4: + return Hexagon::STriw_GP_cNotPt_nv_V4; + + case Hexagon::STriw_GP_cdnPt_V4: + return Hexagon::STriw_GP_cdnPt_nv_V4; + + case Hexagon::STriw_GP_cdnNotPt_V4: + return Hexagon::STriw_GP_cdnNotPt_nv_V4; + } +} + +// Return .new predicate version for an instruction +static int GetDotNewPredOp(const int opc) { + switch (opc) { + default: llvm_unreachable("Unknown .new type"); + // Conditional stores + // Store byte conditionally + case Hexagon::STrib_cPt : + return Hexagon::STrib_cdnPt_V4; + + case Hexagon::STrib_cNotPt : + return Hexagon::STrib_cdnNotPt_V4; + + case Hexagon::STrib_indexed_cPt : + return Hexagon::STrib_indexed_cdnPt_V4; + + case Hexagon::STrib_indexed_cNotPt : + return Hexagon::STrib_indexed_cdnNotPt_V4; + + case Hexagon::STrib_imm_cPt_V4 : + return Hexagon::STrib_imm_cdnPt_V4; + + case Hexagon::STrib_imm_cNotPt_V4 : + return Hexagon::STrib_imm_cdnNotPt_V4; + + case Hexagon::POST_STbri_cPt : + return Hexagon::POST_STbri_cdnPt_V4; + + case Hexagon::POST_STbri_cNotPt : + return Hexagon::POST_STbri_cdnNotPt_V4; + + case Hexagon::STrib_indexed_shl_cPt_V4 : + return Hexagon::STrib_indexed_shl_cdnPt_V4; + + case Hexagon::STrib_indexed_shl_cNotPt_V4 : + return Hexagon::STrib_indexed_shl_cdnNotPt_V4; + + case Hexagon::STb_GP_cPt_V4 : + return Hexagon::STb_GP_cdnPt_V4; + + case Hexagon::STb_GP_cNotPt_V4 : + return Hexagon::STb_GP_cdnNotPt_V4; + + case Hexagon::STrib_GP_cPt_V4 : + return Hexagon::STrib_GP_cdnPt_V4; + + case Hexagon::STrib_GP_cNotPt_V4 : + return Hexagon::STrib_GP_cdnNotPt_V4; + + // Store doubleword conditionally + case Hexagon::STrid_cPt : + return Hexagon::STrid_cdnPt_V4; + + case Hexagon::STrid_cNotPt : + return Hexagon::STrid_cdnNotPt_V4; + + case Hexagon::STrid_indexed_cPt : + return Hexagon::STrid_indexed_cdnPt_V4; + + case Hexagon::STrid_indexed_cNotPt : + return Hexagon::STrid_indexed_cdnNotPt_V4; + + case Hexagon::STrid_indexed_shl_cPt_V4 : + return Hexagon::STrid_indexed_shl_cdnPt_V4; + + case Hexagon::STrid_indexed_shl_cNotPt_V4 : + return Hexagon::STrid_indexed_shl_cdnNotPt_V4; + + case Hexagon::POST_STdri_cPt : + return Hexagon::POST_STdri_cdnPt_V4; + + case Hexagon::POST_STdri_cNotPt : + return Hexagon::POST_STdri_cdnNotPt_V4; + + case Hexagon::STd_GP_cPt_V4 : + return Hexagon::STd_GP_cdnPt_V4; + + case Hexagon::STd_GP_cNotPt_V4 : + return Hexagon::STd_GP_cdnNotPt_V4; + + case Hexagon::STrid_GP_cPt_V4 : + return Hexagon::STrid_GP_cdnPt_V4; + + case Hexagon::STrid_GP_cNotPt_V4 : + return Hexagon::STrid_GP_cdnNotPt_V4; + + // Store halfword conditionally + case Hexagon::STrih_cPt : + return Hexagon::STrih_cdnPt_V4; + + case Hexagon::STrih_cNotPt : + return Hexagon::STrih_cdnNotPt_V4; + + case Hexagon::STrih_indexed_cPt : + return Hexagon::STrih_indexed_cdnPt_V4; + + case Hexagon::STrih_indexed_cNotPt : + return Hexagon::STrih_indexed_cdnNotPt_V4; + + 
case Hexagon::STrih_imm_cPt_V4 :
+    return Hexagon::STrih_imm_cdnPt_V4;
+
+  case Hexagon::STrih_imm_cNotPt_V4 :
+    return Hexagon::STrih_imm_cdnNotPt_V4;
+
+  case Hexagon::STrih_indexed_shl_cPt_V4 :
+    return Hexagon::STrih_indexed_shl_cdnPt_V4;
+
+  case Hexagon::STrih_indexed_shl_cNotPt_V4 :
+    return Hexagon::STrih_indexed_shl_cdnNotPt_V4;
+
+  case Hexagon::POST_SThri_cPt :
+    return Hexagon::POST_SThri_cdnPt_V4;
+
+  case Hexagon::POST_SThri_cNotPt :
+    return Hexagon::POST_SThri_cdnNotPt_V4;
+
+  case Hexagon::STh_GP_cPt_V4 :
+    return Hexagon::STh_GP_cdnPt_V4;
+
+  case Hexagon::STh_GP_cNotPt_V4 :
+    return Hexagon::STh_GP_cdnNotPt_V4;
+
+  case Hexagon::STrih_GP_cPt_V4 :
+    return Hexagon::STrih_GP_cdnPt_V4;
+
+  case Hexagon::STrih_GP_cNotPt_V4 :
+    return Hexagon::STrih_GP_cdnNotPt_V4;
+
+  // Store word conditionally
+  case Hexagon::STriw_cPt :
+    return Hexagon::STriw_cdnPt_V4;
+
+  case Hexagon::STriw_cNotPt :
+    return Hexagon::STriw_cdnNotPt_V4;
+
+  case Hexagon::STriw_indexed_cPt :
+    return Hexagon::STriw_indexed_cdnPt_V4;
+
+  case Hexagon::STriw_indexed_cNotPt :
+    return Hexagon::STriw_indexed_cdnNotPt_V4;
+
+  case Hexagon::STriw_imm_cPt_V4 :
+    return Hexagon::STriw_imm_cdnPt_V4;
+
+  case Hexagon::STriw_imm_cNotPt_V4 :
+    return Hexagon::STriw_imm_cdnNotPt_V4;
+
+  case Hexagon::STriw_indexed_shl_cPt_V4 :
+    return Hexagon::STriw_indexed_shl_cdnPt_V4;
+
+  case Hexagon::STriw_indexed_shl_cNotPt_V4 :
+    return Hexagon::STriw_indexed_shl_cdnNotPt_V4;
+
+  case Hexagon::POST_STwri_cPt :
+    return Hexagon::POST_STwri_cdnPt_V4;
+
+  case Hexagon::POST_STwri_cNotPt :
+    return Hexagon::POST_STwri_cdnNotPt_V4;
+
+  case Hexagon::STw_GP_cPt_V4 :
+    return Hexagon::STw_GP_cdnPt_V4;
+
+  case Hexagon::STw_GP_cNotPt_V4 :
+    return Hexagon::STw_GP_cdnNotPt_V4;
+
+  case Hexagon::STriw_GP_cPt_V4 :
+    return Hexagon::STriw_GP_cdnPt_V4;
+
+  case Hexagon::STriw_GP_cNotPt_V4 :
+    return Hexagon::STriw_GP_cdnNotPt_V4;
+
+  // Conditional Jumps
+  case Hexagon::JMP_c:
+    return Hexagon::JMP_cdnPt;
+
+  case Hexagon::JMP_cNot:
+    return Hexagon::JMP_cdnNotPt;
+
+  case Hexagon::JMPR_cPt:
+    return Hexagon::JMPR_cdnPt_V3;
+
+  case Hexagon::JMPR_cNotPt:
+    return Hexagon::JMPR_cdnNotPt_V3;
+
+  // Conditional Transfers
+  case Hexagon::TFR_cPt:
+    return Hexagon::TFR_cdnPt;
+
+  case Hexagon::TFR_cNotPt:
+    return Hexagon::TFR_cdnNotPt;
+
+  case Hexagon::TFRI_cPt:
+    return Hexagon::TFRI_cdnPt;
+
+  case Hexagon::TFRI_cNotPt:
+    return Hexagon::TFRI_cdnNotPt;
+
+  // Load double word
+  case Hexagon::LDrid_cPt :
+    return Hexagon::LDrid_cdnPt;
+
+  case Hexagon::LDrid_cNotPt :
+    return Hexagon::LDrid_cdnNotPt;
+
+  case Hexagon::LDrid_indexed_cPt :
+    return Hexagon::LDrid_indexed_cdnPt;
+
+  case Hexagon::LDrid_indexed_cNotPt :
+    return Hexagon::LDrid_indexed_cdnNotPt;
+
+  case Hexagon::POST_LDrid_cPt :
+    return Hexagon::POST_LDrid_cdnPt_V4;
+
+  case Hexagon::POST_LDrid_cNotPt :
+    return Hexagon::POST_LDrid_cdnNotPt_V4;
+
+  // Load word
+  case Hexagon::LDriw_cPt :
+    return Hexagon::LDriw_cdnPt;
+
+  case Hexagon::LDriw_cNotPt :
+    return Hexagon::LDriw_cdnNotPt;
+
+  case Hexagon::LDriw_indexed_cPt :
+    return Hexagon::LDriw_indexed_cdnPt;
+
+  case Hexagon::LDriw_indexed_cNotPt :
+    return Hexagon::LDriw_indexed_cdnNotPt;
+
+  case Hexagon::POST_LDriw_cPt :
+    return Hexagon::POST_LDriw_cdnPt_V4;
+
+  case Hexagon::POST_LDriw_cNotPt :
+    return Hexagon::POST_LDriw_cdnNotPt_V4;
+
+  // Load halfword
+  case Hexagon::LDrih_cPt :
+    return Hexagon::LDrih_cdnPt;
+
+  case Hexagon::LDrih_cNotPt :
+    return Hexagon::LDrih_cdnNotPt;
+
+  case Hexagon::LDrih_indexed_cPt :
+
return Hexagon::LDrih_indexed_cdnPt; + + case Hexagon::LDrih_indexed_cNotPt : + return Hexagon::LDrih_indexed_cdnNotPt; + + case Hexagon::POST_LDrih_cPt : + return Hexagon::POST_LDrih_cdnPt_V4; + + case Hexagon::POST_LDrih_cNotPt : + return Hexagon::POST_LDrih_cdnNotPt_V4; + + // Load byte + case Hexagon::LDrib_cPt : + return Hexagon::LDrib_cdnPt; + + case Hexagon::LDrib_cNotPt : + return Hexagon::LDrib_cdnNotPt; + + case Hexagon::LDrib_indexed_cPt : + return Hexagon::LDrib_indexed_cdnPt; + + case Hexagon::LDrib_indexed_cNotPt : + return Hexagon::LDrib_indexed_cdnNotPt; + + case Hexagon::POST_LDrib_cPt : + return Hexagon::POST_LDrib_cdnPt_V4; + + case Hexagon::POST_LDrib_cNotPt : + return Hexagon::POST_LDrib_cdnNotPt_V4; + + // Load unsigned halfword + case Hexagon::LDriuh_cPt : + return Hexagon::LDriuh_cdnPt; + + case Hexagon::LDriuh_cNotPt : + return Hexagon::LDriuh_cdnNotPt; + + case Hexagon::LDriuh_indexed_cPt : + return Hexagon::LDriuh_indexed_cdnPt; + + case Hexagon::LDriuh_indexed_cNotPt : + return Hexagon::LDriuh_indexed_cdnNotPt; + + case Hexagon::POST_LDriuh_cPt : + return Hexagon::POST_LDriuh_cdnPt_V4; + + case Hexagon::POST_LDriuh_cNotPt : + return Hexagon::POST_LDriuh_cdnNotPt_V4; + + // Load unsigned byte + case Hexagon::LDriub_cPt : + return Hexagon::LDriub_cdnPt; + + case Hexagon::LDriub_cNotPt : + return Hexagon::LDriub_cdnNotPt; + + case Hexagon::LDriub_indexed_cPt : + return Hexagon::LDriub_indexed_cdnPt; + + case Hexagon::LDriub_indexed_cNotPt : + return Hexagon::LDriub_indexed_cdnNotPt; + + case Hexagon::POST_LDriub_cPt : + return Hexagon::POST_LDriub_cdnPt_V4; + + case Hexagon::POST_LDriub_cNotPt : + return Hexagon::POST_LDriub_cdnNotPt_V4; + + // V4 indexed+scaled load + + case Hexagon::LDrid_indexed_cPt_V4 : + return Hexagon::LDrid_indexed_cdnPt_V4; + + case Hexagon::LDrid_indexed_cNotPt_V4 : + return Hexagon::LDrid_indexed_cdnNotPt_V4; + + case Hexagon::LDrid_indexed_shl_cPt_V4 : + return Hexagon::LDrid_indexed_shl_cdnPt_V4; + + case Hexagon::LDrid_indexed_shl_cNotPt_V4 : + return Hexagon::LDrid_indexed_shl_cdnNotPt_V4; + + case Hexagon::LDrib_indexed_cPt_V4 : + return Hexagon::LDrib_indexed_cdnPt_V4; + + case Hexagon::LDrib_indexed_cNotPt_V4 : + return Hexagon::LDrib_indexed_cdnNotPt_V4; + + case Hexagon::LDrib_indexed_shl_cPt_V4 : + return Hexagon::LDrib_indexed_shl_cdnPt_V4; + + case Hexagon::LDrib_indexed_shl_cNotPt_V4 : + return Hexagon::LDrib_indexed_shl_cdnNotPt_V4; + + case Hexagon::LDriub_indexed_cPt_V4 : + return Hexagon::LDriub_indexed_cdnPt_V4; + + case Hexagon::LDriub_indexed_cNotPt_V4 : + return Hexagon::LDriub_indexed_cdnNotPt_V4; + + case Hexagon::LDriub_indexed_shl_cPt_V4 : + return Hexagon::LDriub_indexed_shl_cdnPt_V4; + + case Hexagon::LDriub_indexed_shl_cNotPt_V4 : + return Hexagon::LDriub_indexed_shl_cdnNotPt_V4; + + case Hexagon::LDrih_indexed_cPt_V4 : + return Hexagon::LDrih_indexed_cdnPt_V4; + + case Hexagon::LDrih_indexed_cNotPt_V4 : + return Hexagon::LDrih_indexed_cdnNotPt_V4; + + case Hexagon::LDrih_indexed_shl_cPt_V4 : + return Hexagon::LDrih_indexed_shl_cdnPt_V4; + + case Hexagon::LDrih_indexed_shl_cNotPt_V4 : + return Hexagon::LDrih_indexed_shl_cdnNotPt_V4; + + case Hexagon::LDriuh_indexed_cPt_V4 : + return Hexagon::LDriuh_indexed_cdnPt_V4; + + case Hexagon::LDriuh_indexed_cNotPt_V4 : + return Hexagon::LDriuh_indexed_cdnNotPt_V4; + + case Hexagon::LDriuh_indexed_shl_cPt_V4 : + return Hexagon::LDriuh_indexed_shl_cdnPt_V4; + + case Hexagon::LDriuh_indexed_shl_cNotPt_V4 : + return Hexagon::LDriuh_indexed_shl_cdnNotPt_V4; + + case 
Hexagon::LDriw_indexed_cPt_V4 : + return Hexagon::LDriw_indexed_cdnPt_V4; + + case Hexagon::LDriw_indexed_cNotPt_V4 : + return Hexagon::LDriw_indexed_cdnNotPt_V4; + + case Hexagon::LDriw_indexed_shl_cPt_V4 : + return Hexagon::LDriw_indexed_shl_cdnPt_V4; + + case Hexagon::LDriw_indexed_shl_cNotPt_V4 : + return Hexagon::LDriw_indexed_shl_cdnNotPt_V4; + + // V4 global address load + + case Hexagon::LDd_GP_cPt_V4: + return Hexagon::LDd_GP_cdnPt_V4; + + case Hexagon::LDd_GP_cNotPt_V4: + return Hexagon::LDd_GP_cdnNotPt_V4; + + case Hexagon::LDb_GP_cPt_V4: + return Hexagon::LDb_GP_cdnPt_V4; + + case Hexagon::LDb_GP_cNotPt_V4: + return Hexagon::LDb_GP_cdnNotPt_V4; + + case Hexagon::LDub_GP_cPt_V4: + return Hexagon::LDub_GP_cdnPt_V4; + + case Hexagon::LDub_GP_cNotPt_V4: + return Hexagon::LDub_GP_cdnNotPt_V4; + + case Hexagon::LDh_GP_cPt_V4: + return Hexagon::LDh_GP_cdnPt_V4; + + case Hexagon::LDh_GP_cNotPt_V4: + return Hexagon::LDh_GP_cdnNotPt_V4; + + case Hexagon::LDuh_GP_cPt_V4: + return Hexagon::LDuh_GP_cdnPt_V4; + + case Hexagon::LDuh_GP_cNotPt_V4: + return Hexagon::LDuh_GP_cdnNotPt_V4; + + case Hexagon::LDw_GP_cPt_V4: + return Hexagon::LDw_GP_cdnPt_V4; + + case Hexagon::LDw_GP_cNotPt_V4: + return Hexagon::LDw_GP_cdnNotPt_V4; + + case Hexagon::LDrid_GP_cPt_V4: + return Hexagon::LDrid_GP_cdnPt_V4; + + case Hexagon::LDrid_GP_cNotPt_V4: + return Hexagon::LDrid_GP_cdnNotPt_V4; + + case Hexagon::LDrib_GP_cPt_V4: + return Hexagon::LDrib_GP_cdnPt_V4; + + case Hexagon::LDrib_GP_cNotPt_V4: + return Hexagon::LDrib_GP_cdnNotPt_V4; + + case Hexagon::LDriub_GP_cPt_V4: + return Hexagon::LDriub_GP_cdnPt_V4; + + case Hexagon::LDriub_GP_cNotPt_V4: + return Hexagon::LDriub_GP_cdnNotPt_V4; + + case Hexagon::LDrih_GP_cPt_V4: + return Hexagon::LDrih_GP_cdnPt_V4; + + case Hexagon::LDrih_GP_cNotPt_V4: + return Hexagon::LDrih_GP_cdnNotPt_V4; + + case Hexagon::LDriuh_GP_cPt_V4: + return Hexagon::LDriuh_GP_cdnPt_V4; + + case Hexagon::LDriuh_GP_cNotPt_V4: + return Hexagon::LDriuh_GP_cdnNotPt_V4; + + case Hexagon::LDriw_GP_cPt_V4: + return Hexagon::LDriw_GP_cdnPt_V4; + + case Hexagon::LDriw_GP_cNotPt_V4: + return Hexagon::LDriw_GP_cdnNotPt_V4; + + // Conditional store new-value byte + case Hexagon::STrib_cPt_nv_V4 : + return Hexagon::STrib_cdnPt_nv_V4; + case Hexagon::STrib_cNotPt_nv_V4 : + return Hexagon::STrib_cdnNotPt_nv_V4; + + case Hexagon::STrib_indexed_cPt_nv_V4 : + return Hexagon::STrib_indexed_cdnPt_nv_V4; + case Hexagon::STrib_indexed_cNotPt_nv_V4 : + return Hexagon::STrib_indexed_cdnNotPt_nv_V4; + + case Hexagon::STrib_indexed_shl_cPt_nv_V4 : + return Hexagon::STrib_indexed_shl_cdnPt_nv_V4; + case Hexagon::STrib_indexed_shl_cNotPt_nv_V4 : + return Hexagon::STrib_indexed_shl_cdnNotPt_nv_V4; + + case Hexagon::POST_STbri_cPt_nv_V4 : + return Hexagon::POST_STbri_cdnPt_nv_V4; + case Hexagon::POST_STbri_cNotPt_nv_V4 : + return Hexagon::POST_STbri_cdnNotPt_nv_V4; + + case Hexagon::STb_GP_cPt_nv_V4 : + return Hexagon::STb_GP_cdnPt_nv_V4; + + case Hexagon::STb_GP_cNotPt_nv_V4 : + return Hexagon::STb_GP_cdnNotPt_nv_V4; + + case Hexagon::STrib_GP_cPt_nv_V4 : + return Hexagon::STrib_GP_cdnPt_nv_V4; + + case Hexagon::STrib_GP_cNotPt_nv_V4 : + return Hexagon::STrib_GP_cdnNotPt_nv_V4; + + // Conditional store new-value halfword + case Hexagon::STrih_cPt_nv_V4 : + return Hexagon::STrih_cdnPt_nv_V4; + case Hexagon::STrih_cNotPt_nv_V4 : + return Hexagon::STrih_cdnNotPt_nv_V4; + + case Hexagon::STrih_indexed_cPt_nv_V4 : + return Hexagon::STrih_indexed_cdnPt_nv_V4; + case Hexagon::STrih_indexed_cNotPt_nv_V4 : + return 
Hexagon::STrih_indexed_cdnNotPt_nv_V4; + + case Hexagon::STrih_indexed_shl_cPt_nv_V4 : + return Hexagon::STrih_indexed_shl_cdnPt_nv_V4; + case Hexagon::STrih_indexed_shl_cNotPt_nv_V4 : + return Hexagon::STrih_indexed_shl_cdnNotPt_nv_V4; + + case Hexagon::POST_SThri_cPt_nv_V4 : + return Hexagon::POST_SThri_cdnPt_nv_V4; + case Hexagon::POST_SThri_cNotPt_nv_V4 : + return Hexagon::POST_SThri_cdnNotPt_nv_V4; + + case Hexagon::STh_GP_cPt_nv_V4 : + return Hexagon::STh_GP_cdnPt_nv_V4; + + case Hexagon::STh_GP_cNotPt_nv_V4 : + return Hexagon::STh_GP_cdnNotPt_nv_V4; + + case Hexagon::STrih_GP_cPt_nv_V4 : + return Hexagon::STrih_GP_cdnPt_nv_V4; + + case Hexagon::STrih_GP_cNotPt_nv_V4 : + return Hexagon::STrih_GP_cdnNotPt_nv_V4; + + // Conditional store new-value word + case Hexagon::STriw_cPt_nv_V4 : + return Hexagon::STriw_cdnPt_nv_V4; + case Hexagon::STriw_cNotPt_nv_V4 : + return Hexagon::STriw_cdnNotPt_nv_V4; + + case Hexagon::STriw_indexed_cPt_nv_V4 : + return Hexagon::STriw_indexed_cdnPt_nv_V4; + case Hexagon::STriw_indexed_cNotPt_nv_V4 : + return Hexagon::STriw_indexed_cdnNotPt_nv_V4; + + case Hexagon::STriw_indexed_shl_cPt_nv_V4 : + return Hexagon::STriw_indexed_shl_cdnPt_nv_V4; + case Hexagon::STriw_indexed_shl_cNotPt_nv_V4 : + return Hexagon::STriw_indexed_shl_cdnNotPt_nv_V4; + + case Hexagon::POST_STwri_cPt_nv_V4 : + return Hexagon::POST_STwri_cdnPt_nv_V4; + case Hexagon::POST_STwri_cNotPt_nv_V4: + return Hexagon::POST_STwri_cdnNotPt_nv_V4; + + case Hexagon::STw_GP_cPt_nv_V4 : + return Hexagon::STw_GP_cdnPt_nv_V4; + + case Hexagon::STw_GP_cNotPt_nv_V4 : + return Hexagon::STw_GP_cdnNotPt_nv_V4; + + case Hexagon::STriw_GP_cPt_nv_V4 : + return Hexagon::STriw_GP_cdnPt_nv_V4; + + case Hexagon::STriw_GP_cNotPt_nv_V4 : + return Hexagon::STriw_GP_cdnNotPt_nv_V4; + + // Conditional add + case Hexagon::ADD_ri_cPt : + return Hexagon::ADD_ri_cdnPt; + case Hexagon::ADD_ri_cNotPt : + return Hexagon::ADD_ri_cdnNotPt; + + case Hexagon::ADD_rr_cPt : + return Hexagon::ADD_rr_cdnPt; + case Hexagon::ADD_rr_cNotPt : + return Hexagon::ADD_rr_cdnNotPt; + + // Conditional logical Operations + case Hexagon::XOR_rr_cPt : + return Hexagon::XOR_rr_cdnPt; + case Hexagon::XOR_rr_cNotPt : + return Hexagon::XOR_rr_cdnNotPt; + + case Hexagon::AND_rr_cPt : + return Hexagon::AND_rr_cdnPt; + case Hexagon::AND_rr_cNotPt : + return Hexagon::AND_rr_cdnNotPt; + + case Hexagon::OR_rr_cPt : + return Hexagon::OR_rr_cdnPt; + case Hexagon::OR_rr_cNotPt : + return Hexagon::OR_rr_cdnNotPt; + + // Conditional Subtract + case Hexagon::SUB_rr_cPt : + return Hexagon::SUB_rr_cdnPt; + case Hexagon::SUB_rr_cNotPt : + return Hexagon::SUB_rr_cdnNotPt; + + // Conditional combine + case Hexagon::COMBINE_rr_cPt : + return Hexagon::COMBINE_rr_cdnPt; + case Hexagon::COMBINE_rr_cNotPt : + return Hexagon::COMBINE_rr_cdnNotPt; + + case Hexagon::ASLH_cPt_V4 : + return Hexagon::ASLH_cdnPt_V4; + case Hexagon::ASLH_cNotPt_V4 : + return Hexagon::ASLH_cdnNotPt_V4; + + case Hexagon::ASRH_cPt_V4 : + return Hexagon::ASRH_cdnPt_V4; + case Hexagon::ASRH_cNotPt_V4 : + return Hexagon::ASRH_cdnNotPt_V4; + + case Hexagon::SXTB_cPt_V4 : + return Hexagon::SXTB_cdnPt_V4; + case Hexagon::SXTB_cNotPt_V4 : + return Hexagon::SXTB_cdnNotPt_V4; + + case Hexagon::SXTH_cPt_V4 : + return Hexagon::SXTH_cdnPt_V4; + case Hexagon::SXTH_cNotPt_V4 : + return Hexagon::SXTH_cdnNotPt_V4; + + case Hexagon::ZXTB_cPt_V4 : + return Hexagon::ZXTB_cdnPt_V4; + case Hexagon::ZXTB_cNotPt_V4 : + return Hexagon::ZXTB_cdnNotPt_V4; + + case Hexagon::ZXTH_cPt_V4 : + return Hexagon::ZXTH_cdnPt_V4; + 
case Hexagon::ZXTH_cNotPt_V4 : + return Hexagon::ZXTH_cdnNotPt_V4; + } +} + +// Returns true if an instruction can be promoted to .new predicate +// or new-value store. +bool HexagonPacketizerList::isNewifiable(MachineInstr* MI) { + return isCondInst(MI) || IsNewifyStore(MI); +} + +bool HexagonPacketizerList::isCondInst (MachineInstr* MI) { + const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; + const MCInstrDesc& TID = MI->getDesc(); + // Bug 5670: until that is fixed, + // this portion is disabled. + if ( TID.isConditionalBranch() // && !IsRegisterJump(MI)) || + || QII->isConditionalTransfer(MI) + || QII->isConditionalALU32(MI) + || QII->isConditionalLoad(MI) + || QII->isConditionalStore(MI)) { + return true; + } + return false; +} + + +// Promote an instruction to its .new form. +// At this time, we have already made a call to CanPromoteToDotNew +// and made sure that it can *indeed* be promoted. +bool HexagonPacketizerList::PromoteToDotNew(MachineInstr* MI, + SDep::Kind DepType, MachineBasicBlock::iterator &MII, + const TargetRegisterClass* RC) { + + assert (DepType == SDep::Data); + const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; + + int NewOpcode; + if (RC == &Hexagon::PredRegsRegClass) + NewOpcode = GetDotNewPredOp(MI->getOpcode()); + else + NewOpcode = GetDotNewOp(MI->getOpcode()); + MI->setDesc(QII->get(NewOpcode)); + + return true; +} + +// Returns the most basic instruction for the .new predicated instructions and +// new-value stores. +// For example, all of the following instructions will be converted back to the +// same instruction: +// 1) if (p0.new) memw(R0+#0) = R1.new ---> +// 2) if (p0) memw(R0+#0) = R1.new -------> if (p0) memw(R0+#0) = R1 +// 3) if (p0.new) memw(R0+#0) = R1 ---> +// +// To understand the translation of instruction 1 to its original form, consider +// a packet with 3 instructions. +// { p0 = cmp.eq(R0,R1) +// if (p0.new) R2 = add(R3, R4) +// R5 = add (R3, R1) +// } +// if (p0) memw(R5+#0) = R2 <--- trying to include it in the previous packet +// +// This instruction can be part of the previous packet only if both p0 and R2 +// are promoted to .new values. This promotion happens in steps: first the +// predicate register is promoted to .new, and in the next iteration R2 is +// promoted. Therefore, in case of a dependence check failure (due to R5) during +// the next iteration, it should be converted back to its most basic form. 
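// Editorial sketch (not part of the patch): GetDotOldOp below is intended to
// invert the .new promotion map above (presumably GetDotNewPredOp, used by
// PromoteToDotNew), so a promote/demote round trip should return the original
// opcode for any predicated instruction. A debug-only check along these lines
// (names taken from this file) would make that invariant explicit:
//
//   static void CheckDotNewRoundTrip(int Opc) {
//     // e.g. TFR_cPt -> TFR_cdnPt -> TFR_cPt
//     assert(GetDotOldOp(GetDotNewPredOp(Opc)) == Opc &&
//            "promote/demote opcode maps are out of sync");
//   }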
+ +static int GetDotOldOp(const int opc) { + switch (opc) { + default: llvm_unreachable("Unknown .old type"); + case Hexagon::TFR_cdnPt: + return Hexagon::TFR_cPt; + + case Hexagon::TFR_cdnNotPt: + return Hexagon::TFR_cNotPt; + + case Hexagon::TFRI_cdnPt: + return Hexagon::TFRI_cPt; + + case Hexagon::TFRI_cdnNotPt: + return Hexagon::TFRI_cNotPt; + + case Hexagon::JMP_cdnPt: + return Hexagon::JMP_c; + + case Hexagon::JMP_cdnNotPt: + return Hexagon::JMP_cNot; + + case Hexagon::JMPR_cdnPt_V3: + return Hexagon::JMPR_cPt; + + case Hexagon::JMPR_cdnNotPt_V3: + return Hexagon::JMPR_cNotPt; + + // Load double word + + case Hexagon::LDrid_cdnPt : + return Hexagon::LDrid_cPt; + + case Hexagon::LDrid_cdnNotPt : + return Hexagon::LDrid_cNotPt; + + case Hexagon::LDrid_indexed_cdnPt : + return Hexagon::LDrid_indexed_cPt; + + case Hexagon::LDrid_indexed_cdnNotPt : + return Hexagon::LDrid_indexed_cNotPt; + + case Hexagon::POST_LDrid_cdnPt_V4 : + return Hexagon::POST_LDrid_cPt; + + case Hexagon::POST_LDrid_cdnNotPt_V4 : + return Hexagon::POST_LDrid_cNotPt; + + // Load word + + case Hexagon::LDriw_cdnPt : + return Hexagon::LDriw_cPt; + + case Hexagon::LDriw_cdnNotPt : + return Hexagon::LDriw_cNotPt; + + case Hexagon::LDriw_indexed_cdnPt : + return Hexagon::LDriw_indexed_cPt; + + case Hexagon::LDriw_indexed_cdnNotPt : + return Hexagon::LDriw_indexed_cNotPt; + + case Hexagon::POST_LDriw_cdnPt_V4 : + return Hexagon::POST_LDriw_cPt; + + case Hexagon::POST_LDriw_cdnNotPt_V4 : + return Hexagon::POST_LDriw_cNotPt; + + // Load half + + case Hexagon::LDrih_cdnPt : + return Hexagon::LDrih_cPt; + + case Hexagon::LDrih_cdnNotPt : + return Hexagon::LDrih_cNotPt; + + case Hexagon::LDrih_indexed_cdnPt : + return Hexagon::LDrih_indexed_cPt; + + case Hexagon::LDrih_indexed_cdnNotPt : + return Hexagon::LDrih_indexed_cNotPt; + + case Hexagon::POST_LDrih_cdnPt_V4 : + return Hexagon::POST_LDrih_cPt; + + case Hexagon::POST_LDrih_cdnNotPt_V4 : + return Hexagon::POST_LDrih_cNotPt; + + // Load byte + + case Hexagon::LDrib_cdnPt : + return Hexagon::LDrib_cPt; + + case Hexagon::LDrib_cdnNotPt : + return Hexagon::LDrib_cNotPt; + + case Hexagon::LDrib_indexed_cdnPt : + return Hexagon::LDrib_indexed_cPt; + + case Hexagon::LDrib_indexed_cdnNotPt : + return Hexagon::LDrib_indexed_cNotPt; + + case Hexagon::POST_LDrib_cdnPt_V4 : + return Hexagon::POST_LDrib_cPt; + + case Hexagon::POST_LDrib_cdnNotPt_V4 : + return Hexagon::POST_LDrib_cNotPt; + + // Load unsigned half + + case Hexagon::LDriuh_cdnPt : + return Hexagon::LDriuh_cPt; + + case Hexagon::LDriuh_cdnNotPt : + return Hexagon::LDriuh_cNotPt; + + case Hexagon::LDriuh_indexed_cdnPt : + return Hexagon::LDriuh_indexed_cPt; + + case Hexagon::LDriuh_indexed_cdnNotPt : + return Hexagon::LDriuh_indexed_cNotPt; + + case Hexagon::POST_LDriuh_cdnPt_V4 : + return Hexagon::POST_LDriuh_cPt; + + case Hexagon::POST_LDriuh_cdnNotPt_V4 : + return Hexagon::POST_LDriuh_cNotPt; + + // Load unsigned byte + case Hexagon::LDriub_cdnPt : + return Hexagon::LDriub_cPt; + + case Hexagon::LDriub_cdnNotPt : + return Hexagon::LDriub_cNotPt; + + case Hexagon::LDriub_indexed_cdnPt : + return Hexagon::LDriub_indexed_cPt; + + case Hexagon::LDriub_indexed_cdnNotPt : + return Hexagon::LDriub_indexed_cNotPt; + + case Hexagon::POST_LDriub_cdnPt_V4 : + return Hexagon::POST_LDriub_cPt; + + case Hexagon::POST_LDriub_cdnNotPt_V4 : + return Hexagon::POST_LDriub_cNotPt; + + // V4 indexed+scaled Load + + case Hexagon::LDrid_indexed_cdnPt_V4 : + return Hexagon::LDrid_indexed_cPt_V4; + + case Hexagon::LDrid_indexed_cdnNotPt_V4 : + 
return Hexagon::LDrid_indexed_cNotPt_V4; + + case Hexagon::LDrid_indexed_shl_cdnPt_V4 : + return Hexagon::LDrid_indexed_shl_cPt_V4; + + case Hexagon::LDrid_indexed_shl_cdnNotPt_V4 : + return Hexagon::LDrid_indexed_shl_cNotPt_V4; + + case Hexagon::LDrib_indexed_cdnPt_V4 : + return Hexagon::LDrib_indexed_cPt_V4; + + case Hexagon::LDrib_indexed_cdnNotPt_V4 : + return Hexagon::LDrib_indexed_cNotPt_V4; + + case Hexagon::LDrib_indexed_shl_cdnPt_V4 : + return Hexagon::LDrib_indexed_shl_cPt_V4; + + case Hexagon::LDrib_indexed_shl_cdnNotPt_V4 : + return Hexagon::LDrib_indexed_shl_cNotPt_V4; + + case Hexagon::LDriub_indexed_cdnPt_V4 : + return Hexagon::LDriub_indexed_cPt_V4; + + case Hexagon::LDriub_indexed_cdnNotPt_V4 : + return Hexagon::LDriub_indexed_cNotPt_V4; + + case Hexagon::LDriub_indexed_shl_cdnPt_V4 : + return Hexagon::LDriub_indexed_shl_cPt_V4; + + case Hexagon::LDriub_indexed_shl_cdnNotPt_V4 : + return Hexagon::LDriub_indexed_shl_cNotPt_V4; + + case Hexagon::LDrih_indexed_cdnPt_V4 : + return Hexagon::LDrih_indexed_cPt_V4; + + case Hexagon::LDrih_indexed_cdnNotPt_V4 : + return Hexagon::LDrih_indexed_cNotPt_V4; + + case Hexagon::LDrih_indexed_shl_cdnPt_V4 : + return Hexagon::LDrih_indexed_shl_cPt_V4; + + case Hexagon::LDrih_indexed_shl_cdnNotPt_V4 : + return Hexagon::LDrih_indexed_shl_cNotPt_V4; + + case Hexagon::LDriuh_indexed_cdnPt_V4 : + return Hexagon::LDriuh_indexed_cPt_V4; + + case Hexagon::LDriuh_indexed_cdnNotPt_V4 : + return Hexagon::LDriuh_indexed_cNotPt_V4; + + case Hexagon::LDriuh_indexed_shl_cdnPt_V4 : + return Hexagon::LDriuh_indexed_shl_cPt_V4; + + case Hexagon::LDriuh_indexed_shl_cdnNotPt_V4 : + return Hexagon::LDriuh_indexed_shl_cNotPt_V4; + + case Hexagon::LDriw_indexed_cdnPt_V4 : + return Hexagon::LDriw_indexed_cPt_V4; + + case Hexagon::LDriw_indexed_cdnNotPt_V4 : + return Hexagon::LDriw_indexed_cNotPt_V4; + + case Hexagon::LDriw_indexed_shl_cdnPt_V4 : + return Hexagon::LDriw_indexed_shl_cPt_V4; + + case Hexagon::LDriw_indexed_shl_cdnNotPt_V4 : + return Hexagon::LDriw_indexed_shl_cNotPt_V4; + + // V4 global address load + + case Hexagon::LDd_GP_cdnPt_V4: + return Hexagon::LDd_GP_cPt_V4; + + case Hexagon::LDd_GP_cdnNotPt_V4: + return Hexagon::LDd_GP_cNotPt_V4; + + case Hexagon::LDb_GP_cdnPt_V4: + return Hexagon::LDb_GP_cPt_V4; + + case Hexagon::LDb_GP_cdnNotPt_V4: + return Hexagon::LDb_GP_cNotPt_V4; + + case Hexagon::LDub_GP_cdnPt_V4: + return Hexagon::LDub_GP_cPt_V4; + + case Hexagon::LDub_GP_cdnNotPt_V4: + return Hexagon::LDub_GP_cNotPt_V4; + + case Hexagon::LDh_GP_cdnPt_V4: + return Hexagon::LDh_GP_cPt_V4; + + case Hexagon::LDh_GP_cdnNotPt_V4: + return Hexagon::LDh_GP_cNotPt_V4; + + case Hexagon::LDuh_GP_cdnPt_V4: + return Hexagon::LDuh_GP_cPt_V4; + + case Hexagon::LDuh_GP_cdnNotPt_V4: + return Hexagon::LDuh_GP_cNotPt_V4; + + case Hexagon::LDw_GP_cdnPt_V4: + return Hexagon::LDw_GP_cPt_V4; + + case Hexagon::LDw_GP_cdnNotPt_V4: + return Hexagon::LDw_GP_cNotPt_V4; + + case Hexagon::LDrid_GP_cdnPt_V4: + return Hexagon::LDrid_GP_cPt_V4; + + case Hexagon::LDrid_GP_cdnNotPt_V4: + return Hexagon::LDrid_GP_cNotPt_V4; + + case Hexagon::LDrib_GP_cdnPt_V4: + return Hexagon::LDrib_GP_cPt_V4; + + case Hexagon::LDrib_GP_cdnNotPt_V4: + return Hexagon::LDrib_GP_cNotPt_V4; + + case Hexagon::LDriub_GP_cdnPt_V4: + return Hexagon::LDriub_GP_cPt_V4; + + case Hexagon::LDriub_GP_cdnNotPt_V4: + return Hexagon::LDriub_GP_cNotPt_V4; + + case Hexagon::LDrih_GP_cdnPt_V4: + return Hexagon::LDrih_GP_cPt_V4; + + case Hexagon::LDrih_GP_cdnNotPt_V4: + return Hexagon::LDrih_GP_cNotPt_V4; + + case 
Hexagon::LDriuh_GP_cdnPt_V4: + return Hexagon::LDriuh_GP_cPt_V4; + + case Hexagon::LDriuh_GP_cdnNotPt_V4: + return Hexagon::LDriuh_GP_cNotPt_V4; + + case Hexagon::LDriw_GP_cdnPt_V4: + return Hexagon::LDriw_GP_cPt_V4; + + case Hexagon::LDriw_GP_cdnNotPt_V4: + return Hexagon::LDriw_GP_cNotPt_V4; + + // Conditional add + + case Hexagon::ADD_ri_cdnPt : + return Hexagon::ADD_ri_cPt; + case Hexagon::ADD_ri_cdnNotPt : + return Hexagon::ADD_ri_cNotPt; + + case Hexagon::ADD_rr_cdnPt : + return Hexagon::ADD_rr_cPt; + case Hexagon::ADD_rr_cdnNotPt: + return Hexagon::ADD_rr_cNotPt; + + // Conditional logical Operations + + case Hexagon::XOR_rr_cdnPt : + return Hexagon::XOR_rr_cPt; + case Hexagon::XOR_rr_cdnNotPt : + return Hexagon::XOR_rr_cNotPt; + + case Hexagon::AND_rr_cdnPt : + return Hexagon::AND_rr_cPt; + case Hexagon::AND_rr_cdnNotPt : + return Hexagon::AND_rr_cNotPt; + + case Hexagon::OR_rr_cdnPt : + return Hexagon::OR_rr_cPt; + case Hexagon::OR_rr_cdnNotPt : + return Hexagon::OR_rr_cNotPt; + + // Conditional Subtract + + case Hexagon::SUB_rr_cdnPt : + return Hexagon::SUB_rr_cPt; + case Hexagon::SUB_rr_cdnNotPt : + return Hexagon::SUB_rr_cNotPt; + + // Conditional combine + + case Hexagon::COMBINE_rr_cdnPt : + return Hexagon::COMBINE_rr_cPt; + case Hexagon::COMBINE_rr_cdnNotPt : + return Hexagon::COMBINE_rr_cNotPt; + +// Conditional shift operations + + case Hexagon::ASLH_cdnPt_V4 : + return Hexagon::ASLH_cPt_V4; + case Hexagon::ASLH_cdnNotPt_V4 : + return Hexagon::ASLH_cNotPt_V4; + + case Hexagon::ASRH_cdnPt_V4 : + return Hexagon::ASRH_cPt_V4; + case Hexagon::ASRH_cdnNotPt_V4 : + return Hexagon::ASRH_cNotPt_V4; + + case Hexagon::SXTB_cdnPt_V4 : + return Hexagon::SXTB_cPt_V4; + case Hexagon::SXTB_cdnNotPt_V4 : + return Hexagon::SXTB_cNotPt_V4; + + case Hexagon::SXTH_cdnPt_V4 : + return Hexagon::SXTH_cPt_V4; + case Hexagon::SXTH_cdnNotPt_V4 : + return Hexagon::SXTH_cNotPt_V4; + + case Hexagon::ZXTB_cdnPt_V4 : + return Hexagon::ZXTB_cPt_V4; + case Hexagon::ZXTB_cdnNotPt_V4 : + return Hexagon::ZXTB_cNotPt_V4; + + case Hexagon::ZXTH_cdnPt_V4 : + return Hexagon::ZXTH_cPt_V4; + case Hexagon::ZXTH_cdnNotPt_V4 : + return Hexagon::ZXTH_cNotPt_V4; + + // Store byte + + case Hexagon::STrib_imm_cdnPt_V4 : + return Hexagon::STrib_imm_cPt_V4; + + case Hexagon::STrib_imm_cdnNotPt_V4 : + return Hexagon::STrib_imm_cNotPt_V4; + + case Hexagon::STrib_cdnPt_nv_V4 : + case Hexagon::STrib_cPt_nv_V4 : + case Hexagon::STrib_cdnPt_V4 : + return Hexagon::STrib_cPt; + + case Hexagon::STrib_cdnNotPt_nv_V4 : + case Hexagon::STrib_cNotPt_nv_V4 : + case Hexagon::STrib_cdnNotPt_V4 : + return Hexagon::STrib_cNotPt; + + case Hexagon::STrib_indexed_cdnPt_V4 : + case Hexagon::STrib_indexed_cPt_nv_V4 : + case Hexagon::STrib_indexed_cdnPt_nv_V4 : + return Hexagon::STrib_indexed_cPt; + + case Hexagon::STrib_indexed_cdnNotPt_V4 : + case Hexagon::STrib_indexed_cNotPt_nv_V4 : + case Hexagon::STrib_indexed_cdnNotPt_nv_V4 : + return Hexagon::STrib_indexed_cNotPt; + + case Hexagon::STrib_indexed_shl_cdnPt_nv_V4: + case Hexagon::STrib_indexed_shl_cPt_nv_V4 : + case Hexagon::STrib_indexed_shl_cdnPt_V4 : + return Hexagon::STrib_indexed_shl_cPt_V4; + + case Hexagon::STrib_indexed_shl_cdnNotPt_nv_V4: + case Hexagon::STrib_indexed_shl_cNotPt_nv_V4 : + case Hexagon::STrib_indexed_shl_cdnNotPt_V4 : + return Hexagon::STrib_indexed_shl_cNotPt_V4; + + case Hexagon::POST_STbri_cdnPt_nv_V4 : + case Hexagon::POST_STbri_cPt_nv_V4 : + case Hexagon::POST_STbri_cdnPt_V4 : + return Hexagon::POST_STbri_cPt; + + case Hexagon::POST_STbri_cdnNotPt_nv_V4 : + 
case Hexagon::POST_STbri_cNotPt_nv_V4: + case Hexagon::POST_STbri_cdnNotPt_V4 : + return Hexagon::POST_STbri_cNotPt; + + case Hexagon::STb_GP_cdnPt_nv_V4: + case Hexagon::STb_GP_cdnPt_V4: + case Hexagon::STb_GP_cPt_nv_V4: + return Hexagon::STb_GP_cPt_V4; + + case Hexagon::STb_GP_cdnNotPt_nv_V4: + case Hexagon::STb_GP_cdnNotPt_V4: + case Hexagon::STb_GP_cNotPt_nv_V4: + return Hexagon::STb_GP_cNotPt_V4; + + case Hexagon::STrib_GP_cdnPt_nv_V4: + case Hexagon::STrib_GP_cdnPt_V4: + case Hexagon::STrib_GP_cPt_nv_V4: + return Hexagon::STrib_GP_cPt_V4; + + case Hexagon::STrib_GP_cdnNotPt_nv_V4: + case Hexagon::STrib_GP_cdnNotPt_V4: + case Hexagon::STrib_GP_cNotPt_nv_V4: + return Hexagon::STrib_GP_cNotPt_V4; + + // Store new-value byte - unconditional + case Hexagon::STrib_nv_V4: + return Hexagon::STrib; + + case Hexagon::STrib_indexed_nv_V4: + return Hexagon::STrib_indexed; + + case Hexagon::STrib_indexed_shl_nv_V4: + return Hexagon::STrib_indexed_shl_V4; + + case Hexagon::STrib_shl_nv_V4: + return Hexagon::STrib_shl_V4; + + case Hexagon::STrib_GP_nv_V4: + return Hexagon::STrib_GP_V4; + + case Hexagon::STb_GP_nv_V4: + return Hexagon::STb_GP_V4; + + case Hexagon::POST_STbri_nv_V4: + return Hexagon::POST_STbri; + + // Store halfword + case Hexagon::STrih_imm_cdnPt_V4 : + return Hexagon::STrih_imm_cPt_V4; + + case Hexagon::STrih_imm_cdnNotPt_V4 : + return Hexagon::STrih_imm_cNotPt_V4; + + case Hexagon::STrih_cdnPt_nv_V4 : + case Hexagon::STrih_cPt_nv_V4 : + case Hexagon::STrih_cdnPt_V4 : + return Hexagon::STrih_cPt; + + case Hexagon::STrih_cdnNotPt_nv_V4 : + case Hexagon::STrih_cNotPt_nv_V4 : + case Hexagon::STrih_cdnNotPt_V4 : + return Hexagon::STrih_cNotPt; + + case Hexagon::STrih_indexed_cdnPt_nv_V4: + case Hexagon::STrih_indexed_cPt_nv_V4 : + case Hexagon::STrih_indexed_cdnPt_V4 : + return Hexagon::STrih_indexed_cPt; + + case Hexagon::STrih_indexed_cdnNotPt_nv_V4: + case Hexagon::STrih_indexed_cNotPt_nv_V4 : + case Hexagon::STrih_indexed_cdnNotPt_V4 : + return Hexagon::STrih_indexed_cNotPt; + + case Hexagon::STrih_indexed_shl_cdnPt_nv_V4 : + case Hexagon::STrih_indexed_shl_cPt_nv_V4 : + case Hexagon::STrih_indexed_shl_cdnPt_V4 : + return Hexagon::STrih_indexed_shl_cPt_V4; + + case Hexagon::STrih_indexed_shl_cdnNotPt_nv_V4 : + case Hexagon::STrih_indexed_shl_cNotPt_nv_V4 : + case Hexagon::STrih_indexed_shl_cdnNotPt_V4 : + return Hexagon::STrih_indexed_shl_cNotPt_V4; + + case Hexagon::POST_SThri_cdnPt_nv_V4 : + case Hexagon::POST_SThri_cPt_nv_V4 : + case Hexagon::POST_SThri_cdnPt_V4 : + return Hexagon::POST_SThri_cPt; + + case Hexagon::POST_SThri_cdnNotPt_nv_V4 : + case Hexagon::POST_SThri_cNotPt_nv_V4 : + case Hexagon::POST_SThri_cdnNotPt_V4 : + return Hexagon::POST_SThri_cNotPt; + + case Hexagon::STh_GP_cdnPt_nv_V4: + case Hexagon::STh_GP_cdnPt_V4: + case Hexagon::STh_GP_cPt_nv_V4: + return Hexagon::STh_GP_cPt_V4; + + case Hexagon::STh_GP_cdnNotPt_nv_V4: + case Hexagon::STh_GP_cdnNotPt_V4: + case Hexagon::STh_GP_cNotPt_nv_V4: + return Hexagon::STh_GP_cNotPt_V4; + + case Hexagon::STrih_GP_cdnPt_nv_V4: + case Hexagon::STrih_GP_cdnPt_V4: + case Hexagon::STrih_GP_cPt_nv_V4: + return Hexagon::STrih_GP_cPt_V4; + + case Hexagon::STrih_GP_cdnNotPt_nv_V4: + case Hexagon::STrih_GP_cdnNotPt_V4: + case Hexagon::STrih_GP_cNotPt_nv_V4: + return Hexagon::STrih_GP_cNotPt_V4; + + // Store new-value halfword - unconditional + + case Hexagon::STrih_nv_V4: + return Hexagon::STrih; + + case Hexagon::STrih_indexed_nv_V4: + return Hexagon::STrih_indexed; + + case Hexagon::STrih_indexed_shl_nv_V4: + return 
Hexagon::STrih_indexed_shl_V4; + + case Hexagon::STrih_shl_nv_V4: + return Hexagon::STrih_shl_V4; + + case Hexagon::STrih_GP_nv_V4: + return Hexagon::STrih_GP_V4; + + case Hexagon::STh_GP_nv_V4: + return Hexagon::STh_GP_V4; + + case Hexagon::POST_SThri_nv_V4: + return Hexagon::POST_SThri; + + // Store word + + case Hexagon::STriw_imm_cdnPt_V4 : + return Hexagon::STriw_imm_cPt_V4; + + case Hexagon::STriw_imm_cdnNotPt_V4 : + return Hexagon::STriw_imm_cNotPt_V4; + + case Hexagon::STriw_cdnPt_nv_V4 : + case Hexagon::STriw_cPt_nv_V4 : + case Hexagon::STriw_cdnPt_V4 : + return Hexagon::STriw_cPt; + + case Hexagon::STriw_cdnNotPt_nv_V4 : + case Hexagon::STriw_cNotPt_nv_V4 : + case Hexagon::STriw_cdnNotPt_V4 : + return Hexagon::STriw_cNotPt; + + case Hexagon::STriw_indexed_cdnPt_nv_V4 : + case Hexagon::STriw_indexed_cPt_nv_V4 : + case Hexagon::STriw_indexed_cdnPt_V4 : + return Hexagon::STriw_indexed_cPt; + + case Hexagon::STriw_indexed_cdnNotPt_nv_V4 : + case Hexagon::STriw_indexed_cNotPt_nv_V4 : + case Hexagon::STriw_indexed_cdnNotPt_V4 : + return Hexagon::STriw_indexed_cNotPt; + + case Hexagon::STriw_indexed_shl_cdnPt_nv_V4 : + case Hexagon::STriw_indexed_shl_cPt_nv_V4 : + case Hexagon::STriw_indexed_shl_cdnPt_V4 : + return Hexagon::STriw_indexed_shl_cPt_V4; + + case Hexagon::STriw_indexed_shl_cdnNotPt_nv_V4 : + case Hexagon::STriw_indexed_shl_cNotPt_nv_V4 : + case Hexagon::STriw_indexed_shl_cdnNotPt_V4 : + return Hexagon::STriw_indexed_shl_cNotPt_V4; + + case Hexagon::POST_STwri_cdnPt_nv_V4 : + case Hexagon::POST_STwri_cPt_nv_V4 : + case Hexagon::POST_STwri_cdnPt_V4 : + return Hexagon::POST_STwri_cPt; + + case Hexagon::POST_STwri_cdnNotPt_nv_V4 : + case Hexagon::POST_STwri_cNotPt_nv_V4 : + case Hexagon::POST_STwri_cdnNotPt_V4 : + return Hexagon::POST_STwri_cNotPt; + + case Hexagon::STw_GP_cdnPt_nv_V4: + case Hexagon::STw_GP_cdnPt_V4: + case Hexagon::STw_GP_cPt_nv_V4: + return Hexagon::STw_GP_cPt_V4; + + case Hexagon::STw_GP_cdnNotPt_nv_V4: + case Hexagon::STw_GP_cdnNotPt_V4: + case Hexagon::STw_GP_cNotPt_nv_V4: + return Hexagon::STw_GP_cNotPt_V4; + + case Hexagon::STriw_GP_cdnPt_nv_V4: + case Hexagon::STriw_GP_cdnPt_V4: + case Hexagon::STriw_GP_cPt_nv_V4: + return Hexagon::STriw_GP_cPt_V4; + + case Hexagon::STriw_GP_cdnNotPt_nv_V4: + case Hexagon::STriw_GP_cdnNotPt_V4: + case Hexagon::STriw_GP_cNotPt_nv_V4: + return Hexagon::STriw_GP_cNotPt_V4; + + // Store new-value word - unconditional + + case Hexagon::STriw_nv_V4: + return Hexagon::STriw; + + case Hexagon::STriw_indexed_nv_V4: + return Hexagon::STriw_indexed; + + case Hexagon::STriw_indexed_shl_nv_V4: + return Hexagon::STriw_indexed_shl_V4; + + case Hexagon::STriw_shl_nv_V4: + return Hexagon::STriw_shl_V4; + + case Hexagon::STriw_GP_nv_V4: + return Hexagon::STriw_GP_V4; + + case Hexagon::STw_GP_nv_V4: + return Hexagon::STw_GP_V4; + + case Hexagon::POST_STwri_nv_V4: + return Hexagon::POST_STwri; + + // Store doubleword + + case Hexagon::STrid_cdnPt_V4 : + return Hexagon::STrid_cPt; + + case Hexagon::STrid_cdnNotPt_V4 : + return Hexagon::STrid_cNotPt; + + case Hexagon::STrid_indexed_cdnPt_V4 : + return Hexagon::STrid_indexed_cPt; + + case Hexagon::STrid_indexed_cdnNotPt_V4 : + return Hexagon::STrid_indexed_cNotPt; + + case Hexagon::STrid_indexed_shl_cdnPt_V4 : + return Hexagon::STrid_indexed_shl_cPt_V4; + + case Hexagon::STrid_indexed_shl_cdnNotPt_V4 : + return Hexagon::STrid_indexed_shl_cNotPt_V4; + + case Hexagon::POST_STdri_cdnPt_V4 : + return Hexagon::POST_STdri_cPt; + + case Hexagon::POST_STdri_cdnNotPt_V4 : + return 
Hexagon::POST_STdri_cNotPt; + + case Hexagon::STd_GP_cdnPt_V4 : + return Hexagon::STd_GP_cPt_V4; + + case Hexagon::STd_GP_cdnNotPt_V4 : + return Hexagon::STd_GP_cNotPt_V4; + + case Hexagon::STrid_GP_cdnPt_V4 : + return Hexagon::STrid_GP_cPt_V4; + + case Hexagon::STrid_GP_cdnNotPt_V4 : + return Hexagon::STrid_GP_cNotPt_V4; + } +} + +bool HexagonPacketizerList::DemoteToDotOld(MachineInstr* MI) { + const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; + int NewOpcode = GetDotOldOp(MI->getOpcode()); + MI->setDesc(QII->get(NewOpcode)); + return true; +} + +// Returns true if an instruction is predicated on p0 and false if it's +// predicated on !p0. + +static bool GetPredicateSense(MachineInstr* MI, + const HexagonInstrInfo *QII) { + + switch (MI->getOpcode()) { + default: llvm_unreachable("Unknown predicate sense of the instruction"); + case Hexagon::TFR_cPt: + case Hexagon::TFR_cdnPt: + case Hexagon::TFRI_cPt: + case Hexagon::TFRI_cdnPt: + case Hexagon::STrib_cPt : + case Hexagon::STrib_cdnPt_V4 : + case Hexagon::STrib_indexed_cPt : + case Hexagon::STrib_indexed_cdnPt_V4 : + case Hexagon::STrib_indexed_shl_cPt_V4 : + case Hexagon::STrib_indexed_shl_cdnPt_V4 : + case Hexagon::POST_STbri_cPt : + case Hexagon::POST_STbri_cdnPt_V4 : + case Hexagon::STrih_cPt : + case Hexagon::STrih_cdnPt_V4 : + case Hexagon::STrih_indexed_cPt : + case Hexagon::STrih_indexed_cdnPt_V4 : + case Hexagon::STrih_indexed_shl_cPt_V4 : + case Hexagon::STrih_indexed_shl_cdnPt_V4 : + case Hexagon::POST_SThri_cPt : + case Hexagon::POST_SThri_cdnPt_V4 : + case Hexagon::STriw_cPt : + case Hexagon::STriw_cdnPt_V4 : + case Hexagon::STriw_indexed_cPt : + case Hexagon::STriw_indexed_cdnPt_V4 : + case Hexagon::STriw_indexed_shl_cPt_V4 : + case Hexagon::STriw_indexed_shl_cdnPt_V4 : + case Hexagon::POST_STwri_cPt : + case Hexagon::POST_STwri_cdnPt_V4 : + case Hexagon::STrib_imm_cPt_V4 : + case Hexagon::STrib_imm_cdnPt_V4 : + case Hexagon::STrid_cPt : + case Hexagon::STrid_cdnPt_V4 : + case Hexagon::STrid_indexed_cPt : + case Hexagon::STrid_indexed_cdnPt_V4 : + case Hexagon::STrid_indexed_shl_cPt_V4 : + case Hexagon::STrid_indexed_shl_cdnPt_V4 : + case Hexagon::POST_STdri_cPt : + case Hexagon::POST_STdri_cdnPt_V4 : + case Hexagon::STrih_imm_cPt_V4 : + case Hexagon::STrih_imm_cdnPt_V4 : + case Hexagon::STriw_imm_cPt_V4 : + case Hexagon::STriw_imm_cdnPt_V4 : + case Hexagon::JMP_cdnPt : + case Hexagon::LDrid_cPt : + case Hexagon::LDrid_cdnPt : + case Hexagon::LDrid_indexed_cPt : + case Hexagon::LDrid_indexed_cdnPt : + case Hexagon::POST_LDrid_cPt : + case Hexagon::POST_LDrid_cdnPt_V4 : + case Hexagon::LDriw_cPt : + case Hexagon::LDriw_cdnPt : + case Hexagon::LDriw_indexed_cPt : + case Hexagon::LDriw_indexed_cdnPt : + case Hexagon::POST_LDriw_cPt : + case Hexagon::POST_LDriw_cdnPt_V4 : + case Hexagon::LDrih_cPt : + case Hexagon::LDrih_cdnPt : + case Hexagon::LDrih_indexed_cPt : + case Hexagon::LDrih_indexed_cdnPt : + case Hexagon::POST_LDrih_cPt : + case Hexagon::POST_LDrih_cdnPt_V4 : + case Hexagon::LDrib_cPt : + case Hexagon::LDrib_cdnPt : + case Hexagon::LDrib_indexed_cPt : + case Hexagon::LDrib_indexed_cdnPt : + case Hexagon::POST_LDrib_cPt : + case Hexagon::POST_LDrib_cdnPt_V4 : + case Hexagon::LDriuh_cPt : + case Hexagon::LDriuh_cdnPt : + case Hexagon::LDriuh_indexed_cPt : + case Hexagon::LDriuh_indexed_cdnPt : + case Hexagon::POST_LDriuh_cPt : + case Hexagon::POST_LDriuh_cdnPt_V4 : + case Hexagon::LDriub_cPt : + case Hexagon::LDriub_cdnPt : + case Hexagon::LDriub_indexed_cPt : + case Hexagon::LDriub_indexed_cdnPt : + case 
Hexagon::POST_LDriub_cPt : + case Hexagon::POST_LDriub_cdnPt_V4 : + case Hexagon::LDrid_indexed_cPt_V4 : + case Hexagon::LDrid_indexed_cdnPt_V4 : + case Hexagon::LDrid_indexed_shl_cPt_V4 : + case Hexagon::LDrid_indexed_shl_cdnPt_V4 : + case Hexagon::LDrib_indexed_cPt_V4 : + case Hexagon::LDrib_indexed_cdnPt_V4 : + case Hexagon::LDrib_indexed_shl_cPt_V4 : + case Hexagon::LDrib_indexed_shl_cdnPt_V4 : + case Hexagon::LDriub_indexed_cPt_V4 : + case Hexagon::LDriub_indexed_cdnPt_V4 : + case Hexagon::LDriub_indexed_shl_cPt_V4 : + case Hexagon::LDriub_indexed_shl_cdnPt_V4 : + case Hexagon::LDrih_indexed_cPt_V4 : + case Hexagon::LDrih_indexed_cdnPt_V4 : + case Hexagon::LDrih_indexed_shl_cPt_V4 : + case Hexagon::LDrih_indexed_shl_cdnPt_V4 : + case Hexagon::LDriuh_indexed_cPt_V4 : + case Hexagon::LDriuh_indexed_cdnPt_V4 : + case Hexagon::LDriuh_indexed_shl_cPt_V4 : + case Hexagon::LDriuh_indexed_shl_cdnPt_V4 : + case Hexagon::LDriw_indexed_cPt_V4 : + case Hexagon::LDriw_indexed_cdnPt_V4 : + case Hexagon::LDriw_indexed_shl_cPt_V4 : + case Hexagon::LDriw_indexed_shl_cdnPt_V4 : + case Hexagon::ADD_ri_cPt : + case Hexagon::ADD_ri_cdnPt : + case Hexagon::ADD_rr_cPt : + case Hexagon::ADD_rr_cdnPt : + case Hexagon::XOR_rr_cPt : + case Hexagon::XOR_rr_cdnPt : + case Hexagon::AND_rr_cPt : + case Hexagon::AND_rr_cdnPt : + case Hexagon::OR_rr_cPt : + case Hexagon::OR_rr_cdnPt : + case Hexagon::SUB_rr_cPt : + case Hexagon::SUB_rr_cdnPt : + case Hexagon::COMBINE_rr_cPt : + case Hexagon::COMBINE_rr_cdnPt : + case Hexagon::ASLH_cPt_V4 : + case Hexagon::ASLH_cdnPt_V4 : + case Hexagon::ASRH_cPt_V4 : + case Hexagon::ASRH_cdnPt_V4 : + case Hexagon::SXTB_cPt_V4 : + case Hexagon::SXTB_cdnPt_V4 : + case Hexagon::SXTH_cPt_V4 : + case Hexagon::SXTH_cdnPt_V4 : + case Hexagon::ZXTB_cPt_V4 : + case Hexagon::ZXTB_cdnPt_V4 : + case Hexagon::ZXTH_cPt_V4 : + case Hexagon::ZXTH_cdnPt_V4 : + case Hexagon::LDrid_GP_cPt_V4 : + case Hexagon::LDrib_GP_cPt_V4 : + case Hexagon::LDriub_GP_cPt_V4 : + case Hexagon::LDrih_GP_cPt_V4 : + case Hexagon::LDriuh_GP_cPt_V4 : + case Hexagon::LDriw_GP_cPt_V4 : + case Hexagon::LDd_GP_cPt_V4 : + case Hexagon::LDb_GP_cPt_V4 : + case Hexagon::LDub_GP_cPt_V4 : + case Hexagon::LDh_GP_cPt_V4 : + case Hexagon::LDuh_GP_cPt_V4 : + case Hexagon::LDw_GP_cPt_V4 : + case Hexagon::STrid_GP_cPt_V4 : + case Hexagon::STrib_GP_cPt_V4 : + case Hexagon::STrih_GP_cPt_V4 : + case Hexagon::STriw_GP_cPt_V4 : + case Hexagon::STd_GP_cPt_V4 : + case Hexagon::STb_GP_cPt_V4 : + case Hexagon::STh_GP_cPt_V4 : + case Hexagon::STw_GP_cPt_V4 : + case Hexagon::LDrid_GP_cdnPt_V4 : + case Hexagon::LDrib_GP_cdnPt_V4 : + case Hexagon::LDriub_GP_cdnPt_V4 : + case Hexagon::LDrih_GP_cdnPt_V4 : + case Hexagon::LDriuh_GP_cdnPt_V4 : + case Hexagon::LDriw_GP_cdnPt_V4 : + case Hexagon::LDd_GP_cdnPt_V4 : + case Hexagon::LDb_GP_cdnPt_V4 : + case Hexagon::LDub_GP_cdnPt_V4 : + case Hexagon::LDh_GP_cdnPt_V4 : + case Hexagon::LDuh_GP_cdnPt_V4 : + case Hexagon::LDw_GP_cdnPt_V4 : + case Hexagon::STrid_GP_cdnPt_V4 : + case Hexagon::STrib_GP_cdnPt_V4 : + case Hexagon::STrih_GP_cdnPt_V4 : + case Hexagon::STriw_GP_cdnPt_V4 : + case Hexagon::STd_GP_cdnPt_V4 : + case Hexagon::STb_GP_cdnPt_V4 : + case Hexagon::STh_GP_cdnPt_V4 : + case Hexagon::STw_GP_cdnPt_V4 : + return true; + + case Hexagon::TFR_cNotPt: + case Hexagon::TFR_cdnNotPt: + case Hexagon::TFRI_cNotPt: + case Hexagon::TFRI_cdnNotPt: + case Hexagon::STrib_cNotPt : + case Hexagon::STrib_cdnNotPt_V4 : + case Hexagon::STrib_indexed_cNotPt : + case Hexagon::STrib_indexed_cdnNotPt_V4 : + case 
Hexagon::STrib_indexed_shl_cNotPt_V4 : + case Hexagon::STrib_indexed_shl_cdnNotPt_V4 : + case Hexagon::POST_STbri_cNotPt : + case Hexagon::POST_STbri_cdnNotPt_V4 : + case Hexagon::STrih_cNotPt : + case Hexagon::STrih_cdnNotPt_V4 : + case Hexagon::STrih_indexed_cNotPt : + case Hexagon::STrih_indexed_cdnNotPt_V4 : + case Hexagon::STrih_indexed_shl_cNotPt_V4 : + case Hexagon::STrih_indexed_shl_cdnNotPt_V4 : + case Hexagon::POST_SThri_cNotPt : + case Hexagon::POST_SThri_cdnNotPt_V4 : + case Hexagon::STriw_cNotPt : + case Hexagon::STriw_cdnNotPt_V4 : + case Hexagon::STriw_indexed_cNotPt : + case Hexagon::STriw_indexed_cdnNotPt_V4 : + case Hexagon::STriw_indexed_shl_cNotPt_V4 : + case Hexagon::STriw_indexed_shl_cdnNotPt_V4 : + case Hexagon::POST_STwri_cNotPt : + case Hexagon::POST_STwri_cdnNotPt_V4 : + case Hexagon::STrib_imm_cNotPt_V4 : + case Hexagon::STrib_imm_cdnNotPt_V4 : + case Hexagon::STrid_cNotPt : + case Hexagon::STrid_cdnNotPt_V4 : + case Hexagon::STrid_indexed_cdnNotPt_V4 : + case Hexagon::STrid_indexed_cNotPt : + case Hexagon::STrid_indexed_shl_cNotPt_V4 : + case Hexagon::STrid_indexed_shl_cdnNotPt_V4 : + case Hexagon::POST_STdri_cNotPt : + case Hexagon::POST_STdri_cdnNotPt_V4 : + case Hexagon::STrih_imm_cNotPt_V4 : + case Hexagon::STrih_imm_cdnNotPt_V4 : + case Hexagon::STriw_imm_cNotPt_V4 : + case Hexagon::STriw_imm_cdnNotPt_V4 : + case Hexagon::JMP_cdnNotPt : + case Hexagon::LDrid_cNotPt : + case Hexagon::LDrid_cdnNotPt : + case Hexagon::LDrid_indexed_cNotPt : + case Hexagon::LDrid_indexed_cdnNotPt : + case Hexagon::POST_LDrid_cNotPt : + case Hexagon::POST_LDrid_cdnNotPt_V4 : + case Hexagon::LDriw_cNotPt : + case Hexagon::LDriw_cdnNotPt : + case Hexagon::LDriw_indexed_cNotPt : + case Hexagon::LDriw_indexed_cdnNotPt : + case Hexagon::POST_LDriw_cNotPt : + case Hexagon::POST_LDriw_cdnNotPt_V4 : + case Hexagon::LDrih_cNotPt : + case Hexagon::LDrih_cdnNotPt : + case Hexagon::LDrih_indexed_cNotPt : + case Hexagon::LDrih_indexed_cdnNotPt : + case Hexagon::POST_LDrih_cNotPt : + case Hexagon::POST_LDrih_cdnNotPt_V4 : + case Hexagon::LDrib_cNotPt : + case Hexagon::LDrib_cdnNotPt : + case Hexagon::LDrib_indexed_cNotPt : + case Hexagon::LDrib_indexed_cdnNotPt : + case Hexagon::POST_LDrib_cNotPt : + case Hexagon::POST_LDrib_cdnNotPt_V4 : + case Hexagon::LDriuh_cNotPt : + case Hexagon::LDriuh_cdnNotPt : + case Hexagon::LDriuh_indexed_cNotPt : + case Hexagon::LDriuh_indexed_cdnNotPt : + case Hexagon::POST_LDriuh_cNotPt : + case Hexagon::POST_LDriuh_cdnNotPt_V4 : + case Hexagon::LDriub_cNotPt : + case Hexagon::LDriub_cdnNotPt : + case Hexagon::LDriub_indexed_cNotPt : + case Hexagon::LDriub_indexed_cdnNotPt : + case Hexagon::POST_LDriub_cNotPt : + case Hexagon::POST_LDriub_cdnNotPt_V4 : + case Hexagon::LDrid_indexed_cNotPt_V4 : + case Hexagon::LDrid_indexed_cdnNotPt_V4 : + case Hexagon::LDrid_indexed_shl_cNotPt_V4 : + case Hexagon::LDrid_indexed_shl_cdnNotPt_V4 : + case Hexagon::LDrib_indexed_cNotPt_V4 : + case Hexagon::LDrib_indexed_cdnNotPt_V4 : + case Hexagon::LDrib_indexed_shl_cNotPt_V4 : + case Hexagon::LDrib_indexed_shl_cdnNotPt_V4 : + case Hexagon::LDriub_indexed_cNotPt_V4 : + case Hexagon::LDriub_indexed_cdnNotPt_V4 : + case Hexagon::LDriub_indexed_shl_cNotPt_V4 : + case Hexagon::LDriub_indexed_shl_cdnNotPt_V4 : + case Hexagon::LDrih_indexed_cNotPt_V4 : + case Hexagon::LDrih_indexed_cdnNotPt_V4 : + case Hexagon::LDrih_indexed_shl_cNotPt_V4 : + case Hexagon::LDrih_indexed_shl_cdnNotPt_V4 : + case Hexagon::LDriuh_indexed_cNotPt_V4 : + case Hexagon::LDriuh_indexed_cdnNotPt_V4 : + case 
Hexagon::LDriuh_indexed_shl_cNotPt_V4 : + case Hexagon::LDriuh_indexed_shl_cdnNotPt_V4 : + case Hexagon::LDriw_indexed_cNotPt_V4 : + case Hexagon::LDriw_indexed_cdnNotPt_V4 : + case Hexagon::LDriw_indexed_shl_cNotPt_V4 : + case Hexagon::LDriw_indexed_shl_cdnNotPt_V4 : + case Hexagon::ADD_ri_cNotPt : + case Hexagon::ADD_ri_cdnNotPt : + case Hexagon::ADD_rr_cNotPt : + case Hexagon::ADD_rr_cdnNotPt : + case Hexagon::XOR_rr_cNotPt : + case Hexagon::XOR_rr_cdnNotPt : + case Hexagon::AND_rr_cNotPt : + case Hexagon::AND_rr_cdnNotPt : + case Hexagon::OR_rr_cNotPt : + case Hexagon::OR_rr_cdnNotPt : + case Hexagon::SUB_rr_cNotPt : + case Hexagon::SUB_rr_cdnNotPt : + case Hexagon::COMBINE_rr_cNotPt : + case Hexagon::COMBINE_rr_cdnNotPt : + case Hexagon::ASLH_cNotPt_V4 : + case Hexagon::ASLH_cdnNotPt_V4 : + case Hexagon::ASRH_cNotPt_V4 : + case Hexagon::ASRH_cdnNotPt_V4 : + case Hexagon::SXTB_cNotPt_V4 : + case Hexagon::SXTB_cdnNotPt_V4 : + case Hexagon::SXTH_cNotPt_V4 : + case Hexagon::SXTH_cdnNotPt_V4 : + case Hexagon::ZXTB_cNotPt_V4 : + case Hexagon::ZXTB_cdnNotPt_V4 : + case Hexagon::ZXTH_cNotPt_V4 : + case Hexagon::ZXTH_cdnNotPt_V4 : + + case Hexagon::LDrid_GP_cNotPt_V4 : + case Hexagon::LDrib_GP_cNotPt_V4 : + case Hexagon::LDriub_GP_cNotPt_V4 : + case Hexagon::LDrih_GP_cNotPt_V4 : + case Hexagon::LDriuh_GP_cNotPt_V4 : + case Hexagon::LDriw_GP_cNotPt_V4 : + case Hexagon::LDd_GP_cNotPt_V4 : + case Hexagon::LDb_GP_cNotPt_V4 : + case Hexagon::LDub_GP_cNotPt_V4 : + case Hexagon::LDh_GP_cNotPt_V4 : + case Hexagon::LDuh_GP_cNotPt_V4 : + case Hexagon::LDw_GP_cNotPt_V4 : + case Hexagon::STrid_GP_cNotPt_V4 : + case Hexagon::STrib_GP_cNotPt_V4 : + case Hexagon::STrih_GP_cNotPt_V4 : + case Hexagon::STriw_GP_cNotPt_V4 : + case Hexagon::STd_GP_cNotPt_V4 : + case Hexagon::STb_GP_cNotPt_V4 : + case Hexagon::STh_GP_cNotPt_V4 : + case Hexagon::STw_GP_cNotPt_V4 : + case Hexagon::LDrid_GP_cdnNotPt_V4 : + case Hexagon::LDrib_GP_cdnNotPt_V4 : + case Hexagon::LDriub_GP_cdnNotPt_V4 : + case Hexagon::LDrih_GP_cdnNotPt_V4 : + case Hexagon::LDriuh_GP_cdnNotPt_V4 : + case Hexagon::LDriw_GP_cdnNotPt_V4 : + case Hexagon::LDd_GP_cdnNotPt_V4 : + case Hexagon::LDb_GP_cdnNotPt_V4 : + case Hexagon::LDub_GP_cdnNotPt_V4 : + case Hexagon::LDh_GP_cdnNotPt_V4 : + case Hexagon::LDuh_GP_cdnNotPt_V4 : + case Hexagon::LDw_GP_cdnNotPt_V4 : + case Hexagon::STrid_GP_cdnNotPt_V4 : + case Hexagon::STrib_GP_cdnNotPt_V4 : + case Hexagon::STrih_GP_cdnNotPt_V4 : + case Hexagon::STriw_GP_cdnNotPt_V4 : + case Hexagon::STd_GP_cdnNotPt_V4 : + case Hexagon::STb_GP_cdnNotPt_V4 : + case Hexagon::STh_GP_cdnNotPt_V4 : + case Hexagon::STw_GP_cdnNotPt_V4 : + return false; + } + // return *some value* to avoid compiler warning + return false; +} + +bool HexagonPacketizerList::isDotNewInst(MachineInstr* MI) { + if (isNewValueInst(MI)) + return true; + + switch (MI->getOpcode()) { + case Hexagon::TFR_cdnNotPt: + case Hexagon::TFR_cdnPt: + case Hexagon::TFRI_cdnNotPt: + case Hexagon::TFRI_cdnPt: + case Hexagon::LDrid_cdnPt : + case Hexagon::LDrid_cdnNotPt : + case Hexagon::LDrid_indexed_cdnPt : + case Hexagon::LDrid_indexed_cdnNotPt : + case Hexagon::POST_LDrid_cdnPt_V4 : + case Hexagon::POST_LDrid_cdnNotPt_V4 : + case Hexagon::LDriw_cdnPt : + case Hexagon::LDriw_cdnNotPt : + case Hexagon::LDriw_indexed_cdnPt : + case Hexagon::LDriw_indexed_cdnNotPt : + case Hexagon::POST_LDriw_cdnPt_V4 : + case Hexagon::POST_LDriw_cdnNotPt_V4 : + case Hexagon::LDrih_cdnPt : + case Hexagon::LDrih_cdnNotPt : + case Hexagon::LDrih_indexed_cdnPt : + case 
Hexagon::LDrih_indexed_cdnNotPt : + case Hexagon::POST_LDrih_cdnPt_V4 : + case Hexagon::POST_LDrih_cdnNotPt_V4 : + case Hexagon::LDrib_cdnPt : + case Hexagon::LDrib_cdnNotPt : + case Hexagon::LDrib_indexed_cdnPt : + case Hexagon::LDrib_indexed_cdnNotPt : + case Hexagon::POST_LDrib_cdnPt_V4 : + case Hexagon::POST_LDrib_cdnNotPt_V4 : + case Hexagon::LDriuh_cdnPt : + case Hexagon::LDriuh_cdnNotPt : + case Hexagon::LDriuh_indexed_cdnPt : + case Hexagon::LDriuh_indexed_cdnNotPt : + case Hexagon::POST_LDriuh_cdnPt_V4 : + case Hexagon::POST_LDriuh_cdnNotPt_V4 : + case Hexagon::LDriub_cdnPt : + case Hexagon::LDriub_cdnNotPt : + case Hexagon::LDriub_indexed_cdnPt : + case Hexagon::LDriub_indexed_cdnNotPt : + case Hexagon::POST_LDriub_cdnPt_V4 : + case Hexagon::POST_LDriub_cdnNotPt_V4 : + + case Hexagon::LDrid_indexed_cdnPt_V4 : + case Hexagon::LDrid_indexed_cdnNotPt_V4 : + case Hexagon::LDrid_indexed_shl_cdnPt_V4 : + case Hexagon::LDrid_indexed_shl_cdnNotPt_V4 : + case Hexagon::LDrib_indexed_cdnPt_V4 : + case Hexagon::LDrib_indexed_cdnNotPt_V4 : + case Hexagon::LDrib_indexed_shl_cdnPt_V4 : + case Hexagon::LDrib_indexed_shl_cdnNotPt_V4 : + case Hexagon::LDriub_indexed_cdnPt_V4 : + case Hexagon::LDriub_indexed_cdnNotPt_V4 : + case Hexagon::LDriub_indexed_shl_cdnPt_V4 : + case Hexagon::LDriub_indexed_shl_cdnNotPt_V4 : + case Hexagon::LDrih_indexed_cdnPt_V4 : + case Hexagon::LDrih_indexed_cdnNotPt_V4 : + case Hexagon::LDrih_indexed_shl_cdnPt_V4 : + case Hexagon::LDrih_indexed_shl_cdnNotPt_V4 : + case Hexagon::LDriuh_indexed_cdnPt_V4 : + case Hexagon::LDriuh_indexed_cdnNotPt_V4 : + case Hexagon::LDriuh_indexed_shl_cdnPt_V4 : + case Hexagon::LDriuh_indexed_shl_cdnNotPt_V4 : + case Hexagon::LDriw_indexed_cdnPt_V4 : + case Hexagon::LDriw_indexed_cdnNotPt_V4 : + case Hexagon::LDriw_indexed_shl_cdnPt_V4 : + case Hexagon::LDriw_indexed_shl_cdnNotPt_V4 : + +// Conditional add + case Hexagon::ADD_ri_cdnPt: + case Hexagon::ADD_ri_cdnNotPt: + case Hexagon::ADD_rr_cdnPt: + case Hexagon::ADD_rr_cdnNotPt: + + // Conditional logical operations + case Hexagon::XOR_rr_cdnPt : + case Hexagon::XOR_rr_cdnNotPt : + case Hexagon::AND_rr_cdnPt : + case Hexagon::AND_rr_cdnNotPt : + case Hexagon::OR_rr_cdnPt : + case Hexagon::OR_rr_cdnNotPt : + + // Conditional subtract + case Hexagon::SUB_rr_cdnPt : + case Hexagon::SUB_rr_cdnNotPt : + + // Conditional combine + case Hexagon::COMBINE_rr_cdnPt : + case Hexagon::COMBINE_rr_cdnNotPt : + + // Conditional shift operations + case Hexagon::ASLH_cdnPt_V4: + case Hexagon::ASLH_cdnNotPt_V4: + case Hexagon::ASRH_cdnPt_V4: + case Hexagon::ASRH_cdnNotPt_V4: + case Hexagon::SXTB_cdnPt_V4: + case Hexagon::SXTB_cdnNotPt_V4: + case Hexagon::SXTH_cdnPt_V4: + case Hexagon::SXTH_cdnNotPt_V4: + case Hexagon::ZXTB_cdnPt_V4: + case Hexagon::ZXTB_cdnNotPt_V4: + case Hexagon::ZXTH_cdnPt_V4: + case Hexagon::ZXTH_cdnNotPt_V4: + + // Conditional stores + case Hexagon::STrib_imm_cdnPt_V4 : + case Hexagon::STrib_imm_cdnNotPt_V4 : + case Hexagon::STrib_cdnPt_V4 : + case Hexagon::STrib_cdnNotPt_V4 : + case Hexagon::STrib_indexed_cdnPt_V4 : + case Hexagon::STrib_indexed_cdnNotPt_V4 : + case Hexagon::POST_STbri_cdnPt_V4 : + case Hexagon::POST_STbri_cdnNotPt_V4 : + case Hexagon::STrib_indexed_shl_cdnPt_V4 : + case Hexagon::STrib_indexed_shl_cdnNotPt_V4 : + + // Store doubleword conditionally + case Hexagon::STrid_indexed_cdnPt_V4 : + case Hexagon::STrid_indexed_cdnNotPt_V4 : + case Hexagon::STrid_indexed_shl_cdnPt_V4 : + case Hexagon::STrid_indexed_shl_cdnNotPt_V4 : + case Hexagon::POST_STdri_cdnPt_V4 : + case 
Hexagon::POST_STdri_cdnNotPt_V4 : + + // Store halfword conditionally + case Hexagon::STrih_cdnPt_V4 : + case Hexagon::STrih_cdnNotPt_V4 : + case Hexagon::STrih_indexed_cdnPt_V4 : + case Hexagon::STrih_indexed_cdnNotPt_V4 : + case Hexagon::STrih_imm_cdnPt_V4 : + case Hexagon::STrih_imm_cdnNotPt_V4 : + case Hexagon::STrih_indexed_shl_cdnPt_V4 : + case Hexagon::STrih_indexed_shl_cdnNotPt_V4 : + case Hexagon::POST_SThri_cdnPt_V4 : + case Hexagon::POST_SThri_cdnNotPt_V4 : + + // Store word conditionally + case Hexagon::STriw_cdnPt_V4 : + case Hexagon::STriw_cdnNotPt_V4 : + case Hexagon::STriw_indexed_cdnPt_V4 : + case Hexagon::STriw_indexed_cdnNotPt_V4 : + case Hexagon::STriw_imm_cdnPt_V4 : + case Hexagon::STriw_imm_cdnNotPt_V4 : + case Hexagon::STriw_indexed_shl_cdnPt_V4 : + case Hexagon::STriw_indexed_shl_cdnNotPt_V4 : + case Hexagon::POST_STwri_cdnPt_V4 : + case Hexagon::POST_STwri_cdnNotPt_V4 : + + case Hexagon::LDd_GP_cdnPt_V4: + case Hexagon::LDd_GP_cdnNotPt_V4: + case Hexagon::LDb_GP_cdnPt_V4: + case Hexagon::LDb_GP_cdnNotPt_V4: + case Hexagon::LDub_GP_cdnPt_V4: + case Hexagon::LDub_GP_cdnNotPt_V4: + case Hexagon::LDh_GP_cdnPt_V4: + case Hexagon::LDh_GP_cdnNotPt_V4: + case Hexagon::LDuh_GP_cdnPt_V4: + case Hexagon::LDuh_GP_cdnNotPt_V4: + case Hexagon::LDw_GP_cdnPt_V4: + case Hexagon::LDw_GP_cdnNotPt_V4: + case Hexagon::LDrid_GP_cdnPt_V4: + case Hexagon::LDrid_GP_cdnNotPt_V4: + case Hexagon::LDrib_GP_cdnPt_V4: + case Hexagon::LDrib_GP_cdnNotPt_V4: + case Hexagon::LDriub_GP_cdnPt_V4: + case Hexagon::LDriub_GP_cdnNotPt_V4: + case Hexagon::LDrih_GP_cdnPt_V4: + case Hexagon::LDrih_GP_cdnNotPt_V4: + case Hexagon::LDriuh_GP_cdnPt_V4: + case Hexagon::LDriuh_GP_cdnNotPt_V4: + case Hexagon::LDriw_GP_cdnPt_V4: + case Hexagon::LDriw_GP_cdnNotPt_V4: + + case Hexagon::STrid_GP_cdnPt_V4: + case Hexagon::STrid_GP_cdnNotPt_V4: + case Hexagon::STrib_GP_cdnPt_V4: + case Hexagon::STrib_GP_cdnNotPt_V4: + case Hexagon::STrih_GP_cdnPt_V4: + case Hexagon::STrih_GP_cdnNotPt_V4: + case Hexagon::STriw_GP_cdnPt_V4: + case Hexagon::STriw_GP_cdnNotPt_V4: + case Hexagon::STd_GP_cdnPt_V4: + case Hexagon::STd_GP_cdnNotPt_V4: + case Hexagon::STb_GP_cdnPt_V4: + case Hexagon::STb_GP_cdnNotPt_V4: + case Hexagon::STh_GP_cdnPt_V4: + case Hexagon::STh_GP_cdnNotPt_V4: + case Hexagon::STw_GP_cdnPt_V4: + case Hexagon::STw_GP_cdnNotPt_V4: + return true; + } + return false; +} + +static MachineOperand& GetPostIncrementOperand(MachineInstr *MI, + const HexagonInstrInfo *QII) { + assert(QII->isPostIncrement(MI) && "Not a post increment operation."); +#ifndef NDEBUG + // Post-increment means duplicates. Use a DenseMap to find duplicates in the + // list. Caution: DenseMap initializes with a minimum of 64 buckets, + // whereas there are at most 5 operands in a post-increment instruction. + DenseMap<unsigned, unsigned> DefRegsSet; + for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) + if (MI->getOperand(opNum).isReg() && + MI->getOperand(opNum).isDef()) { + DefRegsSet[MI->getOperand(opNum).getReg()] = 1; + } + + for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) + if (MI->getOperand(opNum).isReg() && + MI->getOperand(opNum).isUse()) { + if (DefRegsSet[MI->getOperand(opNum).getReg()]) { + return MI->getOperand(opNum); + } + } +#else + if (MI->getDesc().mayLoad()) { + // The 2nd operand is always the post-increment operand in a load. 
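// Illustrative operand layouts (an editorial assumption, matching the asserts
// below):
//   post-increment load:  %r0, %r2 = POST_LDriw %r2, #4
//                         -> operand 1 (%r2) is the write-back register
//   post-increment store: %r2 = POST_STwri %r2, #4, %r1
//                         -> operand 0 (%r2) is the write-back register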
+ assert(MI->getOperand(1).isReg() && + "Post increment operand has to be a register."); + return (MI->getOperand(1)); + } + if (MI->getDesc().mayStore()) { + // The 1st operand is always the post-increment operand in a store. + assert(MI->getOperand(0).isReg() && + "Post increment operand has to be a register."); + return (MI->getOperand(0)); + } +#endif + // We should never get here. + llvm_unreachable("mayLoad or mayStore not set for Post Increment operation"); +} + +// Get the value being stored. +static MachineOperand& GetStoreValueOperand(MachineInstr *MI) { + // The value being stored is always the last operand. + return (MI->getOperand(MI->getNumOperands()-1)); +} + +// Can this be a new-value store? +// The following restrictions must be respected when converting a store into +// a new-value store. +// 1. If an instruction uses auto-increment, its address register cannot +// be a new-value register. Arch Spec 5.4.2.1 +// 2. If an instruction uses absolute-set addressing mode, +// its address register cannot be a new-value register. +// Arch Spec 5.4.2.1. TODO: This is not enforced yet, as +// absolute-set addressing mode patterns are not implemented. +// 3. If an instruction produces a 64-bit result, its registers cannot be used +// as new-value registers. Arch Spec 5.4.2.2. +// 4. If the instruction that sets a new-value register is conditional, then +// the instruction that uses the new-value register must also be conditional, +// and both must always have their predicates evaluate identically. +// Arch Spec 5.4.2.3. +// 5. There is an implied restriction that a packet cannot have another store +// if it contains a new-value store. Corollary: if there is +// already a store in a packet, there cannot be a new-value store. +// Arch Spec: 3.4.4.2 +bool HexagonPacketizerList::CanPromoteToNewValueStore( MachineInstr *MI, + MachineInstr *PacketMI, unsigned DepReg, + std::map <MachineInstr*, SUnit*> MIToSUnit) +{ + // Make sure we are looking at a store. + if (!IsNewifyStore(MI)) + return false; + + // Make sure there is a dependency and that it can be new-value'd. + if (GetStoreValueOperand(MI).isReg() && + GetStoreValueOperand(MI).getReg() != DepReg) + return false; + + const HexagonRegisterInfo* QRI = + (const HexagonRegisterInfo *) TM.getRegisterInfo(); + const MCInstrDesc& MCID = PacketMI->getDesc(); + // The first operand is always the result. + + const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; + const TargetRegisterClass* PacketRC = QII->getRegClass(MCID, 0, QRI, MF); + + // If there is already a store in the packet, a new-value store cannot be + // added. Arch Spec 3.4.4.2. + for (std::vector<MachineInstr*>::iterator VI = CurrentPacketMIs.begin(), + VE = CurrentPacketMIs.end(); + (VI != VE); ++VI) { + SUnit* PacketSU = MIToSUnit[*VI]; + if (PacketSU->getInstr()->getDesc().mayStore() || + // if we have mayStore = 1 set on ALLOCFRAME and DEALLOCFRAME, + // then we don't need this + PacketSU->getInstr()->getOpcode() == Hexagon::ALLOCFRAME || + PacketSU->getInstr()->getOpcode() == Hexagon::DEALLOCFRAME) + return false; + } + + if (PacketRC == &Hexagon::DoubleRegsRegClass) { + // New-value store constraint: double regs cannot feed into a new-value + // store. Arch Spec section 5.4.2.2. + return false; + } + + // Make sure it's NOT the post-increment register that we are going to + // new-value. 
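// For example (editorial illustration): if DepReg is r2, then
//   memw(r2++#4) = r2.new
// is illegal, because the auto-incremented address register cannot also be
// the new-value register (restriction 1 above, Arch Spec 5.4.2.1).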
+ if (QII->isPostIncrement(MI) && + MI->getDesc().mayStore() && + GetPostIncrementOperand(MI, QII).getReg() == DepReg) { + return false; + } + + if (QII->isPostIncrement(PacketMI) && + PacketMI->getDesc().mayLoad() && + GetPostIncrementOperand(PacketMI, QII).getReg() == DepReg) { + // If the source uses post-increment or absolute-set addressing, + // it cannot feed into a new-value store: + // r3 = memw(r2++#4) + // memw(r30 + #-1404) = r2.new -> cannot be a new-value store + // Arch Spec section 5.4.2.1 + return false; + } + + // If the source that feeds the store is predicated, the new-value store must + // also be predicated. + if (QII->isPredicated(PacketMI)) { + if (!QII->isPredicated(MI)) + return false; + + // Check to make sure that both will have their predicates + // evaluate identically. + unsigned predRegNumSrc = 0; + unsigned predRegNumDst = 0; + const TargetRegisterClass* predRegClass = NULL; + + // Get the predicate register used in the source instruction. + for(unsigned opNum = 0; opNum < PacketMI->getNumOperands(); opNum++) { + if (PacketMI->getOperand(opNum).isReg()) { + predRegNumSrc = PacketMI->getOperand(opNum).getReg(); + predRegClass = QRI->getMinimalPhysRegClass(predRegNumSrc); + if (predRegClass == &Hexagon::PredRegsRegClass) + break; + } + } + assert ((predRegClass == &Hexagon::PredRegsRegClass ) && + ("predicate register not found in a predicated PacketMI instruction")); + + // Get the predicate register used in the new-value store instruction. + for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) { + if (MI->getOperand(opNum).isReg()) { + predRegNumDst = MI->getOperand(opNum).getReg(); + predRegClass = QRI->getMinimalPhysRegClass(predRegNumDst); + if (predRegClass == &Hexagon::PredRegsRegClass) + break; + } + } + assert ((predRegClass == &Hexagon::PredRegsRegClass ) && + ("predicate register not found in a predicated MI instruction")); + + // The new-value register producer and user (store) need to satisfy these + // constraints: + // 1) Both instructions should be predicated on the same register. + // 2) If the producer of the new-value register is .new predicated, the store + // should also be .new predicated, and if the producer is not .new + // predicated, the store should not be .new predicated. + // 3) Both the new-value register producer and user should have the same + // predicate sense, i.e., either both should be negated or neither should be. + + if (( predRegNumDst != predRegNumSrc) || + isDotNewInst(PacketMI) != isDotNewInst(MI) || + GetPredicateSense(MI, QII) != GetPredicateSense(PacketMI, QII)) { + return false; + } + } + + // Make sure that, other than the new-value register, no other register of the + // store instruction has been modified in the same packet. Predicate registers + // can be modified, but they should not be modified between the producer and + // the store instruction, as that would make them conditional on different + // values. We already know this to be true for all the instructions before and + // including PacketMI. However, we need to perform the check for the + // remaining instructions in the packet. + + std::vector<MachineInstr*>::iterator VI; + std::vector<MachineInstr*>::iterator VE; + unsigned StartCheck = 0; + + for (VI=CurrentPacketMIs.begin(), VE = CurrentPacketMIs.end(); + (VI != VE); ++VI) { + SUnit* TempSU = MIToSUnit[*VI]; + MachineInstr* TempMI = TempSU->getInstr(); + + // The following condition is true for all the instructions until PacketMI + // is reached (StartCheck is set to 0 before the for loop). 
+ // The StartCheck flag is 1 for all the instructions after PacketMI. + if (TempMI != PacketMI && !StartCheck) // start processing only after + continue; // encountering PacketMI + + StartCheck = 1; + if (TempMI == PacketMI) // We don't want to check PacketMI for dependence + continue; + + for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) { + if (MI->getOperand(opNum).isReg() && + TempSU->getInstr()->modifiesRegister(MI->getOperand(opNum).getReg(), + QRI)) + return false; + } + } + + // Make sure that for non-post-increment stores: + // 1. The only use of the register is DepReg and no other registers. + // This handles V4 base+index registers. + // The following store cannot be dot-new, e.g.: + // r0 = add(r0, #3) + // memw(r1+r0<<#2) = r0 + if (!QII->isPostIncrement(MI) && + GetStoreValueOperand(MI).isReg() && + GetStoreValueOperand(MI).getReg() == DepReg) { + for(unsigned opNum = 0; opNum < MI->getNumOperands()-1; opNum++) { + if (MI->getOperand(opNum).isReg() && + MI->getOperand(opNum).getReg() == DepReg) { + return false; + } + } + // 2. If the data definition comes from an implicit definition of the + // register, do not newify the store. E.g.: + // %R9<def> = ZXTH %R12, %D6<imp-use>, %R12<imp-def> + // STrih_indexed %R8, 2, %R12<kill>; mem:ST2[%scevgep343] + for(unsigned opNum = 0; opNum < PacketMI->getNumOperands(); opNum++) { + if (PacketMI->getOperand(opNum).isReg() && + PacketMI->getOperand(opNum).getReg() == DepReg && + PacketMI->getOperand(opNum).isDef() && + PacketMI->getOperand(opNum).isImplicit()) { + return false; + } + } + } + + // Can be a dot-new store. + return true; +} + +// Can this MI be promoted to either a +// new-value store or a new-value jump? +bool HexagonPacketizerList::CanPromoteToNewValue( MachineInstr *MI, + SUnit *PacketSU, unsigned DepReg, + std::map <MachineInstr*, SUnit*> MIToSUnit, + MachineBasicBlock::iterator &MII) +{ + + const HexagonRegisterInfo* QRI = + (const HexagonRegisterInfo *) TM.getRegisterInfo(); + if (!QRI->Subtarget.hasV4TOps() || + !IsNewifyStore(MI)) + return false; + + MachineInstr *PacketMI = PacketSU->getInstr(); + + // Check to see if the store can be new-value'd. + if (CanPromoteToNewValueStore(MI, PacketMI, DepReg, MIToSUnit)) + return true; + + // Whether the compare/jump can be new-value'd is checked in a + // pass of its own; no need to check it here. + return false; +} + +// Check to see if an instruction can be dot-new. + // There are three kinds: +// 1. dot-new on predicate - V2/V3/V4 +// 2. dot-new on stores NV/ST - V4 +// 3. dot-new on jump NV/J - V4 -- this is generated in a pass. +bool HexagonPacketizerList::CanPromoteToDotNew( MachineInstr *MI, + SUnit *PacketSU, unsigned DepReg, + std::map <MachineInstr*, SUnit*> MIToSUnit, + MachineBasicBlock::iterator &MII, + const TargetRegisterClass* RC ) +{ + // Already a dot-new instruction. + if (isDotNewInst(MI) && !IsNewifyStore(MI)) + return false; + + if (!isNewifiable(MI)) + return false; + + // predicate .new + if (RC == &Hexagon::PredRegsRegClass && isCondInst(MI)) + return true; + else if (RC != &Hexagon::PredRegsRegClass && + !IsNewifyStore(MI)) // MI is not a new-value store + return false; + else { + // Create a dot-new machine instruction to see if resources can be + // allocated. If not, bail out now. 
+    const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+    int NewOpcode = GetDotNewOp(MI->getOpcode());
+    const MCInstrDesc &desc = QII->get(NewOpcode);
+    DebugLoc dl;
+    MachineInstr *NewMI =
+                    MI->getParent()->getParent()->CreateMachineInstr(desc, dl);
+    bool ResourcesAvailable = ResourceTracker->canReserveResources(NewMI);
+    MI->getParent()->getParent()->DeleteMachineInstr(NewMI);
+
+    if (!ResourcesAvailable)
+      return false;
+
+    // New-value stores only; new-value jumps are generated in a separate
+    // pass.
+    if (!CanPromoteToNewValue(MI, PacketSU, DepReg, MIToSUnit, MII)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Go through the packet instructions and search for an anti-dependency
+// between them and DepReg from MI.
+// Consider this case:
+// Trying to add
+// a) %R1<def> = TFRI_cdNotPt %P3, 2
+// to this packet:
+// {
+//   b) %P0<def> = OR_pp %P3<kill>, %P0<kill>
+//   c) %P3<def> = TFR_PdRs %R23
+//   d) %R1<def> = TFRI_cdnPt %P3, 4
+// }
+// The P3 from a) and d) will be complements after
+// a)'s P3 is converted to .new form.
+// The anti-dependency between c) and b) is irrelevant for this case.
+bool HexagonPacketizerList::RestrictingDepExistInPacket (MachineInstr* MI,
+      unsigned DepReg,
+      std::map <MachineInstr*, SUnit*> MIToSUnit) {
+
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  SUnit* PacketSUDep = MIToSUnit[MI];
+
+  for (std::vector<MachineInstr*>::iterator VIN = CurrentPacketMIs.begin(),
+       VEN = CurrentPacketMIs.end(); (VIN != VEN); ++VIN) {
+
+    // We only care about dependencies on predicated instructions.
+    if (!QII->isPredicated(*VIN)) continue;
+
+    // Scheduling unit for the current insn in the packet.
+    SUnit* PacketSU = MIToSUnit[*VIN];
+
+    // Look at dependencies between the current members of the packet and
+    // the predicate-defining instruction MI. Make sure that the dependency
+    // is on the exact register we care about.
+    if (PacketSU->isSucc(PacketSUDep)) {
+      for (unsigned i = 0; i < PacketSU->Succs.size(); ++i) {
+        if ((PacketSU->Succs[i].getSUnit() == PacketSUDep) &&
+            (PacketSU->Succs[i].getKind() == SDep::Anti) &&
+            (PacketSU->Succs[i].getReg() == DepReg)) {
+          return true;
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
+
+// Given two predicated instructions, this function detects whether
+// the predicates are complements.
+bool HexagonPacketizerList::ArePredicatesComplements (MachineInstr* MI1,
+     MachineInstr* MI2, std::map <MachineInstr*, SUnit*> MIToSUnit) {
+
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  // Currently we can only reason about conditional transfers.
+  if (!QII->isConditionalTransfer(MI1) || !QII->isConditionalTransfer(MI2)) {
+    return false;
+  }
+
+  // Scheduling unit for the candidate.
+  SUnit* SU = MIToSUnit[MI1];
+
+  // One corner case deals with the following scenario:
+  // Trying to add
+  // a) %R24<def> = TFR_cPt %P0, %R25
+  // to this packet:
+  //
+  // {
+  //   b) %R25<def> = TFR_cNotPt %P0, %R24
+  //   c) %P0<def> = CMPEQri %R26, 1
+  // }
+  //
+  // On a general check a) and b) are complements, but the presence of c)
+  // will convert a) to .new form, and then it is no longer a complement.
+  // We attempt to detect this by analyzing the existing dependencies in
+  // the packet.
+
+  // Analyze relationships between all existing members of the packet.
+  // Look for an anti-dependency on the same predicate register as the one
+  // used in the candidate.
+  for (std::vector<MachineInstr*>::iterator VIN = CurrentPacketMIs.begin(),
+       VEN = CurrentPacketMIs.end(); (VIN != VEN); ++VIN) {
+
+    // Scheduling unit for the current insn in the packet.
+    SUnit* PacketSU = MIToSUnit[*VIN];
+
+    // If this instruction in the packet is succeeded by the candidate...
+    if (PacketSU->isSucc(SU)) {
+      for (unsigned i = 0; i < PacketSU->Succs.size(); ++i) {
+        // The corner case exists when there is a true data dependency
+        // between the candidate and one of the current packet members,
+        // this dependency is on a predicate register, and there already
+        // exists an anti-dependency on the same predicate in the packet.
+        if (PacketSU->Succs[i].getSUnit() == SU &&
+            Hexagon::PredRegsRegClass.contains(
+              PacketSU->Succs[i].getReg()) &&
+            PacketSU->Succs[i].getKind() == SDep::Data &&
+            // At this point we know that *VIN is a predicate-setting
+            // instruction with a true data dependency to the candidate on
+            // the register we care about - c) in the above example.
+            // Now we need to see if there is an anti-dependency from c) to
+            // any other instruction in the same packet on the predicate
+            // register of interest.
+            RestrictingDepExistInPacket(*VIN, PacketSU->Succs[i].getReg(),
+                                        MIToSUnit)) {
+          return false;
+        }
+      }
+    }
+  }
+
+  // If the above case does not apply, check the regular complement
+  // condition: the predicate register must be the same and the predicate
+  // sense must be different.
+  // We also need to differentiate .old vs. .new:
+  // !p0 is not complementary to p0.new.
+  return ((MI1->getOperand(1).getReg() == MI2->getOperand(1).getReg()) &&
+          (GetPredicateSense(MI1, QII) != GetPredicateSense(MI2, QII)) &&
+          (isDotNewInst(MI1) == isDotNewInst(MI2)));
+}
+
+// initPacketizerState - Initialize packetizer flags.
+void HexagonPacketizerList::initPacketizerState() {
+
+  Dependence = false;
+  PromotedToDotNew = false;
+  GlueToNewValueJump = false;
+  GlueAllocframeStore = false;
+  FoundSequentialDependence = false;
+
+  return;
+}
+
+// ignorePseudoInstruction - Ignore bundling of pseudo instructions.
+bool HexagonPacketizerList::ignorePseudoInstruction(MachineInstr *MI,
+                                                    MachineBasicBlock *MBB) {
+  if (MI->isDebugValue())
+    return true;
+
+  // We must print out inline assembly.
+  if (MI->isInlineAsm())
+    return false;
+
+  // We check if MI has any functional units mapped to it.
+  // If it doesn't, we ignore the instruction.
+  const MCInstrDesc& TID = MI->getDesc();
+  unsigned SchedClass = TID.getSchedClass();
+  const InstrStage* IS =
+                    ResourceTracker->getInstrItins()->beginStage(SchedClass);
+  unsigned FuncUnits = IS->getUnits();
+  return !FuncUnits;
+}
+
+// isSoloInstruction - Returns true for instructions that must be
+// scheduled in their own packet.
+bool HexagonPacketizerList::isSoloInstruction(MachineInstr *MI) {
+
+  if (MI->isInlineAsm())
+    return true;
+
+  if (MI->isEHLabel())
+    return true;
+
+  // From the Hexagon V4 Programmer's Reference Manual, 3.4.4 Grouping
+  // constraints: trap, pause, barrier, icinva, isync, and syncht are solo
+  // instructions. They must not be grouped with other instructions in a
+  // packet.
+  if (IsSchedBarrier(MI))
+    return true;
+
+  return false;
+}
+
+// isLegalToPacketizeTogether:
+// SUI is the current instruction that is outside of the current packet.
+// SUJ is the current instruction inside the current packet against which
+// SUI will be packetized.
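+// For example (illustrative, not taken from the arch spec): with
+//   r2 = memw(r3+#0)
+// already inside the packet (SUJ) and the candidate (SUI) being
+//   memw(r4+#0) = r2
+// the true dependence on r2 only permits packetizing the pair if the store
+// can be promoted to the new-value form "memw(r4+#0) = r2.new".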
+bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
+  MachineInstr *I = SUI->getInstr();
+  MachineInstr *J = SUJ->getInstr();
+  assert(I && J && "Unable to packetize null instruction!");
+
+  const MCInstrDesc &MCIDI = I->getDesc();
+  const MCInstrDesc &MCIDJ = J->getDesc();
+
+  MachineBasicBlock::iterator II = I;
+
+  const unsigned FrameSize = MF.getFrameInfo()->getStackSize();
+  const HexagonRegisterInfo* QRI =
+                      (const HexagonRegisterInfo *) TM.getRegisterInfo();
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+
+  // Inline asm cannot go in the packet.
+  if (I->getOpcode() == Hexagon::INLINEASM)
+    llvm_unreachable("Should not meet inline asm here!");
+
+  if (isSoloInstruction(I))
+    llvm_unreachable("Should not meet solo instr here!");
+
+  // A call that saves the callee-saved registers can only be in a packet
+  // with instructions that don't write to the callee-saved registers.
+  if ((QII->isSaveCalleeSavedRegsCall(I) &&
+       DoesModifyCalleeSavedReg(J, QRI)) ||
+      (QII->isSaveCalleeSavedRegsCall(J) &&
+       DoesModifyCalleeSavedReg(I, QRI))) {
+    Dependence = true;
+    return false;
+  }
+
+  // Two control flow instructions cannot go in the same packet.
+  if (IsControlFlow(I) && IsControlFlow(J)) {
+    Dependence = true;
+    return false;
+  }
+
+  // A LoopN instruction cannot appear in the same packet as a jump or call.
+  if (IsLoopN(I) && (   IsDirectJump(J)
+                     || MCIDJ.isCall()
+                     || QII->isDeallocRet(J))) {
+    Dependence = true;
+    return false;
+  }
+  if (IsLoopN(J) && (   IsDirectJump(I)
+                     || MCIDI.isCall()
+                     || QII->isDeallocRet(I))) {
+    Dependence = true;
+    return false;
+  }
+
+  // dealloc_return cannot appear in the same packet as a conditional or
+  // unconditional jump.
+  if (QII->isDeallocRet(I) && (   MCIDJ.isBranch()
+                               || MCIDJ.isCall()
+                               || MCIDJ.isBarrier())) {
+    Dependence = true;
+    return false;
+  }
+
+
+  // V4 allows dual stores, but it does not allow a second store if the
+  // first store is not in SLOT0. New-value stores, new-value jumps,
+  // dealloc_return and memops always take SLOT0.
+  // Arch spec 3.4.4.2
+  if (QRI->Subtarget.hasV4TOps()) {
+
+    if (MCIDI.mayStore() && MCIDJ.mayStore() && isNewValueInst(J)) {
+      Dependence = true;
+      return false;
+    }
+
+    if (   (QII->isMemOp(J) && MCIDI.mayStore())
+        || (MCIDJ.mayStore() && QII->isMemOp(I))
+        || (QII->isMemOp(J) && QII->isMemOp(I))) {
+      Dependence = true;
+      return false;
+    }
+
+    // A dealloc_return cannot be packetized with a store.
+    if (MCIDJ.mayStore() && QII->isDeallocRet(I)) {
+      Dependence = true;
+      return false;
+    }
+
+    // If an instruction feeds a new-value jump, glue them together.
+    MachineBasicBlock::iterator NextMII = I;
+    ++NextMII;
+    MachineInstr *NextMI = NextMII;
+
+    if (QII->isNewValueJump(NextMI)) {
+
+      bool secondRegMatch = false;
+      bool maintainNewValueJump = false;
+
+      if (NextMI->getOperand(1).isReg() &&
+          I->getOperand(0).getReg() == NextMI->getOperand(1).getReg()) {
+        secondRegMatch = true;
+        maintainNewValueJump = true;
+      }
+
+      if (!secondRegMatch &&
+          I->getOperand(0).getReg() == NextMI->getOperand(0).getReg()) {
+        maintainNewValueJump = true;
+      }
+
+      for (std::vector<MachineInstr*>::iterator
+           VI = CurrentPacketMIs.begin(),
+           VE = CurrentPacketMIs.end();
+           (VI != VE && maintainNewValueJump); ++VI) {
+        SUnit* PacketSU = MIToSUnit[*VI];
+
+        // An NVJ cannot be part of a dual jump - Arch Spec: section 7.8.
+        if (PacketSU->getInstr()->getDesc().isCall()) {
+          Dependence = true;
+          break;
+        }
+        // Validate:
+        // 1. The packet does not have a store in it.
+        // 2. If the first operand of the nvj is newified, and the second
+        //    operand is also a reg, the second reg is not defined in the
+        //    same packet.
+        // 3. If the second operand of the nvj is newified (which means the
+        //    first operand is also a reg), the first reg is not defined in
+        //    the same packet.
+        if (PacketSU->getInstr()->getDesc().mayStore() ||
+            PacketSU->getInstr()->getOpcode() == Hexagon::ALLOCFRAME ||
+            // Check #2.
+            (!secondRegMatch && NextMI->getOperand(1).isReg() &&
+             PacketSU->getInstr()->modifiesRegister(
+                               NextMI->getOperand(1).getReg(), QRI)) ||
+            // Check #3.
+            (secondRegMatch &&
+             PacketSU->getInstr()->modifiesRegister(
+                               NextMI->getOperand(0).getReg(), QRI))) {
+          Dependence = true;
+          break;
+        }
+      }
+      if (!Dependence)
+        GlueToNewValueJump = true;
+      else
+        return false;
+    }
+  }
+
+  if (SUJ->isSucc(SUI)) {
+    for (unsigned i = 0;
+         (i < SUJ->Succs.size()) && !FoundSequentialDependence;
+         ++i) {
+
+      if (SUJ->Succs[i].getSUnit() != SUI) {
+        continue;
+      }
+
+      SDep::Kind DepType = SUJ->Succs[i].getKind();
+
+      // For direct calls:
+      // Ignore register dependences for call instructions for
+      // packetization purposes except for those due to r31 and
+      // predicate registers.
+      //
+      // For indirect calls:
+      // Same as direct calls + check for true dependences to the register
+      // used in the indirect call.
+      //
+      // We completely ignore Order dependences for call instructions.
+      //
+      // For returns:
+      // Ignore register dependences for return instructions like jumpr,
+      // dealloc return unless we have dependencies on the explicit uses
+      // of the registers used by jumpr (like r31) or dealloc return
+      // (like r29 or r30).
+      //
+      // TODO: Currently, jumpr only handles returns through r31, so the
+      // following logic (specifically IsCallDependent) works fine. Once
+      // jumpr is enabled for registers other than r31, the last part of
+      // that (IsCallDependent) function, where it handles indirect calls,
+      // needs to be reworked. Bug 6216 is opened for this.
+      //
+      unsigned DepReg = 0;
+      const TargetRegisterClass* RC = NULL;
+      if (DepType == SDep::Data) {
+        DepReg = SUJ->Succs[i].getReg();
+        RC = QRI->getMinimalPhysRegClass(DepReg);
+      }
+      if ((MCIDI.isCall() || MCIDI.isReturn()) &&
+          (!IsRegDependence(DepType) ||
+           !IsCallDependent(I, DepType, SUJ->Succs[i].getReg()))) {
+        /* do nothing */
+      }
+
+      // For instructions that can be promoted to dot-new, try to promote.
+      else if ((DepType == SDep::Data) &&
+               CanPromoteToDotNew(I, SUJ, DepReg, MIToSUnit, II, RC) &&
+               PromoteToDotNew(I, DepType, II, RC)) {
+        PromotedToDotNew = true;
+        /* do nothing */
+      }
+
+      else if ((DepType == SDep::Data) &&
+               (QII->isNewValueJump(I))) {
+        /* do nothing */
+      }
+
+      // For predicated instructions, if the predicates are complements,
+      // then there can be no dependence.
+      else if (QII->isPredicated(I) &&
+               QII->isPredicated(J) &&
+               ArePredicatesComplements(I, J, MIToSUnit)) {
+        /* do nothing */
+
+      }
+      else if (IsDirectJump(I) &&
+               !MCIDJ.isBranch() &&
+               !MCIDJ.isCall() &&
+               (DepType == SDep::Order)) {
+        // Ignore Order dependences between unconditional direct branches
+        // and non-control-flow instructions.
+        /* do nothing */
+      }
+      else if (MCIDI.isConditionalBranch() && (DepType != SDep::Data) &&
+               (DepType != SDep::Output)) {
+        // Ignore all dependences for jumps except for true and output
+        // dependences.
+        /* do nothing */
+      }
+
+      // Ignore output dependences due to superregs.
+      // We can write to two different subregisters of R1:0, for instance,
+      // in the same cycle.
+      //
+      // If neither I nor J defines DepReg, then this is a superfluous
+      // output dependence. The dependence must be of the form:
+      //   R0 = ...
+      //   R1 = ...
+      // and there is an output dependence between the two instructions
+      // with
+      //   DepReg = D0.
+      // We want to ignore these dependences.
+      // Ideally, the dependence constructor should annotate such
+      // dependences. We could then avoid this relatively expensive check.
+      //
+      else if (DepType == SDep::Output) {
+        // DepReg is the register that's responsible for the dependence.
+        unsigned DepReg = SUJ->Succs[i].getReg();
+
+        // Check if I and J really define DepReg.
+        if (I->definesRegister(DepReg) ||
+            J->definesRegister(DepReg)) {
+          FoundSequentialDependence = true;
+          break;
+        }
+      }
+
+      // We ignore Order dependences for
+      // 1. Two loads, unless they are volatile.
+      // 2. Two stores in V4, unless they are volatile.
+      else if ((DepType == SDep::Order) &&
+               !I->hasVolatileMemoryRef() &&
+               !J->hasVolatileMemoryRef()) {
+        if (QRI->Subtarget.hasV4TOps() &&
+            // hexagonv4 allows dual store.
+            MCIDI.mayStore() && MCIDJ.mayStore()) {
+          /* do nothing */
+        }
+        // store followed by store -- not OK on V2
+        // store followed by load  -- not OK on all (OK if addresses
+        // are not aliased)
+        // load followed by store  -- OK on all
+        // load followed by load   -- OK on all
+        else if ( !MCIDJ.mayStore()) {
+          /* do nothing */
+        }
+        else {
+          FoundSequentialDependence = true;
+          break;
+        }
+      }
+
+      // For V4, special-case ALLOCFRAME. Even though there is a dependency
+      // between ALLOCFRAME and the subsequent store, allow them to be
+      // packetized in the same packet. This implies that the store is using
+      // the caller's SP. Hence, the offset needs to be updated accordingly.
+      else if (DepType == SDep::Data
+               && QRI->Subtarget.hasV4TOps()
+               && J->getOpcode() == Hexagon::ALLOCFRAME
+               && (I->getOpcode() == Hexagon::STrid
+                   || I->getOpcode() == Hexagon::STriw
+                   || I->getOpcode() == Hexagon::STrib)
+               && I->getOperand(0).getReg() == QRI->getStackRegister()
+               && QII->isValidOffset(I->getOpcode(),
+                                     I->getOperand(1).getImm() -
+                                     (FrameSize + HEXAGON_LRFP_SIZE)))
+      {
+        GlueAllocframeStore = true;
+        // Since this store is to be glued with allocframe in the same
+        // packet, it will use the SP of the previous stack frame, i.e.,
+        // the caller's SP. Therefore, we need to recalculate the offset
+        // according to this change.
+        I->getOperand(1).setImm(I->getOperand(1).getImm() -
+                                (FrameSize + HEXAGON_LRFP_SIZE));
+      }
+
+      //
+      // Skip over anti-dependences. Two instructions that are
+      // anti-dependent can share a packet.
+      //
+      else if (DepType != SDep::Anti) {
+        FoundSequentialDependence = true;
+        break;
+      }
+    }
+
+    if (FoundSequentialDependence) {
+      Dependence = true;
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// isLegalToPruneDependencies
+bool HexagonPacketizerList::isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) {
+  MachineInstr *I = SUI->getInstr();
+  assert(I && SUJ->getInstr() && "Unable to packetize null instruction!");
+
+  const unsigned FrameSize = MF.getFrameInfo()->getStackSize();
+
+  if (Dependence) {
+
+    // Check if the instruction was promoted to a dot-new. If so, demote it
+    // back into a dot-old.
+    if (PromotedToDotNew) {
+      DemoteToDotOld(I);
+    }
+
+    // Check if the instruction (it must be a store) was glued with an
+    // allocframe instruction. If so, restore its offset to its original
+    // value, i.e., use the current SP instead of the caller's SP.
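+    // (Illustrative arithmetic: if the store's original offset was #8 and
+    //  FrameSize + HEXAGON_LRFP_SIZE == 24, packetization rewrote the
+    //  offset to #-16; adding the 24 back below restores #8.)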
+    if (GlueAllocframeStore) {
+      I->getOperand(1).setImm(I->getOperand(1).getImm() +
+                              FrameSize + HEXAGON_LRFP_SIZE);
+    }
+
+    return false;
+  }
+  return true;
+}
+
+MachineBasicBlock::iterator
+HexagonPacketizerList::addToPacket(MachineInstr *MI) {
+
+  MachineBasicBlock::iterator MII = MI;
+  MachineBasicBlock *MBB = MI->getParent();
+
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+
+  if (GlueToNewValueJump) {
+
+    ++MII;
+    MachineInstr *nvjMI = MII;
+    assert(ResourceTracker->canReserveResources(MI));
+    ResourceTracker->reserveResources(MI);
+    if (QII->isExtended(MI) &&
+        !tryAllocateResourcesForConstExt(MI)) {
+      endPacket(MBB, MI);
+      ResourceTracker->reserveResources(MI);
+      assert(canReserveResourcesForConstExt(MI) &&
+             "Ensure that there is a slot");
+      reserveResourcesForConstExt(MI);
+      // Reserve resources for the new-value jump constant extender.
+      assert(canReserveResourcesForConstExt(MI) &&
+             "Ensure that there is a slot");
+      reserveResourcesForConstExt(nvjMI);
+      assert(ResourceTracker->canReserveResources(nvjMI) &&
+             "Ensure that there is a slot");
+
+    } else if (   // An extended instruction takes two slots in the packet.
+                  // Try to reserve and allocate 4 bytes in the current
+                  // packet first.
+               (QII->isExtended(nvjMI)
+                && (!tryAllocateResourcesForConstExt(nvjMI)
+                    || !ResourceTracker->canReserveResources(nvjMI)))
+               || // For a non-extended instruction, there is no need to
+                  // allocate an extra 4 bytes.
+               (!QII->isExtended(nvjMI) &&
+                !ResourceTracker->canReserveResources(nvjMI)))
+    {
+      endPacket(MBB, MI);
+      // A new and empty packet starts.
+      // We are sure that the resource requirements can be satisfied.
+      // Therefore, we do not need to call "canReserveResources" anymore.
+      ResourceTracker->reserveResources(MI);
+      if (QII->isExtended(nvjMI))
+        reserveResourcesForConstExt(nvjMI);
+    }
+    // Here, we are sure that "reserveResources" will succeed.
+    ResourceTracker->reserveResources(nvjMI);
+    CurrentPacketMIs.push_back(MI);
+    CurrentPacketMIs.push_back(nvjMI);
+  } else {
+    if (   QII->isExtended(MI)
+        && (   !tryAllocateResourcesForConstExt(MI)
+            || !ResourceTracker->canReserveResources(MI)))
+    {
+      endPacket(MBB, MI);
+      // Check if the instruction was promoted to a dot-new. If so, demote
+      // it back into a dot-old.
+      if (PromotedToDotNew) {
+        DemoteToDotOld(MI);
+      }
+      reserveResourcesForConstExt(MI);
+    }
+    // In case "MI" is not an extended instruction, the resource
+    // availability has already been checked.
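+    // (Illustrative, following the "extended instruction takes two slots"
+    //  note above: an insn such as "r0 = memw(r1+##0x12345678)" occupies
+    //  one slot itself plus one for its immediate extender, so only two of
+    //  a packet's four slots remain for other instructions.)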
+    ResourceTracker->reserveResources(MI);
+    CurrentPacketMIs.push_back(MI);
+  }
+  return MII;
+}
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+FunctionPass *llvm::createHexagonPacketizer() {
+  return new HexagonPacketizer();
+}
+
diff --git a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
index ef36881..035afe8 100644
--- a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
+++ b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
@@ -15,6 +15,7 @@
 #include "Hexagon.h"
 #include "HexagonAsmPrinter.h"
 #include "HexagonInstPrinter.h"
+#include "HexagonMCInst.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
@@ -37,20 +38,50 @@ StringRef HexagonInstPrinter::getRegName(unsigned RegNo) const {
 void HexagonInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
                                    StringRef Annot) {
+  printInst((const HexagonMCInst*)(MI), O, Annot);
+}
+
+void HexagonInstPrinter::printInst(const HexagonMCInst *MI, raw_ostream &O,
+                                   StringRef Annot) {
   const char packetPadding[] = "      ";
   const char startPacket = '{', endPacket = '}';
   // TODO: add outer HW loop when it's supported too.
   if (MI->getOpcode() == Hexagon::ENDLOOP0) {
-    MCInst Nop;
+    // Ending a hardware loop is different from ending a regular packet.
+    assert(MI->isEndPacket() && "Loop end must also end the packet");
+
+    if (MI->isStartPacket()) {
+      // There must be a packet to end a loop.
+      // FIXME: when shuffling is always run, this shouldn't be needed.
+      HexagonMCInst Nop;
+      StringRef NoAnnot;
 
-    O << packetPadding << startPacket << '\n';
-    Nop.setOpcode(Hexagon::NOP);
-    printInstruction(&Nop, O);
-    O << packetPadding << endPacket;
+      Nop.setOpcode (Hexagon::NOP);
+      Nop.setStartPacket (MI->isStartPacket());
+      printInst (&Nop, O, NoAnnot);
+    }
+
+    // Close the packet.
+    if (MI->isEndPacket())
+      O << packetPadding << endPacket;
+
+    printInstruction(MI, O);
+  }
+  else {
+    // Prefix the insn opening the packet.
+    if (MI->isStartPacket())
+      O << packetPadding << startPacket << '\n';
+
+    printInstruction(MI, O);
+
+    // Suffix the insn closing the packet.
+    if (MI->isEndPacket())
+      // Always suffix the packet with the closing brace on a new line,
+      // since the GNU assembler has issues with a closing brace on the
+      // same line as CONST{32,64}.
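+      // (Illustrative rendering: the packet is printed as
+      //    {
+      //      r0 = CONST32(#value)
+      //    }
+      //  with the brace on its own line, never appended to the CONST32 or
+      //  CONST64 expansion itself.)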
+ O << '\n' << packetPadding << endPacket; } - printInstruction(MI, O); printAnnotation(O, Annot); } @@ -65,22 +96,22 @@ void HexagonInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, } else if(MO.isImm()) { printImmOperand(MI, OpNo, O); } else { - assert(false && "Unknown operand"); + llvm_unreachable("Unknown operand"); } } -void HexagonInstPrinter::printImmOperand - (const MCInst *MI, unsigned OpNo, raw_ostream &O) const { +void HexagonInstPrinter::printImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) const { O << MI->getOperand(OpNo).getImm(); } void HexagonInstPrinter::printExtOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) const { + raw_ostream &O) const { O << MI->getOperand(OpNo).getImm(); } -void HexagonInstPrinter::printUnsignedImmOperand - (const MCInst *MI, unsigned OpNo, raw_ostream &O) const { +void HexagonInstPrinter::printUnsignedImmOperand(const MCInst *MI, + unsigned OpNo, raw_ostream &O) const { O << MI->getOperand(OpNo).getImm(); } @@ -89,13 +120,13 @@ void HexagonInstPrinter::printNegImmOperand(const MCInst *MI, unsigned OpNo, O << -MI->getOperand(OpNo).getImm(); } -void HexagonInstPrinter::printNOneImmOperand - (const MCInst *MI, unsigned OpNo, raw_ostream &O) const { +void HexagonInstPrinter::printNOneImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) const { O << -1; } -void HexagonInstPrinter::printMEMriOperand - (const MCInst *MI, unsigned OpNo, raw_ostream &O) const { +void HexagonInstPrinter::printMEMriOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) const { const MCOperand& MO0 = MI->getOperand(OpNo); const MCOperand& MO1 = MI->getOperand(OpNo + 1); @@ -103,8 +134,8 @@ void HexagonInstPrinter::printMEMriOperand O << " + #" << MO1.getImm(); } -void HexagonInstPrinter::printFrameIndexOperand - (const MCInst *MI, unsigned OpNo, raw_ostream &O) const { +void HexagonInstPrinter::printFrameIndexOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) const { const MCOperand& MO0 = MI->getOperand(OpNo); const MCOperand& MO1 = MI->getOperand(OpNo + 1); @@ -113,24 +144,21 @@ void HexagonInstPrinter::printFrameIndexOperand void HexagonInstPrinter::printGlobalOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const { - const MCOperand& MO = MI->getOperand(OpNo); - assert(MO.isExpr() && "Expecting expression"); + assert(MI->getOperand(OpNo).isExpr() && "Expecting expression"); printOperand(MI, OpNo, O); } void HexagonInstPrinter::printJumpTable(const MCInst *MI, unsigned OpNo, raw_ostream &O) const { - const MCOperand& MO = MI->getOperand(OpNo); - assert(MO.isExpr() && "Expecting expression"); + assert(MI->getOperand(OpNo).isExpr() && "Expecting expression"); printOperand(MI, OpNo, O); } void HexagonInstPrinter::printConstantPool(const MCInst *MI, unsigned OpNo, raw_ostream &O) const { - const MCOperand& MO = MI->getOperand(OpNo); - assert(MO.isExpr() && "Expecting expression"); + assert(MI->getOperand(OpNo).isExpr() && "Expecting expression"); printOperand(MI, OpNo, O); } diff --git a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h index dad4334..902a323 100644 --- a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h +++ b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h @@ -14,6 +14,7 @@ #ifndef HEXAGONINSTPRINTER_H #define HEXAGONINSTPRINTER_H +#include "HexagonMCInst.h" #include "llvm/MC/MCInstPrinter.h" namespace llvm { @@ -25,6 +26,7 @@ namespace llvm { : MCInstPrinter(MAI, MII, MRI) {} virtual void printInst(const MCInst *MI, raw_ostream &O, 
StringRef Annot); + void printInst(const HexagonMCInst *MI, raw_ostream &O, StringRef Annot); virtual StringRef getOpcodeName(unsigned Opcode) const; void printInstruction(const MCInst *MI, raw_ostream &O); StringRef getRegName(unsigned RegNo) const; @@ -33,16 +35,16 @@ namespace llvm { void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; void printImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; void printExtOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; - void printUnsignedImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) - const; + void printUnsignedImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) const; void printNegImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; void printNOneImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; void printMEMriOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; - void printFrameIndexOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) - const; + void printFrameIndexOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) const; void printBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; void printCallOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) @@ -55,7 +57,8 @@ namespace llvm { const; void printJumpTable(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; - void printConstantPool(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; + void printConstantPool(const MCInst *MI, unsigned OpNo, + raw_ostream &O) const; void printSymbolHi(const MCInst *MI, unsigned OpNo, raw_ostream &O) const { printSymbol(MI, OpNo, O, true); } diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h index ed55c3c..7221e90 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h @@ -23,14 +23,41 @@ namespace llvm { /// instruction info tracks. /// namespace HexagonII { - // *** The code below must match HexagonInstrFormat*.td *** // + // Insn types. + // *** Must match HexagonInstrFormat*.td *** + enum Type { + TypePSEUDO = 0, + TypeALU32 = 1, + TypeCR = 2, + TypeJR = 3, + TypeJ = 4, + TypeLD = 5, + TypeST = 6, + TypeSYSTEM = 7, + TypeXTYPE = 8, + TypeMEMOP = 9, + TypeNV = 10, + TypePREFIX = 30, // Such as extenders. + TypeMARKER = 31 // Such as end of a HW loop. + }; + + + // MCInstrDesc TSFlags + // *** Must match HexagonInstrFormat*.td *** enum { + // This 5-bit field describes the insn type. + TypePos = 0, + TypeMask = 0x1f, + + // Solo instructions. + SoloPos = 5, + SoloMask = 0x1, // Predicated instructions. 
- PredicatedPos = 1, + PredicatedPos = 6, PredicatedMask = 0x1 }; diff --git a/lib/Target/LLVMBuild.txt b/lib/Target/LLVMBuild.txt index 8ec5673..8995080 100644 --- a/lib/Target/LLVMBuild.txt +++ b/lib/Target/LLVMBuild.txt @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = ARM CellSPU CppBackend Hexagon MBlaze MSP430 Mips PTX PowerPC Sparc X86 XCore +subdirectories = ARM CellSPU CppBackend Hexagon MBlaze MSP430 NVPTX Mips PowerPC Sparc X86 XCore ; This is a special group whose required libraries are extended (by llvm-build) ; with the best execution engine (the native JIT, if available, or the diff --git a/lib/Target/MBlaze/CMakeLists.txt b/lib/Target/MBlaze/CMakeLists.txt index bf1deef..6c3e8b6 100644 --- a/lib/Target/MBlaze/CMakeLists.txt +++ b/lib/Target/MBlaze/CMakeLists.txt @@ -30,6 +30,8 @@ add_llvm_target(MBlazeCodeGen MBlazeELFWriterInfo.cpp ) +add_dependencies(LLVMMBlazeCodeGen intrinsics_gen) + add_subdirectory(AsmParser) add_subdirectory(Disassembler) add_subdirectory(InstPrinter) diff --git a/lib/Target/MBlaze/MBlaze.td b/lib/Target/MBlaze/MBlaze.td index b4edff0..c288855 100644 --- a/lib/Target/MBlaze/MBlaze.td +++ b/lib/Target/MBlaze/MBlaze.td @@ -50,7 +50,7 @@ def FeatureSqrt : SubtargetFeature<"sqrt", "HasSqrt", "true", // MBlaze processors supported. //===----------------------------------------------------------------------===// -def : Processor<"mblaze", MBlazeGenericItineraries, []>; +def : Processor<"mblaze", NoItineraries, []>; def : Processor<"mblaze3", MBlazePipe3Itineraries, []>; def : Processor<"mblaze5", MBlazePipe5Itineraries, []>; diff --git a/lib/Target/MBlaze/MBlazeAsmPrinter.cpp b/lib/Target/MBlaze/MBlazeAsmPrinter.cpp index 55fffe3..e9f340f 100644 --- a/lib/Target/MBlaze/MBlazeAsmPrinter.cpp +++ b/lib/Target/MBlaze/MBlazeAsmPrinter.cpp @@ -135,7 +135,7 @@ void MBlazeAsmPrinter::printSavedRegsBitmask() { for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); unsigned RegNum = getMBlazeRegisterNumbering(Reg); - if (MBlaze::GPRRegisterClass->contains(Reg)) + if (MBlaze::GPRRegClass.contains(Reg)) CPUBitmask |= (1 << RegNum); } @@ -187,7 +187,7 @@ void MBlazeAsmPrinter::EmitFunctionBodyEnd() { //===----------------------------------------------------------------------===// void MBlazeAsmPrinter::EmitInstruction(const MachineInstr *MI) { - MBlazeMCInstLower MCInstLowering(OutContext, *Mang, *this); + MBlazeMCInstLower MCInstLowering(OutContext, *this); MCInst TmpInst; MCInstLowering.Lower(MI, TmpInst); @@ -200,7 +200,13 @@ PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant,const char *ExtraCode, raw_ostream &O) { // Does this asm operand have a single letter operand modifier? if (ExtraCode && ExtraCode[0]) - return true; // Unknown modifier. + if (ExtraCode[1] != 0) return true; // Unknown modifier. + + switch (ExtraCode[0]) { + default: + // See if this is a generic print operand + return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); + } printOperand(MI, OpNo, O); return false; diff --git a/lib/Target/MBlaze/MBlazeISelLowering.cpp b/lib/Target/MBlaze/MBlazeISelLowering.cpp index edfc335..310c25e 100644 --- a/lib/Target/MBlaze/MBlazeISelLowering.cpp +++ b/lib/Target/MBlaze/MBlazeISelLowering.cpp @@ -62,9 +62,9 @@ MBlazeTargetLowering::MBlazeTargetLowering(MBlazeTargetMachine &TM) setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct? 
// Set up the register classes - addRegisterClass(MVT::i32, MBlaze::GPRRegisterClass); + addRegisterClass(MVT::i32, &MBlaze::GPRRegClass); if (Subtarget->hasFPU()) { - addRegisterClass(MVT::f32, MBlaze::GPRRegisterClass); + addRegisterClass(MVT::f32, &MBlaze::GPRRegClass); setOperationAction(ISD::ConstantFP, MVT::f32, Legal); } @@ -291,12 +291,12 @@ MBlazeTargetLowering::EmitCustomShift(MachineInstr *MI, loop->addSuccessor(finish); loop->addSuccessor(loop); - unsigned IAMT = R.createVirtualRegister(MBlaze::GPRRegisterClass); + unsigned IAMT = R.createVirtualRegister(&MBlaze::GPRRegClass); BuildMI(MBB, dl, TII->get(MBlaze::ANDI), IAMT) .addReg(MI->getOperand(2).getReg()) .addImm(31); - unsigned IVAL = R.createVirtualRegister(MBlaze::GPRRegisterClass); + unsigned IVAL = R.createVirtualRegister(&MBlaze::GPRRegClass); BuildMI(MBB, dl, TII->get(MBlaze::ADDIK), IVAL) .addReg(MI->getOperand(1).getReg()) .addImm(0); @@ -305,14 +305,14 @@ MBlazeTargetLowering::EmitCustomShift(MachineInstr *MI, .addReg(IAMT) .addMBB(finish); - unsigned DST = R.createVirtualRegister(MBlaze::GPRRegisterClass); - unsigned NDST = R.createVirtualRegister(MBlaze::GPRRegisterClass); + unsigned DST = R.createVirtualRegister(&MBlaze::GPRRegClass); + unsigned NDST = R.createVirtualRegister(&MBlaze::GPRRegClass); BuildMI(loop, dl, TII->get(MBlaze::PHI), DST) .addReg(IVAL).addMBB(MBB) .addReg(NDST).addMBB(loop); - unsigned SAMT = R.createVirtualRegister(MBlaze::GPRRegisterClass); - unsigned NAMT = R.createVirtualRegister(MBlaze::GPRRegisterClass); + unsigned SAMT = R.createVirtualRegister(&MBlaze::GPRRegClass); + unsigned NAMT = R.createVirtualRegister(&MBlaze::GPRRegClass); BuildMI(loop, dl, TII->get(MBlaze::PHI), SAMT) .addReg(IAMT).addMBB(MBB) .addReg(NAMT).addMBB(loop); @@ -500,7 +500,7 @@ MBlazeTargetLowering::EmitCustomAtomic(MachineInstr *MI, case MBlaze::LAN32: opcode = MBlaze::AND; break; } - finalReg = R.createVirtualRegister(MBlaze::GPRRegisterClass); + finalReg = R.createVirtualRegister(&MBlaze::GPRRegClass); start->addSuccessor(exit); start->addSuccessor(start); @@ -510,7 +510,7 @@ MBlazeTargetLowering::EmitCustomAtomic(MachineInstr *MI, if (MI->getOpcode() == MBlaze::LAN32) { unsigned tmp = finalReg; - finalReg = R.createVirtualRegister(MBlaze::GPRRegisterClass); + finalReg = R.createVirtualRegister(&MBlaze::GPRRegClass); BuildMI(start, dl, TII->get(MBlaze::XORI), finalReg) .addReg(tmp) .addImm(-1); @@ -528,7 +528,7 @@ MBlazeTargetLowering::EmitCustomAtomic(MachineInstr *MI, final->addSuccessor(exit); final->addSuccessor(start); - unsigned CMP = R.createVirtualRegister(MBlaze::GPRRegisterClass); + unsigned CMP = R.createVirtualRegister(&MBlaze::GPRRegClass); BuildMI(start, dl, TII->get(MBlaze::CMP), CMP) .addReg(MI->getOperand(0).getReg()) .addReg(MI->getOperand(2).getReg()); @@ -543,7 +543,7 @@ MBlazeTargetLowering::EmitCustomAtomic(MachineInstr *MI, } } - unsigned CHK = R.createVirtualRegister(MBlaze::GPRRegisterClass); + unsigned CHK = R.createVirtualRegister(&MBlaze::GPRRegClass); BuildMI(final, dl, TII->get(MBlaze::SWX)) .addReg(finalReg) .addReg(MI->getOperand(1).getReg()) @@ -681,13 +681,19 @@ static bool CC_MBlaze_AssignReg(unsigned &ValNo, MVT &ValVT, MVT &LocVT, /// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted. /// TODO: isVarArg, isTailCall. 
SDValue MBlazeTargetLowering:: -LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, - bool isVarArg, bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, +LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + DebugLoc &dl = CLI.DL; + SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; + SmallVector<SDValue, 32> &OutVals = CLI.OutVals; + SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &isTailCall = CLI.IsTailCall; + CallingConv::ID CallConv = CLI.CallConv; + bool isVarArg = CLI.IsVarArg; + // MBlaze does not yet support tail call optimization isTailCall = false; @@ -702,7 +708,7 @@ LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CC_MBlaze); // Get a count of how many bytes are to be pushed on the stack. @@ -841,7 +847,7 @@ LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, // Assign locations to each value returned by this call. SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_MBlaze); @@ -884,7 +890,7 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeFormalArguments(Ins, CC_MBlaze); SDValue StackPtr; @@ -899,9 +905,9 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const TargetRegisterClass *RC; if (RegVT == MVT::i32) - RC = MBlaze::GPRRegisterClass; + RC = &MBlaze::GPRRegClass; else if (RegVT == MVT::f32) - RC = MBlaze::GPRRegisterClass; + RC = &MBlaze::GPRRegClass; else llvm_unreachable("RegVT not supported by LowerFormalArguments"); @@ -964,7 +970,7 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, StackPtr = DAG.getRegister(StackReg, getPointerTy()); // The last register argument that must be saved is MBlaze::R10 - const TargetRegisterClass *RC = MBlaze::GPRRegisterClass; + const TargetRegisterClass *RC = &MBlaze::GPRRegClass; unsigned Begin = getMBlazeRegisterNumbering(MBlaze::R5); unsigned Start = getMBlazeRegisterNumbering(ArgRegEnd+1); @@ -1016,7 +1022,7 @@ LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, // CCState - Info about the registers and stack slot. CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); // Analize return values. 
CCInfo.AnalyzeReturn(Outs, RetCC_MBlaze); @@ -1124,14 +1130,14 @@ getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': - return std::make_pair(0U, MBlaze::GPRRegisterClass); + return std::make_pair(0U, &MBlaze::GPRRegClass); // TODO: These can't possibly be right, but match what was in // getRegClassForInlineAsmConstraint. case 'd': case 'y': case 'f': if (VT == MVT::f32) - return std::make_pair(0U, MBlaze::GPRRegisterClass); + return std::make_pair(0U, &MBlaze::GPRRegClass); } } return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); diff --git a/lib/Target/MBlaze/MBlazeISelLowering.h b/lib/Target/MBlaze/MBlazeISelLowering.h index 6a79fc1..a01fab5 100644 --- a/lib/Target/MBlaze/MBlazeISelLowering.h +++ b/lib/Target/MBlaze/MBlazeISelLowering.h @@ -132,13 +132,7 @@ namespace llvm { SmallVectorImpl<SDValue> &InVals) const; virtual SDValue - LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, - bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const; virtual SDValue diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.cpp b/lib/Target/MBlaze/MBlazeInstrInfo.cpp index db71434..b5025fc 100644 --- a/lib/Target/MBlaze/MBlazeInstrInfo.cpp +++ b/lib/Target/MBlaze/MBlazeInstrInfo.cpp @@ -287,7 +287,7 @@ unsigned MBlazeInstrInfo::getGlobalBaseReg(MachineFunction *MF) const { MachineRegisterInfo &RegInfo = MF->getRegInfo(); const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); - GlobalBaseReg = RegInfo.createVirtualRegister(MBlaze::GPRRegisterClass); + GlobalBaseReg = RegInfo.createVirtualRegister(&MBlaze::GPRRegClass); BuildMI(FirstMBB, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), GlobalBaseReg).addReg(MBlaze::R20); RegInfo.addLiveIn(MBlaze::R20); diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.td b/lib/Target/MBlaze/MBlazeInstrInfo.td index 02a2157..139bf71 100644 --- a/lib/Target/MBlaze/MBlazeInstrInfo.td +++ b/lib/Target/MBlaze/MBlazeInstrInfo.td @@ -295,7 +295,7 @@ class BranchI<bits<6> op, bits<5> br, string instr_asm> : // Branch and Link Instructions //===----------------------------------------------------------------------===// class BranchL<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> : - TA<op, flags, (outs), (ins GPR:$link, GPR:$target, variable_ops), + TA<op, flags, (outs), (ins GPR:$link, GPR:$target), !strconcat(instr_asm, " $link, $target"), [], IIC_BRl> { let ra = br; @@ -303,7 +303,7 @@ class BranchL<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> : } class BranchLI<bits<6> op, bits<5> br, string instr_asm> : - TB<op, (outs), (ins GPR:$link, calltarget:$target, variable_ops), + TB<op, (outs), (ins GPR:$link, calltarget:$target), !strconcat(instr_asm, " $link, $target"), [], IIC_BRl> { let ra = br; diff --git a/lib/Target/MBlaze/MBlazeMCInstLower.h b/lib/Target/MBlaze/MBlazeMCInstLower.h index 7b97744..8ab2c9a 100644 --- a/lib/Target/MBlaze/MBlazeMCInstLower.h +++ b/lib/Target/MBlaze/MBlazeMCInstLower.h @@ -21,18 +21,16 @@ namespace llvm { class MachineInstr; class MachineModuleInfoMachO; class MachineOperand; - class Mangler; /// MBlazeMCInstLower - This class is used to lower an MachineInstr /// into an MCInst. 
class LLVM_LIBRARY_VISIBILITY MBlazeMCInstLower { MCContext &Ctx; - Mangler &Mang; AsmPrinter &Printer; public: - MBlazeMCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer) - : Ctx(ctx), Mang(mang), Printer(printer) {} + MBlazeMCInstLower(MCContext &ctx, AsmPrinter &printer) + : Ctx(ctx), Printer(printer) {} void Lower(const MachineInstr *MI, MCInst &OutMI) const; MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; diff --git a/lib/Target/MBlaze/MBlazeSchedule.td b/lib/Target/MBlaze/MBlazeSchedule.td index 4a3ae5f..cd5691c 100644 --- a/lib/Target/MBlaze/MBlazeSchedule.td +++ b/lib/Target/MBlaze/MBlazeSchedule.td @@ -40,11 +40,6 @@ def IIC_WDC : InstrItinClass; def IIC_Pseudo : InstrItinClass; //===----------------------------------------------------------------------===// -// MBlaze generic instruction itineraries. -//===----------------------------------------------------------------------===// -def MBlazeGenericItineraries : ProcessorItineraries<[], [], []>; - -//===----------------------------------------------------------------------===// // MBlaze instruction itineraries for three stage pipeline. //===----------------------------------------------------------------------===// include "MBlazeSchedule3.td" diff --git a/lib/Target/MBlaze/MBlazeSubtarget.cpp b/lib/Target/MBlaze/MBlazeSubtarget.cpp index d12d142..dc2ad29 100644 --- a/lib/Target/MBlaze/MBlazeSubtarget.cpp +++ b/lib/Target/MBlaze/MBlazeSubtarget.cpp @@ -43,13 +43,6 @@ MBlazeSubtarget::MBlazeSubtarget(const std::string &TT, // Initialize scheduling itinerary for the specified CPU. InstrItins = getInstrItineraryForCPU(CPUName); - - // Compute the issue width of the MBlaze itineraries - computeIssueWidth(); -} - -void MBlazeSubtarget::computeIssueWidth() { - InstrItins.IssueWidth = 1; } bool MBlazeSubtarget:: diff --git a/lib/Target/MBlaze/MBlazeTargetMachine.cpp b/lib/Target/MBlaze/MBlazeTargetMachine.cpp index dd7de9b..5f82f14 100644 --- a/lib/Target/MBlaze/MBlazeTargetMachine.cpp +++ b/lib/Target/MBlaze/MBlazeTargetMachine.cpp @@ -68,7 +68,7 @@ TargetPassConfig *MBlazeTargetMachine::createPassConfig(PassManagerBase &PM) { // Install an instruction selector pass using // the ISelDag to gen MBlaze code. bool MBlazePassConfig::addInstSelector() { - PM.add(createMBlazeISelDag(getMBlazeTargetMachine())); + addPass(createMBlazeISelDag(getMBlazeTargetMachine())); return false; } @@ -76,6 +76,6 @@ bool MBlazePassConfig::addInstSelector() { // machine code is emitted. return true if -print-machineinstrs should // print out the code after the passes. 
bool MBlazePassConfig::addPreEmitPass() { - PM.add(createMBlazeDelaySlotFillerPass(getMBlazeTargetMachine())); + addPass(createMBlazeDelaySlotFillerPass(getMBlazeTargetMachine())); return true; } diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp index c9b1636..bfd11a0 100644 --- a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp +++ b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp @@ -98,6 +98,7 @@ public: MCCodeEmitter *llvm::createMBlazeMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI, MCContext &Ctx) { return new MBlazeMCCodeEmitter(MCII, STI, Ctx); diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h index ae82c32..7cc96c6 100644 --- a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h +++ b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h @@ -22,6 +22,7 @@ class MCContext; class MCCodeEmitter; class MCInstrInfo; class MCObjectWriter; +class MCRegisterInfo; class MCSubtargetInfo; class Target; class StringRef; @@ -30,6 +31,7 @@ class raw_ostream; extern Target TheMBlazeTarget; MCCodeEmitter *createMBlazeMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI, MCContext &Ctx); diff --git a/lib/Target/MSP430/CMakeLists.txt b/lib/Target/MSP430/CMakeLists.txt index a8f9b52..f9ecaed 100644 --- a/lib/Target/MSP430/CMakeLists.txt +++ b/lib/Target/MSP430/CMakeLists.txt @@ -23,6 +23,8 @@ add_llvm_target(MSP430CodeGen MSP430MCInstLower.cpp ) +add_dependencies(LLVMMSP430CodeGen intrinsics_gen) + add_subdirectory(InstPrinter) add_subdirectory(TargetInfo) add_subdirectory(MCTargetDesc) diff --git a/lib/Target/MSP430/MSP430AsmPrinter.cpp b/lib/Target/MSP430/MSP430AsmPrinter.cpp index 1d1094b..86bc183 100644 --- a/lib/Target/MSP430/MSP430AsmPrinter.cpp +++ b/lib/Target/MSP430/MSP430AsmPrinter.cpp @@ -154,7 +154,7 @@ bool MSP430AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, //===----------------------------------------------------------------------===// void MSP430AsmPrinter::EmitInstruction(const MachineInstr *MI) { - MSP430MCInstLower MCInstLowering(OutContext, *Mang, *this); + MSP430MCInstLower MCInstLowering(OutContext, *this); MCInst TmpInst; MCInstLowering.Lower(MI, TmpInst); diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp index 071a2f7..f8b7e14 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -59,13 +59,13 @@ HWMultMode("msp430-hwmult-mode", MSP430TargetLowering::MSP430TargetLowering(MSP430TargetMachine &tm) : TargetLowering(tm, new TargetLoweringObjectFileELF()), - Subtarget(*tm.getSubtargetImpl()), TM(tm) { + Subtarget(*tm.getSubtargetImpl()) { TD = getTargetData(); // Set up the register classes. 
- addRegisterClass(MVT::i8, MSP430::GR8RegisterClass); - addRegisterClass(MVT::i16, MSP430::GR16RegisterClass); + addRegisterClass(MVT::i8, &MSP430::GR8RegClass); + addRegisterClass(MVT::i16, &MSP430::GR16RegClass); // Compute derived properties from the register classes computeRegisterProperties(); @@ -226,9 +226,9 @@ getRegForInlineAsmConstraint(const std::string &Constraint, default: break; case 'r': // GENERAL_REGS if (VT == MVT::i8) - return std::make_pair(0U, MSP430::GR8RegisterClass); + return std::make_pair(0U, &MSP430::GR8RegClass); - return std::make_pair(0U, MSP430::GR16RegisterClass); + return std::make_pair(0U, &MSP430::GR16RegClass); } } @@ -266,14 +266,19 @@ MSP430TargetLowering::LowerFormalArguments(SDValue Chain, } SDValue -MSP430TargetLowering::LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, - bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, +MSP430TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + DebugLoc &dl = CLI.DL; + SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; + SmallVector<SDValue, 32> &OutVals = CLI.OutVals; + SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &isTailCall = CLI.IsTailCall; + CallingConv::ID CallConv = CLI.CallConv; + bool isVarArg = CLI.IsVarArg; + // MSP430 target does not yet support tail call optimization. isTailCall = false; @@ -310,7 +315,7 @@ MSP430TargetLowering::LowerCCCArguments(SDValue Chain, // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeFormalArguments(Ins, CC_MSP430); assert(!isVarArg && "Varargs not supported yet"); @@ -330,8 +335,7 @@ MSP430TargetLowering::LowerCCCArguments(SDValue Chain, llvm_unreachable(0); } case MVT::i16: - unsigned VReg = - RegInfo.createVirtualRegister(MSP430::GR16RegisterClass); + unsigned VReg = RegInfo.createVirtualRegister(&MSP430::GR16RegClass); RegInfo.addLiveIn(VA.getLocReg(), VReg); SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, RegVT); @@ -391,7 +395,7 @@ MSP430TargetLowering::LowerReturn(SDValue Chain, // CCState - Info about the registers and stack slot. CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); // Analize return values. CCInfo.AnalyzeReturn(Outs, RetCC_MSP430); @@ -445,7 +449,7 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CC_MSP430); @@ -568,7 +572,7 @@ MSP430TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, // Assign locations to each value returned by this call. 
SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_MSP430); @@ -1024,27 +1028,27 @@ MSP430TargetLowering::EmitShiftInstr(MachineInstr *MI, default: llvm_unreachable("Invalid shift opcode!"); case MSP430::Shl8: Opc = MSP430::SHL8r1; - RC = MSP430::GR8RegisterClass; + RC = &MSP430::GR8RegClass; break; case MSP430::Shl16: Opc = MSP430::SHL16r1; - RC = MSP430::GR16RegisterClass; + RC = &MSP430::GR16RegClass; break; case MSP430::Sra8: Opc = MSP430::SAR8r1; - RC = MSP430::GR8RegisterClass; + RC = &MSP430::GR8RegClass; break; case MSP430::Sra16: Opc = MSP430::SAR16r1; - RC = MSP430::GR16RegisterClass; + RC = &MSP430::GR16RegClass; break; case MSP430::Srl8: Opc = MSP430::SAR8r1c; - RC = MSP430::GR8RegisterClass; + RC = &MSP430::GR8RegClass; break; case MSP430::Srl16: Opc = MSP430::SAR16r1c; - RC = MSP430::GR16RegisterClass; + RC = &MSP430::GR16RegClass; break; } @@ -1072,8 +1076,8 @@ MSP430TargetLowering::EmitShiftInstr(MachineInstr *MI, LoopBB->addSuccessor(RemBB); LoopBB->addSuccessor(LoopBB); - unsigned ShiftAmtReg = RI.createVirtualRegister(MSP430::GR8RegisterClass); - unsigned ShiftAmtReg2 = RI.createVirtualRegister(MSP430::GR8RegisterClass); + unsigned ShiftAmtReg = RI.createVirtualRegister(&MSP430::GR8RegClass); + unsigned ShiftAmtReg2 = RI.createVirtualRegister(&MSP430::GR8RegClass); unsigned ShiftReg = RI.createVirtualRegister(RC); unsigned ShiftReg2 = RI.createVirtualRegister(RC); unsigned ShiftAmtSrcReg = MI->getOperand(2).getReg(); diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h index e372f00..d8ad02f 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.h +++ b/lib/Target/MSP430/MSP430ISelLowering.h @@ -152,12 +152,7 @@ namespace llvm { DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; virtual SDValue - LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, - bool isVarArg, bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const; virtual SDValue @@ -174,7 +169,6 @@ namespace llvm { SelectionDAG &DAG) const; const MSP430Subtarget &Subtarget; - const MSP430TargetMachine &TM; const TargetData *TD; }; } // namespace llvm diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp index c03ba47..be332f0 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.cpp +++ b/lib/Target/MSP430/MSP430InstrInfo.cpp @@ -29,7 +29,7 @@ using namespace llvm; MSP430InstrInfo::MSP430InstrInfo(MSP430TargetMachine &tm) : MSP430GenInstrInfo(MSP430::ADJCALLSTACKDOWN, MSP430::ADJCALLSTACKUP), - RI(tm, *this), TM(tm) {} + RI(tm, *this) {} void MSP430InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h index 04f339b..d79f992 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.h +++ b/lib/Target/MSP430/MSP430InstrInfo.h @@ -42,7 +42,6 @@ namespace MSP430II { class MSP430InstrInfo : public MSP430GenInstrInfo { const MSP430RegisterInfo RI; - MSP430TargetMachine &TM; public: explicit MSP430InstrInfo(MSP430TargetMachine &TM); diff --git a/lib/Target/MSP430/MSP430InstrInfo.td 
b/lib/Target/MSP430/MSP430InstrInfo.td index 4348dd5..f003574 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.td +++ b/lib/Target/MSP430/MSP430InstrInfo.td @@ -210,13 +210,13 @@ let isCall = 1 in let Defs = [R12W, R13W, R14W, R15W, SRW], Uses = [SPW] in { def CALLi : II16i<0x0, - (outs), (ins i16imm:$dst, variable_ops), + (outs), (ins i16imm:$dst), "call\t$dst", [(MSP430call imm:$dst)]>; def CALLr : II16r<0x0, - (outs), (ins GR16:$dst, variable_ops), + (outs), (ins GR16:$dst), "call\t$dst", [(MSP430call GR16:$dst)]>; def CALLm : II16m<0x0, - (outs), (ins memsrc:$dst, variable_ops), + (outs), (ins memsrc:$dst), "call\t${dst:mem}", [(MSP430call (load addr:$dst))]>; } diff --git a/lib/Target/MSP430/MSP430MCInstLower.h b/lib/Target/MSP430/MSP430MCInstLower.h index 24151e2..794aa56 100644 --- a/lib/Target/MSP430/MSP430MCInstLower.h +++ b/lib/Target/MSP430/MSP430MCInstLower.h @@ -21,18 +21,16 @@ namespace llvm { class MachineInstr; class MachineModuleInfoMachO; class MachineOperand; - class Mangler; /// MSP430MCInstLower - This class is used to lower an MachineInstr /// into an MCInst. class LLVM_LIBRARY_VISIBILITY MSP430MCInstLower { MCContext &Ctx; - Mangler &Mang; AsmPrinter &Printer; public: - MSP430MCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer) - : Ctx(ctx), Mang(mang), Printer(printer) {} + MSP430MCInstLower(MCContext &ctx, AsmPrinter &printer) + : Ctx(ctx), Printer(printer) {} void Lower(const MachineInstr *MI, MCInst &OutMI) const; MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp index 51ec71a..aed46a2 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.cpp +++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp @@ -96,7 +96,8 @@ BitVector MSP430RegisterInfo::getReservedRegs(const MachineFunction &MF) const { } const TargetRegisterClass * -MSP430RegisterInfo::getPointerRegClass(unsigned Kind) const { +MSP430RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) + const { return &MSP430::GR16RegClass; } diff --git a/lib/Target/MSP430/MSP430RegisterInfo.h b/lib/Target/MSP430/MSP430RegisterInfo.h index 82ee499..9ee0a03 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.h +++ b/lib/Target/MSP430/MSP430RegisterInfo.h @@ -39,7 +39,8 @@ public: const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const; BitVector getReservedRegs(const MachineFunction &MF) const; - const TargetRegisterClass* getPointerRegClass(unsigned Kind = 0) const; + const TargetRegisterClass* + getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const; void eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, diff --git a/lib/Target/MSP430/MSP430RegisterInfo.td b/lib/Target/MSP430/MSP430RegisterInfo.td index 3f2eb8c..07619d0 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.td +++ b/lib/Target/MSP430/MSP430RegisterInfo.td @@ -78,8 +78,4 @@ def GR16 : RegisterClass<"MSP430", [i16], 16, // Frame pointer, sometimes allocable FPW, // Volatile, but not allocable - PCW, SPW, SRW, CGW)> -{ - let SubRegClasses = [(GR8 subreg_8bit)]; -} - + PCW, SPW, SRW, CGW)>; diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp index 9f2eda1..817001d 100644 --- a/lib/Target/MSP430/MSP430TargetMachine.cpp +++ b/lib/Target/MSP430/MSP430TargetMachine.cpp @@ -60,12 +60,12 @@ TargetPassConfig *MSP430TargetMachine::createPassConfig(PassManagerBase &PM) { bool MSP430PassConfig::addInstSelector() { // Install an 
instruction selector. - PM.add(createMSP430ISelDag(getMSP430TargetMachine(), getOptLevel())); + addPass(createMSP430ISelDag(getMSP430TargetMachine(), getOptLevel())); return false; } bool MSP430PassConfig::addPreEmitPass() { // Must run branch selection immediately preceding the asm printer. - PM.add(createMSP430BranchSelectionPass()); + addPass(createMSP430BranchSelectionPass()); return false; } diff --git a/lib/Target/Mips/AsmParser/CMakeLists.txt b/lib/Target/Mips/AsmParser/CMakeLists.txt index ac21c25..6c7343b 100644 --- a/lib/Target/Mips/AsmParser/CMakeLists.txt +++ b/lib/Target/Mips/AsmParser/CMakeLists.txt @@ -1,6 +1,5 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) - add_llvm_library(LLVMMipsAsmParser MipsAsmParser.cpp ) +add_dependencies(LLVMMipsAsmParser MipsCommonTableGen) diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt index 0500c5d..e9a228c 100644 --- a/lib/Target/Mips/CMakeLists.txt +++ b/lib/Target/Mips/CMakeLists.txt @@ -17,13 +17,12 @@ add_llvm_target(MipsCodeGen MipsAsmPrinter.cpp MipsCodeEmitter.cpp MipsDelaySlotFiller.cpp - MipsEmitGPRestore.cpp - MipsExpandPseudo.cpp MipsJITInfo.cpp MipsInstrInfo.cpp MipsISelDAGToDAG.cpp MipsISelLowering.cpp MipsFrameLowering.cpp + MipsLongBranch.cpp MipsMCInstLower.cpp MipsMachineFunction.cpp MipsRegisterInfo.cpp @@ -33,6 +32,8 @@ add_llvm_target(MipsCodeGen MipsSelectionDAGInfo.cpp ) +add_dependencies(LLVMMipsCodeGen intrinsics_gen) + add_subdirectory(InstPrinter) add_subdirectory(Disassembler) add_subdirectory(TargetInfo) diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp index 78dbc06..042b456 100644 --- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp +++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp @@ -13,136 +13,87 @@ #include "Mips.h" #include "MipsSubtarget.h" +#include "MipsRegisterInfo.h" #include "llvm/MC/EDInstInfo.h" #include "llvm/MC/MCDisassembler.h" #include "llvm/Support/MemoryObject.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCInst.h" -#include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/MathExtras.h" - #include "MipsGenEDInfo.inc" using namespace llvm; typedef MCDisassembler::DecodeStatus DecodeStatus; -/// MipsDisassembler - a disasembler class for Mips32. -class MipsDisassembler : public MCDisassembler { +namespace { + +/// MipsDisassemblerBase - a disassembler class for Mips. +class MipsDisassemblerBase : public MCDisassembler { public: /// Constructor - Initializes the disassembler. /// - MipsDisassembler(const MCSubtargetInfo &STI, bool bigEndian) : - MCDisassembler(STI), isBigEndian(bigEndian) { - } - - ~MipsDisassembler() { - } + MipsDisassemblerBase(const MCSubtargetInfo &STI, const MCRegisterInfo *Info, + bool bigEndian) : + MCDisassembler(STI), RegInfo(Info), isBigEndian(bigEndian) {} - /// getInstruction - See MCDisassembler. - DecodeStatus getInstruction(MCInst &instr, - uint64_t &size, - const MemoryObject &region, - uint64_t address, - raw_ostream &vStream, - raw_ostream &cStream) const; + virtual ~MipsDisassemblerBase() {} /// getEDInfo - See MCDisassembler. const EDInstInfo *getEDInfo() const; + const MCRegisterInfo *getRegInfo() const { return RegInfo; } + private: + const MCRegisterInfo *RegInfo; +protected: bool isBigEndian; }; - -/// Mips64Disassembler - a disasembler class for Mips64. -class Mips64Disassembler : public MCDisassembler { +/// MipsDisassembler - a disassembler class for Mips32. 
+class MipsDisassembler : public MipsDisassemblerBase { public: /// Constructor - Initializes the disassembler. /// - Mips64Disassembler(const MCSubtargetInfo &STI, bool bigEndian) : - MCDisassembler(STI), isBigEndian(bigEndian) { - } - - ~Mips64Disassembler() { - } + MipsDisassembler(const MCSubtargetInfo &STI, const MCRegisterInfo *Info, + bool bigEndian) : + MipsDisassemblerBase(STI, Info, bigEndian) {} /// getInstruction - See MCDisassembler. - DecodeStatus getInstruction(MCInst &instr, - uint64_t &size, - const MemoryObject &region, - uint64_t address, - raw_ostream &vStream, - raw_ostream &cStream) const; - - /// getEDInfo - See MCDisassembler. - const EDInstInfo *getEDInfo() const; - -private: - bool isBigEndian; + virtual DecodeStatus getInstruction(MCInst &instr, + uint64_t &size, + const MemoryObject &region, + uint64_t address, + raw_ostream &vStream, + raw_ostream &cStream) const; }; -const EDInstInfo *MipsDisassembler::getEDInfo() const { - return instInfoMips; -} - -const EDInstInfo *Mips64Disassembler::getEDInfo() const { - return instInfoMips; -} - -// Decoder tables for Mips register -static const unsigned CPURegsTable[] = { - Mips::ZERO, Mips::AT, Mips::V0, Mips::V1, - Mips::A0, Mips::A1, Mips::A2, Mips::A3, - Mips::T0, Mips::T1, Mips::T2, Mips::T3, - Mips::T4, Mips::T5, Mips::T6, Mips::T7, - Mips::S0, Mips::S1, Mips::S2, Mips::S3, - Mips::S4, Mips::S5, Mips::S6, Mips::S7, - Mips::T8, Mips::T9, Mips::K0, Mips::K1, - Mips::GP, Mips::SP, Mips::FP, Mips::RA -}; -static const unsigned FGR32RegsTable[] = { - Mips::F0, Mips::F1, Mips::F2, Mips::F3, - Mips::F4, Mips::F5, Mips::F6, Mips::F7, - Mips::F8, Mips::F9, Mips::F10, Mips::F11, - Mips::F12, Mips::F13, Mips::F14, Mips::F15, - Mips::F16, Mips::F17, Mips::F18, Mips::F18, - Mips::F20, Mips::F21, Mips::F22, Mips::F23, - Mips::F24, Mips::F25, Mips::F26, Mips::F27, - Mips::F28, Mips::F29, Mips::F30, Mips::F31 -}; +/// Mips64Disassembler - a disassembler class for Mips64. +class Mips64Disassembler : public MipsDisassemblerBase { +public: + /// Constructor - Initializes the disassembler. + /// + Mips64Disassembler(const MCSubtargetInfo &STI, const MCRegisterInfo *Info, + bool bigEndian) : + MipsDisassemblerBase(STI, Info, bigEndian) {} -static const unsigned CPU64RegsTable[] = { - Mips::ZERO_64, Mips::AT_64, Mips::V0_64, Mips::V1_64, - Mips::A0_64, Mips::A1_64, Mips::A2_64, Mips::A3_64, - Mips::T0_64, Mips::T1_64, Mips::T2_64, Mips::T3_64, - Mips::T4_64, Mips::T5_64, Mips::T6_64, Mips::T7_64, - Mips::S0_64, Mips::S1_64, Mips::S2_64, Mips::S3_64, - Mips::S4_64, Mips::S5_64, Mips::S6_64, Mips::S7_64, - Mips::T8_64, Mips::T9_64, Mips::K0_64, Mips::K1_64, - Mips::GP_64, Mips::SP_64, Mips::FP_64, Mips::RA_64 + /// getInstruction - See MCDisassembler. 
+ virtual DecodeStatus getInstruction(MCInst &instr, + uint64_t &size, + const MemoryObject ®ion, + uint64_t address, + raw_ostream &vStream, + raw_ostream &cStream) const; }; -static const unsigned FGR64RegsTable[] = { - Mips::D0_64, Mips::D1_64, Mips::D2_64, Mips::D3_64, - Mips::D4_64, Mips::D5_64, Mips::D6_64, Mips::D7_64, - Mips::D8_64, Mips::D9_64, Mips::D10_64, Mips::D11_64, - Mips::D12_64, Mips::D13_64, Mips::D14_64, Mips::D15_64, - Mips::D16_64, Mips::D17_64, Mips::D18_64, Mips::D19_64, - Mips::D20_64, Mips::D21_64, Mips::D22_64, Mips::D23_64, - Mips::D24_64, Mips::D25_64, Mips::D26_64, Mips::D27_64, - Mips::D28_64, Mips::D29_64, Mips::D30_64, Mips::D31_64 -}; +} // end anonymous namespace -static const unsigned AFGR64RegsTable[] = { - Mips::D0, Mips::D1, Mips::D2, Mips::D3, - Mips::D4, Mips::D5, Mips::D6, Mips::D7, - Mips::D8, Mips::D9, Mips::D10, Mips::D11, - Mips::D12, Mips::D13, Mips::D14, Mips::D15 -}; +const EDInstInfo *MipsDisassemblerBase::getEDInfo() const { + return instInfoMips; +} // Forward declare these because the autogenerated code will reference them. // Definitions are further down. @@ -239,25 +190,25 @@ extern Target TheMipselTarget, TheMipsTarget, TheMips64Target, static MCDisassembler *createMipsDisassembler( const Target &T, const MCSubtargetInfo &STI) { - return new MipsDisassembler(STI,true); + return new MipsDisassembler(STI, T.createMCRegInfo(""), true); } static MCDisassembler *createMipselDisassembler( const Target &T, const MCSubtargetInfo &STI) { - return new MipsDisassembler(STI,false); + return new MipsDisassembler(STI, T.createMCRegInfo(""), false); } static MCDisassembler *createMips64Disassembler( const Target &T, const MCSubtargetInfo &STI) { - return new Mips64Disassembler(STI,true); + return new Mips64Disassembler(STI, T.createMCRegInfo(""), true); } static MCDisassembler *createMips64elDisassembler( const Target &T, const MCSubtargetInfo &STI) { - return new Mips64Disassembler(STI, false); + return new Mips64Disassembler(STI, T.createMCRegInfo(""), false); } extern "C" void LLVMInitializeMipsDisassembler() { @@ -362,6 +313,11 @@ Mips64Disassembler::getInstruction(MCInst &instr, return MCDisassembler::Fail; } +static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) { + const MipsDisassemblerBase *Dis = static_cast<const MipsDisassemblerBase*>(D); + return *(Dis->getRegInfo()->getRegClass(RC).begin() + RegNo); +} + static DecodeStatus DecodeCPU64RegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, @@ -370,7 +326,8 @@ static DecodeStatus DecodeCPU64RegsRegisterClass(MCInst &Inst, if (RegNo > 31) return MCDisassembler::Fail; - Inst.addOperand(MCOperand::CreateReg(CPU64RegsTable[RegNo])); + unsigned Reg = getReg(Decoder, Mips::CPU64RegsRegClassID, RegNo); + Inst.addOperand(MCOperand::CreateReg(Reg)); return MCDisassembler::Success; } @@ -380,8 +337,8 @@ static DecodeStatus DecodeCPURegsRegisterClass(MCInst &Inst, const void *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; - - Inst.addOperand(MCOperand::CreateReg(CPURegsTable[RegNo])); + unsigned Reg = getReg(Decoder, Mips::CPURegsRegClassID, RegNo); + Inst.addOperand(MCOperand::CreateReg(Reg)); return MCDisassembler::Success; } @@ -392,7 +349,8 @@ static DecodeStatus DecodeFGR64RegisterClass(MCInst &Inst, if (RegNo > 31) return MCDisassembler::Fail; - Inst.addOperand(MCOperand::CreateReg(FGR64RegsTable[RegNo])); + unsigned Reg = getReg(Decoder, Mips::FGR64RegClassID, RegNo); + Inst.addOperand(MCOperand::CreateReg(Reg)); return MCDisassembler::Success; } @@ -403,7 
+361,8 @@ static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst, if (RegNo > 31) return MCDisassembler::Fail; - Inst.addOperand(MCOperand::CreateReg(FGR32RegsTable[RegNo])); + unsigned Reg = getReg(Decoder, Mips::FGR32RegClassID, RegNo); + Inst.addOperand(MCOperand::CreateReg(Reg)); return MCDisassembler::Success; } @@ -420,15 +379,18 @@ static DecodeStatus DecodeMem(MCInst &Inst, uint64_t Address, const void *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); - int Reg = (int)fieldFromInstruction32(Insn, 16, 5); - int Base = (int)fieldFromInstruction32(Insn, 21, 5); + unsigned Reg = fieldFromInstruction32(Insn, 16, 5); + unsigned Base = fieldFromInstruction32(Insn, 21, 5); + + Reg = getReg(Decoder, Mips::CPURegsRegClassID, Reg); + Base = getReg(Decoder, Mips::CPURegsRegClassID, Base); if(Inst.getOpcode() == Mips::SC){ - Inst.addOperand(MCOperand::CreateReg(CPURegsTable[Reg])); + Inst.addOperand(MCOperand::CreateReg(Reg)); } - Inst.addOperand(MCOperand::CreateReg(CPURegsTable[Reg])); - Inst.addOperand(MCOperand::CreateReg(CPURegsTable[Base])); + Inst.addOperand(MCOperand::CreateReg(Reg)); + Inst.addOperand(MCOperand::CreateReg(Base)); Inst.addOperand(MCOperand::CreateImm(Offset)); return MCDisassembler::Success; @@ -439,11 +401,14 @@ static DecodeStatus DecodeFMem(MCInst &Inst, uint64_t Address, const void *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); - int Reg = (int)fieldFromInstruction32(Insn, 16, 5); - int Base = (int)fieldFromInstruction32(Insn, 21, 5); + unsigned Reg = fieldFromInstruction32(Insn, 16, 5); + unsigned Base = fieldFromInstruction32(Insn, 21, 5); - Inst.addOperand(MCOperand::CreateReg(FGR64RegsTable[Reg])); - Inst.addOperand(MCOperand::CreateReg(CPURegsTable[Base])); + Reg = getReg(Decoder, Mips::FGR64RegClassID, Reg); + Base = getReg(Decoder, Mips::CPURegsRegClassID, Base); + + Inst.addOperand(MCOperand::CreateReg(Reg)); + Inst.addOperand(MCOperand::CreateReg(Base)); Inst.addOperand(MCOperand::CreateImm(Offset)); return MCDisassembler::Success; @@ -474,10 +439,11 @@ static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { - if (RegNo > 31) + if (RegNo > 30 || RegNo % 2) return MCDisassembler::Fail; - Inst.addOperand(MCOperand::CreateReg(AFGR64RegsTable[RegNo])); + unsigned Reg = getReg(Decoder, Mips::AFGR64RegClassID, RegNo / 2); + Inst.addOperand(MCOperand::CreateReg(Reg)); return MCDisassembler::Success; } @@ -488,7 +455,7 @@ static DecodeStatus DecodeHWRegs64RegisterClass(MCInst &Inst, //Currently only hardware register 29 is supported if (RegNo != 29) return MCDisassembler::Fail; - Inst.addOperand(MCOperand::CreateReg(Mips::HWR29)); + Inst.addOperand(MCOperand::CreateReg(Mips::HWR29_64)); return MCDisassembler::Success; } diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp index 6886b17..b38463d 100644 --- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp +++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp @@ -13,6 +13,7 @@ #define DEBUG_TYPE "asm-printer" #include "MipsInstPrinter.h" +#include "MipsInstrInfo.h" #include "llvm/ADT/StringExtras.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" @@ -68,8 +69,25 @@ void MipsInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { void MipsInstPrinter::printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) { + switch (MI->getOpcode()) { + default: + break; + case Mips::RDHWR: + case Mips::RDHWR64: + O << "\t.set\tpush\n"; + O << "\t.set\tmips32r2\n"; + } 
+ printInstruction(MI, O); printAnnotation(O, Annot); + + switch (MI->getOpcode()) { + default: + break; + case Mips::RDHWR: + case Mips::RDHWR64: + O << "\n\t.set\tpop"; + } } static void printExpr(const MCExpr *Expr, raw_ostream &OS) { @@ -108,6 +126,8 @@ static void printExpr(const MCExpr *Expr, raw_ostream &OS) { case MCSymbolRefExpr::VK_Mips_GOT_DISP: OS << "%got_disp("; break; case MCSymbolRefExpr::VK_Mips_GOT_PAGE: OS << "%got_page("; break; case MCSymbolRefExpr::VK_Mips_GOT_OFST: OS << "%got_ofst("; break; + case MCSymbolRefExpr::VK_Mips_HIGHER: OS << "%higher("; break; + case MCSymbolRefExpr::VK_Mips_HIGHEST: OS << "%highest("; break; } OS << SRE->getSymbol(); diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h index 76b839b..3d8a6f9 100644 --- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h +++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h @@ -16,7 +16,7 @@ #include "llvm/MC/MCInstPrinter.h" namespace llvm { -// These enumeration declarations were orignally in MipsInstrInfo.h but +// These enumeration declarations were originally in MipsInstrInfo.h but // had to be moved here to avoid circular dependencies between // LLVMMipsCodeGen and LLVMMipsAsmPrinter. namespace Mips { diff --git a/lib/Target/Mips/MCTargetDesc/Makefile b/lib/Target/Mips/MCTargetDesc/Makefile index 7fe2086..22a2721 100644 --- a/lib/Target/Mips/MCTargetDesc/Makefile +++ b/lib/Target/Mips/MCTargetDesc/Makefile @@ -14,3 +14,4 @@ LIBRARYNAME = LLVMMipsDesc CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. include $(LEVEL)/Makefile.common + diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index 9b4caf6..6fe0c11 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -36,6 +36,11 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { case FK_GPRel_4: case FK_Data_4: case Mips::fixup_Mips_LO16: + case Mips::fixup_Mips_GPOFF_HI: + case Mips::fixup_Mips_GPOFF_LO: + case Mips::fixup_Mips_GOT_PAGE: + case Mips::fixup_Mips_GOT_OFST: + case Mips::fixup_Mips_GOT_DISP: break; case Mips::fixup_Mips_PC16: // So far we are only using this type for branches. @@ -74,7 +79,8 @@ public: :MCAsmBackend(), OSType(_OSType), IsLittle(_isLittle), Is64Bit(_is64Bit) {} MCObjectWriter *createObjectWriter(raw_ostream &OS) const { - return createMipsELFObjectWriter(OS, OSType, IsLittle, Is64Bit); + return createMipsELFObjectWriter(OS, + MCELFObjectTargetWriter::getOSABI(OSType), IsLittle, Is64Bit); } /// ApplyFixup - Apply the \arg Value for given \arg Fixup into the provided @@ -115,7 +121,8 @@ public: CurVal |= (uint64_t)((uint8_t)Data[Offset + Idx]) << (i*8); } - uint64_t Mask = ((uint64_t)(-1) >> (64 - getFixupKindInfo(Kind).TargetSize)); + uint64_t Mask = ((uint64_t)(-1) >> + (64 - getFixupKindInfo(Kind).TargetSize)); CurVal |= Value & Mask; // Write out the fixed up bytes back to the code/data bits. 
@@ -156,7 +163,12 @@ public: { "fixup_Mips_TLSLDM", 0, 16, 0 }, { "fixup_Mips_DTPREL_HI", 0, 16, 0 }, { "fixup_Mips_DTPREL_LO", 0, 16, 0 }, - { "fixup_Mips_Branch_PCRel", 0, 16, MCFixupKindInfo::FKF_IsPCRel } + { "fixup_Mips_Branch_PCRel", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_Mips_GPOFF_HI", 0, 16, 0 }, + { "fixup_Mips_GPOFF_LO", 0, 16, 0 }, + { "fixup_Mips_GOT_PAGE", 0, 16, 0 }, + { "fixup_Mips_GOT_OFST", 0, 16, 0 }, + { "fixup_Mips_GOT_DISP", 0, 16, 0 } }; if (Kind < FirstTargetFixupKind) @@ -206,6 +218,14 @@ public: /// /// \return - True on success. bool writeNopData(uint64_t Count, MCObjectWriter *OW) const { + // Reject a byte count that is not a multiple of the instruction size. + // FIXME: 16 bit instructions are not handled yet here. + // We shouldn't be using a hard coded number for instruction size. + if (Count % 4) return false; + + uint64_t NumNops = Count / 4; + for (uint64_t i = 0; i != NumNops; ++i) + OW->Write32(0); return true; } }; // class MipsAsmBackend diff --git a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h index fb1c5ce..234455e 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h +++ b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h @@ -79,7 +79,12 @@ namespace MipsII { MO_GPOFF_LO, MO_GOT_DISP, MO_GOT_PAGE, - MO_GOT_OFST + MO_GOT_OFST, + + /// MO_HIGHER/HIGHEST - Represents the highest or higher half word of a + /// 64-bit symbol address. + MO_HIGHER, + MO_HIGHEST }; enum { diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index 2091bec..77c1524 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -34,7 +34,7 @@ namespace { class MipsELFObjectWriter : public MCELFObjectTargetWriter { public: - MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI); + MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI, bool _isN64); virtual ~MipsELFObjectWriter(); @@ -52,9 +52,11 @@ namespace { }; } -MipsELFObjectWriter::MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI) +MipsELFObjectWriter::MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI, + bool _isN64) : MCELFObjectTargetWriter(_is64Bit, OSABI, ELF::EM_MIPS, - /*HasRelocationAddend*/ false) {} + /*HasRelocationAddend*/ false, + /*IsN64*/ _isN64) {} MipsELFObjectWriter::~MipsELFObjectWriter() {} @@ -148,8 +150,26 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target, case Mips::fixup_Mips_PC16: Type = ELF::R_MIPS_PC16; break; + case Mips::fixup_Mips_GOT_PAGE: + Type = ELF::R_MIPS_GOT_PAGE; + break; + case Mips::fixup_Mips_GOT_OFST: + Type = ELF::R_MIPS_GOT_OFST; + break; + case Mips::fixup_Mips_GOT_DISP: + Type = ELF::R_MIPS_GOT_DISP; + break; + case Mips::fixup_Mips_GPOFF_HI: + Type = setRType((unsigned)ELF::R_MIPS_GPREL16, Type); + Type = setRType2((unsigned)ELF::R_MIPS_SUB, Type); + Type = setRType3((unsigned)ELF::R_MIPS_HI16, Type); + break; + case Mips::fixup_Mips_GPOFF_LO: + Type = setRType((unsigned)ELF::R_MIPS_GPREL16, Type); + Type = setRType2((unsigned)ELF::R_MIPS_SUB, Type); + Type = setRType3((unsigned)ELF::R_MIPS_LO16, Type); + break; } - return Type; } @@ -184,10 +204,10 @@ static int CompareOffset(const RelEntry &R0, const RelEntry &R1) { void MipsELFObjectWriter::sortRelocs(const MCAssembler &Asm, std::vector<ELFRelocationEntry> &Relocs) { - // Call the defualt function first. Relocations are sorted in descending + // Call the default function first. Relocations are sorted in descending // order of r_offset. 
MCELFObjectTargetWriter::sortRelocs(Asm, Relocs); - + RelLs RelocLs; std::vector<RelLsIter> Unmatched; @@ -244,6 +264,7 @@ MCObjectWriter *llvm::createMipsELFObjectWriter(raw_ostream &OS, uint8_t OSABI, bool IsLittleEndian, bool Is64Bit) { - MCELFObjectTargetWriter *MOTW = new MipsELFObjectWriter(Is64Bit, OSABI); + MCELFObjectTargetWriter *MOTW = new MipsELFObjectWriter(Is64Bit, OSABI, + Is64Bit); return createELFObjectWriter(MOTW, OS, IsLittleEndian); } diff --git a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h index 9b76eda..f5cbbd5 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h +++ b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h @@ -95,6 +95,21 @@ namespace Mips { // PC relative branch fixup resulting in - R_MIPS_PC16 fixup_Mips_Branch_PCRel, + // resulting in - R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16 + fixup_Mips_GPOFF_HI, + + // resulting in - R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16 + fixup_Mips_GPOFF_LO, + + // resulting in - R_MIPS_GOT_PAGE + fixup_Mips_GOT_PAGE, + + // resulting in - R_MIPS_GOT_OFST + fixup_Mips_GOT_OFST, + + // resulting in - R_MIPS_GOT_DISP + fixup_Mips_GOT_DISP, + // Marker LastTargetFixupKind, NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp index 27954b1..ff3b3a7 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp @@ -91,6 +91,7 @@ public: } // namespace MCCodeEmitter *llvm::createMipsMCCodeEmitterEB(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI, MCContext &Ctx) { @@ -98,6 +99,7 @@ MCCodeEmitter *llvm::createMipsMCCodeEmitterEB(const MCInstrInfo &MCII, } MCCodeEmitter *llvm::createMipsMCCodeEmitterEL(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI, MCContext &Ctx) { @@ -179,7 +181,7 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO, } else if (MO.isFPImm()) { return static_cast<unsigned>(APFloat(MO.getFPImm()) .bitcastToAPInt().getHiBits(32).getLimitedValue()); - } + } // MO must be an Expr. 
assert(MO.isExpr()); @@ -193,10 +195,27 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO, } assert (Kind == MCExpr::SymbolRef); - - Mips::Fixups FixupKind; + + Mips::Fixups FixupKind = Mips::Fixups(0); switch(cast<MCSymbolRefExpr>(Expr)->getKind()) { + default: llvm_unreachable("Unknown fixup kind!"); + break; + case MCSymbolRefExpr::VK_Mips_GPOFF_HI : + FixupKind = Mips::fixup_Mips_GPOFF_HI; + break; + case MCSymbolRefExpr::VK_Mips_GPOFF_LO : + FixupKind = Mips::fixup_Mips_GPOFF_LO; + break; + case MCSymbolRefExpr::VK_Mips_GOT_PAGE : + FixupKind = Mips::fixup_Mips_GOT_PAGE; + break; + case MCSymbolRefExpr::VK_Mips_GOT_OFST : + FixupKind = Mips::fixup_Mips_GOT_OFST; + break; + case MCSymbolRefExpr::VK_Mips_GOT_DISP : + FixupKind = Mips::fixup_Mips_GOT_DISP; + break; case MCSymbolRefExpr::VK_Mips_GPREL: FixupKind = Mips::fixup_Mips_GPREL16; break; @@ -236,8 +255,6 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO, case MCSymbolRefExpr::VK_Mips_TPREL_LO: FixupKind = Mips::fixup_Mips_TPREL_LO; break; - default: - break; } // switch Fixups.push_back(MCFixup::Create(0, MO.getExpr(), MCFixupKind(FixupKind))); diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h index 547ccdd..bfcc2a2 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h +++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h @@ -22,6 +22,7 @@ class MCCodeEmitter; class MCContext; class MCInstrInfo; class MCObjectWriter; +class MCRegisterInfo; class MCSubtargetInfo; class StringRef; class Target; @@ -33,9 +34,11 @@ extern Target TheMips64Target; extern Target TheMips64elTarget; MCCodeEmitter *createMipsMCCodeEmitterEB(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI, MCContext &Ctx); MCCodeEmitter *createMipsMCCodeEmitterEL(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI, MCContext &Ctx); diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h index bafadc8..2963f7e 100644 --- a/lib/Target/Mips/Mips.h +++ b/lib/Target/Mips/Mips.h @@ -24,9 +24,7 @@ namespace llvm { FunctionPass *createMipsISelDag(MipsTargetMachine &TM); FunctionPass *createMipsDelaySlotFillerPass(MipsTargetMachine &TM); - FunctionPass *createMipsExpandPseudoPass(MipsTargetMachine &TM); - FunctionPass *createMipsEmitGPRestorePass(MipsTargetMachine &TM); - + FunctionPass *createMipsLongBranchPass(MipsTargetMachine &TM); FunctionPass *createMipsJITCodeEmitterPass(MipsTargetMachine &TM, JITCodeEmitter &JCE); diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td index cbebe84..8548ae0 100644 --- a/lib/Target/Mips/Mips.td +++ b/lib/Target/Mips/Mips.td @@ -72,6 +72,9 @@ def FeatureMips64r2 : SubtargetFeature<"mips64r2", "MipsArchVersion", "Mips64r2", "Mips64r2 ISA Support", [FeatureMips64, FeatureMips32r2]>; +def FeatureMips16 : SubtargetFeature<"mips16", "InMips16Mode", "true", + "Mips16 mode">; + //===----------------------------------------------------------------------===// // Mips processors supported. 
//===----------------------------------------------------------------------===// @@ -83,6 +86,7 @@ def : Proc<"mips32", [FeatureMips32]>; def : Proc<"mips32r2", [FeatureMips32r2]>; def : Proc<"mips64", [FeatureMips64]>; def : Proc<"mips64r2", [FeatureMips64r2]>; +def : Proc<"mips16", [FeatureMips16]>; def MipsAsmWriter : AsmWriter { string AsmWriterClassName = "InstPrinter"; diff --git a/lib/Target/Mips/Mips16InstrFormats.td b/lib/Target/Mips/Mips16InstrFormats.td new file mode 100644 index 0000000..61602b6 --- /dev/null +++ b/lib/Target/Mips/Mips16InstrFormats.td @@ -0,0 +1,663 @@ +//===- Mips16InstrFormats.td - Mips Instruction Formats ----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Describe MIPS instructions format +// +// CPU INSTRUCTION FORMATS +// +// funct or f Function field +// +// immediate 4-,5-,8- or 11-bit immediate, branch displacement, or +// or imm address displacement +// +// op 5-bit major operation code +// +// rx 3-bit source or destination register +// +// ry 3-bit source or destination register +// +// rz 3-bit source or destination register +// +// sa 3- or 5-bit shift amount +// +//===----------------------------------------------------------------------===// + +// Format specifies the encoding used by the instruction. This is part of the +// ad-hoc solution used to emit machine instruction encodings by our machine +// code emitter. +// +class Format16<bits<5> val> { + bits<5> Value = val; +} + +def Pseudo16 : Format16<0>; +def FrmI16 : Format16<1>; +def FrmRI16 : Format16<2>; +def FrmRR16 : Format16<3>; +def FrmRRI16 : Format16<4>; +def FrmRRR16 : Format16<5>; +def FrmRRI_A16 : Format16<6>; +def FrmSHIFT16 : Format16<7>; +def FrmI8_TYPE16 : Format16<8>; +def FrmI8_MOVR3216 : Format16<9>; +def FrmI8_MOV32R16 : Format16<10>; +def FrmI8_SVRS16 : Format16<11>; +def FrmJAL16 : Format16<12>; +def FrmJALX16 : Format16<13>; +def FrmEXT_I16 : Format16<14>; +def FrmASMACRO16 : Format16<15>; +def FrmEXT_RI16 : Format16<16>; +def FrmEXT_RRI16 : Format16<17>; +def FrmEXT_RRI_A16 : Format16<18>; +def FrmEXT_SHIFT16 : Format16<19>; +def FrmEXT_I816 : Format16<20>; +def FrmEXT_I8_SVRS16 : Format16<21>; +def FrmOther16 : Format16<22>; // Instruction w/ a custom format + +// Base class for Mips 16 Format +// This class does not depend on the instruction size +// +class MipsInst16_Base<dag outs, dag ins, string asmstr, list<dag> pattern, + InstrItinClass itin, Format16 f>: Instruction +{ + Format16 Form = f; + + let Namespace = "Mips"; + + let OutOperandList = outs; + let InOperandList = ins; + + let AsmString = asmstr; + let Pattern = pattern; + let Itinerary = itin; + + // + // Attributes specific to Mips instructions... + // + bits<5> FormBits = Form.Value; + + // TSFlags layout should be kept in sync with MipsInstrInfo.h. + let TSFlags{4-0} = FormBits; + + let Predicates = [InMips16Mode]; +} + +// +// Generic Mips 16 Format +// +class MipsInst16<dag outs, dag ins, string asmstr, list<dag> pattern, + InstrItinClass itin, Format16 f>: + MipsInst16_Base<outs, ins, asmstr, pattern, itin, f> +{ + field bits<16> Inst; + bits<5> Opcode = 0; + + // Top 5 bits are the 'opcode' field + let Inst{15-11} = Opcode; +} + +// +// For 32 bit extended instruction forms. 
+// +class MipsInst16_32<dag outs, dag ins, string asmstr, list<dag> pattern, + InstrItinClass itin, Format16 f>: + MipsInst16_Base<outs, ins, asmstr, pattern, itin, f> +{ + field bits<32> Inst; + +} + +class MipsInst16_EXTEND<dag outs, dag ins, string asmstr, list<dag> pattern, + InstrItinClass itin, Format16 f>: + MipsInst16_32<outs, ins, asmstr, pattern, itin, f> +{ + let Inst{31-27} = 0b11110; +} + + + +// Mips Pseudo Instructions Format +class MipsPseudo16<dag outs, dag ins, string asmstr, list<dag> pattern>: + MipsInst16<outs, ins, asmstr, pattern, IIPseudo, Pseudo16> { + let isCodeGenOnly = 1; + let isPseudo = 1; +} + + +//===----------------------------------------------------------------------===// +// Format I instruction class in Mips : <|opcode|imm11|> +//===----------------------------------------------------------------------===// + +class FI16<bits<5> op, dag outs, dag ins, string asmstr, list<dag> pattern, + InstrItinClass itin>: + MipsInst16<outs, ins, asmstr, pattern, itin, FrmI16> +{ + bits<11> imm11; + + let Opcode = op; + + let Inst{10-0} = imm11; +} + +//===----------------------------------------------------------------------===// +// Format RI instruction class in Mips : <|opcode|rx|imm8|> +//===----------------------------------------------------------------------===// + +class FRI16<bits<5> op, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16<outs, ins, asmstr, pattern, itin, FrmRI16> +{ + bits<3> rx; + bits<8> imm8; + + let Opcode = op; + + let Inst{10-8} = rx; + let Inst{7-0} = imm8; +} + +//===----------------------------------------------------------------------===// +// Format RR instruction class in Mips : <|opcode|rx|ry|funct|> +//===----------------------------------------------------------------------===// + +class FRR16<bits<5> _funct, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16<outs, ins, asmstr, pattern, itin, FrmRR16> +{ + bits<3> rx; + bits<3> ry; + bits<5> funct; + + let Opcode = 0b11101; + let funct = _funct; + + let Inst{10-8} = rx; + let Inst{7-5} = ry; + let Inst{4-0} = funct; +} + +// +// For conversion functions. +// +class FRR_SF16<bits<5> _funct, bits<3> _subfunct, dag outs, dag ins, + string asmstr, list<dag> pattern, InstrItinClass itin>: + MipsInst16<outs, ins, asmstr, pattern, itin, FrmRR16> +{ + bits<3> rx; + bits<3> subfunct; + bits<5> funct; + + let Opcode = 0b11101; // RR + let funct = _funct; + let subfunct = _subfunct; + + let Inst{10-8} = rx; + let Inst{7-5} = subfunct; + let Inst{4-0} = funct; +} + +// +// just used for breakpoint (hardware and software) instructions. 
+// +class FC16<bits<5> _funct, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16<outs, ins, asmstr, pattern, itin, FrmRR16> +{ + bits<6> _code; // code is a keyword in tablegen + bits<5> funct; + + let Opcode = 0b11101; // RR + let funct = _funct; + + let Inst{10-5} = _code; + let Inst{4-0} = funct; +} + +// +// J(AL)R(C) subformat +// +class FRR16_JALRC<bits<1> _nd, bits<1> _l, bits<1> r_a, + dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16<outs, ins, asmstr, pattern, itin, FrmRR16> +{ + bits<3> rx; + bits<1> nd; + bits<1> l; + bits<1> ra; + + let nd = _nd; + let l = _l; + let ra = r_a; + + let Opcode = 0b11101; + + let Inst{10-8} = rx; + let Inst{7} = nd; + let Inst{6} = l; + let Inst{5} = ra; + let Inst{4-0} = 0; +} + +//===----------------------------------------------------------------------===// +// Format RRI instruction class in Mips : <|opcode|rx|ry|imm5|> +//===----------------------------------------------------------------------===// + +class FRRI16<bits<5> op, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16<outs, ins, asmstr, pattern, itin, FrmRRI16> +{ + bits<3> rx; + bits<3> ry; + bits<5> imm5; + + let Opcode = op; + + + let Inst{10-8} = rx; + let Inst{7-5} = ry; + let Inst{4-0} = imm5; +} + +//===----------------------------------------------------------------------===// +// Format RRR instruction class in Mips : <|opcode|rx|ry|rz|f|> +//===----------------------------------------------------------------------===// + +class FRRR16<bits<2> _f, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16<outs, ins, asmstr, pattern, itin, FrmRRR16> +{ + bits<3> rx; + bits<3> ry; + bits<3> rz; + bits<2> f; + + let Opcode = 0b11100; + let f = _f; + + let Inst{10-8} = rx; + let Inst{7-5} = ry; + let Inst{4-2} = rz; + let Inst{1-0} = f; +} + +//===----------------------------------------------------------------------===// +// Format RRI-A instruction class in Mips : <|opcode|rx|ry|f|imm4|> +//===----------------------------------------------------------------------===// + +class FRRI_A16<bits<1> _f, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16<outs, ins, asmstr, pattern, itin, FrmRRI_A16> +{ + bits<3> rx; + bits<3> ry; + bits<1> f; + bits<4> imm4; + + let Opcode = 0b01000; + let f = _f; + + let Inst{10-8} = rx; + let Inst{7-5} = ry; + let Inst{4} = f; + let Inst{3-0} = imm4; +} + +//===----------------------------------------------------------------------===// +// Format Shift instruction class in Mips : <|opcode|rx|ry|sa|f|> +//===----------------------------------------------------------------------===// + +class FSHIFT16<bits<2> _f, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16<outs, ins, asmstr, pattern, itin, FrmSHIFT16> +{ + bits<3> rx; + bits<3> ry; + bits<3> sa; + bits<2> f; + + let Opcode = 0b00110; + let f = _f; + + let Inst{10-8} = rx; + let Inst{7-5} = ry; + let Inst{4-2} = sa; + let Inst{1-0} = f; +} + +//===----------------------------------------------------------------------===// +// Format i8 instruction class in Mips : <|opcode|funct|imm8> +//===----------------------------------------------------------------------===// + +class FI816<bits<3> _func, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16<outs, ins, asmstr, pattern, itin, FrmI8_TYPE16> +{ + bits<3> func; + bits<8> imm8; + + 
let Opcode = 0b01100; + let func = _func; + + let Inst{10-8} = func; + let Inst{7-0} = imm8; +} + +//===----------------------------------------------------------------------===// +// Format i8_MOVR32 instruction class in Mips : <|opcode|func|ry|r32> +//===----------------------------------------------------------------------===// + +class FI8_MOVR3216<dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16<outs, ins, asmstr, pattern, itin, FrmI8_MOVR3216> +{ + + bits<4> ry; + bits<4> r32; + + let Opcode = 0b01100; + + let Inst{10-8} = 0b111; + let Inst{7-4} = ry; + let Inst{3-0} = r32; + +} + + + +//===----------------------------------------------------------------------===// +// Format i8_MOV32R instruction class in Mips : <|opcode|func|r32|rz> +//===----------------------------------------------------------------------===// + +class FI8_MOV32R16<dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16<outs, ins, asmstr, pattern, itin, FrmI8_MOV32R16> +{ + + bits<3> func; + bits<5> r32; + bits<3> rz; + + + let Opcode = 0b01100; + + let Inst{10-8} = 0b101; + let Inst{7-5} = r32{2-0}; + let Inst{4-3} = r32{4-3}; + let Inst{2-0} = rz; + +} + +//===----------------------------------------------------------------------===// +// Format i8_SVRS instruction class in Mips : +// <|opcode|svrs|s|ra|s0|s1|framesize> +//===----------------------------------------------------------------------===// + +class FI8_SVRS16<bits<1> _s, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16<outs, ins, asmstr, pattern, itin, FrmI8_SVRS16> +{ + bits<1> s; + bits<1> ra = 0; + bits<1> s0 = 0; + bits<1> s1 = 0; + bits<4> framesize = 0; + + let s =_s; + let Opcode = 0b01100; + + let Inst{10-8} = 0b100; + let Inst{7} = s; + let Inst{6} = ra; + let Inst{5} = s0; + let Inst{4} = s1; + let Inst{3-0} = framesize; + +} + +//===----------------------------------------------------------------------===// +// Format JAL instruction class in Mips16 : +// <|opcode|X|imm20:16|imm25:21|imm15:0> +//===----------------------------------------------------------------------===// + +class FJAL16<bits<1> _X, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16_32<outs, ins, asmstr, pattern, itin, FrmJAL16> +{ + bits<1> X; + bits<26> imm26; + + + let X = _X; + + let Inst{31-27} = 0b00011; + let Inst{26} = X; + let Inst{25-21} = imm26{20-16}; + let Inst{20-16} = imm26{25-21}; + let Inst{15-0} = imm26{15-0}; + +} + +//===----------------------------------------------------------------------===// +// Format EXT-I instruction class in Mips16 : +// <|EXTEND|imm10:5|imm15:11|op|0|0|0|0|0|0|imm4:0> +//===----------------------------------------------------------------------===// + +class FEXT_I16<bits<5> _eop, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmEXT_I16> +{ + bits<16> imm16; + bits<5> eop; + + let eop = _eop; + + let Inst{26-21} = imm16{10-5}; + let Inst{20-16} = imm16{15-11}; + let Inst{15-11} = eop; + let Inst{10-5} = 0; + let Inst{4-0} = imm16{4-0}; + +} + +//===----------------------------------------------------------------------===// +// Format ASMACRO instruction class in Mips16 : +// <EXTEND|select|p4|p3|RRR|p2|p1|p0> +//===----------------------------------------------------------------------===// + +class FASMACRO16<dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + 
MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmASMACRO16> +{ + bits<3> select; + bits<3> p4; + bits<5> p3; + bits<5> RRR = 0b11100; + bits<3> p2; + bits<3> p1; + bits<5> p0; + + + let Inst{26-24} = select; + let Inst{23-21} = p4; + let Inst{20-16} = p3; + let Inst{15-11} = RRR; + let Inst{10-8} = p2; + let Inst{7-5} = p1; + let Inst{4-0} = p0; + +} + + +//===----------------------------------------------------------------------===// +// Format EXT-RI instruction class in Mips16 : +// <|EXTEND|imm10:5|imm15:11|op|rx|0|0|0|imm4:0> +//===----------------------------------------------------------------------===// + +class FEXT_RI16<bits<5> _op, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmEXT_RI16> +{ + bits<16> imm16; + bits<5> op; + bits<3> rx; + + let op = _op; + + let Inst{26-21} = imm16{10-5}; + let Inst{20-16} = imm16{15-11}; + let Inst{15-11} = op; + let Inst{10-8} = rx; + let Inst{7-5} = 0; + let Inst{4-0} = imm16{4-0}; + +} + +//===----------------------------------------------------------------------===// +// Format EXT-RRI instruction class in Mips16 : +// <|EXTEND|imm10:5|imm15:11|op|rx|ry|imm4:0> +//===----------------------------------------------------------------------===// + +class FEXT_RRI16<bits<5> _op, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmEXT_RRI16> +{ + bits<5> op; + bits<16> imm16; + bits<3> rx; + bits<3> ry; + + let op=_op; + + let Inst{26-21} = imm16{10-5}; + let Inst{20-16} = imm16{15-11}; + let Inst{15-11} = op; + let Inst{10-8} = rx; + let Inst{7-5} = ry; + let Inst{4-0} = imm16{4-0}; + +} + +//===----------------------------------------------------------------------===// +// Format EXT-RRI-A instruction class in Mips16 : +// <|EXTEND|imm10:4|imm14:11|RRI-A|rx|ry|f|imm3:0> +//===----------------------------------------------------------------------===// + +class FEXT_RRI_A16<bits<1> _f, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmEXT_RRI_A16> +{ + bits<15> imm15; + bits<3> rx; + bits<3> ry; + bits<1> f; + + let f = _f; + + let Inst{26-20} = imm15{10-4}; + let Inst{19-16} = imm15{14-11}; + let Inst{15-11} = 0b01000; + let Inst{10-8} = rx; + let Inst{7-5} = ry; + let Inst{4} = f; + let Inst{3-0} = imm15{3-0}; + +} + +//===----------------------------------------------------------------------===// +// Format EXT-SHIFT instruction class in Mips16 : +// <|EXTEND|sa 4:0|s5|0|SHIFT|rx|ry|0|f> +//===----------------------------------------------------------------------===// + +class FEXT_SHIFT16<bits<2> _f, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmEXT_SHIFT16> +{ + bits<6> sa6; + bits<3> rx; + bits<3> ry; + bits<2> f; + + let f = _f; + + let Inst{26-22} = sa6{4-0}; + let Inst{21} = sa6{5}; + let Inst{20-16} = 0; + let Inst{15-11} = 0b00110; + let Inst{10-8} = rx; + let Inst{7-5} = ry; + let Inst{4-2} = 0; + let Inst{1-0} = f; + +} + +//===----------------------------------------------------------------------===// +// Format EXT-I8 instruction class in Mips16 : +// <|EXTEND|imm10:5|imm15:11|I8|funct|0|imm4:0> +//===----------------------------------------------------------------------===// + +class FEXT_I816<bits<3> _funct, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + 
MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmEXT_I816> +{ + bits<16> imm16; + bits<5> I8; + bits<3> funct; + + let funct = _funct; + let I8 = 0b01100; + + let Inst{26-21} = imm16{10-5}; + let Inst{20-16} = imm16{15-11}; + let Inst{15-11} = I8; + let Inst{10-8} = funct; + let Inst{7-5} = 0; + let Inst{4-0} = imm16{4-0}; + +} + +//===----------------------------------------------------------------------===// +// Format EXT-I8_SVRS instruction class in Mips16 : +// <|EXTEND|xsregs|framesize7:4|aregs|I8|SVRS|s|ra|s0|s1|framesize3:0> +//===----------------------------------------------------------------------===// + +class FEXT_I8_SVRS16<bits<1> s_, dag outs, dag ins, string asmstr, + list<dag> pattern, InstrItinClass itin>: + MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmI8_SVRS16> +{ + bits<3> xsregs =0; + bits<8> framesize =0; + bits<3> aregs =0; + bits<5> I8 = 0b01100; + bits<3> SVRS = 0b100; + bits<1> s; + bits<1> ra = 0; + bits<1> s0 = 0; + bits<1> s1 = 0; + + let s= s_; + + let Inst{26-24} = xsregs; + let Inst{23-20} = framesize{7-4}; + let Inst{19} = 0; + let Inst{18-16} = aregs; + let Inst{15-11} = I8; + let Inst{10-8} = SVRS; + let Inst{7} = s; + let Inst{6} = ra; + let Inst{5} = s0; + let Inst{4} = s1; + let Inst{3-0} = framesize{3-0}; + + +} + + + diff --git a/lib/Target/Mips/Mips16InstrInfo.td b/lib/Target/Mips/Mips16InstrInfo.td new file mode 100644 index 0000000..c852042 --- /dev/null +++ b/lib/Target/Mips/Mips16InstrInfo.td @@ -0,0 +1,243 @@ +//===- Mips16InstrInfo.td - Target Description for Mips16 -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes Mips16 instructions. 
+// +//===----------------------------------------------------------------------===// + +def uimm5 : Operand<i8> { + let DecoderMethod= "DecodeSimm16"; +} + +// +// RRR-type instruction format +// + +class FRRR16_ins<bits<2> _f, string asmstr, InstrItinClass itin> : + FRRR16<_f, (outs CPU16Regs:$rz), (ins CPU16Regs:$rx, CPU16Regs:$ry), + !strconcat(asmstr, "\t$rz, $rx, $ry"), [], itin>; + +// +// I8_MOV32R instruction format (used only by MOV32R instruction) +// +class FI8_MOV32R16_ins<string asmstr, InstrItinClass itin>: + FI8_MOV32R16<(outs CPURegs:$r32), (ins CPU16Regs:$rz), + !strconcat(asmstr, "\t$r32, $rz"), [], itin>; + +// +// EXT-RI instruction format +// + +class FEXT_RI16_ins_base<bits<5> _op, string asmstr, string asmstr2, + InstrItinClass itin>: + FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins simm16:$imm), + !strconcat(asmstr, asmstr2), [], itin>; + +class FEXT_RI16_ins<bits<5> _op, string asmstr, + InstrItinClass itin>: + FEXT_RI16_ins_base<_op, asmstr, "\t$rx, $imm", itin>; + +class FEXT_RI16_PC_ins<bits<5> _op, string asmstr, InstrItinClass itin>: + FEXT_RI16_ins_base<_op, asmstr, "\t$rx, $$pc, $imm", itin>; + +// +// RR-type instruction format +// +let rx=0 in +class FRR16_JALRC_RA_only_ins<bits<1> nd_, bits<1> l_, + string asmstr, InstrItinClass itin>: + FRR16_JALRC<nd_, l_, 1, (outs), (ins), !strconcat(asmstr, "\t $$ra"), + [], itin> ; + +// +// EXT-RRI instruction format +// + +class FEXT_RRI16_mem_ins<bits<5> op, string asmstr, Operand MemOpnd, + InstrItinClass itin>: + FEXT_RRI16<op, (outs CPU16Regs:$ry), (ins MemOpnd:$addr), + !strconcat(asmstr, "\t$ry, $addr"), [], itin>; + +// +// EXT-SHIFT instruction format +// +class FEXT_SHIFT16_ins<bits<2> _f, string asmstr, InstrItinClass itin>: + FEXT_SHIFT16<_f, (outs CPU16Regs:$rx), (ins CPU16Regs:$ry, uimm5:$sa), + !strconcat(asmstr, "\t$rx, $ry, $sa"), [], itin>; + +// +// Address operand +def mem16 : Operand<i32> { + let PrintMethod = "printMemOperand"; + let MIOperandInfo = (ops CPU16Regs, simm16); + let EncoderMethod = "getMemEncoding"; +} + +// +// Format: ADDIU rx, pc, immediate MIPS16e +// Purpose: Add Immediate Unsigned Word (3-Operand, PC-Relative, Extended) +// To add a constant to the program counter. +// +class AddiuRxPcImmX16_base : FEXT_RI16_PC_ins<0b00001, "addiu", IIAlu>; +def AddiuRxPcImmX16 : AddiuRxPcImmX16_base; +// +// Format: ADDU rz, rx, ry MIPS16e +// Purpose: Add Unsigned Word (3-Operand) +// To add 32-bit integers. +// + +class AdduRxRyRz16_base: FRRR16_ins<01, "addu", IIAlu>; +def AdduRxRyRz16: AdduRxRyRz16_base; + +// +// Format: JR ra MIPS16e +// Purpose: Jump Register Through Register ra +// To execute a branch to the instruction address in the return +// address register. +// + +def JrRa16: FRR16_JALRC_RA_only_ins<0, 0, "jr", IIAlu>; + +// +// Format: LI rx, immediate MIPS16e +// Purpose: Load Immediate (Extended) +// To load a constant into a GPR. +// +def LiRxImmX16: FEXT_RI16_ins<0b01101, "li", IIAlu>; + +// +// Format: LW ry, offset(rx) MIPS16e +// Purpose: Load Word (Extended) +// To load a word from memory as a signed value. +// +class LwRxRyOffMemX16_base: FEXT_RRI16_mem_ins<0b10011, "lw", mem16, IIAlu>; +def LwRxRyOffMemX16: LwRxRyOffMemX16_base; + +// +// Format: MOVE r32, rz MIPS16e +// Purpose: Move +// To move the contents of a GPR to a GPR. 
+// +def Mov32R16: FI8_MOV32R16_ins<"move", IIAlu>; +// +// Format: RESTORE {ra,}{s0/s1/s0-1,}{framesize} +// (All args are optional) MIPS16e +// Purpose: Restore Registers and Deallocate Stack Frame +// To deallocate a stack frame before exit from a subroutine, +// restoring return address and static registers, and adjusting +// stack +// + +// fixed form for restoring RA and the frame +// for direct object emitter, encoding needs to be adjusted for the +// frame size +// +let ra=1, s=0,s0=0,s1=0 in +def RestoreRaF16: + FI8_SVRS16<0b1, (outs), (ins uimm16:$frame_size), + "restore \t$$ra, $frame_size", [], IILoad >; + +// +// Format: SAVE {ra,}{s0/s1/s0-1,}{framesize} (All arguments are optional) +// MIPS16e +// Purpose: Save Registers and Set Up Stack Frame +// To set up a stack frame on entry to a subroutine, +// saving return address and static registers, and adjusting stack +// +let ra=1, s=1,s0=0,s1=0 in +def SaveRaF16: + FI8_SVRS16<0b1, (outs), (ins uimm16:$frame_size), + "save \t$$ra, $frame_size", [], IILoad >; + +// +// Format: SLL rx, ry, sa MIPS16e +// Purpose: Shift Word Left Logical (Extended) +// To execute a left-shift of a word by a fixed number of bits—0 to 31 bits. +// +def SllX16: FEXT_SHIFT16_ins<0b00, "sll", IIAlu>; + +// +// Format: SW ry, offset(rx) MIPS16e +// Purpose: Store Word (Extended) +// To store a word to memory. +// +class SwRxRyOffMemX16_base: FEXT_RRI16_mem_ins<0b11011, "sw", mem16, IIAlu>; +def SwRxRyOffMemX16: SwRxRyOffMemX16_base; + +class Mips16Pat<dag pattern, dag result> : Pat<pattern, result> { + let Predicates = [InMips16Mode]; +} + +class ArithLogicR16Defs<SDNode OpNode, bit isComm = 0> { + dag OutOperandList = (outs CPU16Regs:$rz); + dag InOperandList = (ins CPU16Regs:$rx, CPU16Regs:$ry); + list<dag> Pattern = [(set CPU16Regs:$rz, + (OpNode CPU16Regs:$rx, CPU16Regs:$ry))]; +} + +multiclass ArithLogicR16_base { + def _add: AdduRxRyRz16_base, ArithLogicR16Defs<add, 1>; +} + +defm ArithLogicR16_patt : ArithLogicR16_base; + +class LoadM16Defs<PatFrag OpNode, Operand _MemOpnd, bit Pseudo=0> { + bit isPseudo = Pseudo; + Operand MemOpnd = _MemOpnd; + dag OutOperandList = (outs CPU16Regs:$ry); + dag InOperandList = (ins MemOpnd:$addr); + list<dag> Pattern = [(set CPU16Regs:$ry, (OpNode addr:$addr))]; +} + +multiclass LoadM16_base { + def _LwRxRyOffMemX16: LwRxRyOffMemX16_base, LoadM16Defs<load_a, mem16>; +} + +defm LoadM16: LoadM16_base; + +class StoreM16Defs<PatFrag OpNode, Operand _MemOpnd, bit Pseudo=0> { + bit isPseudo = Pseudo; + Operand MemOpnd = _MemOpnd; + dag OutOperandList = (outs ); + dag InOperandList = (ins CPU16Regs:$ry, MemOpnd:$addr); + list<dag> Pattern = [(OpNode CPU16Regs:$ry, addr:$addr)]; +} + +multiclass StoreM16_base { + def _SwRxRyOffMemX16: SwRxRyOffMemX16_base, StoreM16Defs<store_a, mem16>; +} + +defm StoreM16: StoreM16_base; + +// Jump and Link (Call) +let isCall=1, hasDelaySlot=1 in +def JumpLinkReg16: + FRR16_JALRC<0, 0, 0, (outs), (ins CPU16Regs:$rs), + "jalr \t$rs", [(MipsJmpLink CPU16Regs:$rs)], IIBranch>; + +// Mips16 pseudos +let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, hasCtrlDep=1, + hasExtraSrcRegAllocReq = 1 in +def RetRA16 : MipsPseudo16<(outs), (ins), "", [(MipsRet)]>; + +// As stack alignment is always done with addiu, we need a 16-bit immediate +// This is basically deprecated code but needs to be there for things +// to work. 
+let Defs = [SP], Uses = [SP] in { +def ADJCALLSTACKDOWN16 : MipsPseudo16<(outs), (ins uimm16:$amt), + ";", + [(callseq_start timm:$amt)]>; +def ADJCALLSTACKUP16 : MipsPseudo16<(outs), (ins uimm16:$amt1, uimm16:$amt2), + ";", + [(callseq_end timm:$amt1, timm:$amt2)]>; +} + +// Small immediates +def : Mips16Pat<(i32 immZExt16:$in), (LiRxImmX16 immZExt16:$in)>; +def : Mips16Pat<(MipsLo tglobaladdr:$in), (LiRxImmX16 tglobaladdr:$in)>; diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td index 0382869..cceee24 100644 --- a/lib/Target/Mips/Mips64InstrInfo.td +++ b/lib/Target/Mips/Mips64InstrInfo.td @@ -49,21 +49,24 @@ class Div64<SDNode op, bits<6> func, string instr_asm, InstrItinClass itin>: Div<op, func, instr_asm, itin, CPU64Regs, [HI64, LO64]>; multiclass Atomic2Ops64<PatFrag Op, string Opstr> { - def #NAME# : Atomic2Ops<Op, Opstr, CPU64Regs, CPURegs>, Requires<[NotN64]>; - def _P8 : Atomic2Ops<Op, Opstr, CPU64Regs, CPU64Regs>, Requires<[IsN64]> { + def #NAME# : Atomic2Ops<Op, Opstr, CPU64Regs, CPURegs>, + Requires<[NotN64, HasStandardEncoding]>; + def _P8 : Atomic2Ops<Op, Opstr, CPU64Regs, CPU64Regs>, + Requires<[IsN64, HasStandardEncoding]> { let isCodeGenOnly = 1; } } multiclass AtomicCmpSwap64<PatFrag Op, string Width> { - def #NAME# : AtomicCmpSwap<Op, Width, CPU64Regs, CPURegs>, Requires<[NotN64]>; + def #NAME# : AtomicCmpSwap<Op, Width, CPU64Regs, CPURegs>, + Requires<[NotN64, HasStandardEncoding]>; def _P8 : AtomicCmpSwap<Op, Width, CPU64Regs, CPU64Regs>, - Requires<[IsN64]> { + Requires<[IsN64, HasStandardEncoding]> { let isCodeGenOnly = 1; } } } -let usesCustomInserter = 1, Predicates = [HasMips64], +let usesCustomInserter = 1, Predicates = [HasMips64, HasStandardEncoding], DecoderNamespace = "Mips64" in { defm ATOMIC_LOAD_ADD_I64 : Atomic2Ops64<atomic_load_add_64, "load_add_64">; defm ATOMIC_LOAD_SUB_I64 : Atomic2Ops64<atomic_load_sub_64, "load_sub_64">; @@ -106,9 +109,15 @@ def DSRA : shift_rotate_imm64<0x3b, 0x00, "dsra", sra>; def DSLLV : shift_rotate_reg<0x14, 0x00, "dsllv", shl, CPU64Regs>; def DSRLV : shift_rotate_reg<0x16, 0x00, "dsrlv", srl, CPU64Regs>; def DSRAV : shift_rotate_reg<0x17, 0x00, "dsrav", sra, CPU64Regs>; +let Pattern = []<dag> in { +def DSLL32 : shift_rotate_imm64<0x3c, 0x00, "dsll32", shl>; +def DSRL32 : shift_rotate_imm64<0x3e, 0x00, "dsrl32", srl>; +def DSRA32 : shift_rotate_imm64<0x3f, 0x00, "dsra32", sra>; +} } // Rotate Instructions -let Predicates = [HasMips64r2], DecoderNamespace = "Mips64" in { +let Predicates = [HasMips64r2, HasStandardEncoding], + DecoderNamespace = "Mips64" in { def DROTR : shift_rotate_imm64<0x3a, 0x01, "drotr", rotr>; def DROTRV : shift_rotate_reg<0x16, 0x01, "drotrv", rotr, CPU64Regs>; } @@ -137,18 +146,34 @@ defm USW64 : StoreM64<0x2b, "usw", truncstorei32_u, 1>; defm ULD : LoadM64<0x37, "uld", load_u, 1>; defm USD : StoreM64<0x3f, "usd", store_u, 1>; +/// load/store left/right +let isCodeGenOnly = 1 in { + defm LWL64 : LoadLeftRightM64<0x22, "lwl", MipsLWL>; + defm LWR64 : LoadLeftRightM64<0x26, "lwr", MipsLWR>; + defm SWL64 : StoreLeftRightM64<0x2a, "swl", MipsSWL>; + defm SWR64 : StoreLeftRightM64<0x2e, "swr", MipsSWR>; +} +defm LDL : LoadLeftRightM64<0x1a, "ldl", MipsLDL>; +defm LDR : LoadLeftRightM64<0x1b, "ldr", MipsLDR>; +defm SDL : StoreLeftRightM64<0x2c, "sdl", MipsSDL>; +defm SDR : StoreLeftRightM64<0x2d, "sdr", MipsSDR>; + /// Load-linked, Store-conditional -def LLD : LLBase<0x34, "lld", CPU64Regs, mem>, Requires<[NotN64]>; -def LLD_P8 : LLBase<0x34, "lld", CPU64Regs, mem64>, 
Requires<[IsN64]> { +def LLD : LLBase<0x34, "lld", CPU64Regs, mem>, + Requires<[NotN64, HasStandardEncoding]>; +def LLD_P8 : LLBase<0x34, "lld", CPU64Regs, mem64>, + Requires<[IsN64, HasStandardEncoding]> { let isCodeGenOnly = 1; } -def SCD : SCBase<0x3c, "scd", CPU64Regs, mem>, Requires<[NotN64]>; -def SCD_P8 : SCBase<0x3c, "scd", CPU64Regs, mem64>, Requires<[IsN64]> { +def SCD : SCBase<0x3c, "scd", CPU64Regs, mem>, + Requires<[NotN64, HasStandardEncoding]>; +def SCD_P8 : SCBase<0x3c, "scd", CPU64Regs, mem64>, + Requires<[IsN64, HasStandardEncoding]> { let isCodeGenOnly = 1; } /// Jump and Branch Instructions -def JR64 : JumpFR<0x00, 0x08, "jr", CPU64Regs>; +def JR64 : IndirectBranch<CPU64Regs>; def BEQ64 : CBranch<0x04, "beq", seteq, CPU64Regs>; def BNE64 : CBranch<0x05, "bne", setne, CPU64Regs>; def BGEZ64 : CBranchZero<0x01, 1, "bgez", setge, CPU64Regs>; @@ -187,7 +212,7 @@ def LEA_ADDiu64 : EffectiveAddress<"daddiu\t$rt, $addr", CPU64Regs, mem_ea_64>; } let Uses = [SP_64], DecoderNamespace = "Mips64" in def DynAlloc64 : EffectiveAddress<"daddiu\t$rt, $addr", CPU64Regs, mem_ea_64>, - Requires<[IsN64]> { + Requires<[IsN64, HasStandardEncoding]> { let isCodeGenOnly = 1; } let DecoderNamespace = "Mips64" in { @@ -209,48 +234,50 @@ def SLL64_64 : FR<0x0, 0x00, (outs CPU64Regs:$rd), (ins CPU64Regs:$rt), //===----------------------------------------------------------------------===// // extended loads -let Predicates = [NotN64] in { - def : Pat<(i64 (extloadi1 addr:$src)), (LB64 addr:$src)>; - def : Pat<(i64 (extloadi8 addr:$src)), (LB64 addr:$src)>; - def : Pat<(i64 (extloadi16_a addr:$src)), (LH64 addr:$src)>; - def : Pat<(i64 (extloadi16_u addr:$src)), (ULH64 addr:$src)>; - def : Pat<(i64 (extloadi32_a addr:$src)), (LW64 addr:$src)>; - def : Pat<(i64 (extloadi32_u addr:$src)), (ULW64 addr:$src)>; - def : Pat<(zextloadi32_u addr:$a), (DSRL (DSLL (ULW64 addr:$a), 32), 32)>; +let Predicates = [NotN64, HasStandardEncoding] in { + def : MipsPat<(i64 (extloadi1 addr:$src)), (LB64 addr:$src)>; + def : MipsPat<(i64 (extloadi8 addr:$src)), (LB64 addr:$src)>; + def : MipsPat<(i64 (extloadi16_a addr:$src)), (LH64 addr:$src)>; + def : MipsPat<(i64 (extloadi16_u addr:$src)), (ULH64 addr:$src)>; + def : MipsPat<(i64 (extloadi32_a addr:$src)), (LW64 addr:$src)>; + def : MipsPat<(i64 (extloadi32_u addr:$src)), (ULW64 addr:$src)>; + def : MipsPat<(zextloadi32_u addr:$a), (DSRL (DSLL (ULW64 addr:$a), 32), 32)>; } -let Predicates = [IsN64] in { - def : Pat<(i64 (extloadi1 addr:$src)), (LB64_P8 addr:$src)>; - def : Pat<(i64 (extloadi8 addr:$src)), (LB64_P8 addr:$src)>; - def : Pat<(i64 (extloadi16_a addr:$src)), (LH64_P8 addr:$src)>; - def : Pat<(i64 (extloadi16_u addr:$src)), (ULH64_P8 addr:$src)>; - def : Pat<(i64 (extloadi32_a addr:$src)), (LW64_P8 addr:$src)>; - def : Pat<(i64 (extloadi32_u addr:$src)), (ULW64_P8 addr:$src)>; - def : Pat<(zextloadi32_u addr:$a), (DSRL (DSLL (ULW64_P8 addr:$a), 32), 32)>; +let Predicates = [IsN64, HasStandardEncoding] in { + def : MipsPat<(i64 (extloadi1 addr:$src)), (LB64_P8 addr:$src)>; + def : MipsPat<(i64 (extloadi8 addr:$src)), (LB64_P8 addr:$src)>; + def : MipsPat<(i64 (extloadi16_a addr:$src)), (LH64_P8 addr:$src)>; + def : MipsPat<(i64 (extloadi16_u addr:$src)), (ULH64_P8 addr:$src)>; + def : MipsPat<(i64 (extloadi32_a addr:$src)), (LW64_P8 addr:$src)>; + def : MipsPat<(i64 (extloadi32_u addr:$src)), (ULW64_P8 addr:$src)>; + def : MipsPat<(zextloadi32_u addr:$a), + (DSRL (DSLL (ULW64_P8 addr:$a), 32), 32)>; } // hi/lo relocs -def : Pat<(MipsHi tglobaladdr:$in), 
(LUi64 tglobaladdr:$in)>; -def : Pat<(MipsHi tblockaddress:$in), (LUi64 tblockaddress:$in)>; -def : Pat<(MipsHi tjumptable:$in), (LUi64 tjumptable:$in)>; -def : Pat<(MipsHi tconstpool:$in), (LUi64 tconstpool:$in)>; -def : Pat<(MipsHi tglobaltlsaddr:$in), (LUi64 tglobaltlsaddr:$in)>; - -def : Pat<(MipsLo tglobaladdr:$in), (DADDiu ZERO_64, tglobaladdr:$in)>; -def : Pat<(MipsLo tblockaddress:$in), (DADDiu ZERO_64, tblockaddress:$in)>; -def : Pat<(MipsLo tjumptable:$in), (DADDiu ZERO_64, tjumptable:$in)>; -def : Pat<(MipsLo tconstpool:$in), (DADDiu ZERO_64, tconstpool:$in)>; -def : Pat<(MipsLo tglobaltlsaddr:$in), (DADDiu ZERO_64, tglobaltlsaddr:$in)>; - -def : Pat<(add CPU64Regs:$hi, (MipsLo tglobaladdr:$lo)), - (DADDiu CPU64Regs:$hi, tglobaladdr:$lo)>; -def : Pat<(add CPU64Regs:$hi, (MipsLo tblockaddress:$lo)), - (DADDiu CPU64Regs:$hi, tblockaddress:$lo)>; -def : Pat<(add CPU64Regs:$hi, (MipsLo tjumptable:$lo)), - (DADDiu CPU64Regs:$hi, tjumptable:$lo)>; -def : Pat<(add CPU64Regs:$hi, (MipsLo tconstpool:$lo)), - (DADDiu CPU64Regs:$hi, tconstpool:$lo)>; -def : Pat<(add CPU64Regs:$hi, (MipsLo tglobaltlsaddr:$lo)), - (DADDiu CPU64Regs:$hi, tglobaltlsaddr:$lo)>; +def : MipsPat<(MipsHi tglobaladdr:$in), (LUi64 tglobaladdr:$in)>; +def : MipsPat<(MipsHi tblockaddress:$in), (LUi64 tblockaddress:$in)>; +def : MipsPat<(MipsHi tjumptable:$in), (LUi64 tjumptable:$in)>; +def : MipsPat<(MipsHi tconstpool:$in), (LUi64 tconstpool:$in)>; +def : MipsPat<(MipsHi tglobaltlsaddr:$in), (LUi64 tglobaltlsaddr:$in)>; + +def : MipsPat<(MipsLo tglobaladdr:$in), (DADDiu ZERO_64, tglobaladdr:$in)>; +def : MipsPat<(MipsLo tblockaddress:$in), (DADDiu ZERO_64, tblockaddress:$in)>; +def : MipsPat<(MipsLo tjumptable:$in), (DADDiu ZERO_64, tjumptable:$in)>; +def : MipsPat<(MipsLo tconstpool:$in), (DADDiu ZERO_64, tconstpool:$in)>; +def : MipsPat<(MipsLo tglobaltlsaddr:$in), + (DADDiu ZERO_64, tglobaltlsaddr:$in)>; + +def : MipsPat<(add CPU64Regs:$hi, (MipsLo tglobaladdr:$lo)), + (DADDiu CPU64Regs:$hi, tglobaladdr:$lo)>; +def : MipsPat<(add CPU64Regs:$hi, (MipsLo tblockaddress:$lo)), + (DADDiu CPU64Regs:$hi, tblockaddress:$lo)>; +def : MipsPat<(add CPU64Regs:$hi, (MipsLo tjumptable:$lo)), + (DADDiu CPU64Regs:$hi, tjumptable:$lo)>; +def : MipsPat<(add CPU64Regs:$hi, (MipsLo tconstpool:$lo)), + (DADDiu CPU64Regs:$hi, tconstpool:$lo)>; +def : MipsPat<(add CPU64Regs:$hi, (MipsLo tglobaltlsaddr:$lo)), + (DADDiu CPU64Regs:$hi, tglobaltlsaddr:$lo)>; def : WrapperPat<tglobaladdr, DADDiu, CPU64Regs>; def : WrapperPat<tconstpool, DADDiu, CPU64Regs>; @@ -270,19 +297,22 @@ defm : SetgePats<CPU64Regs, SLT64, SLTu64>; defm : SetgeImmPats<CPU64Regs, SLTi64, SLTiu64>; // select MipsDynAlloc -def : Pat<(MipsDynAlloc addr:$f), (DynAlloc64 addr:$f)>, Requires<[IsN64]>; +def : MipsPat<(MipsDynAlloc addr:$f), (DynAlloc64 addr:$f)>, + Requires<[IsN64, HasStandardEncoding]>; // truncate -def : Pat<(i32 (trunc CPU64Regs:$src)), - (SLL (EXTRACT_SUBREG CPU64Regs:$src, sub_32), 0)>, Requires<[IsN64]>; +def : MipsPat<(i32 (trunc CPU64Regs:$src)), + (SLL (EXTRACT_SUBREG CPU64Regs:$src, sub_32), 0)>, + Requires<[IsN64, HasStandardEncoding]>; // 32-to-64-bit extension -def : Pat<(i64 (anyext CPURegs:$src)), (SLL64_32 CPURegs:$src)>; -def : Pat<(i64 (zext CPURegs:$src)), (DSRL (DSLL64_32 CPURegs:$src), 32)>; -def : Pat<(i64 (sext CPURegs:$src)), (SLL64_32 CPURegs:$src)>; +def : MipsPat<(i64 (anyext CPURegs:$src)), (SLL64_32 CPURegs:$src)>; +def : MipsPat<(i64 (zext CPURegs:$src)), (DSRL (DSLL64_32 CPURegs:$src), 32)>; +def : MipsPat<(i64 (sext 
CPURegs:$src)), (SLL64_32 CPURegs:$src)>; // Sign extend in register -def : Pat<(i64 (sext_inreg CPU64Regs:$src, i32)), (SLL64_64 CPU64Regs:$src)>; +def : MipsPat<(i64 (sext_inreg CPU64Regs:$src, i32)), + (SLL64_64 CPU64Regs:$src)>; -// bswap pattern -def : Pat<(bswap CPU64Regs:$rt), (DSHD (DSBH CPU64Regs:$rt))>; +// bswap pattern +def : MipsPat<(bswap CPU64Regs:$rt), (DSHD (DSBH CPU64Regs:$rt))>; diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp index 8206cfc..00ff754 100644 --- a/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/lib/Target/Mips/MipsAsmPrinter.cpp @@ -13,29 +13,29 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "mips-asm-printer" -#include "MipsAsmPrinter.h" #include "Mips.h" +#include "MipsAsmPrinter.h" #include "MipsInstrInfo.h" +#include "MipsMCInstLower.h" #include "InstPrinter/MipsInstPrinter.h" #include "MCTargetDesc/MipsBaseInfo.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Twine.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/BasicBlock.h" -#include "llvm/Instructions.h" -#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/InlineAsm.h" #include "llvm/Instructions.h" -#include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/TargetRegistry.h" #include "llvm/Target/Mangler.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLoweringObjectFile.h" @@ -43,19 +43,6 @@ using namespace llvm; -void MipsAsmPrinter::EmitInstrWithMacroNoAT(const MachineInstr *MI) { - MCInst TmpInst; - - MCInstLowering.Lower(MI, TmpInst); - OutStreamer.EmitRawText(StringRef("\t.set\tmacro")); - if (MipsFI->getEmitNOAT()) - OutStreamer.EmitRawText(StringRef("\t.set\tat")); - OutStreamer.EmitInstruction(TmpInst); - if (MipsFI->getEmitNOAT()) - OutStreamer.EmitRawText(StringRef("\t.set\tnoat")); - OutStreamer.EmitRawText(StringRef("\t.set\tnomacro")); -} - bool MipsAsmPrinter::runOnMachineFunction(MachineFunction &MF) { MipsFI = MF.getInfo<MipsFunctionInfo>(); AsmPrinter::runOnMachineFunction(MF); @@ -71,84 +58,33 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } - unsigned Opc = MI->getOpcode(); - MCInst TmpInst0; - SmallVector<MCInst, 4> MCInsts; - - switch (Opc) { - case Mips::ULW: - case Mips::ULH: - case Mips::ULHu: - case Mips::USW: - case Mips::USH: - case Mips::ULW_P8: - case Mips::ULH_P8: - case Mips::ULHu_P8: - case Mips::USW_P8: - case Mips::USH_P8: - case Mips::ULD: - case Mips::ULW64: - case Mips::ULH64: - case Mips::ULHu64: - case Mips::USD: - case Mips::USW64: - case Mips::USH64: - case Mips::ULD_P8: - case Mips::ULW64_P8: - case Mips::ULH64_P8: - case Mips::ULHu64_P8: - case Mips::USD_P8: - case Mips::USW64_P8: - case Mips::USH64_P8: { - if (OutStreamer.hasRawTextSupport()) { - EmitInstrWithMacroNoAT(MI); - return; - } - - MCInstLowering.LowerUnalignedLoadStore(MI, MCInsts); - for (SmallVector<MCInst, 4>::iterator I = MCInsts.begin(); I - != MCInsts.end(); ++I) - OutStreamer.EmitInstruction(*I); - - return; - } - case Mips::CPRESTORE: { - const MachineOperand &MO = MI->getOperand(0);
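The hi/lo relocation patterns above split a symbol's address into %hi and %lo halves: LUi64 installs the high half, and DADDiu adds the low half. Because daddiu sign-extends its 16-bit immediate, %hi has to pre-compensate by adding 0x8000 before shifting. A minimal standalone sketch of that arithmetic (plain C++ with hypothetical hi/lo helpers, assumed semantics, not code from this patch):

    #include <cassert>
    #include <cstdint>

    // %lo is the raw low 16 bits; %hi rounds up whenever %lo is negative
    // as a signed 16-bit value, so that (hi << 16) plus sign-extended lo
    // reconstructs the original address exactly.
    uint16_t hi(uint32_t addr) { return (uint16_t)((addr + 0x8000u) >> 16); }
    uint16_t lo(uint32_t addr) { return (uint16_t)addr; }

    int main() {
      const uint32_t addrs[] = {0x0040ABCDu, 0x0040FFF0u};
      for (uint32_t addr : addrs) {
        // lui $2, %hi(addr); addiu $2, $2, %lo(addr)
        uint32_t reg = (uint32_t)hi(addr) << 16;
        reg += (uint32_t)(int32_t)(int16_t)lo(addr); // addiu sign-extends
        assert(reg == addr); // note %hi(0x0040FFF0) is 0x0041, not 0x0040
      }
      return 0;
    }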
- assert(MO.isImm() && "CPRESTORE's operand must be an immediate."); - int64_t Offset = MO.getImm(); - - if (OutStreamer.hasRawTextSupport()) { - if (!isInt<16>(Offset)) { - EmitInstrWithMacroNoAT(MI); + // Direct object specific instruction lowering + if (!OutStreamer.hasRawTextSupport()) + switch (MI->getOpcode()) { + case Mips::DSLL: + case Mips::DSRL: + case Mips::DSRA: + assert(MI->getNumOperands() == 3 && + "Invalid no. of machine operands for shift!"); + assert(MI->getOperand(2).isImm()); + int64_t Shift = MI->getOperand(2).getImm(); + if (Shift > 31) { + MCInst TmpInst0; + MCInstLowering.LowerLargeShift(MI, TmpInst0, Shift - 32); + OutStreamer.EmitInstruction(TmpInst0); return; } - } else { - MCInstLowering.LowerCPRESTORE(Offset, MCInsts); - - for (SmallVector<MCInst, 4>::iterator I = MCInsts.begin(); - I != MCInsts.end(); ++I) - OutStreamer.EmitInstruction(*I); - - return; + break; } - break; - } - case Mips::SETGP01: { - MCInstLowering.LowerSETGP01(MI, MCInsts); - - for (SmallVector<MCInst, 4>::iterator I = MCInsts.begin(); - I != MCInsts.end(); ++I) - OutStreamer.EmitInstruction(*I); - - return; - } - default: - break; - } + MachineBasicBlock::const_instr_iterator I = MI; + MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); - MCInstLowering.Lower(MI, TmpInst0); - OutStreamer.EmitInstruction(TmpInst0); + do { + MCInst TmpInst0; + MCInstLowering.Lower(I++, TmpInst0); + OutStreamer.EmitInstruction(TmpInst0); + } while ((I != E) && I->isInsideBundle()); } //===----------------------------------------------------------------------===// @@ -197,9 +133,9 @@ void MipsAsmPrinter::printSavedRegsBitmask(raw_ostream &O) { const MachineFrameInfo *MFI = MF->getFrameInfo(); const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); // size of stack area to which FP callee-saved regs are saved. - unsigned CPURegSize = Mips::CPURegsRegisterClass->getSize(); - unsigned FGR32RegSize = Mips::FGR32RegisterClass->getSize(); - unsigned AFGR64RegSize = Mips::AFGR64RegisterClass->getSize(); + unsigned CPURegSize = Mips::CPURegsRegClass.getSize(); + unsigned FGR32RegSize = Mips::FGR32RegClass.getSize(); + unsigned AFGR64RegSize = Mips::AFGR64RegClass.getSize(); bool HasAFGR64Reg = false; unsigned CSFPRegsSize = 0; unsigned i, e = CSI.size(); @@ -207,11 +143,11 @@ void MipsAsmPrinter::printSavedRegsBitmask(raw_ostream &O) { // Set FPU Bitmask. 
for (i = 0; i != e; ++i) { unsigned Reg = CSI[i].getReg(); - if (Mips::CPURegsRegisterClass->contains(Reg)) + if (Mips::CPURegsRegClass.contains(Reg)) break; unsigned RegNum = getMipsRegisterNumbering(Reg); - if (Mips::AFGR64RegisterClass->contains(Reg)) { + if (Mips::AFGR64RegClass.contains(Reg)) { FPUBitmask |= (3 << RegNum); CSFPRegsSize += AFGR64RegSize; HasAFGR64Reg = true; @@ -283,8 +219,15 @@ const char *MipsAsmPrinter::getCurrentABIString() const { } void MipsAsmPrinter::EmitFunctionEntryLabel() { - if (OutStreamer.hasRawTextSupport()) + if (OutStreamer.hasRawTextSupport()) { + if (Subtarget->inMips16Mode()) + OutStreamer.EmitRawText(StringRef("\t.set\tmips16")); + else + OutStreamer.EmitRawText(StringRef("\t.set\tnomips16")); + // leave out until FSF available gas has micromips changes + // OutStreamer.EmitRawText(StringRef("\t.set\tnomicromips")); OutStreamer.EmitRawText("\t.ent\t" + Twine(CurrentFnSym->getName())); + } OutStreamer.EmitLabel(CurrentFnSym); } @@ -295,10 +238,6 @@ void MipsAsmPrinter::EmitFunctionBodyStart() { emitFrameDirective(); - bool EmitCPLoad = (MF->getTarget().getRelocationModel() == Reloc::PIC_) && - Subtarget->isABI_O32() && MipsFI->globalBaseRegSet() && - MipsFI->globalBaseRegFixed(); - if (OutStreamer.hasRawTextSupport()) { SmallString<128> Str; raw_svector_ostream OS(Str); @@ -306,20 +245,9 @@ void MipsAsmPrinter::EmitFunctionBodyStart() { OutStreamer.EmitRawText(OS.str()); OutStreamer.EmitRawText(StringRef("\t.set\tnoreorder")); - - // Emit .cpload directive if needed. - if (EmitCPLoad) - OutStreamer.EmitRawText(StringRef("\t.cpload\t$25")); - OutStreamer.EmitRawText(StringRef("\t.set\tnomacro")); if (MipsFI->getEmitNOAT()) OutStreamer.EmitRawText(StringRef("\t.set\tnoat")); - } else if (EmitCPLoad) { - SmallVector<MCInst, 4> MCInsts; - MCInstLowering.LowerCPLOAD(MCInsts); - for (SmallVector<MCInst, 4>::iterator I = MCInsts.begin(); - I != MCInsts.end(); ++I) - OutStreamer.EmitInstruction(*I); } } @@ -382,14 +310,99 @@ bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock* } // Print out an operand for an inline asm expression. -bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, +bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, unsigned AsmVariant,const char *ExtraCode, raw_ostream &O) { // Does this asm operand have a single letter operand modifier? - if (ExtraCode && ExtraCode[0]) - return true; // Unknown modifier. + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. 
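In the printSavedRegsBitmask hunk above, each saved single-precision register contributes one bit to the FPU bitmask, while a saved AFGR64 double occupies an even/odd pair of single-precision registers, which is why it ORs in 3 << RegNum. A standalone model of that computation (an assumed simplification, not the real function):

    #include <cassert>
    #include <cstdint>

    // One bit per saved $f register; a double covers $f(2N) and $f(2N+1).
    uint32_t fpuBitmask(const unsigned *RegNums, const bool *IsDouble,
                        unsigned N) {
      uint32_t Mask = 0;
      for (unsigned I = 0; I < N; ++I)
        Mask |= (IsDouble[I] ? 3u : 1u) << RegNums[I];
      return Mask;
    }

    int main() {
      unsigned Regs[] = {20, 22};      // $f20, and the $f22/$f23 pair
      bool Dbl[] = {false, true};
      assert(fpuBitmask(Regs, Dbl, 2) == ((1u << 20) | (3u << 22)));
      return 0;
    }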
- printOperand(MI, OpNo, O); + const MachineOperand &MO = MI->getOperand(OpNum); + switch (ExtraCode[0]) { + default: + // See if this is a generic print operand + return AsmPrinter::PrintAsmOperand(MI,OpNum,AsmVariant,ExtraCode,O); + case 'X': // hex const int + if ((MO.getType()) != MachineOperand::MO_Immediate) + return true; + O << "0x" << StringRef(utohexstr(MO.getImm())).lower(); + return false; + case 'x': // hex const int (low 16 bits) + if ((MO.getType()) != MachineOperand::MO_Immediate) + return true; + O << "0x" << StringRef(utohexstr(MO.getImm() & 0xffff)).lower(); + return false; + case 'd': // decimal const int + if ((MO.getType()) != MachineOperand::MO_Immediate) + return true; + O << MO.getImm(); + return false; + case 'm': // decimal const int minus 1 + if ((MO.getType()) != MachineOperand::MO_Immediate) + return true; + O << MO.getImm() - 1; + return false; + case 'z': { + // $0 if zero, regular printing otherwise + if (MO.getType() != MachineOperand::MO_Immediate) + return true; + int64_t Val = MO.getImm(); + if (Val) + O << Val; + else + O << "$0"; + return false; + } + case 'D': // Second part of a double word register operand + case 'L': // Low order register of a double word register operand + case 'M': // High order register of a double word register operand + { + if (OpNum == 0) + return true; + const MachineOperand &FlagsOP = MI->getOperand(OpNum - 1); + if (!FlagsOP.isImm()) + return true; + unsigned Flags = FlagsOP.getImm(); + unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags); + // Number of registers represented by this operand. We are looking + // for 2 for 32 bit mode and 1 for 64 bit mode. + if (NumVals != 2) { + if (Subtarget->isGP64bit() && NumVals == 1 && MO.isReg()) { + unsigned Reg = MO.getReg(); + O << '$' << MipsInstPrinter::getRegisterName(Reg); + return false; + } + return true; + } + + unsigned RegOp = OpNum; + if (!Subtarget->isGP64bit()){ + // Endianness reverses which register holds the high or low value + // between M and L. + switch(ExtraCode[0]) { + case 'M': + RegOp = (Subtarget->isLittle()) ? OpNum + 1 : OpNum; + break; + case 'L': + RegOp = (Subtarget->isLittle()) ? OpNum : OpNum + 1; + break; + case 'D': // Always the second part + RegOp = OpNum + 1; + } + if (RegOp >= MI->getNumOperands()) + return true; + const MachineOperand &MO = MI->getOperand(RegOp); + if (!MO.isReg()) + return true; + unsigned Reg = MO.getReg(); + O << '$' << MipsInstPrinter::getRegisterName(Reg); + return false; + } + } + } + + printOperand(MI, OpNum, O); return false; } @@ -398,11 +411,12 @@ bool MipsAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, const char *ExtraCode, raw_ostream &O) { if (ExtraCode && ExtraCode[0]) - return true; // Unknown modifier. + return true; // Unknown modifier.
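The 'M', 'L' and 'D' modifiers handled above pick one register out of a 32-bit register pair, and on 32-bit targets endianness decides which half of the pair holds the high-order word. A standalone sketch of just that index selection (assumed model; pairIndex is a hypothetical helper, not part of the patch):

    #include <cassert>

    // Offset (0 or 1) of the register to print, relative to the first
    // register of the pair.
    unsigned pairIndex(char Modifier, bool IsLittleEndian) {
      switch (Modifier) {
      case 'M': return IsLittleEndian ? 1 : 0; // high-order word
      case 'L': return IsLittleEndian ? 0 : 1; // low-order word
      case 'D': return 1;                      // always the second register
      }
      return 0;
    }

    int main() {
      assert(pairIndex('M', true) == 1);  // little-endian: high word is 2nd
      assert(pairIndex('M', false) == 0); // big-endian: high word is 1st
      assert(pairIndex('D', true) == 1 && pairIndex('D', false) == 1);
      return 0;
    }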
const MachineOperand &MO = MI->getOperand(OpNum); assert(MO.isReg() && "unexpected inline asm memory operand"); O << "0($" << MipsInstPrinter::getRegisterName(MO.getReg()) << ")"; + return false; } @@ -450,7 +464,7 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum, break; case MachineOperand::MO_BlockAddress: { - MCSymbol* BA = GetBlockAddressSymbol(MO.getBlockAddress()); + MCSymbol *BA = GetBlockAddressSymbol(MO.getBlockAddress()); O << BA->getName(); break; } @@ -511,7 +525,7 @@ printMemOperandEA(const MachineInstr *MI, int opNum, raw_ostream &O) { void MipsAsmPrinter:: printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O, const char *Modifier) { - const MachineOperand& MO = MI->getOperand(opNum); + const MachineOperand &MO = MI->getOperand(opNum); O << Mips::MipsFCCToString((Mips::CondCode)MO.getImm()); } diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td index 4b7e1d3..8aadefd 100644 --- a/lib/Target/Mips/MipsCallingConv.td +++ b/lib/Target/Mips/MipsCallingConv.td @@ -145,6 +145,58 @@ def RetCC_MipsEABI : CallingConv<[ ]>; //===----------------------------------------------------------------------===// +// Mips FastCC Calling Convention +//===----------------------------------------------------------------------===// +def CC_MipsO32_FastCC : CallingConv<[ + // f64 arguments are passed in double-precision floating-point registers. + CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7, D8, D9]>>, + + // Stack parameter slots for f64 are 64-bit doublewords and 8-byte aligned. + CCIfType<[f64], CCAssignToStack<8, 8>> +]>; + +def CC_MipsN_FastCC : CallingConv<[ + // Integer arguments are passed in integer registers. + CCIfType<[i64], CCAssignToReg<[A0_64, A1_64, A2_64, A3_64, T0_64, T1_64, + T2_64, T3_64, T4_64, T5_64, T6_64, T7_64, + T8_64, V1_64]>>, + + // f64 arguments are passed in double-precision floating-point registers. + CCIfType<[f64], CCAssignToReg<[D0_64, D1_64, D2_64, D3_64, D4_64, D5_64, + D6_64, D7_64, D8_64, D9_64, D10_64, D11_64, + D12_64, D13_64, D14_64, D15_64, D16_64, D17_64, + D18_64, D19_64]>>, + + // Stack parameter slots for i64 and f64 are 64-bit doublewords and + // 8-byte aligned. + CCIfType<[i64, f64], CCAssignToStack<8, 8>> +]>; + +def CC_Mips_FastCC : CallingConv<[ + // Handles byval parameters. + CCIfByVal<CCPassByVal<4, 4>>, + + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // Integer arguments are passed in integer registers. All scratch registers, + // except for AT, V0 and T9, are available to be used as argument registers. + CCIfType<[i32], CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3, T4, T5, T6, + T7, T8, V1]>>, + + // f32 arguments are passed in single-precision floating-point registers. + CCIfType<[f32], CCAssignToReg<[F0, F1, F2, F3, F4, F5, F6, F7, F8, F9, F10, + F11, F12, F13, F14, F15, F16, F17, F18, F19]>>, + + // Stack parameter slots for i32 and f32 are 32-bit words and 4-byte aligned.
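CC_Mips_FastCC above hands i32 arguments out of a 14-entry scratch-register list and, once those run out, falls back to the 4-byte, 4-aligned stack slots assigned by the CCAssignToStack line that follows this aside. A rough standalone model of that register-then-stack order (illustrative only, not TableGen semantics):

    #include <cstdio>

    int main() {
      const char *Regs[] = {"a0", "a1", "a2", "a3", "t0", "t1", "t2",
                            "t3", "t4", "t5", "t6", "t7", "t8", "v1"};
      const unsigned NumRegs = sizeof(Regs) / sizeof(Regs[0]);
      unsigned NextReg = 0, NextOff = 0;
      for (unsigned Arg = 0; Arg < 16; ++Arg) { // 16 hypothetical i32 args
        if (NextReg < NumRegs)
          std::printf("arg%u -> $%s\n", Arg, Regs[NextReg++]);
        else {
          std::printf("arg%u -> %u($sp)\n", Arg, NextOff);
          NextOff += 4; // CCAssignToStack<4, 4>: word-sized, word-aligned
        }
      }
      return 0;
    }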
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + + CCIfSubtarget<"isABI_EABI()", CCDelegateTo<CC_MipsEABI>>, + CCIfSubtarget<"isABI_O32()", CCDelegateTo<CC_MipsO32_FastCC>>, + CCDelegateTo<CC_MipsN_FastCC> +]>; + +//===----------------------------------------------------------------------===// // Mips Calling Convention Dispatch //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MipsCodeEmitter.cpp b/lib/Target/Mips/MipsCodeEmitter.cpp index 7d81902..cb7022b 100644 --- a/lib/Target/Mips/MipsCodeEmitter.cpp +++ b/lib/Target/Mips/MipsCodeEmitter.cpp @@ -145,8 +145,8 @@ bool MipsCodeEmitter::runOnMachineFunction(MachineFunction &MF) { for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); MBB != E; ++MBB){ MCE.StartMachineBasicBlock(MBB); - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); - I != E; ++I) + for (MachineBasicBlock::instr_iterator I = MBB->instr_begin(), + E = MBB->instr_end(); I != E; ++I) emitInstruction(*I); } } while (MCE.finishFunction(MF)); @@ -258,7 +258,7 @@ void MipsCodeEmitter::emitGlobalAddressUnaligned(const GlobalValue *GV, void MipsCodeEmitter:: emitExternalSymbolAddress(const char *ES, unsigned Reloc) const { MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(), - Reloc, ES, 0, 0, false)); + Reloc, ES, 0, 0)); } void MipsCodeEmitter::emitConstPoolAddress(unsigned CPI, unsigned Reloc) const { diff --git a/lib/Target/Mips/MipsCondMov.td b/lib/Target/Mips/MipsCondMov.td index da33680..b12b1f2 100644 --- a/lib/Target/Mips/MipsCondMov.td +++ b/lib/Target/Mips/MipsCondMov.td @@ -61,41 +61,54 @@ multiclass MovzPats0<RegisterClass CRC, RegisterClass DRC, Instruction MOVZInst, Instruction SLTOp, Instruction SLTuOp, Instruction SLTiOp, Instruction SLTiuOp> { - def : Pat<(select (i32 (setge CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F), - (MOVZInst DRC:$T, (SLTOp CRC:$lhs, CRC:$rhs), DRC:$F)>; - def : Pat<(select (i32 (setuge CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F), - (MOVZInst DRC:$T, (SLTuOp CRC:$lhs, CRC:$rhs), DRC:$F)>; - def : Pat<(select (i32 (setge CRC:$lhs, immSExt16:$rhs)), DRC:$T, DRC:$F), - (MOVZInst DRC:$T, (SLTiOp CRC:$lhs, immSExt16:$rhs), DRC:$F)>; - def : Pat<(select (i32 (setuge CRC:$lh, immSExt16:$rh)), DRC:$T, DRC:$F), - (MOVZInst DRC:$T, (SLTiuOp CRC:$lh, immSExt16:$rh), DRC:$F)>; - def : Pat<(select (i32 (setle CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F), - (MOVZInst DRC:$T, (SLTOp CRC:$rhs, CRC:$lhs), DRC:$F)>; - def : Pat<(select (i32 (setule CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F), - (MOVZInst DRC:$T, (SLTuOp CRC:$rhs, CRC:$lhs), DRC:$F)>; + def : MipsPat<(select (i32 (setge CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F), + (MOVZInst DRC:$T, (SLTOp CRC:$lhs, CRC:$rhs), DRC:$F)>; + def : MipsPat< + (select (i32 (setuge CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F), + (MOVZInst DRC:$T, (SLTuOp CRC:$lhs, CRC:$rhs), DRC:$F)>; + def : MipsPat< + (select (i32 (setge CRC:$lhs, immSExt16:$rhs)), DRC:$T, DRC:$F), + (MOVZInst DRC:$T, (SLTiOp CRC:$lhs, immSExt16:$rhs), DRC:$F)>; + def : MipsPat< + (select (i32 (setuge CRC:$lh, immSExt16:$rh)), DRC:$T, DRC:$F), + (MOVZInst DRC:$T, (SLTiuOp CRC:$lh, immSExt16:$rh), DRC:$F)>; + def : MipsPat< + (select (i32 (setle CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F), + (MOVZInst DRC:$T, (SLTOp CRC:$rhs, CRC:$lhs), DRC:$F)>; + def : MipsPat< + (select (i32 (setule CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F), + (MOVZInst DRC:$T, (SLTuOp CRC:$rhs, CRC:$lhs), DRC:$F)>; } multiclass MovzPats1<RegisterClass CRC, RegisterClass DRC, Instruction MOVZInst, Instruction XOROp> { - 
def : Pat<(select (i32 (seteq CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F), - (MOVZInst DRC:$T, (XOROp CRC:$lhs, CRC:$rhs), DRC:$F)>; - def : Pat<(select (i32 (seteq CRC:$lhs, 0)), DRC:$T, DRC:$F), - (MOVZInst DRC:$T, CRC:$lhs, DRC:$F)>; + def : MipsPat<(select (i32 (seteq CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F), + (MOVZInst DRC:$T, (XOROp CRC:$lhs, CRC:$rhs), DRC:$F)>; + def : MipsPat<(select (i32 (seteq CRC:$lhs, 0)), DRC:$T, DRC:$F), + (MOVZInst DRC:$T, CRC:$lhs, DRC:$F)>; +} + +multiclass MovzPats2<RegisterClass CRC, RegisterClass DRC, + Instruction MOVZInst, Instruction XORiOp> { + def : MipsPat< + (select (i32 (seteq CRC:$lhs, immZExt16:$uimm16)), DRC:$T, DRC:$F), + (MOVZInst DRC:$T, (XORiOp CRC:$lhs, immZExt16:$uimm16), DRC:$F)>; } multiclass MovnPats<RegisterClass CRC, RegisterClass DRC, Instruction MOVNInst, Instruction XOROp> { - def : Pat<(select (i32 (setne CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F), - (MOVNInst DRC:$T, (XOROp CRC:$lhs, CRC:$rhs), DRC:$F)>; - def : Pat<(select CRC:$cond, DRC:$T, DRC:$F), - (MOVNInst DRC:$T, CRC:$cond, DRC:$F)>; - def : Pat<(select (i32 (setne CRC:$lhs, 0)),DRC:$T, DRC:$F), - (MOVNInst DRC:$T, CRC:$lhs, DRC:$F)>; + def : MipsPat<(select (i32 (setne CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F), + (MOVNInst DRC:$T, (XOROp CRC:$lhs, CRC:$rhs), DRC:$F)>; + def : MipsPat<(select CRC:$cond, DRC:$T, DRC:$F), + (MOVNInst DRC:$T, CRC:$cond, DRC:$F)>; + def : MipsPat<(select (i32 (setne CRC:$lhs, 0)),DRC:$T, DRC:$F), + (MOVNInst DRC:$T, CRC:$lhs, DRC:$F)>; } // Instantiation of instructions. def MOVZ_I_I : CondMovIntInt<CPURegs, CPURegs, 0x0a, "movz">; -let Predicates = [HasMips64],DecoderNamespace = "Mips64" in { +let Predicates = [HasMips64, HasStandardEncoding], + DecoderNamespace = "Mips64" in { def MOVZ_I_I64 : CondMovIntInt<CPURegs, CPU64Regs, 0x0a, "movz">; def MOVZ_I64_I : CondMovIntInt<CPU64Regs, CPURegs, 0x0a, "movz"> { let isCodeGenOnly = 1; @@ -106,7 +119,8 @@ let Predicates = [HasMips64],DecoderNamespace = "Mips64" in { } def MOVN_I_I : CondMovIntInt<CPURegs, CPURegs, 0x0b, "movn">; -let Predicates = [HasMips64],DecoderNamespace = "Mips64" in { +let Predicates = [HasMips64, HasStandardEncoding], + DecoderNamespace = "Mips64" in { def MOVN_I_I64 : CondMovIntInt<CPURegs, CPU64Regs, 0x0b, "movn">; def MOVN_I64_I : CondMovIntInt<CPU64Regs, CPURegs, 0x0b, "movn"> { let isCodeGenOnly = 1; @@ -118,21 +132,22 @@ let Predicates = [HasMips64],DecoderNamespace = "Mips64" in { def MOVZ_I_S : CondMovIntFP<CPURegs, FGR32, 16, 18, "movz.s">; def MOVZ_I64_S : CondMovIntFP<CPU64Regs, FGR32, 16, 18, "movz.s">, - Requires<[HasMips64]> { + Requires<[HasMips64, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; } def MOVN_I_S : CondMovIntFP<CPURegs, FGR32, 16, 19, "movn.s">; def MOVN_I64_S : CondMovIntFP<CPU64Regs, FGR32, 16, 19, "movn.s">, - Requires<[HasMips64]> { + Requires<[HasMips64, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; } -let Predicates = [NotFP64bit] in { +let Predicates = [NotFP64bit, HasStandardEncoding] in { def MOVZ_I_D32 : CondMovIntFP<CPURegs, AFGR64, 17, 18, "movz.d">; def MOVN_I_D32 : CondMovIntFP<CPURegs, AFGR64, 17, 19, "movn.d">; } -let Predicates = [IsFP64bit],DecoderNamespace = "Mips64" in { +let Predicates = [IsFP64bit, HasStandardEncoding], + DecoderNamespace = "Mips64" in { def MOVZ_I_D64 : CondMovIntFP<CPURegs, FGR64, 17, 18, "movz.d">; def MOVZ_I64_D64 : CondMovIntFP<CPU64Regs, FGR64, 17, 18, "movz.d"> { let isCodeGenOnly = 1; @@ -145,24 +160,25 @@ let Predicates = [IsFP64bit],DecoderNamespace = "Mips64" in { def MOVT_I : 
CondMovFPInt<CPURegs, MipsCMovFP_T, 1, "movt">; def MOVT_I64 : CondMovFPInt<CPU64Regs, MipsCMovFP_T, 1, "movt">, - Requires<[HasMips64]> { + Requires<[HasMips64, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; } def MOVF_I : CondMovFPInt<CPURegs, MipsCMovFP_F, 0, "movf">; def MOVF_I64 : CondMovFPInt<CPU64Regs, MipsCMovFP_F, 0, "movf">, - Requires<[HasMips64]> { + Requires<[HasMips64, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; } def MOVT_S : CondMovFPFP<FGR32, MipsCMovFP_T, 16, 1, "movt.s">; def MOVF_S : CondMovFPFP<FGR32, MipsCMovFP_F, 16, 0, "movf.s">; -let Predicates = [NotFP64bit] in { +let Predicates = [NotFP64bit, HasStandardEncoding] in { def MOVT_D32 : CondMovFPFP<AFGR64, MipsCMovFP_T, 17, 1, "movt.d">; def MOVF_D32 : CondMovFPFP<AFGR64, MipsCMovFP_F, 17, 0, "movf.d">; } -let Predicates = [IsFP64bit], DecoderNamespace = "Mips64" in { +let Predicates = [IsFP64bit, HasStandardEncoding], + DecoderNamespace = "Mips64" in { def MOVT_D64 : CondMovFPFP<FGR64, MipsCMovFP_T, 17, 1, "movt.d">; def MOVF_D64 : CondMovFPFP<FGR64, MipsCMovFP_F, 17, 0, "movf.d">; } @@ -170,7 +186,8 @@ let Predicates = [IsFP64bit], DecoderNamespace = "Mips64" in { // Instantiation of conditional move patterns. defm : MovzPats0<CPURegs, CPURegs, MOVZ_I_I, SLT, SLTu, SLTi, SLTiu>; defm : MovzPats1<CPURegs, CPURegs, MOVZ_I_I, XOR>; -let Predicates = [HasMips64] in { +defm : MovzPats2<CPURegs, CPURegs, MOVZ_I_I, XORi>; +let Predicates = [HasMips64, HasStandardEncoding] in { defm : MovzPats0<CPURegs, CPU64Regs, MOVZ_I_I64, SLT, SLTu, SLTi, SLTiu>; defm : MovzPats0<CPU64Regs, CPURegs, MOVZ_I_I, SLT64, SLTu64, SLTi64, SLTiu64>; @@ -179,10 +196,13 @@ let Predicates = [HasMips64] in { defm : MovzPats1<CPURegs, CPU64Regs, MOVZ_I_I64, XOR>; defm : MovzPats1<CPU64Regs, CPURegs, MOVZ_I64_I, XOR64>; defm : MovzPats1<CPU64Regs, CPU64Regs, MOVZ_I64_I64, XOR64>; + defm : MovzPats2<CPURegs, CPU64Regs, MOVZ_I_I64, XORi>; + defm : MovzPats2<CPU64Regs, CPURegs, MOVZ_I64_I, XORi64>; + defm : MovzPats2<CPU64Regs, CPU64Regs, MOVZ_I64_I64, XORi64>; } defm : MovnPats<CPURegs, CPURegs, MOVN_I_I, XOR>; -let Predicates = [HasMips64] in { +let Predicates = [HasMips64, HasStandardEncoding] in { defm : MovnPats<CPURegs, CPU64Regs, MOVN_I_I64, XOR>; defm : MovnPats<CPU64Regs, CPURegs, MOVN_I64_I, XOR64>; defm : MovnPats<CPU64Regs, CPU64Regs, MOVN_I64_I64, XOR64>; @@ -191,19 +211,19 @@ let Predicates = [HasMips64] in { defm : MovzPats0<CPURegs, FGR32, MOVZ_I_S, SLT, SLTu, SLTi, SLTiu>; defm : MovzPats1<CPURegs, FGR32, MOVZ_I_S, XOR>; defm : MovnPats<CPURegs, FGR32, MOVN_I_S, XOR>; -let Predicates = [HasMips64] in { +let Predicates = [HasMips64, HasStandardEncoding] in { defm : MovzPats0<CPU64Regs, FGR32, MOVZ_I_S, SLT64, SLTu64, SLTi64, SLTiu64>; defm : MovzPats1<CPU64Regs, FGR32, MOVZ_I64_S, XOR64>; defm : MovnPats<CPU64Regs, FGR32, MOVN_I64_S, XOR64>; } -let Predicates = [NotFP64bit] in { +let Predicates = [NotFP64bit, HasStandardEncoding] in { defm : MovzPats0<CPURegs, AFGR64, MOVZ_I_D32, SLT, SLTu, SLTi, SLTiu>; defm : MovzPats1<CPURegs, AFGR64, MOVZ_I_D32, XOR>; defm : MovnPats<CPURegs, AFGR64, MOVN_I_D32, XOR>; } -let Predicates = [IsFP64bit] in { +let Predicates = [IsFP64bit, HasStandardEncoding] in { defm : MovzPats0<CPURegs, FGR64, MOVZ_I_D64, SLT, SLTu, SLTi, SLTiu>; defm : MovzPats0<CPU64Regs, FGR64, MOVZ_I_D64, SLT64, SLTu64, SLTi64, SLTiu64>; diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp index debf2f1..2bba8a3 100644 --- a/lib/Target/Mips/MipsDelaySlotFiller.cpp 
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp @@ -36,12 +36,21 @@ static cl::opt<bool> EnableDelaySlotFiller( cl::desc("Fill the Mips delay slots with useful instructions."), cl::Hidden); +// This option can be used to silence complaints by machine verifier passes. +static cl::opt<bool> SkipDelaySlotFiller( + "skip-mips-delay-filler", + cl::init(false), + cl::desc("Skip MIPS' delay slot filling pass."), + cl::Hidden); + namespace { struct Filler : public MachineFunctionPass { + typedef MachineBasicBlock::instr_iterator InstrIter; + typedef MachineBasicBlock::reverse_instr_iterator ReverseInstrIter; TargetMachine &TM; const TargetInstrInfo *TII; - MachineBasicBlock::iterator LastFiller; + InstrIter LastFiller; static char ID; Filler(TargetMachine &tm) @@ -53,6 +62,9 @@ namespace { bool runOnMachineBasicBlock(MachineBasicBlock &MBB); bool runOnMachineFunction(MachineFunction &F) { + if (SkipDelaySlotFiller) + return false; + bool Changed = false; for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) @@ -61,27 +73,27 @@ } bool isDelayFiller(MachineBasicBlock &MBB, - MachineBasicBlock::iterator candidate); + InstrIter candidate); - void insertCallUses(MachineBasicBlock::iterator MI, - SmallSet<unsigned, 32>& RegDefs, - SmallSet<unsigned, 32>& RegUses); + void insertCallUses(InstrIter MI, + SmallSet<unsigned, 32> &RegDefs, + SmallSet<unsigned, 32> &RegUses); - void insertDefsUses(MachineBasicBlock::iterator MI, - SmallSet<unsigned, 32>& RegDefs, - SmallSet<unsigned, 32>& RegUses); + void insertDefsUses(InstrIter MI, + SmallSet<unsigned, 32> &RegDefs, + SmallSet<unsigned, 32> &RegUses); - bool IsRegInSet(SmallSet<unsigned, 32>& RegSet, + bool IsRegInSet(SmallSet<unsigned, 32> &RegSet, unsigned Reg); - bool delayHasHazard(MachineBasicBlock::iterator candidate, + bool delayHasHazard(InstrIter candidate, bool &sawLoad, bool &sawStore, SmallSet<unsigned, 32> &RegDefs, SmallSet<unsigned, 32> &RegUses); bool - findDelayInstr(MachineBasicBlock &MBB, MachineBasicBlock::iterator slot, - MachineBasicBlock::iterator &Filler); + findDelayInstr(MachineBasicBlock &MBB, InstrIter slot, + InstrIter &Filler); }; @@ -93,14 +105,14 @@ namespace { bool Filler:: runOnMachineBasicBlock(MachineBasicBlock &MBB) { bool Changed = false; - LastFiller = MBB.end(); + LastFiller = MBB.instr_end(); - for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) + for (InstrIter I = MBB.instr_begin(); I != MBB.instr_end(); ++I) if (I->hasDelaySlot()) { ++FilledSlots; Changed = true; - MachineBasicBlock::iterator D; + InstrIter D; if (EnableDelaySlotFiller && findDelayInstr(MBB, I, D)) { MBB.splice(llvm::next(I), &MBB, D); @@ -111,6 +123,10 @@ runOnMachineBasicBlock(MachineBasicBlock &MBB) { // Record the filler instruction that filled the delay slot. // The instruction after it will be visited in the next iteration. LastFiller = ++I; + + // Set InsideBundle bit so that the machine verifier doesn't expect this + // instruction to be a terminator.
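The InsideBundle marking described in the comment above (and applied on the next line) is what lets the branch and its filler travel as one unit; the bundle-aware do/while loop added to MipsAsmPrinter::EmitInstruction earlier in this patch then emits the whole bundle when it reaches its head. A toy model of that emission loop (assumed behavior, standalone C++, not LLVM code):

    #include <cstdio>
    #include <vector>

    struct MI { const char *Text; bool InsideBundle; };

    // Emit the instruction at I plus every following instruction that is
    // marked as part of the same bundle.
    void emitBundle(const std::vector<MI> &Block, size_t I) {
      do
        std::printf("\t%s\n", Block[I].Text);
      while (++I < Block.size() && Block[I].InsideBundle);
    }

    int main() {
      std::vector<MI> Block = {{"jr\t$ra", false},
                               {"addiu\t$sp, $sp, 32", true}};
      emitBundle(Block, 0); // prints the branch, then its bundled filler
      return 0;
    }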
+ LastFiller->setIsInsideBundle(); } return Changed; @@ -123,8 +139,8 @@ FunctionPass *llvm::createMipsDelaySlotFillerPass(MipsTargetMachine &tm) { } bool Filler::findDelayInstr(MachineBasicBlock &MBB, - MachineBasicBlock::iterator slot, - MachineBasicBlock::iterator &Filler) { + InstrIter slot, + InstrIter &Filler) { SmallSet<unsigned, 32> RegDefs; SmallSet<unsigned, 32> RegUses; @@ -133,13 +149,13 @@ bool Filler::findDelayInstr(MachineBasicBlock &MBB, bool sawLoad = false; bool sawStore = false; - for (MachineBasicBlock::reverse_iterator I(slot); I != MBB.rend(); ++I) { + for (ReverseInstrIter I(slot); I != MBB.instr_rend(); ++I) { // skip debug value if (I->isDebugValue()) continue; // Convert to forward iterator. - MachineBasicBlock::iterator FI(llvm::next(I).base()); + InstrIter FI(llvm::next(I).base()); if (I->hasUnmodeledSideEffects() || I->isInlineAsm() @@ -165,7 +181,7 @@ bool Filler::findDelayInstr(MachineBasicBlock &MBB, return false; } -bool Filler::delayHasHazard(MachineBasicBlock::iterator candidate, +bool Filler::delayHasHazard(InstrIter candidate, bool &sawLoad, bool &sawStore, SmallSet<unsigned, 32> &RegDefs, SmallSet<unsigned, 32> &RegUses) { @@ -213,9 +229,9 @@ bool Filler::delayHasHazard(MachineBasicBlock::iterator candidate, } // Insert Defs and Uses of MI into the sets RegDefs and RegUses. -void Filler::insertDefsUses(MachineBasicBlock::iterator MI, - SmallSet<unsigned, 32>& RegDefs, - SmallSet<unsigned, 32>& RegUses) { +void Filler::insertDefsUses(InstrIter MI, + SmallSet<unsigned, 32> &RegDefs, + SmallSet<unsigned, 32> &RegUses) { // If MI is a call or return, just examine the explicit non-variadic operands. MCInstrDesc MCID = MI->getDesc(); unsigned e = MI->isCall() || MI->isReturn() ? MCID.getNumOperands() : @@ -240,14 +256,11 @@ void Filler::insertDefsUses(MachineBasicBlock::iterator MI, } //returns true if the Reg or its alias is in the RegSet. -bool Filler::IsRegInSet(SmallSet<unsigned, 32>& RegSet, unsigned Reg) { - if (RegSet.count(Reg)) - return true; - // check Aliased Registers - for (const uint16_t *Alias = TM.getRegisterInfo()->getAliasSet(Reg); - *Alias; ++Alias) - if (RegSet.count(*Alias)) +bool Filler::IsRegInSet(SmallSet<unsigned, 32> &RegSet, unsigned Reg) { + // Check Reg and all aliased Registers. + for (MCRegAliasIterator AI(Reg, TM.getRegisterInfo(), true); + AI.isValid(); ++AI) + if (RegSet.count(*AI)) return true; - return false; } diff --git a/lib/Target/Mips/MipsEmitGPRestore.cpp b/lib/Target/Mips/MipsEmitGPRestore.cpp deleted file mode 100644 index 119d1a8..0000000 --- a/lib/Target/Mips/MipsEmitGPRestore.cpp +++ /dev/null @@ -1,97 +0,0 @@ -//===-- MipsEmitGPRestore.cpp - Emit GP Restore Instruction ---------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass emits instructions that restore $gp right -// after jalr instructions. 
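findDelayInstr above walks backwards from the delay-slot branch, accumulating the registers defined and used so far and rejecting candidates that would introduce a hazard; the real pass also tracks loads, stores and other side effects. A simplified standalone model of that scan (assumed, with register names as strings):

    #include <cstdio>
    #include <set>
    #include <string>
    #include <vector>

    struct Instr {
      std::string Text;
      std::set<std::string> Defs, Uses;
      bool HasSideEffects;
    };

    // Index of a safe filler for the slot of Instrs[Slot], or -1 if none
    // is found (the pass would then leave a NOP in the delay slot).
    int findDelayInstr(const std::vector<Instr> &Instrs, int Slot) {
      std::set<std::string> RegDefs(Instrs[Slot].Defs);
      std::set<std::string> RegUses(Instrs[Slot].Uses);
      for (int I = Slot - 1; I >= 0; --I) {
        const Instr &C = Instrs[I];
        bool Hazard = C.HasSideEffects;
        for (const std::string &D : C.Defs)  // writes a reg that is read,
          Hazard |= RegUses.count(D) > 0 ||  // or rewritten, below the
                    RegDefs.count(D) > 0;    // candidate
        for (const std::string &U : C.Uses)  // reads a reg defined below
          Hazard |= RegDefs.count(U) > 0;    // the candidate
        if (!Hazard)
          return I;
        // Not movable: remember what it touches and keep scanning upward.
        RegDefs.insert(C.Defs.begin(), C.Defs.end());
        RegUses.insert(C.Uses.begin(), C.Uses.end());
      }
      return -1;
    }

    int main() {
      std::vector<Instr> B = {
        {"addu $t0, $a0, $a1", {"$t0"}, {"$a0", "$a1"}, false},
        {"lw $t1, 0($sp)", {"$t1"}, {"$sp"}, false},
        {"jr $ra", {}, {"$ra"}, false},
      };
      std::printf("filler: %d\n", findDelayInstr(B, 2)); // picks 1, the lw
      return 0;
    }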
-// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "emit-gp-restore" - -#include "Mips.h" -#include "MipsTargetMachine.h" -#include "MipsMachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/ADT/Statistic.h" - -using namespace llvm; - -namespace { - struct Inserter : public MachineFunctionPass { - - TargetMachine &TM; - const TargetInstrInfo *TII; - - static char ID; - Inserter(TargetMachine &tm) - : MachineFunctionPass(ID), TM(tm), TII(tm.getInstrInfo()) { } - - virtual const char *getPassName() const { - return "Mips Emit GP Restore"; - } - - bool runOnMachineFunction(MachineFunction &F); - }; - char Inserter::ID = 0; -} // end of anonymous namespace - -bool Inserter::runOnMachineFunction(MachineFunction &F) { - MipsFunctionInfo *MipsFI = F.getInfo<MipsFunctionInfo>(); - - if ((TM.getRelocationModel() != Reloc::PIC_) || - (!MipsFI->globalBaseRegFixed())) - return false; - - bool Changed = false; - int FI = MipsFI->getGPFI(); - - for (MachineFunction::iterator MFI = F.begin(), MFE = F.end(); - MFI != MFE; ++MFI) { - MachineBasicBlock& MBB = *MFI; - MachineBasicBlock::iterator I = MFI->begin(); - - // If MBB is a landing pad, insert instruction that restores $gp after - // EH_LABEL. - if (MBB.isLandingPad()) { - // Find EH_LABEL first. - for (; I->getOpcode() != TargetOpcode::EH_LABEL; ++I) ; - - // Insert lw. - ++I; - DebugLoc dl = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); - BuildMI(MBB, I, dl, TII->get(Mips::LW), Mips::GP).addFrameIndex(FI) - .addImm(0); - Changed = true; - } - - while (I != MFI->end()) { - if (I->getOpcode() != Mips::JALR) { - ++I; - continue; - } - - DebugLoc dl = I->getDebugLoc(); - // emit lw $gp, ($gp save slot on stack) after jalr - BuildMI(MBB, ++I, dl, TII->get(Mips::LW), Mips::GP).addFrameIndex(FI) - .addImm(0); - Changed = true; - } - } - - return Changed; -} - -/// createMipsEmitGPRestorePass - Returns a pass that emits instructions that -/// restores $gp clobbered by jalr instructions. -FunctionPass *llvm::createMipsEmitGPRestorePass(MipsTargetMachine &tm) { - return new Inserter(tm); -} - diff --git a/lib/Target/Mips/MipsExpandPseudo.cpp b/lib/Target/Mips/MipsExpandPseudo.cpp deleted file mode 100644 index baeae97..0000000 --- a/lib/Target/Mips/MipsExpandPseudo.cpp +++ /dev/null @@ -1,123 +0,0 @@ -//===-- MipsExpandPseudo.cpp - Expand Pseudo Instructions ----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass expands pseudo instructions into target instructions after register -// allocation but before post-RA scheduling. 
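The MipsExpandPseudo pass deleted above expanded BuildPairF64 and ExtractElementF64 by moving the two 32-bit halves through the even/odd subregisters of a double-precision pair (mtc1/mfc1). A standalone model of the data movement it implemented (little-endian pairing assumed; the helper names are hypothetical):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // BuildPairF64: mtc1 Lo -> even subregister, mtc1 Hi -> odd one.
    double buildPairF64(uint32_t Lo, uint32_t Hi) {
      uint64_t Bits = ((uint64_t)Hi << 32) | Lo;
      double D;
      std::memcpy(&D, &Bits, sizeof D);
      return D;
    }

    // ExtractElementF64: mfc1 from subregister N (0 = low, 1 = high).
    uint32_t extractElementF64(double D, unsigned N) {
      uint64_t Bits;
      std::memcpy(&Bits, &D, sizeof Bits);
      return N == 0 ? (uint32_t)Bits : (uint32_t)(Bits >> 32);
    }

    int main() {
      double D = buildPairF64(0x00000000u, 0x3FF00000u); // bits of 1.0
      assert(D == 1.0);
      assert(extractElementF64(D, 1) == 0x3FF00000u);
      return 0;
    }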
-// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "mips-expand-pseudo" - -#include "Mips.h" -#include "MipsTargetMachine.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/ADT/Statistic.h" - -using namespace llvm; - -namespace { - struct MipsExpandPseudo : public MachineFunctionPass { - - TargetMachine &TM; - const TargetInstrInfo *TII; - - static char ID; - MipsExpandPseudo(TargetMachine &tm) - : MachineFunctionPass(ID), TM(tm), TII(tm.getInstrInfo()) { } - - virtual const char *getPassName() const { - return "Mips PseudoInstrs Expansion"; - } - - bool runOnMachineFunction(MachineFunction &F); - bool runOnMachineBasicBlock(MachineBasicBlock &MBB); - - private: - void ExpandBuildPairF64(MachineBasicBlock&, MachineBasicBlock::iterator); - void ExpandExtractElementF64(MachineBasicBlock&, - MachineBasicBlock::iterator); - }; - char MipsExpandPseudo::ID = 0; -} // end of anonymous namespace - -bool MipsExpandPseudo::runOnMachineFunction(MachineFunction& F) { - bool Changed = false; - - for (MachineFunction::iterator I = F.begin(); I != F.end(); ++I) - Changed |= runOnMachineBasicBlock(*I); - - return Changed; -} - -bool MipsExpandPseudo::runOnMachineBasicBlock(MachineBasicBlock& MBB) { - - bool Changed = false; - for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end();) { - const MCInstrDesc& MCid = I->getDesc(); - - switch(MCid.getOpcode()) { - default: - ++I; - continue; - case Mips::SETGP2: - // Convert "setgp2 $globalreg, $t9" to "addu $globalreg, $v0, $t9" - BuildMI(MBB, I, I->getDebugLoc(), TII->get(Mips::ADDu), - I->getOperand(0).getReg()) - .addReg(Mips::V0).addReg(I->getOperand(1).getReg()); - break; - case Mips::BuildPairF64: - ExpandBuildPairF64(MBB, I); - break; - case Mips::ExtractElementF64: - ExpandExtractElementF64(MBB, I); - break; - } - - // delete original instr - MBB.erase(I++); - Changed = true; - } - - return Changed; -} - -void MipsExpandPseudo::ExpandBuildPairF64(MachineBasicBlock& MBB, - MachineBasicBlock::iterator I) { - unsigned DstReg = I->getOperand(0).getReg(); - unsigned LoReg = I->getOperand(1).getReg(), HiReg = I->getOperand(2).getReg(); - const MCInstrDesc& Mtc1Tdd = TII->get(Mips::MTC1); - DebugLoc dl = I->getDebugLoc(); - const uint16_t* SubReg = - TM.getRegisterInfo()->getSubRegisters(DstReg); - - // mtc1 Lo, $fp - // mtc1 Hi, $fp + 1 - BuildMI(MBB, I, dl, Mtc1Tdd, *SubReg).addReg(LoReg); - BuildMI(MBB, I, dl, Mtc1Tdd, *(SubReg + 1)).addReg(HiReg); -} - -void MipsExpandPseudo::ExpandExtractElementF64(MachineBasicBlock& MBB, - MachineBasicBlock::iterator I) { - unsigned DstReg = I->getOperand(0).getReg(); - unsigned SrcReg = I->getOperand(1).getReg(); - unsigned N = I->getOperand(2).getImm(); - const MCInstrDesc& Mfc1Tdd = TII->get(Mips::MFC1); - DebugLoc dl = I->getDebugLoc(); - const uint16_t* SubReg = TM.getRegisterInfo()->getSubRegisters(SrcReg); - - BuildMI(MBB, I, dl, Mfc1Tdd, DstReg).addReg(*(SubReg + N)); -} - -/// createMipsMipsExpandPseudoPass - Returns a pass that expands pseudo -/// instrs into real instrs -FunctionPass *llvm::createMipsExpandPseudoPass(MipsTargetMachine &tm) { - return new MipsExpandPseudo(tm); -} diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp index f8ea3d0..6338f3c 100644 --- a/lib/Target/Mips/MipsFrameLowering.cpp +++ b/lib/Target/Mips/MipsFrameLowering.cpp @@ -94,38 +94,6 @@ bool 
MipsFrameLowering::targetHandlesStackFrameRounding() const { return true; } -// Build an instruction sequence to load an immediate that is too large to fit -// in 16-bit and add the result to Reg. -static void expandLargeImm(unsigned Reg, int64_t Imm, bool IsN64, - const MipsInstrInfo &TII, MachineBasicBlock& MBB, - MachineBasicBlock::iterator II, DebugLoc DL) { - unsigned LUi = IsN64 ? Mips::LUi64 : Mips::LUi; - unsigned ADDu = IsN64 ? Mips::DADDu : Mips::ADDu; - unsigned ZEROReg = IsN64 ? Mips::ZERO_64 : Mips::ZERO; - unsigned ATReg = IsN64 ? Mips::AT_64 : Mips::AT; - MipsAnalyzeImmediate AnalyzeImm; - const MipsAnalyzeImmediate::InstSeq &Seq = - AnalyzeImm.Analyze(Imm, IsN64 ? 64 : 32, false /* LastInstrIsADDiu */); - MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin(); - - // The first instruction can be a LUi, which is different from other - // instructions (ADDiu, ORI and SLL) in that it does not have a register - // operand. - if (Inst->Opc == LUi) - BuildMI(MBB, II, DL, TII.get(LUi), ATReg) - .addImm(SignExtend64<16>(Inst->ImmOpnd)); - else - BuildMI(MBB, II, DL, TII.get(Inst->Opc), ATReg).addReg(ZEROReg) - .addImm(SignExtend64<16>(Inst->ImmOpnd)); - - // Build the remaining instructions in Seq. - for (++Inst; Inst != Seq.end(); ++Inst) - BuildMI(MBB, II, DL, TII.get(Inst->Opc), ATReg).addReg(ATReg) - .addImm(SignExtend64<16>(Inst->ImmOpnd)); - - BuildMI(MBB, II, DL, TII.get(ADDu), Reg).addReg(Reg).addReg(ATReg); -} - void MipsFrameLowering::emitPrologue(MachineFunction &MF) const { MachineBasicBlock &MBB = MF.front(); MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -136,7 +104,6 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const { *static_cast<const MipsInstrInfo*>(MF.getTarget().getInstrInfo()); MachineBasicBlock::iterator MBBI = MBB.begin(); DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); - bool isPIC = (MF.getTarget().getRelocationModel() == Reloc::PIC_); unsigned SP = STI.isABI_N64() ? Mips::SP_64 : Mips::SP; unsigned FP = STI.isABI_N64() ? Mips::FP_64 : Mips::FP; unsigned ZERO = STI.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO; @@ -144,35 +111,17 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const { unsigned ADDiu = STI.isABI_N64() ? Mips::DADDiu : Mips::ADDiu; // First, compute final stack size. - unsigned RegSize = STI.isGP32bit() ? 4 : 8; unsigned StackAlign = getStackAlignment(); - unsigned LocalVarAreaOffset = MipsFI->needGPSaveRestore() ? - (MFI->getObjectOffset(MipsFI->getGPFI()) + RegSize) : - MipsFI->getMaxCallFrameSize(); - uint64_t StackSize = RoundUpToAlignment(LocalVarAreaOffset, StackAlign) + - RoundUpToAlignment(MFI->getStackSize(), StackAlign); + uint64_t StackSize = RoundUpToAlignment(MFI->getStackSize(), StackAlign); + + if (MipsFI->globalBaseRegSet()) + StackSize += MFI->getObjectOffset(MipsFI->getGlobalRegFI()) + StackAlign; + else + StackSize += RoundUpToAlignment(MipsFI->getMaxCallFrameSize(), StackAlign); // Update stack size MFI->setStackSize(StackSize); - // Emit instructions that set the global base register if the target ABI is - // O32. - if (isPIC && MipsFI->globalBaseRegSet() && STI.isABI_O32() && - !MipsFI->globalBaseRegFixed()) { - // See MipsInstrInfo.td for explanation. - MachineBasicBlock *NewEntry = MF.CreateMachineBasicBlock(); - MF.insert(&MBB, NewEntry); - NewEntry->addSuccessor(&MBB); - - // Copy live in registers. 
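The emitPrologue hunk above folds the call-frame area into the final frame size with RoundUpToAlignment, which simply rounds a size up to the stack alignment. A one-function model of that helper (power-of-two alignment assumed):

    #include <cassert>
    #include <cstdint>

    // Round Value up to the next multiple of Align (Align a power of two).
    uint64_t roundUpToAlignment(uint64_t Value, uint64_t Align) {
      return (Value + Align - 1) & ~(Align - 1);
    }

    int main() {
      assert(roundUpToAlignment(40, 8) == 40);
      assert(roundUpToAlignment(41, 8) == 48);
      assert(roundUpToAlignment(0, 8) == 0);
      return 0;
    }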
- for (MachineBasicBlock::livein_iterator R = MBB.livein_begin(); - R != MBB.livein_end(); ++R) - NewEntry->addLiveIn(*R); - - BuildMI(*NewEntry, NewEntry->begin(), dl, TII.get(Mips::SETGP01), - Mips::V0); - } - // No need to allocate space on the stack. if (StackSize == 0 && !MFI->adjustsStack()) return; @@ -181,11 +130,20 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const { MachineLocation DstML, SrcML; // Adjust stack. - if (isInt<16>(-StackSize)) // addi sp, sp, (-stacksize) - BuildMI(MBB, MBBI, dl, TII.get(ADDiu), SP).addReg(SP).addImm(-StackSize); + if (isInt<16>(-StackSize)) { // addiu sp, sp, (-stacksize) + if (STI.inMips16Mode()) + BuildMI(MBB, MBBI, dl, + TII.get(Mips::SaveRaF16)).addImm(StackSize); // cleanup + else + BuildMI(MBB, MBBI, dl, TII.get(ADDiu), SP).addReg(SP).addImm(-StackSize); + } + else { // Expand immediate that doesn't fit in 16-bit. - MipsFI->setEmitNOAT(); - expandLargeImm(SP, -StackSize, STI.isABI_N64(), TII, MBB, MBBI, dl); + unsigned ATReg = STI.isABI_N64() ? Mips::AT_64 : Mips::AT; + + MF.getInfo<MipsFunctionInfo>()->setEmitNOAT(); + Mips::loadImmediate(-StackSize, STI.isABI_N64(), TII, MBB, MBBI, dl, false, + 0); + BuildMI(MBB, MBBI, dl, TII.get(ADDu), SP).addReg(SP).addReg(ATReg); } // emit ".cfi_def_cfa_offset StackSize" @@ -217,20 +175,18 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const { // If Reg is a double precision register, emit two cfa_offsets, // one for each of the paired single precision registers. - if (Mips::AFGR64RegisterClass->contains(Reg)) { - const uint16_t *SubRegs = RegInfo->getSubRegisters(Reg); + if (Mips::AFGR64RegClass.contains(Reg)) { MachineLocation DstML0(MachineLocation::VirtualFP, Offset); MachineLocation DstML1(MachineLocation::VirtualFP, Offset + 4); - MachineLocation SrcML0(*SubRegs); - MachineLocation SrcML1(*(SubRegs + 1)); + MachineLocation SrcML0(RegInfo->getSubReg(Reg, Mips::sub_fpeven)); + MachineLocation SrcML1(RegInfo->getSubReg(Reg, Mips::sub_fpodd)); if (!STI.isLittle()) std::swap(SrcML0, SrcML1); Moves.push_back(MachineMove(CSLabel, DstML0, SrcML0)); Moves.push_back(MachineMove(CSLabel, DstML1, SrcML1)); - } - else { + } else { // Reg is either in CPURegs or FGR32. DstML = MachineLocation(MachineLocation::VirtualFP, Offset); SrcML = MachineLocation(Reg); @@ -252,13 +208,6 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const { SrcML = MachineLocation(MachineLocation::VirtualFP); Moves.push_back(MachineMove(SetFPLabel, DstML, SrcML)); } - - // Restore GP from the saved stack location - if (MipsFI->needGPSaveRestore()) { - unsigned Offset = MFI->getObjectOffset(MipsFI->getGPFI()); - BuildMI(MBB, MBBI, dl, TII.get(Mips::CPRESTORE)).addImm(Offset) - .addReg(Mips::GP); - } } void MipsFrameLowering::emitEpilogue(MachineFunction &MF, @@ -293,16 +242,28 @@ void MipsFrameLowering::emitEpilogue(MachineFunction &MF, return; // Adjust stack. - if (isInt<16>(StackSize)) // addi sp, sp, (-stacksize) - BuildMI(MBB, MBBI, dl, TII.get(ADDiu), SP).addReg(SP).addImm(StackSize); - else // Expand immediate that doesn't fit in 16-bit. - expandLargeImm(SP, StackSize, STI.isABI_N64(), TII, MBB, MBBI, dl); + if (isInt<16>(StackSize)) { // addiu sp, sp, stacksize + if (STI.inMips16Mode()) + // assumes stacksize multiple of 8 + BuildMI(MBB, MBBI, dl, + TII.get(Mips::RestoreRaF16)).addImm(StackSize); + else + BuildMI(MBB, MBBI, dl, TII.get(ADDiu), SP).addReg(SP).addImm(StackSize); + } + else { // Expand immediate that doesn't fit in 16-bit. + unsigned ATReg = STI.isABI_N64() ?
Mips::AT_64 : Mips::AT; + + MF.getInfo<MipsFunctionInfo>()->setEmitNOAT(); + Mips::loadImmediate(StackSize, STI.isABI_N64(), TII, MBB, MBBI, dl, false, + 0); + BuildMI(MBB, MBBI, dl, TII.get(ADDu), SP).addReg(SP).addReg(ATReg); + } } void MipsFrameLowering:: processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS) const { - MachineRegisterInfo& MRI = MF.getRegInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); unsigned FP = STI.isABI_N64() ? Mips::FP_64 : Mips::FP; // FIXME: remove this code if register allocator can correctly mark @@ -311,16 +272,35 @@ processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // Mark $fp and $ra as used or unused. if (hasFP(MF)) MRI.setPhysRegUsed(FP); +} - // The register allocator might determine $ra is used after seeing - // instruction "jr $ra", but we do not want PrologEpilogInserter to insert - // instructions to save/restore $ra unless there is a function call. - // To correct this, $ra is explicitly marked unused if there is no - // function call. - if (MF.getFrameInfo()->hasCalls()) - MRI.setPhysRegUsed(Mips::RA); - else { - MRI.setPhysRegUnused(Mips::RA); - MRI.setPhysRegUnused(Mips::RA_64); +bool MipsFrameLowering:: +spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + MachineFunction *MF = MBB.getParent(); + MachineBasicBlock *EntryBlock = MF->begin(); + const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo(); + + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + // Add the callee-saved register as live-in. Do not add if the register is + // RA and return address is taken, because it has already been added in + // method MipsTargetLowering::LowerRETURNADDR. + // It's killed at the spill, unless the register is RA and return address + // is taken. + unsigned Reg = CSI[i].getReg(); + bool IsRAAndRetAddrIsTaken = (Reg == Mips::RA || Reg == Mips::RA_64) + && MF->getFrameInfo()->isReturnAddressTaken(); + if (!IsRAAndRetAddrIsTaken) + EntryBlock->addLiveIn(Reg); + + // Insert the spill to the stack frame. 
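When the stack adjustment above does not fit in addiu's signed 16-bit immediate, Mips::loadImmediate synthesizes the constant into $at and the code then adds it to $sp with addu. A standalone sketch of one such synthesis (an assumed lui/ori sequence; the real analysis, via MipsAnalyzeImmediate, may pick other instruction sequences):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    bool isInt16(int64_t V) { return V >= -32768 && V <= 32767; }

    // Models "lui $at, Hi; ori $at, $at, Lo". Since ori zero-extends its
    // immediate, no carry correction between the two halves is needed.
    int32_t synthesize(int32_t Imm, uint16_t &Hi, uint16_t &Lo) {
      Lo = (uint16_t)Imm;
      Hi = (uint16_t)((uint32_t)Imm >> 16);
      return (int32_t)(((uint32_t)Hi << 16) | Lo);
    }

    int main() {
      int32_t Adj = -0x12340; // a stack adjustment too big for addiu
      assert(!isInt16(Adj));
      uint16_t Hi, Lo;
      assert(synthesize(Adj, Hi, Lo) == Adj);
      std::printf("lui $at, 0x%04x; ori $at, $at, 0x%04x; "
                  "addu $sp, $sp, $at\n", Hi, Lo);
      return 0;
    }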
+ bool IsKill = !IsRAAndRetAddrIsTaken; + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.storeRegToStackSlot(*EntryBlock, MI, Reg, IsKill, + CSI[i].getFrameIdx(), RC, TRI); } + + return true; } diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h index bd1d89f..e364ded 100644 --- a/lib/Target/Mips/MipsFrameLowering.h +++ b/lib/Target/Mips/MipsFrameLowering.h @@ -38,6 +38,11 @@ public: void emitPrologue(MachineFunction &MF) const; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const; + bool hasFP(const MachineFunction &MF) const; void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp index f0651c6..ea33b74 100644 --- a/lib/Target/Mips/MipsISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp @@ -117,28 +117,32 @@ private: void MipsDAGToDAGISel::InitGlobalBaseReg(MachineFunction &MF) { MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); - if (!MipsFI->globalBaseRegSet()) + if (((MF.getTarget().getRelocationModel() == Reloc::Static) || + Subtarget.inMips16Mode()) && !MipsFI->globalBaseRegSet()) return; MachineBasicBlock &MBB = MF.front(); MachineBasicBlock::iterator I = MBB.begin(); MachineRegisterInfo &RegInfo = MF.getRegInfo(); + const MipsRegisterInfo *TargetRegInfo = TM.getRegisterInfo(); + const MipsInstrInfo *MII = TM.getInstrInfo(); const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); - unsigned V0, V1, GlobalBaseReg = MipsFI->getGlobalBaseReg(); - bool FixGlobalBaseReg = MipsFI->globalBaseRegFixed(); - - if (Subtarget.isABI_O32() && FixGlobalBaseReg) - // $gp is the global base register. - V0 = V1 = GlobalBaseReg; - else { - const TargetRegisterClass *RC; - RC = Subtarget.isABI_N64() ? - Mips::CPU64RegsRegisterClass : Mips::CPURegsRegisterClass; - - V0 = RegInfo.createVirtualRegister(RC); - V1 = RegInfo.createVirtualRegister(RC); - } + unsigned V0, V1, V2, GlobalBaseReg = MipsFI->getGlobalBaseReg(); + int FI = 0; + + FI= MipsFI->initGlobalRegFI(); + + const TargetRegisterClass *RC = Subtarget.isABI_N64() ? 
+ (const TargetRegisterClass*)&Mips::CPU64RegsRegClass : + (const TargetRegisterClass*)&Mips::CPURegsRegClass; + + if (Subtarget.inMips16Mode()) + RC=(const TargetRegisterClass*)&Mips::CPU16RegsRegClass; + + V0 = RegInfo.createVirtualRegister(RC); + V1 = RegInfo.createVirtualRegister(RC); + V2 = RegInfo.createVirtualRegister(RC); if (Subtarget.isABI_N64()) { MF.getRegInfo().addLiveIn(Mips::T9_64); @@ -150,10 +154,31 @@ void MipsDAGToDAGISel::InitGlobalBaseReg(MachineFunction &MF) { const GlobalValue *FName = MF.getFunction(); BuildMI(MBB, I, DL, TII.get(Mips::LUi64), V0) .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_HI); - BuildMI(MBB, I, DL, TII.get(Mips::DADDu), V1).addReg(V0).addReg(Mips::T9_64); + BuildMI(MBB, I, DL, TII.get(Mips::DADDu), V1).addReg(V0) + .addReg(Mips::T9_64); BuildMI(MBB, I, DL, TII.get(Mips::DADDiu), GlobalBaseReg).addReg(V1) .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO); - } else if (MF.getTarget().getRelocationModel() == Reloc::Static) { + MII->storeRegToStackSlot(MBB, I, GlobalBaseReg, false, FI, RC, + TargetRegInfo); + return; + } + + if (Subtarget.inMips16Mode()) { + BuildMI(MBB, I, DL, TII.get(Mips::LiRxImmX16), V0) + .addExternalSymbol("_gp_disp", MipsII::MO_ABS_HI); + BuildMI(MBB, I, DL, TII.get(Mips::AddiuRxPcImmX16), + V1) + .addExternalSymbol("_gp_disp", MipsII::MO_ABS_LO); + BuildMI(MBB, I, DL, TII.get(Mips::SllX16), + V2 ).addReg(V0).addImm(16); + BuildMI(MBB, I, DL, TII.get(Mips::AdduRxRyRz16), GlobalBaseReg) + .addReg(V1).addReg(V2); + + + return; + } + + if (MF.getTarget().getRelocationModel() == Reloc::Static) { // Set global register to __gnu_local_gp. // // lui $v0, %hi(__gnu_local_gp) @@ -162,27 +187,57 @@ void MipsDAGToDAGISel::InitGlobalBaseReg(MachineFunction &MF) { .addExternalSymbol("__gnu_local_gp", MipsII::MO_ABS_HI); BuildMI(MBB, I, DL, TII.get(Mips::ADDiu), GlobalBaseReg).addReg(V0) .addExternalSymbol("__gnu_local_gp", MipsII::MO_ABS_LO); - } else { - MF.getRegInfo().addLiveIn(Mips::T9); - MBB.addLiveIn(Mips::T9); - - if (Subtarget.isABI_N32()) { - // lui $v0, %hi(%neg(%gp_rel(fname))) - // addu $v1, $v0, $t9 - // addiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname))) - const GlobalValue *FName = MF.getFunction(); - BuildMI(MBB, I, DL, TII.get(Mips::LUi), V0) - .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_HI); - BuildMI(MBB, I, DL, TII.get(Mips::ADDu), V1).addReg(V0).addReg(Mips::T9); - BuildMI(MBB, I, DL, TII.get(Mips::ADDiu), GlobalBaseReg).addReg(V1) - .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO); - } else if (!MipsFI->globalBaseRegFixed()) { - assert(Subtarget.isABI_O32()); - - BuildMI(MBB, I, DL, TII.get(Mips::SETGP2), GlobalBaseReg) - .addReg(Mips::T9); - } + return; + } + + MF.getRegInfo().addLiveIn(Mips::T9); + MBB.addLiveIn(Mips::T9); + + if (Subtarget.isABI_N32()) { + // lui $v0, %hi(%neg(%gp_rel(fname))) + // addu $v1, $v0, $t9 + // addiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname))) + const GlobalValue *FName = MF.getFunction(); + BuildMI(MBB, I, DL, TII.get(Mips::LUi), V0) + .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_HI); + BuildMI(MBB, I, DL, TII.get(Mips::ADDu), V1).addReg(V0).addReg(Mips::T9); + BuildMI(MBB, I, DL, TII.get(Mips::ADDiu), GlobalBaseReg).addReg(V1) + .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO); + MII->storeRegToStackSlot(MBB, I, GlobalBaseReg, false, FI, RC, + TargetRegInfo); + return; } + + assert(Subtarget.isABI_O32()); + + + //if (Subtarget.inMips16Mode()) + // return; // no need to load GP. 
It can be calculated anywhere + + + + // For O32 ABI, the following instruction sequence is emitted to initialize + // the global base register: + // + // 0. lui $2, %hi(_gp_disp) + // 1. addiu $2, $2, %lo(_gp_disp) + // 2. addu $globalbasereg, $2, $t9 + // + // We emit only the last instruction here. + // + // GNU linker requires that the first two instructions appear at the beginning + // of a function and no instructions be inserted before or between them. + // The two instructions are emitted during lowering to MC layer in order to + // avoid any reordering. + // + // Register $2 (Mips::V0) is added to the list of live-in registers to ensure + // the value instruction 1 (addiu) defines is valid when instruction 2 (addu) + // reads it. + MF.getRegInfo().addLiveIn(Mips::V0); + MBB.addLiveIn(Mips::V0); + BuildMI(MBB, I, DL, TII.get(Mips::ADDu), GlobalBaseReg) + .addReg(Mips::V0).addReg(Mips::T9); + MII->storeRegToStackSlot(MBB, I, GlobalBaseReg, false, FI, RC, TargetRegInfo); } bool MipsDAGToDAGISel::ReplaceUsesWithZeroReg(MachineRegisterInfo *MRI, @@ -212,7 +267,8 @@ bool MipsDAGToDAGISel::ReplaceUsesWithZeroReg(MachineRegisterInfo *MRI, MachineInstr *MI = MO.getParent(); // Do not replace if it is a phi's operand or is tied to def operand. - if (MI->isPHI() || MI->isRegTiedToDefOperand(U.getOperandNo())) + if (MI->isPHI() || MI->isRegTiedToDefOperand(U.getOperandNo()) || + MI->isPseudo()) continue; MO.setReg(ZeroReg); @@ -255,7 +311,7 @@ SelectAddr(SDNode *Parent, SDValue Addr, SDValue &Base, SDValue &Offset) { // If Parent is an unaligned f32 load or store, select a (base + index) // floating point load/store instruction (luxc1 or suxc1). - const LSBaseSDNode* LS = 0; + const LSBaseSDNode *LS = 0; if (Parent && (LS = dyn_cast<LSBaseSDNode>(Parent))) { EVT VT = LS->getMemoryVT(); @@ -316,17 +372,18 @@ SelectAddr(SDNode *Parent, SDValue Addr, SDValue &Base, SDValue &Offset) { // lui $2, %hi($CPI1_0) // lwc1 $f0, %lo($CPI1_0)($2) if (Addr.getOperand(1).getOpcode() == MipsISD::Lo) { - SDValue LoVal = Addr.getOperand(1); - if (isa<ConstantPoolSDNode>(LoVal.getOperand(0)) || - isa<GlobalAddressSDNode>(LoVal.getOperand(0))) { + SDValue LoVal = Addr.getOperand(1), Opnd0 = LoVal.getOperand(0); + if (isa<ConstantPoolSDNode>(Opnd0) || isa<GlobalAddressSDNode>(Opnd0) || + isa<JumpTableSDNode>(Opnd0)) { Base = Addr.getOperand(0); - Offset = LoVal.getOperand(0); + Offset = Opnd0; return true; } } // If an indexed floating point load/store can be emitted, return false. 
- if (LS && (LS->getMemoryVT() == MVT::f32 || LS->getMemoryVT() == MVT::f64) && + if (LS && + (LS->getMemoryVT() == MVT::f32 || LS->getMemoryVT() == MVT::f64) && Subtarget.hasMips32r2Or64()) return false; } diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 6a23bc3..7741f9f 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -34,6 +34,8 @@ #include "llvm/CodeGen/ValueTypes.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" + using namespace llvm; // If I is a shifted mask, set the size (Size) and the first bit of the @@ -79,6 +81,14 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const { case MipsISD::Sync: return "MipsISD::Sync"; case MipsISD::Ext: return "MipsISD::Ext"; case MipsISD::Ins: return "MipsISD::Ins"; + case MipsISD::LWL: return "MipsISD::LWL"; + case MipsISD::LWR: return "MipsISD::LWR"; + case MipsISD::SWL: return "MipsISD::SWL"; + case MipsISD::SWR: return "MipsISD::SWR"; + case MipsISD::LDL: return "MipsISD::LDL"; + case MipsISD::LDR: return "MipsISD::LDR"; + case MipsISD::SDL: return "MipsISD::SDL"; + case MipsISD::SDR: return "MipsISD::SDR"; default: return NULL; } } @@ -96,20 +106,25 @@ MipsTargetLowering(MipsTargetMachine &TM) setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct? // Set up the register classes - addRegisterClass(MVT::i32, Mips::CPURegsRegisterClass); + addRegisterClass(MVT::i32, &Mips::CPURegsRegClass); if (HasMips64) - addRegisterClass(MVT::i64, Mips::CPU64RegsRegisterClass); + addRegisterClass(MVT::i64, &Mips::CPU64RegsRegClass); + + if (Subtarget->inMips16Mode()) { + addRegisterClass(MVT::i32, &Mips::CPU16RegsRegClass); + addRegisterClass(MVT::i32, &Mips::CPURARegRegClass); + } if (!TM.Options.UseSoftFloat) { - addRegisterClass(MVT::f32, Mips::FGR32RegisterClass); + addRegisterClass(MVT::f32, &Mips::FGR32RegClass); // When dealing with single precision only, use libcalls if (!Subtarget->isSingleFloat()) { if (HasMips64) - addRegisterClass(MVT::f64, Mips::FGR64RegisterClass); + addRegisterClass(MVT::f64, &Mips::FGR64RegClass); else - addRegisterClass(MVT::f64, Mips::AFGR64RegisterClass); + addRegisterClass(MVT::f64, &Mips::AFGR64RegClass); } } @@ -137,6 +152,8 @@ MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::SELECT, MVT::f32, Custom); setOperationAction(ISD::SELECT, MVT::f64, Custom); setOperationAction(ISD::SELECT, MVT::i32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); setOperationAction(ISD::SETCC, MVT::f32, Custom); setOperationAction(ISD::SETCC, MVT::f64, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); @@ -146,6 +163,8 @@ MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom); setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); + setOperationAction(ISD::LOAD, MVT::i32, Custom); + setOperationAction(ISD::STORE, MVT::i32, Custom); if (!TM.Options.NoNaNsFPMath) { setOperationAction(ISD::FABS, MVT::f32, Custom); @@ -160,6 +179,14 @@ MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::ConstantPool, MVT::i64, Custom); setOperationAction(ISD::SELECT, MVT::i64, Custom); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); + setOperationAction(ISD::LOAD, MVT::i64, Custom); + setOperationAction(ISD::STORE, MVT::i64, Custom); + } + 
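+  // For reference, the lo/hi decomposition that the SHL_PARTS lowering below
+  // (see LowerShiftLeftParts later in this file) builds as DAG nodes, written
+  // out as plain C++. This is an illustrative sketch, not code from the
+  // patch; as on MIPS, shift amounts are taken mod 32.
+  //
+  //   #include <cstdint>
+  //   #include <utility>
+  //
+  //   // Shamt is assumed to be in [0, 63].
+  //   std::pair<uint32_t, uint32_t> ShlParts(uint32_t Lo, uint32_t Hi,
+  //                                          unsigned Shamt) {
+  //     unsigned S = Shamt & 31;                       // shamt[4:0]
+  //     uint32_t NewLo = Lo << S;
+  //     // (or (shl hi, shamt), (srl (srl lo, 1), ~shamt))
+  //     uint32_t NewHi = (Hi << S) | ((Lo >> 1) >> (31 - S));
+  //     if (Shamt & 0x20)                              // shamt >= 32
+  //       return std::make_pair(0u, NewLo);            // lo = 0, hi = lo << S
+  //     return std::make_pair(NewLo, NewHi);
+  //   }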
+ if (!HasMips64) { + setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); } setOperationAction(ISD::SDIV, MVT::i32, Expand); @@ -197,9 +224,6 @@ MipsTargetLowering(MipsTargetMachine &TM) if (!Subtarget->hasMips64r2()) setOperationAction(ISD::ROTR, MVT::i64, Expand); - setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand); - setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand); - setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand); setOperationAction(ISD::FSIN, MVT::f32, Expand); setOperationAction(ISD::FSIN, MVT::f64, Expand); setOperationAction(ISD::FCOS, MVT::f32, Expand); @@ -241,9 +265,6 @@ MipsTargetLowering(MipsTargetMachine &TM) setInsertFencesForAtomic(true); - if (Subtarget->isSingleFloat()) - setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); - if (!Subtarget->hasSEInReg()) { setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); @@ -259,6 +280,13 @@ MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::BSWAP, MVT::i64, Expand); } + if (HasMips64) { + setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::i32, Custom); + setTruncStoreAction(MVT::i64, MVT::i32, Custom); + } + setTargetDAGCombine(ISD::ADDE); setTargetDAGCombine(ISD::SUBE); setTargetDAGCombine(ISD::SDIVREM); @@ -266,6 +294,7 @@ MipsTargetLowering(MipsTargetMachine &TM) setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::OR); + setTargetDAGCombine(ISD::ADD); setMinFunctionAlignment(HasMips64 ? 3 : 2); @@ -274,6 +303,8 @@ MipsTargetLowering(MipsTargetMachine &TM) setExceptionPointerRegister(IsN64 ? Mips::A0_64 : Mips::A0); setExceptionSelectorRegister(IsN64 ? Mips::A1_64 : Mips::A1); + + maxStoresPerMemcpy = 16; } bool MipsTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const { @@ -282,7 +313,6 @@ bool MipsTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const { switch (SVT) { case MVT::i64: case MVT::i32: - case MVT::i16: return true; case MVT::f32: return Subtarget->hasMips32r2Or64(); @@ -303,17 +333,17 @@ EVT MipsTargetLowering::getSetCCResultType(EVT VT) const { // Lo0: initial value of Lo register // Hi0: initial value of Hi register // Return true if pattern matching was successful. -static bool SelectMadd(SDNode* ADDENode, SelectionDAG* CurDAG) { +static bool SelectMadd(SDNode *ADDENode, SelectionDAG *CurDAG) { // ADDENode's second operand must be a flag output of an ADDC node in order // for the matching to be successful. - SDNode* ADDCNode = ADDENode->getOperand(2).getNode(); + SDNode *ADDCNode = ADDENode->getOperand(2).getNode(); if (ADDCNode->getOpcode() != ISD::ADDC) return false; SDValue MultHi = ADDENode->getOperand(0); SDValue MultLo = ADDCNode->getOperand(0); - SDNode* MultNode = MultHi.getNode(); + SDNode *MultNode = MultHi.getNode(); unsigned MultOpc = MultHi.getOpcode(); // MultHi and MultLo must be generated by the same node, @@ -376,17 +406,17 @@ static bool SelectMadd(SDNode* ADDENode, SelectionDAG* CurDAG) { // Lo0: initial value of Lo register // Hi0: initial value of Hi register // Return true if pattern matching was successful. 
-static bool SelectMsub(SDNode* SUBENode, SelectionDAG* CurDAG) { +static bool SelectMsub(SDNode *SUBENode, SelectionDAG *CurDAG) { // SUBENode's second operand must be a flag output of an SUBC node in order // for the matching to be successful. - SDNode* SUBCNode = SUBENode->getOperand(2).getNode(); + SDNode *SUBCNode = SUBENode->getOperand(2).getNode(); if (SUBCNode->getOpcode() != ISD::SUBC) return false; SDValue MultHi = SUBENode->getOperand(1); SDValue MultLo = SUBCNode->getOperand(1); - SDNode* MultNode = MultHi.getNode(); + SDNode *MultNode = MultHi.getNode(); unsigned MultOpc = MultHi.getOpcode(); // MultHi and MultLo must be generated by the same node, @@ -441,9 +471,9 @@ static bool SelectMsub(SDNode* SUBENode, SelectionDAG* CurDAG) { return true; } -static SDValue PerformADDECombine(SDNode *N, SelectionDAG& DAG, +static SDValue PerformADDECombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget* Subtarget) { + const MipsSubtarget *Subtarget) { if (DCI.isBeforeLegalize()) return SDValue(); @@ -454,9 +484,9 @@ static SDValue PerformADDECombine(SDNode *N, SelectionDAG& DAG, return SDValue(); } -static SDValue PerformSUBECombine(SDNode *N, SelectionDAG& DAG, +static SDValue PerformSUBECombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget* Subtarget) { + const MipsSubtarget *Subtarget) { if (DCI.isBeforeLegalize()) return SDValue(); @@ -467,9 +497,9 @@ static SDValue PerformSUBECombine(SDNode *N, SelectionDAG& DAG, return SDValue(); } -static SDValue PerformDivRemCombine(SDNode *N, SelectionDAG& DAG, +static SDValue PerformDivRemCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget* Subtarget) { + const MipsSubtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -544,7 +574,7 @@ static bool InvertFPCondCode(Mips::CondCode CC) { // Creates and returns an FPCmp node from a setcc node. // Returns Op if setcc is not a floating point comparison. -static SDValue CreateFPCmp(SelectionDAG& DAG, const SDValue& Op) { +static SDValue CreateFPCmp(SelectionDAG &DAG, const SDValue &Op) { // must be a SETCC node if (Op.getOpcode() != ISD::SETCC) return Op; @@ -566,7 +596,7 @@ static SDValue CreateFPCmp(SelectionDAG& DAG, const SDValue& Op) { } // Creates and returns a CMovFPT/F node. 
-static SDValue CreateCMovFP(SelectionDAG& DAG, SDValue Cond, SDValue True, +static SDValue CreateCMovFP(SelectionDAG &DAG, SDValue Cond, SDValue True, SDValue False, DebugLoc DL) { bool invert = InvertFPCondCode((Mips::CondCode) cast<ConstantSDNode>(Cond.getOperand(2)) @@ -576,9 +606,9 @@ static SDValue CreateCMovFP(SelectionDAG& DAG, SDValue Cond, SDValue True, True.getValueType(), True, False, Cond); } -static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG& DAG, +static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget* Subtarget) { + const MipsSubtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -602,16 +632,16 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG& DAG, const DebugLoc DL = N->getDebugLoc(); ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get(); SDValue True = N->getOperand(1); - + SetCC = DAG.getSetCC(DL, SetCC.getValueType(), SetCC.getOperand(0), SetCC.getOperand(1), ISD::getSetCCInverse(CC, true)); - + return DAG.getNode(ISD::SELECT, DL, FalseTy, SetCC, False, True); } -static SDValue PerformANDCombine(SDNode *N, SelectionDAG& DAG, +static SDValue PerformANDCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget* Subtarget) { + const MipsSubtarget *Subtarget) { // Pattern match EXT. // $dst = and ((sra or srl) $src , pos), (2**size - 1) // => ext $dst, $src, size, pos @@ -649,9 +679,9 @@ static SDValue PerformANDCombine(SDNode *N, SelectionDAG& DAG, DAG.getConstant(SMSize, MVT::i32)); } -static SDValue PerformORCombine(SDNode *N, SelectionDAG& DAG, +static SDValue PerformORCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget* Subtarget) { + const MipsSubtarget *Subtarget) { // Pattern match INS. 
// $dst = or (and $src1 , mask0), (and (shl $src, pos), mask1), // where mask1 = (2**size - 1) << pos, mask0 = ~mask1 @@ -703,6 +733,33 @@ static SDValue PerformORCombine(SDNode *N, SelectionDAG& DAG, DAG.getConstant(SMSize0, MVT::i32), And0.getOperand(0)); } +static SDValue PerformADDCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const MipsSubtarget *Subtarget) { + // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt)) + + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDValue Add = N->getOperand(1); + + if (Add.getOpcode() != ISD::ADD) + return SDValue(); + + SDValue Lo = Add.getOperand(1); + + if ((Lo.getOpcode() != MipsISD::Lo) || + (Lo.getOperand(0).getOpcode() != ISD::TargetJumpTable)) + return SDValue(); + + EVT ValTy = N->getValueType(0); + DebugLoc DL = N->getDebugLoc(); + + SDValue Add1 = DAG.getNode(ISD::ADD, DL, ValTy, N->getOperand(0), + Add.getOperand(0)); + return DAG.getNode(ISD::ADD, DL, ValTy, Add1, Lo); +} + SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -718,11 +775,13 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) case ISD::UDIVREM: return PerformDivRemCombine(N, DAG, DCI, Subtarget); case ISD::SELECT: - return PerformSELECTCombine(N, DAG, DCI, Subtarget); + return PerformSELECTCombine(N, DAG, DCI, Subtarget); case ISD::AND: return PerformANDCombine(N, DAG, DCI, Subtarget); case ISD::OR: return PerformORCombine(N, DAG, DCI, Subtarget); + case ISD::ADD: + return PerformADDCombine(N, DAG, DCI, Subtarget); } return SDValue(); @@ -741,13 +800,20 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); + case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::FABS: return LowerFABS(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); + case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, DAG); case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG); + case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); + case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG, true); + case ISD::SRL_PARTS: return LowerShiftRightParts(Op, DAG, false); + case ISD::LOAD: return LowerLOAD(Op, DAG); + case ISD::STORE: return LowerSTORE(Op, DAG); } return SDValue(); } @@ -782,7 +848,7 @@ static Mips::FPBranchCode GetFPBranchCodeFromCond(Mips::CondCode CC) { /* static MachineBasicBlock* ExpandCondMov(MachineInstr *MI, MachineBasicBlock *BB, DebugLoc dl, - const MipsSubtarget* Subtarget, + const MipsSubtarget *Subtarget, const TargetInstrInfo *TII, bool isFPCmp, unsigned Opc) { // There is no need to expand CMov instructions if target has @@ -1510,6 +1576,19 @@ LowerSELECT(SDValue Op, SelectionDAG &DAG) const Op.getDebugLoc()); } +SDValue MipsTargetLowering:: +LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const +{ + DebugLoc DL = Op.getDebugLoc(); + EVT Ty = Op.getOperand(0).getValueType(); + SDValue Cond = DAG.getNode(ISD::SETCC, DL, getSetCCResultType(Ty), + Op.getOperand(0), Op.getOperand(1), + Op.getOperand(4)); + + return DAG.getNode(ISD::SELECT, DL, Op.getValueType(), Cond, Op.getOperand(2), + Op.getOperand(3)); +} + SDValue 
MipsTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { SDValue Cond = CreateFPCmp(DAG, Op); @@ -1612,10 +1691,13 @@ LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const const GlobalValue *GV = GA->getGlobal(); EVT PtrVT = getPointerTy(); - if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { - // General Dynamic TLS Model - bool LocalDynamic = GV->hasInternalLinkage(); - unsigned Flag = LocalDynamic ? MipsII::MO_TLSLDM :MipsII::MO_TLSGD; + TLSModel::Model model = getTargetMachine().getTLSModel(GV); + + if (model == TLSModel::GeneralDynamic || model == TLSModel::LocalDynamic) { + // General Dynamic and Local Dynamic TLS Model. + unsigned Flag = (model == TLSModel::LocalDynamic) ? MipsII::MO_TLSLDM + : MipsII::MO_TLSGD; + SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, Flag); SDValue Argument = DAG.getNode(MipsISD::Wrapper, dl, PtrVT, GetGlobalReg(DAG, PtrVT), TGA); @@ -1630,16 +1712,16 @@ LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const Entry.Ty = PtrTy; Args.push_back(Entry); - std::pair<SDValue, SDValue> CallResult = - LowerCallTo(DAG.getEntryNode(), PtrTy, + TargetLowering::CallLoweringInfo CLI(DAG.getEntryNode(), PtrTy, false, false, false, false, 0, CallingConv::C, /*isTailCall=*/false, /*doesNotRet=*/false, /*isReturnValueUsed=*/true, TlsGetAddr, Args, DAG, dl); + std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); SDValue Ret = CallResult.first; - if (!LocalDynamic) + if (model != TLSModel::LocalDynamic) return Ret; SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, @@ -1653,7 +1735,7 @@ LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const } SDValue Offset; - if (GV->isDeclaration()) { + if (model == TLSModel::InitialExec) { // Initial Exec TLS Model SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, MipsII::MO_GOTTPREL); @@ -1664,6 +1746,7 @@ LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const false, false, false, 0); } else { // Local Exec TLS Model + assert(model == TLSModel::LocalExec); SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, MipsII::MO_TPREL_HI); SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, @@ -1940,9 +2023,26 @@ LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { return FrameAddr; } +SDValue MipsTargetLowering::LowerRETURNADDR(SDValue Op, + SelectionDAG &DAG) const { + // check the depth + assert((cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() == 0) && + "Return address can be determined only for current frame."); + + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + EVT VT = Op.getValueType(); + unsigned RA = IsN64 ? Mips::RA_64 : Mips::RA; + MFI->setReturnAddressIsTaken(true); + + // Return RA, which contains the return address. Mark it an implicit live-in. + unsigned Reg = MF.addLiveIn(RA, getRegClassFor(VT)); + return DAG.getCopyFromReg(DAG.getEntryNode(), Op.getDebugLoc(), Reg, VT); +} + // TODO: set SType according to the desired memory barrier behavior. 
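 // For example (an illustration, not part of the patch): an IR-level
 //   fence seq_cst
 // is custom-lowered by LowerATOMIC_FENCE below into a MipsISD::Sync node,
 // which is emitted as the MIPS "sync" instruction; the default SType of 0
 // is the full barrier.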
SDValue -MipsTargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG& DAG) const { +MipsTargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const { unsigned SType = 0; DebugLoc dl = Op.getDebugLoc(); return DAG.getNode(MipsISD::Sync, dl, MVT::Other, Op.getOperand(0), @@ -1950,7 +2050,7 @@ MipsTargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG& DAG) const { } SDValue MipsTargetLowering::LowerATOMIC_FENCE(SDValue Op, - SelectionDAG& DAG) const { + SelectionDAG &DAG) const { // FIXME: Need pseudo-fence for 'singlethread' fences // FIXME: Set SType for weaker fences where supported/appropriate. unsigned SType = 0; @@ -1959,6 +2059,210 @@ SDValue MipsTargetLowering::LowerATOMIC_FENCE(SDValue Op, DAG.getConstant(SType, MVT::i32)); } +SDValue MipsTargetLowering::LowerShiftLeftParts(SDValue Op, + SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + SDValue Lo = Op.getOperand(0), Hi = Op.getOperand(1); + SDValue Shamt = Op.getOperand(2); + + // if shamt < 32: + // lo = (shl lo, shamt) + // hi = (or (shl hi, shamt) (srl (srl lo, 1), ~shamt)) + // else: + // lo = 0 + // hi = (shl lo, shamt[4:0]) + SDValue Not = DAG.getNode(ISD::XOR, DL, MVT::i32, Shamt, + DAG.getConstant(-1, MVT::i32)); + SDValue ShiftRight1Lo = DAG.getNode(ISD::SRL, DL, MVT::i32, Lo, + DAG.getConstant(1, MVT::i32)); + SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, MVT::i32, ShiftRight1Lo, + Not); + SDValue ShiftLeftHi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi, Shamt); + SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, ShiftLeftHi, ShiftRightLo); + SDValue ShiftLeftLo = DAG.getNode(ISD::SHL, DL, MVT::i32, Lo, Shamt); + SDValue Cond = DAG.getNode(ISD::AND, DL, MVT::i32, Shamt, + DAG.getConstant(0x20, MVT::i32)); + Lo = DAG.getNode(ISD::SELECT, DL, MVT::i32, Cond, + DAG.getConstant(0, MVT::i32), ShiftLeftLo); + Hi = DAG.getNode(ISD::SELECT, DL, MVT::i32, Cond, ShiftLeftLo, Or); + + SDValue Ops[2] = {Lo, Hi}; + return DAG.getMergeValues(Ops, 2, DL); +} + +SDValue MipsTargetLowering::LowerShiftRightParts(SDValue Op, SelectionDAG &DAG, + bool IsSRA) const { + DebugLoc DL = Op.getDebugLoc(); + SDValue Lo = Op.getOperand(0), Hi = Op.getOperand(1); + SDValue Shamt = Op.getOperand(2); + + // if shamt < 32: + // lo = (or (shl (shl hi, 1), ~shamt) (srl lo, shamt)) + // if isSRA: + // hi = (sra hi, shamt) + // else: + // hi = (srl hi, shamt) + // else: + // if isSRA: + // lo = (sra hi, shamt[4:0]) + // hi = (sra hi, 31) + // else: + // lo = (srl hi, shamt[4:0]) + // hi = 0 + SDValue Not = DAG.getNode(ISD::XOR, DL, MVT::i32, Shamt, + DAG.getConstant(-1, MVT::i32)); + SDValue ShiftLeft1Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi, + DAG.getConstant(1, MVT::i32)); + SDValue ShiftLeftHi = DAG.getNode(ISD::SHL, DL, MVT::i32, ShiftLeft1Hi, Not); + SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, MVT::i32, Lo, Shamt); + SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, ShiftLeftHi, ShiftRightLo); + SDValue ShiftRightHi = DAG.getNode(IsSRA ? ISD::SRA : ISD::SRL, DL, MVT::i32, + Hi, Shamt); + SDValue Cond = DAG.getNode(ISD::AND, DL, MVT::i32, Shamt, + DAG.getConstant(0x20, MVT::i32)); + SDValue Shift31 = DAG.getNode(ISD::SRA, DL, MVT::i32, Hi, + DAG.getConstant(31, MVT::i32)); + Lo = DAG.getNode(ISD::SELECT, DL, MVT::i32, Cond, ShiftRightHi, Or); + Hi = DAG.getNode(ISD::SELECT, DL, MVT::i32, Cond, + IsSRA ? 
Shift31 : DAG.getConstant(0, MVT::i32), + ShiftRightHi); + + SDValue Ops[2] = {Lo, Hi}; + return DAG.getMergeValues(Ops, 2, DL); +} + +static SDValue CreateLoadLR(unsigned Opc, SelectionDAG &DAG, LoadSDNode *LD, + SDValue Chain, SDValue Src, unsigned Offset) { + SDValue Ptr = LD->getBasePtr(); + EVT VT = LD->getValueType(0), MemVT = LD->getMemoryVT(); + EVT BasePtrVT = Ptr.getValueType(); + DebugLoc DL = LD->getDebugLoc(); + SDVTList VTList = DAG.getVTList(VT, MVT::Other); + + if (Offset) + Ptr = DAG.getNode(ISD::ADD, DL, BasePtrVT, Ptr, + DAG.getConstant(Offset, BasePtrVT)); + + SDValue Ops[] = { Chain, Ptr, Src }; + return DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, 3, MemVT, + LD->getMemOperand()); +} + +// Expand an unaligned 32 or 64-bit integer load node. +SDValue MipsTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { + LoadSDNode *LD = cast<LoadSDNode>(Op); + EVT MemVT = LD->getMemoryVT(); + + // Return if load is aligned or if MemVT is neither i32 nor i64. + if ((LD->getAlignment() >= MemVT.getSizeInBits() / 8) || + ((MemVT != MVT::i32) && (MemVT != MVT::i64))) + return SDValue(); + + bool IsLittle = Subtarget->isLittle(); + EVT VT = Op.getValueType(); + ISD::LoadExtType ExtType = LD->getExtensionType(); + SDValue Chain = LD->getChain(), Undef = DAG.getUNDEF(VT); + + assert((VT == MVT::i32) || (VT == MVT::i64)); + + // Expand + // (set dst, (i64 (load baseptr))) + // to + // (set tmp, (ldl (add baseptr, 7), undef)) + // (set dst, (ldr baseptr, tmp)) + if ((VT == MVT::i64) && (ExtType == ISD::NON_EXTLOAD)) { + SDValue LDL = CreateLoadLR(MipsISD::LDL, DAG, LD, Chain, Undef, + IsLittle ? 7 : 0); + return CreateLoadLR(MipsISD::LDR, DAG, LD, LDL.getValue(1), LDL, + IsLittle ? 0 : 7); + } + + SDValue LWL = CreateLoadLR(MipsISD::LWL, DAG, LD, Chain, Undef, + IsLittle ? 3 : 0); + SDValue LWR = CreateLoadLR(MipsISD::LWR, DAG, LD, LWL.getValue(1), LWL, + IsLittle ? 0 : 3); + + // Expand + // (set dst, (i32 (load baseptr))) or + // (set dst, (i64 (sextload baseptr))) or + // (set dst, (i64 (extload baseptr))) + // to + // (set tmp, (lwl (add baseptr, 3), undef)) + // (set dst, (lwr baseptr, tmp)) + if ((VT == MVT::i32) || (ExtType == ISD::SEXTLOAD) || + (ExtType == ISD::EXTLOAD)) + return LWR; + + assert((VT == MVT::i64) && (ExtType == ISD::ZEXTLOAD)); + + // Expand + // (set dst, (i64 (zextload baseptr))) + // to + // (set tmp0, (lwl (add baseptr, 3), undef)) + // (set tmp1, (lwr baseptr, tmp0)) + // (set tmp2, (shl tmp1, 32)) + // (set dst, (srl tmp2, 32)) + DebugLoc DL = LD->getDebugLoc(); + SDValue Const32 = DAG.getConstant(32, MVT::i32); + SDValue SLL = DAG.getNode(ISD::SHL, DL, MVT::i64, LWR, Const32); + SDValue SRL = DAG.getNode(ISD::SRL, DL, MVT::i64, SLL, Const32); + SDValue Ops[] = { SRL, LWR.getValue(1) }; + return DAG.getMergeValues(Ops, 2, DL); +} + +static SDValue CreateStoreLR(unsigned Opc, SelectionDAG &DAG, StoreSDNode *SD, + SDValue Chain, unsigned Offset) { + SDValue Ptr = SD->getBasePtr(), Value = SD->getValue(); + EVT MemVT = SD->getMemoryVT(), BasePtrVT = Ptr.getValueType(); + DebugLoc DL = SD->getDebugLoc(); + SDVTList VTList = DAG.getVTList(MVT::Other); + + if (Offset) + Ptr = DAG.getNode(ISD::ADD, DL, BasePtrVT, Ptr, + DAG.getConstant(Offset, BasePtrVT)); + + SDValue Ops[] = { Chain, Value, Ptr }; + return DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, 3, MemVT, + SD->getMemOperand()); +} + +// Expand an unaligned 32 or 64-bit integer store node. 
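+// As a concrete illustration (ours, not from the patch): on a little-endian
+// target, an unaligned i32 store of $t0 to the address in $t1 becomes the
+// pair
+//   swl $t0, 3($t1)
+//   swr $t0, 0($t1)
+// mirroring the (swl (add baseptr, 3)) / (swr baseptr) expansion described
+// below; a big-endian target swaps the two offsets.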
+SDValue MipsTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { + StoreSDNode *SD = cast<StoreSDNode>(Op); + EVT MemVT = SD->getMemoryVT(); + + // Return if store is aligned or if MemVT is neither i32 nor i64. + if ((SD->getAlignment() >= MemVT.getSizeInBits() / 8) || + ((MemVT != MVT::i32) && (MemVT != MVT::i64))) + return SDValue(); + + bool IsLittle = Subtarget->isLittle(); + SDValue Value = SD->getValue(), Chain = SD->getChain(); + EVT VT = Value.getValueType(); + + // Expand + // (store val, baseptr) or + // (truncstore val, baseptr) + // to + // (swl val, (add baseptr, 3)) + // (swr val, baseptr) + if ((VT == MVT::i32) || SD->isTruncatingStore()) { + SDValue SWL = CreateStoreLR(MipsISD::SWL, DAG, SD, Chain, + IsLittle ? 3 : 0); + return CreateStoreLR(MipsISD::SWR, DAG, SD, SWL, IsLittle ? 0 : 3); + } + + assert(VT == MVT::i64); + + // Expand + // (store val, baseptr) + // to + // (sdl val, (add baseptr, 7)) + // (sdr val, baseptr) + SDValue SDL = CreateStoreLR(MipsISD::SDL, DAG, SD, Chain, IsLittle ? 7 : 0); + return CreateStoreLR(MipsISD::SDR, DAG, SD, SDL, IsLittle ? 0 : 7); +} + //===----------------------------------------------------------------------===// // Calling Convention Implementation //===----------------------------------------------------------------------===// @@ -2152,10 +2456,10 @@ static unsigned getNextIntArgReg(unsigned Reg) { // Write ByVal Arg to arg registers and stack. static void WriteByValArg(SDValue& ByValChain, SDValue Chain, DebugLoc dl, - SmallVector<std::pair<unsigned, SDValue>, 16>& RegsToPass, - SmallVector<SDValue, 8>& MemOpChains, int& LastFI, + SmallVector<std::pair<unsigned, SDValue>, 16> &RegsToPass, + SmallVector<SDValue, 8> &MemOpChains, int &LastFI, MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg, - const CCValAssign &VA, const ISD::ArgFlagsTy& Flags, + const CCValAssign &VA, const ISD::ArgFlagsTy &Flags, MVT PtrType, bool isLittle) { unsigned LocMemOffset = VA.getLocMemOffset(); unsigned Offset = 0; @@ -2243,10 +2547,10 @@ WriteByValArg(SDValue& ByValChain, SDValue Chain, DebugLoc dl, // Copy Mips64 byVal arg to registers and stack. void static PassByValArg64(SDValue& ByValChain, SDValue Chain, DebugLoc dl, - SmallVector<std::pair<unsigned, SDValue>, 16>& RegsToPass, - SmallVector<SDValue, 8>& MemOpChains, int& LastFI, + SmallVector<std::pair<unsigned, SDValue>, 16> &RegsToPass, + SmallVector<SDValue, 8> &MemOpChains, int &LastFI, MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg, - const CCValAssign &VA, const ISD::ArgFlagsTy& Flags, + const CCValAssign &VA, const ISD::ArgFlagsTy &Flags, EVT PtrTy, bool isLittle) { unsigned ByValSize = Flags.getByValSize(); unsigned Alignment = std::min(Flags.getByValAlign(), (unsigned)8); @@ -2332,14 +2636,20 @@ PassByValArg64(SDValue& ByValChain, SDValue Chain, DebugLoc dl, /// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted. /// TODO: isTailCall. 
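 // As a concrete O32 sketch (ours, not from the patch): for
 //
 //   struct S { int w[6]; };
 //   void f(struct S s);   // s is passed byval
 //
 // the first four words of the byval copy travel in $a0..$a3 and the
 // remaining two words are stored to the caller's outgoing argument area,
 // which is the copying WriteByValArg above performs.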
 SDValue
-MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee,
-                              CallingConv::ID CallConv, bool isVarArg,
-                              bool doesNotRet, bool &isTailCall,
-                              const SmallVectorImpl<ISD::OutputArg> &Outs,
-                              const SmallVectorImpl<SDValue> &OutVals,
-                              const SmallVectorImpl<ISD::InputArg> &Ins,
-                              DebugLoc dl, SelectionDAG &DAG,
+MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                               SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG = CLI.DAG;
+  DebugLoc &dl = CLI.DL;
+  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+  SDValue InChain = CLI.Chain;
+  SDValue Callee = CLI.Callee;
+  SDValue CalleeSave = CLI.Callee;
+  bool &isTailCall = CLI.IsTailCall;
+  CallingConv::ID CallConv = CLI.CallConv;
+  bool isVarArg = CLI.IsVarArg;
+
   // The MIPS target does not yet support tail call optimization.
   isTailCall = false;
@@ -2354,7 +2664,9 @@ MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee,
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                  getTargetMachine(), ArgLocs, *DAG.getContext());

-  if (IsO32)
+  if (CallConv == CallingConv::Fast)
+    CCInfo.AnalyzeCallOperands(Outs, CC_Mips_FastCC);
+  else if (IsO32)
     CCInfo.AnalyzeCallOperands(Outs, CC_MipsO32);
   else if (HasMips64)
     AnalyzeMips64CallOperands(CCInfo, Outs);
@@ -2372,11 +2684,6 @@ MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee,
   Chain = CallSeqStart = DAG.getCALLSEQ_START(InChain, NextStackOffsetVal);
   ByValChain = InChain;

-  // If this is the first call, create a stack frame object that points to
-  // a location to which .cprestore saves $gp.
-  if (IsO32 && IsPIC && MipsFI->globalBaseRegFixed() && !MipsFI->getGPFI())
-    MipsFI->setGPFI(MFI->CreateFixedObject(4, 0, true));
-
   // Get the frame index of the stack frame object that points to the location
   // of dynamically allocated area on the stack.
   int DynAllocFI = MipsFI->getDynAllocFI();
@@ -2384,7 +2691,7 @@ MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee,
   // Update size of the maximum argument space.
   // For O32, a minimum of four words (16 bytes) of argument space is
   // allocated.
-  if (IsO32)
+  if (IsO32 && (CallConv != CallingConv::Fast))
     NextStackOffset = std::max(NextStackOffset, (unsigned)16);

   unsigned MaxCallFrameSize = MipsFI->getMaxCallFrameSize();
@@ -2399,9 +2706,6 @@ MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee,
     NextStackOffset = (NextStackOffset + StackAlignment - 1) /
                       StackAlignment * StackAlignment;

-    if (MipsFI->needGPSaveRestore())
-      MFI->setObjectOffset(MipsFI->getGPFI(), NextStackOffset);
-
     MFI->setObjectOffset(DynAllocFI, NextStackOffset);
   }
@@ -2573,6 +2877,14 @@ MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee,
     Callee = DAG.getRegister(T9Reg, getPointerTy());
   }

+  // Insert a "GP copy globalreg" node before the call to the function.
+  // Lazy-binding stubs require GP to point to the GOT.
+  if (IsPICCall) {
+    unsigned GPReg = IsN64 ? Mips::GP_64 : Mips::GP;
+    EVT Ty = IsN64 ? MVT::i64 : MVT::i32;
+    RegsToPass.push_back(std::make_pair(GPReg, GetGlobalReg(DAG, Ty)));
+  }
+
   // Build a sequence of copy-to-reg nodes chained together with token
   // chain and flag operands which copy the outgoing args into registers.
   // The InFlag is necessary since all emitted instructions must be
@@ -2590,7 +2902,7 @@ MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee,
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   SmallVector<SDValue, 8> Ops;
   Ops.push_back(Chain);
-  Ops.push_back(Callee);
+  Ops.push_back(Subtarget->inMips16Mode() ? CalleeSave : Callee);

   // Add argument registers to the end of the list so that they are
   // known live into the call.
@@ -2598,6 +2910,8 @@ MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee,
     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                   RegsToPass[i].second.getValueType()));

+  if (Subtarget->inMips16Mode())
+    Ops.push_back(Callee);
+
   // Add a register mask operand representing the call-preserved registers.
   const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
@@ -2633,7 +2947,7 @@ MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-		 getTargetMachine(), RVLocs, *DAG.getContext());
+                 getTargetMachine(), RVLocs, *DAG.getContext());

   CCInfo.AnalyzeCallResult(Ins, RetCC_Mips);

@@ -2652,9 +2966,9 @@ MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
 // Formal Arguments Calling Convention Implementation
 //===----------------------------------------------------------------------===//
 static void ReadByValArg(MachineFunction &MF, SDValue Chain, DebugLoc dl,
-                         std::vector<SDValue>& OutChains,
+                         std::vector<SDValue> &OutChains,
                          SelectionDAG &DAG, unsigned NumWords, SDValue FIN,
-                         const CCValAssign &VA, const ISD::ArgFlagsTy& Flags,
+                         const CCValAssign &VA, const ISD::ArgFlagsTy &Flags,
                          const Argument *FuncArg) {
   unsigned LocMem = VA.getLocMemOffset();
   unsigned FirstWord = LocMem / 4;
@@ -2666,7 +2980,7 @@ static void ReadByValArg(MachineFunction &MF, SDValue Chain, DebugLoc dl,
       break;

     unsigned SrcReg = O32IntRegs[CurWord];
-    unsigned Reg = AddLiveIn(MF, SrcReg, Mips::CPURegsRegisterClass);
+    unsigned Reg = AddLiveIn(MF, SrcReg, &Mips::CPURegsRegClass);
     SDValue StorePtr = DAG.getNode(ISD::ADD, dl, MVT::i32, FIN,
                                    DAG.getConstant(i * 4, MVT::i32));
     SDValue Store = DAG.getStore(Chain, dl, DAG.getRegister(Reg, MVT::i32),
@@ -2679,8 +2993,8 @@ static void ReadByValArg(MachineFunction &MF, SDValue Chain, DebugLoc dl,
 // Create frame object on stack and copy registers used for byval passing
 // to it.
 static unsigned
 CopyMips64ByValRegs(MachineFunction &MF, SDValue Chain, DebugLoc dl,
-                    std::vector<SDValue>& OutChains, SelectionDAG &DAG,
-                    const CCValAssign &VA, const ISD::ArgFlagsTy& Flags,
+                    std::vector<SDValue> &OutChains, SelectionDAG &DAG,
+                    const CCValAssign &VA, const ISD::ArgFlagsTy &Flags,
                     MachineFrameInfo *MFI, bool IsRegLoc,
                     SmallVectorImpl<SDValue> &InVals, MipsFunctionInfo *MipsFI,
                     EVT PtrTy, const Argument *FuncArg) {
@@ -2703,7 +3017,7 @@ CopyMips64ByValRegs(MachineFunction &MF, SDValue Chain, DebugLoc dl,
   // Copy arg registers.
   for (unsigned I = 0; (Reg != Mips64IntRegs + 8) && (I < NumRegs);
        ++Reg, ++I) {
-    unsigned VReg = AddLiveIn(MF, *Reg, Mips::CPU64RegsRegisterClass);
+    unsigned VReg = AddLiveIn(MF, *Reg, &Mips::CPU64RegsRegClass);
     SDValue StorePtr = DAG.getNode(ISD::ADD, dl, PtrTy, FIN,
                                    DAG.getConstant(I * 8, PtrTy));
     SDValue Store = DAG.getStore(Chain, dl, DAG.getRegister(VReg, MVT::i64),
@@ -2739,7 +3053,9 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                  getTargetMachine(), ArgLocs, *DAG.getContext());

-  if (IsO32)
+  if (CallConv == CallingConv::Fast)
+    CCInfo.AnalyzeFormalArguments(Ins, CC_Mips_FastCC);
+  else if (IsO32)
     CCInfo.AnalyzeFormalArguments(Ins, CC_MipsO32);
   else
     CCInfo.AnalyzeFormalArguments(Ins, CC_Mips);
@@ -2779,13 +3095,13 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
       const TargetRegisterClass *RC;

       if (RegVT == MVT::i32)
-        RC = Mips::CPURegsRegisterClass;
+        RC = &Mips::CPURegsRegClass;
       else if (RegVT == MVT::i64)
-        RC = Mips::CPU64RegsRegisterClass;
+        RC = &Mips::CPU64RegsRegClass;
       else if (RegVT == MVT::f32)
-        RC = Mips::FGR32RegisterClass;
+        RC = &Mips::FGR32RegClass;
       else if (RegVT == MVT::f64)
-        RC = HasMips64 ? Mips::FGR64RegisterClass : Mips::AFGR64RegisterClass;
+        RC = HasMips64 ? &Mips::FGR64RegClass : &Mips::AFGR64RegClass;
       else
         llvm_unreachable("RegVT not supported by FormalArguments Lowering");
@@ -2859,8 +3175,9 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
     const uint16_t *ArgRegs = IsO32 ? O32IntRegs : Mips64IntRegs;
     unsigned Idx = CCInfo.getFirstUnallocated(ArgRegs, NumOfRegs);
     int FirstRegSlotOffset = IsO32 ? 0 : -64; // Offset of $a0's slot.
-    const TargetRegisterClass *RC
-      = IsO32 ? Mips::CPURegsRegisterClass : Mips::CPU64RegsRegisterClass;
+    const TargetRegisterClass *RC = IsO32 ?
+      (const TargetRegisterClass*)&Mips::CPURegsRegClass :
+      (const TargetRegisterClass*)&Mips::CPU64RegsRegClass;
     unsigned RegSize = RC->getSize();
     int RegSlotOffset = FirstRegSlotOffset + Idx * RegSize;

@@ -2924,7 +3241,7 @@ MipsTargetLowering::LowerReturn(SDValue Chain,
   // CCState - Info about the registers and stack slot.
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-		 getTargetMachine(), RVLocs, *DAG.getContext());
+                 getTargetMachine(), RVLocs, *DAG.getContext());

   // Analyze return values.
   CCInfo.AnalyzeReturn(Outs, RetCC_Mips);

@@ -2970,11 +3287,10 @@ MipsTargetLowering::LowerReturn(SDValue Chain,
   // Return on Mips is always a "jr $ra"
   if (Flag.getNode())
-    return DAG.getNode(MipsISD::Ret, dl, MVT::Other,
-                       Chain, DAG.getRegister(Mips::RA, MVT::i32), Flag);
-  else // Return Void
-    return DAG.getNode(MipsISD::Ret, dl, MVT::Other,
-                       Chain, DAG.getRegister(Mips::RA, MVT::i32));
+    return DAG.getNode(MipsISD::Ret, dl, MVT::Other, Chain, Flag);
+
+  // Return Void
+  return DAG.getNode(MipsISD::Ret, dl, MVT::Other, Chain);
 }

 //===----------------------------------------------------------------------===//
@@ -2993,13 +3309,19 @@ getConstraintType(const std::string &Constraint) const
   //       unless generating MIPS16 code.
   // 'y' : Equivalent to r; retained for
   //       backwards compatibility.
   // 'f' : Floating Point registers.
+  // 'c' : A register suitable for use in an indirect
+  //       jump. This will always be $25 for -mabicalls.
+  // 'l' : The lo register. 1 word storage.
+  // 'x' : The hilo register pair. Double word storage.
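+  // A usage sketch (ours, not from the patch; these letters match the GCC
+  // MIPS inline-asm constraints):
+  //
+  //   unsigned LoBits(unsigned A, unsigned B) {
+  //     unsigned Lo;
+  //     __asm__("multu %1, %2" : "=l"(Lo) : "r"(A), "r"(B));
+  //     return Lo; // low word of the 64-bit product, read from $lo
+  //   }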
   if (Constraint.size() == 1) {
     switch (Constraint[0]) {
       default: break;
       case 'd':
       case 'y':
       case 'f':
+      case 'c':
+      case 'l':
+      case 'x':
         return C_RegisterClass;
     }
   }
@@ -3033,6 +3355,22 @@ MipsTargetLowering::getSingleConstraintMatchWeight(
     if (type->isFloatTy())
       weight = CW_Register;
     break;
+  case 'c': // $25 for indirect jumps
+  case 'l': // lo register
+  case 'x': // hilo register pair
+    if (type->isIntegerTy())
+      weight = CW_SpecificReg;
+    break;
+  case 'I': // signed 16 bit immediate
+  case 'J': // integer zero
+  case 'K': // unsigned 16 bit immediate
+  case 'L': // signed 32 bit immediate where lower 16 bits are 0
+  case 'N': // immediate in the range of -65535 to -1 (inclusive)
+  case 'O': // signed 15 bit immediate (+- 16383)
+  case 'P': // immediate in the range of 1 to 65535 (inclusive)
+    if (isa<ConstantInt>(CallOperandVal))
+      weight = CW_Constant;
+    break;
   }
   return weight;
 }
@@ -3048,30 +3386,152 @@ getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const
     case 'd': // Address register. Same as 'r' unless generating MIPS16 code.
     case 'y': // Same as 'r'. Exists for compatibility.
     case 'r':
-      if (VT == MVT::i32)
-        return std::make_pair(0U, Mips::CPURegsRegisterClass);
-      assert(VT == MVT::i64 && "Unexpected type.");
-      return std::make_pair(0U, Mips::CPU64RegsRegisterClass);
+      if (VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8)
+        return std::make_pair(0U, &Mips::CPURegsRegClass);
+      if (VT == MVT::i64 && !HasMips64)
+        return std::make_pair(0U, &Mips::CPURegsRegClass);
+      if (VT == MVT::i64 && HasMips64)
+        return std::make_pair(0U, &Mips::CPU64RegsRegClass);
+      // This will generate an error message.
+      return std::make_pair(0u, static_cast<const TargetRegisterClass*>(0));
     case 'f':
       if (VT == MVT::f32)
-        return std::make_pair(0U, Mips::FGR32RegisterClass);
+        return std::make_pair(0U, &Mips::FGR32RegClass);
       if ((VT == MVT::f64) && (!Subtarget->isSingleFloat())) {
         if (Subtarget->isFP64bit())
-          return std::make_pair(0U, Mips::FGR64RegisterClass);
-        else
-          return std::make_pair(0U, Mips::AFGR64RegisterClass);
+          return std::make_pair(0U, &Mips::FGR64RegClass);
+        return std::make_pair(0U, &Mips::AFGR64RegClass);
       }
+      break;
+    case 'c': // register suitable for indirect jump
+      if (VT == MVT::i32)
+        return std::make_pair((unsigned)Mips::T9, &Mips::CPURegsRegClass);
+      assert(VT == MVT::i64 && "Unexpected type.");
+      return std::make_pair((unsigned)Mips::T9_64, &Mips::CPU64RegsRegClass);
+    case 'l': // the lo register
+      if (VT == MVT::i32)
+        return std::make_pair((unsigned)Mips::LO, &Mips::HILORegClass);
+      return std::make_pair((unsigned)Mips::LO64, &Mips::HILO64RegClass);
+    case 'x': // the hilo register pair
+      // FIXME: Not triggering the use of both hi and lo.
+      // This will generate an error message.
+      return std::make_pair(0u, static_cast<const TargetRegisterClass*>(0));
    }
  }
  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
}

+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector. If it is invalid, don't add anything to Ops.
+void MipsTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+                                                      std::string &Constraint,
+                                                      std::vector<SDValue> &Ops,
+                                                      SelectionDAG &DAG) const {
+  SDValue Result(0, 0);
+
+  // Only support length 1 constraints for now.
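+  // For instance (illustrative only), an asm statement such as
+  //
+  //   __asm__("addiu %0, %1, %2" : "=r"(Dst) : "r"(Src), "I"(4));
+  //
+  // arrives here with Constraint == "I" and Op holding the constant 4, which
+  // the 'I' case below accepts as a signed 16-bit immediate.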
+ if (Constraint.length() > 1) return; + + char ConstraintLetter = Constraint[0]; + switch (ConstraintLetter) { + default: break; // This will fall through to the generic implementation + case 'I': // Signed 16 bit constant + // If this fails, the parent routine will give an error + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + EVT Type = Op.getValueType(); + int64_t Val = C->getSExtValue(); + if (isInt<16>(Val)) { + Result = DAG.getTargetConstant(Val, Type); + break; + } + } + return; + case 'J': // integer zero + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + EVT Type = Op.getValueType(); + int64_t Val = C->getZExtValue(); + if (Val == 0) { + Result = DAG.getTargetConstant(0, Type); + break; + } + } + return; + case 'K': // unsigned 16 bit immediate + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + EVT Type = Op.getValueType(); + uint64_t Val = (uint64_t)C->getZExtValue(); + if (isUInt<16>(Val)) { + Result = DAG.getTargetConstant(Val, Type); + break; + } + } + return; + case 'L': // signed 32 bit immediate where lower 16 bits are 0 + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + EVT Type = Op.getValueType(); + int64_t Val = C->getSExtValue(); + if ((isInt<32>(Val)) && ((Val & 0xffff) == 0)){ + Result = DAG.getTargetConstant(Val, Type); + break; + } + } + return; + case 'N': // immediate in the range of -65535 to -1 (inclusive) + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + EVT Type = Op.getValueType(); + int64_t Val = C->getSExtValue(); + if ((Val >= -65535) && (Val <= -1)) { + Result = DAG.getTargetConstant(Val, Type); + break; + } + } + return; + case 'O': // signed 15 bit immediate + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + EVT Type = Op.getValueType(); + int64_t Val = C->getSExtValue(); + if ((isInt<15>(Val))) { + Result = DAG.getTargetConstant(Val, Type); + break; + } + } + return; + case 'P': // immediate in the range of 1 to 65535 (inclusive) + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + EVT Type = Op.getValueType(); + int64_t Val = C->getSExtValue(); + if ((Val <= 65535) && (Val >= 1)) { + Result = DAG.getTargetConstant(Val, Type); + break; + } + } + return; + } + + if (Result.getNode()) { + Ops.push_back(Result); + return; + } + + TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); +} + bool MipsTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // The Mips target isn't yet aware of offsets. return false; } +EVT MipsTargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, + unsigned SrcAlign, bool IsZeroVal, + bool MemcpyStrSrc, + MachineFunction &MF) const { + if (Subtarget->hasMips64()) + return MVT::i64; + + return MVT::i32; +} + bool MipsTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { if (VT != MVT::f32 && VT != MVT::f64) return false; diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h index c36f40f..edab03c 100644 --- a/lib/Target/Mips/MipsISelLowering.h +++ b/lib/Target/Mips/MipsISelLowering.h @@ -79,7 +79,17 @@ namespace llvm { Sync, Ext, - Ins + Ins, + + // Load/Store Left/Right nodes. 
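+    // These model the MIPS unaligned-access instruction pairs. As an
+    // illustration (ours, little-endian), an unaligned word load of $t0
+    // from the address in $t1 is emitted as
+    //   lwl $t0, 3($t1)   # fills the high-order bytes
+    //   lwr $t0, 0($t1)   # merges in the low-order bytes
+    // matching the LowerLOAD expansion in MipsISelLowering.cpp.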
+ LWL = ISD::FIRST_TARGET_MEMORY_OPCODE, + LWR, + SWL, + SWR, + LDL, + LDR, + SDL, + SDR }; } @@ -128,13 +138,20 @@ namespace llvm { SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG& DAG) const; SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const; + SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG& DAG) const; + SDValue LowerShiftRightParts(SDValue Op, SelectionDAG& DAG, + bool IsSRA) const; + SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; virtual SDValue LowerFormalArguments(SDValue Chain, @@ -144,13 +161,7 @@ namespace llvm { SmallVectorImpl<SDValue> &InVals) const; virtual SDValue - LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, - bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const; virtual SDValue @@ -176,8 +187,22 @@ namespace llvm { getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const; + /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops + /// vector. If it is invalid, don't add anything to Ops. If hasMemory is + /// true it means one of the asm constraint of the inline asm instruction + /// being processed is 'm'. + virtual void LowerAsmOperandForConstraint(SDValue Op, + std::string &Constraint, + std::vector<SDValue> &Ops, + SelectionDAG &DAG) const; + virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const; + virtual EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, + unsigned SrcAlign, bool IsZeroVal, + bool MemcpyStrSrc, + MachineFunction &MF) const; + /// isFPImmLegal - Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td index 14d8f1e..9654b86 100644 --- a/lib/Target/Mips/MipsInstrFPU.td +++ b/lib/Target/Mips/MipsInstrFPU.td @@ -54,10 +54,14 @@ let PrintMethod = "printFCCOperand", DecoderMethod = "DecodeCondCode" in // Feature predicates. 
//===----------------------------------------------------------------------===// -def IsFP64bit : Predicate<"Subtarget.isFP64bit()">, AssemblerPredicate<"FeatureFP64Bit">; -def NotFP64bit : Predicate<"!Subtarget.isFP64bit()">, AssemblerPredicate<"!FeatureFP64Bit">; -def IsSingleFloat : Predicate<"Subtarget.isSingleFloat()">, AssemblerPredicate<"FeatureSingleFloat">; -def IsNotSingleFloat : Predicate<"!Subtarget.isSingleFloat()">, AssemblerPredicate<"!FeatureSingleFloat">; +def IsFP64bit : Predicate<"Subtarget.isFP64bit()">, + AssemblerPredicate<"FeatureFP64Bit">; +def NotFP64bit : Predicate<"!Subtarget.isFP64bit()">, + AssemblerPredicate<"!FeatureFP64Bit">; +def IsSingleFloat : Predicate<"Subtarget.isSingleFloat()">, + AssemblerPredicate<"FeatureSingleFloat">; +def IsNotSingleFloat : Predicate<"!Subtarget.isSingleFloat()">, + AssemblerPredicate<"!FeatureSingleFloat">; // FP immediate patterns. def fpimm0 : PatLeaf<(fpimm), [{ @@ -117,15 +121,15 @@ class FPIdxStore<bits<6> funct, string opstr, RegisterClass DRC, multiclass FFR1_W_M<bits<6> funct, string opstr> { def _S : FFR1<funct, 16, opstr, "w.s", FGR32, FGR32>; def _D32 : FFR1<funct, 17, opstr, "w.d", FGR32, AFGR64>, - Requires<[NotFP64bit]>; + Requires<[NotFP64bit, HasStandardEncoding]>; def _D64 : FFR1<funct, 17, opstr, "w.d", FGR32, FGR64>, - Requires<[IsFP64bit]> { + Requires<[IsFP64bit, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; } } // Instructions that convert an FP value to 64-bit fixed point. -let Predicates = [IsFP64bit], DecoderNamespace = "Mips64" in +let Predicates = [IsFP64bit, HasStandardEncoding], DecoderNamespace = "Mips64" in multiclass FFR1_L_M<bits<6> funct, string opstr> { def _S : FFR1<funct, 16, opstr, "l.s", FGR64, FGR32>; def _D64 : FFR1<funct, 17, opstr, "l.d", FGR64, FGR64>; @@ -135,9 +139,9 @@ multiclass FFR1_L_M<bits<6> funct, string opstr> { multiclass FFR1P_M<bits<6> funct, string opstr, SDNode OpNode> { def _S : FFR1P<funct, 16, opstr, "s", FGR32, FGR32, OpNode>; def _D32 : FFR1P<funct, 17, opstr, "d", AFGR64, AFGR64, OpNode>, - Requires<[NotFP64bit]>; + Requires<[NotFP64bit, HasStandardEncoding]>; def _D64 : FFR1P<funct, 17, opstr, "d", FGR64, FGR64, OpNode>, - Requires<[IsFP64bit]> { + Requires<[IsFP64bit, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; } } @@ -146,9 +150,9 @@ multiclass FFR2P_M<bits<6> funct, string opstr, SDNode OpNode, bit isComm = 0> { let isCommutable = isComm in { def _S : FFR2P<funct, 16, opstr, "s", FGR32, OpNode>; def _D32 : FFR2P<funct, 17, opstr, "d", AFGR64, OpNode>, - Requires<[NotFP64bit]>; + Requires<[NotFP64bit, HasStandardEncoding]>; def _D64 : FFR2P<funct, 17, opstr, "d", FGR64, OpNode>, - Requires<[IsFP64bit]> { + Requires<[IsFP64bit, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; } } @@ -185,13 +189,13 @@ def CVT_S_W : FFR1<0x20, 20, "cvt", "s.w", FGR32, FGR32>; def CVT_L_S : FFR1<0x25, 16, "cvt", "l.s", FGR64, FGR32>; def CVT_L_D64: FFR1<0x25, 17, "cvt", "l.d", FGR64, FGR64>; -let Predicates = [NotFP64bit] in { +let Predicates = [NotFP64bit, HasStandardEncoding] in { def CVT_S_D32 : FFR1<0x20, 17, "cvt", "s.d", FGR32, AFGR64>; def CVT_D32_W : FFR1<0x21, 20, "cvt", "d.w", AFGR64, FGR32>; def CVT_D32_S : FFR1<0x21, 16, "cvt", "d.s", AFGR64, FGR32>; } -let Predicates = [IsFP64bit], DecoderNamespace = "Mips64" in { +let Predicates = [IsFP64bit, HasStandardEncoding], DecoderNamespace = "Mips64" in { def CVT_S_D64 : FFR1<0x20, 17, "cvt", "s.d", FGR32, FGR64>; def CVT_S_L : FFR1<0x20, 21, "cvt", "s.l", FGR32, FGR64>; def CVT_D64_W : 
FFR1<0x21, 20, "cvt", "d.w", FGR64, FGR32>; @@ -199,7 +203,7 @@ let Predicates = [IsFP64bit], DecoderNamespace = "Mips64" in { def CVT_D64_L : FFR1<0x21, 21, "cvt", "d.l", FGR64, FGR64>; } -let Predicates = [NoNaNsFPMath] in { +let Predicates = [NoNaNsFPMath, HasStandardEncoding] in { defm FABS : FFR1P_M<0x5, "abs", fabs>; defm FNEG : FFR1P_M<0x7, "neg", fneg>; } @@ -242,14 +246,14 @@ def DMTC1 : FFRGPR<0x05, (outs FGR64:$fs), (ins CPU64Regs:$rt), def FMOV_S : FFR1<0x6, 16, "mov", "s", FGR32, FGR32>; def FMOV_D32 : FFR1<0x6, 17, "mov", "d", AFGR64, AFGR64>, - Requires<[NotFP64bit]>; + Requires<[NotFP64bit, HasStandardEncoding]>; def FMOV_D64 : FFR1<0x6, 17, "mov", "d", FGR64, FGR64>, - Requires<[IsFP64bit]> { + Requires<[IsFP64bit, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; } /// Floating Point Memory Instructions -let Predicates = [IsN64], DecoderNamespace = "Mips64" in { +let Predicates = [IsN64, HasStandardEncoding], DecoderNamespace = "Mips64" in { def LWC1_P8 : FPLoad<0x31, "lwc1", FGR32, mem64>; def SWC1_P8 : FPStore<0x39, "swc1", FGR32, mem64>; def LDC164_P8 : FPLoad<0x35, "ldc1", FGR64, mem64> { @@ -260,41 +264,42 @@ let Predicates = [IsN64], DecoderNamespace = "Mips64" in { } } -let Predicates = [NotN64] in { +let Predicates = [NotN64, HasStandardEncoding] in { def LWC1 : FPLoad<0x31, "lwc1", FGR32, mem>; def SWC1 : FPStore<0x39, "swc1", FGR32, mem>; } -let Predicates = [NotN64, HasMips64], DecoderNamespace = "Mips64" in { +let Predicates = [NotN64, HasMips64, HasStandardEncoding], + DecoderNamespace = "Mips64" in { def LDC164 : FPLoad<0x35, "ldc1", FGR64, mem>; def SDC164 : FPStore<0x3d, "sdc1", FGR64, mem>; } -let Predicates = [NotN64, NotMips64] in { +let Predicates = [NotN64, NotMips64, HasStandardEncoding] in { def LDC1 : FPLoad<0x35, "ldc1", AFGR64, mem>; def SDC1 : FPStore<0x3d, "sdc1", AFGR64, mem>; } // Indexed loads and stores. 
-let Predicates = [HasMips32r2Or64] in { +let Predicates = [HasMips32r2Or64, HasStandardEncoding] in { def LWXC1 : FPIdxLoad<0x0, "lwxc1", FGR32, CPURegs, load_a>; def LUXC1 : FPIdxLoad<0x5, "luxc1", FGR32, CPURegs, load_u>; def SWXC1 : FPIdxStore<0x8, "swxc1", FGR32, CPURegs, store_a>; def SUXC1 : FPIdxStore<0xd, "suxc1", FGR32, CPURegs, store_u>; } -let Predicates = [HasMips32r2, NotMips64] in { +let Predicates = [HasMips32r2, NotMips64, HasStandardEncoding] in { def LDXC1 : FPIdxLoad<0x1, "ldxc1", AFGR64, CPURegs, load_a>; def SDXC1 : FPIdxStore<0x9, "sdxc1", AFGR64, CPURegs, store_a>; } -let Predicates = [HasMips64, NotN64], DecoderNamespace="Mips64" in { +let Predicates = [HasMips64, NotN64, HasStandardEncoding], DecoderNamespace="Mips64" in { def LDXC164 : FPIdxLoad<0x1, "ldxc1", FGR64, CPURegs, load_a>; def SDXC164 : FPIdxStore<0x9, "sdxc1", FGR64, CPURegs, store_a>; } // n64 -let Predicates = [IsN64], isCodeGenOnly=1 in { +let Predicates = [IsN64, HasStandardEncoding], isCodeGenOnly=1 in { def LWXC1_P8 : FPIdxLoad<0x0, "lwxc1", FGR32, CPU64Regs, load_a>; def LUXC1_P8 : FPIdxLoad<0x5, "luxc1", FGR32, CPU64Regs, load_u>; def LDXC164_P8 : FPIdxLoad<0x1, "ldxc1", FGR64, CPU64Regs, load_a>; @@ -309,32 +314,33 @@ defm FDIV : FFR2P_M<0x03, "div", fdiv>; defm FMUL : FFR2P_M<0x02, "mul", fmul, 1>; defm FSUB : FFR2P_M<0x01, "sub", fsub>; -let Predicates = [HasMips32r2] in { +let Predicates = [HasMips32r2, HasStandardEncoding] in { def MADD_S : FMADDSUB<0x4, 0, "madd", "s", fadd, FGR32>; def MSUB_S : FMADDSUB<0x5, 0, "msub", "s", fsub, FGR32>; } -let Predicates = [HasMips32r2, NoNaNsFPMath] in { +let Predicates = [HasMips32r2, NoNaNsFPMath, HasStandardEncoding] in { def NMADD_S : FNMADDSUB<0x6, 0, "nmadd", "s", fadd, FGR32>; def NMSUB_S : FNMADDSUB<0x7, 0, "nmsub", "s", fsub, FGR32>; } -let Predicates = [HasMips32r2, NotFP64bit] in { +let Predicates = [HasMips32r2, NotFP64bit, HasStandardEncoding] in { def MADD_D32 : FMADDSUB<0x4, 1, "madd", "d", fadd, AFGR64>; def MSUB_D32 : FMADDSUB<0x5, 1, "msub", "d", fsub, AFGR64>; } -let Predicates = [HasMips32r2, NotFP64bit, NoNaNsFPMath] in { +let Predicates = [HasMips32r2, NotFP64bit, NoNaNsFPMath, HasStandardEncoding] in { def NMADD_D32 : FNMADDSUB<0x6, 1, "nmadd", "d", fadd, AFGR64>; def NMSUB_D32 : FNMADDSUB<0x7, 1, "nmsub", "d", fsub, AFGR64>; } -let Predicates = [HasMips32r2, IsFP64bit], isCodeGenOnly=1 in { +let Predicates = [HasMips32r2, IsFP64bit, HasStandardEncoding], isCodeGenOnly=1 in { def MADD_D64 : FMADDSUB<0x4, 1, "madd", "d", fadd, FGR64>; def MSUB_D64 : FMADDSUB<0x5, 1, "msub", "d", fsub, FGR64>; } -let Predicates = [HasMips32r2, IsFP64bit, NoNaNsFPMath], isCodeGenOnly=1 in { +let Predicates = [HasMips32r2, IsFP64bit, NoNaNsFPMath, HasStandardEncoding], + isCodeGenOnly=1 in { def NMADD_D64 : FNMADDSUB<0x6, 1, "nmadd", "d", fadd, FGR64>; def NMSUB_D64 : FNMADDSUB<0x7, 1, "nmsub", "d", fsub, FGR64>; } @@ -391,8 +397,10 @@ class FCMP<bits<5> fmt, RegisterClass RC, string typestr> : /// Floating Point Compare let Defs=[FCR31] in { def FCMP_S32 : FCMP<0x10, FGR32, "s">; - def FCMP_D32 : FCMP<0x11, AFGR64, "d">, Requires<[NotFP64bit]>; - def FCMP_D64 : FCMP<0x11, FGR64, "d">, Requires<[IsFP64bit]> { + def FCMP_D32 : FCMP<0x11, AFGR64, "d">, + Requires<[NotFP64bit, HasStandardEncoding]>; + def FCMP_D64 : FCMP<0x11, FGR64, "d">, + Requires<[IsFP64bit, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; } } @@ -423,46 +431,52 @@ def ExtractElementF64 : //===----------------------------------------------------------------------===// // 
Floating Point Patterns //===----------------------------------------------------------------------===// -def : Pat<(f32 fpimm0), (MTC1 ZERO)>; -def : Pat<(f32 fpimm0neg), (FNEG_S (MTC1 ZERO))>; +def : MipsPat<(f32 fpimm0), (MTC1 ZERO)>; +def : MipsPat<(f32 fpimm0neg), (FNEG_S (MTC1 ZERO))>; -def : Pat<(f32 (sint_to_fp CPURegs:$src)), (CVT_S_W (MTC1 CPURegs:$src))>; -def : Pat<(i32 (fp_to_sint FGR32:$src)), (MFC1 (TRUNC_W_S FGR32:$src))>; +def : MipsPat<(f32 (sint_to_fp CPURegs:$src)), (CVT_S_W (MTC1 CPURegs:$src))>; +def : MipsPat<(i32 (fp_to_sint FGR32:$src)), (MFC1 (TRUNC_W_S FGR32:$src))>; -let Predicates = [NotFP64bit] in { - def : Pat<(f64 (sint_to_fp CPURegs:$src)), (CVT_D32_W (MTC1 CPURegs:$src))>; - def : Pat<(i32 (fp_to_sint AFGR64:$src)), (MFC1 (TRUNC_W_D32 AFGR64:$src))>; - def : Pat<(f32 (fround AFGR64:$src)), (CVT_S_D32 AFGR64:$src)>; - def : Pat<(f64 (fextend FGR32:$src)), (CVT_D32_S FGR32:$src)>; +let Predicates = [NotFP64bit, HasStandardEncoding] in { + def : MipsPat<(f64 (sint_to_fp CPURegs:$src)), + (CVT_D32_W (MTC1 CPURegs:$src))>; + def : MipsPat<(i32 (fp_to_sint AFGR64:$src)), + (MFC1 (TRUNC_W_D32 AFGR64:$src))>; + def : MipsPat<(f32 (fround AFGR64:$src)), (CVT_S_D32 AFGR64:$src)>; + def : MipsPat<(f64 (fextend FGR32:$src)), (CVT_D32_S FGR32:$src)>; } -let Predicates = [IsFP64bit] in { - def : Pat<(f64 fpimm0), (DMTC1 ZERO_64)>; - def : Pat<(f64 fpimm0neg), (FNEG_D64 (DMTC1 ZERO_64))>; +let Predicates = [IsFP64bit, HasStandardEncoding] in { + def : MipsPat<(f64 fpimm0), (DMTC1 ZERO_64)>; + def : MipsPat<(f64 fpimm0neg), (FNEG_D64 (DMTC1 ZERO_64))>; - def : Pat<(f64 (sint_to_fp CPURegs:$src)), (CVT_D64_W (MTC1 CPURegs:$src))>; - def : Pat<(f32 (sint_to_fp CPU64Regs:$src)), - (CVT_S_L (DMTC1 CPU64Regs:$src))>; - def : Pat<(f64 (sint_to_fp CPU64Regs:$src)), - (CVT_D64_L (DMTC1 CPU64Regs:$src))>; + def : MipsPat<(f64 (sint_to_fp CPURegs:$src)), + (CVT_D64_W (MTC1 CPURegs:$src))>; + def : MipsPat<(f32 (sint_to_fp CPU64Regs:$src)), + (CVT_S_L (DMTC1 CPU64Regs:$src))>; + def : MipsPat<(f64 (sint_to_fp CPU64Regs:$src)), + (CVT_D64_L (DMTC1 CPU64Regs:$src))>; - def : Pat<(i32 (fp_to_sint FGR64:$src)), (MFC1 (TRUNC_W_D64 FGR64:$src))>; - def : Pat<(i64 (fp_to_sint FGR32:$src)), (DMFC1 (TRUNC_L_S FGR32:$src))>; - def : Pat<(i64 (fp_to_sint FGR64:$src)), (DMFC1 (TRUNC_L_D64 FGR64:$src))>; + def : MipsPat<(i32 (fp_to_sint FGR64:$src)), + (MFC1 (TRUNC_W_D64 FGR64:$src))>; + def : MipsPat<(i64 (fp_to_sint FGR32:$src)), (DMFC1 (TRUNC_L_S FGR32:$src))>; + def : MipsPat<(i64 (fp_to_sint FGR64:$src)), + (DMFC1 (TRUNC_L_D64 FGR64:$src))>; - def : Pat<(f32 (fround FGR64:$src)), (CVT_S_D64 FGR64:$src)>; - def : Pat<(f64 (fextend FGR32:$src)), (CVT_D64_S FGR32:$src)>; + def : MipsPat<(f32 (fround FGR64:$src)), (CVT_S_D64 FGR64:$src)>; + def : MipsPat<(f64 (fextend FGR32:$src)), (CVT_D64_S FGR32:$src)>; } // Patterns for unaligned floating point loads and stores. 
-let Predicates = [HasMips32r2Or64, NotN64] in { - def : Pat<(f32 (load_u CPURegs:$addr)), (LUXC1 CPURegs:$addr, ZERO)>; - def : Pat<(store_u FGR32:$src, CPURegs:$addr), - (SUXC1 FGR32:$src, CPURegs:$addr, ZERO)>; +let Predicates = [HasMips32r2Or64, NotN64, HasStandardEncoding] in { + def : MipsPat<(f32 (load_u CPURegs:$addr)), (LUXC1 CPURegs:$addr, ZERO)>; + def : MipsPat<(store_u FGR32:$src, CPURegs:$addr), + (SUXC1 FGR32:$src, CPURegs:$addr, ZERO)>; } -let Predicates = [IsN64] in { - def : Pat<(f32 (load_u CPU64Regs:$addr)), (LUXC1_P8 CPU64Regs:$addr, ZERO_64)>; - def : Pat<(store_u FGR32:$src, CPU64Regs:$addr), - (SUXC1_P8 FGR32:$src, CPU64Regs:$addr, ZERO_64)>; +let Predicates = [IsN64, HasStandardEncoding] in { + def : MipsPat<(f32 (load_u CPU64Regs:$addr)), + (LUXC1_P8 CPU64Regs:$addr, ZERO_64)>; + def : MipsPat<(store_u FGR32:$src, CPU64Regs:$addr), + (SUXC1_P8 FGR32:$src, CPU64Regs:$addr, ZERO_64)>; } diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td index 841eba0..15a77fb 100644 --- a/lib/Target/Mips/MipsInstrFormats.td +++ b/lib/Target/Mips/MipsInstrFormats.td @@ -70,6 +70,9 @@ class MipsInst<dag outs, dag ins, string asmstr, list<dag> pattern, let DecoderNamespace = "Mips"; field bits<32> SoftFail = 0; + + let Predicates = [HasStandardEncoding]; + } // Mips Pseudo Instructions Format diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp index a3a18bf..458e4f7 100644 --- a/lib/Target/Mips/MipsInstrInfo.cpp +++ b/lib/Target/Mips/MipsInstrInfo.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "MipsAnalyzeImmediate.h" #include "MipsInstrInfo.h" #include "MipsTargetMachine.h" #include "MipsMachineFunction.h" @@ -29,6 +30,7 @@ using namespace llvm; MipsInstrInfo::MipsInstrInfo(MipsTargetMachine &tm) : MipsGenInstrInfo(Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP), TM(tm), IsN64(TM.getSubtarget<MipsSubtarget>().isABI_N64()), + InMips16Mode(TM.getSubtarget<MipsSubtarget>().inMips16Mode()), RI(*TM.getSubtargetImpl(), *this), UncondBrOpc(TM.getRelocationModel() == Reloc::PIC_ ? Mips::B : Mips::J) {} @@ -106,8 +108,13 @@ copyPhysReg(MachineBasicBlock &MBB, unsigned Opc = 0, ZeroReg = 0; if (Mips::CPURegsRegClass.contains(DestReg)) { // Copy to CPU Reg. - if (Mips::CPURegsRegClass.contains(SrcReg)) - Opc = Mips::ADDu, ZeroReg = Mips::ZERO; + if (Mips::CPURegsRegClass.contains(SrcReg)) { + if (InMips16Mode) + Opc=Mips::Mov32R16; + else { + Opc = Mips::ADDu, ZeroReg = Mips::ZERO; + } + } else if (Mips::CCRRegClass.contains(SrcReg)) Opc = Mips::CFC1; else if (Mips::FGR32RegClass.contains(SrcReg)) @@ -189,15 +196,15 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned Opc = 0; - if (RC == Mips::CPURegsRegisterClass) + if (Mips::CPURegsRegClass.hasSubClassEq(RC)) Opc = IsN64 ? Mips::SW_P8 : Mips::SW; - else if (RC == Mips::CPU64RegsRegisterClass) + else if (Mips::CPU64RegsRegClass.hasSubClassEq(RC)) Opc = IsN64 ? Mips::SD_P8 : Mips::SD; - else if (RC == Mips::FGR32RegisterClass) + else if (Mips::FGR32RegClass.hasSubClassEq(RC)) Opc = IsN64 ? Mips::SWC1_P8 : Mips::SWC1; - else if (RC == Mips::AFGR64RegisterClass) + else if (Mips::AFGR64RegClass.hasSubClassEq(RC)) Opc = Mips::SDC1; - else if (RC == Mips::FGR64RegisterClass) + else if (Mips::FGR64RegClass.hasSubClassEq(RC)) Opc = IsN64 ? 
Mips::SDC164_P8 : Mips::SDC164; assert(Opc && "Register class not handled!"); @@ -216,15 +223,15 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOLoad); unsigned Opc = 0; - if (RC == Mips::CPURegsRegisterClass) + if (Mips::CPURegsRegClass.hasSubClassEq(RC)) Opc = IsN64 ? Mips::LW_P8 : Mips::LW; - else if (RC == Mips::CPU64RegsRegisterClass) + else if (Mips::CPU64RegsRegClass.hasSubClassEq(RC)) Opc = IsN64 ? Mips::LD_P8 : Mips::LD; - else if (RC == Mips::FGR32RegisterClass) + else if (Mips::FGR32RegClass.hasSubClassEq(RC)) Opc = IsN64 ? Mips::LWC1_P8 : Mips::LWC1; - else if (RC == Mips::AFGR64RegisterClass) + else if (Mips::AFGR64RegClass.hasSubClassEq(RC)) Opc = Mips::LDC1; - else if (RC == Mips::FGR64RegisterClass) + else if (Mips::FGR64RegClass.hasSubClassEq(RC)) Opc = IsN64 ? Mips::LDC164_P8 : Mips::LDC164; assert(Opc && "Register class not handled!"); @@ -232,6 +239,76 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, .addMemOperand(MMO); } +void MipsInstrInfo::ExpandRetRA(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned Opc) const { + BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(Opc)) + .addReg(Mips::RA); +} + +void MipsInstrInfo::ExpandRetRA16(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned Opc) const { + BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(Opc)); +} + +void MipsInstrInfo::ExpandExtractElementF64(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + const TargetInstrInfo *TII = TM.getInstrInfo(); + unsigned DstReg = I->getOperand(0).getReg(); + unsigned SrcReg = I->getOperand(1).getReg(); + unsigned N = I->getOperand(2).getImm(); + const MCInstrDesc& Mfc1Tdd = TII->get(Mips::MFC1); + DebugLoc dl = I->getDebugLoc(); + + assert(N < 2 && "Invalid immediate"); + unsigned SubIdx = N ? 
Mips::sub_fpodd : Mips::sub_fpeven; + unsigned SubReg = TM.getRegisterInfo()->getSubReg(SrcReg, SubIdx); + + BuildMI(MBB, I, dl, Mfc1Tdd, DstReg).addReg(SubReg); +} + +void MipsInstrInfo::ExpandBuildPairF64(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + const TargetInstrInfo *TII = TM.getInstrInfo(); + unsigned DstReg = I->getOperand(0).getReg(); + unsigned LoReg = I->getOperand(1).getReg(), HiReg = I->getOperand(2).getReg(); + const MCInstrDesc& Mtc1Tdd = TII->get(Mips::MTC1); + DebugLoc dl = I->getDebugLoc(); + const TargetRegisterInfo *TRI = TM.getRegisterInfo(); + + // mtc1 Lo, $fp + // mtc1 Hi, $fp + 1 + BuildMI(MBB, I, dl, Mtc1Tdd, TRI->getSubReg(DstReg, Mips::sub_fpeven)) + .addReg(LoReg); + BuildMI(MBB, I, dl, Mtc1Tdd, TRI->getSubReg(DstReg, Mips::sub_fpodd)) + .addReg(HiReg); +} + +bool MipsInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { + MachineBasicBlock &MBB = *MI->getParent(); + + switch(MI->getDesc().getOpcode()) { + default: + return false; + case Mips::RetRA: + ExpandRetRA(MBB, MI, Mips::RET); + break; + case Mips::RetRA16: + ExpandRetRA16(MBB, MI, Mips::JrRa16); + break; + case Mips::BuildPairF64: + ExpandBuildPairF64(MBB, MI); + break; + case Mips::ExtractElementF64: + ExpandExtractElementF64(MBB, MI); + break; + } + + MBB.erase(MI); + return true; +} + MachineInstr* MipsInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *MDPtr, @@ -278,9 +355,9 @@ unsigned Mips::GetOppositeBranchOpc(unsigned Opc) } } -static void AnalyzeCondBr(const MachineInstr* Inst, unsigned Opc, +static void AnalyzeCondBr(const MachineInstr *Inst, unsigned Opc, MachineBasicBlock *&BB, - SmallVectorImpl<MachineOperand>& Cond) { + SmallVectorImpl<MachineOperand> &Cond) { assert(GetAnalyzableBrOpc(Opc) && "Not an analyzable branch"); int NumOp = Inst->getNumExplicitOperands(); @@ -454,3 +531,58 @@ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const return false; } +/// Return the number of bytes of code the specified instruction may be. +unsigned MipsInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: + return MI->getDesc().getSize(); + case TargetOpcode::INLINEASM: { // Inline Asm: Variable size. + const MachineFunction *MF = MI->getParent()->getParent(); + const char *AsmStr = MI->getOperand(0).getSymbolName(); + return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo()); + } + } +} + +unsigned +llvm::Mips::loadImmediate(int64_t Imm, bool IsN64, const TargetInstrInfo &TII, + MachineBasicBlock& MBB, + MachineBasicBlock::iterator II, DebugLoc DL, + bool LastInstrIsADDiu, + MipsAnalyzeImmediate::Inst *LastInst) { + MipsAnalyzeImmediate AnalyzeImm; + unsigned Size = IsN64 ? 64 : 32; + unsigned LUi = IsN64 ? Mips::LUi64 : Mips::LUi; + unsigned ZEROReg = IsN64 ? Mips::ZERO_64 : Mips::ZERO; + unsigned ATReg = IsN64 ? Mips::AT_64 : Mips::AT; + + const MipsAnalyzeImmediate::InstSeq &Seq = + AnalyzeImm.Analyze(Imm, Size, LastInstrIsADDiu); + MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin(); + + if (LastInst && (Seq.size() == 1)) { + *LastInst = *Inst; + return 0; + } + + // The first instruction can be a LUi, which is different from other + // instructions (ADDiu, ORI and SLL) in that it does not have a register + // operand. 
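To make the asymmetry concrete, here is a minimal sketch (not part of the patch) of the sequence MipsAnalyzeImmediate produces for a simple 32-bit constant; the opcode spellings follow the names used elsewhere in this commit, and the emission code that follows is what consumes the sequence:

    // Sketch: decompose 0x12345678 for a 32-bit target.
    MipsAnalyzeImmediate AnalyzeImm;
    const MipsAnalyzeImmediate::InstSeq &Seq =
        AnalyzeImm.Analyze(0x12345678LL, /*Size=*/32, /*LastInstrIsADDiu=*/false);
    // Expected: Seq[0] == { Mips::LUi, 0x1234 }  -- no register source operand
    //           Seq[1] == { Mips::ORi, 0x5678 }  -- reads the register LUi wrote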
+ if (Inst->Opc == LUi) + BuildMI(MBB, II, DL, TII.get(LUi), ATReg) + .addImm(SignExtend64<16>(Inst->ImmOpnd)); + else + BuildMI(MBB, II, DL, TII.get(Inst->Opc), ATReg).addReg(ZEROReg) + .addImm(SignExtend64<16>(Inst->ImmOpnd)); + + // Build the remaining instructions in Seq. Skip the last instruction if + // LastInst is not 0. + for (++Inst; Inst != Seq.end() - !!LastInst; ++Inst) + BuildMI(MBB, II, DL, TII.get(Inst->Opc), ATReg).addReg(ATReg) + .addImm(SignExtend64<16>(Inst->ImmOpnd)); + + if (LastInst) + *LastInst = *Inst; + + return Seq.size() - !!LastInst; +} diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h index 4be727d..358f817 100644 --- a/lib/Target/Mips/MipsInstrInfo.h +++ b/lib/Target/Mips/MipsInstrInfo.h @@ -15,6 +15,7 @@ #define MIPSINSTRUCTIONINFO_H #include "Mips.h" +#include "MipsAnalyzeImmediate.h" #include "MipsRegisterInfo.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetInstrInfo.h" @@ -24,15 +25,9 @@ namespace llvm { -namespace Mips { - /// GetOppositeBranchOpc - Return the inverse of the specified - /// opcode, e.g. turning BEQ to BNE. - unsigned GetOppositeBranchOpc(unsigned Opc); -} - class MipsInstrInfo : public MipsGenInstrInfo { MipsTargetMachine &TM; - bool IsN64; + bool IsN64; bool InMips16Mode; const MipsRegisterInfo RI; unsigned UncondBrOpc; public: @@ -68,8 +63,17 @@ public: virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; private: + void ExpandRetRA(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned Opc) const; + void ExpandRetRA16(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned Opc) const; + void BuildCondBr(MachineBasicBlock &MBB, MachineBasicBlock *TBB, DebugLoc DL, const SmallVectorImpl<MachineOperand>& Cond) const; + void ExpandExtractElementF64(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + void ExpandBuildPairF64(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; public: virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, @@ -92,6 +96,8 @@ public: const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const; + virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const; + virtual MachineInstr* emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *MDPtr, @@ -103,8 +109,27 @@ public: /// Insert nop instruction when hazard condition is found virtual void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const; + + /// Return the number of bytes of code the specified instruction may be. + unsigned GetInstSizeInBytes(const MachineInstr *MI) const; }; +namespace Mips { + /// GetOppositeBranchOpc - Return the inverse of the specified + /// opcode, e.g. turning BEQ to BNE. + unsigned GetOppositeBranchOpc(unsigned Opc); + + /// Emit a series of instructions to load an immediate. All instructions + /// except for the last one are emitted. The function returns the number of + /// MachineInstrs generated. The opcode-immediate pair of the last + /// instruction is returned in LastInst, if it is not 0. 
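As a usage illustration for the helper declared just below (a hypothetical caller, not from the patch; it assumes TII, MBB, II, DL and Offset are already in scope, as in a frame-lowering pass, and that MipsAnalyzeImmediate::Inst is constructible from an opcode/immediate pair):

    // Materialize a large offset into AT, holding back the final ADDiu so the
    // caller can fold it into its own instruction.
    MipsAnalyzeImmediate::Inst LastInst(0, 0);
    unsigned Num = Mips::loadImmediate(Offset, /*IsN64=*/false, TII, MBB, II, DL,
                                       /*LastInstrIsADDiu=*/true, &LastInst);
    // Num instructions were inserted before II; LastInst now holds the final
    // opcode-immediate pair for the caller to emit itself.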
+ unsigned + loadImmediate(int64_t Imm, bool IsN64, const TargetInstrInfo &TII, + MachineBasicBlock& MBB, MachineBasicBlock::iterator II, + DebugLoc DL, bool LastInstrIsADDiu, + MipsAnalyzeImmediate::Inst *LastInst); +} + } #endif diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index 873d2bd..f1aada4 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -11,17 +11,11 @@ // //===----------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// -// Instruction format superclass -//===----------------------------------------------------------------------===// - -include "MipsInstrFormats.td" //===----------------------------------------------------------------------===// // Mips profiles and nodes //===----------------------------------------------------------------------===// -def SDT_MipsRet : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_MipsJmpLink : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>; def SDT_MipsCMov : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, @@ -49,6 +43,10 @@ def SDT_Ins : SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisVT<2, i32>, SDTCisSameAs<2, 3>, SDTCisSameAs<0, 4>]>; +def SDTMipsLoadLR : SDTypeProfile<1, 2, + [SDTCisInt<0>, SDTCisPtrTy<1>, + SDTCisSameAs<0, 2>]>; + // Call def MipsJmpLink : SDNode<"MipsISD::JmpLink",SDT_MipsJmpLink, [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, @@ -72,8 +70,7 @@ def MipsTprelLo : SDNode<"MipsISD::TprelLo", SDTIntUnaryOp>; def MipsThreadPointer: SDNode<"MipsISD::ThreadPointer", SDT_MipsThreadPointer>; // Return -def MipsRet : SDNode<"MipsISD::Ret", SDT_MipsRet, [SDNPHasChain, - SDNPOptInGlue]>; +def MipsRet : SDNode<"MipsISD::Ret", SDTNone, [SDNPHasChain, SDNPOptInGlue]>; // These are target-independent nodes, but have target-specific formats. def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_MipsCallSeqStart, @@ -118,6 +115,23 @@ def MipsSync : SDNode<"MipsISD::Sync", SDT_Sync, [SDNPHasChain]>; def MipsExt : SDNode<"MipsISD::Ext", SDT_Ext>; def MipsIns : SDNode<"MipsISD::Ins", SDT_Ins>; +def MipsLWL : SDNode<"MipsISD::LWL", SDTMipsLoadLR, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def MipsLWR : SDNode<"MipsISD::LWR", SDTMipsLoadLR, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def MipsSWL : SDNode<"MipsISD::SWL", SDTStore, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def MipsSWR : SDNode<"MipsISD::SWR", SDTStore, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def MipsLDL : SDNode<"MipsISD::LDL", SDTMipsLoadLR, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def MipsLDR : SDNode<"MipsISD::LDR", SDTMipsLoadLR, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def MipsSDL : SDNode<"MipsISD::SDL", SDTStore, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def MipsSDR : SDNode<"MipsISD::SDR", SDTStore, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + //===----------------------------------------------------------------------===// // Mips Instruction Predicate Definitions. 
//===----------------------------------------------------------------------===// @@ -145,12 +159,26 @@ def IsN64 : Predicate<"Subtarget.isABI_N64()">, AssemblerPredicate<"FeatureN64">; def NotN64 : Predicate<"!Subtarget.isABI_N64()">, AssemblerPredicate<"!FeatureN64">; +def InMips16Mode : Predicate<"Subtarget.inMips16Mode()">, + AssemblerPredicate<"FeatureMips16">; def RelocStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">, AssemblerPredicate<"FeatureMips32">; def RelocPIC : Predicate<"TM.getRelocationModel() == Reloc::PIC_">, AssemblerPredicate<"FeatureMips32">; def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">, AssemblerPredicate<"FeatureMips32">; +def HasStandardEncoding : Predicate<"Subtarget.hasStandardEncoding()">, + AssemblerPredicate<"!FeatureMips16">; + +class MipsPat<dag pattern, dag result> : Pat<pattern, result> { + let Predicates = [HasStandardEncoding]; +} + +//===----------------------------------------------------------------------===// +// Instruction format superclass +//===----------------------------------------------------------------------===// + +include "MipsInstrFormats.td" //===----------------------------------------------------------------------===// // Mips Operand, Complex Patterns and Transformations Definitions. @@ -190,6 +218,7 @@ def mem : Operand<i32> { def mem64 : Operand<i64> { let PrintMethod = "printMemOperand"; let MIOperandInfo = (ops CPU64Regs, simm16_64); + let EncoderMethod = "getMemEncoding"; } def mem_ea : Operand<i32> { @@ -252,7 +281,8 @@ def immZExt5 : ImmLeaf<i32, [{return Imm == (Imm & 0x1f);}]>; // Mips Address Mode! SDNode frameindex could possibly be a match // since load and store instructions from stack used it. -def addr : ComplexPattern<iPTR, 2, "SelectAddr", [frameindex], [SDNPWantParent]>; +def addr : + ComplexPattern<iPTR, 2, "SelectAddr", [frameindex], [SDNPWantParent]>; //===----------------------------------------------------------------------===// // Pattern fragment for load/store @@ -418,21 +448,13 @@ class StoreM<bits<6> op, string instr_asm, PatFrag OpNode, RegisterClass RC, let isPseudo = Pseudo; } -// Unaligned Memory Load/Store -let canFoldAsLoad = 1 in -class LoadUnAlign<bits<6> op, RegisterClass RC, Operand MemOpnd>: - FMem<op, (outs RC:$rt), (ins MemOpnd:$addr), "", [], IILoad> {} - -class StoreUnAlign<bits<6> op, RegisterClass RC, Operand MemOpnd>: - FMem<op, (outs), (ins RC:$rt, MemOpnd:$addr), "", [], IIStore> {} - // 32-bit load. multiclass LoadM32<bits<6> op, string instr_asm, PatFrag OpNode, bit Pseudo = 0> { def #NAME# : LoadM<op, instr_asm, OpNode, CPURegs, mem, Pseudo>, - Requires<[NotN64]>; + Requires<[NotN64, HasStandardEncoding]>; def _P8 : LoadM<op, instr_asm, OpNode, CPURegs, mem64, Pseudo>, - Requires<[IsN64]> { + Requires<[IsN64, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; let isCodeGenOnly = 1; } @@ -442,31 +464,21 @@ multiclass LoadM64<bits<6> op, string instr_asm, PatFrag OpNode, bit Pseudo = 0> { def #NAME# : LoadM<op, instr_asm, OpNode, CPU64Regs, mem, Pseudo>, - Requires<[NotN64]>; + Requires<[NotN64, HasStandardEncoding]>; def _P8 : LoadM<op, instr_asm, OpNode, CPU64Regs, mem64, Pseudo>, - Requires<[IsN64]> { + Requires<[IsN64, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; let isCodeGenOnly = 1; } } -// 32-bit load.
-multiclass LoadUnAlign32<bits<6> op> { - def #NAME# : LoadUnAlign<op, CPURegs, mem>, - Requires<[NotN64]>; - def _P8 : LoadUnAlign<op, CPURegs, mem64>, - Requires<[IsN64]> { - let DecoderNamespace = "Mips64"; - let isCodeGenOnly = 1; - } -} // 32-bit store. multiclass StoreM32<bits<6> op, string instr_asm, PatFrag OpNode, bit Pseudo = 0> { def #NAME# : StoreM<op, instr_asm, OpNode, CPURegs, mem, Pseudo>, - Requires<[NotN64]>; + Requires<[NotN64, HasStandardEncoding]>; def _P8 : StoreM<op, instr_asm, OpNode, CPURegs, mem64, Pseudo>, - Requires<[IsN64]> { + Requires<[IsN64, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; let isCodeGenOnly = 1; } @@ -476,20 +488,69 @@ multiclass StoreM32<bits<6> op, string instr_asm, PatFrag OpNode, multiclass StoreM64<bits<6> op, string instr_asm, PatFrag OpNode, bit Pseudo = 0> { def #NAME# : StoreM<op, instr_asm, OpNode, CPU64Regs, mem, Pseudo>, - Requires<[NotN64]>; + Requires<[NotN64, HasStandardEncoding]>; def _P8 : StoreM<op, instr_asm, OpNode, CPU64Regs, mem64, Pseudo>, - Requires<[IsN64]> { + Requires<[IsN64, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; let isCodeGenOnly = 1; } } -// 32-bit store. -multiclass StoreUnAlign32<bits<6> op> { - def #NAME# : StoreUnAlign<op, CPURegs, mem>, - Requires<[NotN64]>; - def _P8 : StoreUnAlign<op, CPURegs, mem64>, - Requires<[IsN64]> { +// Load/Store Left/Right +let canFoldAsLoad = 1 in +class LoadLeftRight<bits<6> op, string instr_asm, SDNode OpNode, + RegisterClass RC, Operand MemOpnd> : + FMem<op, (outs RC:$rt), (ins MemOpnd:$addr, RC:$src), + !strconcat(instr_asm, "\t$rt, $addr"), + [(set RC:$rt, (OpNode addr:$addr, RC:$src))], IILoad> { + string Constraints = "$src = $rt"; +} + +class StoreLeftRight<bits<6> op, string instr_asm, SDNode OpNode, + RegisterClass RC, Operand MemOpnd>: + FMem<op, (outs), (ins RC:$rt, MemOpnd:$addr), + !strconcat(instr_asm, "\t$rt, $addr"), [(OpNode RC:$rt, addr:$addr)], + IIStore>; + +// 32-bit load left/right. +multiclass LoadLeftRightM32<bits<6> op, string instr_asm, SDNode OpNode> { + def #NAME# : LoadLeftRight<op, instr_asm, OpNode, CPURegs, mem>, + Requires<[NotN64, HasStandardEncoding]>; + def _P8 : LoadLeftRight<op, instr_asm, OpNode, CPURegs, mem64>, + Requires<[IsN64, HasStandardEncoding]> { + let DecoderNamespace = "Mips64"; + let isCodeGenOnly = 1; + } +} + +// 64-bit load left/right. +multiclass LoadLeftRightM64<bits<6> op, string instr_asm, SDNode OpNode> { + def #NAME# : LoadLeftRight<op, instr_asm, OpNode, CPU64Regs, mem>, + Requires<[NotN64, HasStandardEncoding]>; + def _P8 : LoadLeftRight<op, instr_asm, OpNode, CPU64Regs, mem64>, + Requires<[IsN64, HasStandardEncoding]> { + let DecoderNamespace = "Mips64"; + let isCodeGenOnly = 1; + } +} + +// 32-bit store left/right. +multiclass StoreLeftRightM32<bits<6> op, string instr_asm, SDNode OpNode> { + def #NAME# : StoreLeftRight<op, instr_asm, OpNode, CPURegs, mem>, + Requires<[NotN64, HasStandardEncoding]>; + def _P8 : StoreLeftRight<op, instr_asm, OpNode, CPURegs, mem64>, + Requires<[IsN64, HasStandardEncoding]> { + let DecoderNamespace = "Mips64"; + let isCodeGenOnly = 1; + } +} + +// 64-bit store left/right. 
+multiclass StoreLeftRightM64<bits<6> op, string instr_asm, SDNode OpNode> { + def #NAME# : StoreLeftRight<op, instr_asm, OpNode, CPU64Regs, mem>, + Requires<[NotN64, HasStandardEncoding]>; + def _P8 : StoreLeftRight<op, instr_asm, OpNode, CPU64Regs, mem64>, + Requires<[IsN64, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; let isCodeGenOnly = 1; } @@ -503,6 +564,7 @@ class CBranch<bits<6> op, string instr_asm, PatFrag cond_op, RegisterClass RC>: let isBranch = 1; let isTerminator = 1; let hasDelaySlot = 1; + let Defs = [AT]; } class CBranchZero<bits<6> op, bits<5> _rt, string instr_asm, PatFrag cond_op, @@ -514,6 +576,7 @@ class CBranchZero<bits<6> op, bits<5> _rt, string instr_asm, PatFrag cond_op, let isBranch = 1; let isTerminator = 1; let hasDelaySlot = 1; + let Defs = [AT]; } // SetCC @@ -541,8 +604,9 @@ class JumpFJ<bits<6> op, string instr_asm>: let isTerminator=1; let isBarrier=1; let hasDelaySlot = 1; - let Predicates = [RelocStatic]; + let Predicates = [RelocStatic, HasStandardEncoding]; let DecoderMethod = "DecodeJumpTarget"; + let Defs = [AT]; } // Unconditional branch @@ -555,23 +619,37 @@ class UncondBranch<bits<6> op, string instr_asm>: let isTerminator = 1; let isBarrier = 1; let hasDelaySlot = 1; - let Predicates = [RelocPIC]; + let Predicates = [RelocPIC, HasStandardEncoding]; + let Defs = [AT]; } -let isBranch=1, isTerminator=1, isBarrier=1, rd=0, hasDelaySlot = 1, - isIndirectBranch = 1 in -class JumpFR<bits<6> op, bits<6> func, string instr_asm, RegisterClass RC>: - FR<op, func, (outs), (ins RC:$rs), - !strconcat(instr_asm, "\t$rs"), [(brind RC:$rs)], IIBranch> { +// Base class for indirect branch and return instruction classes. +let isTerminator=1, isBarrier=1, hasDelaySlot = 1 in +class JumpFR<RegisterClass RC, list<dag> pattern>: + FR<0, 0x8, (outs), (ins RC:$rs), "jr\t$rs", pattern, IIBranch> { let rt = 0; let rd = 0; let shamt = 0; } +// Indirect branch +class IndirectBranch<RegisterClass RC>: JumpFR<RC, [(brind RC:$rs)]> { + let isBranch = 1; + let isIndirectBranch = 1; +} + +// Return instruction +class RetBase<RegisterClass RC>: JumpFR<RC, []> { + let isReturn = 1; + let isCodeGenOnly = 1; + let hasCtrlDep = 1; + let hasExtraSrcRegAllocReq = 1; +} + // Jump and Link (Call) -let isCall=1, hasDelaySlot=1 in { +let isCall=1, hasDelaySlot=1, Defs = [RA] in { class JumpLink<bits<6> op, string instr_asm>: - FJ<op, (outs), (ins calltarget:$target, variable_ops), + FJ<op, (outs), (ins calltarget:$target), !strconcat(instr_asm, "\t$target"), [(MipsJmpLink imm:$target)], IIBranch> { let DecoderMethod = "DecodeJumpTarget"; @@ -579,7 +657,7 @@ let isCall=1, hasDelaySlot=1 in { class JumpLinkReg<bits<6> op, bits<6> func, string instr_asm, RegisterClass RC>: - FR<op, func, (outs), (ins RC:$rs, variable_ops), + FR<op, func, (outs), (ins RC:$rs), !strconcat(instr_asm, "\t$rs"), [(MipsJmpLink RC:$rs)], IIBranch> { let rt = 0; let rd = 31; @@ -587,7 +665,7 @@ let isCall=1, hasDelaySlot=1 in { } class BranchLink<string instr_asm, bits<5> _rt, RegisterClass RC>: - FI<0x1, (outs), (ins RC:$rs, brtarget:$imm16, variable_ops), + FI<0x1, (outs), (ins RC:$rs, brtarget:$imm16), !strconcat(instr_asm, "\t$rs, $imm16"), [], IIBranch> { let rt = _rt; } @@ -653,7 +731,7 @@ class CountLeading0<bits<6> func, string instr_asm, RegisterClass RC>: FR<0x1c, func, (outs RC:$rd), (ins RC:$rs), !strconcat(instr_asm, "\t$rd, $rs"), [(set RC:$rd, (ctlz RC:$rs))], IIAlu>, - Requires<[HasBitCount]> { + Requires<[HasBitCount, HasStandardEncoding]> { let shamt = 0; let rt = rd; } @@ -662,7 +740,7 
@@ class CountLeading1<bits<6> func, string instr_asm, RegisterClass RC>: FR<0x1c, func, (outs RC:$rd), (ins RC:$rs), !strconcat(instr_asm, "\t$rd, $rs"), [(set RC:$rd, (ctlz (not RC:$rs)))], IIAlu>, - Requires<[HasBitCount]> { + Requires<[HasBitCount, HasStandardEncoding]> { let shamt = 0; let rt = rd; } @@ -675,7 +753,7 @@ class SignExtInReg<bits<5> sa, string instr_asm, ValueType vt, [(set RC:$rd, (sext_inreg RC:$rt, vt))], NoItinerary> { let rs = 0; let shamt = sa; - let Predicates = [HasSEInReg]; + let Predicates = [HasSEInReg, HasStandardEncoding]; } // Subword Swap @@ -684,7 +762,7 @@ class SubwordSwap<bits<6> func, bits<5> sa, string instr_asm, RegisterClass RC>: !strconcat(instr_asm, "\t$rd, $rt"), [], NoItinerary> { let rs = 0; let shamt = sa; - let Predicates = [HasSwap]; + let Predicates = [HasSwap, HasStandardEncoding]; let neverHasSideEffects = 1; } @@ -705,7 +783,7 @@ class ExtBase<bits<6> _funct, string instr_asm, RegisterClass RC>: bits<5> sz; let rd = sz; let shamt = pos; - let Predicates = [HasMips32r2]; + let Predicates = [HasMips32r2, HasStandardEncoding]; } class InsBase<bits<6> _funct, string instr_asm, RegisterClass RC>: @@ -718,7 +796,7 @@ class InsBase<bits<6> _funct, string instr_asm, RegisterClass RC>: bits<5> sz; let rd = sz; let shamt = pos; - let Predicates = [HasMips32r2]; + let Predicates = [HasMips32r2, HasStandardEncoding]; let Constraints = "$src = $rt"; } @@ -730,8 +808,10 @@ class Atomic2Ops<PatFrag Op, string Opstr, RegisterClass DRC, [(set DRC:$dst, (Op PRC:$ptr, DRC:$incr))]>; multiclass Atomic2Ops32<PatFrag Op, string Opstr> { - def #NAME# : Atomic2Ops<Op, Opstr, CPURegs, CPURegs>, Requires<[NotN64]>; - def _P8 : Atomic2Ops<Op, Opstr, CPURegs, CPU64Regs>, Requires<[IsN64]> { + def #NAME# : Atomic2Ops<Op, Opstr, CPURegs, CPURegs>, + Requires<[NotN64, HasStandardEncoding]>; + def _P8 : Atomic2Ops<Op, Opstr, CPURegs, CPU64Regs>, + Requires<[IsN64, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; } } @@ -744,8 +824,10 @@ class AtomicCmpSwap<PatFrag Op, string Width, RegisterClass DRC, [(set DRC:$dst, (Op PRC:$ptr, DRC:$cmp, DRC:$swap))]>; multiclass AtomicCmpSwap32<PatFrag Op, string Width> { - def #NAME# : AtomicCmpSwap<Op, Width, CPURegs, CPURegs>, Requires<[NotN64]>; - def _P8 : AtomicCmpSwap<Op, Width, CPURegs, CPU64Regs>, Requires<[IsN64]> { + def #NAME# : AtomicCmpSwap<Op, Width, CPURegs, CPURegs>, + Requires<[NotN64, HasStandardEncoding]>; + def _P8 : AtomicCmpSwap<Op, Width, CPURegs, CPU64Regs>, + Requires<[IsN64, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; } } @@ -767,6 +849,10 @@ class SCBase<bits<6> Opc, string opstring, RegisterClass RC, Operand Mem> : // Pseudo instructions //===----------------------------------------------------------------------===// +// Return RA. +let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, hasCtrlDep=1 in +def RetRA : MipsPseudo<(outs), (ins), "", [(MipsRet)]>; + // As stack alignment is always done with addiu, we need a 16-bit immediate let Defs = [SP], Uses = [SP] in { def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins uimm16:$amt), @@ -785,29 +871,6 @@ let neverHasSideEffects = 1 in def CPRESTORE : MipsPseudo<(outs), (ins i32imm:$loc, CPURegs:$gp), ".cprestore\t$loc", []>; -// For O32 ABI & PIC & non-fixed global base register, the following instruction -// sequence is emitted to set the global base register: -// -// 0. lui $2, %hi(_gp_disp) -// 1. addiu $2, $2, %lo(_gp_disp) -// 2.
addu $globalbasereg, $2, $t9 -// -// SETGP01 is emitted during Prologue/Epilogue insertion and then converted to -// instructions 0 and 1 in the sequence above during MC lowering. -// SETGP2 is emitted just before register allocation and converted to -// instruction 2 just prior to post-RA scheduling. -// -// These pseudo instructions are needed to ensure no instructions are inserted -// before or between instructions 0 and 1, which is a limitation imposed by -// GNU linker. - -let isTerminator = 1, isBarrier = 1 in -def SETGP01 : MipsPseudo<(outs CPURegs:$dst), (ins), "", []>; - -let neverHasSideEffects = 1 in -def SETGP2 : MipsPseudo<(outs CPURegs:$globalreg), (ins CPURegs:$picreg), "", - []>; - let usesCustomInserter = 1 in { defm ATOMIC_LOAD_ADD_I8 : Atomic2Ops32<atomic_load_add_8, "load_add_8">; defm ATOMIC_LOAD_ADD_I16 : Atomic2Ops32<atomic_load_add_16, "load_add_16">; @@ -876,7 +939,7 @@ def SRLV : shift_rotate_reg<0x06, 0x00, "srlv", srl, CPURegs>; def SRAV : shift_rotate_reg<0x07, 0x00, "srav", sra, CPURegs>; // Rotate Instructions -let Predicates = [HasMips32r2] in { +let Predicates = [HasMips32r2, HasStandardEncoding] in { def ROTR : shift_rotate_imm32<0x02, 0x01, "rotr", rotr>; def ROTRV : shift_rotate_reg<0x06, 0x01, "rotrv", rotr, CPURegs>; } @@ -899,11 +962,11 @@ defm ULW : LoadM32<0x23, "ulw", load_u, 1>; defm USH : StoreM32<0x29, "ush", truncstorei16_u, 1>; defm USW : StoreM32<0x2b, "usw", store_u, 1>; -/// Primitives for unaligned -defm LWL : LoadUnAlign32<0x22>; -defm LWR : LoadUnAlign32<0x26>; -defm SWL : StoreUnAlign32<0x2A>; -defm SWR : StoreUnAlign32<0x2E>; +/// load/store left/right +defm LWL : LoadLeftRightM32<0x22, "lwl", MipsLWL>; +defm LWR : LoadLeftRightM32<0x26, "lwr", MipsLWR>; +defm SWL : StoreLeftRightM32<0x2a, "swl", MipsSWL>; +defm SWR : StoreLeftRightM32<0x2e, "swr", MipsSWR>; let hasSideEffects = 1 in def SYNC : MipsInst<(outs), (ins i32imm:$stype), "sync $stype", @@ -917,19 +980,23 @@ def SYNC : MipsInst<(outs), (ins i32imm:$stype), "sync $stype", } /// Load-linked, Store-conditional -def LL : LLBase<0x30, "ll", CPURegs, mem>, Requires<[NotN64]>; -def LL_P8 : LLBase<0x30, "ll", CPURegs, mem64>, Requires<[IsN64]> { +def LL : LLBase<0x30, "ll", CPURegs, mem>, + Requires<[NotN64, HasStandardEncoding]>; +def LL_P8 : LLBase<0x30, "ll", CPURegs, mem64>, + Requires<[IsN64, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; } -def SC : SCBase<0x38, "sc", CPURegs, mem>, Requires<[NotN64]>; -def SC_P8 : SCBase<0x38, "sc", CPURegs, mem64>, Requires<[IsN64]> { +def SC : SCBase<0x38, "sc", CPURegs, mem>, + Requires<[NotN64, HasStandardEncoding]>; +def SC_P8 : SCBase<0x38, "sc", CPURegs, mem64>, + Requires<[IsN64, HasStandardEncoding]> { let DecoderNamespace = "Mips64"; } /// Jump and Branch Instructions def J : JumpFJ<0x02, "j">; -def JR : JumpFR<0x00, 0x08, "jr", CPURegs>; +def JR : IndirectBranch<CPURegs>; def B : UncondBranch<0x04, "b">; def BEQ : CBranch<0x04, "beq", seteq, CPURegs>; def BNE : CBranch<0x05, "bne", setne, CPURegs>; @@ -938,15 +1005,16 @@ def BGTZ : CBranchZero<0x07, 0, "bgtz", setgt, CPURegs>; def BLEZ : CBranchZero<0x06, 0, "blez", setle, CPURegs>; def BLTZ : CBranchZero<0x01, 0, "bltz", setlt, CPURegs>; +let rt = 0, rs = 0, isBranch = 1, isTerminator = 1, isBarrier = 1, + hasDelaySlot = 1, Defs = [RA] in +def BAL_BR: FI<0x1, (outs), (ins brtarget:$imm16), "bal\t$imm16", [], IIBranch>; + def JAL : JumpLink<0x03, "jal">; def JALR : JumpLinkReg<0x00, 0x09, "jalr", CPURegs>; def BGEZAL : BranchLink<"bgezal", 0x11, CPURegs>; def BLTZAL : 
BranchLink<"bltzal", 0x10, CPURegs>; -let isReturn=1, isTerminator=1, hasDelaySlot=1, isCodeGenOnly=1, - isBarrier=1, hasCtrlDep=1, rd=0, rt=0, shamt=0 in - def RET : FR <0x00, 0x08, (outs), (ins CPURegs:$target), - "jr\t$target", [(MipsRet CPURegs:$target)], IIBranch>; +def RET : RetBase<CPURegs>; /// Multiply and Divide Instructions. def MULT : Mult32<0x18, "mult", IIImul>; @@ -999,7 +1067,7 @@ def MSUBU : MArithR<5, "msubu", MipsMSubu>; // MUL is a assembly macro in the current used ISAs. In recent ISA's // it is a real instruction. def MUL : ArithLogicR<0x1c, 0x02, "mul", mul, IIImul, CPURegs, 1>, - Requires<[HasMips32]>; + Requires<[HasMips32, HasStandardEncoding]>; def RDHWR : ReadHardware<CPURegs, HWRegs>; @@ -1011,67 +1079,67 @@ def INS : InsBase<4, "ins", CPURegs>; //===----------------------------------------------------------------------===// // Small immediates -def : Pat<(i32 immSExt16:$in), - (ADDiu ZERO, imm:$in)>; -def : Pat<(i32 immZExt16:$in), - (ORi ZERO, imm:$in)>; -def : Pat<(i32 immLow16Zero:$in), - (LUi (HI16 imm:$in))>; +def : MipsPat<(i32 immSExt16:$in), + (ADDiu ZERO, imm:$in)>; +def : MipsPat<(i32 immZExt16:$in), + (ORi ZERO, imm:$in)>; +def : MipsPat<(i32 immLow16Zero:$in), + (LUi (HI16 imm:$in))>; // Arbitrary immediates -def : Pat<(i32 imm:$imm), +def : MipsPat<(i32 imm:$imm), (ORi (LUi (HI16 imm:$imm)), (LO16 imm:$imm))>; -// Carry patterns -def : Pat<(subc CPURegs:$lhs, CPURegs:$rhs), - (SUBu CPURegs:$lhs, CPURegs:$rhs)>; -def : Pat<(addc CPURegs:$lhs, CPURegs:$rhs), - (ADDu CPURegs:$lhs, CPURegs:$rhs)>; -def : Pat<(addc CPURegs:$src, immSExt16:$imm), - (ADDiu CPURegs:$src, imm:$imm)>; +// Carry MipsPatterns +def : MipsPat<(subc CPURegs:$lhs, CPURegs:$rhs), + (SUBu CPURegs:$lhs, CPURegs:$rhs)>; +def : MipsPat<(addc CPURegs:$lhs, CPURegs:$rhs), + (ADDu CPURegs:$lhs, CPURegs:$rhs)>; +def : MipsPat<(addc CPURegs:$src, immSExt16:$imm), + (ADDiu CPURegs:$src, imm:$imm)>; // Call -def : Pat<(MipsJmpLink (i32 tglobaladdr:$dst)), - (JAL tglobaladdr:$dst)>; -def : Pat<(MipsJmpLink (i32 texternalsym:$dst)), - (JAL texternalsym:$dst)>; -//def : Pat<(MipsJmpLink CPURegs:$dst), -// (JALR CPURegs:$dst)>; +def : MipsPat<(MipsJmpLink (i32 tglobaladdr:$dst)), + (JAL tglobaladdr:$dst)>; +def : MipsPat<(MipsJmpLink (i32 texternalsym:$dst)), + (JAL texternalsym:$dst)>; +//def : MipsPat<(MipsJmpLink CPURegs:$dst), +// (JALR CPURegs:$dst)>; // hi/lo relocs -def : Pat<(MipsHi tglobaladdr:$in), (LUi tglobaladdr:$in)>; -def : Pat<(MipsHi tblockaddress:$in), (LUi tblockaddress:$in)>; -def : Pat<(MipsHi tjumptable:$in), (LUi tjumptable:$in)>; -def : Pat<(MipsHi tconstpool:$in), (LUi tconstpool:$in)>; -def : Pat<(MipsHi tglobaltlsaddr:$in), (LUi tglobaltlsaddr:$in)>; - -def : Pat<(MipsLo tglobaladdr:$in), (ADDiu ZERO, tglobaladdr:$in)>; -def : Pat<(MipsLo tblockaddress:$in), (ADDiu ZERO, tblockaddress:$in)>; -def : Pat<(MipsLo tjumptable:$in), (ADDiu ZERO, tjumptable:$in)>; -def : Pat<(MipsLo tconstpool:$in), (ADDiu ZERO, tconstpool:$in)>; -def : Pat<(MipsLo tglobaltlsaddr:$in), (ADDiu ZERO, tglobaltlsaddr:$in)>; - -def : Pat<(add CPURegs:$hi, (MipsLo tglobaladdr:$lo)), - (ADDiu CPURegs:$hi, tglobaladdr:$lo)>; -def : Pat<(add CPURegs:$hi, (MipsLo tblockaddress:$lo)), - (ADDiu CPURegs:$hi, tblockaddress:$lo)>; -def : Pat<(add CPURegs:$hi, (MipsLo tjumptable:$lo)), - (ADDiu CPURegs:$hi, tjumptable:$lo)>; -def : Pat<(add CPURegs:$hi, (MipsLo tconstpool:$lo)), - (ADDiu CPURegs:$hi, tconstpool:$lo)>; -def : Pat<(add CPURegs:$hi, (MipsLo tglobaltlsaddr:$lo)), - (ADDiu CPURegs:$hi, 
tglobaltlsaddr:$lo)>; +def : MipsPat<(MipsHi tglobaladdr:$in), (LUi tglobaladdr:$in)>; +def : MipsPat<(MipsHi tblockaddress:$in), (LUi tblockaddress:$in)>; +def : MipsPat<(MipsHi tjumptable:$in), (LUi tjumptable:$in)>; +def : MipsPat<(MipsHi tconstpool:$in), (LUi tconstpool:$in)>; +def : MipsPat<(MipsHi tglobaltlsaddr:$in), (LUi tglobaltlsaddr:$in)>; + +def : MipsPat<(MipsLo tglobaladdr:$in), (ADDiu ZERO, tglobaladdr:$in)>; +def : MipsPat<(MipsLo tblockaddress:$in), (ADDiu ZERO, tblockaddress:$in)>; +def : MipsPat<(MipsLo tjumptable:$in), (ADDiu ZERO, tjumptable:$in)>; +def : MipsPat<(MipsLo tconstpool:$in), (ADDiu ZERO, tconstpool:$in)>; +def : MipsPat<(MipsLo tglobaltlsaddr:$in), (ADDiu ZERO, tglobaltlsaddr:$in)>; + +def : MipsPat<(add CPURegs:$hi, (MipsLo tglobaladdr:$lo)), + (ADDiu CPURegs:$hi, tglobaladdr:$lo)>; +def : MipsPat<(add CPURegs:$hi, (MipsLo tblockaddress:$lo)), + (ADDiu CPURegs:$hi, tblockaddress:$lo)>; +def : MipsPat<(add CPURegs:$hi, (MipsLo tjumptable:$lo)), + (ADDiu CPURegs:$hi, tjumptable:$lo)>; +def : MipsPat<(add CPURegs:$hi, (MipsLo tconstpool:$lo)), + (ADDiu CPURegs:$hi, tconstpool:$lo)>; +def : MipsPat<(add CPURegs:$hi, (MipsLo tglobaltlsaddr:$lo)), + (ADDiu CPURegs:$hi, tglobaltlsaddr:$lo)>; // gp_rel relocs -def : Pat<(add CPURegs:$gp, (MipsGPRel tglobaladdr:$in)), - (ADDiu CPURegs:$gp, tglobaladdr:$in)>; -def : Pat<(add CPURegs:$gp, (MipsGPRel tconstpool:$in)), - (ADDiu CPURegs:$gp, tconstpool:$in)>; +def : MipsPat<(add CPURegs:$gp, (MipsGPRel tglobaladdr:$in)), + (ADDiu CPURegs:$gp, tglobaladdr:$in)>; +def : MipsPat<(add CPURegs:$gp, (MipsGPRel tconstpool:$in)), + (ADDiu CPURegs:$gp, tconstpool:$in)>; // wrapper_pic class WrapperPat<SDNode node, Instruction ADDiuOp, RegisterClass RC>: - Pat<(MipsWrapper RC:$gp, node:$in), - (ADDiuOp RC:$gp, node:$in)>; + MipsPat<(MipsWrapper RC:$gp, node:$in), + (ADDiuOp RC:$gp, node:$in)>; def : WrapperPat<tglobaladdr, ADDiu, CPURegs>; def : WrapperPat<tconstpool, ADDiu, CPURegs>; @@ -1081,58 +1149,58 @@ def : WrapperPat<tjumptable, ADDiu, CPURegs>; def : WrapperPat<tglobaltlsaddr, ADDiu, CPURegs>; // Mips does not have "not", so we expand our way -def : Pat<(not CPURegs:$in), - (NOR CPURegs:$in, ZERO)>; +def : MipsPat<(not CPURegs:$in), + (NOR CPURegs:$in, ZERO)>; // extended loads -let Predicates = [NotN64] in { - def : Pat<(i32 (extloadi1 addr:$src)), (LBu addr:$src)>; - def : Pat<(i32 (extloadi8 addr:$src)), (LBu addr:$src)>; - def : Pat<(i32 (extloadi16_a addr:$src)), (LHu addr:$src)>; - def : Pat<(i32 (extloadi16_u addr:$src)), (ULHu addr:$src)>; +let Predicates = [NotN64, HasStandardEncoding] in { + def : MipsPat<(i32 (extloadi1 addr:$src)), (LBu addr:$src)>; + def : MipsPat<(i32 (extloadi8 addr:$src)), (LBu addr:$src)>; + def : MipsPat<(i32 (extloadi16_a addr:$src)), (LHu addr:$src)>; + def : MipsPat<(i32 (extloadi16_u addr:$src)), (ULHu addr:$src)>; } -let Predicates = [IsN64] in { - def : Pat<(i32 (extloadi1 addr:$src)), (LBu_P8 addr:$src)>; - def : Pat<(i32 (extloadi8 addr:$src)), (LBu_P8 addr:$src)>; - def : Pat<(i32 (extloadi16_a addr:$src)), (LHu_P8 addr:$src)>; - def : Pat<(i32 (extloadi16_u addr:$src)), (ULHu_P8 addr:$src)>; +let Predicates = [IsN64, HasStandardEncoding] in { + def : MipsPat<(i32 (extloadi1 addr:$src)), (LBu_P8 addr:$src)>; + def : MipsPat<(i32 (extloadi8 addr:$src)), (LBu_P8 addr:$src)>; + def : MipsPat<(i32 (extloadi16_a addr:$src)), (LHu_P8 addr:$src)>; + def : MipsPat<(i32 (extloadi16_u addr:$src)), (ULHu_P8 addr:$src)>; } // peepholes -let Predicates = [NotN64] in { - def : Pat<(store_a 
(i32 0), addr:$dst), (SW ZERO, addr:$dst)>; - def : Pat<(store_u (i32 0), addr:$dst), (USW ZERO, addr:$dst)>; +let Predicates = [NotN64, HasStandardEncoding] in { + def : MipsPat<(store_a (i32 0), addr:$dst), (SW ZERO, addr:$dst)>; + def : MipsPat<(store_u (i32 0), addr:$dst), (USW ZERO, addr:$dst)>; } -let Predicates = [IsN64] in { - def : Pat<(store_a (i32 0), addr:$dst), (SW_P8 ZERO, addr:$dst)>; - def : Pat<(store_u (i32 0), addr:$dst), (USW_P8 ZERO, addr:$dst)>; +let Predicates = [IsN64, HasStandardEncoding] in { + def : MipsPat<(store_a (i32 0), addr:$dst), (SW_P8 ZERO, addr:$dst)>; + def : MipsPat<(store_u (i32 0), addr:$dst), (USW_P8 ZERO, addr:$dst)>; } // brcond patterns multiclass BrcondPats<RegisterClass RC, Instruction BEQOp, Instruction BNEOp, Instruction SLTOp, Instruction SLTuOp, Instruction SLTiOp, Instruction SLTiuOp, Register ZEROReg> { -def : Pat<(brcond (i32 (setne RC:$lhs, 0)), bb:$dst), - (BNEOp RC:$lhs, ZEROReg, bb:$dst)>; -def : Pat<(brcond (i32 (seteq RC:$lhs, 0)), bb:$dst), - (BEQOp RC:$lhs, ZEROReg, bb:$dst)>; +def : MipsPat<(brcond (i32 (setne RC:$lhs, 0)), bb:$dst), + (BNEOp RC:$lhs, ZEROReg, bb:$dst)>; +def : MipsPat<(brcond (i32 (seteq RC:$lhs, 0)), bb:$dst), + (BEQOp RC:$lhs, ZEROReg, bb:$dst)>; -def : Pat<(brcond (i32 (setge RC:$lhs, RC:$rhs)), bb:$dst), - (BEQ (SLTOp RC:$lhs, RC:$rhs), ZERO, bb:$dst)>; -def : Pat<(brcond (i32 (setuge RC:$lhs, RC:$rhs)), bb:$dst), - (BEQ (SLTuOp RC:$lhs, RC:$rhs), ZERO, bb:$dst)>; -def : Pat<(brcond (i32 (setge RC:$lhs, immSExt16:$rhs)), bb:$dst), - (BEQ (SLTiOp RC:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>; -def : Pat<(brcond (i32 (setuge RC:$lhs, immSExt16:$rhs)), bb:$dst), - (BEQ (SLTiuOp RC:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>; +def : MipsPat<(brcond (i32 (setge RC:$lhs, RC:$rhs)), bb:$dst), + (BEQ (SLTOp RC:$lhs, RC:$rhs), ZERO, bb:$dst)>; +def : MipsPat<(brcond (i32 (setuge RC:$lhs, RC:$rhs)), bb:$dst), + (BEQ (SLTuOp RC:$lhs, RC:$rhs), ZERO, bb:$dst)>; +def : MipsPat<(brcond (i32 (setge RC:$lhs, immSExt16:$rhs)), bb:$dst), + (BEQ (SLTiOp RC:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>; +def : MipsPat<(brcond (i32 (setuge RC:$lhs, immSExt16:$rhs)), bb:$dst), + (BEQ (SLTiuOp RC:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>; -def : Pat<(brcond (i32 (setle RC:$lhs, RC:$rhs)), bb:$dst), - (BEQ (SLTOp RC:$rhs, RC:$lhs), ZERO, bb:$dst)>; -def : Pat<(brcond (i32 (setule RC:$lhs, RC:$rhs)), bb:$dst), - (BEQ (SLTuOp RC:$rhs, RC:$lhs), ZERO, bb:$dst)>; +def : MipsPat<(brcond (i32 (setle RC:$lhs, RC:$rhs)), bb:$dst), + (BEQ (SLTOp RC:$rhs, RC:$lhs), ZERO, bb:$dst)>; +def : MipsPat<(brcond (i32 (setule RC:$lhs, RC:$rhs)), bb:$dst), + (BEQ (SLTuOp RC:$rhs, RC:$lhs), ZERO, bb:$dst)>; -def : Pat<(brcond RC:$cond, bb:$dst), - (BNEOp RC:$cond, ZEROReg, bb:$dst)>; +def : MipsPat<(brcond RC:$cond, bb:$dst), + (BNEOp RC:$cond, ZEROReg, bb:$dst)>; } defm : BrcondPats<CPURegs, BEQ, BNE, SLT, SLTu, SLTi, SLTiu, ZERO>; @@ -1140,39 +1208,39 @@ defm : BrcondPats<CPURegs, BEQ, BNE, SLT, SLTu, SLTi, SLTiu, ZERO>; // setcc patterns multiclass SeteqPats<RegisterClass RC, Instruction SLTiuOp, Instruction XOROp, Instruction SLTuOp, Register ZEROReg> { - def : Pat<(seteq RC:$lhs, RC:$rhs), - (SLTiuOp (XOROp RC:$lhs, RC:$rhs), 1)>; - def : Pat<(setne RC:$lhs, RC:$rhs), - (SLTuOp ZEROReg, (XOROp RC:$lhs, RC:$rhs))>; + def : MipsPat<(seteq RC:$lhs, RC:$rhs), + (SLTiuOp (XOROp RC:$lhs, RC:$rhs), 1)>; + def : MipsPat<(setne RC:$lhs, RC:$rhs), + (SLTuOp ZEROReg, (XOROp RC:$lhs, RC:$rhs))>; } multiclass SetlePats<RegisterClass RC, Instruction SLTOp, Instruction SLTuOp> { 
- def : Pat<(setle RC:$lhs, RC:$rhs), - (XORi (SLTOp RC:$rhs, RC:$lhs), 1)>; - def : Pat<(setule RC:$lhs, RC:$rhs), - (XORi (SLTuOp RC:$rhs, RC:$lhs), 1)>; + def : MipsPat<(setle RC:$lhs, RC:$rhs), + (XORi (SLTOp RC:$rhs, RC:$lhs), 1)>; + def : MipsPat<(setule RC:$lhs, RC:$rhs), + (XORi (SLTuOp RC:$rhs, RC:$lhs), 1)>; } multiclass SetgtPats<RegisterClass RC, Instruction SLTOp, Instruction SLTuOp> { - def : Pat<(setgt RC:$lhs, RC:$rhs), - (SLTOp RC:$rhs, RC:$lhs)>; - def : Pat<(setugt RC:$lhs, RC:$rhs), - (SLTuOp RC:$rhs, RC:$lhs)>; + def : MipsPat<(setgt RC:$lhs, RC:$rhs), + (SLTOp RC:$rhs, RC:$lhs)>; + def : MipsPat<(setugt RC:$lhs, RC:$rhs), + (SLTuOp RC:$rhs, RC:$lhs)>; } multiclass SetgePats<RegisterClass RC, Instruction SLTOp, Instruction SLTuOp> { - def : Pat<(setge RC:$lhs, RC:$rhs), - (XORi (SLTOp RC:$lhs, RC:$rhs), 1)>; - def : Pat<(setuge RC:$lhs, RC:$rhs), - (XORi (SLTuOp RC:$lhs, RC:$rhs), 1)>; + def : MipsPat<(setge RC:$lhs, RC:$rhs), + (XORi (SLTOp RC:$lhs, RC:$rhs), 1)>; + def : MipsPat<(setuge RC:$lhs, RC:$rhs), + (XORi (SLTuOp RC:$lhs, RC:$rhs), 1)>; } multiclass SetgeImmPats<RegisterClass RC, Instruction SLTiOp, Instruction SLTiuOp> { - def : Pat<(setge RC:$lhs, immSExt16:$rhs), - (XORi (SLTiOp RC:$lhs, immSExt16:$rhs), 1)>; - def : Pat<(setuge RC:$lhs, immSExt16:$rhs), - (XORi (SLTiuOp RC:$lhs, immSExt16:$rhs), 1)>; + def : MipsPat<(setge RC:$lhs, immSExt16:$rhs), + (XORi (SLTiOp RC:$lhs, immSExt16:$rhs), 1)>; + def : MipsPat<(setuge RC:$lhs, immSExt16:$rhs), + (XORi (SLTiuOp RC:$lhs, immSExt16:$rhs), 1)>; } defm : SeteqPats<CPURegs, SLTiu, XOR, SLTu, ZERO>; @@ -1182,10 +1250,10 @@ defm : SetgePats<CPURegs, SLT, SLTu>; defm : SetgeImmPats<CPURegs, SLTi, SLTiu>; // select MipsDynAlloc -def : Pat<(MipsDynAlloc addr:$f), (DynAlloc addr:$f)>; +def : MipsPat<(MipsDynAlloc addr:$f), (DynAlloc addr:$f)>; // bswap pattern -def : Pat<(bswap CPURegs:$rt), (ROTR (WSBH CPURegs:$rt), 16)>; +def : MipsPat<(bswap CPURegs:$rt), (ROTR (WSBH CPURegs:$rt), 16)>; //===----------------------------------------------------------------------===// // Floating Point Support @@ -1195,3 +1263,8 @@ include "MipsInstrFPU.td" include "Mips64InstrInfo.td" include "MipsCondMov.td" +// +// Mips16 + +include "Mips16InstrFormats.td" +include "Mips16InstrInfo.td" diff --git a/lib/Target/Mips/MipsJITInfo.cpp b/lib/Target/Mips/MipsJITInfo.cpp index 76ca3e1..150bdbb 100644 --- a/lib/Target/Mips/MipsJITInfo.cpp +++ b/lib/Target/Mips/MipsJITInfo.cpp @@ -154,8 +154,8 @@ TargetJITInfo::StubLayout MipsJITInfo::getStubLayout() { return Result; } -void *MipsJITInfo::emitFunctionStub(const Function* F, void *Fn, - JITCodeEmitter &JCE) { +void *MipsJITInfo::emitFunctionStub(const Function *F, void *Fn, + JITCodeEmitter &JCE) { JCE.emitAlignment(4); void *Addr = (void*) (JCE.getCurrentPCValue()); if (!sys::Memory::setRangeWritable(Addr, 16)) @@ -193,7 +193,7 @@ void *MipsJITInfo::emitFunctionStub(const Function* F, void *Fn, /// it must rewrite the code to contain the actual addresses of any /// referenced global symbols. 
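Before the definition that follows, a hedged sketch (not from the patch) of the %hi/%lo fix-up a MIPS relocation typically applies once the target address is resolved; the rounding constant matches the math in the LowerCPRESTORE code this commit removes further down:

    // Patch a lui/addiu pair against a resolved address. The +0x8000 makes
    // %hi absorb the carry from the sign-extended 16-bit %lo.
    intptr_t ResultPtr = /* resolved symbol address */ 0;
    unsigned Hi = ((ResultPtr + 0x8000) >> 16) & 0xffff; // immediate for lui
    unsigned Lo = ResultPtr & 0xffff;                    // immediate for addiu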
void MipsJITInfo::relocate(void *Function, MachineRelocation *MR, - unsigned NumRelocs, unsigned char* GOTBase) { + unsigned NumRelocs, unsigned char *GOTBase) { for (unsigned i = 0; i != NumRelocs; ++i, ++MR) { void *RelocPos = (char*) Function + MR->getMachineCodeOffset(); diff --git a/lib/Target/Mips/MipsJITInfo.h b/lib/Target/Mips/MipsJITInfo.h index f4c4ae8..637a318 100644 --- a/lib/Target/Mips/MipsJITInfo.h +++ b/lib/Target/Mips/MipsJITInfo.h @@ -45,8 +45,8 @@ class MipsJITInfo : public TargetJITInfo { /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a /// small native function that simply calls the function at the specified /// address. - virtual void *emitFunctionStub(const Function* F, void *Fn, - JITCodeEmitter &JCE); + virtual void *emitFunctionStub(const Function *F, void *Fn, + JITCodeEmitter &JCE); /// getLazyResolverFunction - Expose the lazy resolver to the JIT. virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn); @@ -55,7 +55,7 @@ class MipsJITInfo : public TargetJITInfo { /// it must rewrite the code to contain the actual addresses of any /// referenced global symbols. virtual void relocate(void *Function, MachineRelocation *MR, - unsigned NumRelocs, unsigned char* GOTBase); + unsigned NumRelocs, unsigned char *GOTBase); /// Initialize - Initialize internal stage for the function being JITted. void Initialize(const MachineFunction &MF, bool isPIC) { diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp new file mode 100644 index 0000000..70ecbc1 --- /dev/null +++ b/lib/Target/Mips/MipsLongBranch.cpp @@ -0,0 +1,419 @@ +//===-- MipsLongBranch.cpp - Emit long branches ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass expands a branch or jump instruction into a long branch if its +// offset is too large to fit into its immediate field. 
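In concrete terms (a sketch, not from the patch): MIPS branches encode a signed 16-bit offset counted in 4-byte instruction slots, so the reachable window is roughly +/-128 KiB, and the test below is the one runOnMachineFunction applies near the end of this file:

    // The range test that decides whether a branch must be expanded.
    int64_t Offset = /* byte distance to the target, see computeOffset */ 0;
    bool Fits = isInt<16>(Offset / 4);
    // When !Fits (or -force-mips-long-branch is given), the long form is used.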
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "mips-long-branch" + +#include "Mips.h" +#include "MipsTargetMachine.h" +#include "MCTargetDesc/MipsBaseInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Function.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" + +using namespace llvm; + +STATISTIC(LongBranches, "Number of long branches."); + +static cl::opt<bool> SkipLongBranch( + "skip-mips-long-branch", + cl::init(false), + cl::desc("MIPS: Skip long branch pass."), + cl::Hidden); + +static cl::opt<bool> ForceLongBranch( + "force-mips-long-branch", + cl::init(false), + cl::desc("MIPS: Expand all branches to long format."), + cl::Hidden); + +namespace { + typedef MachineBasicBlock::iterator Iter; + typedef MachineBasicBlock::reverse_iterator ReverseIter; + + struct MBBInfo { + uint64_t Size; + bool HasLongBranch; + MachineInstr *Br; + + MBBInfo() : Size(0), HasLongBranch(false), Br(0) {} + }; + + class MipsLongBranch : public MachineFunctionPass { + + public: + static char ID; + MipsLongBranch(TargetMachine &tm) + : MachineFunctionPass(ID), TM(tm), + TII(static_cast<const MipsInstrInfo*>(tm.getInstrInfo())) {} + + virtual const char *getPassName() const { + return "Mips Long Branch"; + } + + bool runOnMachineFunction(MachineFunction &F); + + private: + void splitMBB(MachineBasicBlock *MBB); + void initMBBInfo(); + int64_t computeOffset(const MachineInstr *Br); + void replaceBranch(MachineBasicBlock &MBB, Iter Br, DebugLoc DL, + MachineBasicBlock *MBBOpnd); + void expandToLongBranch(MBBInfo &Info); + + const TargetMachine &TM; + const MipsInstrInfo *TII; + MachineFunction *MF; + SmallVector<MBBInfo, 16> MBBInfos; + }; + + char MipsLongBranch::ID = 0; +} // end of anonymous namespace + +/// createMipsLongBranchPass - Returns a pass that converts branches to long +/// branches. +FunctionPass *llvm::createMipsLongBranchPass(MipsTargetMachine &tm) { + return new MipsLongBranch(tm); +} + +/// Iterate over list of Br's operands and search for a MachineBasicBlock +/// operand. +static MachineBasicBlock *getTargetMBB(const MachineInstr &Br) { + for (unsigned I = 0, E = Br.getDesc().getNumOperands(); I < E; ++I) { + const MachineOperand &MO = Br.getOperand(I); + + if (MO.isMBB()) + return MO.getMBB(); + } + + assert(false && "This instruction does not have an MBB operand."); + return 0; +} + +// Traverse the list of instructions backwards until a non-debug instruction is +// found or it reaches E. +static ReverseIter getNonDebugInstr(ReverseIter B, ReverseIter E) { + for (; B != E; ++B) + if (!B->isDebugValue()) + return B; + + return E; +} + +// Split MBB if it has two direct jumps/branches. +void MipsLongBranch::splitMBB(MachineBasicBlock *MBB) { + ReverseIter End = MBB->rend(); + ReverseIter LastBr = getNonDebugInstr(MBB->rbegin(), End); + + // Return if MBB has no branch instructions. + if ((LastBr == End) || + (!LastBr->isConditionalBranch() && !LastBr->isUnconditionalBranch())) + return; + + ReverseIter FirstBr = getNonDebugInstr(llvm::next(LastBr), End); + + // MBB has only one branch instruction if FirstBr is not a branch + // instruction. 
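The "is this a branch we handle" test recurs inline in splitMBB below and again in initMBBInfo; factored out purely for illustration (the patch itself writes it out each time):

    // Illustration only: the candidate test used by splitMBB and initMBBInfo.
    static bool isDirectBranch(const MachineInstr &MI) {
      return MI.isConditionalBranch() || MI.isUnconditionalBranch();
    }
    // Once splitMBB has run, each block contains at most one such branch, so
    // a single MBBInfo::Br entry per block is sufficient.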
+ if ((FirstBr == End) || + (!FirstBr->isConditionalBranch() && !FirstBr->isUnconditionalBranch())) + return; + + assert(!FirstBr->isIndirectBranch() && "Unexpected indirect branch found."); + + // Create a new MBB. Move instructions in MBB to the newly created MBB. + MachineBasicBlock *NewMBB = + MF->CreateMachineBasicBlock(MBB->getBasicBlock()); + + // Insert NewMBB and fix control flow. + MachineBasicBlock *Tgt = getTargetMBB(*FirstBr); + NewMBB->transferSuccessors(MBB); + NewMBB->removeSuccessor(Tgt); + MBB->addSuccessor(NewMBB); + MBB->addSuccessor(Tgt); + MF->insert(llvm::next(MachineFunction::iterator(MBB)), NewMBB); + + NewMBB->splice(NewMBB->end(), MBB, (++LastBr).base(), MBB->end()); +} + +// Fill MBBInfos. +void MipsLongBranch::initMBBInfo() { + // Split the MBBs if they have two branches. Each basic block should have at + // most one branch after this loop is executed. + for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E;) + splitMBB(I++); + + MF->RenumberBlocks(); + MBBInfos.clear(); + MBBInfos.resize(MF->size()); + + for (unsigned I = 0, E = MBBInfos.size(); I < E; ++I) { + MachineBasicBlock *MBB = MF->getBlockNumbered(I); + + // Compute size of MBB. + for (MachineBasicBlock::instr_iterator MI = MBB->instr_begin(); + MI != MBB->instr_end(); ++MI) + MBBInfos[I].Size += TII->GetInstSizeInBytes(&*MI); + + // Search for MBB's branch instruction. + ReverseIter End = MBB->rend(); + ReverseIter Br = getNonDebugInstr(MBB->rbegin(), End); + + if ((Br != End) && !Br->isIndirectBranch() && + (Br->isConditionalBranch() || + (Br->isUnconditionalBranch() && + TM.getRelocationModel() == Reloc::PIC_))) + MBBInfos[I].Br = (++Br).base(); + } +} + +// Compute offset of branch in number of bytes. +int64_t MipsLongBranch::computeOffset(const MachineInstr *Br) { + int64_t Offset = 0; + int ThisMBB = Br->getParent()->getNumber(); + int TargetMBB = getTargetMBB(*Br)->getNumber(); + + // Compute offset of a forward branch. + if (ThisMBB < TargetMBB) { + for (int N = ThisMBB + 1; N < TargetMBB; ++N) + Offset += MBBInfos[N].Size; + + return Offset + 4; + } + + // Compute offset of a backward branch. + for (int N = ThisMBB; N >= TargetMBB; --N) + Offset += MBBInfos[N].Size; + + return -Offset + 4; +} + +// Replace Br with a branch which has the opposite condition code and a +// MachineBasicBlock operand MBBOpnd. +void MipsLongBranch::replaceBranch(MachineBasicBlock &MBB, Iter Br, + DebugLoc DL, MachineBasicBlock *MBBOpnd) { + unsigned NewOpc = Mips::GetOppositeBranchOpc(Br->getOpcode()); + const MCInstrDesc &NewDesc = TII->get(NewOpc); + + MachineInstrBuilder MIB = BuildMI(MBB, Br, DL, NewDesc); + + for (unsigned I = 0, E = Br->getDesc().getNumOperands(); I < E; ++I) { + MachineOperand &MO = Br->getOperand(I); + + if (!MO.isReg()) { + assert(MO.isMBB() && "MBB operand expected."); + break; + } + + MIB.addReg(MO.getReg()); + } + + MIB.addMBB(MBBOpnd); + + Br->eraseFromParent(); +} + +// Expand branch instructions to long branches. 
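A worked example of computeOffset above, with made-up block sizes (the trailing +4 accounts for the instruction in the branch's delay slot):

    // Hypothetical layout: branch at the end of B1 targeting B4, with
    // MBBInfos[2].Size == 8 and MBBInfos[3].Size == 12 bytes.
    int64_t Fwd = 8 + 12 + 4;    // computeOffset returns 24
    assert(isInt<16>(Fwd / 4));  // 24/4 == 6 easily fits the 16-bit field
    // A backward branch instead sums the sizes from its own block down to the
    // target and returns 4 minus that sum, i.e. a negative offset.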
+void MipsLongBranch::expandToLongBranch(MBBInfo &I) { + I.HasLongBranch = true; + + bool IsPIC = TM.getRelocationModel() == Reloc::PIC_; + unsigned ABI = TM.getSubtarget<MipsSubtarget>().getTargetABI(); + bool N64 = ABI == MipsSubtarget::N64; + + MachineBasicBlock::iterator Pos; + MachineBasicBlock *MBB = I.Br->getParent(), *TgtMBB = getTargetMBB(*I.Br); + DebugLoc DL = I.Br->getDebugLoc(); + const BasicBlock *BB = MBB->getBasicBlock(); + MachineFunction::iterator FallThroughMBB = ++MachineFunction::iterator(MBB); + MachineBasicBlock *LongBrMBB = MF->CreateMachineBasicBlock(BB); + + MF->insert(FallThroughMBB, LongBrMBB); + MBB->removeSuccessor(TgtMBB); + MBB->addSuccessor(LongBrMBB); + + if (IsPIC) { + // $longbr: + // addiu $sp, $sp, -regsize * 2 + // sw $ra, 0($sp) + // bal $baltgt + // sw $a3, regsize($sp) + // $baltgt: + // lui $a3, %hi($baltgt) + // lui $at, %hi($tgt) + // addiu $a3, $a3, %lo($baltgt) + // addiu $at, $at, %lo($tgt) + // subu $at, $at, $a3 + // addu $at, $ra, $at + // + // if n64: + // lui $a3, %highest($baltgt) + // lui $ra, %highest($tgt) + // addiu $a3, $a3, %higher($baltgt) + // addiu $ra, $ra, %higher($tgt) + // dsll $a3, $a3, 32 + // dsll $ra, $ra, 32 + // subu $at, $at, $a3 + // addu $at, $at, $ra + // + // lw $ra, 0($sp) + // lw $a3, regsize($sp) + // jr $at + // addiu $sp, $sp, regsize * 2 + // $fallthrough: + // + MF->getInfo<MipsFunctionInfo>()->setEmitNOAT(); + MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB); + MF->insert(FallThroughMBB, BalTgtMBB); + LongBrMBB->addSuccessor(BalTgtMBB); + BalTgtMBB->addSuccessor(TgtMBB); + + int RegSize = N64 ? 8 : 4; + unsigned AT = N64 ? Mips::AT_64 : Mips::AT; + unsigned A3 = N64 ? Mips::A3_64 : Mips::A3; + unsigned SP = N64 ? Mips::SP_64 : Mips::SP; + unsigned RA = N64 ? Mips::RA_64 : Mips::RA; + unsigned Load = N64 ? Mips::LD_P8 : Mips::LW; + unsigned Store = N64 ? Mips::SD_P8 : Mips::SW; + unsigned LUi = N64 ? Mips::LUi64 : Mips::LUi; + unsigned ADDiu = N64 ? Mips::DADDiu : Mips::ADDiu; + unsigned ADDu = N64 ? Mips::DADDu : Mips::ADDu; + unsigned SUBu = N64 ? Mips::SUBu : Mips::SUBu; + unsigned JR = N64 ? 
Mips::JR64 : Mips::JR; + + Pos = LongBrMBB->begin(); + + BuildMI(*LongBrMBB, Pos, DL, TII->get(ADDiu), SP).addReg(SP) + .addImm(-RegSize * 2); + BuildMI(*LongBrMBB, Pos, DL, TII->get(Store)).addReg(RA).addReg(SP) + .addImm(0); + BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::BAL_BR)).addMBB(BalTgtMBB); + BuildMI(*LongBrMBB, Pos, DL, TII->get(Store)).addReg(A3).addReg(SP) + .addImm(RegSize)->setIsInsideBundle(); + + Pos = BalTgtMBB->begin(); + + BuildMI(*BalTgtMBB, Pos, DL, TII->get(LUi), A3) + .addMBB(BalTgtMBB, MipsII::MO_ABS_HI); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(LUi), AT) + .addMBB(TgtMBB, MipsII::MO_ABS_HI); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), A3).addReg(A3) + .addMBB(BalTgtMBB, MipsII::MO_ABS_LO); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), AT).addReg(AT) + .addMBB(TgtMBB, MipsII::MO_ABS_LO); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(SUBu), AT).addReg(AT).addReg(A3); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDu), AT).addReg(RA).addReg(AT); + + if (N64) { + BuildMI(*BalTgtMBB, Pos, DL, TII->get(LUi), A3) + .addMBB(BalTgtMBB, MipsII::MO_HIGHEST); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(LUi), RA) + .addMBB(TgtMBB, MipsII::MO_HIGHEST); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), A3).addReg(A3) + .addMBB(BalTgtMBB, MipsII::MO_HIGHER); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), RA).addReg(RA) + .addMBB(TgtMBB, MipsII::MO_HIGHER); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DSLL), A3).addReg(A3) + .addImm(32); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DSLL), RA).addReg(RA) + .addImm(32); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(SUBu), AT).addReg(AT).addReg(A3); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDu), AT).addReg(AT).addReg(RA); + I.Size += 4 * 8; + } + + BuildMI(*BalTgtMBB, Pos, DL, TII->get(Load), RA).addReg(SP).addImm(0); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(Load), A3).addReg(SP).addImm(RegSize); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(JR)).addReg(AT); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), SP).addReg(SP) + .addImm(RegSize * 2)->setIsInsideBundle(); + I.Size += 4 * 14; + } else { + // $longbr: + // j $tgt + // nop + // $fallthrough: + // + Pos = LongBrMBB->begin(); + LongBrMBB->addSuccessor(TgtMBB); + BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::J)).addMBB(TgtMBB); + BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::NOP))->setIsInsideBundle(); + I.Size += 4 * 2; + } + + if (I.Br->isUnconditionalBranch()) { + // Change branch destination. + assert(I.Br->getDesc().getNumOperands() == 1); + I.Br->RemoveOperand(0); + I.Br->addOperand(MachineOperand::CreateMBB(LongBrMBB)); + } else + // Change branch destination and reverse condition. 
+ replaceBranch(*MBB, I.Br, DL, FallThroughMBB); +} + +static void emitGPDisp(MachineFunction &F, const MipsInstrInfo *TII) { + MachineBasicBlock &MBB = F.front(); + MachineBasicBlock::iterator I = MBB.begin(); + DebugLoc DL = MBB.findDebugLoc(MBB.begin()); + BuildMI(MBB, I, DL, TII->get(Mips::LUi), Mips::V0) + .addExternalSymbol("_gp_disp", MipsII::MO_ABS_HI); + BuildMI(MBB, I, DL, TII->get(Mips::ADDiu), Mips::V0) + .addReg(Mips::V0).addExternalSymbol("_gp_disp", MipsII::MO_ABS_LO); + MBB.removeLiveIn(Mips::V0); +} + +bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) { + if ((TM.getRelocationModel() == Reloc::PIC_) && + TM.getSubtarget<MipsSubtarget>().isABI_O32() && + F.getInfo<MipsFunctionInfo>()->globalBaseRegSet()) + emitGPDisp(F, TII); + + if (SkipLongBranch) + return true; + + MF = &F; + initMBBInfo(); + + SmallVector<MBBInfo, 16>::iterator I, E = MBBInfos.end(); + bool EverMadeChange = false, MadeChange = true; + + while (MadeChange) { + MadeChange = false; + + for (I = MBBInfos.begin(); I != E; ++I) { + // Skip if this MBB doesn't have a branch or the branch has already been + // converted to a long branch. + if (!I->Br || I->HasLongBranch) + continue; + + if (!ForceLongBranch) + // Check if offset fits into 16-bit immediate field of branches. + if (isInt<16>(computeOffset(I->Br) / 4)) + continue; + + expandToLongBranch(*I); + ++LongBranches; + EverMadeChange = MadeChange = true; + } + } + + if (EverMadeChange) + MF->RenumberBlocks(); + + return true; +} diff --git a/lib/Target/Mips/MipsMCInstLower.cpp b/lib/Target/Mips/MipsMCInstLower.cpp index 1597b93..d4c5e6d 100644 --- a/lib/Target/Mips/MipsMCInstLower.cpp +++ b/lib/Target/Mips/MipsMCInstLower.cpp @@ -29,7 +29,7 @@ using namespace llvm; MipsMCInstLower::MipsMCInstLower(MipsAsmPrinter &asmprinter) : AsmPrinter(asmprinter) {} -void MipsMCInstLower::Initialize(Mangler *M, MCContext* C) { +void MipsMCInstLower::Initialize(Mangler *M, MCContext *C) { Mang = M; Ctx = C; } @@ -61,6 +61,8 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO, case MipsII::MO_GOT_DISP: Kind = MCSymbolRefExpr::VK_Mips_GOT_DISP; break; case MipsII::MO_GOT_PAGE: Kind = MCSymbolRefExpr::VK_Mips_GOT_PAGE; break; case MipsII::MO_GOT_OFST: Kind = MCSymbolRefExpr::VK_Mips_GOT_OFST; break; + case MipsII::MO_HIGHER: Kind = MCSymbolRefExpr::VK_Mips_HIGHER; break; + case MipsII::MO_HIGHEST: Kind = MCSymbolRefExpr::VK_Mips_HIGHEST; break; } switch (MOTy) { @@ -70,14 +72,17 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO, case MachineOperand::MO_GlobalAddress: Symbol = Mang->getSymbol(MO.getGlobal()); + Offset += MO.getOffset(); break; case MachineOperand::MO_BlockAddress: Symbol = AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress()); + Offset += MO.getOffset(); break; case MachineOperand::MO_ExternalSymbol: Symbol = AsmPrinter.GetExternalSymbolSymbol(MO.getSymbolName()); + Offset += MO.getOffset(); break; case MachineOperand::MO_JumpTableIndex: @@ -86,8 +91,7 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO, case MachineOperand::MO_ConstantPoolIndex: Symbol = AsmPrinter.GetCPISymbol(MO.getIndex()); - if (MO.getOffset()) - Offset += MO.getOffset(); + Offset += MO.getOffset(); break; default: @@ -103,71 +107,23 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO, assert(Offset > 0); const MCConstantExpr *OffsetExpr = MCConstantExpr::Create(Offset, *Ctx); - const MCBinaryExpr *AddExpr = MCBinaryExpr::CreateAdd(MCSym, OffsetExpr, *Ctx); - return 
MCOperand::CreateExpr(AddExpr); + const MCBinaryExpr *Add = MCBinaryExpr::CreateAdd(MCSym, OffsetExpr, *Ctx); + return MCOperand::CreateExpr(Add); } -static void CreateMCInst(MCInst& Inst, unsigned Opc, const MCOperand& Opnd0, - const MCOperand& Opnd1, - const MCOperand& Opnd2 = MCOperand()) { +/* +static void CreateMCInst(MCInst& Inst, unsigned Opc, const MCOperand &Opnd0, + const MCOperand &Opnd1, + const MCOperand &Opnd2 = MCOperand()) { Inst.setOpcode(Opc); Inst.addOperand(Opnd0); Inst.addOperand(Opnd1); if (Opnd2.isValid()) Inst.addOperand(Opnd2); } +*/ -// Lower ".cpload $reg" to -// "lui $gp, %hi(_gp_disp)" -// "addiu $gp, $gp, %lo(_gp_disp)" -// "addu $gp, $gp, $t9" -void MipsMCInstLower::LowerCPLOAD(SmallVector<MCInst, 4>& MCInsts) { - MCOperand GPReg = MCOperand::CreateReg(Mips::GP); - MCOperand T9Reg = MCOperand::CreateReg(Mips::T9); - StringRef SymName("_gp_disp"); - const MCSymbol *Sym = Ctx->GetOrCreateSymbol(SymName); - const MCSymbolRefExpr *MCSym; - - MCSym = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_Mips_ABS_HI, *Ctx); - MCOperand SymHi = MCOperand::CreateExpr(MCSym); - MCSym = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_Mips_ABS_LO, *Ctx); - MCOperand SymLo = MCOperand::CreateExpr(MCSym); - - MCInsts.resize(3); - - CreateMCInst(MCInsts[0], Mips::LUi, GPReg, SymHi); - CreateMCInst(MCInsts[1], Mips::ADDiu, GPReg, GPReg, SymLo); - CreateMCInst(MCInsts[2], Mips::ADDu, GPReg, GPReg, T9Reg); -} - -// Lower ".cprestore offset" to "sw $gp, offset($sp)". -void MipsMCInstLower::LowerCPRESTORE(int64_t Offset, - SmallVector<MCInst, 4>& MCInsts) { - assert(isInt<32>(Offset) && (Offset >= 0) && - "Imm operand of .cprestore must be a non-negative 32-bit value."); - - MCOperand SPReg = MCOperand::CreateReg(Mips::SP), BaseReg = SPReg; - MCOperand GPReg = MCOperand::CreateReg(Mips::GP); - - if (!isInt<16>(Offset)) { - unsigned Hi = ((Offset + 0x8000) >> 16) & 0xffff; - Offset &= 0xffff; - MCOperand ATReg = MCOperand::CreateReg(Mips::AT); - BaseReg = ATReg; - - // lui at,hi - // addu at,at,sp - MCInsts.resize(2); - CreateMCInst(MCInsts[0], Mips::LUi, ATReg, MCOperand::CreateImm(Hi)); - CreateMCInst(MCInsts[1], Mips::ADDu, ATReg, ATReg, SPReg); - } - - MCInst Sw; - CreateMCInst(Sw, Mips::SW, GPReg, BaseReg, MCOperand::CreateImm(Offset)); - MCInsts.push_back(Sw); -} - -MCOperand MipsMCInstLower::LowerOperand(const MachineOperand& MO, +MCOperand MipsMCInstLower::LowerOperand(const MachineOperand &MO, unsigned offset) const { MachineOperandType MOTy = MO.getType(); @@ -205,139 +161,31 @@ void MipsMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { } } -void MipsMCInstLower::LowerUnalignedLoadStore(const MachineInstr *MI, - SmallVector<MCInst, - 4>& MCInsts) { - unsigned Opc = MI->getOpcode(); - MCInst Instr1, Instr2, Instr3, Move; - - bool TwoInstructions = false; - - assert(MI->getNumOperands() == 3); - assert(MI->getOperand(0).isReg()); - assert(MI->getOperand(1).isReg()); - - MCOperand Target = LowerOperand(MI->getOperand(0)); - MCOperand Base = LowerOperand(MI->getOperand(1)); - MCOperand ATReg = MCOperand::CreateReg(Mips::AT); - MCOperand ZeroReg = MCOperand::CreateReg(Mips::ZERO); - - MachineOperand UnLoweredName = MI->getOperand(2); - MCOperand Name = LowerOperand(UnLoweredName); - - Move.setOpcode(Mips::ADDu); - Move.addOperand(Target); - Move.addOperand(ATReg); - Move.addOperand(ZeroReg); - - switch (Opc) { - case Mips::ULW: { - // FIXME: only works for little endian right now - MCOperand AdjName = LowerOperand(UnLoweredName, 3); - if (Base.getReg() == 
(Target.getReg())) { - Instr1.setOpcode(Mips::LWL); - Instr1.addOperand(ATReg); - Instr1.addOperand(Base); - Instr1.addOperand(AdjName); - Instr2.setOpcode(Mips::LWR); - Instr2.addOperand(ATReg); - Instr2.addOperand(Base); - Instr2.addOperand(Name); - Instr3 = Move; - } else { - TwoInstructions = true; - Instr1.setOpcode(Mips::LWL); - Instr1.addOperand(Target); - Instr1.addOperand(Base); - Instr1.addOperand(AdjName); - Instr2.setOpcode(Mips::LWR); - Instr2.addOperand(Target); - Instr2.addOperand(Base); - Instr2.addOperand(Name); - } +// If the D<shift> instruction has a shift amount that is greater +// than 31 (checked in calling routine), lower it to a D<shift>32 instruction +void MipsMCInstLower::LowerLargeShift(const MachineInstr *MI, + MCInst& Inst, + int64_t Shift) { + // rt + Inst.addOperand(LowerOperand(MI->getOperand(0))); + // rd + Inst.addOperand(LowerOperand(MI->getOperand(1))); + // saminus32 + Inst.addOperand(MCOperand::CreateImm(Shift)); + + switch (MI->getOpcode()) { + default: + // Calling function is not synchronized + llvm_unreachable("Unexpected shift instruction"); break; - } - case Mips::ULHu: { - // FIXME: only works for little endian right now - MCOperand AdjName = LowerOperand(UnLoweredName, 1); - Instr1.setOpcode(Mips::LBu); - Instr1.addOperand(ATReg); - Instr1.addOperand(Base); - Instr1.addOperand(AdjName); - Instr2.setOpcode(Mips::LBu); - Instr2.addOperand(Target); - Instr2.addOperand(Base); - Instr2.addOperand(Name); - Instr3.setOpcode(Mips::INS); - Instr3.addOperand(Target); - Instr3.addOperand(ATReg); - Instr3.addOperand(MCOperand::CreateImm(0x8)); - Instr3.addOperand(MCOperand::CreateImm(0x18)); + case Mips::DSLL: + Inst.setOpcode(Mips::DSLL32); break; - } - - case Mips::USW: { - // FIXME: only works for little endian right now - assert (Base.getReg() != Target.getReg()); - TwoInstructions = true; - MCOperand AdjName = LowerOperand(UnLoweredName, 3); - Instr1.setOpcode(Mips::SWL); - Instr1.addOperand(Target); - Instr1.addOperand(Base); - Instr1.addOperand(AdjName); - Instr2.setOpcode(Mips::SWR); - Instr2.addOperand(Target); - Instr2.addOperand(Base); - Instr2.addOperand(Name); + case Mips::DSRL: + Inst.setOpcode(Mips::DSRL32); break; - } - case Mips::USH: { - MCOperand AdjName = LowerOperand(UnLoweredName, 1); - Instr1.setOpcode(Mips::SB); - Instr1.addOperand(Target); - Instr1.addOperand(Base); - Instr1.addOperand(Name); - Instr2.setOpcode(Mips::SRL); - Instr2.addOperand(ATReg); - Instr2.addOperand(Target); - Instr2.addOperand(MCOperand::CreateImm(8)); - Instr3.setOpcode(Mips::SB); - Instr3.addOperand(ATReg); - Instr3.addOperand(Base); - Instr3.addOperand(AdjName); + case Mips::DSRA: + Inst.setOpcode(Mips::DSRA32); break; } - default: - // FIXME: need to add others - llvm_unreachable("unaligned instruction not processed"); - } - - MCInsts.push_back(Instr1); - MCInsts.push_back(Instr2); - if (!TwoInstructions) MCInsts.push_back(Instr3); -} - -// Convert -// "setgp01 $reg" -// to -// "lui $reg, %hi(_gp_disp)" -// "addiu $reg, $reg, %lo(_gp_disp)" -void MipsMCInstLower::LowerSETGP01(const MachineInstr *MI, - SmallVector<MCInst, 4>& MCInsts) { - const MachineOperand &MO = MI->getOperand(0); - assert(MO.isReg()); - MCOperand RegOpnd = MCOperand::CreateReg(MO.getReg()); - StringRef SymName("_gp_disp"); - const MCSymbol *Sym = Ctx->GetOrCreateSymbol(SymName); - const MCSymbolRefExpr *MCSym; - - MCSym = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_Mips_ABS_HI, *Ctx); - MCOperand SymHi = MCOperand::CreateExpr(MCSym); - MCSym = MCSymbolRefExpr::Create(Sym, 
MCSymbolRefExpr::VK_Mips_ABS_LO, *Ctx);
-  MCOperand SymLo = MCOperand::CreateExpr(MCSym);
-
-  MCInsts.resize(2);
-
-  CreateMCInst(MCInsts[0], Mips::LUi, RegOpnd, SymHi);
-  CreateMCInst(MCInsts[1], Mips::ADDiu, RegOpnd, RegOpnd, SymLo);
}
diff --git a/lib/Target/Mips/MipsMCInstLower.h b/lib/Target/Mips/MipsMCInstLower.h
index c1d007d..0abb996 100644
--- a/lib/Target/Mips/MipsMCInstLower.h
+++ b/lib/Target/Mips/MipsMCInstLower.h
@@ -31,13 +31,10 @@ class LLVM_LIBRARY_VISIBILITY MipsMCInstLower {
  MipsAsmPrinter &AsmPrinter;
public:
  MipsMCInstLower(MipsAsmPrinter &asmprinter);
-  void Initialize(Mangler *mang, MCContext* C);
+  void Initialize(Mangler *mang, MCContext *C);
  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
-  void LowerCPLOAD(SmallVector<MCInst, 4>& MCInsts);
-  void LowerCPRESTORE(int64_t Offset, SmallVector<MCInst, 4>& MCInsts);
-  void LowerUnalignedLoadStore(const MachineInstr *MI,
-                               SmallVector<MCInst, 4>& MCInsts);
-  void LowerSETGP01(const MachineInstr *MI, SmallVector<MCInst, 4>& MCInsts);
+  void LowerLargeShift(const MachineInstr *MI, MCInst &Inst, int64_t Shift);
+
private:
  MCOperand LowerSymbolOperand(const MachineOperand &MO,
                               MachineOperandType MOTy, unsigned Offset) const;
diff --git a/lib/Target/Mips/MipsMachineFunction.cpp b/lib/Target/Mips/MipsMachineFunction.cpp
index b00c62b..362173e 100644
--- a/lib/Target/Mips/MipsMachineFunction.cpp
+++ b/lib/Target/Mips/MipsMachineFunction.cpp
@@ -22,10 +22,6 @@ static cl::opt<bool>
FixGlobalBaseReg("mips-fix-global-base-reg", cl::Hidden, cl::init(true),
                 cl::desc("Always use $gp as the global base register."));

-bool MipsFunctionInfo::globalBaseRegFixed() const {
-  return FixGlobalBaseReg;
-}
-
bool MipsFunctionInfo::globalBaseRegSet() const {
  return GlobalBaseReg;
}
@@ -37,13 +33,13 @@ unsigned MipsFunctionInfo::getGlobalBaseReg() {

  const MipsSubtarget &ST = MF.getTarget().getSubtarget<MipsSubtarget>();

-  if (FixGlobalBaseReg) // $gp is the global base register.
-    return GlobalBaseReg = ST.isABI_N64() ? Mips::GP_64 : Mips::GP;
-
  const TargetRegisterClass *RC;
-  RC = ST.isABI_N64() ?
-    Mips::CPU64RegsRegisterClass : Mips::CPURegsRegisterClass;
-
+  if (ST.inMips16Mode())
+    RC = (const TargetRegisterClass*)&Mips::CPU16RegsRegClass;
+  else
+    RC = ST.isABI_N64() ?
+         (const TargetRegisterClass*)&Mips::CPU64RegsRegClass :
+         (const TargetRegisterClass*)&Mips::CPURegsRegClass;
  return GlobalBaseReg = MF.getRegInfo().createVirtualRegister(RC);
}
diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h
index 0fde55c..b2232c6 100644
--- a/lib/Target/Mips/MipsMachineFunction.h
+++ b/lib/Target/Mips/MipsMachineFunction.h
@@ -14,8 +14,11 @@
#ifndef MIPS_MACHINE_FUNCTION_INFO_H
#define MIPS_MACHINE_FUNCTION_INFO_H

+#include "MipsSubtarget.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
#include <utility>

namespace llvm {
@@ -45,7 +48,7 @@ class MipsFunctionInfo : public MachineFunctionInfo {
  // OutArgFIRange: Range of indices of all frame objects created during call to
  // LowerCall except for the frame object for restoring $gp.
  std::pair<int, int> InArgFIRange, OutArgFIRange;
-  int GPFI; // Index of the frame object for restoring $gp
+  int GlobalRegFI; // Frame index for saving/restoring the global base register.
  mutable int DynAllocFI; // Frame index of dynamically allocated stack area.
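  // Maximum size of the outgoing argument area among the call sites in the
  // function; initGlobalRegFI below places the saved global register just
  // past this area, rounded up to the stack alignment.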
unsigned MaxCallFrameSize; @@ -55,7 +58,7 @@ public: MipsFunctionInfo(MachineFunction& MF) : MF(MF), SRetReturnReg(0), GlobalBaseReg(0), VarArgsFrameIndex(0), InArgFIRange(std::make_pair(-1, 0)), - OutArgFIRange(std::make_pair(-1, 0)), GPFI(0), DynAllocFI(0), + OutArgFIRange(std::make_pair(-1, 0)), GlobalRegFI(0), DynAllocFI(0), MaxCallFrameSize(0), EmitNOAT(false) {} @@ -74,10 +77,23 @@ public: OutArgFIRange.second = LastFI; } - int getGPFI() const { return GPFI; } - void setGPFI(int FI) { GPFI = FI; } - bool needGPSaveRestore() const { return getGPFI(); } - bool isGPFI(int FI) const { return GPFI && GPFI == FI; } + bool isGlobalRegFI(int FI) const { + return GlobalRegFI && (FI == GlobalRegFI); + } + + int getGlobalRegFI() const { + return GlobalRegFI; + } + + int initGlobalRegFI() { + const TargetMachine &TM = MF.getTarget(); + unsigned RegSize = TM.getSubtarget<MipsSubtarget>().isABI_N64() ? 8 : 4; + int64_t StackAlignment = TM.getFrameLowering()->getStackAlignment(); + uint64_t Offset = RoundUpToAlignment(MaxCallFrameSize, StackAlignment); + + GlobalRegFI = MF.getFrameInfo()->CreateFixedObject(RegSize, Offset, true); + return GlobalRegFI; + } // The first call to this function creates a frame object for dynamically // allocated stack area. @@ -92,7 +108,6 @@ public: unsigned getSRetReturnReg() const { return SRetReturnReg; } void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; } - bool globalBaseRegFixed() const; bool globalBaseRegSet() const; unsigned getGlobalBaseReg(); diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp index f30de44..a3ce236 100644 --- a/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/lib/Target/Mips/MipsRegisterInfo.cpp @@ -16,9 +16,11 @@ #include "MipsRegisterInfo.h" #include "Mips.h" #include "MipsAnalyzeImmediate.h" +#include "MipsInstrInfo.h" #include "MipsSubtarget.h" #include "MipsMachineFunction.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" #include "llvm/Type.h" #include "llvm/Function.h" #include "llvm/CodeGen/ValueTypes.h" @@ -35,7 +37,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/Analysis/DebugInfo.h" #define GET_REGINFO_TARGET_DESC #include "MipsGenRegisterInfo.inc" @@ -54,8 +55,7 @@ unsigned MipsRegisterInfo::getPICCallReg() { return Mips::T9; } /// Mips Callee Saved Registers const uint16_t* MipsRegisterInfo:: -getCalleeSavedRegs(const MachineFunction *MF) const -{ +getCalleeSavedRegs(const MachineFunction *MF) const { if (Subtarget.isSingleFloat()) return CSR_SingleFloatOnly_SaveList; else if (!Subtarget.hasMips64()) @@ -64,12 +64,11 @@ getCalleeSavedRegs(const MachineFunction *MF) const return CSR_N32_SaveList; assert(Subtarget.isABI_N64()); - return CSR_N64_SaveList; + return CSR_N64_SaveList; } const uint32_t* -MipsRegisterInfo::getCallPreservedMask(CallingConv::ID) const -{ +MipsRegisterInfo::getCallPreservedMask(CallingConv::ID) const { if (Subtarget.isSingleFloat()) return CSR_SingleFloatOnly_RegMask; else if (!Subtarget.hasMips64()) @@ -78,23 +77,21 @@ MipsRegisterInfo::getCallPreservedMask(CallingConv::ID) const return CSR_N32_RegMask; assert(Subtarget.isABI_N64()); - return CSR_N64_RegMask; + return CSR_N64_RegMask; } BitVector MipsRegisterInfo:: getReservedRegs(const MachineFunction &MF) const { static const uint16_t ReservedCPURegs[] = { - Mips::ZERO, Mips::AT, Mips::K0, Mips::K1, - Mips::SP, Mips::FP, Mips::RA + Mips::ZERO, Mips::AT, Mips::K0, Mips::K1, Mips::SP }; static const uint16_t ReservedCPU64Regs[] = { 
-    Mips::ZERO_64, Mips::AT_64, Mips::K0_64, Mips::K1_64,
-    Mips::SP_64, Mips::FP_64, Mips::RA_64
+    Mips::ZERO_64, Mips::AT_64, Mips::K0_64, Mips::K1_64, Mips::SP_64
  };

  BitVector Reserved(getNumRegs());
-  typedef TargetRegisterClass::iterator RegIter;
+  typedef TargetRegisterClass::const_iterator RegIter;

  for (unsigned I = 0; I < array_lengthof(ReservedCPURegs); ++I)
    Reserved.set(ReservedCPURegs[I]);
@@ -104,31 +101,36 @@ getReservedRegs(const MachineFunction &MF) const {
      Reserved.set(ReservedCPU64Regs[I]);

    // Reserve all registers in AFGR64.
-    for (RegIter Reg = Mips::AFGR64RegisterClass->begin();
-         Reg != Mips::AFGR64RegisterClass->end(); ++Reg)
+    for (RegIter Reg = Mips::AFGR64RegClass.begin(),
+         EReg = Mips::AFGR64RegClass.end(); Reg != EReg; ++Reg)
      Reserved.set(*Reg);
-  }
-  else {
+  } else {
    // Reserve all registers in CPU64Regs & FGR64.
-    for (RegIter Reg = Mips::CPU64RegsRegisterClass->begin();
-         Reg != Mips::CPU64RegsRegisterClass->end(); ++Reg)
+    for (RegIter Reg = Mips::CPU64RegsRegClass.begin(),
+         EReg = Mips::CPU64RegsRegClass.end(); Reg != EReg; ++Reg)
      Reserved.set(*Reg);
-    for (RegIter Reg = Mips::FGR64RegisterClass->begin();
-         Reg != Mips::FGR64RegisterClass->end(); ++Reg)
+    for (RegIter Reg = Mips::FGR64RegClass.begin(),
+         EReg = Mips::FGR64RegClass.end(); Reg != EReg; ++Reg)
      Reserved.set(*Reg);
  }

-  // If GP is dedicated as a global base register, reserve it.
-  if (MF.getInfo<MipsFunctionInfo>()->globalBaseRegFixed()) {
-    Reserved.set(Mips::GP);
-    Reserved.set(Mips::GP_64);
+  // Reserve FP if this function should have a dedicated frame pointer register.
+  if (MF.getTarget().getFrameLowering()->hasFP(MF)) {
+    Reserved.set(Mips::FP);
+    Reserved.set(Mips::FP_64);
  }

  // Reserve hardware registers.
  Reserved.set(Mips::HWR29);
  Reserved.set(Mips::HWR29_64);

+  // Reserve RA if in mips16 mode.
+  if (Subtarget.inMips16Mode()) {
+    Reserved.set(Mips::RA);
+    Reserved.set(Mips::RA_64);
+  }
+
  return Reserved;
}

@@ -137,6 +139,11 @@ MipsRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
  return true;
}

+bool
+MipsRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+  return true;
+}
+
// This function eliminates ADJCALLSTACKDOWN and
// ADJCALLSTACKUP pseudo instructions.
void MipsRegisterInfo::
@@ -207,8 +214,8 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
  // incoming argument, callee-saved register location or local variable.
  int64_t Offset;

-  if (MipsFI->isOutArgFI(FrameIndex) || MipsFI->isGPFI(FrameIndex) ||
-      MipsFI->isDynAllocFI(FrameIndex))
+  if (MipsFI->isOutArgFI(FrameIndex) || MipsFI->isDynAllocFI(FrameIndex) ||
+      MipsFI->isGlobalRegFI(FrameIndex))
    Offset = spOffset;
  else
    Offset = spOffset + (int64_t)stackSize;
@@ -222,37 +229,17 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
  if (!MI.isDebugValue() && !isInt<16>(Offset)) {
    MachineBasicBlock &MBB = *MI.getParent();
    DebugLoc DL = II->getDebugLoc();
-    MipsAnalyzeImmediate AnalyzeImm;
-    unsigned Size = Subtarget.isABI_N64() ? 64 : 32;
-    unsigned LUi = Subtarget.isABI_N64() ? Mips::LUi64 : Mips::LUi;
    unsigned ADDu = Subtarget.isABI_N64() ? Mips::DADDu : Mips::ADDu;
-    unsigned ZEROReg = Subtarget.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO;
    unsigned ATReg = Subtarget.isABI_N64() ?
Mips::AT_64 : Mips::AT; - const MipsAnalyzeImmediate::InstSeq &Seq = - AnalyzeImm.Analyze(Offset, Size, true /* LastInstrIsADDiu */); - MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin(); + MipsAnalyzeImmediate::Inst LastInst(0, 0); MipsFI->setEmitNOAT(); - - // The first instruction can be a LUi, which is different from other - // instructions (ADDiu, ORI and SLL) in that it does not have a register - // operand. - if (Inst->Opc == LUi) - BuildMI(MBB, II, DL, TII.get(LUi), ATReg) - .addImm(SignExtend64<16>(Inst->ImmOpnd)); - else - BuildMI(MBB, II, DL, TII.get(Inst->Opc), ATReg).addReg(ZEROReg) - .addImm(SignExtend64<16>(Inst->ImmOpnd)); - - // Build the remaining instructions in Seq except for the last one. - for (++Inst; Inst != Seq.end() - 1; ++Inst) - BuildMI(MBB, II, DL, TII.get(Inst->Opc), ATReg).addReg(ATReg) - .addImm(SignExtend64<16>(Inst->ImmOpnd)); - + Mips::loadImmediate(Offset, Subtarget.isABI_N64(), TII, MBB, II, DL, true, + &LastInst); BuildMI(MBB, II, DL, TII.get(ADDu), ATReg).addReg(FrameReg).addReg(ATReg); FrameReg = ATReg; - Offset = SignExtend64<16>(Inst->ImmOpnd); + Offset = SignExtend64<16>(LastInst.ImmOpnd); } MI.getOperand(i).ChangeToRegister(FrameReg, false); diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h index 0716d29..f320bae 100644 --- a/lib/Target/Mips/MipsRegisterInfo.h +++ b/lib/Target/Mips/MipsRegisterInfo.h @@ -42,13 +42,15 @@ struct MipsRegisterInfo : public MipsGenRegisterInfo { void adjustMipsStackFrame(MachineFunction &MF) const; /// Code Generation virtual methods... - const uint16_t *getCalleeSavedRegs(const MachineFunction* MF = 0) const; + const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const; const uint32_t *getCallPreservedMask(CallingConv::ID) const; BitVector getReservedRegs(const MachineFunction &MF) const; virtual bool requiresRegisterScavenging(const MachineFunction &MF) const; + virtual bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const; + void eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td index ce399a0..b255e42 100644 --- a/lib/Target/Mips/MipsRegisterInfo.td +++ b/lib/Target/Mips/MipsRegisterInfo.td @@ -70,8 +70,8 @@ class HWR<bits<5> num, string n> : MipsReg<n> { let Namespace = "Mips" in { // General Purpose Registers - def ZERO : MipsGPRReg< 0, "ZERO">, DwarfRegNum<[0]>; - def AT : MipsGPRReg< 1, "AT">, DwarfRegNum<[1]>; + def ZERO : MipsGPRReg< 0, "zero">, DwarfRegNum<[0]>; + def AT : MipsGPRReg< 1, "at">, DwarfRegNum<[1]>; def V0 : MipsGPRReg< 2, "2">, DwarfRegNum<[2]>; def V1 : MipsGPRReg< 3, "3">, DwarfRegNum<[3]>; def A0 : MipsGPRReg< 4, "4">, DwarfRegNum<[4]>; @@ -98,14 +98,14 @@ let Namespace = "Mips" in { def T9 : MipsGPRReg< 25, "25">, DwarfRegNum<[25]>; def K0 : MipsGPRReg< 26, "26">, DwarfRegNum<[26]>; def K1 : MipsGPRReg< 27, "27">, DwarfRegNum<[27]>; - def GP : MipsGPRReg< 28, "GP">, DwarfRegNum<[28]>; - def SP : MipsGPRReg< 29, "SP">, DwarfRegNum<[29]>; - def FP : MipsGPRReg< 30, "FP">, DwarfRegNum<[30]>; - def RA : MipsGPRReg< 31, "RA">, DwarfRegNum<[31]>; + def GP : MipsGPRReg< 28, "gp">, DwarfRegNum<[28]>; + def SP : MipsGPRReg< 29, "sp">, DwarfRegNum<[29]>; + def FP : MipsGPRReg< 30, "fp">, DwarfRegNum<[30]>; + def RA : MipsGPRReg< 31, "ra">, DwarfRegNum<[31]>; // General Purpose 64-bit Registers - def ZERO_64 : Mips64GPRReg< 0, "ZERO", [ZERO]>, DwarfRegNum<[0]>; - def AT_64 : 
Mips64GPRReg< 1, "AT", [AT]>, DwarfRegNum<[1]>;
+  def ZERO_64 : Mips64GPRReg< 0, "zero", [ZERO]>, DwarfRegNum<[0]>;
+  def AT_64 : Mips64GPRReg< 1, "at", [AT]>, DwarfRegNum<[1]>;
  def V0_64 : Mips64GPRReg< 2, "2", [V0]>, DwarfRegNum<[2]>;
  def V1_64 : Mips64GPRReg< 3, "3", [V1]>, DwarfRegNum<[3]>;
  def A0_64 : Mips64GPRReg< 4, "4", [A0]>, DwarfRegNum<[4]>;
@@ -132,97 +132,97 @@ let Namespace = "Mips" in {
  def T9_64 : Mips64GPRReg< 25, "25", [T9]>, DwarfRegNum<[25]>;
  def K0_64 : Mips64GPRReg< 26, "26", [K0]>, DwarfRegNum<[26]>;
  def K1_64 : Mips64GPRReg< 27, "27", [K1]>, DwarfRegNum<[27]>;
-  def GP_64 : Mips64GPRReg< 28, "GP", [GP]>, DwarfRegNum<[28]>;
-  def SP_64 : Mips64GPRReg< 29, "SP", [SP]>, DwarfRegNum<[29]>;
-  def FP_64 : Mips64GPRReg< 30, "FP", [FP]>, DwarfRegNum<[30]>;
-  def RA_64 : Mips64GPRReg< 31, "RA", [RA]>, DwarfRegNum<[31]>;
+  def GP_64 : Mips64GPRReg< 28, "gp", [GP]>, DwarfRegNum<[28]>;
+  def SP_64 : Mips64GPRReg< 29, "sp", [SP]>, DwarfRegNum<[29]>;
+  def FP_64 : Mips64GPRReg< 30, "fp", [FP]>, DwarfRegNum<[30]>;
+  def RA_64 : Mips64GPRReg< 31, "ra", [RA]>, DwarfRegNum<[31]>;

  /// Mips single precision FPU registers
-  def F0 : FPR< 0, "F0">, DwarfRegNum<[32]>;
-  def F1 : FPR< 1, "F1">, DwarfRegNum<[33]>;
-  def F2 : FPR< 2, "F2">, DwarfRegNum<[34]>;
-  def F3 : FPR< 3, "F3">, DwarfRegNum<[35]>;
-  def F4 : FPR< 4, "F4">, DwarfRegNum<[36]>;
-  def F5 : FPR< 5, "F5">, DwarfRegNum<[37]>;
-  def F6 : FPR< 6, "F6">, DwarfRegNum<[38]>;
-  def F7 : FPR< 7, "F7">, DwarfRegNum<[39]>;
-  def F8 : FPR< 8, "F8">, DwarfRegNum<[40]>;
-  def F9 : FPR< 9, "F9">, DwarfRegNum<[41]>;
-  def F10 : FPR<10, "F10">, DwarfRegNum<[42]>;
-  def F11 : FPR<11, "F11">, DwarfRegNum<[43]>;
-  def F12 : FPR<12, "F12">, DwarfRegNum<[44]>;
-  def F13 : FPR<13, "F13">, DwarfRegNum<[45]>;
-  def F14 : FPR<14, "F14">, DwarfRegNum<[46]>;
-  def F15 : FPR<15, "F15">, DwarfRegNum<[47]>;
-  def F16 : FPR<16, "F16">, DwarfRegNum<[48]>;
-  def F17 : FPR<17, "F17">, DwarfRegNum<[49]>;
-  def F18 : FPR<18, "F18">, DwarfRegNum<[50]>;
-  def F19 : FPR<19, "F19">, DwarfRegNum<[51]>;
-  def F20 : FPR<20, "F20">, DwarfRegNum<[52]>;
-  def F21 : FPR<21, "F21">, DwarfRegNum<[53]>;
-  def F22 : FPR<22, "F22">, DwarfRegNum<[54]>;
-  def F23 : FPR<23, "F23">, DwarfRegNum<[55]>;
-  def F24 : FPR<24, "F24">, DwarfRegNum<[56]>;
-  def F25 : FPR<25, "F25">, DwarfRegNum<[57]>;
-  def F26 : FPR<26, "F26">, DwarfRegNum<[58]>;
-  def F27 : FPR<27, "F27">, DwarfRegNum<[59]>;
-  def F28 : FPR<28, "F28">, DwarfRegNum<[60]>;
-  def F29 : FPR<29, "F29">, DwarfRegNum<[61]>;
-  def F30 : FPR<30, "F30">, DwarfRegNum<[62]>;
-  def F31 : FPR<31, "F31">, DwarfRegNum<[63]>;
+  def F0 : FPR< 0, "f0">, DwarfRegNum<[32]>;
+  def F1 : FPR< 1, "f1">, DwarfRegNum<[33]>;
+  def F2 : FPR< 2, "f2">, DwarfRegNum<[34]>;
+  def F3 : FPR< 3, "f3">, DwarfRegNum<[35]>;
+  def F4 : FPR< 4, "f4">, DwarfRegNum<[36]>;
+  def F5 : FPR< 5, "f5">, DwarfRegNum<[37]>;
+  def F6 : FPR< 6, "f6">, DwarfRegNum<[38]>;
+  def F7 : FPR< 7, "f7">, DwarfRegNum<[39]>;
+  def F8 : FPR< 8, "f8">, DwarfRegNum<[40]>;
+  def F9 : FPR< 9, "f9">, DwarfRegNum<[41]>;
+  def F10 : FPR<10, "f10">, DwarfRegNum<[42]>;
+  def F11 : FPR<11, "f11">, DwarfRegNum<[43]>;
+  def F12 : FPR<12, "f12">, DwarfRegNum<[44]>;
+  def F13 : FPR<13, "f13">, DwarfRegNum<[45]>;
+  def F14 : FPR<14, "f14">, DwarfRegNum<[46]>;
+  def F15 : FPR<15, "f15">, DwarfRegNum<[47]>;
+  def F16 : FPR<16, "f16">, DwarfRegNum<[48]>;
+  def F17 : FPR<17, "f17">, DwarfRegNum<[49]>;
+  def F18 : FPR<18, "f18">, DwarfRegNum<[50]>;
+  def F19 : FPR<19, "f19">, DwarfRegNum<[51]>;
+  def F20 : FPR<20, "f20">, DwarfRegNum<[52]>;
+  def F21 : FPR<21, "f21">, DwarfRegNum<[53]>;
+  def F22 : FPR<22, "f22">, DwarfRegNum<[54]>;
+  def F23 : FPR<23, "f23">, DwarfRegNum<[55]>;
+  def F24 : FPR<24, "f24">, DwarfRegNum<[56]>;
+  def F25 : FPR<25, "f25">, DwarfRegNum<[57]>;
+  def F26 : FPR<26, "f26">, DwarfRegNum<[58]>;
+  def F27 : FPR<27, "f27">, DwarfRegNum<[59]>;
+  def F28 : FPR<28, "f28">, DwarfRegNum<[60]>;
+  def F29 : FPR<29, "f29">, DwarfRegNum<[61]>;
+  def F30 : FPR<30, "f30">, DwarfRegNum<[62]>;
+  def F31 : FPR<31, "f31">, DwarfRegNum<[63]>;

  /// Mips double precision FPU registers (aliased
  /// with the single precision registers to hold 64-bit values)
-  def D0 : AFPR< 0, "F0", [F0, F1]>;
-  def D1 : AFPR< 2, "F2", [F2, F3]>;
-  def D2 : AFPR< 4, "F4", [F4, F5]>;
-  def D3 : AFPR< 6, "F6", [F6, F7]>;
-  def D4 : AFPR< 8, "F8", [F8, F9]>;
-  def D5 : AFPR<10, "F10", [F10, F11]>;
-  def D6 : AFPR<12, "F12", [F12, F13]>;
-  def D7 : AFPR<14, "F14", [F14, F15]>;
-  def D8 : AFPR<16, "F16", [F16, F17]>;
-  def D9 : AFPR<18, "F18", [F18, F19]>;
-  def D10 : AFPR<20, "F20", [F20, F21]>;
-  def D11 : AFPR<22, "F22", [F22, F23]>;
-  def D12 : AFPR<24, "F24", [F24, F25]>;
-  def D13 : AFPR<26, "F26", [F26, F27]>;
-  def D14 : AFPR<28, "F28", [F28, F29]>;
-  def D15 : AFPR<30, "F30", [F30, F31]>;
+  def D0 : AFPR< 0, "f0", [F0, F1]>;
+  def D1 : AFPR< 2, "f2", [F2, F3]>;
+  def D2 : AFPR< 4, "f4", [F4, F5]>;
+  def D3 : AFPR< 6, "f6", [F6, F7]>;
+  def D4 : AFPR< 8, "f8", [F8, F9]>;
+  def D5 : AFPR<10, "f10", [F10, F11]>;
+  def D6 : AFPR<12, "f12", [F12, F13]>;
+  def D7 : AFPR<14, "f14", [F14, F15]>;
+  def D8 : AFPR<16, "f16", [F16, F17]>;
+  def D9 : AFPR<18, "f18", [F18, F19]>;
+  def D10 : AFPR<20, "f20", [F20, F21]>;
+  def D11 : AFPR<22, "f22", [F22, F23]>;
+  def D12 : AFPR<24, "f24", [F24, F25]>;
+  def D13 : AFPR<26, "f26", [F26, F27]>;
+  def D14 : AFPR<28, "f28", [F28, F29]>;
+  def D15 : AFPR<30, "f30", [F30, F31]>;

  /// Mips double precision FPU registers in MFP64 mode.
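  /// (In MFP64 mode there are 32 such registers, each overlaying a single
  /// 64-bit FPU register, rather than the 16 even/odd pairs listed above.)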
- def D0_64 : AFPR64<0, "F0", [F0]>, DwarfRegNum<[32]>; - def D1_64 : AFPR64<1, "F1", [F1]>, DwarfRegNum<[33]>; - def D2_64 : AFPR64<2, "F2", [F2]>, DwarfRegNum<[34]>; - def D3_64 : AFPR64<3, "F3", [F3]>, DwarfRegNum<[35]>; - def D4_64 : AFPR64<4, "F4", [F4]>, DwarfRegNum<[36]>; - def D5_64 : AFPR64<5, "F5", [F5]>, DwarfRegNum<[37]>; - def D6_64 : AFPR64<6, "F6", [F6]>, DwarfRegNum<[38]>; - def D7_64 : AFPR64<7, "F7", [F7]>, DwarfRegNum<[39]>; - def D8_64 : AFPR64<8, "F8", [F8]>, DwarfRegNum<[40]>; - def D9_64 : AFPR64<9, "F9", [F9]>, DwarfRegNum<[41]>; - def D10_64 : AFPR64<10, "F10", [F10]>, DwarfRegNum<[42]>; - def D11_64 : AFPR64<11, "F11", [F11]>, DwarfRegNum<[43]>; - def D12_64 : AFPR64<12, "F12", [F12]>, DwarfRegNum<[44]>; - def D13_64 : AFPR64<13, "F13", [F13]>, DwarfRegNum<[45]>; - def D14_64 : AFPR64<14, "F14", [F14]>, DwarfRegNum<[46]>; - def D15_64 : AFPR64<15, "F15", [F15]>, DwarfRegNum<[47]>; - def D16_64 : AFPR64<16, "F16", [F16]>, DwarfRegNum<[48]>; - def D17_64 : AFPR64<17, "F17", [F17]>, DwarfRegNum<[49]>; - def D18_64 : AFPR64<18, "F18", [F18]>, DwarfRegNum<[50]>; - def D19_64 : AFPR64<19, "F19", [F19]>, DwarfRegNum<[51]>; - def D20_64 : AFPR64<20, "F20", [F20]>, DwarfRegNum<[52]>; - def D21_64 : AFPR64<21, "F21", [F21]>, DwarfRegNum<[53]>; - def D22_64 : AFPR64<22, "F22", [F22]>, DwarfRegNum<[54]>; - def D23_64 : AFPR64<23, "F23", [F23]>, DwarfRegNum<[55]>; - def D24_64 : AFPR64<24, "F24", [F24]>, DwarfRegNum<[56]>; - def D25_64 : AFPR64<25, "F25", [F25]>, DwarfRegNum<[57]>; - def D26_64 : AFPR64<26, "F26", [F26]>, DwarfRegNum<[58]>; - def D27_64 : AFPR64<27, "F27", [F27]>, DwarfRegNum<[59]>; - def D28_64 : AFPR64<28, "F28", [F28]>, DwarfRegNum<[60]>; - def D29_64 : AFPR64<29, "F29", [F29]>, DwarfRegNum<[61]>; - def D30_64 : AFPR64<30, "F30", [F30]>, DwarfRegNum<[62]>; - def D31_64 : AFPR64<31, "F31", [F31]>, DwarfRegNum<[63]>; + def D0_64 : AFPR64<0, "f0", [F0]>, DwarfRegNum<[32]>; + def D1_64 : AFPR64<1, "f1", [F1]>, DwarfRegNum<[33]>; + def D2_64 : AFPR64<2, "f2", [F2]>, DwarfRegNum<[34]>; + def D3_64 : AFPR64<3, "f3", [F3]>, DwarfRegNum<[35]>; + def D4_64 : AFPR64<4, "f4", [F4]>, DwarfRegNum<[36]>; + def D5_64 : AFPR64<5, "f5", [F5]>, DwarfRegNum<[37]>; + def D6_64 : AFPR64<6, "f6", [F6]>, DwarfRegNum<[38]>; + def D7_64 : AFPR64<7, "f7", [F7]>, DwarfRegNum<[39]>; + def D8_64 : AFPR64<8, "f8", [F8]>, DwarfRegNum<[40]>; + def D9_64 : AFPR64<9, "f9", [F9]>, DwarfRegNum<[41]>; + def D10_64 : AFPR64<10, "f10", [F10]>, DwarfRegNum<[42]>; + def D11_64 : AFPR64<11, "f11", [F11]>, DwarfRegNum<[43]>; + def D12_64 : AFPR64<12, "f12", [F12]>, DwarfRegNum<[44]>; + def D13_64 : AFPR64<13, "f13", [F13]>, DwarfRegNum<[45]>; + def D14_64 : AFPR64<14, "f14", [F14]>, DwarfRegNum<[46]>; + def D15_64 : AFPR64<15, "f15", [F15]>, DwarfRegNum<[47]>; + def D16_64 : AFPR64<16, "f16", [F16]>, DwarfRegNum<[48]>; + def D17_64 : AFPR64<17, "f17", [F17]>, DwarfRegNum<[49]>; + def D18_64 : AFPR64<18, "f18", [F18]>, DwarfRegNum<[50]>; + def D19_64 : AFPR64<19, "f19", [F19]>, DwarfRegNum<[51]>; + def D20_64 : AFPR64<20, "f20", [F20]>, DwarfRegNum<[52]>; + def D21_64 : AFPR64<21, "f21", [F21]>, DwarfRegNum<[53]>; + def D22_64 : AFPR64<22, "f22", [F22]>, DwarfRegNum<[54]>; + def D23_64 : AFPR64<23, "f23", [F23]>, DwarfRegNum<[55]>; + def D24_64 : AFPR64<24, "f24", [F24]>, DwarfRegNum<[56]>; + def D25_64 : AFPR64<25, "f25", [F25]>, DwarfRegNum<[57]>; + def D26_64 : AFPR64<26, "f26", [F26]>, DwarfRegNum<[58]>; + def D27_64 : AFPR64<27, "f27", [F27]>, DwarfRegNum<[59]>; + def D28_64 : AFPR64<28, "f28", 
[F28]>, DwarfRegNum<[60]>; + def D29_64 : AFPR64<29, "f29", [F29]>, DwarfRegNum<[61]>; + def D30_64 : AFPR64<30, "f30", [F30]>, DwarfRegNum<[62]>; + def D31_64 : AFPR64<31, "f31", [F31]>, DwarfRegNum<[63]>; // Hi/Lo registers def HI : Register<"hi">, DwarfRegNum<[64]>; @@ -236,6 +236,9 @@ let Namespace = "Mips" in { // Status flags register def FCR31 : Register<"31">; + // fcc0 register + def FCC0 : Register<"fcc0">; + // Hardware register $29 def HWR29 : Register<"29">; def HWR29_64 : Register<"29">; @@ -246,26 +249,41 @@ let Namespace = "Mips" in { //===----------------------------------------------------------------------===// def CPURegs : RegisterClass<"Mips", [i32], 32, (add + // Reserved + ZERO, AT, // Return Values and Arguments V0, V1, A0, A1, A2, A3, // Not preserved across procedure calls - T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, + T0, T1, T2, T3, T4, T5, T6, T7, // Callee save S0, S1, S2, S3, S4, S5, S6, S7, + // Not preserved across procedure calls + T8, T9, // Reserved - ZERO, AT, K0, K1, GP, SP, FP, RA)>; + K0, K1, GP, SP, FP, RA)>; def CPU64Regs : RegisterClass<"Mips", [i64], 64, (add +// Reserved + ZERO_64, AT_64, // Return Values and Arguments V0_64, V1_64, A0_64, A1_64, A2_64, A3_64, // Not preserved across procedure calls - T0_64, T1_64, T2_64, T3_64, T4_64, T5_64, T6_64, T7_64, T8_64, T9_64, + T0_64, T1_64, T2_64, T3_64, T4_64, T5_64, T6_64, T7_64, // Callee save S0_64, S1_64, S2_64, S3_64, S4_64, S5_64, S6_64, S7_64, + // Not preserved across procedure calls + T8_64, T9_64, // Reserved - ZERO_64, AT_64, K0_64, K1_64, GP_64, SP_64, FP_64, RA_64)> { - let SubRegClasses = [(CPURegs sub_32)]; -} + K0_64, K1_64, GP_64, SP_64, FP_64, RA_64)>; + +def CPU16Regs : RegisterClass<"Mips", [i32], 32, (add + // Return Values and Arguments + V0, V1, A0, A1, A2, A3, + // Callee save + S0, S1)>; + +def CPURAReg : RegisterClass<"Mips", [i32], 32, (add RA)>; + // 64bit fp: // * FGR64 - 32 64-bit registers @@ -278,26 +296,24 @@ def FGR32 : RegisterClass<"Mips", [f32], 32, (sequence "F%u", 0, 31)>; def AFGR64 : RegisterClass<"Mips", [f64], 64, (add // Return Values and Arguments - D0, D1, D6, D7, + D0, D1, + // Not preserved across procedure calls + D2, D3, D4, D5, + // Return Values and Arguments + D6, D7, // Not preserved across procedure calls - D2, D3, D4, D5, D8, D9, + D8, D9, // Callee save - D10, D11, D12, D13, D14, D15)> { - let SubRegClasses = [(FGR32 sub_fpeven, sub_fpodd)]; -} + D10, D11, D12, D13, D14, D15)>; -def FGR64 : RegisterClass<"Mips", [f64], 64, (sequence "D%u_64", 0, 31)> { - let SubRegClasses = [(FGR32 sub_32)]; -} +def FGR64 : RegisterClass<"Mips", [f64], 64, (sequence "D%u_64", 0, 31)>; // Condition Register for floating point operations -def CCR : RegisterClass<"Mips", [i32], 32, (add FCR31)>; +def CCR : RegisterClass<"Mips", [i32], 32, (add FCR31,FCC0)>; // Hi/Lo Registers def HILO : RegisterClass<"Mips", [i32], 32, (add HI, LO)>; -def HILO64 : RegisterClass<"Mips", [i64], 64, (add HI64, LO64)> { - let SubRegClasses = [(HILO sub_32)]; -} +def HILO64 : RegisterClass<"Mips", [i64], 64, (add HI64, LO64)>; // Hardware registers def HWRegs : RegisterClass<"Mips", [i32], 32, (add HWR29)>; diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp index 00347df..11ff809 100644 --- a/lib/Target/Mips/MipsSubtarget.cpp +++ b/lib/Target/Mips/MipsSubtarget.cpp @@ -30,7 +30,7 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU, MipsArchVersion(Mips32), MipsABI(UnknownABI), IsLittle(little), IsSingleFloat(false), 
IsFP64bit(false), IsGP64bit(false), HasVFPU(false), IsLinux(true), HasSEInReg(false), HasCondMov(false), HasMulDivAdd(false), - HasMinMax(false), HasSwap(false), HasBitCount(false) + HasMinMax(false), HasSwap(false), HasBitCount(false), InMips16Mode(false) { std::string CPUName = CPU; if (CPUName.empty()) @@ -58,9 +58,9 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU, bool MipsSubtarget::enablePostRAScheduler(CodeGenOpt::Level OptLevel, - TargetSubtargetInfo::AntiDepBreakMode& Mode, - RegClassVector& CriticalPathRCs) const { - Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL; + TargetSubtargetInfo::AntiDepBreakMode &Mode, + RegClassVector &CriticalPathRCs) const { + Mode = TargetSubtargetInfo::ANTIDEP_NONE; CriticalPathRCs.clear(); CriticalPathRCs.push_back(hasMips64() ? &Mips::CPU64RegsRegClass : &Mips::CPURegsRegClass); diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h index 7faf77b..3215c44 100644 --- a/lib/Target/Mips/MipsSubtarget.h +++ b/lib/Target/Mips/MipsSubtarget.h @@ -86,6 +86,9 @@ protected: // HasBitCount - Count leading '1' and '0' bits. bool HasBitCount; + // InMips16 -- can process Mips16 instructions + bool InMips16Mode; + InstrItineraryData InstrItins; public: @@ -124,8 +127,11 @@ public: bool isSingleFloat() const { return IsSingleFloat; } bool isNotSingleFloat() const { return !IsSingleFloat; } bool hasVFPU() const { return HasVFPU; } + bool inMips16Mode() const { return InMips16Mode; } bool isLinux() const { return IsLinux; } + bool hasStandardEncoding() const { return !inMips16Mode(); } + /// Features related to the presence of specific instructions. bool hasSEInReg() const { return HasSEInReg; } bool hasCondMov() const { return HasCondMov; } diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp index ad02231..dd5d35f 100644 --- a/lib/Target/Mips/MipsTargetMachine.cpp +++ b/lib/Target/Mips/MipsTargetMachine.cpp @@ -105,8 +105,6 @@ public: } virtual bool addInstSelector(); - virtual bool addPreRegAlloc(); - virtual bool addPreSched2(); virtual bool addPreEmitPass(); }; } // namespace @@ -117,31 +115,22 @@ TargetPassConfig *MipsTargetMachine::createPassConfig(PassManagerBase &PM) { // Install an instruction selector pass using // the ISelDag to gen Mips code. -bool MipsPassConfig::addInstSelector() -{ - PM.add(createMipsISelDag(getMipsTargetMachine())); +bool MipsPassConfig::addInstSelector() { + addPass(createMipsISelDag(getMipsTargetMachine())); return false; } // Implemented by targets that want to run passes immediately before // machine code is emitted. return true if -print-machineinstrs should // print out the code after the passes. -bool MipsPassConfig::addPreEmitPass() -{ - PM.add(createMipsDelaySlotFillerPass(getMipsTargetMachine())); - return true; -} +bool MipsPassConfig::addPreEmitPass() { + MipsTargetMachine &TM = getMipsTargetMachine(); + addPass(createMipsDelaySlotFillerPass(TM)); -bool MipsPassConfig::addPreRegAlloc() { - // Do not restore $gp if target is Mips64. - // In N32/64, $gp is a callee-saved register. - if (!getMipsSubtarget().hasMips64()) - PM.add(createMipsEmitGPRestorePass(getMipsTargetMachine())); - return true; -} + // NOTE: long branch has not been implemented for mips16. 
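+  // hasStandardEncoding() is defined as !inMips16Mode() (see MipsSubtarget.h
+  // above), so the long branch pass is only added for functions compiled
+  // with the classic MIPS32/MIPS64 encodings.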
+ if (TM.getSubtarget<MipsSubtarget>().hasStandardEncoding()) + addPass(createMipsLongBranchPass(TM)); -bool MipsPassConfig::addPreSched2() { - PM.add(createMipsExpandPseudoPass(getMipsTargetMachine())); return true; } diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h index 80c00e8..5cbf057 100644 --- a/lib/Target/Mips/MipsTargetMachine.h +++ b/lib/Target/Mips/MipsTargetMachine.h @@ -69,9 +69,7 @@ namespace llvm { // Pass Pipeline Configuration virtual TargetPassConfig *createPassConfig(PassManagerBase &PM); - virtual bool addCodeEmitter(PassManagerBase &PM, - JITCodeEmitter &JCE); - + virtual bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE); }; /// MipsebTargetMachine - Mips32 big endian target machine. diff --git a/lib/Target/NVPTX/CMakeLists.txt b/lib/Target/NVPTX/CMakeLists.txt new file mode 100644 index 0000000..7cb16b4 --- /dev/null +++ b/lib/Target/NVPTX/CMakeLists.txt @@ -0,0 +1,34 @@ +set(LLVM_TARGET_DEFINITIONS NVPTX.td) + + +tablegen(LLVM NVPTXGenRegisterInfo.inc -gen-register-info) +tablegen(LLVM NVPTXGenInstrInfo.inc -gen-instr-info) +tablegen(LLVM NVPTXGenAsmWriter.inc -gen-asm-writer) +tablegen(LLVM NVPTXGenDAGISel.inc -gen-dag-isel) +tablegen(LLVM NVPTXGenSubtargetInfo.inc -gen-subtarget) +add_public_tablegen_target(NVPTXCommonTableGen) + +set(NVPTXCodeGen_sources + NVPTXFrameLowering.cpp + NVPTXInstrInfo.cpp + NVPTXISelDAGToDAG.cpp + NVPTXISelLowering.cpp + NVPTXRegisterInfo.cpp + NVPTXSubtarget.cpp + NVPTXTargetMachine.cpp + NVPTXSplitBBatBar.cpp + NVPTXLowerAggrCopies.cpp + NVPTXutil.cpp + NVPTXAllocaHoisting.cpp + NVPTXAsmPrinter.cpp + NVPTXUtilities.cpp + VectorElementize.cpp + ) + +add_llvm_target(NVPTXCodeGen ${NVPTXCodeGen_sources}) + +add_dependencies(LLVMNVPTXCodeGen intrinsics_gen) + +add_subdirectory(TargetInfo) +add_subdirectory(InstPrinter) +add_subdirectory(MCTargetDesc) diff --git a/lib/Target/NVPTX/InstPrinter/CMakeLists.txt b/lib/Target/NVPTX/InstPrinter/CMakeLists.txt new file mode 100644 index 0000000..ae4c751 --- /dev/null +++ b/lib/Target/NVPTX/InstPrinter/CMakeLists.txt @@ -0,0 +1,7 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +add_llvm_library(LLVMNVPTXAsmPrinter + NVPTXInstPrinter.cpp + ) + +add_dependencies(LLVMNVPTXAsmPrinter NVPTXCommonTableGen) diff --git a/lib/Target/PTX/InstPrinter/LLVMBuild.txt b/lib/Target/NVPTX/InstPrinter/LLVMBuild.txt index af5d200..032b573 100644 --- a/lib/Target/PTX/InstPrinter/LLVMBuild.txt +++ b/lib/Target/NVPTX/InstPrinter/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/PTX/InstPrinter/LLVMBuild.txt ---------------*- Conf -*--===; +;===- ./lib/Target/NVPTX/InstPrinter/LLVMBuild.txt -------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; @@ -17,7 +17,7 @@ [component_0] type = Library -name = PTXAsmPrinter -parent = PTX +name = NVPTXAsmPrinter +parent = NVPTX required_libraries = MC Support -add_to_library_groups = PTX +add_to_library_groups = NVPTX diff --git a/lib/Target/PTX/InstPrinter/Makefile b/lib/Target/NVPTX/InstPrinter/Makefile index 0ccfe44..7b78654 100644 --- a/lib/Target/PTX/InstPrinter/Makefile +++ b/lib/Target/NVPTX/InstPrinter/Makefile @@ -1,4 +1,4 @@ -##===- lib/Target/PTX/AsmPrinter/Makefile ------------------*- Makefile -*-===## +##===- lib/Target/NVPTX/AsmPrinter/Makefile ----------------*- Makefile -*-===## # # The LLVM Compiler Infrastructure # @@ -7,10 +7,9 @@ # ##===----------------------------------------------------------------------===## LEVEL = ../../../.. 
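# (LEVEL climbs four directories, from lib/Target/NVPTX/InstPrinter back to
# the LLVM top level, so that Makefile.common below can be located.)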
-LIBRARYNAME = LLVMPTXAsmPrinter +LIBRARYNAME = LLVMNVPTXAsmPrinter # Hack: we need to include 'main' ptx target directory to grab private headers CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. include $(LEVEL)/Makefile.common - diff --git a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp new file mode 100644 index 0000000..10051c7 --- /dev/null +++ b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp @@ -0,0 +1 @@ +// Placeholder diff --git a/lib/Target/PTX/LLVMBuild.txt b/lib/Target/NVPTX/LLVMBuild.txt index 15a1eb5..e2d6ed2 100644 --- a/lib/Target/PTX/LLVMBuild.txt +++ b/lib/Target/NVPTX/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/PTX/LLVMBuild.txt ---------------------------*- Conf -*--===; +;===- ./lib/Target/NVPTX/LLVMBuild.txt -------------------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; @@ -20,13 +20,13 @@ subdirectories = InstPrinter MCTargetDesc TargetInfo [component_0] type = TargetGroup -name = PTX +name = NVPTX parent = Target has_asmprinter = 1 [component_1] type = Library -name = PTXCodeGen -parent = PTX -required_libraries = Analysis AsmPrinter CodeGen Core MC PTXDesc PTXInfo SelectionDAG Support Target TransformUtils -add_to_library_groups = PTX +name = NVPTXCodeGen +parent = NVPTX +required_libraries = Analysis AsmPrinter CodeGen Core MC NVPTXDesc NVPTXInfo SelectionDAG Support Target TransformUtils +add_to_library_groups = NVPTX diff --git a/lib/Target/NVPTX/MCTargetDesc/CMakeLists.txt b/lib/Target/NVPTX/MCTargetDesc/CMakeLists.txt new file mode 100644 index 0000000..a030d9f --- /dev/null +++ b/lib/Target/NVPTX/MCTargetDesc/CMakeLists.txt @@ -0,0 +1,9 @@ +add_llvm_library(LLVMNVPTXDesc + NVPTXMCAsmInfo.cpp + NVPTXMCTargetDesc.cpp + ) + +add_dependencies(LLVMNVPTXDesc NVPTXCommonTableGen) + +# Hack: we need to include 'main' target directory to grab private headers +#include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..) diff --git a/lib/Target/PTX/MCTargetDesc/LLVMBuild.txt b/lib/Target/NVPTX/MCTargetDesc/LLVMBuild.txt index 19b80c5..01a051a 100644 --- a/lib/Target/PTX/MCTargetDesc/LLVMBuild.txt +++ b/lib/Target/NVPTX/MCTargetDesc/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/PTX/MCTargetDesc/LLVMBuild.txt --------------*- Conf -*--===; +;===- ./lib/Target/NVPTX/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; @@ -17,7 +17,7 @@ [component_0] type = Library -name = PTXDesc -parent = PTX -required_libraries = MC PTXAsmPrinter PTXInfo Support -add_to_library_groups = PTX +name = NVPTXDesc +parent = NVPTX +required_libraries = MC NVPTXAsmPrinter NVPTXInfo Support +add_to_library_groups = NVPTX diff --git a/lib/Target/PTX/MCTargetDesc/Makefile b/lib/Target/NVPTX/MCTargetDesc/Makefile index 35f5a7b..31d06cb 100644 --- a/lib/Target/PTX/MCTargetDesc/Makefile +++ b/lib/Target/NVPTX/MCTargetDesc/Makefile @@ -1,4 +1,4 @@ -##===- lib/Target/PTX/TargetDesc/Makefile ------------------*- Makefile -*-===## +##===- lib/Target/NVPTX/TargetDesc/Makefile ----------------*- Makefile -*-===## # # The LLVM Compiler Infrastructure # @@ -8,7 +8,7 @@ ##===----------------------------------------------------------------------===## LEVEL = ../../../.. -LIBRARYNAME = LLVMPTXDesc +LIBRARYNAME = LLVMNVPTXDesc # Hack: we need to include 'main' target directory to grab private headers CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
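# (Both parent directories are needed: tablegen-generated .inc headers land
# in the object tree, while the hand-written private headers stay in the
# source tree.)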
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h new file mode 100644 index 0000000..4545838 --- /dev/null +++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h @@ -0,0 +1,88 @@ +//===-- NVPTXBaseInfo.h - Top-level definitions for NVPTX -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains small standalone helper functions and enum definitions for +// the NVPTX target useful for the compiler back-end and the MC libraries. +// As such, it deliberately does not include references to LLVM core +// code gen types, passes, etc.. +// +//===----------------------------------------------------------------------===// + +#ifndef NVPTXBASEINFO_H +#define NVPTXBASEINFO_H + +namespace llvm { + +enum AddressSpace { + ADDRESS_SPACE_GENERIC = 0, + ADDRESS_SPACE_GLOBAL = 1, + ADDRESS_SPACE_CONST_NOT_GEN = 2, // Not part of generic space + ADDRESS_SPACE_SHARED = 3, + ADDRESS_SPACE_CONST = 4, + ADDRESS_SPACE_LOCAL = 5, + + // NVVM Internal + ADDRESS_SPACE_PARAM = 101 +}; + +enum PropertyAnnotation { + PROPERTY_MAXNTID_X = 0, + PROPERTY_MAXNTID_Y, + PROPERTY_MAXNTID_Z, + PROPERTY_REQNTID_X, + PROPERTY_REQNTID_Y, + PROPERTY_REQNTID_Z, + PROPERTY_MINNCTAPERSM, + PROPERTY_ISTEXTURE, + PROPERTY_ISSURFACE, + PROPERTY_ISSAMPLER, + PROPERTY_ISREADONLY_IMAGE_PARAM, + PROPERTY_ISWRITEONLY_IMAGE_PARAM, + PROPERTY_ISKERNEL_FUNCTION, + PROPERTY_ALIGN, + + // last property + PROPERTY_LAST +}; + +const unsigned AnnotationNameLen = 8; // length of each annotation name +const char +PropertyAnnotationNames[PROPERTY_LAST + 1][AnnotationNameLen + 1] = { + "maxntidx", // PROPERTY_MAXNTID_X + "maxntidy", // PROPERTY_MAXNTID_Y + "maxntidz", // PROPERTY_MAXNTID_Z + "reqntidx", // PROPERTY_REQNTID_X + "reqntidy", // PROPERTY_REQNTID_Y + "reqntidz", // PROPERTY_REQNTID_Z + "minctasm", // PROPERTY_MINNCTAPERSM + "texture", // PROPERTY_ISTEXTURE + "surface", // PROPERTY_ISSURFACE + "sampler", // PROPERTY_ISSAMPLER + "rdoimage", // PROPERTY_ISREADONLY_IMAGE_PARAM + "wroimage", // PROPERTY_ISWRITEONLY_IMAGE_PARAM + "kernel", // PROPERTY_ISKERNEL_FUNCTION + "align", // PROPERTY_ALIGN + + // last property + "proplast", // PROPERTY_LAST +}; + +// name of named metadata used for global annotations +#if defined(__GNUC__) +// As this is declared to be static but some of the .cpp files that +// include NVVM.h do not use this array, gcc gives a warning when +// compiling those .cpp files, hence __attribute__((unused)). +__attribute__((unused)) +#endif +static const char* NamedMDForAnnotations = "nvvm.annotations"; + +} + + +#endif diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp new file mode 100644 index 0000000..1d41665 --- /dev/null +++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp @@ -0,0 +1,63 @@ +//===-- NVPTXMCAsmInfo.cpp - NVPTX asm properties -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the NVPTXMCAsmInfo properties. 
+// +//===----------------------------------------------------------------------===// + +#include "NVPTXMCAsmInfo.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +bool CompileForDebugging; + +// -debug-compile - Command line option to inform opt and llc passes to +// compile for debugging +static cl::opt<bool, true> +Debug("debug-compile", cl::desc("Compile for debugging"), cl::Hidden, + cl::location(CompileForDebugging), + cl::init(false)); + +void NVPTXMCAsmInfo::anchor() { } + +NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Target &T, const StringRef &TT) { + Triple TheTriple(TT); + if (TheTriple.getArch() == Triple::nvptx64) + PointerSize = 8; + + CommentString = "//"; + + PrivateGlobalPrefix = "$L__"; + + AllowPeriodsInName = false; + + HasSetDirective = false; + + HasSingleParameterDotFile = false; + + InlineAsmStart = " inline asm"; + InlineAsmEnd = " inline asm"; + + SupportsDebugInformation = CompileForDebugging; + HasDotTypeDotSizeDirective = false; + + Data8bitsDirective = " .b8 "; + Data16bitsDirective = " .b16 "; + Data32bitsDirective = " .b32 "; + Data64bitsDirective = " .b64 "; + PrivateGlobalPrefix = ""; + ZeroDirective = " .b8"; + AsciiDirective = " .b8"; + AscizDirective = " .b8"; + + // @TODO: Can we just disable this? + GlobalDirective = "\t// .globl\t"; +} diff --git a/lib/Target/PTX/MCTargetDesc/PTXMCAsmInfo.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h index 32ca069..82097da 100644 --- a/lib/Target/PTX/MCTargetDesc/PTXMCAsmInfo.h +++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h @@ -1,4 +1,4 @@ -//===-- PTXMCAsmInfo.h - PTX asm properties --------------------*- C++ -*--===// +//===-- NVPTXMCAsmInfo.h - NVPTX asm properties ----------------*- C++ -*--===// // // The LLVM Compiler Infrastructure // @@ -7,24 +7,24 @@ // //===----------------------------------------------------------------------===// // -// This file contains the declaration of the PTXMCAsmInfo class. +// This file contains the declaration of the NVPTXMCAsmInfo class. // //===----------------------------------------------------------------------===// -#ifndef PTX_MCASM_INFO_H -#define PTX_MCASM_INFO_H +#ifndef NVPTX_MCASM_INFO_H +#define NVPTX_MCASM_INFO_H #include "llvm/MC/MCAsmInfo.h" namespace llvm { - class Target; - class StringRef; +class Target; +class StringRef; - class PTXMCAsmInfo : public MCAsmInfo { - virtual void anchor(); - public: - explicit PTXMCAsmInfo(const Target &T, const StringRef &TT); - }; +class NVPTXMCAsmInfo : public MCAsmInfo { + virtual void anchor(); +public: + explicit NVPTXMCAsmInfo(const Target &T, const StringRef &TT); +}; } // namespace llvm -#endif // PTX_MCASM_INFO_H +#endif // NVPTX_MCASM_INFO_H diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp new file mode 100644 index 0000000..44aa01c --- /dev/null +++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp @@ -0,0 +1,91 @@ +//===-- NVPTXMCTargetDesc.cpp - NVPTX Target Descriptions -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides NVPTX specific target descriptions. 
+// +//===----------------------------------------------------------------------===// + +#include "NVPTXMCTargetDesc.h" +#include "NVPTXMCAsmInfo.h" +#include "llvm/MC/MCCodeGenInfo.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/TargetRegistry.h" + +#define GET_INSTRINFO_MC_DESC +#include "NVPTXGenInstrInfo.inc" + +#define GET_SUBTARGETINFO_MC_DESC +#include "NVPTXGenSubtargetInfo.inc" + +#define GET_REGINFO_MC_DESC +#include "NVPTXGenRegisterInfo.inc" + + +using namespace llvm; + +static MCInstrInfo *createNVPTXMCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitNVPTXMCInstrInfo(X); + return X; +} + +static MCRegisterInfo *createNVPTXMCRegisterInfo(StringRef TT) { + MCRegisterInfo *X = new MCRegisterInfo(); + // PTX does not have a return address register. + InitNVPTXMCRegisterInfo(X, 0); + return X; +} + +static MCSubtargetInfo *createNVPTXMCSubtargetInfo(StringRef TT, StringRef CPU, + StringRef FS) { + MCSubtargetInfo *X = new MCSubtargetInfo(); + InitNVPTXMCSubtargetInfo(X, TT, CPU, FS); + return X; +} + +static MCCodeGenInfo *createNVPTXMCCodeGenInfo(StringRef TT, Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OL) { + MCCodeGenInfo *X = new MCCodeGenInfo(); + X->InitMCCodeGenInfo(RM, CM, OL); + return X; +} + + +// Force static initialization. +extern "C" void LLVMInitializeNVPTXTargetMC() { + // Register the MC asm info. + RegisterMCAsmInfo<NVPTXMCAsmInfo> X(TheNVPTXTarget32); + RegisterMCAsmInfo<NVPTXMCAsmInfo> Y(TheNVPTXTarget64); + + // Register the MC codegen info. + TargetRegistry::RegisterMCCodeGenInfo(TheNVPTXTarget32, + createNVPTXMCCodeGenInfo); + TargetRegistry::RegisterMCCodeGenInfo(TheNVPTXTarget64, + createNVPTXMCCodeGenInfo); + + // Register the MC instruction info. + TargetRegistry::RegisterMCInstrInfo(TheNVPTXTarget32, createNVPTXMCInstrInfo); + TargetRegistry::RegisterMCInstrInfo(TheNVPTXTarget64, createNVPTXMCInstrInfo); + + // Register the MC register info. + TargetRegistry::RegisterMCRegInfo(TheNVPTXTarget32, + createNVPTXMCRegisterInfo); + TargetRegistry::RegisterMCRegInfo(TheNVPTXTarget64, + createNVPTXMCRegisterInfo); + + // Register the MC subtarget info. + TargetRegistry::RegisterMCSubtargetInfo(TheNVPTXTarget32, + createNVPTXMCSubtargetInfo); + TargetRegistry::RegisterMCSubtargetInfo(TheNVPTXTarget64, + createNVPTXMCSubtargetInfo); + +} diff --git a/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h index 542638a..af95c76 100644 --- a/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.h +++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h @@ -1,4 +1,4 @@ -//===-- PTXMCTargetDesc.h - PTX Target Descriptions ------------*- C++ -*-===// +//===-- NVPTXMCTargetDesc.h - NVPTX Target Descriptions ---------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -7,30 +7,30 @@ // //===----------------------------------------------------------------------===// // -// This file provides PTX specific target descriptions. +// This file provides NVPTX specific target descriptions. // //===----------------------------------------------------------------------===// -#ifndef PTXMCTARGETDESC_H -#define PTXMCTARGETDESC_H +#ifndef NVPTXMCTARGETDESC_H +#define NVPTXMCTARGETDESC_H namespace llvm { class Target; -extern Target ThePTX32Target; -extern Target ThePTX64Target; +extern Target TheNVPTXTarget32; +extern Target TheNVPTXTarget64; } // End llvm namespace // Defines symbolic names for PTX registers. 
#define GET_REGINFO_ENUM -#include "PTXGenRegisterInfo.inc" +#include "NVPTXGenRegisterInfo.inc" // Defines symbolic names for the PTX instructions. #define GET_INSTRINFO_ENUM -#include "PTXGenInstrInfo.inc" +#include "NVPTXGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM -#include "PTXGenSubtargetInfo.inc" +#include "NVPTXGenSubtargetInfo.inc" #endif diff --git a/lib/Target/PTX/Makefile b/lib/Target/NVPTX/Makefile index fa09634..8db20eb 100644 --- a/lib/Target/PTX/Makefile +++ b/lib/Target/NVPTX/Makefile @@ -1,4 +1,4 @@ -##===- lib/Target/PTX/Makefile -----------------------------*- Makefile -*-===## +##===- lib/Target/NVPTX/Makefile ---------------------------*- Makefile -*-===## # # The LLVM Compiler Infrastructure # @@ -8,15 +8,15 @@ ##===----------------------------------------------------------------------===## LEVEL = ../../.. -LIBRARYNAME = LLVMPTXCodeGen -TARGET = PTX +LIBRARYNAME = LLVMNVPTXCodeGen +TARGET = NVPTX # Make sure that tblgen is run, first thing. -BUILT_SOURCES = PTXGenAsmWriter.inc \ - PTXGenDAGISel.inc \ - PTXGenInstrInfo.inc \ - PTXGenRegisterInfo.inc \ - PTXGenSubtargetInfo.inc +BUILT_SOURCES = NVPTXGenAsmWriter.inc \ + NVPTXGenDAGISel.inc \ + NVPTXGenInstrInfo.inc \ + NVPTXGenRegisterInfo.inc \ + NVPTXGenSubtargetInfo.inc DIRS = InstPrinter TargetInfo MCTargetDesc diff --git a/lib/Target/NVPTX/ManagedStringPool.h b/lib/Target/NVPTX/ManagedStringPool.h new file mode 100644 index 0000000..b568488 --- /dev/null +++ b/lib/Target/NVPTX/ManagedStringPool.h @@ -0,0 +1,49 @@ +//===-- ManagedStringPool.h - Managed String Pool ---------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// The strings allocated from a managed string pool are owned by the string +// pool and will be deleted together with the managed string pool. +// +//===----------------------------------------------------------------------===// + + +#ifndef LLVM_SUPPORT_MANAGED_STRING_H +#define LLVM_SUPPORT_MANAGED_STRING_H + +#include "llvm/ADT/SmallVector.h" +#include <string> + +namespace llvm { + +/// ManagedStringPool - The strings allocated from a managed string pool are +/// owned by the string pool and will be deleted together with the managed +/// string pool. +class ManagedStringPool { + SmallVector<std::string *, 8> Pool; + +public: + ManagedStringPool() {} + ~ManagedStringPool() { + SmallVector<std::string *, 8>::iterator Current = Pool.begin(); + while (Current != Pool.end()) { + delete *Current; + Current++; + } + } + + std::string *getManagedString(const char *S) { + std::string *Str = new std::string(S); + Pool.push_back(Str); + return Str; + } +}; + +} + +#endif diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h new file mode 100644 index 0000000..a8d082a --- /dev/null +++ b/lib/Target/NVPTX/NVPTX.h @@ -0,0 +1,137 @@ +//===-- NVPTX.h - Top-level interface for NVPTX representation --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in +// the LLVM NVPTX back-end. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_NVPTX_H +#define LLVM_TARGET_NVPTX_H + +#include "llvm/Value.h" +#include "llvm/Module.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetMachine.h" +#include "MCTargetDesc/NVPTXBaseInfo.h" +#include <cassert> +#include <iosfwd> + +namespace llvm { +class NVPTXTargetMachine; +class FunctionPass; +class formatted_raw_ostream; + +namespace NVPTXCC { +enum CondCodes { + EQ, + NE, + LT, + LE, + GT, + GE +}; +} + +inline static const char *NVPTXCondCodeToString(NVPTXCC::CondCodes CC) { + switch (CC) { + case NVPTXCC::NE: return "ne"; + case NVPTXCC::EQ: return "eq"; + case NVPTXCC::LT: return "lt"; + case NVPTXCC::LE: return "le"; + case NVPTXCC::GT: return "gt"; + case NVPTXCC::GE: return "ge"; + } + llvm_unreachable("Unknown condition code"); +} + +FunctionPass *createNVPTXISelDag(NVPTXTargetMachine &TM, + llvm::CodeGenOpt::Level OptLevel); +FunctionPass *createVectorElementizePass(NVPTXTargetMachine &); +FunctionPass *createLowerStructArgsPass(NVPTXTargetMachine &); +FunctionPass *createNVPTXReMatPass(NVPTXTargetMachine &); +FunctionPass *createNVPTXReMatBlockPass(NVPTXTargetMachine &); + +bool isImageOrSamplerVal(const Value *, const Module *); + +extern Target TheNVPTXTarget32; +extern Target TheNVPTXTarget64; + +namespace NVPTX +{ +enum DrvInterface { + NVCL, + CUDA, + TEST +}; + +// A field inside TSFlags needs a shift and a mask. The usage is +// always as follows : +// ((TSFlags & fieldMask) >> fieldShift) +// The enum keeps the mask, the shift, and all valid values of the +// field in one place. +enum VecInstType { + VecInstTypeShift = 0, + VecInstTypeMask = 0xF, + + VecNOP = 0, + VecLoad = 1, + VecStore = 2, + VecBuild = 3, + VecShuffle = 4, + VecExtract = 5, + VecInsert = 6, + VecDest = 7, + VecOther = 15 +}; + +enum SimpleMove { + SimpleMoveMask = 0x10, + SimpleMoveShift = 4 +}; +enum LoadStore { + isLoadMask = 0x20, + isLoadShift = 5, + isStoreMask = 0x40, + isStoreShift = 6 +}; + +namespace PTXLdStInstCode { +enum AddressSpace{ + GENERIC = 0, + GLOBAL = 1, + CONSTANT = 2, + SHARED = 3, + PARAM = 4, + LOCAL = 5 +}; +enum FromType { + Unsigned = 0, + Signed, + Float +}; +enum VecType { + Scalar = 1, + V2 = 2, + V4 = 4 +}; +} +} +} // end namespace llvm; + +// Defines symbolic names for NVPTX registers. This defines a mapping from +// register name to register number. +#define GET_REGINFO_ENUM +#include "NVPTXGenRegisterInfo.inc" + +// Defines symbolic names for the NVPTX instructions. +#define GET_INSTRINFO_ENUM +#include "NVPTXGenInstrInfo.inc" + +#endif diff --git a/lib/Target/NVPTX/NVPTX.td b/lib/Target/NVPTX/NVPTX.td new file mode 100644 index 0000000..ae7710e --- /dev/null +++ b/lib/Target/NVPTX/NVPTX.td @@ -0,0 +1,44 @@ +//===- NVPTX.td - Describe the NVPTX Target Machine -----------*- tblgen -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This is the top level entry point for the NVPTX target. 
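+// The single sm_10 processor defined below is what -mcpu selects once the
+// target is registered; assuming the nvptx/nvptx64 arch names wired up by
+// this patch's TargetInfo library, a typical invocation would be:
+//
+//   llc -march=nvptx64 -mcpu=sm_10 kernel.ll -o kernel.ptx
+//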
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+include "NVPTXRegisterInfo.td"
+include "NVPTXInstrInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Subtarget Features.
+// - We use the SM version number instead of an explicit feature table.
+// - We need at least one feature, to avoid TableGen generating a zero-sized
+//   array in NVPTXGenSubtarget.inc.
+//===----------------------------------------------------------------------===//
+def FeatureDummy : SubtargetFeature<"dummy", "dummy", "true", "">;
+
+//===----------------------------------------------------------------------===//
+// NVPTX supported processors.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+  : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"sm_10", [FeatureDummy]>;
+
+
+def NVPTXInstrInfo : InstrInfo {
+}
+
+def NVPTX : Target {
+  let InstructionSet = NVPTXInstrInfo;
+}
diff --git a/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp b/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
new file mode 100644
index 0000000..668c393
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
@@ -0,0 +1,48 @@
+//===-- AllocaHoisting.cpp - Hoist allocas to the entry block --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Hoist the alloca instructions in the non-entry blocks to the entry blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Constants.h"
+#include "NVPTXAllocaHoisting.h"
+
+namespace llvm {
+
+bool NVPTXAllocaHoisting::runOnFunction(Function &function) {
+  bool functionModified = false;
+  Function::iterator I = function.begin();
+  TerminatorInst *firstTerminatorInst = (I++)->getTerminator();
+
+  for (Function::iterator E = function.end(); I != E; ++I) {
+    for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
+      AllocaInst *allocaInst = dyn_cast<AllocaInst>(BI++);
+      if (allocaInst && isa<ConstantInt>(allocaInst->getArraySize())) {
+        allocaInst->moveBefore(firstTerminatorInst);
+        functionModified = true;
+      }
+    }
+  }
+
+  return functionModified;
+}
+
+char NVPTXAllocaHoisting::ID = 1;
+RegisterPass<NVPTXAllocaHoisting> X("alloca-hoisting",
+                                    "Hoisting alloca instructions in non-entry "
+                                    "blocks to the entry block");
+
+FunctionPass *createAllocaHoisting() {
+  return new NVPTXAllocaHoisting();
+}
+
+} // end namespace llvm
diff --git a/lib/Target/NVPTX/NVPTXAllocaHoisting.h b/lib/Target/NVPTX/NVPTXAllocaHoisting.h
new file mode 100644
index 0000000..24b3bd5
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXAllocaHoisting.h
@@ -0,0 +1,49 @@
+//===-- AllocaHoisting.h - Hoist allocas to the entry block -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
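+//
+// For example, a constant-sized "%buf = alloca [16 x i32]" sitting in a
+// loop body is moved to just before the entry block's terminator by
+// runOnFunction in NVPTXAllocaHoisting.cpp; variable-length allocas are
+// deliberately left in place.
+//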
+// +//===----------------------------------------------------------------------===// +// +// Hoist the alloca instructions in the non-entry blocks to the entry blocks. +// +//===----------------------------------------------------------------------===// + +#ifndef NVPTX_ALLOCA_HOISTING_H_ +#define NVPTX_ALLOCA_HOISTING_H_ + +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/Pass.h" +#include "llvm/Target/TargetData.h" + +namespace llvm { + +class FunctionPass; +class Function; + +// Hoisting the alloca instructions in the non-entry blocks to the entry +// block. +class NVPTXAllocaHoisting : public FunctionPass { +public: + static char ID; // Pass ID + NVPTXAllocaHoisting() : FunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetData>(); + AU.addPreserved<MachineFunctionAnalysis>(); + } + + virtual const char *getPassName() const { + return "NVPTX specific alloca hoisting"; + } + + virtual bool runOnFunction(Function &function); +}; + +extern FunctionPass *createAllocaHoisting(); + +} // end namespace llvm + +#endif // NVPTX_ALLOCA_HOISTING_H_ diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp new file mode 100644 index 0000000..f2b9616 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -0,0 +1,2064 @@ +//===-- NVPTXAsmPrinter.cpp - NVPTX LLVM assembly writer ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts from our internal representation +// of machine-dependent LLVM code to NVPTX assembly language. 
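+//
+// For orientation, the emitted text has the overall shape below (abridged;
+// the directives correspond to emitHeader and EmitFunctionEntryLabel in
+// this file, with "foo" a placeholder name):
+//
+//   .version 3.0
+//   .target sm_10
+//   .address_size 64
+//
+//   .entry foo ( .param .u64 foo_param_0 )
+//   {
+//     ...
+//   }
+//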
+// +//===----------------------------------------------------------------------===// + +#include "NVPTXAsmPrinter.h" +#include "NVPTX.h" +#include "NVPTXInstrInfo.h" +#include "NVPTXTargetMachine.h" +#include "NVPTXRegisterInfo.h" +#include "NVPTXUtilities.h" +#include "MCTargetDesc/NVPTXMCAsmInfo.h" +#include "NVPTXNumRegisters.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/DebugInfo.h" +#include "llvm/Function.h" +#include "llvm/GlobalVariable.h" +#include "llvm/Module.h" +#include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Target/Mangler.h" +#include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Support/TimeValue.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Support/Path.h" +#include "llvm/Assembly/Writer.h" +#include "cl_common_defines.h" +#include <sstream> +using namespace llvm; + + +#include "NVPTXGenAsmWriter.inc" + +bool RegAllocNilUsed = true; + +#define DEPOTNAME "__local_depot" + +static cl::opt<bool> +EmitLineNumbers("nvptx-emit-line-numbers", + cl::desc("NVPTX Specific: Emit Line numbers even without -G"), + cl::init(true)); + +namespace llvm { +bool InterleaveSrcInPtx = false; +} + +static cl::opt<bool, true>InterleaveSrc("nvptx-emit-src", + cl::ZeroOrMore, + cl::desc("NVPTX Specific: Emit source line in ptx file"), + cl::location(llvm::InterleaveSrcInPtx)); + + + + +// @TODO: This is a copy from AsmPrinter.cpp. The function is static, so we +// cannot just link to the existing version. +/// LowerConstant - Lower the specified LLVM Constant to an MCExpr. +/// +using namespace nvptx; +const MCExpr *nvptx::LowerConstant(const Constant *CV, AsmPrinter &AP) { + MCContext &Ctx = AP.OutContext; + + if (CV->isNullValue() || isa<UndefValue>(CV)) + return MCConstantExpr::Create(0, Ctx); + + if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) + return MCConstantExpr::Create(CI->getZExtValue(), Ctx); + + if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV)) + return MCSymbolRefExpr::Create(AP.Mang->getSymbol(GV), Ctx); + + if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV)) + return MCSymbolRefExpr::Create(AP.GetBlockAddressSymbol(BA), Ctx); + + const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV); + if (CE == 0) + llvm_unreachable("Unknown constant value to lower!"); + + + switch (CE->getOpcode()) { + default: + // If the code isn't optimized, there may be outstanding folding + // opportunities. Attempt to fold the expression using TargetData as a + // last resort before giving up. + if (Constant *C = + ConstantFoldConstantExpression(CE, AP.TM.getTargetData())) + if (C != CE) + return LowerConstant(C, AP); + + // Otherwise report the problem to the user. + { + std::string S; + raw_string_ostream OS(S); + OS << "Unsupported expression in static initializer: "; + WriteAsOperand(OS, CE, /*PrintType=*/false, + !AP.MF ? 
0 : AP.MF->getFunction()->getParent()); + report_fatal_error(OS.str()); + } + case Instruction::GetElementPtr: { + const TargetData &TD = *AP.TM.getTargetData(); + // Generate a symbolic expression for the byte address + const Constant *PtrVal = CE->getOperand(0); + SmallVector<Value*, 8> IdxVec(CE->op_begin()+1, CE->op_end()); + int64_t Offset = TD.getIndexedOffset(PtrVal->getType(), IdxVec); + + const MCExpr *Base = LowerConstant(CE->getOperand(0), AP); + if (Offset == 0) + return Base; + + // Truncate/sext the offset to the pointer size. + if (TD.getPointerSizeInBits() != 64) { + int SExtAmount = 64-TD.getPointerSizeInBits(); + Offset = (Offset << SExtAmount) >> SExtAmount; + } + + return MCBinaryExpr::CreateAdd(Base, MCConstantExpr::Create(Offset, Ctx), + Ctx); + } + + case Instruction::Trunc: + // We emit the value and depend on the assembler to truncate the generated + // expression properly. This is important for differences between + // blockaddress labels. Since the two labels are in the same function, it + // is reasonable to treat their delta as a 32-bit value. + // FALL THROUGH. + case Instruction::BitCast: + return LowerConstant(CE->getOperand(0), AP); + + case Instruction::IntToPtr: { + const TargetData &TD = *AP.TM.getTargetData(); + // Handle casts to pointers by changing them into casts to the appropriate + // integer type. This promotes constant folding and simplifies this code. + Constant *Op = CE->getOperand(0); + Op = ConstantExpr::getIntegerCast(Op, TD.getIntPtrType(CV->getContext()), + false/*ZExt*/); + return LowerConstant(Op, AP); + } + + case Instruction::PtrToInt: { + const TargetData &TD = *AP.TM.getTargetData(); + // Support only foldable casts to/from pointers that can be eliminated by + // changing the pointer to the appropriately sized integer type. + Constant *Op = CE->getOperand(0); + Type *Ty = CE->getType(); + + const MCExpr *OpExpr = LowerConstant(Op, AP); + + // We can emit the pointer value into this slot if the slot is an + // integer slot equal to the size of the pointer. + if (TD.getTypeAllocSize(Ty) == TD.getTypeAllocSize(Op->getType())) + return OpExpr; + + // Otherwise the pointer is smaller than the resultant integer, mask off + // the high bits so we are sure to get a proper truncation if the input is + // a constant expr. + unsigned InBits = TD.getTypeAllocSizeInBits(Op->getType()); + const MCExpr *MaskExpr = MCConstantExpr::Create(~0ULL >> (64-InBits), Ctx); + return MCBinaryExpr::CreateAnd(OpExpr, MaskExpr, Ctx); + } + + // The MC library also has a right-shift operator, but it isn't consistently + // signed or unsigned between different targets. 
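+  // For example, an initializer written as
+  //   i64 add (i64 ptrtoint (i32* @gbl to i64), i64 4)
+  // reaches the Add case below and becomes MCBinaryExpr::CreateAdd of a
+  // symbol reference and a constant, which prints as "gbl+4".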
+ case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::Shl: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + const MCExpr *LHS = LowerConstant(CE->getOperand(0), AP); + const MCExpr *RHS = LowerConstant(CE->getOperand(1), AP); + switch (CE->getOpcode()) { + default: llvm_unreachable("Unknown binary operator constant cast expr"); + case Instruction::Add: return MCBinaryExpr::CreateAdd(LHS, RHS, Ctx); + case Instruction::Sub: return MCBinaryExpr::CreateSub(LHS, RHS, Ctx); + case Instruction::Mul: return MCBinaryExpr::CreateMul(LHS, RHS, Ctx); + case Instruction::SDiv: return MCBinaryExpr::CreateDiv(LHS, RHS, Ctx); + case Instruction::SRem: return MCBinaryExpr::CreateMod(LHS, RHS, Ctx); + case Instruction::Shl: return MCBinaryExpr::CreateShl(LHS, RHS, Ctx); + case Instruction::And: return MCBinaryExpr::CreateAnd(LHS, RHS, Ctx); + case Instruction::Or: return MCBinaryExpr::CreateOr (LHS, RHS, Ctx); + case Instruction::Xor: return MCBinaryExpr::CreateXor(LHS, RHS, Ctx); + } + } + } +} + + +void NVPTXAsmPrinter::emitLineNumberAsDotLoc(const MachineInstr &MI) +{ + if (!EmitLineNumbers) + return; + if (ignoreLoc(MI)) + return; + + DebugLoc curLoc = MI.getDebugLoc(); + + if (prevDebugLoc.isUnknown() && curLoc.isUnknown()) + return; + + if (prevDebugLoc == curLoc) + return; + + prevDebugLoc = curLoc; + + if (curLoc.isUnknown()) + return; + + + const MachineFunction *MF = MI.getParent()->getParent(); + //const TargetMachine &TM = MF->getTarget(); + + const LLVMContext &ctx = MF->getFunction()->getContext(); + DIScope Scope(curLoc.getScope(ctx)); + + if (!Scope.Verify()) + return; + + StringRef fileName(Scope.getFilename()); + StringRef dirName(Scope.getDirectory()); + SmallString<128> FullPathName = dirName; + if (!dirName.empty() && !sys::path::is_absolute(fileName)) { + sys::path::append(FullPathName, fileName); + fileName = FullPathName.str(); + } + + if (filenameMap.find(fileName.str()) == filenameMap.end()) + return; + + + // Emit the line from the source file. 
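+  // Two things may be emitted here: the original source line interleaved
+  // as a comment (under -nvptx-emit-src), then the .loc directive built
+  // below, e.g. ".loc 2 42 7" for file index 2, line 42, column 7.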
+ if (llvm::InterleaveSrcInPtx) + this->emitSrcInText(fileName.str(), curLoc.getLine()); + + std::stringstream temp; + temp << "\t.loc " << filenameMap[fileName.str()] + << " " << curLoc.getLine() << " " << curLoc.getCol(); + OutStreamer.EmitRawText(Twine(temp.str().c_str())); +} + +void NVPTXAsmPrinter::EmitInstruction(const MachineInstr *MI) { + SmallString<128> Str; + raw_svector_ostream OS(Str); + if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA) + emitLineNumberAsDotLoc(*MI); + printInstruction(MI, OS); + OutStreamer.EmitRawText(OS.str()); +} + +void NVPTXAsmPrinter::printReturnValStr(const Function *F, + raw_ostream &O) +{ + const TargetData *TD = TM.getTargetData(); + const TargetLowering *TLI = TM.getTargetLowering(); + + Type *Ty = F->getReturnType(); + + bool isABI = (nvptxSubtarget.getSmVersion() >= 20); + + if (Ty->getTypeID() == Type::VoidTyID) + return; + + O << " ("; + + if (isABI) { + if (Ty->isPrimitiveType() || Ty->isIntegerTy()) { + unsigned size = 0; + if (const IntegerType *ITy = dyn_cast<IntegerType>(Ty)) { + size = ITy->getBitWidth(); + if (size < 32) size = 32; + } else { + assert(Ty->isFloatingPointTy() && + "Floating point type expected here"); + size = Ty->getPrimitiveSizeInBits(); + } + + O << ".param .b" << size << " func_retval0"; + } + else if (isa<PointerType>(Ty)) { + O << ".param .b" << TLI->getPointerTy().getSizeInBits() + << " func_retval0"; + } else { + if ((Ty->getTypeID() == Type::StructTyID) || + isa<VectorType>(Ty)) { + SmallVector<EVT, 16> vtparts; + ComputeValueVTs(*TLI, Ty, vtparts); + unsigned totalsz = 0; + for (unsigned i=0,e=vtparts.size(); i!=e; ++i) { + unsigned elems = 1; + EVT elemtype = vtparts[i]; + if (vtparts[i].isVector()) { + elems = vtparts[i].getVectorNumElements(); + elemtype = vtparts[i].getVectorElementType(); + } + for (unsigned j=0, je=elems; j!=je; ++j) { + unsigned sz = elemtype.getSizeInBits(); + if (elemtype.isInteger() && (sz < 8)) sz = 8; + totalsz += sz/8; + } + } + unsigned retAlignment = 0; + if (!llvm::getAlign(*F, 0, retAlignment)) + retAlignment = TD->getABITypeAlignment(Ty); + O << ".param .align " + << retAlignment + << " .b8 func_retval0[" + << totalsz << "]"; + } else + assert(false && + "Unknown return type"); + } + } else { + SmallVector<EVT, 16> vtparts; + ComputeValueVTs(*TLI, Ty, vtparts); + unsigned idx = 0; + for (unsigned i=0,e=vtparts.size(); i!=e; ++i) { + unsigned elems = 1; + EVT elemtype = vtparts[i]; + if (vtparts[i].isVector()) { + elems = vtparts[i].getVectorNumElements(); + elemtype = vtparts[i].getVectorElementType(); + } + + for (unsigned j=0, je=elems; j!=je; ++j) { + unsigned sz = elemtype.getSizeInBits(); + if (elemtype.isInteger() && (sz < 32)) sz = 32; + O << ".reg .b" << sz << " func_retval" << idx; + if (j<je-1) O << ", "; + ++idx; + } + if (i < e-1) + O << ", "; + } + } + O << ") "; + return; +} + +void NVPTXAsmPrinter::printReturnValStr(const MachineFunction &MF, + raw_ostream &O) { + const Function *F = MF.getFunction(); + printReturnValStr(F, O); +} + +void NVPTXAsmPrinter::EmitFunctionEntryLabel() { + SmallString<128> Str; + raw_svector_ostream O(Str); + + // Set up + MRI = &MF->getRegInfo(); + F = MF->getFunction(); + emitLinkageDirective(F,O); + if (llvm::isKernelFunction(*F)) + O << ".entry "; + else { + O << ".func "; + printReturnValStr(*MF, O); + } + + O << *CurrentFnSym; + + emitFunctionParamList(*MF, O); + + if (llvm::isKernelFunction(*F)) + emitKernelFunctionDirectives(*F, O); + + OutStreamer.EmitRawText(O.str()); + + prevDebugLoc = DebugLoc(); +} + +void 
NVPTXAsmPrinter::EmitFunctionBodyStart() { + const TargetRegisterInfo &TRI = *TM.getRegisterInfo(); + unsigned numRegClasses = TRI.getNumRegClasses(); + VRidGlobal2LocalMap = new std::map<unsigned, unsigned>[numRegClasses+1]; + OutStreamer.EmitRawText(StringRef("{\n")); + setAndEmitFunctionVirtualRegisters(*MF); + + SmallString<128> Str; + raw_svector_ostream O(Str); + emitDemotedVars(MF->getFunction(), O); + OutStreamer.EmitRawText(O.str()); +} + +void NVPTXAsmPrinter::EmitFunctionBodyEnd() { + OutStreamer.EmitRawText(StringRef("}\n")); + delete []VRidGlobal2LocalMap; +} + + +void +NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function& F, + raw_ostream &O) const { + // If the NVVM IR has some of reqntid* specified, then output + // the reqntid directive, and set the unspecified ones to 1. + // If none of reqntid* is specified, don't output reqntid directive. + unsigned reqntidx, reqntidy, reqntidz; + bool specified = false; + if (llvm::getReqNTIDx(F, reqntidx) == false) reqntidx = 1; + else specified = true; + if (llvm::getReqNTIDy(F, reqntidy) == false) reqntidy = 1; + else specified = true; + if (llvm::getReqNTIDz(F, reqntidz) == false) reqntidz = 1; + else specified = true; + + if (specified) + O << ".reqntid " << reqntidx << ", " + << reqntidy << ", " << reqntidz << "\n"; + + // If the NVVM IR has some of maxntid* specified, then output + // the maxntid directive, and set the unspecified ones to 1. + // If none of maxntid* is specified, don't output maxntid directive. + unsigned maxntidx, maxntidy, maxntidz; + specified = false; + if (llvm::getMaxNTIDx(F, maxntidx) == false) maxntidx = 1; + else specified = true; + if (llvm::getMaxNTIDy(F, maxntidy) == false) maxntidy = 1; + else specified = true; + if (llvm::getMaxNTIDz(F, maxntidz) == false) maxntidz = 1; + else specified = true; + + if (specified) + O << ".maxntid " << maxntidx << ", " + << maxntidy << ", " << maxntidz << "\n"; + + unsigned mincta; + if (llvm::getMinCTASm(F, mincta)) + O << ".minnctapersm " << mincta << "\n"; +} + +void +NVPTXAsmPrinter::getVirtualRegisterName(unsigned vr, bool isVec, + raw_ostream &O) { + const TargetRegisterClass * RC = MRI->getRegClass(vr); + unsigned id = RC->getID(); + + std::map<unsigned, unsigned> ®map = VRidGlobal2LocalMap[id]; + unsigned mapped_vr = regmap[vr]; + + if (!isVec) { + O << getNVPTXRegClassStr(RC) << mapped_vr; + return; + } + // Vector virtual register + if (getNVPTXVectorSize(RC) == 4) + O << "{" + << getNVPTXRegClassStr(RC) << mapped_vr << "_0, " + << getNVPTXRegClassStr(RC) << mapped_vr << "_1, " + << getNVPTXRegClassStr(RC) << mapped_vr << "_2, " + << getNVPTXRegClassStr(RC) << mapped_vr << "_3" + << "}"; + else if (getNVPTXVectorSize(RC) == 2) + O << "{" + << getNVPTXRegClassStr(RC) << mapped_vr << "_0, " + << getNVPTXRegClassStr(RC) << mapped_vr << "_1" + << "}"; + else + llvm_unreachable("Unsupported vector size"); +} + +void +NVPTXAsmPrinter::emitVirtualRegister(unsigned int vr, bool isVec, + raw_ostream &O) { + getVirtualRegisterName(vr, isVec, O); +} + +void NVPTXAsmPrinter::printVecModifiedImmediate(const MachineOperand &MO, + const char *Modifier, + raw_ostream &O) { + static const char vecelem[] = {'0', '1', '2', '3', '0', '1', '2', '3'}; + int Imm = (int)MO.getImm(); + if(0 == strcmp(Modifier, "vecelem")) + O << "_" << vecelem[Imm]; + else if(0 == strcmp(Modifier, "vecv4comm1")) { + if((Imm < 0) || (Imm > 3)) + O << "//"; + } + else if(0 == strcmp(Modifier, "vecv4comm2")) { + if((Imm < 4) || (Imm > 7)) + O << "//"; + } + else if(0 == strcmp(Modifier, 
"vecv4pos")) { + if(Imm < 0) Imm = 0; + O << "_" << vecelem[Imm%4]; + } + else if(0 == strcmp(Modifier, "vecv2comm1")) { + if((Imm < 0) || (Imm > 1)) + O << "//"; + } + else if(0 == strcmp(Modifier, "vecv2comm2")) { + if((Imm < 2) || (Imm > 3)) + O << "//"; + } + else if(0 == strcmp(Modifier, "vecv2pos")) { + if(Imm < 0) Imm = 0; + O << "_" << vecelem[Imm%2]; + } + else + llvm_unreachable("Unknown Modifier on immediate operand"); +} + +void NVPTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum, + raw_ostream &O, const char *Modifier) { + const MachineOperand &MO = MI->getOperand(opNum); + switch (MO.getType()) { + case MachineOperand::MO_Register: + if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) { + if (MO.getReg() == NVPTX::VRDepot) + O << DEPOTNAME << getFunctionNumber(); + else + O << getRegisterName(MO.getReg()); + } else { + if (!Modifier) + emitVirtualRegister(MO.getReg(), false, O); + else { + if (strcmp(Modifier, "vecfull") == 0) + emitVirtualRegister(MO.getReg(), true, O); + else + llvm_unreachable( + "Don't know how to handle the modifier on virtual register."); + } + } + return; + + case MachineOperand::MO_Immediate: + if (!Modifier) + O << MO.getImm(); + else if (strstr(Modifier, "vec") == Modifier) + printVecModifiedImmediate(MO, Modifier, O); + else + llvm_unreachable("Don't know how to handle modifier on immediate operand"); + return; + + case MachineOperand::MO_FPImmediate: + printFPConstant(MO.getFPImm(), O); + break; + + case MachineOperand::MO_GlobalAddress: + O << *Mang->getSymbol(MO.getGlobal()); + break; + + case MachineOperand::MO_ExternalSymbol: { + const char * symbname = MO.getSymbolName(); + if (strstr(symbname, ".PARAM") == symbname) { + unsigned index; + sscanf(symbname+6, "%u[];", &index); + printParamName(index, O); + } + else if (strstr(symbname, ".HLPPARAM") == symbname) { + unsigned index; + sscanf(symbname+9, "%u[];", &index); + O << *CurrentFnSym << "_param_" << index << "_offset"; + } + else + O << symbname; + break; + } + + case MachineOperand::MO_MachineBasicBlock: + O << *MO.getMBB()->getSymbol(); + return; + + default: + llvm_unreachable("Operand type not supported."); + } +} + +void NVPTXAsmPrinter:: +printImplicitDef(const MachineInstr *MI, raw_ostream &O) const { +#ifndef __OPTIMIZE__ + O << "\t// Implicit def :"; + //printOperand(MI, 0); + O << "\n"; +#endif +} + +void NVPTXAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum, + raw_ostream &O, const char *Modifier) { + printOperand(MI, opNum, O); + + if (Modifier && !strcmp(Modifier, "add")) { + O << ", "; + printOperand(MI, opNum+1, O); + } else { + if (MI->getOperand(opNum+1).isImm() && + MI->getOperand(opNum+1).getImm() == 0) + return; // don't print ',0' or '+0' + O << "+"; + printOperand(MI, opNum+1, O); + } +} + +void NVPTXAsmPrinter::printLdStCode(const MachineInstr *MI, int opNum, + raw_ostream &O, const char *Modifier) +{ + if (Modifier) { + const MachineOperand &MO = MI->getOperand(opNum); + int Imm = (int)MO.getImm(); + if (!strcmp(Modifier, "volatile")) { + if (Imm) + O << ".volatile"; + } else if (!strcmp(Modifier, "addsp")) { + switch (Imm) { + case NVPTX::PTXLdStInstCode::GLOBAL: O << ".global"; break; + case NVPTX::PTXLdStInstCode::SHARED: O << ".shared"; break; + case NVPTX::PTXLdStInstCode::LOCAL: O << ".local"; break; + case NVPTX::PTXLdStInstCode::PARAM: O << ".param"; break; + case NVPTX::PTXLdStInstCode::CONSTANT: O << ".const"; break; + case NVPTX::PTXLdStInstCode::GENERIC: + if (!nvptxSubtarget.hasGenericLdSt()) + O << ".global"; + break; + 
default:
+        llvm_unreachable("wrong value");
+      }
+    }
+    else if (!strcmp(Modifier, "sign")) {
+      if (Imm==NVPTX::PTXLdStInstCode::Signed)
+        O << "s";
+      else if (Imm==NVPTX::PTXLdStInstCode::Unsigned)
+        O << "u";
+      else
+        O << "f";
+    }
+    else if (!strcmp(Modifier, "vec")) {
+      if (Imm==NVPTX::PTXLdStInstCode::V2)
+        O << ".v2";
+      else if (Imm==NVPTX::PTXLdStInstCode::V4)
+        O << ".v4";
+    }
+    else
+      llvm_unreachable("unknown modifier");
+  }
+  else
+    llvm_unreachable("unknown modifier");
+}
+
+void NVPTXAsmPrinter::emitDeclaration (const Function *F, raw_ostream &O) {
+
+  emitLinkageDirective(F,O);
+  if (llvm::isKernelFunction(*F))
+    O << ".entry ";
+  else
+    O << ".func ";
+  printReturnValStr(F, O);
+  O << *CurrentFnSym << "\n";
+  emitFunctionParamList(F, O);
+  O << ";\n";
+}
+
+static bool usedInGlobalVarDef(const Constant *C)
+{
+  if (!C)
+    return false;
+
+  if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
+    if (GV->getName().str() == "llvm.used")
+      return false;
+    return true;
+  }
+
+  for (Value::const_use_iterator ui=C->use_begin(), ue=C->use_end();
+      ui!=ue; ++ui) {
+    const Constant *C = dyn_cast<Constant>(*ui);
+    if (usedInGlobalVarDef(C))
+      return true;
+  }
+  return false;
+}
+
+static bool usedInOneFunc(const User *U, Function const *&oneFunc)
+{
+  if (const GlobalVariable *othergv = dyn_cast<GlobalVariable>(U)) {
+    if (othergv->getName().str() == "llvm.used")
+      return true;
+  }
+
+  if (const Instruction *instr = dyn_cast<Instruction>(U)) {
+    if (instr->getParent() && instr->getParent()->getParent()) {
+      const Function *curFunc = instr->getParent()->getParent();
+      if (oneFunc && (curFunc != oneFunc))
+        return false;
+      oneFunc = curFunc;
+      return true;
+    }
+    else
+      return false;
+  }
+
+  if (const MDNode *md = dyn_cast<MDNode>(U))
+    if (md->hasName() && ((md->getName().str() == "llvm.dbg.gv") ||
+        (md->getName().str() == "llvm.dbg.sp")))
+      return true;
+
+
+  for (User::const_use_iterator ui=U->use_begin(), ue=U->use_end();
+      ui!=ue; ++ui) {
+    if (usedInOneFunc(*ui, oneFunc) == false)
+      return false;
+  }
+  return true;
+}
+
+/* Find out if a global variable can be demoted to local scope.
+ * Currently, this is valid for CUDA shared variables, which have local
+ * scope and global lifetime. So the conditions to check are:
+ * 1. Is the global variable in shared address space?
+ * 2. Does it have internal linkage?
+ * 3. Is the global variable referenced only in one function?
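+ * For example, a CUDA __shared__ array declared at module scope but
+ * referenced by a single kernel passes all three checks;
+ * printModuleLevelGV() then parks it in localDecls and emitDemotedVars()
+ * prints it inside that kernel's body.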
+ */ +static bool canDemoteGlobalVar(const GlobalVariable *gv, Function const *&f) { + if (gv->hasInternalLinkage() == false) + return false; + const PointerType *Pty = gv->getType(); + if (Pty->getAddressSpace() != llvm::ADDRESS_SPACE_SHARED) + return false; + + const Function *oneFunc = 0; + + bool flag = usedInOneFunc(gv, oneFunc); + if (flag == false) + return false; + if (!oneFunc) + return false; + f = oneFunc; + return true; +} + +static bool useFuncSeen(const Constant *C, + llvm::DenseMap<const Function *, bool> &seenMap) { + for (Value::const_use_iterator ui=C->use_begin(), ue=C->use_end(); + ui!=ue; ++ui) { + if (const Constant *cu = dyn_cast<Constant>(*ui)) { + if (useFuncSeen(cu, seenMap)) + return true; + } else if (const Instruction *I = dyn_cast<Instruction>(*ui)) { + const BasicBlock *bb = I->getParent(); + if (!bb) continue; + const Function *caller = bb->getParent(); + if (!caller) continue; + if (seenMap.find(caller) != seenMap.end()) + return true; + } + } + return false; +} + +void NVPTXAsmPrinter::emitDeclarations (Module &M, raw_ostream &O) { + llvm::DenseMap<const Function *, bool> seenMap; + for (Module::const_iterator FI=M.begin(), FE=M.end(); + FI!=FE; ++FI) { + const Function *F = FI; + + if (F->isDeclaration()) { + if (F->use_empty()) + continue; + if (F->getIntrinsicID()) + continue; + CurrentFnSym = Mang->getSymbol(F); + emitDeclaration(F, O); + continue; + } + for (Value::const_use_iterator iter=F->use_begin(), + iterEnd=F->use_end(); iter!=iterEnd; ++iter) { + if (const Constant *C = dyn_cast<Constant>(*iter)) { + if (usedInGlobalVarDef(C)) { + // The use is in the initialization of a global variable + // that is a function pointer, so print a declaration + // for the original function + CurrentFnSym = Mang->getSymbol(F); + emitDeclaration(F, O); + break; + } + // Emit a declaration of this function if the function that + // uses this constant expr has already been seen. + if (useFuncSeen(C, seenMap)) { + CurrentFnSym = Mang->getSymbol(F); + emitDeclaration(F, O); + break; + } + } + + if (!isa<Instruction>(*iter)) continue; + const Instruction *instr = cast<Instruction>(*iter); + const BasicBlock *bb = instr->getParent(); + if (!bb) continue; + const Function *caller = bb->getParent(); + if (!caller) continue; + + // If a caller has already been seen, then the caller is + // appearing in the module before the callee. so print out + // a declaration for the callee. 
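+      // e.g. if callee foo() is defined after caller bar() in the module,
+      // a ".func foo(...);" prototype is printed ahead of bar's body,
+      // since PTX requires symbols to be declared before they are used.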
+ if (seenMap.find(caller) != seenMap.end()) { + CurrentFnSym = Mang->getSymbol(F); + emitDeclaration(F, O); + break; + } + } + seenMap[F] = true; + } +} + +void NVPTXAsmPrinter::recordAndEmitFilenames(Module &M) { + DebugInfoFinder DbgFinder; + DbgFinder.processModule(M); + + unsigned i=1; + for (DebugInfoFinder::iterator I = DbgFinder.compile_unit_begin(), + E = DbgFinder.compile_unit_end(); I != E; ++I) { + DICompileUnit DIUnit(*I); + StringRef Filename(DIUnit.getFilename()); + StringRef Dirname(DIUnit.getDirectory()); + SmallString<128> FullPathName = Dirname; + if (!Dirname.empty() && !sys::path::is_absolute(Filename)) { + sys::path::append(FullPathName, Filename); + Filename = FullPathName.str(); + } + if (filenameMap.find(Filename.str()) != filenameMap.end()) + continue; + filenameMap[Filename.str()] = i; + OutStreamer.EmitDwarfFileDirective(i, "", Filename.str()); + ++i; + } + + for (DebugInfoFinder::iterator I = DbgFinder.subprogram_begin(), + E = DbgFinder.subprogram_end(); I != E; ++I) { + DISubprogram SP(*I); + StringRef Filename(SP.getFilename()); + StringRef Dirname(SP.getDirectory()); + SmallString<128> FullPathName = Dirname; + if (!Dirname.empty() && !sys::path::is_absolute(Filename)) { + sys::path::append(FullPathName, Filename); + Filename = FullPathName.str(); + } + if (filenameMap.find(Filename.str()) != filenameMap.end()) + continue; + filenameMap[Filename.str()] = i; + ++i; + } +} + +bool NVPTXAsmPrinter::doInitialization (Module &M) { + + SmallString<128> Str1; + raw_svector_ostream OS1(Str1); + + MMI = getAnalysisIfAvailable<MachineModuleInfo>(); + MMI->AnalyzeModule(M); + + // We need to call the parent's one explicitly. + //bool Result = AsmPrinter::doInitialization(M); + + // Initialize TargetLoweringObjectFile. + const_cast<TargetLoweringObjectFile&>(getObjFileLowering()) + .Initialize(OutContext, TM); + + Mang = new Mangler(OutContext, *TM.getTargetData()); + + // Emit header before any dwarf directives are emitted below. + emitHeader(M, OS1); + OutStreamer.EmitRawText(OS1.str()); + + + // Already commented out + //bool Result = AsmPrinter::doInitialization(M); + + + if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA) + recordAndEmitFilenames(M); + + SmallString<128> Str2; + raw_svector_ostream OS2(Str2); + + emitDeclarations(M, OS2); + + // Print out module-level global variables here. + for (Module::global_iterator I = M.global_begin(), E = M.global_end(); + I != E; ++I) + printModuleLevelGV(I, OS2); + + OS2 << '\n'; + + OutStreamer.EmitRawText(OS2.str()); + return false; // success +} + +void NVPTXAsmPrinter::emitHeader (Module &M, raw_ostream &O) { + O << "//\n"; + O << "// Generated by LLVM NVPTX Back-End\n"; + O << "//\n"; + O << "\n"; + + O << ".version 3.0\n"; + + O << ".target "; + O << nvptxSubtarget.getTargetName(); + + if (nvptxSubtarget.getDrvInterface() == NVPTX::NVCL) + O << ", texmode_independent"; + if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA) { + if (!nvptxSubtarget.hasDouble()) + O << ", map_f64_to_f32"; + } + + if (MAI->doesSupportDebugInformation()) + O << ", debug"; + + O << "\n"; + + O << ".address_size "; + if (nvptxSubtarget.is64Bit()) + O << "64"; + else + O << "32"; + O << "\n"; + + O << "\n"; +} + +bool NVPTXAsmPrinter::doFinalization(Module &M) { + // XXX Temproarily remove global variables so that doFinalization() will not + // emit them again (global variables are emitted at beginning). 
+ + Module::GlobalListType &global_list = M.getGlobalList(); + int i, n = global_list.size(); + GlobalVariable **gv_array = new GlobalVariable* [n]; + + // first, back-up GlobalVariable in gv_array + i = 0; + for (Module::global_iterator I = global_list.begin(), E = global_list.end(); + I != E; ++I) + gv_array[i++] = &*I; + + // second, empty global_list + while (!global_list.empty()) + global_list.remove(global_list.begin()); + + // call doFinalization + bool ret = AsmPrinter::doFinalization(M); + + // now we restore global variables + for (i = 0; i < n; i ++) + global_list.insert(global_list.end(), gv_array[i]); + + delete[] gv_array; + return ret; + + + //bool Result = AsmPrinter::doFinalization(M); + // Instead of calling the parents doFinalization, we may + // clone parents doFinalization and customize here. + // Currently, we if NVISA out the EmitGlobals() in + // parent's doFinalization, which is too intrusive. + // + // Same for the doInitialization. + //return Result; +} + +// This function emits appropriate linkage directives for +// functions and global variables. +// +// extern function declaration -> .extern +// extern function definition -> .visible +// external global variable with init -> .visible +// external without init -> .extern +// appending -> not allowed, assert. + +void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue* V, raw_ostream &O) +{ + if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA) { + if (V->hasExternalLinkage()) { + if (isa<GlobalVariable>(V)) { + const GlobalVariable *GVar = cast<GlobalVariable>(V); + if (GVar) { + if (GVar->hasInitializer()) + O << ".visible "; + else + O << ".extern "; + } + } else if (V->isDeclaration()) + O << ".extern "; + else + O << ".visible "; + } else if (V->hasAppendingLinkage()) { + std::string msg; + msg.append("Error: "); + msg.append("Symbol "); + if (V->hasName()) + msg.append(V->getName().str()); + msg.append("has unsupported appending linkage type"); + llvm_unreachable(msg.c_str()); + } + } +} + + +void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable* GVar, raw_ostream &O, + bool processDemoted) { + + // Skip meta data + if (GVar->hasSection()) { + if (GVar->getSection() == "llvm.metadata") + return; + } + + const TargetData *TD = TM.getTargetData(); + + // GlobalVariables are always constant pointers themselves. + const PointerType *PTy = GVar->getType(); + Type *ETy = PTy->getElementType(); + + if (GVar->hasExternalLinkage()) { + if (GVar->hasInitializer()) + O << ".visible "; + else + O << ".extern "; + } + + if (llvm::isTexture(*GVar)) { + O << ".global .texref " << llvm::getTextureName(*GVar) << ";\n"; + return; + } + + if (llvm::isSurface(*GVar)) { + O << ".global .surfref " << llvm::getSurfaceName(*GVar) << ";\n"; + return; + } + + if (GVar->isDeclaration()) { + // (extern) declarations, no definition or initializer + // Currently the only known declaration is for an automatic __local + // (.shared) promoted to global. 
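+    // e.g. (illustrative) an extern 256-byte shared array comes out as:
+    //
+    //   .extern .shared .align 4 .b8 buf[256];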
+  emitPTXGlobalVariable(GVar, O);
+  O << ";\n";
+  return;
+  }
+
+  if (llvm::isSampler(*GVar)) {
+    O << ".global .samplerref " << llvm::getSamplerName(*GVar);
+
+    Constant *Initializer = NULL;
+    if (GVar->hasInitializer())
+      Initializer = GVar->getInitializer();
+    ConstantInt *CI = NULL;
+    if (Initializer)
+      CI = dyn_cast<ConstantInt>(Initializer);
+    if (CI) {
+      unsigned sample = CI->getZExtValue();
+
+      O << " = { ";
+
+      for (int i = 0, addr = ((sample & __CLK_ADDRESS_MASK) >>
+          __CLK_ADDRESS_BASE); i < 3; i++) {
+        O << "addr_mode_" << i << " = ";
+        switch (addr) {
+        case 0: O << "wrap"; break;
+        case 1: O << "clamp_to_border"; break;
+        case 2: O << "clamp_to_edge"; break;
+        case 3: O << "wrap"; break;
+        case 4: O << "mirror"; break;
+        }
+        O << ", ";
+      }
+      O << "filter_mode = ";
+      switch ((sample & __CLK_FILTER_MASK) >> __CLK_FILTER_BASE) {
+      case 0: O << "nearest"; break;
+      case 1: O << "linear"; break;
+      case 2: assert(0 && "Anisotropic filtering is not supported");
+      default: O << "nearest"; break;
+      }
+      if (!((sample & __CLK_NORMALIZED_MASK) >> __CLK_NORMALIZED_BASE)) {
+        O << ", force_unnormalized_coords = 1";
+      }
+      O << " }";
+    }
+
+    O << ";\n";
+    return;
+  }
+
+  if (GVar->hasPrivateLinkage()) {
+
+    if (!strncmp(GVar->getName().data(), "unrollpragma", 12))
+      return;
+
+    // FIXME - need better way (e.g. Metadata) to avoid generating this global
+    if (!strncmp(GVar->getName().data(), "filename", 8))
+      return;
+    if (GVar->use_empty())
+      return;
+  }
+
+  const Function *demotedFunc = 0;
+  if (!processDemoted && canDemoteGlobalVar(GVar, demotedFunc)) {
+    O << "// " << GVar->getName().str() << " has been demoted\n";
+    if (localDecls.find(demotedFunc) != localDecls.end())
+      localDecls[demotedFunc].push_back(GVar);
+    else {
+      std::vector<GlobalVariable *> temp;
+      temp.push_back(GVar);
+      localDecls[demotedFunc] = temp;
+    }
+    return;
+  }
+
+  O << ".";
+  emitPTXAddressSpace(PTy->getAddressSpace(), O);
+  if (GVar->getAlignment() == 0)
+    O << " .align " << (int) TD->getPrefTypeAlignment(ETy);
+  else
+    O << " .align " << GVar->getAlignment();
+
+
+  if (ETy->isPrimitiveType() || ETy->isIntegerTy() || isa<PointerType>(ETy)) {
+    O << " .";
+    O << getPTXFundamentalTypeStr(ETy, false);
+    O << " ";
+    O << *Mang->getSymbol(GVar);
+
+    // PTX allows variable initialization only for constant and global state
+    // spaces.
+    if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) ||
+        (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST_NOT_GEN) ||
+        (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST))
+        && GVar->hasInitializer()) {
+      Constant *Initializer = GVar->getInitializer();
+      if (!Initializer->isNullValue()) {
+        O << " = ";
+        printScalarConstant(Initializer, O);
+      }
+    }
+  } else {
+    unsigned int ElementSize = 0;
+
+    // Although PTX has direct support for struct and array types, and LLVM IR
+    // is very similar to PTX, LLVM CodeGen does not support these high-level
+    // field accesses; structs, arrays and vectors are lowered into arrays of
+    // bytes.
+    switch (ETy->getTypeID()) {
+    case Type::StructTyID:
+    case Type::ArrayTyID:
+    case Type::VectorTyID:
+      ElementSize = TD->getTypeStoreSize(ETy);
+      // PTX allows variable initialization only for constant and
+      // global state spaces.
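+      // e.g. (illustrative, little-endian bytes) a two-element i32 array
+      // {1, 2} in global space prints as:
+      //
+      //   .global .align 4 .b8 arr[8] = {1, 0, 0, 0, 2, 0, 0, 0};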
+ if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) || + (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST_NOT_GEN) || + (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) + && GVar->hasInitializer()) { + Constant *Initializer = GVar->getInitializer(); + if (!isa<UndefValue>(Initializer) && + !Initializer->isNullValue()) { + AggBuffer aggBuffer(ElementSize, O, *this); + bufferAggregateConstant(Initializer, &aggBuffer); + if (aggBuffer.numSymbols) { + if (nvptxSubtarget.is64Bit()) { + O << " .u64 " << *Mang->getSymbol(GVar) <<"[" ; + O << ElementSize/8; + } + else { + O << " .u32 " << *Mang->getSymbol(GVar) <<"[" ; + O << ElementSize/4; + } + O << "]"; + } + else { + O << " .b8 " << *Mang->getSymbol(GVar) <<"[" ; + O << ElementSize; + O << "]"; + } + O << " = {" ; + aggBuffer.print(); + O << "}"; + } + else { + O << " .b8 " << *Mang->getSymbol(GVar) ; + if (ElementSize) { + O <<"[" ; + O << ElementSize; + O << "]"; + } + } + } + else { + O << " .b8 " << *Mang->getSymbol(GVar); + if (ElementSize) { + O <<"[" ; + O << ElementSize; + O << "]"; + } + } + break; + default: + assert( 0 && "type not supported yet"); + } + + } + O << ";\n"; +} + +void NVPTXAsmPrinter::emitDemotedVars(const Function *f, raw_ostream &O) { + if (localDecls.find(f) == localDecls.end()) + return; + + std::vector<GlobalVariable *> &gvars = localDecls[f]; + + for (unsigned i=0, e=gvars.size(); i!=e; ++i) { + O << "\t// demoted variable\n\t"; + printModuleLevelGV(gvars[i], O, true); + } +} + +void NVPTXAsmPrinter::emitPTXAddressSpace(unsigned int AddressSpace, + raw_ostream &O) const { + switch (AddressSpace) { + case llvm::ADDRESS_SPACE_LOCAL: + O << "local" ; + break; + case llvm::ADDRESS_SPACE_GLOBAL: + O << "global" ; + break; + case llvm::ADDRESS_SPACE_CONST: + // This logic should be consistent with that in + // getCodeAddrSpace() (NVPTXISelDATToDAT.cpp) + if (nvptxSubtarget.hasGenericLdSt()) + O << "global" ; + else + O << "const" ; + break; + case llvm::ADDRESS_SPACE_CONST_NOT_GEN: + O << "const" ; + break; + case llvm::ADDRESS_SPACE_SHARED: + O << "shared" ; + break; + default: + llvm_unreachable("unexpected address space"); + } +} + +std::string NVPTXAsmPrinter::getPTXFundamentalTypeStr(const Type *Ty, + bool useB4PTR) const { + switch (Ty->getTypeID()) { + default: + llvm_unreachable("unexpected type"); + break; + case Type::IntegerTyID: { + unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth(); + if (NumBits == 1) + return "pred"; + else if (NumBits <= 64) { + std::string name = "u"; + return name + utostr(NumBits); + } else { + llvm_unreachable("Integer too large"); + break; + } + break; + } + case Type::FloatTyID: + return "f32"; + case Type::DoubleTyID: + return "f64"; + case Type::PointerTyID: + if (nvptxSubtarget.is64Bit()) + if (useB4PTR) return "b64"; + else return "u64"; + else + if (useB4PTR) return "b32"; + else return "u32"; + } + llvm_unreachable("unexpected type"); + return NULL; +} + +void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable* GVar, + raw_ostream &O) { + + const TargetData *TD = TM.getTargetData(); + + // GlobalVariables are always constant pointers themselves. 
+  const PointerType *PTy = GVar->getType();
+  Type *ETy = PTy->getElementType();
+
+  O << ".";
+  emitPTXAddressSpace(PTy->getAddressSpace(), O);
+  if (GVar->getAlignment() == 0)
+    O << " .align " << (int) TD->getPrefTypeAlignment(ETy);
+  else
+    O << " .align " << GVar->getAlignment();
+
+  if (ETy->isPrimitiveType() || ETy->isIntegerTy() || isa<PointerType>(ETy)) {
+    O << " .";
+    O << getPTXFundamentalTypeStr(ETy);
+    O << " ";
+    O << *Mang->getSymbol(GVar);
+    return;
+  }
+
+  int64_t ElementSize = 0;
+
+  // Although PTX has direct support for struct and array types, and LLVM IR
+  // is very similar to PTX, LLVM CodeGen does not support these high-level
+  // field accesses; structs and arrays are lowered into arrays of bytes.
+  switch (ETy->getTypeID()) {
+  case Type::StructTyID:
+  case Type::ArrayTyID:
+  case Type::VectorTyID:
+    ElementSize = TD->getTypeStoreSize(ETy);
+    O << " .b8 " << *Mang->getSymbol(GVar) << "[";
+    if (ElementSize) {
+      O << itostr(ElementSize);
+    }
+    O << "]";
+    break;
+  default:
+    assert( 0 && "type not supported yet");
+  }
+  return;
+}
+
+
+static unsigned int
+getOpenCLAlignment(const TargetData *TD,
+                   Type *Ty) {
+  if (Ty->isPrimitiveType() || Ty->isIntegerTy() || isa<PointerType>(Ty))
+    return TD->getPrefTypeAlignment(Ty);
+
+  const ArrayType *ATy = dyn_cast<ArrayType>(Ty);
+  if (ATy)
+    return getOpenCLAlignment(TD, ATy->getElementType());
+
+  const VectorType *VTy = dyn_cast<VectorType>(Ty);
+  if (VTy) {
+    Type *ETy = VTy->getElementType();
+    unsigned int numE = VTy->getNumElements();
+    unsigned int alignE = TD->getPrefTypeAlignment(ETy);
+    if (numE == 3)
+      return 4*alignE;
+    else
+      return numE*alignE;
+  }
+
+  const StructType *STy = dyn_cast<StructType>(Ty);
+  if (STy) {
+    unsigned int alignStruct = 1;
+    // Go through each element of the struct and find the
+    // largest alignment.
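+    // e.g. for struct { char c; float4 v; } the float4 member dominates:
+    // numE(4) * alignE(4) = 16, so the struct reports alignment 16. The
+    // numE == 3 case above mirrors OpenCL's rule that 3-element vectors
+    // are sized and aligned like 4-element ones.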
+ for (unsigned i=0, e=STy->getNumElements(); i != e; i++) { + Type *ETy = STy->getElementType(i); + unsigned int align = getOpenCLAlignment(TD, ETy); + if (align > alignStruct) + alignStruct = align; + } + return alignStruct; + } + + const FunctionType *FTy = dyn_cast<FunctionType>(Ty); + if (FTy) + return TD->getPointerPrefAlignment(); + return TD->getPrefTypeAlignment(Ty); +} + +void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I, + int paramIndex, raw_ostream &O) { + if ((nvptxSubtarget.getDrvInterface() == NVPTX::NVCL) || + (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA)) + O << *CurrentFnSym << "_param_" << paramIndex; + else { + std::string argName = I->getName(); + const char *p = argName.c_str(); + while (*p) { + if (*p == '.') + O << "_"; + else + O << *p; + p++; + } + } +} + +void NVPTXAsmPrinter::printParamName(int paramIndex, raw_ostream &O) { + Function::const_arg_iterator I, E; + int i = 0; + + if ((nvptxSubtarget.getDrvInterface() == NVPTX::NVCL) || + (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA)) { + O << *CurrentFnSym << "_param_" << paramIndex; + return; + } + + for (I = F->arg_begin(), E = F->arg_end(); I != E; ++I, i++) { + if (i==paramIndex) { + printParamName(I, paramIndex, O); + return; + } + } + llvm_unreachable("paramIndex out of bound"); +} + +void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, + raw_ostream &O) { + const TargetData *TD = TM.getTargetData(); + const AttrListPtr &PAL = F->getAttributes(); + const TargetLowering *TLI = TM.getTargetLowering(); + Function::const_arg_iterator I, E; + unsigned paramIndex = 0; + bool first = true; + bool isKernelFunc = llvm::isKernelFunction(*F); + bool isABI = (nvptxSubtarget.getSmVersion() >= 20); + MVT thePointerTy = TLI->getPointerTy(); + + O << "(\n"; + + for (I = F->arg_begin(), E = F->arg_end(); I != E; ++I, paramIndex++) { + const Type *Ty = I->getType(); + + if (!first) + O << ",\n"; + + first = false; + + // Handle image/sampler parameters + if (llvm::isSampler(*I) || llvm::isImage(*I)) { + if (llvm::isImage(*I)) { + std::string sname = I->getName(); + if (llvm::isImageWriteOnly(*I)) + O << "\t.param .surfref " << *CurrentFnSym << "_param_" << paramIndex; + else // Default image is read_only + O << "\t.param .texref " << *CurrentFnSym << "_param_" << paramIndex; + } + else // Should be llvm::isSampler(*I) + O << "\t.param .samplerref " << *CurrentFnSym << "_param_" + << paramIndex; + continue; + } + + if (PAL.paramHasAttr(paramIndex+1, Attribute::ByVal) == false) { + // Just a scalar + const PointerType *PTy = dyn_cast<PointerType>(Ty); + if (isKernelFunc) { + if (PTy) { + // Special handling for pointer arguments to kernel + O << "\t.param .u" << thePointerTy.getSizeInBits() << " "; + + if (nvptxSubtarget.getDrvInterface() != NVPTX::CUDA) { + Type *ETy = PTy->getElementType(); + int addrSpace = PTy->getAddressSpace(); + switch(addrSpace) { + default: + O << ".ptr "; + break; + case llvm::ADDRESS_SPACE_CONST_NOT_GEN: + O << ".ptr .const "; + break; + case llvm::ADDRESS_SPACE_SHARED: + O << ".ptr .shared "; + break; + case llvm::ADDRESS_SPACE_GLOBAL: + case llvm::ADDRESS_SPACE_CONST: + O << ".ptr .global "; + break; + } + O << ".align " << (int)getOpenCLAlignment(TD, ETy) << " "; + } + printParamName(I, paramIndex, O); + continue; + } + + // non-pointer scalar to kernel func + O << "\t.param ." 
+ << getPTXFundamentalTypeStr(Ty) << " "; + printParamName(I, paramIndex, O); + continue; + } + // Non-kernel function, just print .param .b<size> for ABI + // and .reg .b<size> for non ABY + unsigned sz = 0; + if (isa<IntegerType>(Ty)) { + sz = cast<IntegerType>(Ty)->getBitWidth(); + if (sz < 32) sz = 32; + } + else if (isa<PointerType>(Ty)) + sz = thePointerTy.getSizeInBits(); + else + sz = Ty->getPrimitiveSizeInBits(); + if (isABI) + O << "\t.param .b" << sz << " "; + else + O << "\t.reg .b" << sz << " "; + printParamName(I, paramIndex, O); + continue; + } + + // param has byVal attribute. So should be a pointer + const PointerType *PTy = dyn_cast<PointerType>(Ty); + assert(PTy && + "Param with byval attribute should be a pointer type"); + Type *ETy = PTy->getElementType(); + + if (isABI || isKernelFunc) { + // Just print .param .b8 .align <a> .param[size]; + // <a> = PAL.getparamalignment + // size = typeallocsize of element type + unsigned align = PAL.getParamAlignment(paramIndex+1); + unsigned sz = TD->getTypeAllocSize(ETy); + O << "\t.param .align " << align + << " .b8 "; + printParamName(I, paramIndex, O); + O << "[" << sz << "]"; + continue; + } else { + // Split the ETy into constituent parts and + // print .param .b<size> <name> for each part. + // Further, if a part is vector, print the above for + // each vector element. + SmallVector<EVT, 16> vtparts; + ComputeValueVTs(*TLI, ETy, vtparts); + for (unsigned i=0,e=vtparts.size(); i!=e; ++i) { + unsigned elems = 1; + EVT elemtype = vtparts[i]; + if (vtparts[i].isVector()) { + elems = vtparts[i].getVectorNumElements(); + elemtype = vtparts[i].getVectorElementType(); + } + + for (unsigned j=0,je=elems; j!=je; ++j) { + unsigned sz = elemtype.getSizeInBits(); + if (elemtype.isInteger() && (sz < 32)) sz = 32; + O << "\t.reg .b" << sz << " "; + printParamName(I, paramIndex, O); + if (j<je-1) O << ",\n"; + ++paramIndex; + } + if (i<e-1) + O << ",\n"; + } + --paramIndex; + continue; + } + } + + O << "\n)\n"; +} + +void NVPTXAsmPrinter::emitFunctionParamList(const MachineFunction &MF, + raw_ostream &O) { + const Function *F = MF.getFunction(); + emitFunctionParamList(F, O); +} + + +void NVPTXAsmPrinter:: +setAndEmitFunctionVirtualRegisters(const MachineFunction &MF) { + SmallString<128> Str; + raw_svector_ostream O(Str); + + // Map the global virtual register number to a register class specific + // virtual register number starting from 1 with that class. + const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo(); + //unsigned numRegClasses = TRI->getNumRegClasses(); + + // Emit the Fake Stack Object + const MachineFrameInfo *MFI = MF.getFrameInfo(); + int NumBytes = (int) MFI->getStackSize(); + if (NumBytes) { + O << "\t.local .align " << MFI->getMaxAlignment() << " .b8 \t" + << DEPOTNAME + << getFunctionNumber() << "[" << NumBytes << "];\n"; + if (nvptxSubtarget.is64Bit()) { + O << "\t.reg .b64 \t%SP;\n"; + O << "\t.reg .b64 \t%SPL;\n"; + } + else { + O << "\t.reg .b32 \t%SP;\n"; + O << "\t.reg .b32 \t%SPL;\n"; + } + } + + // Go through all virtual registers to establish the mapping between the + // global virtual + // register number and the per class virtual register number. + // We use the per class virtual register number in the ptx output. 
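+  // e.g. if global vregs 5 and 7 are the first Int32Regs and Float32Regs
+  // values seen, both map to per-class number 1 and are later printed as
+  // "%r1" and "%f1" by getVirtualRegisterName().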
+ unsigned int numVRs = MRI->getNumVirtRegs(); + for (unsigned i=0; i< numVRs; i++) { + unsigned int vr = TRI->index2VirtReg(i); + const TargetRegisterClass *RC = MRI->getRegClass(vr); + std::map<unsigned, unsigned> ®map = VRidGlobal2LocalMap[RC->getID()]; + int n = regmap.size(); + regmap.insert(std::make_pair(vr, n+1)); + } + + // Emit register declarations + // @TODO: Extract out the real register usage + O << "\t.reg .pred %p<" << NVPTXNumRegisters << ">;\n"; + O << "\t.reg .s16 %rc<" << NVPTXNumRegisters << ">;\n"; + O << "\t.reg .s16 %rs<" << NVPTXNumRegisters << ">;\n"; + O << "\t.reg .s32 %r<" << NVPTXNumRegisters << ">;\n"; + O << "\t.reg .s64 %rl<" << NVPTXNumRegisters << ">;\n"; + O << "\t.reg .f32 %f<" << NVPTXNumRegisters << ">;\n"; + O << "\t.reg .f64 %fl<" << NVPTXNumRegisters << ">;\n"; + + // Emit declaration of the virtual registers or 'physical' registers for + // each register class + //for (unsigned i=0; i< numRegClasses; i++) { + // std::map<unsigned, unsigned> ®map = VRidGlobal2LocalMap[i]; + // const TargetRegisterClass *RC = TRI->getRegClass(i); + // std::string rcname = getNVPTXRegClassName(RC); + // std::string rcStr = getNVPTXRegClassStr(RC); + // //int n = regmap.size(); + // if (!isNVPTXVectorRegClass(RC)) { + // O << "\t.reg " << rcname << " \t" << rcStr << "<" + // << NVPTXNumRegisters << ">;\n"; + // } + + // Only declare those registers that may be used. And do not emit vector + // registers as + // they are all elementized to scalar registers. + //if (n && !isNVPTXVectorRegClass(RC)) { + // if (RegAllocNilUsed) { + // O << "\t.reg " << rcname << " \t" << rcStr << "<" << (n+1) + // << ">;\n"; + // } + // else { + // O << "\t.reg " << rcname << " \t" << StrToUpper(rcStr) + // << "<" << 32 << ">;\n"; + // } + //} + //} + + OutStreamer.EmitRawText(O.str()); +} + + +void NVPTXAsmPrinter::printFPConstant(const ConstantFP *Fp, raw_ostream &O) { + APFloat APF = APFloat(Fp->getValueAPF()); // make a copy + bool ignored; + unsigned int numHex; + const char *lead; + + if (Fp->getType()->getTypeID()==Type::FloatTyID) { + numHex = 8; + lead = "0f"; + APF.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven, + &ignored); + } else if (Fp->getType()->getTypeID() == Type::DoubleTyID) { + numHex = 16; + lead = "0d"; + APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, + &ignored); + } else + llvm_unreachable("unsupported fp type"); + + APInt API = APF.bitcastToAPInt(); + std::string hexstr(utohexstr(API.getZExtValue())); + O << lead; + if (hexstr.length() < numHex) + O << std::string(numHex - hexstr.length(), '0'); + O << utohexstr(API.getZExtValue()); +} + +void NVPTXAsmPrinter::printScalarConstant(Constant *CPV, raw_ostream &O) { + if (ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) { + O << CI->getValue(); + return; + } + if (ConstantFP *CFP = dyn_cast<ConstantFP>(CPV)) { + printFPConstant(CFP, O); + return; + } + if (isa<ConstantPointerNull>(CPV)) { + O << "0"; + return; + } + if (GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) { + O << *Mang->getSymbol(GVar); + return; + } + if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { + Value *v = Cexpr->stripPointerCasts(); + if (GlobalValue *GVar = dyn_cast<GlobalValue>(v)) { + O << *Mang->getSymbol(GVar); + return; + } else { + O << *LowerConstant(CPV, *this); + return; + } + } + llvm_unreachable("Not scalar type found in printScalarConstant()"); +} + + +void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes, + AggBuffer *aggBuffer) { + + const TargetData *TD = TM.getTargetData(); + + if 
(isa<UndefValue>(CPV) || CPV->isNullValue()) { + int s = TD->getTypeAllocSize(CPV->getType()); + if (s<Bytes) + s = Bytes; + aggBuffer->addZeros(s); + return; + } + + unsigned char *ptr; + switch (CPV->getType()->getTypeID()) { + + case Type::IntegerTyID: { + const Type *ETy = CPV->getType(); + if ( ETy == Type::getInt8Ty(CPV->getContext()) ){ + unsigned char c = + (unsigned char)(dyn_cast<ConstantInt>(CPV))->getZExtValue(); + ptr = &c; + aggBuffer->addBytes(ptr, 1, Bytes); + } else if ( ETy == Type::getInt16Ty(CPV->getContext()) ) { + short int16 = + (short)(dyn_cast<ConstantInt>(CPV))->getZExtValue(); + ptr = (unsigned char*)&int16; + aggBuffer->addBytes(ptr, 2, Bytes); + } else if ( ETy == Type::getInt32Ty(CPV->getContext()) ) { + if (ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) { + int int32 =(int)(constInt->getZExtValue()); + ptr = (unsigned char*)&int32; + aggBuffer->addBytes(ptr, 4, Bytes); + break; + } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { + if (ConstantInt *constInt = + dyn_cast<ConstantInt>(ConstantFoldConstantExpression( + Cexpr, TD))) { + int int32 =(int)(constInt->getZExtValue()); + ptr = (unsigned char*)&int32; + aggBuffer->addBytes(ptr, 4, Bytes); + break; + } + if (Cexpr->getOpcode() == Instruction::PtrToInt) { + Value *v = Cexpr->getOperand(0)->stripPointerCasts(); + aggBuffer->addSymbol(v); + aggBuffer->addZeros(4); + break; + } + } + llvm_unreachable("unsupported integer const type"); + } else if (ETy == Type::getInt64Ty(CPV->getContext()) ) { + if (ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) { + long long int64 =(long long)(constInt->getZExtValue()); + ptr = (unsigned char*)&int64; + aggBuffer->addBytes(ptr, 8, Bytes); + break; + } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { + if (ConstantInt *constInt = dyn_cast<ConstantInt>( + ConstantFoldConstantExpression(Cexpr, TD))) { + long long int64 =(long long)(constInt->getZExtValue()); + ptr = (unsigned char*)&int64; + aggBuffer->addBytes(ptr, 8, Bytes); + break; + } + if (Cexpr->getOpcode() == Instruction::PtrToInt) { + Value *v = Cexpr->getOperand(0)->stripPointerCasts(); + aggBuffer->addSymbol(v); + aggBuffer->addZeros(8); + break; + } + } + llvm_unreachable("unsupported integer const type"); + } else + llvm_unreachable("unsupported integer const type"); + break; + } + case Type::FloatTyID: + case Type::DoubleTyID: { + ConstantFP *CFP = dyn_cast<ConstantFP>(CPV); + const Type* Ty = CFP->getType(); + if (Ty == Type::getFloatTy(CPV->getContext())) { + float float32 = (float)CFP->getValueAPF().convertToFloat(); + ptr = (unsigned char*)&float32; + aggBuffer->addBytes(ptr, 4, Bytes); + } else if (Ty == Type::getDoubleTy(CPV->getContext())) { + double float64 = CFP->getValueAPF().convertToDouble(); + ptr = (unsigned char*)&float64; + aggBuffer->addBytes(ptr, 8, Bytes); + } + else { + llvm_unreachable("unsupported fp const type"); + } + break; + } + case Type::PointerTyID: { + if (GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) { + aggBuffer->addSymbol(GVar); + } + else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { + Value *v = Cexpr->stripPointerCasts(); + aggBuffer->addSymbol(v); + } + unsigned int s = TD->getTypeAllocSize(CPV->getType()); + aggBuffer->addZeros(s); + break; + } + + case Type::ArrayTyID: + case Type::VectorTyID: + case Type::StructTyID: { + if (isa<ConstantArray>(CPV) || isa<ConstantVector>(CPV) || + isa<ConstantStruct>(CPV)) { + int ElementSize = TD->getTypeAllocSize(CPV->getType()); + bufferAggregateConstant(CPV, aggBuffer); + if ( Bytes > 
ElementSize ) + aggBuffer->addZeros(Bytes-ElementSize); + } + else if (isa<ConstantAggregateZero>(CPV)) + aggBuffer->addZeros(Bytes); + else + llvm_unreachable("Unexpected Constant type"); + break; + } + + default: + llvm_unreachable("unsupported type"); + } +} + +void NVPTXAsmPrinter::bufferAggregateConstant(Constant *CPV, + AggBuffer *aggBuffer) { + const TargetData *TD = TM.getTargetData(); + int Bytes; + + // Old constants + if (isa<ConstantArray>(CPV) || isa<ConstantVector>(CPV)) { + if (CPV->getNumOperands()) + for (unsigned i = 0, e = CPV->getNumOperands(); i != e; ++i) + bufferLEByte(cast<Constant>(CPV->getOperand(i)), 0, aggBuffer); + return; + } + + if (const ConstantDataSequential *CDS = + dyn_cast<ConstantDataSequential>(CPV)) { + if (CDS->getNumElements()) + for (unsigned i = 0; i < CDS->getNumElements(); ++i) + bufferLEByte(cast<Constant>(CDS->getElementAsConstant(i)), 0, + aggBuffer); + return; + } + + + if (isa<ConstantStruct>(CPV)) { + if (CPV->getNumOperands()) { + StructType *ST = cast<StructType>(CPV->getType()); + for (unsigned i = 0, e = CPV->getNumOperands(); i != e; ++i) { + if ( i == (e - 1)) + Bytes = TD->getStructLayout(ST)->getElementOffset(0) + + TD->getTypeAllocSize(ST) + - TD->getStructLayout(ST)->getElementOffset(i); + else + Bytes = TD->getStructLayout(ST)->getElementOffset(i+1) - + TD->getStructLayout(ST)->getElementOffset(i); + bufferLEByte(cast<Constant>(CPV->getOperand(i)), Bytes, + aggBuffer); + } + } + return; + } + llvm_unreachable("unsupported constant type in printAggregateConstant()"); +} + +// buildTypeNameMap - Run through symbol table looking for type names. +// + + +bool NVPTXAsmPrinter::isImageType(const Type *Ty) { + + std::map<const Type *, std::string>::iterator PI = TypeNameMap.find(Ty); + + if (PI != TypeNameMap.end() && + (!PI->second.compare("struct._image1d_t") || + !PI->second.compare("struct._image2d_t") || + !PI->second.compare("struct._image3d_t"))) + return true; + + return false; +} + +/// PrintAsmOperand - Print out an operand for an inline asm expression. +/// +bool NVPTXAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode, + raw_ostream &O) { + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. 
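+    // Only single-character modifiers are handled; e.g. the 'r' in an
+    // inline-asm template operand such as "${0:r}" lands in the switch
+    // below, and unrecognized letters go to the generic AsmPrinter handler.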
+ + switch (ExtraCode[0]) { + default: + // See if this is a generic print operand + return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); + case 'r': + break; + } + } + + printOperand(MI, OpNo, O); + + return false; +} + +bool NVPTXAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, + unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode, + raw_ostream &O) { + if (ExtraCode && ExtraCode[0]) + return true; // Unknown modifier + + O << '['; + printMemOperand(MI, OpNo, O); + O << ']'; + + return false; +} + +bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) +{ + switch(MI.getOpcode()) { + default: + return false; + case NVPTX::CallArgBeginInst: case NVPTX::CallArgEndInst0: + case NVPTX::CallArgEndInst1: case NVPTX::CallArgF32: + case NVPTX::CallArgF64: case NVPTX::CallArgI16: + case NVPTX::CallArgI32: case NVPTX::CallArgI32imm: + case NVPTX::CallArgI64: case NVPTX::CallArgI8: + case NVPTX::CallArgParam: case NVPTX::CallVoidInst: + case NVPTX::CallVoidInstReg: case NVPTX::Callseq_End: + case NVPTX::CallVoidInstReg64: + case NVPTX::DeclareParamInst: case NVPTX::DeclareRetMemInst: + case NVPTX::DeclareRetRegInst: case NVPTX::DeclareRetScalarInst: + case NVPTX::DeclareScalarParamInst: case NVPTX::DeclareScalarRegInst: + case NVPTX::StoreParamF32: case NVPTX::StoreParamF64: + case NVPTX::StoreParamI16: case NVPTX::StoreParamI32: + case NVPTX::StoreParamI64: case NVPTX::StoreParamI8: + case NVPTX::StoreParamS32I8: case NVPTX::StoreParamU32I8: + case NVPTX::StoreParamS32I16: case NVPTX::StoreParamU32I16: + case NVPTX::StoreParamScalar2F32: case NVPTX::StoreParamScalar2F64: + case NVPTX::StoreParamScalar2I16: case NVPTX::StoreParamScalar2I32: + case NVPTX::StoreParamScalar2I64: case NVPTX::StoreParamScalar2I8: + case NVPTX::StoreParamScalar4F32: case NVPTX::StoreParamScalar4I16: + case NVPTX::StoreParamScalar4I32: case NVPTX::StoreParamScalar4I8: + case NVPTX::StoreParamV2F32: case NVPTX::StoreParamV2F64: + case NVPTX::StoreParamV2I16: case NVPTX::StoreParamV2I32: + case NVPTX::StoreParamV2I64: case NVPTX::StoreParamV2I8: + case NVPTX::StoreParamV4F32: case NVPTX::StoreParamV4I16: + case NVPTX::StoreParamV4I32: case NVPTX::StoreParamV4I8: + case NVPTX::StoreRetvalF32: case NVPTX::StoreRetvalF64: + case NVPTX::StoreRetvalI16: case NVPTX::StoreRetvalI32: + case NVPTX::StoreRetvalI64: case NVPTX::StoreRetvalI8: + case NVPTX::StoreRetvalScalar2F32: case NVPTX::StoreRetvalScalar2F64: + case NVPTX::StoreRetvalScalar2I16: case NVPTX::StoreRetvalScalar2I32: + case NVPTX::StoreRetvalScalar2I64: case NVPTX::StoreRetvalScalar2I8: + case NVPTX::StoreRetvalScalar4F32: case NVPTX::StoreRetvalScalar4I16: + case NVPTX::StoreRetvalScalar4I32: case NVPTX::StoreRetvalScalar4I8: + case NVPTX::StoreRetvalV2F32: case NVPTX::StoreRetvalV2F64: + case NVPTX::StoreRetvalV2I16: case NVPTX::StoreRetvalV2I32: + case NVPTX::StoreRetvalV2I64: case NVPTX::StoreRetvalV2I8: + case NVPTX::StoreRetvalV4F32: case NVPTX::StoreRetvalV4I16: + case NVPTX::StoreRetvalV4I32: case NVPTX::StoreRetvalV4I8: + case NVPTX::LastCallArgF32: case NVPTX::LastCallArgF64: + case NVPTX::LastCallArgI16: case NVPTX::LastCallArgI32: + case NVPTX::LastCallArgI32imm: case NVPTX::LastCallArgI64: + case NVPTX::LastCallArgI8: case NVPTX::LastCallArgParam: + case NVPTX::LoadParamMemF32: case NVPTX::LoadParamMemF64: + case NVPTX::LoadParamMemI16: case NVPTX::LoadParamMemI32: + case NVPTX::LoadParamMemI64: case NVPTX::LoadParamMemI8: + case NVPTX::LoadParamRegF32: case NVPTX::LoadParamRegF64: + case NVPTX::LoadParamRegI16: case 
NVPTX::LoadParamRegI32: + case NVPTX::LoadParamRegI64: case NVPTX::LoadParamRegI8: + case NVPTX::LoadParamScalar2F32: case NVPTX::LoadParamScalar2F64: + case NVPTX::LoadParamScalar2I16: case NVPTX::LoadParamScalar2I32: + case NVPTX::LoadParamScalar2I64: case NVPTX::LoadParamScalar2I8: + case NVPTX::LoadParamScalar4F32: case NVPTX::LoadParamScalar4I16: + case NVPTX::LoadParamScalar4I32: case NVPTX::LoadParamScalar4I8: + case NVPTX::LoadParamV2F32: case NVPTX::LoadParamV2F64: + case NVPTX::LoadParamV2I16: case NVPTX::LoadParamV2I32: + case NVPTX::LoadParamV2I64: case NVPTX::LoadParamV2I8: + case NVPTX::LoadParamV4F32: case NVPTX::LoadParamV4I16: + case NVPTX::LoadParamV4I32: case NVPTX::LoadParamV4I8: + case NVPTX::PrototypeInst: case NVPTX::DBG_VALUE: + return true; + } + return false; +} + +// Force static initialization. +extern "C" void LLVMInitializeNVPTXBackendAsmPrinter() { + RegisterAsmPrinter<NVPTXAsmPrinter> X(TheNVPTXTarget32); + RegisterAsmPrinter<NVPTXAsmPrinter> Y(TheNVPTXTarget64); +} + + +void NVPTXAsmPrinter::emitSrcInText(StringRef filename, unsigned line) { + std::stringstream temp; + LineReader * reader = this->getReader(filename.str()); + temp << "\n//"; + temp << filename.str(); + temp << ":"; + temp << line; + temp << " "; + temp << reader->readLine(line); + temp << "\n"; + this->OutStreamer.EmitRawText(Twine(temp.str())); +} + + +LineReader *NVPTXAsmPrinter::getReader(std::string filename) { + if (reader == NULL) { + reader = new LineReader(filename); + } + + if (reader->fileName() != filename) { + delete reader; + reader = new LineReader(filename); + } + + return reader; +} + + +std::string +LineReader::readLine(unsigned lineNum) { + if (lineNum < theCurLine) { + theCurLine = 0; + fstr.seekg(0,std::ios::beg); + } + while (theCurLine < lineNum) { + fstr.getline(buff,500); + theCurLine++; + } + return buff; +} + +// Force static initialization. +extern "C" void LLVMInitializeNVPTXAsmPrinter() { + RegisterAsmPrinter<NVPTXAsmPrinter> X(TheNVPTXTarget32); + RegisterAsmPrinter<NVPTXAsmPrinter> Y(TheNVPTXTarget64); +} diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h new file mode 100644 index 0000000..6488b14 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -0,0 +1,315 @@ +//===-- NVPTXAsmPrinter.h - NVPTX LLVM assembly writer --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts from our internal representation +// of machine-dependent LLVM code to NVPTX assembly language. +// +//===----------------------------------------------------------------------===// + +#ifndef NVPTXASMPRINTER_H +#define NVPTXASMPRINTER_H + +#include "NVPTX.h" +#include "NVPTXTargetMachine.h" +#include "NVPTXSubtarget.h" +#include "llvm/Function.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Target/Mangler.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringExtras.h" +#include <fstream> + +// The ptx syntax and format is very different from that usually seem in a .s +// file, +// therefore we are not able to use the MCAsmStreamer interface here. 
+// +// We are handcrafting the output method here. +// +// A better approach is to clone the MCAsmStreamer to a MCPTXAsmStreamer +// (subclass of MCStreamer). + +// This is defined in AsmPrinter.cpp. +// Used to process the constant expressions in initializers. +namespace nvptx { +const llvm::MCExpr *LowerConstant(const llvm::Constant *CV, + llvm::AsmPrinter &AP) ; +} + +namespace llvm { + +class LineReader { +private: + unsigned theCurLine ; + std::ifstream fstr; + char buff[512]; + std::string theFileName; + SmallVector<unsigned, 32> lineOffset; +public: + LineReader(std::string filename) { + theCurLine = 0; + fstr.open(filename.c_str()); + theFileName = filename; + } + std::string fileName() { return theFileName; } + ~LineReader() { + fstr.close(); + } + std::string readLine(unsigned line); +}; + + + +class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { + + + class AggBuffer { + // Used to buffer the emitted string for initializing global + // aggregates. + // + // Normally an aggregate (array, vector or structure) is emitted + // as a u8[]. However, if one element/field of the aggregate + // is a non-NULL address, then the aggregate is emitted as u32[] + // or u64[]. + // + // We first layout the aggregate in 'buffer' in bytes, except for + // those symbol addresses. For the i-th symbol address in the + //aggregate, its corresponding 4-byte or 8-byte elements in 'buffer' + // are filled with 0s. symbolPosInBuffer[i-1] records its position + // in 'buffer', and Symbols[i-1] records the Value*. + // + // Once we have this AggBuffer setup, we can choose how to print + // it out. + public: + unsigned size; // size of the buffer in bytes + unsigned char *buffer; // the buffer + unsigned numSymbols; // number of symbol addresses + SmallVector<unsigned, 4> symbolPosInBuffer; + SmallVector<Value *, 4> Symbols; + + private: + unsigned curpos; + raw_ostream &O; + NVPTXAsmPrinter &AP; + + public: + AggBuffer(unsigned _size, raw_ostream &_O, NVPTXAsmPrinter &_AP) + :O(_O),AP(_AP) { + buffer = new unsigned char[_size]; + size = _size; + curpos = 0; + numSymbols = 0; + } + ~AggBuffer() { + delete [] buffer; + } + unsigned addBytes(unsigned char *Ptr, int Num, int Bytes) { + assert((curpos+Num) <= size); + assert((curpos+Bytes) <= size); + for ( int i= 0; i < Num; ++i) { + buffer[curpos] = Ptr[i]; + curpos ++; + } + for ( int i=Num; i < Bytes ; ++i) { + buffer[curpos] = 0; + curpos ++; + } + return curpos; + } + unsigned addZeros(int Num) { + assert((curpos+Num) <= size); + for ( int i= 0; i < Num; ++i) { + buffer[curpos] = 0; + curpos ++; + } + return curpos; + } + void addSymbol(Value *GVar) { + symbolPosInBuffer.push_back(curpos); + Symbols.push_back(GVar); + numSymbols++; + } + void print() { + if (numSymbols == 0) { + // print out in bytes + for (unsigned i=0; i<size; i++) { + if (i) + O << ", "; + O << (unsigned int)buffer[i]; + } + } else { + // print out in 4-bytes or 8-bytes + unsigned int pos = 0; + unsigned int nSym = 0; + unsigned int nextSymbolPos = symbolPosInBuffer[nSym]; + unsigned int nBytes = 4; + if (AP.nvptxSubtarget.is64Bit()) + nBytes = 8; + for (pos=0; pos<size; pos+=nBytes) { + if (pos) + O << ", "; + if (pos == nextSymbolPos) { + Value *v = Symbols[nSym]; + if (GlobalValue *GVar = dyn_cast<GlobalValue>(v)) { + MCSymbol *Name = AP.Mang->getSymbol(GVar); + O << *Name; + } + else if (ConstantExpr *Cexpr = + dyn_cast<ConstantExpr>(v)) { + O << *nvptx::LowerConstant(Cexpr, AP); + } else + llvm_unreachable("symbol type unknown"); + nSym++; + if (nSym >= numSymbols) 
+ nextSymbolPos = size+1; + else + nextSymbolPos = symbolPosInBuffer[nSym]; + } else + if (nBytes == 4) + O << *(unsigned int*)(buffer+pos); + else + O << *(unsigned long long*)(buffer+pos); + } + } + } + }; + + friend class AggBuffer; + + virtual void emitSrcInText(StringRef filename, unsigned line); + +private : + virtual const char *getPassName() const { + return "NVPTX Assembly Printer"; + } + + const Function *F; + std::string CurrentFnName; + + void EmitFunctionEntryLabel(); + void EmitFunctionBodyStart(); + void EmitFunctionBodyEnd(); + + void EmitInstruction(const MachineInstr *); + + void EmitAlignment(unsigned NumBits, const GlobalValue *GV = 0) const {} + + void printGlobalVariable(const GlobalVariable *GVar); + void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O, + const char *Modifier=0); + void printLdStCode(const MachineInstr *MI, int opNum, raw_ostream &O, + const char *Modifier=0); + void printVecModifiedImmediate(const MachineOperand &MO, + const char *Modifier, raw_ostream &O); + void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O, + const char *Modifier=0); + void printImplicitDef(const MachineInstr *MI, raw_ostream &O) const; + // definition autogenerated. + void printInstruction(const MachineInstr *MI, raw_ostream &O); + void printModuleLevelGV(GlobalVariable* GVar, raw_ostream &O, + bool=false); + void printParamName(int paramIndex, raw_ostream &O); + void printParamName(Function::const_arg_iterator I, int paramIndex, + raw_ostream &O); + void emitHeader(Module &M, raw_ostream &O); + void emitKernelFunctionDirectives(const Function& F, + raw_ostream &O) const; + void emitVirtualRegister(unsigned int vr, bool isVec, raw_ostream &O); + void emitFunctionExternParamList(const MachineFunction &MF); + void emitFunctionParamList(const Function *, raw_ostream &O); + void emitFunctionParamList(const MachineFunction &MF, raw_ostream &O); + void setAndEmitFunctionVirtualRegisters(const MachineFunction &MF); + void emitFunctionTempData(const MachineFunction &MF, + unsigned &FrameSize); + bool isImageType(const Type *Ty); + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &); + bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &); + void printReturnValStr(const Function *, raw_ostream &O); + void printReturnValStr(const MachineFunction &MF, raw_ostream &O); + +protected: + bool doInitialization(Module &M); + bool doFinalization(Module &M); + +private: + std::string CurrentBankselLabelInBasicBlock; + + // This is specific per MachineFunction. + const MachineRegisterInfo *MRI; + // The contents are specific for each + // MachineFunction. But the size of the + // array is not. + std::map<unsigned, unsigned> *VRidGlobal2LocalMap; + // cache the subtarget here. + const NVPTXSubtarget &nvptxSubtarget; + // Build the map between type name and ID based on module's type + // symbol table. + std::map<const Type *, std::string> TypeNameMap; + + // List of variables demoted to a function scope. 
+  std::map<const Function *, std::vector<GlobalVariable *> > localDecls;
+
+  // To record filename to ID mapping
+  std::map<std::string, unsigned> filenameMap;
+  void recordAndEmitFilenames(Module &);
+
+  void emitPTXGlobalVariable(const GlobalVariable *GVar, raw_ostream &O);
+  void emitPTXAddressSpace(unsigned int AddressSpace,
+                           raw_ostream &O) const;
+  std::string getPTXFundamentalTypeStr(const Type *Ty, bool=true) const;
+  void printScalarConstant(Constant *CPV, raw_ostream &O);
+  void printFPConstant(const ConstantFP *Fp, raw_ostream &O);
+  void bufferLEByte(Constant *CPV, int Bytes, AggBuffer *aggBuffer);
+  void bufferAggregateConstant(Constant *CV, AggBuffer *aggBuffer);
+
+  void printOperandProper(const MachineOperand &MO);
+
+  void emitLinkageDirective(const GlobalValue* V, raw_ostream &O);
+  void emitDeclarations(Module &, raw_ostream &O);
+  void emitDeclaration(const Function *, raw_ostream &O);
+
+  static const char *getRegisterName(unsigned RegNo);
+  void emitDemotedVars(const Function *, raw_ostream &);
+
+  LineReader *reader;
+  LineReader *getReader(std::string);
+public:
+  NVPTXAsmPrinter(TargetMachine &TM,
+                  MCStreamer &Streamer)
+  : AsmPrinter(TM, Streamer),
+    nvptxSubtarget(TM.getSubtarget<NVPTXSubtarget>()) {
+    CurrentBankselLabelInBasicBlock = "";
+    VRidGlobal2LocalMap = NULL;
+    reader = NULL;
+  }
+
+  ~NVPTXAsmPrinter() {
+    if (reader)
+      delete reader;
+  }
+
+  bool ignoreLoc(const MachineInstr &);
+
+  virtual void getVirtualRegisterName(unsigned, bool, raw_ostream &);
+
+  DebugLoc prevDebugLoc;
+  void emitLineNumberAsDotLoc(const MachineInstr &);
+};
+} // end of namespace
+
+#endif
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
new file mode 100644
index 0000000..a9abc00
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -0,0 +1,76 @@
+//=======- NVPTXFrameLowering.cpp - NVPTX Frame Information ---*- C++ -*-=====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of the TargetFrameLowering
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXFrameLowering.h"
+#include "NVPTX.h"
+#include "NVPTXRegisterInfo.h"
+#include "NVPTXSubtarget.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+bool NVPTXFrameLowering::hasFP(const MachineFunction &MF) const {
+  return true;
+}
+
+void NVPTXFrameLowering::emitPrologue(MachineFunction &MF) const {
+  if (MF.getFrameInfo()->hasStackObjects()) {
+    MachineBasicBlock &MBB = MF.front();
+    // Insert "mov.u32 %SP, %Depot"
+    MachineBasicBlock::iterator MBBI = MBB.begin();
+    // This instruction really occurs before the first instruction in the
+    // BB, so it is given no debug location.
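+    // With generic ld/st support the prologue built below is roughly
+    //   mov.u64        %SPL, %Depot;
+    //   cvta.local.u64 %SP, %SPL;
+    // and without it, %SP is simply a copy of %Depot.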
+ DebugLoc dl = DebugLoc(); + + if (tm.getSubtargetImpl()->hasGenericLdSt()) { + // mov %SPL, %depot; + // cvta.local %SP, %SPL; + if (is64bit) { + MachineInstr *MI = BuildMI(MBB, MBBI, dl, + tm.getInstrInfo()->get(NVPTX::cvta_local_yes_64), + NVPTX::VRFrame).addReg(NVPTX::VRFrameLocal); + BuildMI(MBB, MI, dl, + tm.getInstrInfo()->get(NVPTX::IMOV64rr), NVPTX::VRFrameLocal) + .addReg(NVPTX::VRDepot); + } else { + MachineInstr *MI = BuildMI(MBB, MBBI, dl, + tm.getInstrInfo()->get(NVPTX::cvta_local_yes), + NVPTX::VRFrame).addReg(NVPTX::VRFrameLocal); + BuildMI(MBB, MI, dl, + tm.getInstrInfo()->get(NVPTX::IMOV32rr), NVPTX::VRFrameLocal) + .addReg(NVPTX::VRDepot); + } + } + else { + // mov %SP, %depot; + if (is64bit) + BuildMI(MBB, MBBI, dl, + tm.getInstrInfo()->get(NVPTX::IMOV64rr), NVPTX::VRFrame) + .addReg(NVPTX::VRDepot); + else + BuildMI(MBB, MBBI, dl, + tm.getInstrInfo()->get(NVPTX::IMOV32rr), NVPTX::VRFrame) + .addReg(NVPTX::VRDepot); + } + } +} + +void NVPTXFrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { +} diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.h b/lib/Target/NVPTX/NVPTXFrameLowering.h new file mode 100644 index 0000000..ee87b39 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXFrameLowering.h @@ -0,0 +1,40 @@ +//===--- NVPTXFrameLowering.h - Define frame lowering for NVPTX -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +// +//===----------------------------------------------------------------------===// + +#ifndef NVPTX_FRAMELOWERING_H +#define NVPTX_FRAMELOWERING_H + +#include "llvm/Target/TargetFrameLowering.h" + + +namespace llvm { +class NVPTXTargetMachine; + +class NVPTXFrameLowering : public TargetFrameLowering { + NVPTXTargetMachine &tm; + bool is64bit; + +public: + explicit NVPTXFrameLowering(NVPTXTargetMachine &_tm, bool _is64bit) + : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0), + tm(_tm), is64bit(_is64bit) {} + + virtual bool hasFP(const MachineFunction &MF) const; + virtual void emitPrologue(MachineFunction &MF) const; + virtual void emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const; +}; + +} // End llvm namespace + +#endif diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp new file mode 100644 index 0000000..4e92f0e --- /dev/null +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -0,0 +1,683 @@ +//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines an instruction selector for the NVPTX target. 
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "llvm/Instructions.h"
+#include "llvm/Support/raw_ostream.h"
+#include "NVPTXISelDAGToDAG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/GlobalValue.h"
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "nvptx-isel"
+
+using namespace llvm;
+
+
+static cl::opt<bool>
+UseFMADInstruction("nvptx-mad-enable",
+                   cl::ZeroOrMore,
+                   cl::desc("NVPTX Specific: Enable generating FMAD instructions"),
+                   cl::init(false));
+
+static cl::opt<int>
+FMAContractLevel("nvptx-fma-level",
+                 cl::ZeroOrMore,
+                 cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
+                          " 1: do it, 2: do it aggressively)"),
+                 cl::init(2));
+
+
+static cl::opt<int>
+UsePrecDivF32("nvptx-prec-divf32",
+              cl::ZeroOrMore,
+              cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
+                       " IEEE-compliant F32 div.rnd if available."),
+              cl::init(2));
+
+/// createNVPTXISelDag - This pass converts a legalized DAG into a
+/// NVPTX-specific DAG, ready for instruction scheduling.
+FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
+                                       llvm::CodeGenOpt::Level OptLevel) {
+  return new NVPTXDAGToDAGISel(TM, OptLevel);
+}
+
+
+NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
+                                     CodeGenOpt::Level OptLevel)
+: SelectionDAGISel(tm, OptLevel),
+  Subtarget(tm.getSubtarget<NVPTXSubtarget>())
+{
+  // Always do fma.f32 fpcontract if the target supports the instruction.
+  // Always do fma.f64 fpcontract if the target supports the instruction.
+  // Do mad.f32 if nvptx-mad-enable is specified and the target does not
+  // support fma.f32.
+
+  doFMADF32 = (OptLevel > 0) && UseFMADInstruction && !Subtarget.hasFMAF32();
+  doFMAF32 = (OptLevel > 0) && Subtarget.hasFMAF32() &&
+      (FMAContractLevel>=1);
+  doFMAF64 = (OptLevel > 0) && Subtarget.hasFMAF64() &&
+      (FMAContractLevel>=1);
+  doFMAF32AGG = (OptLevel > 0) && Subtarget.hasFMAF32() &&
+      (FMAContractLevel==2);
+  doFMAF64AGG = (OptLevel > 0) && Subtarget.hasFMAF64() &&
+      (FMAContractLevel==2);
+
+  allowFMA = (FMAContractLevel >= 1) || UseFMADInstruction;
+
+  UseF32FTZ = false;
+
+  doMulWide = (OptLevel > 0);
+
+  // Decide how to translate f32 div
+  do_DIVF32_PREC = UsePrecDivF32;
+  // sm less than sm_20 does not support div.rnd. Use div.full.
+  if (do_DIVF32_PREC == 2 && !Subtarget.reqPTX20())
+    do_DIVF32_PREC = 1;
+
+}
+
+/// Select - Select instructions not customized! Used for
+/// expanded, promoted and normal instructions.
+SDNode* NVPTXDAGToDAGISel::Select(SDNode *N) {
+
+  if (N->isMachineOpcode())
+    return NULL;   // Already selected.
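+
+  // Loads and stores are selected by hand below so that the PTX-specific
+  // operands (address space, vector arity, type and width) can be attached;
+  // all other opcodes fall through to the auto-generated matcher.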
+ + SDNode *ResNode = NULL; + switch (N->getOpcode()) { + case ISD::LOAD: + ResNode = SelectLoad(N); + break; + case ISD::STORE: + ResNode = SelectStore(N); + break; + } + if (ResNode) + return ResNode; + return SelectCode(N); +} + + +static unsigned int +getCodeAddrSpace(MemSDNode *N, const NVPTXSubtarget &Subtarget) +{ + const Value *Src = N->getSrcValue(); + if (!Src) + return NVPTX::PTXLdStInstCode::LOCAL; + + if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) { + switch (PT->getAddressSpace()) { + case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL; + case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL; + case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED; + case llvm::ADDRESS_SPACE_CONST_NOT_GEN: + return NVPTX::PTXLdStInstCode::CONSTANT; + case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC; + case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM; + case llvm::ADDRESS_SPACE_CONST: + // If the arch supports generic address space, translate it to GLOBAL + // for correctness. + // If the arch does not support generic address space, then the arch + // does not really support ADDRESS_SPACE_CONST, translate it to + // to CONSTANT for better performance. + if (Subtarget.hasGenericLdSt()) + return NVPTX::PTXLdStInstCode::GLOBAL; + else + return NVPTX::PTXLdStInstCode::CONSTANT; + default: break; + } + } + return NVPTX::PTXLdStInstCode::LOCAL; +} + + +SDNode* NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { + DebugLoc dl = N->getDebugLoc(); + LoadSDNode *LD = cast<LoadSDNode>(N); + EVT LoadedVT = LD->getMemoryVT(); + SDNode *NVPTXLD= NULL; + + // do not support pre/post inc/dec + if (LD->isIndexed()) + return NULL; + + if (!LoadedVT.isSimple()) + return NULL; + + // Address Space Setting + unsigned int codeAddrSpace = getCodeAddrSpace(LD, Subtarget); + + // Volatile Setting + // - .volatile is only availalble for .global and .shared + bool isVolatile = LD->isVolatile(); + if (codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && + codeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && + codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) + isVolatile = false; + + // Vector Setting + MVT SimpleVT = LoadedVT.getSimpleVT(); + unsigned vecType = NVPTX::PTXLdStInstCode::Scalar; + if (SimpleVT.isVector()) { + unsigned num = SimpleVT.getVectorNumElements(); + if (num == 2) + vecType = NVPTX::PTXLdStInstCode::V2; + else if (num == 4) + vecType = NVPTX::PTXLdStInstCode::V4; + else + return NULL; + } + + // Type Setting: fromType + fromTypeWidth + // + // Sign : ISD::SEXTLOAD + // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the + // type is integer + // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float + MVT ScalarVT = SimpleVT.getScalarType(); + unsigned fromTypeWidth = ScalarVT.getSizeInBits(); + unsigned int fromType; + if ((LD->getExtensionType() == ISD::SEXTLOAD)) + fromType = NVPTX::PTXLdStInstCode::Signed; + else if (ScalarVT.isFloatingPoint()) + fromType = NVPTX::PTXLdStInstCode::Float; + else + fromType = NVPTX::PTXLdStInstCode::Unsigned; + + // Create the machine instruction DAG + SDValue Chain = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue Addr; + SDValue Offset, Base; + unsigned Opcode; + MVT::SimpleValueType TargetVT = LD->getValueType(0).getSimpleVT().SimpleTy; + + if (SelectDirectAddr(N1, Addr)) { + switch (TargetVT) { + case MVT::i8: Opcode = NVPTX::LD_i8_avar; break; + case MVT::i16: Opcode = NVPTX::LD_i16_avar; break; + case MVT::i32: Opcode = 
NVPTX::LD_i32_avar; break; + case MVT::i64: Opcode = NVPTX::LD_i64_avar; break; + case MVT::f32: Opcode = NVPTX::LD_f32_avar; break; + case MVT::f64: Opcode = NVPTX::LD_f64_avar; break; + case MVT::v2i8: Opcode = NVPTX::LD_v2i8_avar; break; + case MVT::v2i16: Opcode = NVPTX::LD_v2i16_avar; break; + case MVT::v2i32: Opcode = NVPTX::LD_v2i32_avar; break; + case MVT::v2i64: Opcode = NVPTX::LD_v2i64_avar; break; + case MVT::v2f32: Opcode = NVPTX::LD_v2f32_avar; break; + case MVT::v2f64: Opcode = NVPTX::LD_v2f64_avar; break; + case MVT::v4i8: Opcode = NVPTX::LD_v4i8_avar; break; + case MVT::v4i16: Opcode = NVPTX::LD_v4i16_avar; break; + case MVT::v4i32: Opcode = NVPTX::LD_v4i32_avar; break; + case MVT::v4f32: Opcode = NVPTX::LD_v4f32_avar; break; + default: return NULL; + } + SDValue Ops[] = { getI32Imm(isVolatile), + getI32Imm(codeAddrSpace), + getI32Imm(vecType), + getI32Imm(fromType), + getI32Imm(fromTypeWidth), + Addr, Chain }; + NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, + MVT::Other, Ops, 7); + } else if (Subtarget.is64Bit()? + SelectADDRsi64(N1.getNode(), N1, Base, Offset): + SelectADDRsi(N1.getNode(), N1, Base, Offset)) { + switch (TargetVT) { + case MVT::i8: Opcode = NVPTX::LD_i8_asi; break; + case MVT::i16: Opcode = NVPTX::LD_i16_asi; break; + case MVT::i32: Opcode = NVPTX::LD_i32_asi; break; + case MVT::i64: Opcode = NVPTX::LD_i64_asi; break; + case MVT::f32: Opcode = NVPTX::LD_f32_asi; break; + case MVT::f64: Opcode = NVPTX::LD_f64_asi; break; + case MVT::v2i8: Opcode = NVPTX::LD_v2i8_asi; break; + case MVT::v2i16: Opcode = NVPTX::LD_v2i16_asi; break; + case MVT::v2i32: Opcode = NVPTX::LD_v2i32_asi; break; + case MVT::v2i64: Opcode = NVPTX::LD_v2i64_asi; break; + case MVT::v2f32: Opcode = NVPTX::LD_v2f32_asi; break; + case MVT::v2f64: Opcode = NVPTX::LD_v2f64_asi; break; + case MVT::v4i8: Opcode = NVPTX::LD_v4i8_asi; break; + case MVT::v4i16: Opcode = NVPTX::LD_v4i16_asi; break; + case MVT::v4i32: Opcode = NVPTX::LD_v4i32_asi; break; + case MVT::v4f32: Opcode = NVPTX::LD_v4f32_asi; break; + default: return NULL; + } + SDValue Ops[] = { getI32Imm(isVolatile), + getI32Imm(codeAddrSpace), + getI32Imm(vecType), + getI32Imm(fromType), + getI32Imm(fromTypeWidth), + Base, Offset, Chain }; + NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, + MVT::Other, Ops, 8); + } else if (Subtarget.is64Bit()? 
+ SelectADDRri64(N1.getNode(), N1, Base, Offset): + SelectADDRri(N1.getNode(), N1, Base, Offset)) { + switch (TargetVT) { + case MVT::i8: Opcode = NVPTX::LD_i8_ari; break; + case MVT::i16: Opcode = NVPTX::LD_i16_ari; break; + case MVT::i32: Opcode = NVPTX::LD_i32_ari; break; + case MVT::i64: Opcode = NVPTX::LD_i64_ari; break; + case MVT::f32: Opcode = NVPTX::LD_f32_ari; break; + case MVT::f64: Opcode = NVPTX::LD_f64_ari; break; + case MVT::v2i8: Opcode = NVPTX::LD_v2i8_ari; break; + case MVT::v2i16: Opcode = NVPTX::LD_v2i16_ari; break; + case MVT::v2i32: Opcode = NVPTX::LD_v2i32_ari; break; + case MVT::v2i64: Opcode = NVPTX::LD_v2i64_ari; break; + case MVT::v2f32: Opcode = NVPTX::LD_v2f32_ari; break; + case MVT::v2f64: Opcode = NVPTX::LD_v2f64_ari; break; + case MVT::v4i8: Opcode = NVPTX::LD_v4i8_ari; break; + case MVT::v4i16: Opcode = NVPTX::LD_v4i16_ari; break; + case MVT::v4i32: Opcode = NVPTX::LD_v4i32_ari; break; + case MVT::v4f32: Opcode = NVPTX::LD_v4f32_ari; break; + default: return NULL; + } + SDValue Ops[] = { getI32Imm(isVolatile), + getI32Imm(codeAddrSpace), + getI32Imm(vecType), + getI32Imm(fromType), + getI32Imm(fromTypeWidth), + Base, Offset, Chain }; + NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, + MVT::Other, Ops, 8); + } + else { + switch (TargetVT) { + case MVT::i8: Opcode = NVPTX::LD_i8_areg; break; + case MVT::i16: Opcode = NVPTX::LD_i16_areg; break; + case MVT::i32: Opcode = NVPTX::LD_i32_areg; break; + case MVT::i64: Opcode = NVPTX::LD_i64_areg; break; + case MVT::f32: Opcode = NVPTX::LD_f32_areg; break; + case MVT::f64: Opcode = NVPTX::LD_f64_areg; break; + case MVT::v2i8: Opcode = NVPTX::LD_v2i8_areg; break; + case MVT::v2i16: Opcode = NVPTX::LD_v2i16_areg; break; + case MVT::v2i32: Opcode = NVPTX::LD_v2i32_areg; break; + case MVT::v2i64: Opcode = NVPTX::LD_v2i64_areg; break; + case MVT::v2f32: Opcode = NVPTX::LD_v2f32_areg; break; + case MVT::v2f64: Opcode = NVPTX::LD_v2f64_areg; break; + case MVT::v4i8: Opcode = NVPTX::LD_v4i8_areg; break; + case MVT::v4i16: Opcode = NVPTX::LD_v4i16_areg; break; + case MVT::v4i32: Opcode = NVPTX::LD_v4i32_areg; break; + case MVT::v4f32: Opcode = NVPTX::LD_v4f32_areg; break; + default: return NULL; + } + SDValue Ops[] = { getI32Imm(isVolatile), + getI32Imm(codeAddrSpace), + getI32Imm(vecType), + getI32Imm(fromType), + getI32Imm(fromTypeWidth), + N1, Chain }; + NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, + MVT::Other, Ops, 7); + } + + if (NVPTXLD != NULL) { + MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); + MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(NVPTXLD)->setMemRefs(MemRefs0, MemRefs0 + 1); + } + + return NVPTXLD; +} + +SDNode* NVPTXDAGToDAGISel::SelectStore(SDNode *N) { + DebugLoc dl = N->getDebugLoc(); + StoreSDNode *ST = cast<StoreSDNode>(N); + EVT StoreVT = ST->getMemoryVT(); + SDNode *NVPTXST = NULL; + + // do not support pre/post inc/dec + if (ST->isIndexed()) + return NULL; + + if (!StoreVT.isSimple()) + return NULL; + + // Address Space Setting + unsigned int codeAddrSpace = getCodeAddrSpace(ST, Subtarget); + + // Volatile Setting + // - .volatile is only availalble for .global and .shared + bool isVolatile = ST->isVolatile(); + if (codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && + codeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && + codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) + isVolatile = false; + + // Vector Setting + MVT SimpleVT = StoreVT.getSimpleVT(); + unsigned vecType = NVPTX::PTXLdStInstCode::Scalar; + if (SimpleVT.isVector()) { 
+ unsigned num = SimpleVT.getVectorNumElements(); + if (num == 2) + vecType = NVPTX::PTXLdStInstCode::V2; + else if (num == 4) + vecType = NVPTX::PTXLdStInstCode::V4; + else + return NULL; + } + + // Type Setting: toType + toTypeWidth + // - for integer type, always use 'u' + // + MVT ScalarVT = SimpleVT.getScalarType(); + unsigned toTypeWidth = ScalarVT.getSizeInBits(); + unsigned int toType; + if (ScalarVT.isFloatingPoint()) + toType = NVPTX::PTXLdStInstCode::Float; + else + toType = NVPTX::PTXLdStInstCode::Unsigned; + + // Create the machine instruction DAG + SDValue Chain = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue N2 = N->getOperand(2); + SDValue Addr; + SDValue Offset, Base; + unsigned Opcode; + MVT::SimpleValueType SourceVT = + N1.getNode()->getValueType(0).getSimpleVT().SimpleTy; + + if (SelectDirectAddr(N2, Addr)) { + switch (SourceVT) { + case MVT::i8: Opcode = NVPTX::ST_i8_avar; break; + case MVT::i16: Opcode = NVPTX::ST_i16_avar; break; + case MVT::i32: Opcode = NVPTX::ST_i32_avar; break; + case MVT::i64: Opcode = NVPTX::ST_i64_avar; break; + case MVT::f32: Opcode = NVPTX::ST_f32_avar; break; + case MVT::f64: Opcode = NVPTX::ST_f64_avar; break; + case MVT::v2i8: Opcode = NVPTX::ST_v2i8_avar; break; + case MVT::v2i16: Opcode = NVPTX::ST_v2i16_avar; break; + case MVT::v2i32: Opcode = NVPTX::ST_v2i32_avar; break; + case MVT::v2i64: Opcode = NVPTX::ST_v2i64_avar; break; + case MVT::v2f32: Opcode = NVPTX::ST_v2f32_avar; break; + case MVT::v2f64: Opcode = NVPTX::ST_v2f64_avar; break; + case MVT::v4i8: Opcode = NVPTX::ST_v4i8_avar; break; + case MVT::v4i16: Opcode = NVPTX::ST_v4i16_avar; break; + case MVT::v4i32: Opcode = NVPTX::ST_v4i32_avar; break; + case MVT::v4f32: Opcode = NVPTX::ST_v4f32_avar; break; + default: return NULL; + } + SDValue Ops[] = { N1, + getI32Imm(isVolatile), + getI32Imm(codeAddrSpace), + getI32Imm(vecType), + getI32Imm(toType), + getI32Imm(toTypeWidth), + Addr, Chain }; + NVPTXST = CurDAG->getMachineNode(Opcode, dl, + MVT::Other, Ops, 8); + } else if (Subtarget.is64Bit()? + SelectADDRsi64(N2.getNode(), N2, Base, Offset): + SelectADDRsi(N2.getNode(), N2, Base, Offset)) { + switch (SourceVT) { + case MVT::i8: Opcode = NVPTX::ST_i8_asi; break; + case MVT::i16: Opcode = NVPTX::ST_i16_asi; break; + case MVT::i32: Opcode = NVPTX::ST_i32_asi; break; + case MVT::i64: Opcode = NVPTX::ST_i64_asi; break; + case MVT::f32: Opcode = NVPTX::ST_f32_asi; break; + case MVT::f64: Opcode = NVPTX::ST_f64_asi; break; + case MVT::v2i8: Opcode = NVPTX::ST_v2i8_asi; break; + case MVT::v2i16: Opcode = NVPTX::ST_v2i16_asi; break; + case MVT::v2i32: Opcode = NVPTX::ST_v2i32_asi; break; + case MVT::v2i64: Opcode = NVPTX::ST_v2i64_asi; break; + case MVT::v2f32: Opcode = NVPTX::ST_v2f32_asi; break; + case MVT::v2f64: Opcode = NVPTX::ST_v2f64_asi; break; + case MVT::v4i8: Opcode = NVPTX::ST_v4i8_asi; break; + case MVT::v4i16: Opcode = NVPTX::ST_v4i16_asi; break; + case MVT::v4i32: Opcode = NVPTX::ST_v4i32_asi; break; + case MVT::v4f32: Opcode = NVPTX::ST_v4f32_asi; break; + default: return NULL; + } + SDValue Ops[] = { N1, + getI32Imm(isVolatile), + getI32Imm(codeAddrSpace), + getI32Imm(vecType), + getI32Imm(toType), + getI32Imm(toTypeWidth), + Base, Offset, Chain }; + NVPTXST = CurDAG->getMachineNode(Opcode, dl, + MVT::Other, Ops, 9); + } else if (Subtarget.is64Bit()? 
+ SelectADDRri64(N2.getNode(), N2, Base, Offset): + SelectADDRri(N2.getNode(), N2, Base, Offset)) { + switch (SourceVT) { + case MVT::i8: Opcode = NVPTX::ST_i8_ari; break; + case MVT::i16: Opcode = NVPTX::ST_i16_ari; break; + case MVT::i32: Opcode = NVPTX::ST_i32_ari; break; + case MVT::i64: Opcode = NVPTX::ST_i64_ari; break; + case MVT::f32: Opcode = NVPTX::ST_f32_ari; break; + case MVT::f64: Opcode = NVPTX::ST_f64_ari; break; + case MVT::v2i8: Opcode = NVPTX::ST_v2i8_ari; break; + case MVT::v2i16: Opcode = NVPTX::ST_v2i16_ari; break; + case MVT::v2i32: Opcode = NVPTX::ST_v2i32_ari; break; + case MVT::v2i64: Opcode = NVPTX::ST_v2i64_ari; break; + case MVT::v2f32: Opcode = NVPTX::ST_v2f32_ari; break; + case MVT::v2f64: Opcode = NVPTX::ST_v2f64_ari; break; + case MVT::v4i8: Opcode = NVPTX::ST_v4i8_ari; break; + case MVT::v4i16: Opcode = NVPTX::ST_v4i16_ari; break; + case MVT::v4i32: Opcode = NVPTX::ST_v4i32_ari; break; + case MVT::v4f32: Opcode = NVPTX::ST_v4f32_ari; break; + default: return NULL; + } + SDValue Ops[] = { N1, + getI32Imm(isVolatile), + getI32Imm(codeAddrSpace), + getI32Imm(vecType), + getI32Imm(toType), + getI32Imm(toTypeWidth), + Base, Offset, Chain }; + NVPTXST = CurDAG->getMachineNode(Opcode, dl, + MVT::Other, Ops, 9); + } else { + switch (SourceVT) { + case MVT::i8: Opcode = NVPTX::ST_i8_areg; break; + case MVT::i16: Opcode = NVPTX::ST_i16_areg; break; + case MVT::i32: Opcode = NVPTX::ST_i32_areg; break; + case MVT::i64: Opcode = NVPTX::ST_i64_areg; break; + case MVT::f32: Opcode = NVPTX::ST_f32_areg; break; + case MVT::f64: Opcode = NVPTX::ST_f64_areg; break; + case MVT::v2i8: Opcode = NVPTX::ST_v2i8_areg; break; + case MVT::v2i16: Opcode = NVPTX::ST_v2i16_areg; break; + case MVT::v2i32: Opcode = NVPTX::ST_v2i32_areg; break; + case MVT::v2i64: Opcode = NVPTX::ST_v2i64_areg; break; + case MVT::v2f32: Opcode = NVPTX::ST_v2f32_areg; break; + case MVT::v2f64: Opcode = NVPTX::ST_v2f64_areg; break; + case MVT::v4i8: Opcode = NVPTX::ST_v4i8_areg; break; + case MVT::v4i16: Opcode = NVPTX::ST_v4i16_areg; break; + case MVT::v4i32: Opcode = NVPTX::ST_v4i32_areg; break; + case MVT::v4f32: Opcode = NVPTX::ST_v4f32_areg; break; + default: return NULL; + } + SDValue Ops[] = { N1, + getI32Imm(isVolatile), + getI32Imm(codeAddrSpace), + getI32Imm(vecType), + getI32Imm(toType), + getI32Imm(toTypeWidth), + N2, Chain }; + NVPTXST = CurDAG->getMachineNode(Opcode, dl, + MVT::Other, Ops, 8); + } + + if (NVPTXST != NULL) { + MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); + MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(NVPTXST)->setMemRefs(MemRefs0, MemRefs0 + 1); + } + + return NVPTXST; +} + +// SelectDirectAddr - Match a direct address for DAG. +// A direct address could be a globaladdress or externalsymbol. +bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) { + // Return true if TGA or ES. 
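+  // A successful match here lets the access use the _avar instruction
+  // forms, e.g. "ld.global.f32 %f1, [gvar];" for some global gvar, where
+  // the address is a symbol rather than a register.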
+ if (N.getOpcode() == ISD::TargetGlobalAddress + || N.getOpcode() == ISD::TargetExternalSymbol) { + Address = N; + return true; + } + if (N.getOpcode() == NVPTXISD::Wrapper) { + Address = N.getOperand(0); + return true; + } + if (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN) { + unsigned IID = cast<ConstantSDNode>(N.getOperand(0))->getZExtValue(); + if (IID == Intrinsic::nvvm_ptr_gen_to_param) + if (N.getOperand(1).getOpcode() == NVPTXISD::MoveParam) + return (SelectDirectAddr(N.getOperand(1).getOperand(0), Address)); + } + return false; +} + +// symbol+offset +bool NVPTXDAGToDAGISel::SelectADDRsi_imp(SDNode *OpNode, SDValue Addr, + SDValue &Base, SDValue &Offset, + MVT mvt) { + if (Addr.getOpcode() == ISD::ADD) { + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) { + SDValue base=Addr.getOperand(0); + if (SelectDirectAddr(base, Base)) { + Offset = CurDAG->getTargetConstant(CN->getZExtValue(), mvt); + return true; + } + } + } + return false; +} + +// symbol+offset +bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr, + SDValue &Base, SDValue &Offset) { + return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32); +} + +// symbol+offset +bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr, + SDValue &Base, SDValue &Offset) { + return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64); +} + +// register+offset +bool NVPTXDAGToDAGISel::SelectADDRri_imp(SDNode *OpNode, SDValue Addr, + SDValue &Base, SDValue &Offset, + MVT mvt) { + if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { + Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt); + Offset = CurDAG->getTargetConstant(0, mvt); + return true; + } + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) + return false; // direct calls. + + if (Addr.getOpcode() == ISD::ADD) { + if (SelectDirectAddr(Addr.getOperand(0), Addr)) { + return false; + } + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) { + if (FrameIndexSDNode *FIN = + dyn_cast<FrameIndexSDNode>(Addr.getOperand(0))) + // Constant offset from frame ref. + Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt); + else + Base = Addr.getOperand(0); + Offset = CurDAG->getTargetConstant(CN->getZExtValue(), mvt); + return true; + } + } + return false; +} + +// register+offset +bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr, + SDValue &Base, SDValue &Offset) { + return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32); +} + +// register+offset +bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr, + SDValue &Base, SDValue &Offset) { + return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64); +} + +bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N, + unsigned int spN) const { + const Value *Src = NULL; + // Even though MemIntrinsicSDNode is a subclas of MemSDNode, + // the classof() for MemSDNode does not include MemIntrinsicSDNode + // (See SelectionDAGNodes.h). So we need to check for both. + if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) { + Src = mN->getSrcValue(); + } + else if (MemSDNode *mN = dyn_cast<MemIntrinsicSDNode>(N)) { + Src = mN->getSrcValue(); + } + if (!Src) + return false; + if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) + return (PT->getAddressSpace() == spN); + return false; +} + +/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for +/// inline asm expressions. 
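+/// A PTX "m" operand is returned as a base plus constant offset, which the
+/// printer renders inside square brackets, e.g. "[%SP+8]".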
+bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                                     char ConstraintCode,
+                                                     std::vector<SDValue> &OutOps) {
+  SDValue Op0, Op1;
+  switch (ConstraintCode) {
+  default: return true;
+  case 'm':   // memory
+    if (SelectDirectAddr(Op, Op0)) {
+      OutOps.push_back(Op0);
+      OutOps.push_back(CurDAG->getTargetConstant(0, MVT::i32));
+      return false;
+    }
+    if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
+      OutOps.push_back(Op0);
+      OutOps.push_back(Op1);
+      return false;
+    }
+    break;
+  }
+  return true;
+}
+
+// Return true if N is an undef or a constant.
+// If N was undef, return a (i8imm 0) in Retval
+// If N was imm, convert it to i8imm and return in Retval
+// Note: The conversion to i8imm is required, otherwise the
+// pattern matcher inserts a bunch of IMOVi8rr to convert
+// the imm to i8imm, and this causes instruction selection
+// to fail.
+bool NVPTXDAGToDAGISel::UndefOrImm(SDValue Op, SDValue N,
+                                   SDValue &Retval) {
+  if (!(N.getOpcode() == ISD::UNDEF) &&
+      !(N.getOpcode() == ISD::Constant))
+    return false;
+
+  if (N.getOpcode() == ISD::UNDEF)
+    Retval = CurDAG->getTargetConstant(0, MVT::i8);
+  else {
+    ConstantSDNode *cn = cast<ConstantSDNode>(N.getNode());
+    unsigned retval = cn->getZExtValue();
+    Retval = CurDAG->getTargetConstant(retval, MVT::i8);
+  }
+  return true;
+}
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
new file mode 100644
index 0000000..ccd69b2
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -0,0 +1,105 @@
+//===-- NVPTXISelDAGToDAG.h - A dag to dag inst selector for NVPTX --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the NVPTX target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "nvptx-isel"
+
+#include "NVPTX.h"
+#include "NVPTXISelLowering.h"
+#include "NVPTXRegisterInfo.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Intrinsics.h"
+using namespace llvm;
+
+namespace {
+
+class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
+
+  // If true, generate corresponding FPCONTRACT. This is
+  // language-dependent (i.e. CUDA and OpenCL work differently).
+  bool doFMADF32;
+  bool doFMAF64;
+  bool doFMAF32;
+  bool doFMAF64AGG;
+  bool doFMAF32AGG;
+  bool allowFMA;
+
+  // 0: use div.approx
+  // 1: use div.full
+  // 2: For sm_20 and later, IEEE-compliant div.rnd.f32 can be generated;
+  //    otherwise, use div.full
+  int do_DIVF32_PREC;
+
+  // If true, add .ftz to f32 instructions.
+  // This is only meaningful for sm_20 and later, as the default
+  // is not ftz.
+  // For sm earlier than sm_20, f32 denorms are always flushed to zero
+  // by the hardware.
+  // We always add the .ftz modifier regardless of the sm value
+  // when UseF32FTZ is true.
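+  // (.ftz flushes subnormal f32 inputs and results to sign-preserving zero,
+  // as in add.rn.ftz.f32.)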
+ bool UseF32FTZ; + + // If true, generate mul.wide from sext and mul + bool doMulWide; + +public: + explicit NVPTXDAGToDAGISel(NVPTXTargetMachine &tm, + CodeGenOpt::Level OptLevel); + + // Pass Name + virtual const char *getPassName() const { + return "NVPTX DAG->DAG Pattern Instruction Selection"; + } + + const NVPTXSubtarget &Subtarget; + + virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op, + char ConstraintCode, + std::vector<SDValue> &OutOps); +private: + // Include the pieces autogenerated from the target description. +#include "NVPTXGenDAGISel.inc" + + SDNode *Select(SDNode *N); + SDNode* SelectLoad(SDNode *N); + SDNode* SelectStore(SDNode *N); + + inline SDValue getI32Imm(unsigned Imm) { + return CurDAG->getTargetConstant(Imm, MVT::i32); + } + + // Match direct address complex pattern. + bool SelectDirectAddr(SDValue N, SDValue &Address); + + bool SelectADDRri_imp(SDNode *OpNode, SDValue Addr, SDValue &Base, + SDValue &Offset, MVT mvt); + bool SelectADDRri(SDNode *OpNode, SDValue Addr, SDValue &Base, + SDValue &Offset); + bool SelectADDRri64(SDNode *OpNode, SDValue Addr, SDValue &Base, + SDValue &Offset); + + bool SelectADDRsi_imp(SDNode *OpNode, SDValue Addr, SDValue &Base, + SDValue &Offset, MVT mvt); + bool SelectADDRsi(SDNode *OpNode, SDValue Addr, SDValue &Base, + SDValue &Offset); + bool SelectADDRsi64(SDNode *OpNode, SDValue Addr, SDValue &Base, + SDValue &Offset); + + + bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const; + + bool UndefOrImm(SDValue Op, SDValue N, SDValue &Retval); + +}; +} diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp new file mode 100644 index 0000000..6ea10ea --- /dev/null +++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -0,0 +1,1291 @@ +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that NVPTX uses to lower LLVM code into a +// selection DAG. +// +//===----------------------------------------------------------------------===// + + +#include "NVPTX.h" +#include "NVPTXISelLowering.h" +#include "NVPTXTargetMachine.h" +#include "NVPTXTargetObjectFile.h" +#include "NVPTXUtilities.h" +#include "llvm/Intrinsics.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/DerivedTypes.h" +#include "llvm/GlobalValue.h" +#include "llvm/Module.h" +#include "llvm/Function.h" +#include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/MC/MCSectionELF.h" +#include <sstream> + +#undef DEBUG_TYPE +#define DEBUG_TYPE "nvptx-lower" + +using namespace llvm; + +static unsigned int uniqueCallSite = 0; + +static cl::opt<bool> +RetainVectorOperands("nvptx-codegen-vectors", + cl::desc("NVPTX Specific: Retain LLVM's vectors and generate PTX vectors"), + cl::init(true)); + +static cl::opt<bool> +sched4reg("nvptx-sched4reg", + cl::desc("NVPTX Specific: schedule for register pressue"), + cl::init(false)); + +// NVPTXTargetLowering Constructor. 
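+// It sets up the PTX register classes and records, per operation and value
+// type, whether the operation is Legal in PTX, should be Expanded by the
+// generic legalizer, or gets Custom lowering in this file.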
+NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
+: TargetLowering(TM, new NVPTXTargetObjectFile()),
+  nvTM(&TM),
+  nvptxSubtarget(TM.getSubtarget<NVPTXSubtarget>()) {
+
+  // Always lower memset, memcpy, and memmove intrinsics to load/store
+  // instructions, rather than generating calls to memset, memcpy or
+  // memmove.
+  maxStoresPerMemset = (unsigned)0xFFFFFFFF;
+  maxStoresPerMemcpy = (unsigned)0xFFFFFFFF;
+  maxStoresPerMemmove = (unsigned)0xFFFFFFFF;
+
+  setBooleanContents(ZeroOrNegativeOneBooleanContent);
+
+  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
+  // condition branches.
+  setJumpIsExpensive(true);
+
+  // By default, use the Source scheduling
+  if (sched4reg)
+    setSchedulingPreference(Sched::RegPressure);
+  else
+    setSchedulingPreference(Sched::Source);
+
+  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
+  addRegisterClass(MVT::i8, &NVPTX::Int8RegsRegClass);
+  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
+  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
+  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
+  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
+  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
+
+  if (RetainVectorOperands) {
+    addRegisterClass(MVT::v2f32, &NVPTX::V2F32RegsRegClass);
+    addRegisterClass(MVT::v4f32, &NVPTX::V4F32RegsRegClass);
+    addRegisterClass(MVT::v2i32, &NVPTX::V2I32RegsRegClass);
+    addRegisterClass(MVT::v4i32, &NVPTX::V4I32RegsRegClass);
+    addRegisterClass(MVT::v2f64, &NVPTX::V2F64RegsRegClass);
+    addRegisterClass(MVT::v2i64, &NVPTX::V2I64RegsRegClass);
+    addRegisterClass(MVT::v2i16, &NVPTX::V2I16RegsRegClass);
+    addRegisterClass(MVT::v4i16, &NVPTX::V4I16RegsRegClass);
+    addRegisterClass(MVT::v2i8, &NVPTX::V2I8RegsRegClass);
+    addRegisterClass(MVT::v4i8, &NVPTX::V4I8RegsRegClass);
+
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i8, Custom);
+
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i16, Custom);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i8, Custom);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i64, Custom);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f64, Custom);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i16, Custom);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i8, Custom);
+  }
+
+  // Operations not directly supported by NVPTX.
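+  // (Expand rewrites them in terms of supported nodes; a BR_CC branch, for
+  // instance, becomes a SETCC feeding a BRCOND.)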
+ setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); + setOperationAction(ISD::BR_CC, MVT::Other, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); + + if (nvptxSubtarget.hasROT64()) { + setOperationAction(ISD::ROTL , MVT::i64, Legal); + setOperationAction(ISD::ROTR , MVT::i64, Legal); + } + else { + setOperationAction(ISD::ROTL , MVT::i64, Expand); + setOperationAction(ISD::ROTR , MVT::i64, Expand); + } + if (nvptxSubtarget.hasROT32()) { + setOperationAction(ISD::ROTL , MVT::i32, Legal); + setOperationAction(ISD::ROTR , MVT::i32, Legal); + } + else { + setOperationAction(ISD::ROTL , MVT::i32, Expand); + setOperationAction(ISD::ROTR , MVT::i32, Expand); + } + + setOperationAction(ISD::ROTL , MVT::i16, Expand); + setOperationAction(ISD::ROTR , MVT::i16, Expand); + setOperationAction(ISD::ROTL , MVT::i8, Expand); + setOperationAction(ISD::ROTR , MVT::i8, Expand); + setOperationAction(ISD::BSWAP , MVT::i16, Expand); + setOperationAction(ISD::BSWAP , MVT::i32, Expand); + setOperationAction(ISD::BSWAP , MVT::i64, Expand); + + // Indirect branch is not supported. + // This also disables Jump Table creation. + setOperationAction(ISD::BR_JT, MVT::Other, Expand); + setOperationAction(ISD::BRIND, MVT::Other, Expand); + + setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom); + setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom); + + // We want to legalize constant-related memmove and memcpy + // intrinsics. + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); + + // Turn FP extload into load/fextend + setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); + // Turn FP truncstore into trunc + store. + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + + // PTX does not support load/store of predicate registers + setOperationAction(ISD::LOAD, MVT::i1, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote); + setOperationAction(ISD::STORE, MVT::i1, Expand); + setTruncStoreAction(MVT::i64, MVT::i1, Expand); + setTruncStoreAction(MVT::i32, MVT::i1, Expand); + setTruncStoreAction(MVT::i16, MVT::i1, Expand); + setTruncStoreAction(MVT::i8, MVT::i1, Expand); + + // This is legal in NVPTX + setOperationAction(ISD::ConstantFP, MVT::f64, Legal); + setOperationAction(ISD::ConstantFP, MVT::f32, Legal); + + // TRAP can be lowered to PTX trap + setOperationAction(ISD::TRAP, MVT::Other, Legal); + + // By default, CONCAT_VECTORS is implemented via store/load + // through stack. It is slow and uses local memory. We need + // to custom-lower them.
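+// For example, (v4f32 concat_vectors (v2f32 A), (v2f32 B)) is rebuilt as four
+// extract_vector_elt nodes feeding a single v4f32 build_vector; see
+// LowerCONCAT_VECTORS() below.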
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32 , Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32 , Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i16 , Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i8 , Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64 , Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64 , Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i32 , Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f32 , Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i16 , Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i8 , Custom); + + // Expand vector int to float and float to int conversions + // - For SINT_TO_FP and UINT_TO_FP, the src type + // (Node->getOperand(0).getValueType()) + // is used to determine the action, while for FP_TO_UINT and FP_TO_SINT, + // the dest type (Node->getValueType(0)) is used. + // + // See VectorLegalizer::LegalizeOp() (LegalizeVectorOps.cpp) for the vector + // case, and + // SelectionDAGLegalize::LegalizeOp() (LegalizeDAG.cpp) for the scalar case. + // + // That is why v4i32 or v2i32 are used here. + // + // The expansion for vectors happens in VectorLegalizer::LegalizeOp() + // (LegalizeVectorOps.cpp). + setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Expand); + setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Expand); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand); + + // Now deduce the information based on the above mentioned + // actions + computeRegisterProperties(); +} + + +const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { + switch (Opcode) { + default: return 0; + case NVPTXISD::CALL: return "NVPTXISD::CALL"; + case NVPTXISD::RET_FLAG: return "NVPTXISD::RET_FLAG"; + case NVPTXISD::Wrapper: return "NVPTXISD::Wrapper"; + case NVPTXISD::NVBuiltin: return "NVPTXISD::NVBuiltin"; + case NVPTXISD::DeclareParam: return "NVPTXISD::DeclareParam"; + case NVPTXISD::DeclareScalarParam: + return "NVPTXISD::DeclareScalarParam"; + case NVPTXISD::DeclareRet: return "NVPTXISD::DeclareRet"; + case NVPTXISD::DeclareRetParam: return "NVPTXISD::DeclareRetParam"; + case NVPTXISD::PrintCall: return "NVPTXISD::PrintCall"; + case NVPTXISD::LoadParam: return "NVPTXISD::LoadParam"; + case NVPTXISD::StoreParam: return "NVPTXISD::StoreParam"; + case NVPTXISD::StoreParamS32: return "NVPTXISD::StoreParamS32"; + case NVPTXISD::StoreParamU32: return "NVPTXISD::StoreParamU32"; + case NVPTXISD::MoveToParam: return "NVPTXISD::MoveToParam"; + case NVPTXISD::CallArgBegin: return "NVPTXISD::CallArgBegin"; + case NVPTXISD::CallArg: return "NVPTXISD::CallArg"; + case NVPTXISD::LastCallArg: return "NVPTXISD::LastCallArg"; + case NVPTXISD::CallArgEnd: return "NVPTXISD::CallArgEnd"; + case NVPTXISD::CallVoid: return "NVPTXISD::CallVoid"; + case NVPTXISD::CallVal: return "NVPTXISD::CallVal"; + case NVPTXISD::CallSymbol: return "NVPTXISD::CallSymbol"; + case NVPTXISD::Prototype: return "NVPTXISD::Prototype"; + case NVPTXISD::MoveParam: return "NVPTXISD::MoveParam"; + case NVPTXISD::MoveRetval: return "NVPTXISD::MoveRetval"; + case NVPTXISD::MoveToRetval: return "NVPTXISD::MoveToRetval"; + case NVPTXISD::StoreRetval: return "NVPTXISD::StoreRetval"; + case 
NVPTXISD::PseudoUseParam: return "NVPTXISD::PseudoUseParam"; + case NVPTXISD::RETURN: return "NVPTXISD::RETURN"; + case NVPTXISD::CallSeqBegin: return "NVPTXISD::CallSeqBegin"; + case NVPTXISD::CallSeqEnd: return "NVPTXISD::CallSeqEnd"; + } +} + + +SDValue +NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { + DebugLoc dl = Op.getDebugLoc(); + const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); + Op = DAG.getTargetGlobalAddress(GV, dl, getPointerTy()); + return DAG.getNode(NVPTXISD::Wrapper, dl, getPointerTy(), Op); +} + +std::string NVPTXTargetLowering::getPrototype(Type *retTy, + const ArgListTy &Args, + const SmallVectorImpl<ISD::OutputArg> &Outs, + unsigned retAlignment) const { + + bool isABI = (nvptxSubtarget.getSmVersion() >= 20); + + std::stringstream O; + O << "prototype_" << uniqueCallSite << " : .callprototype "; + + if (retTy->getTypeID() == Type::VoidTyID) + O << "()"; + else { + O << "("; + if (isABI) { + if (retTy->isPrimitiveType() || retTy->isIntegerTy()) { + unsigned size = 0; + if (const IntegerType *ITy = dyn_cast<IntegerType>(retTy)) { + size = ITy->getBitWidth(); + if (size < 32) size = 32; + } + else { + assert(retTy->isFloatingPointTy() && + "Floating point type expected here"); + size = retTy->getPrimitiveSizeInBits(); + } + + O << ".param .b" << size << " _"; + } + else if (isa<PointerType>(retTy)) + O << ".param .b" << getPointerTy().getSizeInBits() + << " _"; + else { + if ((retTy->getTypeID() == Type::StructTyID) || + isa<VectorType>(retTy)) { + SmallVector<EVT, 16> vtparts; + ComputeValueVTs(*this, retTy, vtparts); + unsigned totalsz = 0; + for (unsigned i=0,e=vtparts.size(); i!=e; ++i) { + unsigned elems = 1; + EVT elemtype = vtparts[i]; + if (vtparts[i].isVector()) { + elems = vtparts[i].getVectorNumElements(); + elemtype = vtparts[i].getVectorElementType(); + } + for (unsigned j=0, je=elems; j!=je; ++j) { + unsigned sz = elemtype.getSizeInBits(); + if (elemtype.isInteger() && (sz < 8)) sz = 8; + totalsz += sz/8; + } + } + O << ".param .align " + << retAlignment + << " .b8 _[" + << totalsz << "]"; + } + else { + assert(false && + "Unknown return type"); + } + } + } + else { + SmallVector<EVT, 16> vtparts; + ComputeValueVTs(*this, retTy, vtparts); + unsigned idx = 0; + for (unsigned i=0,e=vtparts.size(); i!=e; ++i) { + unsigned elems = 1; + EVT elemtype = vtparts[i]; + if (vtparts[i].isVector()) { + elems = vtparts[i].getVectorNumElements(); + elemtype = vtparts[i].getVectorElementType(); + } + + for (unsigned j=0, je=elems; j!=je; ++j) { + unsigned sz = elemtype.getSizeInBits(); + if (elemtype.isInteger() && (sz < 32)) sz = 32; + O << ".reg .b" << sz << " _"; + if (j<je-1) O << ", "; + ++idx; + } + if (i < e-1) + O << ", "; + } + } + O << ") "; + } + O << "_ ("; + + bool first = true; + MVT thePointerTy = getPointerTy(); + + for (unsigned i=0,e=Args.size(); i!=e; ++i) { + const Type *Ty = Args[i].Ty; + if (!first) { + O << ", "; + } + first = false; + + if (Outs[i].Flags.isByVal() == false) { + unsigned sz = 0; + if (isa<IntegerType>(Ty)) { + sz = cast<IntegerType>(Ty)->getBitWidth(); + if (sz < 32) sz = 32; + } + else if (isa<PointerType>(Ty)) + sz = thePointerTy.getSizeInBits(); + else + sz = Ty->getPrimitiveSizeInBits(); + if (isABI) + O << ".param .b" << sz << " "; + else + O << ".reg .b" << sz << " "; + O << "_"; + continue; + } + const PointerType *PTy = dyn_cast<PointerType>(Ty); + assert(PTy && + "Param with byval attribute should be a pointer type"); + Type *ETy = PTy->getElementType(); + + if (isABI) { + 
unsigned align = Outs[i].Flags.getByValAlign(); + unsigned sz = getTargetData()->getTypeAllocSize(ETy); + O << ".param .align " << align + << " .b8 "; + O << "_"; + O << "[" << sz << "]"; + continue; + } + else { + SmallVector<EVT, 16> vtparts; + ComputeValueVTs(*this, ETy, vtparts); + for (unsigned i=0,e=vtparts.size(); i!=e; ++i) { + unsigned elems = 1; + EVT elemtype = vtparts[i]; + if (vtparts[i].isVector()) { + elems = vtparts[i].getVectorNumElements(); + elemtype = vtparts[i].getVectorElementType(); + } + + for (unsigned j=0,je=elems; j!=je; ++j) { + unsigned sz = elemtype.getSizeInBits(); + if (elemtype.isInteger() && (sz < 32)) sz = 32; + O << ".reg .b" << sz << " "; + O << "_"; + if (j<je-1) O << ", "; + } + if (i<e-1) + O << ", "; + } + continue; + } + } + O << ");"; + return O.str(); +} + + +SDValue +NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + DebugLoc &dl = CLI.DL; + SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; + SmallVector<SDValue, 32> &OutVals = CLI.OutVals; + SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &isTailCall = CLI.IsTailCall; + ArgListTy &Args = CLI.Args; + Type *retTy = CLI.RetTy; + ImmutableCallSite *CS = CLI.CS; + + bool isABI = (nvptxSubtarget.getSmVersion() >= 20); + + SDValue tempChain = Chain; + Chain = DAG.getCALLSEQ_START(Chain, + DAG.getIntPtrConstant(uniqueCallSite, true)); + SDValue InFlag = Chain.getValue(1); + + assert((Outs.size() == Args.size()) && + "Unexpected number of arguments to function call"); + unsigned paramCount = 0; + // Declare the .param or .reg that we need to pass values + // to the function + for (unsigned i=0, e=Outs.size(); i!=e; ++i) { + EVT VT = Outs[i].VT; + + if (Outs[i].Flags.isByVal() == false) { + // Plain scalar + // for ABI, declare .param .b<size> .param<n>; + // for nonABI, declare .reg .b<size> .param<n>; + unsigned isReg = 1; + if (isABI) + isReg = 0; + unsigned sz = VT.getSizeInBits(); + if (VT.isInteger() && (sz < 32)) sz = 32; + SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue DeclareParamOps[] = { Chain, + DAG.getConstant(paramCount, MVT::i32), + DAG.getConstant(sz, MVT::i32), + DAG.getConstant(isReg, MVT::i32), + InFlag }; + Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, + DeclareParamOps, 5); + InFlag = Chain.getValue(1); + SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32), + DAG.getConstant(0, MVT::i32), OutVals[i], InFlag }; + + unsigned opcode = NVPTXISD::StoreParam; + if (isReg) + opcode = NVPTXISD::MoveToParam; + else { + if (Outs[i].Flags.isZExt()) + opcode = NVPTXISD::StoreParamU32; + else if (Outs[i].Flags.isSExt()) + opcode = NVPTXISD::StoreParamS32; + } + Chain = DAG.getNode(opcode, dl, CopyParamVTs, CopyParamOps, 5); + + InFlag = Chain.getValue(1); + ++paramCount; + continue; + } + // struct or vector + SmallVector<EVT, 16> vtparts; + const PointerType *PTy = dyn_cast<PointerType>(Args[i].Ty); + assert(PTy && + "Type of a byval parameter should be pointer"); + ComputeValueVTs(*this, PTy->getElementType(), vtparts); + + if (isABI) { + // declare .param .align 16 .b8 .param<n>[<size>]; + unsigned sz = Outs[i].Flags.getByValSize(); + SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + // The ByValAlign in the Outs[i].Flags is always set at this point, so we + // don't need to worry about
natural alignment or not. See TargetLowering::LowerCallTo() + SDValue DeclareParamOps[] = { Chain, + DAG.getConstant(Outs[i].Flags.getByValAlign(), MVT::i32), + DAG.getConstant(paramCount, MVT::i32), + DAG.getConstant(sz, MVT::i32), + InFlag }; + Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, + DeclareParamOps, 5); + InFlag = Chain.getValue(1); + unsigned curOffset = 0; + for (unsigned j=0,je=vtparts.size(); j!=je; ++j) { + unsigned elems = 1; + EVT elemtype = vtparts[j]; + if (vtparts[j].isVector()) { + elems = vtparts[j].getVectorNumElements(); + elemtype = vtparts[j].getVectorElementType(); + } + for (unsigned k=0,ke=elems; k!=ke; ++k) { + unsigned sz = elemtype.getSizeInBits(); + if (elemtype.isInteger() && (sz < 8)) sz = 8; + SDValue srcAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), + OutVals[i], + DAG.getConstant(curOffset, + getPointerTy())); + SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr, + MachinePointerInfo(), false, false, false, 0); + SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, + MVT::i32), + DAG.getConstant(curOffset, MVT::i32), + theVal, InFlag }; + Chain = DAG.getNode(NVPTXISD::StoreParam, dl, CopyParamVTs, + CopyParamOps, 5); + InFlag = Chain.getValue(1); + curOffset += sz/8; + } + } + ++paramCount; + continue; + } + // Non-ABI, struct or vector + // Declare a bunch of .reg .b<size> .param<n> + unsigned curOffset = 0; + for (unsigned j=0,je=vtparts.size(); j!=je; ++j) { + unsigned elems = 1; + EVT elemtype = vtparts[j]; + if (vtparts[j].isVector()) { + elems = vtparts[j].getVectorNumElements(); + elemtype = vtparts[j].getVectorElementType(); + } + for (unsigned k=0,ke=elems; k!=ke; ++k) { + unsigned sz = elemtype.getSizeInBits(); + if (elemtype.isInteger() && (sz < 32)) sz = 32; + SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue DeclareParamOps[] = { Chain, DAG.getConstant(paramCount, + MVT::i32), + DAG.getConstant(sz, MVT::i32), + DAG.getConstant(1, MVT::i32), + InFlag }; + Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, + DeclareParamOps, 5); + InFlag = Chain.getValue(1); + SDValue srcAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[i], + DAG.getConstant(curOffset, + getPointerTy())); + SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr, + MachinePointerInfo(), false, false, false, 0); + SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32), + DAG.getConstant(0, MVT::i32), theVal, + InFlag }; + Chain = DAG.getNode(NVPTXISD::MoveToParam, dl, CopyParamVTs, + CopyParamOps, 5); + InFlag = Chain.getValue(1); + ++paramCount; + } + } + } + + GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode()); + unsigned retAlignment = 0; + + // Handle Result + unsigned retCount = 0; + if (Ins.size() > 0) { + SmallVector<EVT, 16> resvtparts; + ComputeValueVTs(*this, retTy, resvtparts); + + // Declare one .param .align 16 .b8 func_retval0[<size>] for ABI or + // individual .reg .b<size> func_retval<0..> for non-ABI + unsigned resultsz = 0; + for (unsigned i=0,e=resvtparts.size(); i!=e; ++i) { + unsigned elems = 1; + EVT elemtype = resvtparts[i]; + if (resvtparts[i].isVector()) { + elems = resvtparts[i].getVectorNumElements(); + elemtype = resvtparts[i].getVectorElementType(); + } + for (unsigned j=0,je=elems; j!=je; ++j) { + unsigned sz = elemtype.getSizeInBits(); + if (isABI == false) { + if 
(elemtype.isInteger() && (sz < 32)) sz = 32; + } + else { + if (elemtype.isInteger() && (sz < 8)) sz = 8; + } + if (isABI == false) { + SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue DeclareRetOps[] = { Chain, DAG.getConstant(2, MVT::i32), + DAG.getConstant(sz, MVT::i32), + DAG.getConstant(retCount, MVT::i32), + InFlag }; + Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs, + DeclareRetOps, 5); + InFlag = Chain.getValue(1); + ++retCount; + } + resultsz += sz; + } + } + if (isABI) { + if (retTy->isPrimitiveType() || retTy->isIntegerTy() || + retTy->isPointerTy() ) { + // Scalar needs to be at least 32-bit wide + if (resultsz < 32) + resultsz = 32; + SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, MVT::i32), + DAG.getConstant(resultsz, MVT::i32), + DAG.getConstant(0, MVT::i32), InFlag }; + Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs, + DeclareRetOps, 5); + InFlag = Chain.getValue(1); + } + else { + if (Func) { // direct call + if (!llvm::getAlign(*(CS->getCalledFunction()), 0, retAlignment)) + retAlignment = getTargetData()->getABITypeAlignment(retTy); + } else { // indirect call + const CallInst *CallI = dyn_cast<CallInst>(CS->getInstruction()); + if (!llvm::getAlign(*CallI, 0, retAlignment)) + retAlignment = getTargetData()->getABITypeAlignment(retTy); + } + SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue DeclareRetOps[] = { Chain, DAG.getConstant(retAlignment, + MVT::i32), + DAG.getConstant(resultsz/8, MVT::i32), + DAG.getConstant(0, MVT::i32), InFlag }; + Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs, + DeclareRetOps, 5); + InFlag = Chain.getValue(1); + } + } + } + + if (!Func) { + // This is the indirect function call case: PTX requires a prototype of the + // form + // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _); + // to be emitted, and the label has to be used as the last arg of the call + // instruction. + // The prototype is embedded in a string and put as the operand for an + // INLINEASM SDNode. + SDVTList InlineAsmVTs = DAG.getVTList(MVT::Other, MVT::Glue); + std::string proto_string = getPrototype(retTy, Args, Outs, retAlignment); + const char *asmstr = nvTM->getManagedStrPool()-> + getManagedString(proto_string.c_str())->c_str(); + SDValue InlineAsmOps[] = { Chain, + DAG.getTargetExternalSymbol(asmstr, + getPointerTy()), + DAG.getMDNode(0), + DAG.getTargetConstant(0, MVT::i32), InFlag }; + Chain = DAG.getNode(ISD::INLINEASM, dl, InlineAsmVTs, InlineAsmOps, 5); + InFlag = Chain.getValue(1); + } + // Op to just print "call" + SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue PrintCallOps[] = { Chain, + DAG.getConstant(isABI ? ((Ins.size()==0) ? 
0 : 1) + : retCount, MVT::i32), + InFlag }; + Chain = DAG.getNode(Func?(NVPTXISD::PrintCallUni):(NVPTXISD::PrintCall), dl, + PrintCallVTs, PrintCallOps, 3); + InFlag = Chain.getValue(1); + + // Ops to print out the function name + SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue CallVoidOps[] = { Chain, Callee, InFlag }; + Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps, 3); + InFlag = Chain.getValue(1); + + // Ops to print out the param list + SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue CallArgBeginOps[] = { Chain, InFlag }; + Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs, + CallArgBeginOps, 2); + InFlag = Chain.getValue(1); + + for (unsigned i=0, e=paramCount; i!=e; ++i) { + unsigned opcode; + if (i==(e-1)) + opcode = NVPTXISD::LastCallArg; + else + opcode = NVPTXISD::CallArg; + SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue CallArgOps[] = { Chain, DAG.getConstant(1, MVT::i32), + DAG.getConstant(i, MVT::i32), + InFlag }; + Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps, 4); + InFlag = Chain.getValue(1); + } + SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue CallArgEndOps[] = { Chain, + DAG.getConstant(Func ? 1 : 0, MVT::i32), + InFlag }; + Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps, + 3); + InFlag = Chain.getValue(1); + + if (!Func) { + SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue PrototypeOps[] = { Chain, + DAG.getConstant(uniqueCallSite, MVT::i32), + InFlag }; + Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps, 3); + InFlag = Chain.getValue(1); + } + + // Generate loads from param memory/moves from registers for result + if (Ins.size() > 0) { + if (isABI) { + unsigned resoffset = 0; + for (unsigned i=0,e=Ins.size(); i!=e; ++i) { + unsigned sz = Ins[i].VT.getSizeInBits(); + if (Ins[i].VT.isInteger() && (sz < 8)) sz = 8; + std::vector<EVT> LoadRetVTs; + LoadRetVTs.push_back(Ins[i].VT); + LoadRetVTs.push_back(MVT::Other); LoadRetVTs.push_back(MVT::Glue); + std::vector<SDValue> LoadRetOps; + LoadRetOps.push_back(Chain); + LoadRetOps.push_back(DAG.getConstant(1, MVT::i32)); + LoadRetOps.push_back(DAG.getConstant(resoffset, MVT::i32)); + LoadRetOps.push_back(InFlag); + SDValue retval = DAG.getNode(NVPTXISD::LoadParam, dl, LoadRetVTs, + &LoadRetOps[0], LoadRetOps.size()); + Chain = retval.getValue(1); + InFlag = retval.getValue(2); + InVals.push_back(retval); + resoffset += sz/8; + } + } + else { + SmallVector<EVT, 16> resvtparts; + ComputeValueVTs(*this, retTy, resvtparts); + + assert(Ins.size() == resvtparts.size() && + "Unexpected number of return values in non-ABI case"); + unsigned paramNum = 0; + for (unsigned i=0,e=Ins.size(); i!=e; ++i) { + assert(EVT(Ins[i].VT) == resvtparts[i] && + "Unexpected EVT type in non-ABI case"); + unsigned numelems = 1; + EVT elemtype = Ins[i].VT; + if (Ins[i].VT.isVector()) { + numelems = Ins[i].VT.getVectorNumElements(); + elemtype = Ins[i].VT.getVectorElementType(); + } + std::vector<SDValue> tempRetVals; + for (unsigned j=0; j<numelems; ++j) { + std::vector<EVT> MoveRetVTs; + MoveRetVTs.push_back(elemtype); + MoveRetVTs.push_back(MVT::Other); MoveRetVTs.push_back(MVT::Glue); + std::vector<SDValue> MoveRetOps; + MoveRetOps.push_back(Chain); + MoveRetOps.push_back(DAG.getConstant(0, MVT::i32)); + MoveRetOps.push_back(DAG.getConstant(paramNum, MVT::i32)); + MoveRetOps.push_back(InFlag); + SDValue retval = 
DAG.getNode(NVPTXISD::LoadParam, dl, MoveRetVTs, + &MoveRetOps[0], MoveRetOps.size()); + Chain = retval.getValue(1); + InFlag = retval.getValue(2); + tempRetVals.push_back(retval); + ++paramNum; + } + if (Ins[i].VT.isVector()) + InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, Ins[i].VT, + &tempRetVals[0], tempRetVals.size())); + else + InVals.push_back(tempRetVals[0]); + } + } + } + Chain = DAG.getCALLSEQ_END(Chain, + DAG.getIntPtrConstant(uniqueCallSite, true), + DAG.getIntPtrConstant(uniqueCallSite+1, true), + InFlag); + uniqueCallSite++; + + // set isTailCall to false for now, until we figure out how to express + // tail call optimization in PTX + isTailCall = false; + return Chain; +} + +// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack() +// (see LegalizeDAG.cpp). This is slow and uses local memory. +// We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5 +SDValue NVPTXTargetLowering:: +LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { + SDNode *Node = Op.getNode(); + DebugLoc dl = Node->getDebugLoc(); + SmallVector<SDValue, 8> Ops; + unsigned NumOperands = Node->getNumOperands(); + for (unsigned i=0; i < NumOperands; ++i) { + SDValue SubOp = Node->getOperand(i); + EVT VVT = SubOp.getNode()->getValueType(0); + EVT EltVT = VVT.getVectorElementType(); + unsigned NumSubElem = VVT.getVectorNumElements(); + for (unsigned j=0; j < NumSubElem; ++j) { + Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp, + DAG.getIntPtrConstant(j))); + } + } + return DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), + &Ops[0], Ops.size()); +} + +SDValue NVPTXTargetLowering:: +LowerOperation(SDValue Op, SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + case ISD::RETURNADDR: return SDValue(); + case ISD::FRAMEADDR: return SDValue(); + case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); + case ISD::INTRINSIC_W_CHAIN: return Op; + case ISD::BUILD_VECTOR: + case ISD::EXTRACT_SUBVECTOR: + return Op; + case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); + default: + llvm_unreachable("Custom lowering not defined for operation"); + } +} + +SDValue +NVPTXTargetLowering::getExtSymb(SelectionDAG &DAG, const char *inname, int idx, + EVT v) const { + std::string *name = nvTM->getManagedStrPool()->getManagedString(inname); + std::stringstream suffix; + suffix << idx; + *name += suffix.str(); + return DAG.getTargetExternalSymbol(name->c_str(), v); +} + +SDValue +NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const { + return getExtSymb(DAG, ".PARAM", idx, v); +} + +SDValue +NVPTXTargetLowering::getParamHelpSymbol(SelectionDAG &DAG, int idx) { + return getExtSymb(DAG, ".HLPPARAM", idx); +} + +// Check to see if the kernel argument is image*_t or sampler_t + +bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) { + static const char *const specialTypes[] = { + "struct._image2d_t", + "struct._image3d_t", + "struct._sampler_t" + }; + + const Type *Ty = arg->getType(); + const PointerType *PTy = dyn_cast<PointerType>(Ty); + + if (!PTy) + return false; + + if (!context) + return false; + + const StructType *STy = dyn_cast<StructType>(PTy->getElementType()); + const std::string TypeName = STy ? 
STy->getName() : ""; + + for (int i = 0, e = array_lengthof(specialTypes); i != e; ++i) + if (TypeName == specialTypes[i]) + return true; + + return false; +} + +SDValue +NVPTXTargetLowering::LowerFormalArguments(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + MachineFunction &MF = DAG.getMachineFunction(); + const TargetData *TD = getTargetData(); + + const Function *F = MF.getFunction(); + const AttrListPtr &PAL = F->getAttributes(); + + SDValue Root = DAG.getRoot(); + std::vector<SDValue> OutChains; + + bool isKernel = llvm::isKernelFunction(*F); + bool isABI = (nvptxSubtarget.getSmVersion() >= 20); + + std::vector<Type *> argTypes; + std::vector<const Argument *> theArgs; + for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); + I != E; ++I) { + theArgs.push_back(I); + argTypes.push_back(I->getType()); + } + assert(argTypes.size() == Ins.size() && + "Ins types and function types did not match"); + + int idx = 0; + for (unsigned i=0, e=Ins.size(); i!=e; ++i, ++idx) { + Type *Ty = argTypes[i]; + EVT ObjectVT = getValueType(Ty); + assert(ObjectVT == Ins[i].VT && + "Ins type did not match function type"); + + // If the kernel argument is image*_t or sampler_t, convert it to + // a i32 constant holding the parameter position. This can later + // matched in the AsmPrinter to output the correct mangled name. + if (isImageOrSamplerVal(theArgs[i], + (theArgs[i]->getParent() ? + theArgs[i]->getParent()->getParent() : 0))) { + assert(isKernel && "Only kernels can have image/sampler params"); + InVals.push_back(DAG.getConstant(i+1, MVT::i32)); + continue; + } + + if (theArgs[i]->use_empty()) { + // argument is dead + InVals.push_back(DAG.getNode(ISD::UNDEF, dl, ObjectVT)); + continue; + } + + // In the following cases, assign a node order of "idx+1" + // to newly created nodes. The SDNOdes for params have to + // appear in the same order as their order of appearance + // in the original function. "idx+1" holds that order. + if (PAL.paramHasAttr(i+1, Attribute::ByVal) == false) { + // A plain scalar. + if (isABI || isKernel) { + // If ABI, load from the param symbol + SDValue Arg = getParamSymbol(DAG, idx); + Value *srcValue = new Argument(PointerType::get(ObjectVT.getTypeForEVT( + F->getContext()), + llvm::ADDRESS_SPACE_PARAM)); + SDValue p = DAG.getLoad(ObjectVT, dl, Root, Arg, + MachinePointerInfo(srcValue), false, false, + false, + TD->getABITypeAlignment(ObjectVT.getTypeForEVT( + F->getContext()))); + if (p.getNode()) + DAG.AssignOrdering(p.getNode(), idx+1); + InVals.push_back(p); + } + else { + // If no ABI, just move the param symbol + SDValue Arg = getParamSymbol(DAG, idx, ObjectVT); + SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); + if (p.getNode()) + DAG.AssignOrdering(p.getNode(), idx+1); + InVals.push_back(p); + } + continue; + } + + // Param has ByVal attribute + if (isABI || isKernel) { + // Return MoveParam(param symbol). + // Ideally, the param symbol can be returned directly, + // but when SDNode builder decides to use it in a CopyToReg(), + // machine instruction fails because TargetExternalSymbol + // (not lowered) is target dependent, and CopyToReg assumes + // the source is lowered. 
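+// (Illustrative sketch, with a hypothetical kernel name: a byval kernel
+// argument is declared in PTX as something like
+//   .param .align 4 .b8 mykernel_param_0[16]
+// and read with ld.param; the MoveParam node keeps the param symbol in a
+// form that instruction selection can consume.)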
+ SDValue Arg = getParamSymbol(DAG, idx, getPointerTy()); + SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); + if (p.getNode()) + DAG.AssignOrdering(p.getNode(), idx+1); + if (isKernel) + InVals.push_back(p); + else { + SDValue p2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, ObjectVT, + DAG.getConstant(Intrinsic::nvvm_ptr_local_to_gen, MVT::i32), + p); + InVals.push_back(p2); + } + } else { + // Have to move a set of param symbols to registers and + // store them locally and return the local pointer in InVals + const PointerType *elemPtrType = dyn_cast<PointerType>(argTypes[i]); + assert(elemPtrType && + "Byval parameter should be a pointer type"); + Type *elemType = elemPtrType->getElementType(); + // Compute the constituent parts + SmallVector<EVT, 16> vtparts; + SmallVector<uint64_t, 16> offsets; + ComputeValueVTs(*this, elemType, vtparts, &offsets, 0); + unsigned totalsize = 0; + for (unsigned j=0, je=vtparts.size(); j!=je; ++j) + totalsize += vtparts[j].getStoreSizeInBits(); + SDValue localcopy = DAG.getFrameIndex(MF.getFrameInfo()-> + CreateStackObject(totalsize/8, 16, false), + getPointerTy()); + unsigned sizesofar = 0; + std::vector<SDValue> theChains; + for (unsigned j=0, je=vtparts.size(); j!=je; ++j) { + unsigned numElems = 1; + if (vtparts[j].isVector()) numElems = vtparts[j].getVectorNumElements(); + for (unsigned k=0, ke=numElems; k!=ke; ++k) { + EVT tmpvt = vtparts[j]; + if (tmpvt.isVector()) tmpvt = tmpvt.getVectorElementType(); + SDValue arg = DAG.getNode(NVPTXISD::MoveParam, dl, tmpvt, + getParamSymbol(DAG, idx, tmpvt)); + SDValue addr = DAG.getNode(ISD::ADD, dl, getPointerTy(), localcopy, + DAG.getConstant(sizesofar, getPointerTy())); + theChains.push_back(DAG.getStore(Chain, dl, arg, addr, + MachinePointerInfo(), false, false, 0)); + sizesofar += tmpvt.getStoreSizeInBits()/8; + ++idx; + } + } + --idx; + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &theChains[0], + theChains.size()); + InVals.push_back(localcopy); + } + } + + // Clang will check explicit varargs and issue an error if any are present. + // However, Clang will let code with an implicit vararg prototype, like f(), + // pass. We treat this case as if the arg list is empty. + //if (F.isVarArg()) { + // assert(0 && "VarArg not supported yet!"); + //} + + if (!OutChains.empty()) + DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + &OutChains[0], OutChains.size())); + + return Chain; +} + +SDValue +NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + DebugLoc dl, SelectionDAG &DAG) const { + + bool isABI = (nvptxSubtarget.getSmVersion() >= 20); + + unsigned sizesofar = 0; + unsigned idx = 0; + for (unsigned i=0, e=Outs.size(); i!=e; ++i) { + SDValue theVal = OutVals[i]; + EVT theValType = theVal.getValueType(); + unsigned numElems = 1; + if (theValType.isVector()) numElems = theValType.getVectorNumElements(); + for (unsigned j=0,je=numElems; j!=je; ++j) { + SDValue tmpval = theVal; + if (theValType.isVector()) + tmpval = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, + theValType.getVectorElementType(), + tmpval, DAG.getIntPtrConstant(j)); + Chain = DAG.getNode(isABI ? NVPTXISD::StoreRetval : NVPTXISD::MoveToRetval, + dl, MVT::Other, + Chain, + DAG.getConstant(isABI ? 
sizesofar : idx, MVT::i32), + tmpval); + if (theValType.isVector()) + sizesofar += theValType.getVectorElementType().getStoreSizeInBits()/8; + else + sizesofar += theValType.getStoreSizeInBits()/8; + ++idx; + } + } + + return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain); +} + +void +NVPTXTargetLowering::LowerAsmOperandForConstraint(SDValue Op, + std::string &Constraint, + std::vector<SDValue> &Ops, + SelectionDAG &DAG) const +{ + if (Constraint.length() > 1) + return; + else + TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); +} + +// NVPTX supports vectors of legal types of any length in intrinsics, because +// the NVPTX-specific type legalizer will legalize them to the PTX-supported +// length. +bool +NVPTXTargetLowering::isTypeSupportedInIntrinsic(MVT VT) const { + if (isTypeLegal(VT)) + return true; + if (VT.isVector()) { + MVT eVT = VT.getVectorElementType(); + if (isTypeLegal(eVT)) + return true; + } + return false; +} + + +// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as +// TgtMemIntrinsic because we need information that is only available in the +// "Value" type of the destination pointer - in particular, the address space +// information. +bool +NVPTXTargetLowering::getTgtMemIntrinsic(IntrinsicInfo& Info, const CallInst &I, + unsigned Intrinsic) const { + switch (Intrinsic) { + default: + return false; + + case Intrinsic::nvvm_atomic_load_add_f32: + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::f32; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.vol = 0; + Info.readMem = true; + Info.writeMem = true; + Info.align = 0; + return true; + + case Intrinsic::nvvm_atomic_load_inc_32: + case Intrinsic::nvvm_atomic_load_dec_32: + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::i32; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.vol = 0; + Info.readMem = true; + Info.writeMem = true; + Info.align = 0; + return true; + + case Intrinsic::nvvm_ldu_global_i: + case Intrinsic::nvvm_ldu_global_f: + case Intrinsic::nvvm_ldu_global_p: + + Info.opc = ISD::INTRINSIC_W_CHAIN; + if (Intrinsic == Intrinsic::nvvm_ldu_global_i) + Info.memVT = MVT::i32; + else if (Intrinsic == Intrinsic::nvvm_ldu_global_p) + Info.memVT = getPointerTy(); + else + Info.memVT = MVT::f32; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.vol = 0; + Info.readMem = true; + Info.writeMem = false; + Info.align = 0; + return true; + + } + return false; +} + +/// isLegalAddressingMode - Return true if the addressing mode represented +/// by AM is legal for this target, for a load/store of the specified type. +/// Used to guide target-specific optimizations, like loop strength reduction +/// (LoopStrengthReduce.cpp) and memory optimization for address mode +/// (CodeGenPrepare.cpp) +bool +NVPTXTargetLowering::isLegalAddressingMode(const AddrMode &AM, + Type *Ty) const { + + // AddrMode - This represents an addressing mode of: + // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + // + // The legal address modes are + // - [avar] + // - [areg] + // - [areg+immoff] + // - [immAddr] + + if (AM.BaseGV) { + if (AM.BaseOffs || AM.HasBaseReg || AM.Scale) + return false; + return true; + } + + switch (AM.Scale) { + case 0: // "r", "r+i" or "i" is allowed + break; + case 1: + if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed. + return false; + // Otherwise we have r+i. 
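+    // Illustrative PTX for the legal modes listed above (hypothetical
+    // operands):
+    //   [avar]        ld.global.f32 %f0, [gbl];
+    //   [areg]        ld.global.f32 %f0, [%r1];
+    //   [areg+immoff] ld.global.f32 %f0, [%r1+4];
+    //   [immAddr]     ld.global.f32 %f0, [4];
+    // A scaled index register has no PTX equivalent, which is what the
+    // Scale checks here reject.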
+ break; + default: + // No scale > 1 is allowed + return false; + } + return true; +} + +//===----------------------------------------------------------------------===// +// NVPTX Inline Assembly Support +//===----------------------------------------------------------------------===// + +/// getConstraintType - Given a constraint letter, return the type of +/// constraint it is for this target. +NVPTXTargetLowering::ConstraintType +NVPTXTargetLowering::getConstraintType(const std::string &Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: + break; + case 'r': + case 'h': + case 'c': + case 'l': + case 'f': + case 'd': + case '0': + case 'N': + return C_RegisterClass; + } + } + return TargetLowering::getConstraintType(Constraint); +} + + +std::pair<unsigned, const TargetRegisterClass*> +NVPTXTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, + EVT VT) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 'c': + return std::make_pair(0U, &NVPTX::Int8RegsRegClass); + case 'h': + return std::make_pair(0U, &NVPTX::Int16RegsRegClass); + case 'r': + return std::make_pair(0U, &NVPTX::Int32RegsRegClass); + case 'l': + case 'N': + return std::make_pair(0U, &NVPTX::Int64RegsRegClass); + case 'f': + return std::make_pair(0U, &NVPTX::Float32RegsRegClass); + case 'd': + return std::make_pair(0U, &NVPTX::Float64RegsRegClass); + } + } + return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); +} + + + +/// getFunctionAlignment - Return the Log2 alignment of this function. +unsigned NVPTXTargetLowering::getFunctionAlignment(const Function *) const { + return 4; +} diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h new file mode 100644 index 0000000..86246e6 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXISelLowering.h @@ -0,0 +1,144 @@ +//===-- NVPTXISelLowering.h - NVPTX DAG Lowering Interface ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that NVPTX uses to lower LLVM code into a +// selection DAG. +// +//===----------------------------------------------------------------------===// + +#ifndef NVPTXISELLOWERING_H +#define NVPTXISELLOWERING_H + +#include "NVPTX.h" +#include "NVPTXSubtarget.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/Target/TargetLowering.h" + +namespace llvm { +namespace NVPTXISD { +enum NodeType { + // Start the numbering from where ISD NodeType finishes. 
+ FIRST_NUMBER = ISD::BUILTIN_OP_END, + Wrapper, + CALL, + RET_FLAG, + LOAD_PARAM, + NVBuiltin, + DeclareParam, + DeclareScalarParam, + DeclareRetParam, + DeclareRet, + DeclareScalarRet, + LoadParam, + StoreParam, + StoreParamS32, // to sext and store a <32bit value, not used currently + StoreParamU32, // to zext and store a <32bit value, not used currently + MoveToParam, + PrintCall, + PrintCallUni, + CallArgBegin, + CallArg, + LastCallArg, + CallArgEnd, + CallVoid, + CallVal, + CallSymbol, + Prototype, + MoveParam, + MoveRetval, + MoveToRetval, + StoreRetval, + PseudoUseParam, + RETURN, + CallSeqBegin, + CallSeqEnd, + Dummy +}; +} + +//===--------------------------------------------------------------------===// +// TargetLowering Implementation +//===--------------------------------------------------------------------===// +class NVPTXTargetLowering : public TargetLowering { +public: + explicit NVPTXTargetLowering(NVPTXTargetMachine &TM); + virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalAddress(const GlobalValue *GV, int64_t Offset, + SelectionDAG &DAG) const; + + virtual const char *getTargetNodeName(unsigned Opcode) const; + + bool isTypeSupportedInIntrinsic(MVT VT) const; + + bool getTgtMemIntrinsic(IntrinsicInfo& Info, const CallInst &I, + unsigned Intrinsic) const; + + /// isLegalAddressingMode - Return true if the addressing mode represented + /// by AM is legal for this target, for a load/store of the specified type + /// Used to guide target specific optimizations, like loop strength + /// reduction (LoopStrengthReduce.cpp) and memory optimization for + /// address mode (CodeGenPrepare.cpp) + virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const; + + /// getFunctionAlignment - Return the Log2 alignment of this function. 
+ virtual unsigned getFunctionAlignment(const Function *F) const; + + virtual EVT getSetCCResultType(EVT VT) const { + return MVT::i1; + } + + ConstraintType getConstraintType(const std::string &Constraint) const; + std::pair<unsigned, const TargetRegisterClass*> + getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const; + + virtual SDValue + LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, + SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + + virtual SDValue + LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const; + + std::string getPrototype(Type *, const ArgListTy &, + const SmallVectorImpl<ISD::OutputArg> &, + unsigned retAlignment) const; + + virtual SDValue + LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, + SelectionDAG &DAG) const; + + virtual void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, + std::vector<SDValue> &Ops, + SelectionDAG &DAG) const; + + NVPTXTargetMachine *nvTM; + + // PTX always uses 32-bit shift amounts + virtual MVT getShiftAmountTy(EVT LHSTy) const { + return MVT::i32; + } + +private: + const NVPTXSubtarget &nvptxSubtarget; // cache the subtarget here + + SDValue getExtSymb(SelectionDAG &DAG, const char *name, int idx, EVT = + MVT::i32) const; + SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT = MVT::i32) const; + SDValue getParamHelpSymbol(SelectionDAG &DAG, int idx); + + SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; +}; +} // namespace llvm + +#endif // NVPTXISELLOWERING_H diff --git a/lib/Target/NVPTX/NVPTXInstrFormats.td b/lib/Target/NVPTX/NVPTXInstrFormats.td new file mode 100644 index 0000000..f11f1b8 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXInstrFormats.td @@ -0,0 +1,43 @@ +//===- NVPTXInstrFormats.td - NVPTX Instruction Formats-------*- tblgen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Describe the NVPTX instruction format +// +//===----------------------------------------------------------------------===// + +// Vector instruction type enum +class VecInstTypeEnum<bits<4> val> { + bits<4> Value=val; +} +def VecNOP : VecInstTypeEnum<0>; + +// Generic NVPTX Format + +class NVPTXInst<dag outs, dag ins, string asmstr, list<dag> pattern> + : Instruction { + field bits<14> Inst; + + let Namespace = "NVPTX"; + dag OutOperandList = outs; + dag InOperandList = ins; + let AsmString = asmstr; + let Pattern = pattern; + + // TSFlag fields + bits<4> VecInstType = VecNOP.Value; + bit IsSimpleMove = 0; + bit IsLoad = 0; + bit IsStore = 0; + + let TSFlags{3-0} = VecInstType; + let TSFlags{4-4} = IsSimpleMove; + let TSFlags{5-5} = IsLoad; + let TSFlags{6-6} = IsStore; +} diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp new file mode 100644 index 0000000..cd50deb --- /dev/null +++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -0,0 +1,326 @@ +//===- NVPTXInstrInfo.cpp - NVPTX Instruction Information -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the NVPTX implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#include "NVPTX.h" +#include "NVPTXInstrInfo.h" +#include "NVPTXTargetMachine.h" +#define GET_INSTRINFO_CTOR +#include "NVPTXGenInstrInfo.inc" +#include "llvm/Function.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include <cstdio> + + +using namespace llvm; + +// FIXME: Add subtarget support to this constructor.
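+// A note on TSFlags: the fields declared in NVPTXInstrFormats.td above
+// (VecInstType, IsSimpleMove, IsLoad, IsStore) are recovered later in this
+// file with mask/shift pairs, e.g. isMoveInstr() computes roughly:
+//   bool isMove = ((MI.getDesc().TSFlags & NVPTX::SimpleMoveMask)
+//                  >> NVPTX::SimpleMoveShift) == 1;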
+NVPTXInstrInfo::NVPTXInstrInfo(NVPTXTargetMachine &tm) +: NVPTXGenInstrInfo(), + TM(tm), + RegInfo(*this, *TM.getSubtargetImpl()) {} + + +void NVPTXInstrInfo::copyPhysReg (MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + if (NVPTX::Int32RegsRegClass.contains(DestReg) && + NVPTX::Int32RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::IMOV32rr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::Int8RegsRegClass.contains(DestReg) && + NVPTX::Int8RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::IMOV8rr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::Int1RegsRegClass.contains(DestReg) && + NVPTX::Int1RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::IMOV1rr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::Float32RegsRegClass.contains(DestReg) && + NVPTX::Float32RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::FMOV32rr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::Int16RegsRegClass.contains(DestReg) && + NVPTX::Int16RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::IMOV16rr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::Int64RegsRegClass.contains(DestReg) && + NVPTX::Int64RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::IMOV64rr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::Float64RegsRegClass.contains(DestReg) && + NVPTX::Float64RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::FMOV64rr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::V4F32RegsRegClass.contains(DestReg) && + NVPTX::V4F32RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::V4f32Mov), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::V4I32RegsRegClass.contains(DestReg) && + NVPTX::V4I32RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::V4i32Mov), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::V2F32RegsRegClass.contains(DestReg) && + NVPTX::V2F32RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::V2f32Mov), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::V2I32RegsRegClass.contains(DestReg) && + NVPTX::V2I32RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::V2i32Mov), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::V4I8RegsRegClass.contains(DestReg) && + NVPTX::V4I8RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::V4i8Mov), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::V2I8RegsRegClass.contains(DestReg) && + NVPTX::V2I8RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::V2i8Mov), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::V4I16RegsRegClass.contains(DestReg) && + NVPTX::V4I16RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::V4i16Mov), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::V2I16RegsRegClass.contains(DestReg) && + NVPTX::V2I16RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::V2i16Mov), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::V2I64RegsRegClass.contains(DestReg) && + NVPTX::V2I64RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::V2i64Mov), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (NVPTX::V2F64RegsRegClass.contains(DestReg) && + 
NVPTX::V2F64RegsRegClass.contains(SrcReg)) + BuildMI(MBB, I, DL, get(NVPTX::V2f64Mov), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else { + llvm_unreachable("Don't know how to copy a register"); + } +} + +bool NVPTXInstrInfo::isMoveInstr(const MachineInstr &MI, + unsigned &SrcReg, + unsigned &DestReg) const { + // Look for the appropriate part of TSFlags + bool isMove = false; + + unsigned TSFlags = (MI.getDesc().TSFlags & NVPTX::SimpleMoveMask) >> + NVPTX::SimpleMoveShift; + isMove = (TSFlags == 1); + + if (isMove) { + MachineOperand dest = MI.getOperand(0); + MachineOperand src = MI.getOperand(1); + assert(dest.isReg() && "dest of a movrr is not a reg"); + assert(src.isReg() && "src of a movrr is not a reg"); + + SrcReg = src.getReg(); + DestReg = dest.getReg(); + return true; + } + + return false; +} + +bool NVPTXInstrInfo::isReadSpecialReg(MachineInstr &MI) const +{ + switch (MI.getOpcode()) { + default: return false; + case NVPTX::INT_PTX_SREG_NTID_X: + case NVPTX::INT_PTX_SREG_NTID_Y: + case NVPTX::INT_PTX_SREG_NTID_Z: + case NVPTX::INT_PTX_SREG_TID_X: + case NVPTX::INT_PTX_SREG_TID_Y: + case NVPTX::INT_PTX_SREG_TID_Z: + case NVPTX::INT_PTX_SREG_CTAID_X: + case NVPTX::INT_PTX_SREG_CTAID_Y: + case NVPTX::INT_PTX_SREG_CTAID_Z: + case NVPTX::INT_PTX_SREG_NCTAID_X: + case NVPTX::INT_PTX_SREG_NCTAID_Y: + case NVPTX::INT_PTX_SREG_NCTAID_Z: + case NVPTX::INT_PTX_SREG_WARPSIZE: + return true; + } +} + + +bool NVPTXInstrInfo::isLoadInstr(const MachineInstr &MI, + unsigned &AddrSpace) const { + bool isLoad = false; + unsigned TSFlags = (MI.getDesc().TSFlags & NVPTX::isLoadMask) >> + NVPTX::isLoadShift; + isLoad = (TSFlags == 1); + if (isLoad) + AddrSpace = getLdStCodeAddrSpace(MI); + return isLoad; +} + +bool NVPTXInstrInfo::isStoreInstr(const MachineInstr &MI, + unsigned &AddrSpace) const { + bool isStore = false; + unsigned TSFlags = (MI.getDesc().TSFlags & NVPTX::isStoreMask) >> + NVPTX::isStoreShift; + isStore = (TSFlags == 1); + if (isStore) + AddrSpace = getLdStCodeAddrSpace(MI); + return isStore; +} + + +bool NVPTXInstrInfo::CanTailMerge(const MachineInstr *MI) const { + unsigned addrspace = 0; + if (MI->getOpcode() == NVPTX::INT_CUDA_SYNCTHREADS) + return false; + if (isLoadInstr(*MI, addrspace)) + if (addrspace == NVPTX::PTXLdStInstCode::SHARED) + return false; + if (isStoreInstr(*MI, addrspace)) + if (addrspace == NVPTX::PTXLdStInstCode::SHARED) + return false; + return true; +} + + +/// AnalyzeBranch - Analyze the branching code at the end of MBB, returning +/// true if it cannot be understood (e.g. it's a switch dispatch or isn't +/// implemented for a target). Upon success, this returns false and returns +/// with the following information in various cases: +/// +/// 1. If this block ends with no branches (it just falls through to its succ) +/// just return false, leaving TBB/FBB null. +/// 2. If this block ends with only an unconditional branch, it sets TBB to be +/// the destination block. +/// 3. If this block ends with a conditional branch and it falls through to +/// a successor block, it sets TBB to be the branch destination block and a +/// list of operands that evaluate the condition. These +/// operands can be passed to other TargetInstrInfo methods to create new +/// branches. +/// 4. If this block ends with a conditional branch and an unconditional +/// branch, it returns the 'true' destination in TBB, the 'false' destination +/// in FBB, and a list of operands that evaluate the condition. 
These +/// operands can be passed to other TargetInstrInfo methods to create new +/// branches. +/// +/// Note that RemoveBranch and InsertBranch must be implemented to support +/// cases where this method returns success. +/// +bool NVPTXInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { + // If the block has no terminators, it just falls into the block after it. + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) + return false; + + // Get the last instruction in the block. + MachineInstr *LastInst = I; + + // If there is only one terminator instruction, process it. + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + if (LastInst->getOpcode() == NVPTX::GOTO) { + TBB = LastInst->getOperand(0).getMBB(); + return false; + } else if (LastInst->getOpcode() == NVPTX::CBranch) { + // Block ends with fall-through condbranch. + TBB = LastInst->getOperand(1).getMBB(); + Cond.push_back(LastInst->getOperand(0)); + return false; + } + // Otherwise, don't know what this is. + return true; + } + + // Get the instruction before it if it's a terminator. + MachineInstr *SecondLastInst = I; + + // If there are three terminators, we don't know what sort of block this is. + if (SecondLastInst && I != MBB.begin() && + isUnpredicatedTerminator(--I)) + return true; + + // If the block ends with NVPTX::GOTO and NVPTX:CBranch, handle it. + if (SecondLastInst->getOpcode() == NVPTX::CBranch && + LastInst->getOpcode() == NVPTX::GOTO) { + TBB = SecondLastInst->getOperand(1).getMBB(); + Cond.push_back(SecondLastInst->getOperand(0)); + FBB = LastInst->getOperand(0).getMBB(); + return false; + } + + // If the block ends with two NVPTX:GOTOs, handle it. The second one is not + // executed, so remove it. + if (SecondLastInst->getOpcode() == NVPTX::GOTO && + LastInst->getOpcode() == NVPTX::GOTO) { + TBB = SecondLastInst->getOperand(0).getMBB(); + I = LastInst; + if (AllowModify) + I->eraseFromParent(); + return false; + } + + // Otherwise, can't handle this. + return true; +} + +unsigned NVPTXInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin()) return 0; + --I; + if (I->getOpcode() != NVPTX::GOTO && I->getOpcode() != NVPTX::CBranch) + return 0; + + // Remove the branch. + I->eraseFromParent(); + + I = MBB.end(); + + if (I == MBB.begin()) return 1; + --I; + if (I->getOpcode() != NVPTX::CBranch) + return 1; + + // Remove the branch. + I->eraseFromParent(); + return 2; +} + +unsigned +NVPTXInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const { + // Shouldn't be a fall through. + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + assert((Cond.size() == 1 || Cond.size() == 0) && + "NVPTX branch conditions have two components!"); + + // One-way branch. + if (FBB == 0) { + if (Cond.empty()) // Unconditional branch + BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(TBB); + else // Conditional branch + BuildMI(&MBB, DL, get(NVPTX::CBranch)) + .addReg(Cond[0].getReg()).addMBB(TBB); + return 1; + } + + // Two-way Conditional Branch. 
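+  // Assuming CBranch prints as a predicated bra and GOTO as an unconditional
+  // bra.uni, the pair built below should come out as, e.g.:
+  //   @%p1 bra $TBB;
+  //   bra.uni $FBB;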
+ BuildMI(&MBB, DL, get(NVPTX::CBranch)) + .addReg(Cond[0].getReg()).addMBB(TBB); + BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(FBB); + return 2; +} diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.h b/lib/Target/NVPTX/NVPTXInstrInfo.h new file mode 100644 index 0000000..7b8e218 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXInstrInfo.h @@ -0,0 +1,83 @@ +//===- NVPTXInstrInfo.h - NVPTX Instruction Information ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the NVPTX implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef NVPTXINSTRUCTIONINFO_H +#define NVPTXINSTRUCTIONINFO_H + +#include "NVPTX.h" +#include "NVPTXRegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" + +#define GET_INSTRINFO_HEADER +#include "NVPTXGenInstrInfo.inc" + +namespace llvm { + +class NVPTXInstrInfo : public NVPTXGenInstrInfo +{ + NVPTXTargetMachine &TM; + const NVPTXRegisterInfo RegInfo; +public: + explicit NVPTXInstrInfo(NVPTXTargetMachine &TM); + + virtual const NVPTXRegisterInfo &getRegisterInfo() const { return RegInfo; } + + /* The following virtual functions are used in register allocation. + * They are not implemented because the existing interface and the logic + * at the caller side do not work for the elementized vector load and store. + * + * virtual unsigned isLoadFromStackSlot(const MachineInstr *MI, + * int &FrameIndex) const; + * virtual unsigned isStoreToStackSlot(const MachineInstr *MI, + * int &FrameIndex) const; + * virtual void storeRegToStackSlot(MachineBasicBlock &MBB, + * MachineBasicBlock::iterator MBBI, + * unsigned SrcReg, bool isKill, int FrameIndex, + * const TargetRegisterClass *RC) const; + * virtual void loadRegFromStackSlot(MachineBasicBlock &MBB, + * MachineBasicBlock::iterator MBBI, + * unsigned DestReg, int FrameIndex, + * const TargetRegisterClass *RC) const; + */ + + virtual void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; + virtual bool isMoveInstr(const MachineInstr &MI, + unsigned &SrcReg, + unsigned &DestReg) const; + bool isLoadInstr(const MachineInstr &MI, unsigned &AddrSpace) const; + bool isStoreInstr(const MachineInstr &MI, unsigned &AddrSpace) const; + bool isReadSpecialReg(MachineInstr &MI) const; + + virtual bool CanTailMerge(const MachineInstr *MI) const; + // Branch analysis. 
+ virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const; + virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; + virtual unsigned InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const; + unsigned getLdStCodeAddrSpace(const MachineInstr &MI) const { + return MI.getOperand(2).getImm(); + } + +}; + +} // namespace llvm + +#endif diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td new file mode 100644 index 0000000..8a410b8 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -0,0 +1,2837 @@ +//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the PTX instructions in TableGen format. +// +//===----------------------------------------------------------------------===// + +include "NVPTXInstrFormats.td" + +// A NOP instruction +def NOP : NVPTXInst<(outs), (ins), "", []>; + +// List of vector specific properties +def isVecLD : VecInstTypeEnum<1>; +def isVecST : VecInstTypeEnum<2>; +def isVecBuild : VecInstTypeEnum<3>; +def isVecShuffle : VecInstTypeEnum<4>; +def isVecExtract : VecInstTypeEnum<5>; +def isVecInsert : VecInstTypeEnum<6>; +def isVecDest : VecInstTypeEnum<7>; +def isVecOther : VecInstTypeEnum<15>; + +//===----------------------------------------------------------------------===// +// NVPTX Operand Definitions. 
+//===----------------------------------------------------------------------===//
+
+def brtarget : Operand<OtherVT>;
+
+//===----------------------------------------------------------------------===//
+// NVPTX Instruction Predicate Definitions
+//===----------------------------------------------------------------------===//
+
+
+def hasAtomRedG32 : Predicate<"Subtarget.hasAtomRedG32()">;
+def hasAtomRedS32 : Predicate<"Subtarget.hasAtomRedS32()">;
+def hasAtomRedGen32 : Predicate<"Subtarget.hasAtomRedGen32()">;
+def useAtomRedG32forGen32 :
+  Predicate<"!Subtarget.hasAtomRedGen32() && Subtarget.hasAtomRedG32()">;
+def hasBrkPt : Predicate<"Subtarget.hasBrkPt()">;
+def hasAtomRedG64 : Predicate<"Subtarget.hasAtomRedG64()">;
+def hasAtomRedS64 : Predicate<"Subtarget.hasAtomRedS64()">;
+def hasAtomRedGen64 : Predicate<"Subtarget.hasAtomRedGen64()">;
+def useAtomRedG64forGen64 :
+  Predicate<"!Subtarget.hasAtomRedGen64() && Subtarget.hasAtomRedG64()">;
+def hasAtomAddF32 : Predicate<"Subtarget.hasAtomAddF32()">;
+def hasVote : Predicate<"Subtarget.hasVote()">;
+def hasDouble : Predicate<"Subtarget.hasDouble()">;
+def reqPTX20 : Predicate<"Subtarget.reqPTX20()">;
+def hasLDU : Predicate<"Subtarget.hasLDU()">;
+def hasGenericLdSt : Predicate<"Subtarget.hasGenericLdSt()">;
+
+def doF32FTZ : Predicate<"UseF32FTZ">;
+
+def doFMAF32 : Predicate<"doFMAF32">;
+def doFMAF32_ftz : Predicate<"(doFMAF32 && UseF32FTZ)">;
+def doFMAF32AGG : Predicate<"doFMAF32AGG">;
+def doFMAF32AGG_ftz : Predicate<"(doFMAF32AGG && UseF32FTZ)">;
+def doFMAF64 : Predicate<"doFMAF64">;
+def doFMAF64AGG : Predicate<"doFMAF64AGG">;
+def doFMADF32 : Predicate<"doFMADF32">;
+def doFMADF32_ftz : Predicate<"(doFMADF32 && UseF32FTZ)">;
+
+def doMulWide : Predicate<"doMulWide">;
+
+def allowFMA : Predicate<"allowFMA">;
+def allowFMA_ftz : Predicate<"(allowFMA && UseF32FTZ)">;
+
+def do_DIVF32_APPROX : Predicate<"do_DIVF32_PREC==0">;
+def do_DIVF32_FULL : Predicate<"do_DIVF32_PREC==1">;
+
+def hasHWROT32 : Predicate<"Subtarget.hasHWROT32()">;
+
+def true : Predicate<"1">;
+
+//===----------------------------------------------------------------------===//
+// Special Handling for 8-bit Operands and Operations
+//
+// PTX supports 8-bit signed and unsigned types, but does not support 8-bit
+// operations (like add, shift, etc.) except for ld/st/cvt. SASS does not have
+// 8-bit registers.
+//
+// PTX ld, st and cvt instructions permit source and destination data operands
+// to be wider than the instruction-type size, so that narrow values may be
+// loaded, stored, and converted using regular-width registers.
+//
+// So in PTX generation, we
+// - always use 16-bit registers in place of 8-bit registers.
+//   (8-bit variables should stay as 8-bit as they represent memory layout.)
+// - for the following 8-bit operations, we sign-ext/zero-ext the 8-bit values
+//   before the operation:
+//   . div
+//   . rem
+//   . neg (sign)
+//   . set, setp
+//   . shr
+//
+// We patch these operations by inserting the cvt instructions into the
+// asm strings of the affected instructions.
+//
+// Since vector operations, except for ld/st, are eventually elementized, we
+// do not need to special-case the vector 8-bit operations.
+//
+//
+//===----------------------------------------------------------------------===//
+
+// Generate string block like
+// {
+//   .reg .s16 %temp1;
+//   .reg .s16 %temp2;
+//   cvt.s16.s8 %temp1, %a;
+//   cvt.s16.s8 %temp2, %b;
+//   opc.s16 %dst, %temp1, %temp2;
+// }
+// when OpcStr=opc.s TypeStr=s16 CVTStr=cvt.s16.s8
+class Handle_i8rr<string OpcStr, string TypeStr, string CVTStr> {
+  string s = !strconcat("{{\n\t",
+             !strconcat(".reg .", !strconcat(TypeStr,
+             !strconcat(" \t%temp1;\n\t",
+             !strconcat(".reg .", !strconcat(TypeStr,
+             !strconcat(" \t%temp2;\n\t",
+             !strconcat(CVTStr, !strconcat(" \t%temp1, $a;\n\t",
+             !strconcat(CVTStr, !strconcat(" \t%temp2, $b;\n\t",
+             !strconcat(OpcStr, "16 \t$dst, %temp1, %temp2;\n\t}}"))))))))))));
+}
+
+// Generate string block like
+// {
+//   .reg .s16 %temp1;
+//   .reg .s16 %temp2;
+//   cvt.s16.s8 %temp1, %a;
+//   mov.b16 %temp2, %b;
+//   cvt.s16.s8 %temp2, %temp2;
+//   opc.s16 %dst, %temp1, %temp2;
+// }
+// when OpcStr=opc.s TypeStr=s16 CVTStr=cvt.s16.s8
+class Handle_i8ri<string OpcStr, string TypeStr, string CVTStr> {
+  string s = !strconcat("{{\n\t",
+             !strconcat(".reg .", !strconcat(TypeStr,
+             !strconcat(" \t%temp1;\n\t",
+             !strconcat(".reg .",
+             !strconcat(TypeStr, !strconcat(" \t%temp2;\n\t",
+             !strconcat(CVTStr, !strconcat(" \t%temp1, $a;\n\t",
+             !strconcat("mov.b16 \t%temp2, $b;\n\t",
+             !strconcat(CVTStr, !strconcat(" \t%temp2, %temp2;\n\t",
+             !strconcat(OpcStr, "16 \t$dst, %temp1, %temp2;\n\t}}")))))))))))));
+}
+
+// Generate string block like
+// {
+//   .reg .s16 %temp1;
+//   .reg .s16 %temp2;
+//   mov.b16 %temp1, %a;
+//   cvt.s16.s8 %temp1, %temp1;
+//   cvt.s16.s8 %temp2, %b;
+//   opc.s16 %dst, %temp1, %temp2;
+// }
+// when OpcStr=opc.s TypeStr=s16 CVTStr=cvt.s16.s8
+class Handle_i8ir<string OpcStr, string TypeStr, string CVTStr> {
+  string s = !strconcat("{{\n\t",
+             !strconcat(".reg .", !strconcat(TypeStr,
+             !strconcat(" \t%temp1;\n\t",
+             !strconcat(".reg .", !strconcat(TypeStr,
+             !strconcat(" \t%temp2;\n\t",
+             !strconcat("mov.b16 \t%temp1, $a;\n\t",
+             !strconcat(CVTStr, !strconcat(" \t%temp1, %temp1;\n\t",
+             !strconcat(CVTStr, !strconcat(" \t%temp2, $b;\n\t",
+             !strconcat(OpcStr, "16 \t$dst, %temp1, %temp2;\n\t}}")))))))))))));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Some Common Instruction Class Templates
+//===----------------------------------------------------------------------===//
+
+multiclass I3<string OpcStr, SDNode OpNode> {
+  def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
+                        !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+                        [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+                          Int64Regs:$b))]>;
+  def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+                        !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+                        [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+  def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+                        !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+                        [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+                          Int32Regs:$b))]>;
+  def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+                        !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+                        [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+  def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+                        !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+                        [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+                          Int16Regs:$b))]>;
+  def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+                        !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+                        [(set Int16Regs:$dst,
(OpNode Int16Regs:$a, (imm):$b))]>; + def i8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int8Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>; + def i8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int8Regs:$dst, (OpNode Int8Regs:$a, (imm):$b))]>; +} + +multiclass I3_i8<string OpcStr, SDNode OpNode, string TypeStr, string CVTStr> { + def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, + Int64Regs:$b))]>; + def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>; + def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, + Int32Regs:$b))]>; + def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>; + def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, + Int16Regs:$b))]>; + def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>; + def i8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b), + Handle_i8rr<OpcStr, TypeStr, CVTStr>.s, + [(set Int8Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>; + def i8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b), + Handle_i8ri<OpcStr, TypeStr, CVTStr>.s, + [(set Int8Regs:$dst, (OpNode Int8Regs:$a, (imm):$b))]>; +} + +multiclass I3_noi8<string OpcStr, SDNode OpNode> { + def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, + Int64Regs:$b))]>; + def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>; + def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, + Int32Regs:$b))]>; + def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>; + def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, + Int16Regs:$b))]>; + def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>; +} + +multiclass ADD_SUB_INT_32<string OpcStr, SDNode OpNode> { + def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, + Int32Regs:$b), + !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, + Int32Regs:$b))]>; + def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, 
imm:$b))]>; +} + +multiclass F3<string OpcStr, SDNode OpNode> { + def f64rr : NVPTXInst<(outs Float64Regs:$dst), + (ins Float64Regs:$a, Float64Regs:$b), + !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), + [(set Float64Regs:$dst, + (OpNode Float64Regs:$a, Float64Regs:$b))]>, + Requires<[allowFMA]>; + def f64ri : NVPTXInst<(outs Float64Regs:$dst), + (ins Float64Regs:$a, f64imm:$b), + !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), + [(set Float64Regs:$dst, + (OpNode Float64Regs:$a, fpimm:$b))]>, + Requires<[allowFMA]>; + def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, + (OpNode Float32Regs:$a, Float32Regs:$b))]>, + Requires<[allowFMA_ftz]>; + def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, + (OpNode Float32Regs:$a, fpimm:$b))]>, + Requires<[allowFMA_ftz]>; + def f32rr : NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, + (OpNode Float32Regs:$a, Float32Regs:$b))]>, + Requires<[allowFMA]>; + def f32ri : NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, + (OpNode Float32Regs:$a, fpimm:$b))]>, + Requires<[allowFMA]>; +} + +multiclass F3_rn<string OpcStr, SDNode OpNode> { + def f64rr : NVPTXInst<(outs Float64Regs:$dst), + (ins Float64Regs:$a, Float64Regs:$b), + !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"), + [(set Float64Regs:$dst, + (OpNode Float64Regs:$a, Float64Regs:$b))]>; + def f64ri : NVPTXInst<(outs Float64Regs:$dst), + (ins Float64Regs:$a, f64imm:$b), + !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"), + [(set Float64Regs:$dst, + (OpNode Float64Regs:$a, fpimm:$b))]>; + def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, + (OpNode Float32Regs:$a, Float32Regs:$b))]>, + Requires<[doF32FTZ]>; + def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, + (OpNode Float32Regs:$a, fpimm:$b))]>, + Requires<[doF32FTZ]>; + def f32rr : NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, + (OpNode Float32Regs:$a, Float32Regs:$b))]>; + def f32ri : NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, + (OpNode Float32Regs:$a, fpimm:$b))]>; +} + +multiclass F2<string OpcStr, SDNode OpNode> { + def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a), + !strconcat(OpcStr, ".f64 \t$dst, $a;"), + [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>; + def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a), + !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>, + Requires<[doF32FTZ]>; + def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a), + !strconcat(OpcStr, ".f32 \t$dst, $a;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>; +} + +//===----------------------------------------------------------------------===// +// NVPTX Instructions. 
+//===----------------------------------------------------------------------===// + +//----------------------------------- +// Integer Arithmetic +//----------------------------------- + +multiclass ADD_SUB_i1<SDNode OpNode> { + def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b), + "xor.pred \t$dst, $a, $b;", + [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>; + def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b), + "xor.pred \t$dst, $a, $b;", + [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>; +} + +defm ADD_i1 : ADD_SUB_i1<add>; +defm SUB_i1 : ADD_SUB_i1<sub>; + + +defm ADD : I3<"add.s", add>; +defm SUB : I3<"sub.s", sub>; + +defm ADDCC : ADD_SUB_INT_32<"add.cc", addc>; +defm SUBCC : ADD_SUB_INT_32<"sub.cc", subc>; + +defm ADDCCC : ADD_SUB_INT_32<"addc.cc", adde>; +defm SUBCCC : ADD_SUB_INT_32<"subc.cc", sube>; + +//mul.wide PTX instruction +def SInt32Const : PatLeaf<(imm), [{ + const APInt &v = N->getAPIntValue(); + if (v.isSignedIntN(32)) + return true; + return false; +}]>; + +def UInt32Const : PatLeaf<(imm), [{ + const APInt &v = N->getAPIntValue(); + if (v.isIntN(32)) + return true; + return false; +}]>; + +def SInt16Const : PatLeaf<(imm), [{ + const APInt &v = N->getAPIntValue(); + if (v.isSignedIntN(16)) + return true; + return false; +}]>; + +def UInt16Const : PatLeaf<(imm), [{ + const APInt &v = N->getAPIntValue(); + if (v.isIntN(16)) + return true; + return false; +}]>; + +def Int5Const : PatLeaf<(imm), [{ + const APInt &v = N->getAPIntValue(); + // Check if 0 <= v < 32 + // Only then the result from (x << v) will be i32 + if (v.sge(0) && v.slt(32)) + return true; + return false; +}]>; + +def Int4Const : PatLeaf<(imm), [{ + const APInt &v = N->getAPIntValue(); + // Check if 0 <= v < 16 + // Only then the result from (x << v) will be i16 + if (v.sge(0) && v.slt(16)) + return true; + return false; +}]>; + +def SHL2MUL32 : SDNodeXForm<imm, [{ + const APInt &v = N->getAPIntValue(); + APInt temp(32, 1); + return CurDAG->getTargetConstant(temp.shl(v), MVT::i32); +}]>; + +def SHL2MUL16 : SDNodeXForm<imm, [{ + const APInt &v = N->getAPIntValue(); + APInt temp(16, 1); + return CurDAG->getTargetConstant(temp.shl(v), MVT::i16); +}]>; + +def MULWIDES64 : NVPTXInst<(outs Int64Regs:$dst), + (ins Int32Regs:$a, Int32Regs:$b), + "mul.wide.s32 \t$dst, $a, $b;", []>; +def MULWIDES64Imm : NVPTXInst<(outs Int64Regs:$dst), + (ins Int32Regs:$a, i64imm:$b), + "mul.wide.s32 \t$dst, $a, $b;", []>; + +def MULWIDEU64 : NVPTXInst<(outs Int64Regs:$dst), + (ins Int32Regs:$a, Int32Regs:$b), + "mul.wide.u32 \t$dst, $a, $b;", []>; +def MULWIDEU64Imm : NVPTXInst<(outs Int64Regs:$dst), + (ins Int32Regs:$a, i64imm:$b), + "mul.wide.u32 \t$dst, $a, $b;", []>; + +def MULWIDES32 : NVPTXInst<(outs Int32Regs:$dst), + (ins Int16Regs:$a, Int16Regs:$b), + "mul.wide.s16 \t$dst, $a, $b;", []>; +def MULWIDES32Imm : NVPTXInst<(outs Int32Regs:$dst), + (ins Int16Regs:$a, i32imm:$b), + "mul.wide.s16 \t$dst, $a, $b;", []>; + +def MULWIDEU32 : NVPTXInst<(outs Int32Regs:$dst), + (ins Int16Regs:$a, Int16Regs:$b), + "mul.wide.u16 \t$dst, $a, $b;", []>; +def MULWIDEU32Imm : NVPTXInst<(outs Int32Regs:$dst), + (ins Int16Regs:$a, i32imm:$b), + "mul.wide.u16 \t$dst, $a, $b;", []>; + +def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)), + (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>, + Requires<[doMulWide]>; +def : Pat<(shl (zext Int32Regs:$a), (i32 Int5Const:$b)), + (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>, + Requires<[doMulWide]>; + +def : Pat<(shl (sext 
Int16Regs:$a), (i16 Int4Const:$b)), + (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>, + Requires<[doMulWide]>; +def : Pat<(shl (zext Int16Regs:$a), (i16 Int4Const:$b)), + (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>, + Requires<[doMulWide]>; + +def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)), + (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)), + (MULWIDES64Imm Int32Regs:$a, (i64 SInt32Const:$b))>, + Requires<[doMulWide]>; + +def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)), + (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, Requires<[doMulWide]>; +def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)), + (MULWIDEU64Imm Int32Regs:$a, (i64 UInt32Const:$b))>, + Requires<[doMulWide]>; + +def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)), + (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>, Requires<[doMulWide]>; +def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)), + (MULWIDES32Imm Int16Regs:$a, (i32 SInt16Const:$b))>, + Requires<[doMulWide]>; + +def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)), + (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, Requires<[doMulWide]>; +def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)), + (MULWIDEU32Imm Int16Regs:$a, (i32 UInt16Const:$b))>, + Requires<[doMulWide]>; + +defm MULT : I3<"mul.lo.s", mul>; + +defm MULTHS : I3_noi8<"mul.hi.s", mulhs>; +defm MULTHU : I3_noi8<"mul.hi.u", mulhu>; +def MULTHSi8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b), + !strconcat("{{ \n\t", + !strconcat(".reg \t.s16 temp1; \n\t", + !strconcat(".reg \t.s16 temp2; \n\t", + !strconcat("cvt.s16.s8 \ttemp1, $a; \n\t", + !strconcat("cvt.s16.s8 \ttemp2, $b; \n\t", + !strconcat("mul.lo.s16 \t$dst, temp1, temp2; \n\t", + !strconcat("shr.s16 \t$dst, $dst, 8; \n\t", + !strconcat("}}", "")))))))), + [(set Int8Regs:$dst, (mulhs Int8Regs:$a, Int8Regs:$b))]>; +def MULTHSi8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b), + !strconcat("{{ \n\t", + !strconcat(".reg \t.s16 temp1; \n\t", + !strconcat(".reg \t.s16 temp2; \n\t", + !strconcat("cvt.s16.s8 \ttemp1, $a; \n\t", + !strconcat("mov.b16 \ttemp2, $b; \n\t", + !strconcat("cvt.s16.s8 \ttemp2, temp2; \n\t", + !strconcat("mul.lo.s16 \t$dst, temp1, temp2; \n\t", + !strconcat("shr.s16 \t$dst, $dst, 8; \n\t", + !strconcat("}}", ""))))))))), + [(set Int8Regs:$dst, (mulhs Int8Regs:$a, imm:$b))]>; +def MULTHUi8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b), + !strconcat("{{ \n\t", + !strconcat(".reg \t.u16 temp1; \n\t", + !strconcat(".reg \t.u16 temp2; \n\t", + !strconcat("cvt.u16.u8 \ttemp1, $a; \n\t", + !strconcat("cvt.u16.u8 \ttemp2, $b; \n\t", + !strconcat("mul.lo.u16 \t$dst, temp1, temp2; \n\t", + !strconcat("shr.u16 \t$dst, $dst, 8; \n\t", + !strconcat("}}", "")))))))), + [(set Int8Regs:$dst, (mulhu Int8Regs:$a, Int8Regs:$b))]>; +def MULTHUi8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b), + !strconcat("{{ \n\t", + !strconcat(".reg \t.u16 temp1; \n\t", + !strconcat(".reg \t.u16 temp2; \n\t", + !strconcat("cvt.u16.u8 \ttemp1, $a; \n\t", + !strconcat("mov.b16 \ttemp2, $b; \n\t", + !strconcat("cvt.u16.u8 \ttemp2, temp2; \n\t", + !strconcat("mul.lo.u16 \t$dst, temp1, temp2; \n\t", + !strconcat("shr.u16 \t$dst, $dst, 8; \n\t", + !strconcat("}}", ""))))))))), + [(set Int8Regs:$dst, (mulhu Int8Regs:$a, imm:$b))]>; + + +defm SDIV : I3_i8<"div.s", sdiv, "s16", "cvt.s16.s8">; +defm UDIV : I3_i8<"div.u", udiv, "u16", "cvt.u16.u8">; + +defm SREM : I3_i8<"rem.s", srem, "s16", 
"cvt.s16.s8">; +// The ri version will not be selected as DAGCombiner::visitSREM will lower it. +defm UREM : I3_i8<"rem.u", urem, "u16", "cvt.u16.u8">; +// The ri version will not be selected as DAGCombiner::visitUREM will lower it. + +def MAD8rrr : NVPTXInst<(outs Int8Regs:$dst), + (ins Int8Regs:$a, Int8Regs:$b, Int8Regs:$c), + "mad.lo.s16 \t$dst, $a, $b, $c;", + [(set Int8Regs:$dst, (add (mul Int8Regs:$a, Int8Regs:$b), + Int8Regs:$c))]>; +def MAD8rri : NVPTXInst<(outs Int8Regs:$dst), + (ins Int8Regs:$a, Int8Regs:$b, i8imm:$c), + "mad.lo.s16 \t$dst, $a, $b, $c;", + [(set Int8Regs:$dst, (add (mul Int8Regs:$a, Int8Regs:$b), + imm:$c))]>; +def MAD8rir : NVPTXInst<(outs Int8Regs:$dst), + (ins Int8Regs:$a, i8imm:$b, Int8Regs:$c), + "mad.lo.s16 \t$dst, $a, $b, $c;", + [(set Int8Regs:$dst, (add (mul Int8Regs:$a, imm:$b), + Int8Regs:$c))]>; +def MAD8rii : NVPTXInst<(outs Int8Regs:$dst), + (ins Int8Regs:$a, i8imm:$b, i8imm:$c), + "mad.lo.s16 \t$dst, $a, $b, $c;", + [(set Int8Regs:$dst, (add (mul Int8Regs:$a, imm:$b), + imm:$c))]>; + +def MAD16rrr : NVPTXInst<(outs Int16Regs:$dst), + (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), + "mad.lo.s16 \t$dst, $a, $b, $c;", + [(set Int16Regs:$dst, (add + (mul Int16Regs:$a, Int16Regs:$b), Int16Regs:$c))]>; +def MAD16rri : NVPTXInst<(outs Int16Regs:$dst), + (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c), + "mad.lo.s16 \t$dst, $a, $b, $c;", + [(set Int16Regs:$dst, (add + (mul Int16Regs:$a, Int16Regs:$b), imm:$c))]>; +def MAD16rir : NVPTXInst<(outs Int16Regs:$dst), + (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c), + "mad.lo.s16 \t$dst, $a, $b, $c;", + [(set Int16Regs:$dst, (add + (mul Int16Regs:$a, imm:$b), Int16Regs:$c))]>; +def MAD16rii : NVPTXInst<(outs Int16Regs:$dst), + (ins Int16Regs:$a, i16imm:$b, i16imm:$c), + "mad.lo.s16 \t$dst, $a, $b, $c;", + [(set Int16Regs:$dst, (add (mul Int16Regs:$a, imm:$b), + imm:$c))]>; + +def MAD32rrr : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c), + "mad.lo.s32 \t$dst, $a, $b, $c;", + [(set Int32Regs:$dst, (add + (mul Int32Regs:$a, Int32Regs:$b), Int32Regs:$c))]>; +def MAD32rri : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c), + "mad.lo.s32 \t$dst, $a, $b, $c;", + [(set Int32Regs:$dst, (add + (mul Int32Regs:$a, Int32Regs:$b), imm:$c))]>; +def MAD32rir : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c), + "mad.lo.s32 \t$dst, $a, $b, $c;", + [(set Int32Regs:$dst, (add + (mul Int32Regs:$a, imm:$b), Int32Regs:$c))]>; +def MAD32rii : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$a, i32imm:$b, i32imm:$c), + "mad.lo.s32 \t$dst, $a, $b, $c;", + [(set Int32Regs:$dst, (add + (mul Int32Regs:$a, imm:$b), imm:$c))]>; + +def MAD64rrr : NVPTXInst<(outs Int64Regs:$dst), + (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c), + "mad.lo.s64 \t$dst, $a, $b, $c;", + [(set Int64Regs:$dst, (add + (mul Int64Regs:$a, Int64Regs:$b), Int64Regs:$c))]>; +def MAD64rri : NVPTXInst<(outs Int64Regs:$dst), + (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c), + "mad.lo.s64 \t$dst, $a, $b, $c;", + [(set Int64Regs:$dst, (add + (mul Int64Regs:$a, Int64Regs:$b), imm:$c))]>; +def MAD64rir : NVPTXInst<(outs Int64Regs:$dst), + (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c), + "mad.lo.s64 \t$dst, $a, $b, $c;", + [(set Int64Regs:$dst, (add + (mul Int64Regs:$a, imm:$b), Int64Regs:$c))]>; +def MAD64rii : NVPTXInst<(outs Int64Regs:$dst), + (ins Int64Regs:$a, i64imm:$b, i64imm:$c), + "mad.lo.s64 \t$dst, $a, $b, $c;", + [(set Int64Regs:$dst, (add + (mul Int64Regs:$a, imm:$b), imm:$c))]>; + 
+
+def INEG8 : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$src),
+                      !strconcat("cvt.s16.s8 \t$dst, $src;\n\t",
+                                 "neg.s16 \t$dst, $dst;"),
+                      [(set Int8Regs:$dst, (ineg Int8Regs:$src))]>;
+def INEG16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+                       "neg.s16 \t$dst, $src;",
+                       [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
+def INEG32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+                       "neg.s32 \t$dst, $src;",
+                       [(set Int32Regs:$dst, (ineg Int32Regs:$src))]>;
+def INEG64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+                       "neg.s64 \t$dst, $src;",
+                       [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;
+
+//-----------------------------------
+// Floating Point Arithmetic
+//-----------------------------------
+
+// Constant 1.0f
+def FloatConst1 : PatLeaf<(fpimm), [{
+  if (&(N->getValueAPF().getSemantics()) != &llvm::APFloat::IEEEsingle)
+    return false;
+  float f = (float)N->getValueAPF().convertToFloat();
+  return (f==1.0f);
+}]>;
+// Constant (double)1.0
+def DoubleConst1 : PatLeaf<(fpimm), [{
+  if (&(N->getValueAPF().getSemantics()) != &llvm::APFloat::IEEEdouble)
+    return false;
+  double d = (double)N->getValueAPF().convertToDouble();
+  return (d==1.0);
+}]>;
+
+defm FADD : F3<"add", fadd>;
+defm FSUB : F3<"sub", fsub>;
+defm FMUL : F3<"mul", fmul>;
+
+defm FADD_rn : F3_rn<"add", fadd>;
+defm FSUB_rn : F3_rn<"sub", fsub>;
+defm FMUL_rn : F3_rn<"mul", fmul>;
+
+defm FABS : F2<"abs", fabs>;
+defm FNEG : F2<"neg", fneg>;
+defm FSQRT : F2<"sqrt.rn", fsqrt>;
+
+//
+// F64 division
+//
+def FDIV641r : NVPTXInst<(outs Float64Regs:$dst),
+                         (ins f64imm:$a, Float64Regs:$b),
+                         "rcp.rn.f64 \t$dst, $b;",
+                         [(set Float64Regs:$dst,
+                           (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
+def FDIV64rr : NVPTXInst<(outs Float64Regs:$dst),
+                         (ins Float64Regs:$a, Float64Regs:$b),
+                         "div.rn.f64 \t$dst, $a, $b;",
+                         [(set Float64Regs:$dst,
+                           (fdiv Float64Regs:$a, Float64Regs:$b))]>;
+def FDIV64ri : NVPTXInst<(outs Float64Regs:$dst),
+                         (ins Float64Regs:$a, f64imm:$b),
+                         "div.rn.f64 \t$dst, $a, $b;",
+                         [(set Float64Regs:$dst,
+                           (fdiv Float64Regs:$a, fpimm:$b))]>;
+
+//
+// F32 Approximate reciprocal
+//
+def FDIV321r_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                             (ins f32imm:$a, Float32Regs:$b),
+                             "rcp.approx.ftz.f32 \t$dst, $b;",
+                             [(set Float32Regs:$dst,
+                               (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+                   Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV321r : NVPTXInst<(outs Float32Regs:$dst),
+                         (ins f32imm:$a, Float32Regs:$b),
+                         "rcp.approx.f32 \t$dst, $b;",
+                         [(set Float32Regs:$dst,
+                           (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+               Requires<[do_DIVF32_APPROX]>;
+//
+// F32 Approximate division
+//
+def FDIV32approxrr_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                                   (ins Float32Regs:$a, Float32Regs:$b),
+                                   "div.approx.ftz.f32 \t$dst, $a, $b;",
+                                   [(set Float32Regs:$dst,
+                                     (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+                         Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV32approxrr : NVPTXInst<(outs Float32Regs:$dst),
+                               (ins Float32Regs:$a, Float32Regs:$b),
+                               "div.approx.f32 \t$dst, $a, $b;",
+                               [(set Float32Regs:$dst,
+                                 (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+                     Requires<[do_DIVF32_APPROX]>;
+//
+// F32 Semi-accurate reciprocal
+//
+// rcp.approx gives the same result as div.full(1.0f, a) and is faster.
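+// (Note, added for exposition: matching the 1.0 constant operand with
+// FloatConst1 / DoubleConst1 is what lets "1.0 / b" select the rcp.* forms
+// above and below instead of a full div.*.)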
+//
+def FDIV321r_approx_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                                    (ins f32imm:$a, Float32Regs:$b),
+                                    "rcp.approx.ftz.f32 \t$dst, $b;",
+                                    [(set Float32Regs:$dst,
+                                      (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+                          Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV321r_approx : NVPTXInst<(outs Float32Regs:$dst),
+                                (ins f32imm:$a, Float32Regs:$b),
+                                "rcp.approx.f32 \t$dst, $b;",
+                                [(set Float32Regs:$dst,
+                                  (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+                      Requires<[do_DIVF32_FULL]>;
+//
+// F32 Semi-accurate division
+//
+def FDIV32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                             (ins Float32Regs:$a, Float32Regs:$b),
+                             "div.full.ftz.f32 \t$dst, $a, $b;",
+                             [(set Float32Regs:$dst,
+                               (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+                   Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                             (ins Float32Regs:$a, f32imm:$b),
+                             "div.full.ftz.f32 \t$dst, $a, $b;",
+                             [(set Float32Regs:$dst,
+                               (fdiv Float32Regs:$a, fpimm:$b))]>,
+                   Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV32rr : NVPTXInst<(outs Float32Regs:$dst),
+                         (ins Float32Regs:$a, Float32Regs:$b),
+                         "div.full.f32 \t$dst, $a, $b;",
+                         [(set Float32Regs:$dst,
+                           (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+               Requires<[do_DIVF32_FULL]>;
+def FDIV32ri : NVPTXInst<(outs Float32Regs:$dst),
+                         (ins Float32Regs:$a, f32imm:$b),
+                         "div.full.f32 \t$dst, $a, $b;",
+                         [(set Float32Regs:$dst,
+                           (fdiv Float32Regs:$a, fpimm:$b))]>,
+               Requires<[do_DIVF32_FULL]>;
+//
+// F32 Accurate reciprocal
+//
+def FDIV321r_prec_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                                  (ins f32imm:$a, Float32Regs:$b),
+                                  "rcp.rn.ftz.f32 \t$dst, $b;",
+                                  [(set Float32Regs:$dst,
+                                    (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+                        Requires<[reqPTX20, doF32FTZ]>;
+def FDIV321r_prec : NVPTXInst<(outs Float32Regs:$dst),
+                              (ins f32imm:$a, Float32Regs:$b),
+                              "rcp.rn.f32 \t$dst, $b;",
+                              [(set Float32Regs:$dst,
+                                (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+                    Requires<[reqPTX20]>;
+//
+// F32 Accurate division
+//
+def FDIV32rr_prec_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                                  (ins Float32Regs:$a, Float32Regs:$b),
+                                  "div.rn.ftz.f32 \t$dst, $a, $b;",
+                                  [(set Float32Regs:$dst,
+                                    (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+                        Requires<[doF32FTZ, reqPTX20]>;
+def FDIV32ri_prec_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                                  (ins Float32Regs:$a, f32imm:$b),
+                                  "div.rn.ftz.f32 \t$dst, $a, $b;",
+                                  [(set Float32Regs:$dst,
+                                    (fdiv Float32Regs:$a, fpimm:$b))]>,
+                        Requires<[doF32FTZ, reqPTX20]>;
+def FDIV32rr_prec : NVPTXInst<(outs Float32Regs:$dst),
+                              (ins Float32Regs:$a, Float32Regs:$b),
+                              "div.rn.f32 \t$dst, $a, $b;",
+                              [(set Float32Regs:$dst,
+                                (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+                    Requires<[reqPTX20]>;
+def FDIV32ri_prec : NVPTXInst<(outs Float32Regs:$dst),
+                              (ins Float32Regs:$a, f32imm:$b),
+                              "div.rn.f32 \t$dst, $a, $b;",
+                              [(set Float32Regs:$dst,
+                                (fdiv Float32Regs:$a, fpimm:$b))]>,
+                    Requires<[reqPTX20]>;
+
+
+multiclass FPCONTRACT32<string OpcStr, Predicate Pred> {
+  def rrr : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float32Regs:$dst, (fadd
+                        (fmul Float32Regs:$a, Float32Regs:$b),
+                        Float32Regs:$c))]>, Requires<[Pred]>;
+  // This is to work around (WAR) a weird bug in Tablegen that does not
+  // automatically generate the following permuted rule rrr2 from the above
+  // rrr. So we explicitly add it here. This happens to FMA32 only.
+  // See the comments at FMAD32 and FMA32 for more information.
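+  // That is, rrr above matches (fadd (fmul a, b), c), while rrr2 below
+  // hand-writes the commuted (fadd c, (fmul a, b)) form.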
+  def rrr2 : NVPTXInst<(outs Float32Regs:$dst),
+                       (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c),
+                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                       [(set Float32Regs:$dst, (fadd Float32Regs:$c,
+                         (fmul Float32Regs:$a, Float32Regs:$b)))]>,
+             Requires<[Pred]>;
+  def rri : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, Float32Regs:$b, f32imm:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float32Regs:$dst, (fadd
+                        (fmul Float32Regs:$a, Float32Regs:$b), fpimm:$c))]>,
+            Requires<[Pred]>;
+  def rir : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, f32imm:$b, Float32Regs:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float32Regs:$dst, (fadd
+                        (fmul Float32Regs:$a, fpimm:$b), Float32Regs:$c))]>,
+            Requires<[Pred]>;
+  def rii : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, f32imm:$b, f32imm:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float32Regs:$dst, (fadd
+                        (fmul Float32Regs:$a, fpimm:$b), fpimm:$c))]>,
+            Requires<[Pred]>;
+}
+
+multiclass FPCONTRACT64<string OpcStr, Predicate Pred> {
+  def rrr : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins Float64Regs:$a, Float64Regs:$b, Float64Regs:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float64Regs:$dst, (fadd
+                        (fmul Float64Regs:$a, Float64Regs:$b),
+                        Float64Regs:$c))]>, Requires<[Pred]>;
+  def rri : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins Float64Regs:$a, Float64Regs:$b, f64imm:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float64Regs:$dst, (fadd (fmul Float64Regs:$a,
+                        Float64Regs:$b), fpimm:$c))]>, Requires<[Pred]>;
+  def rir : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins Float64Regs:$a, f64imm:$b, Float64Regs:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float64Regs:$dst, (fadd
+                        (fmul Float64Regs:$a, fpimm:$b), Float64Regs:$c))]>,
+            Requires<[Pred]>;
+  def rii : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins Float64Regs:$a, f64imm:$b, f64imm:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float64Regs:$dst, (fadd
+                        (fmul Float64Regs:$a, fpimm:$b), fpimm:$c))]>,
+            Requires<[Pred]>;
+}
+
+// Due to an unknown reason (most likely a bug in tablegen), tablegen does not
+// automatically generate the rrr2 rule from
+// the rrr rule (see FPCONTRACT32) for FMA32, though it does for FMAD32.
+// If we reverse the order of the following two lines, then the rrr2 rule will
+// be generated for FMA32, but the rrr rule will not.
+// Therefore, we manually write the rrr2 rule in FPCONTRACT32.
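+// For example (exposition only): FMA32rrr, instantiated from FPCONTRACT32
+// with OpcStr = "fma.rn.f32" below, emits "fma.rn.f32 \t$dst, $a, $b, $c;"
+// for the (fadd (fmul a, b), c) pattern, guarded by the doFMAF32 predicate.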
+defm FMAD32_ftz : FPCONTRACT32<"mad.ftz.f32", doFMADF32_ftz>; +defm FMAD32 : FPCONTRACT32<"mad.f32", doFMADF32>; +defm FMA32_ftz : FPCONTRACT32<"fma.rn.ftz.f32", doFMAF32_ftz>; +defm FMA32 : FPCONTRACT32<"fma.rn.f32", doFMAF32>; +defm FMA64 : FPCONTRACT64<"fma.rn.f64", doFMAF64>; + +// b*c-a => fmad(b, c, -a) +multiclass FPCONTRACT32_SUB_PAT_MAD<NVPTXInst Inst, Predicate Pred> { + def : Pat<(fsub (fmul Float32Regs:$b, Float32Regs:$c), Float32Regs:$a), + (Inst Float32Regs:$b, Float32Regs:$c, (FNEGf32 Float32Regs:$a))>, + Requires<[Pred]>; +} + +// a-b*c => fmad(-b,c, a) +// - legal because a-b*c <=> a+(-b*c) <=> a+(-b)*c +// b*c-a => fmad(b, c, -a) +// - legal because b*c-a <=> b*c+(-a) +multiclass FPCONTRACT32_SUB_PAT<NVPTXInst Inst, Predicate Pred> { + def : Pat<(fsub Float32Regs:$a, (fmul Float32Regs:$b, Float32Regs:$c)), + (Inst (FNEGf32 Float32Regs:$b), Float32Regs:$c, Float32Regs:$a)>, + Requires<[Pred]>; + def : Pat<(fsub (fmul Float32Regs:$b, Float32Regs:$c), Float32Regs:$a), + (Inst Float32Regs:$b, Float32Regs:$c, (FNEGf32 Float32Regs:$a))>, + Requires<[Pred]>; +} + +// a-b*c => fmad(-b,c, a) +// b*c-a => fmad(b, c, -a) +multiclass FPCONTRACT64_SUB_PAT<NVPTXInst Inst, Predicate Pred> { + def : Pat<(fsub Float64Regs:$a, (fmul Float64Regs:$b, Float64Regs:$c)), + (Inst (FNEGf64 Float64Regs:$b), Float64Regs:$c, Float64Regs:$a)>, + Requires<[Pred]>; + + def : Pat<(fsub (fmul Float64Regs:$b, Float64Regs:$c), Float64Regs:$a), + (Inst Float64Regs:$b, Float64Regs:$c, (FNEGf64 Float64Regs:$a))>, + Requires<[Pred]>; +} + +defm FMAF32ext_ftz : FPCONTRACT32_SUB_PAT<FMA32_ftzrrr, doFMAF32AGG_ftz>; +defm FMAF32ext : FPCONTRACT32_SUB_PAT<FMA32rrr, doFMAF32AGG>; +defm FMADF32ext_ftz : FPCONTRACT32_SUB_PAT_MAD<FMAD32_ftzrrr, doFMADF32_ftz>; +defm FMADF32ext : FPCONTRACT32_SUB_PAT_MAD<FMAD32rrr, doFMADF32>; +defm FMAF64ext : FPCONTRACT64_SUB_PAT<FMA64rrr, doFMAF64AGG>; + +def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), + "sin.approx.f32 \t$dst, $src;", + [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>; +def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), + "cos.approx.f32 \t$dst, $src;", + [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>; + +//----------------------------------- +// Logical Arithmetic +//----------------------------------- + +multiclass LOG_FORMAT<string OpcStr, SDNode OpNode> { + def b1rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b), + !strconcat(OpcStr, ".pred \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>; + def b1ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b), + !strconcat(OpcStr, ".pred \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>; + def b8rr: NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b), + !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), + [(set Int8Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>; + def b8ri: NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b), + !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), + [(set Int8Regs:$dst, (OpNode Int8Regs:$a, imm:$b))]>; + def b16rr: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), + !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, + Int16Regs:$b))]>; + def b16ri: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b), + !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>; + def b32rr: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), + 
!strconcat(OpcStr, ".b32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, + Int32Regs:$b))]>; + def b32ri: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>; + def b64rr: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), + !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, + Int64Regs:$b))]>; + def b64ri: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), + !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>; +} + +defm OR : LOG_FORMAT<"or", or>; +defm AND : LOG_FORMAT<"and", and>; +defm XOR : LOG_FORMAT<"xor", xor>; + +def NOT1: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src), + "not.pred \t$dst, $src;", + [(set Int1Regs:$dst, (not Int1Regs:$src))]>; +def NOT8: NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$src), + "not.b16 \t$dst, $src;", + [(set Int8Regs:$dst, (not Int8Regs:$src))]>; +def NOT16: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), + "not.b16 \t$dst, $src;", + [(set Int16Regs:$dst, (not Int16Regs:$src))]>; +def NOT32: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), + "not.b32 \t$dst, $src;", + [(set Int32Regs:$dst, (not Int32Regs:$src))]>; +def NOT64: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), + "not.b64 \t$dst, $src;", + [(set Int64Regs:$dst, (not Int64Regs:$src))]>; + +// For shifts, the second src operand must be 32-bit value +multiclass LSHIFT_FORMAT<string OpcStr, SDNode OpNode> { + def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, + Int32Regs:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, + Int32Regs:$b))]>; + def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, + (i32 imm:$b)))]>; + def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, + Int32Regs:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, + Int32Regs:$b))]>; + def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, + (i32 imm:$b)))]>; + def i32ii : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode (i32 imm:$a), + (i32 imm:$b)))]>; + def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, + Int32Regs:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, + Int32Regs:$b))]>; + def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, + (i32 imm:$b)))]>; + def i8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int32Regs:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int8Regs:$dst, (OpNode Int8Regs:$a, + Int32Regs:$b))]>; + def i8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i32imm:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int8Regs:$dst, (OpNode Int8Regs:$a, + (i32 imm:$b)))]>; +} + +defm SHL : LSHIFT_FORMAT<"shl.b", shl>; + +// For shifts, the second src operand must be 32-bit value +// Need to add cvt for the 8-bits. 
+multiclass RSHIFT_FORMAT<string OpcStr, SDNode OpNode, string CVTStr> { + def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, + Int32Regs:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, + Int32Regs:$b))]>; + def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, + (i32 imm:$b)))]>; + def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, + Int32Regs:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, + Int32Regs:$b))]>; + def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, + (i32 imm:$b)))]>; + def i32ii : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode (i32 imm:$a), + (i32 imm:$b)))]>; + def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, + Int32Regs:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, + Int32Regs:$b))]>; + def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, + (i32 imm:$b)))]>; + def i8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int32Regs:$b), + !strconcat(CVTStr, !strconcat(" \t$dst, $a;\n\t", + !strconcat(OpcStr, "16 \t$dst, $dst, $b;"))), + [(set Int8Regs:$dst, (OpNode Int8Regs:$a, + Int32Regs:$b))]>; + def i8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i32imm:$b), + !strconcat(CVTStr, !strconcat(" \t$dst, $a;\n\t", + !strconcat(OpcStr, "16 \t$dst, $dst, $b;"))), + [(set Int8Regs:$dst, (OpNode Int8Regs:$a, + (i32 imm:$b)))]>; +} + +defm SRA : RSHIFT_FORMAT<"shr.s", sra, "cvt.s16.s8">; +defm SRL : RSHIFT_FORMAT<"shr.u", srl, "cvt.u16.u8">; + +// 32bit +def ROT32imm_sw : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2), + !strconcat("{{\n\t", + !strconcat(".reg .b32 %lhs;\n\t", + !strconcat(".reg .b32 %rhs;\n\t", + !strconcat("shl.b32 \t%lhs, $src, $amt1;\n\t", + !strconcat("shr.b32 \t%rhs, $src, $amt2;\n\t", + !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t", + !strconcat("}}", ""))))))), + []>; + +def SUB_FRM_32 : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(32-N->getZExtValue(), MVT::i32); +}]>; + +def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)), + (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>; +def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)), + (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>; + +def ROTL32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, + Int32Regs:$amt), + !strconcat("{{\n\t", + !strconcat(".reg .b32 %lhs;\n\t", + !strconcat(".reg .b32 %rhs;\n\t", + !strconcat(".reg .b32 %amt2;\n\t", + !strconcat("shl.b32 \t%lhs, $src, $amt;\n\t", + !strconcat("sub.s32 \t%amt2, 32, $amt;\n\t", + !strconcat("shr.b32 \t%rhs, $src, %amt2;\n\t", + !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t", + !strconcat("}}", ""))))))))), + [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>; + +def ROTR32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, + Int32Regs:$amt), + !strconcat("{{\n\t", + !strconcat(".reg .b32 %lhs;\n\t", + !strconcat(".reg .b32 %rhs;\n\t", + !strconcat(".reg .b32 %amt2;\n\t", + !strconcat("shr.b32 \t%lhs, $src, $amt;\n\t", + !strconcat("sub.s32 
\t%amt2, 32, $amt;\n\t", + !strconcat("shl.b32 \t%rhs, $src, %amt2;\n\t", + !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t", + !strconcat("}}", ""))))))))), + [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>; + +// 64bit +def ROT64imm_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, + i32imm:$amt1, i32imm:$amt2), + !strconcat("{{\n\t", + !strconcat(".reg .b64 %lhs;\n\t", + !strconcat(".reg .b64 %rhs;\n\t", + !strconcat("shl.b64 \t%lhs, $src, $amt1;\n\t", + !strconcat("shr.b64 \t%rhs, $src, $amt2;\n\t", + !strconcat("add.u64 \t$dst, %lhs, %rhs;\n\t", + !strconcat("}}", ""))))))), + []>; + +def SUB_FRM_64 : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(64-N->getZExtValue(), MVT::i32); +}]>; + +def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)), + (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>; +def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)), + (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>; + +def ROTL64reg_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, + Int32Regs:$amt), + !strconcat("{{\n\t", + !strconcat(".reg .b64 %lhs;\n\t", + !strconcat(".reg .b64 %rhs;\n\t", + !strconcat(".reg .u32 %amt2;\n\t", + !strconcat("shl.b64 \t%lhs, $src, $amt;\n\t", + !strconcat("sub.u32 \t%amt2, 64, $amt;\n\t", + !strconcat("shr.b64 \t%rhs, $src, %amt2;\n\t", + !strconcat("add.u64 \t$dst, %lhs, %rhs;\n\t", + !strconcat("}}", ""))))))))), + [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>; + +def ROTR64reg_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, + Int32Regs:$amt), + !strconcat("{{\n\t", + !strconcat(".reg .b64 %lhs;\n\t", + !strconcat(".reg .b64 %rhs;\n\t", + !strconcat(".reg .u32 %amt2;\n\t", + !strconcat("shr.b64 \t%lhs, $src, $amt;\n\t", + !strconcat("sub.u32 \t%amt2, 64, $amt;\n\t", + !strconcat("shl.b64 \t%rhs, $src, %amt2;\n\t", + !strconcat("add.u64 \t$dst, %lhs, %rhs;\n\t", + !strconcat("}}", ""))))))))), + [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>; + + +//----------------------------------- +// Data Movement (Load / Store, Move) +//----------------------------------- + +def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex], + [SDNPWantRoot]>; +def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex], + [SDNPWantRoot]>; + +def MEMri : Operand<i32> { + let PrintMethod = "printMemOperand"; + let MIOperandInfo = (ops Int32Regs, i32imm); +} +def MEMri64 : Operand<i64> { + let PrintMethod = "printMemOperand"; + let MIOperandInfo = (ops Int64Regs, i64imm); +} + +def imem : Operand<iPTR> { + let PrintMethod = "printOperand"; +} + +def imemAny : Operand<iPTRAny> { + let PrintMethod = "printOperand"; +} + +def LdStCode : Operand<i32> { + let PrintMethod = "printLdStCode"; +} + +def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; +def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>; + +def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a), + "mov.u32 \t$dst, $a;", + [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>; + +def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a), + "mov.u64 \t$dst, $a;", + [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>; + +// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp +let IsSimpleMove=1 in { +def IMOV1rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss), + "mov.pred \t$dst, $sss;", []>; +def IMOV8rr: NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$sss), + "mov.u16 \t$dst, $sss;", []>; +def IMOV16rr: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss), + "mov.u16 \t$dst, $sss;", 
[]>;
+def IMOV32rr: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
+                        "mov.u32 \t$dst, $sss;", []>;
+def IMOV64rr: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
+                        "mov.u64 \t$dst, $sss;", []>;
+
+def FMOV32rr: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+                        "mov.f32 \t$dst, $src;", []>;
+def FMOV64rr: NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
+                        "mov.f64 \t$dst, $src;", []>;
+}
+def IMOV1ri: NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
+                       "mov.pred \t$dst, $src;",
+                       [(set Int1Regs:$dst, imm:$src)]>;
+def IMOV8ri: NVPTXInst<(outs Int8Regs:$dst), (ins i8imm:$src),
+                       "mov.u16 \t$dst, $src;",
+                       [(set Int8Regs:$dst, imm:$src)]>;
+def IMOV16ri: NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
+                        "mov.u16 \t$dst, $src;",
+                        [(set Int16Regs:$dst, imm:$src)]>;
+def IMOV32ri: NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
+                        "mov.u32 \t$dst, $src;",
+                        [(set Int32Regs:$dst, imm:$src)]>;
+def IMOV64i: NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
+                       "mov.u64 \t$dst, $src;",
+                       [(set Int64Regs:$dst, imm:$src)]>;
+
+def FMOV32ri: NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
+                        "mov.f32 \t$dst, $src;",
+                        [(set Float32Regs:$dst, fpimm:$src)]>;
+def FMOV64ri: NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
+                        "mov.f64 \t$dst, $src;",
+                        [(set Float64Regs:$dst, fpimm:$src)]>;
+
+def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;
+
+//---- Copy Frame Index ----
+def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
+                          "add.u32 \t$dst, ${addr:add};",
+                          [(set Int32Regs:$dst, ADDRri:$addr)]>;
+def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
+                            "add.u64 \t$dst, ${addr:add};",
+                            [(set Int64Regs:$dst, ADDRri64:$addr)]>;
+
+//-----------------------------------
+// Comparison and Selection
+//-----------------------------------
+
+// Generate string block like
+// {
+//   .reg .pred p;
+//   setp.gt.s16 p, %a, %b;
+//   selp.s16 %dst, -1, 0, p;
+// }
+// when OpcStr=setp.gt.s sz1=16 sz2=16 d=%dst a=%a b=%b
+class Set_Str<string OpcStr, string sz1, string sz2, string d, string a,
+              string b> {
+  string t1 = "{{\n\t.reg .pred p;\n\t";
+  string t2 = !strconcat(t1, OpcStr);
+  string t3 = !strconcat(t2, sz1);
+  string t4 = !strconcat(t3, " \tp, ");
+  string t5 = !strconcat(t4, a);
+  string t6 = !strconcat(t5, ", ");
+  string t7 = !strconcat(t6, b);
+  string t8 = !strconcat(t7, ";\n\tselp.s");
+  string t9 = !strconcat(t8, sz2);
+  string t10 = !strconcat(t9, " \t");
+  string t11 = !strconcat(t10, d);
+  string s = !strconcat(t11, ", -1, 0, p;\n\t}}");
+}
+
+// Generate string block like
+// {
+//   .reg .pred p;
+//   .reg .s16 %temp1;
+//   .reg .s16 %temp2;
+//   cvt.s16.s8 %temp1, %a;
+//   cvt.s16.s8 %temp2, %b;
+//   setp.gt.s16 p, %temp1, %temp2;
+//   selp.s16 %dst, -1, 0, p;
+// }
+// when OpcStr=setp.gt.s d=%dst a=%a b=%b type=s16 cvt=cvt.s16.s8
+class Set_Stri8<string OpcStr, string d, string a, string b, string type,
+                string cvt> {
+  string t1 = "{{\n\t.reg .pred p;\n\t";
+  string t2 = !strconcat(t1, ".reg .");
+  string t3 = !strconcat(t2, type);
+  string t4 = !strconcat(t3, " %temp1;\n\t");
+  string t5 = !strconcat(t4, ".reg .");
+  string t6 = !strconcat(t5, type);
+  string t7 = !strconcat(t6, " %temp2;\n\t");
+  string t8 = !strconcat(t7, cvt);
+  string t9 = !strconcat(t8, " \t%temp1, ");
+  string t10 = !strconcat(t9, a);
+  string t11 = !strconcat(t10, ";\n\t");
+  string t12 = !strconcat(t11, cvt);
+  string t13 = !strconcat(t12, " \t%temp2, ");
+  string t14 = !strconcat(t13, b);
+  string t15 =
!strconcat(t14, ";\n\t"); + string t16 = !strconcat(t15, OpcStr); + string t17 = !strconcat(t16, "16"); + string t18 = !strconcat(t17, " \tp, %temp1, %temp2;\n\t"); + string t19 = !strconcat(t18, "selp.s16 \t"); + string t20 = !strconcat(t19, d); + string s = !strconcat(t20, ", -1, 0, p;\n\t}}"); +} + +multiclass ISET_FORMAT<string OpcStr, string OpcStr_u32, PatFrag OpNode, + string TypeStr, string CVTStr> { + def i8rr_toi8: NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b), + Set_Stri8<OpcStr, "$dst", "$a", "$b", TypeStr, CVTStr>.s, + []>; + def i16rr_toi16: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, + Int16Regs:$b), + Set_Str<OpcStr, "16", "16", "$dst", "$a", "$b">.s, + []>; + def i32rr_toi32: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, + Int32Regs:$b), + Set_Str<OpcStr, "32", "32", "$dst", "$a", "$b">.s, + []>; + def i64rr_toi64: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, + Int64Regs:$b), + Set_Str<OpcStr, "64", "64", "$dst", "$a", "$b">.s, + []>; + + def i8rr_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b), + Handle_i8rr<OpcStr, TypeStr, CVTStr>.s, + [(set Int1Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>; + def i8ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int8Regs:$a, i8imm:$b), + Handle_i8ri<OpcStr, TypeStr, CVTStr>.s, + [(set Int1Regs:$dst, (OpNode Int8Regs:$a, imm:$b))]>; + def i8ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins i8imm:$a, Int8Regs:$b), + Handle_i8ir<OpcStr, TypeStr, CVTStr>.s, + [(set Int1Regs:$dst, (OpNode imm:$a, Int8Regs:$b))]>; + def i16rr_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>; + def i16ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int16Regs:$a, i16imm:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>; + def i16ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins i16imm:$a, Int16Regs:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode imm:$a, Int16Regs:$b))]>; + def i32rr_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>; + def i32ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>; + def i32ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins i32imm:$a, Int32Regs:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode imm:$a, Int32Regs:$b))]>; + def i64rr_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>; + def i64ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int64Regs:$a, i64imm:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>; + def i64ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins i64imm:$a, Int64Regs:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode imm:$a, Int64Regs:$b))]>; + + def i8rr_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b), + Handle_i8rr<OpcStr_u32, TypeStr, CVTStr>.s, + [(set Int32Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>; + def i8ri_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int8Regs:$a, i8imm:$b), + Handle_i8ri<OpcStr_u32, TypeStr, CVTStr>.s, + [(set Int32Regs:$dst, (OpNode Int8Regs:$a, imm:$b))]>; + def 
i8ir_u32: NVPTXInst<(outs Int32Regs:$dst), (ins i8imm:$a, Int8Regs:$b), + Handle_i8ir<OpcStr_u32, TypeStr, CVTStr>.s, + [(set Int32Regs:$dst, (OpNode imm:$a, Int8Regs:$b))]>; + def i16rr_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, + Int16Regs:$b), + !strconcat(OpcStr_u32, "16 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>; + def i16ri_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b), + !strconcat(OpcStr_u32, "16 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>; + def i16ir_u32: NVPTXInst<(outs Int32Regs:$dst), (ins i16imm:$a, Int16Regs:$b), + !strconcat(OpcStr_u32, "16 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode imm:$a, Int16Regs:$b))]>; + def i32rr_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, + Int32Regs:$b), + !strconcat(OpcStr_u32, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>; + def i32ri_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + !strconcat(OpcStr_u32, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>; + def i32ir_u32: NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, Int32Regs:$b), + !strconcat(OpcStr_u32, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode imm:$a, Int32Regs:$b))]>; + def i64rr_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$a, + Int64Regs:$b), + !strconcat(OpcStr_u32, "64 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>; + def i64ri_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$a, i64imm:$b), + !strconcat(OpcStr_u32, "64 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>; + def i64ir_u32: NVPTXInst<(outs Int32Regs:$dst), (ins i64imm:$a, Int64Regs:$b), + !strconcat(OpcStr_u32, "64 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode imm:$a, Int64Regs:$b))]>; +} + +multiclass FSET_FORMAT<string OpcStr, string OpcStr_u32, PatFrag OpNode> { + def f32rr_toi32_ftz: NVPTXInst<(outs Int32Regs:$dst), (ins Float32Regs:$a, + Float32Regs:$b), + Set_Str<OpcStr, "ftz.f32", "32", "$dst", "$a", "$b">.s, + []>, Requires<[doF32FTZ]>; + def f32rr_toi32: NVPTXInst<(outs Int32Regs:$dst), (ins Float32Regs:$a, + Float32Regs:$b), + Set_Str<OpcStr, "f32", "32", "$dst", "$a", "$b">.s, + []>; + def f64rr_toi64: NVPTXInst<(outs Int64Regs:$dst), (ins Float64Regs:$a, + Float64Regs:$b), + Set_Str<OpcStr, "f64", "64", "$dst", "$a", "$b">.s, + []>; + def f64rr_toi32: NVPTXInst<(outs Int32Regs:$dst), (ins Float64Regs:$a, + Float64Regs:$b), + Set_Str<OpcStr, "f64", "32", "$dst", "$a", "$b">.s, + []>; + + def f32rr_p_ftz: NVPTXInst<(outs Int1Regs:$dst), (ins Float32Regs:$a + , Float32Regs:$b), + !strconcat(OpcStr, "ftz.f32 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]> + , Requires<[doF32FTZ]>; + def f32rr_p: NVPTXInst<(outs Int1Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + !strconcat(OpcStr, "f32 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>; + def f32ri_p_ftz: NVPTXInst<(outs Int1Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + !strconcat(OpcStr, "ftz.f32 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, + Requires<[doF32FTZ]>; + def f32ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Float32Regs:$a, f32imm:$b), + !strconcat(OpcStr, "f32 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>; + def f32ir_p_ftz: NVPTXInst<(outs Int1Regs:$dst), + (ins f32imm:$a, Float32Regs:$b), + 
!strconcat(OpcStr, "ftz.f32 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode fpimm:$a, Float32Regs:$b))]>, + Requires<[doF32FTZ]>; + def f32ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins f32imm:$a, Float32Regs:$b), + !strconcat(OpcStr, "f32 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode fpimm:$a, Float32Regs:$b))]>; + def f64rr_p: NVPTXInst<(outs Int1Regs:$dst), + (ins Float64Regs:$a, Float64Regs:$b), + !strconcat(OpcStr, "f64 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>; + def f64ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Float64Regs:$a, f64imm:$b), + !strconcat(OpcStr, "f64 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>; + def f64ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins f64imm:$a, Float64Regs:$b), + !strconcat(OpcStr, "f64 \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode fpimm:$a, Float64Regs:$b))]>; + + def f32rr_u32_ftz: NVPTXInst<(outs Int32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + !strconcat(OpcStr_u32, "ftz.f32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>; + def f32rr_u32: NVPTXInst<(outs Int32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + !strconcat(OpcStr_u32, "f32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>; + def f32ri_u32_ftz: NVPTXInst<(outs Int32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + !strconcat(OpcStr_u32, "ftz.f32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>; + def f32ri_u32: NVPTXInst<(outs Int32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + !strconcat(OpcStr_u32, "f32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>; + def f32ir_u32_ftz: NVPTXInst<(outs Int32Regs:$dst), + (ins f32imm:$a, Float32Regs:$b), + !strconcat(OpcStr_u32, "ftz.f32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode fpimm:$a, Float32Regs:$b))]>; + def f32ir_u32: NVPTXInst<(outs Int32Regs:$dst), + (ins f32imm:$a, Float32Regs:$b), + !strconcat(OpcStr_u32, "f32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode fpimm:$a, Float32Regs:$b))]>; + def f64rr_u32: NVPTXInst<(outs Int32Regs:$dst), + (ins Float64Regs:$a, Float64Regs:$b), + !strconcat(OpcStr_u32, "f64 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>; + def f64ri_u32: NVPTXInst<(outs Int32Regs:$dst), + (ins Float64Regs:$a, f64imm:$b), + !strconcat(OpcStr_u32, "f64 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>; + def f64ir_u32: NVPTXInst<(outs Int32Regs:$dst), + (ins f64imm:$a, Float64Regs:$b), + !strconcat(OpcStr_u32, "f64 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode fpimm:$a, Float64Regs:$b))]>; +} + +defm ISetSGT +: ISET_FORMAT<"setp.gt.s", "set.gt.u32.s", setgt, "s16", "cvt.s16.s8">; +defm ISetUGT +: ISET_FORMAT<"setp.gt.u", "set.gt.u32.u", setugt, "u16", "cvt.u16.u8">; +defm ISetSLT +: ISET_FORMAT<"setp.lt.s", "set.lt.u32.s", setlt, "s16", "cvt.s16.s8">; +defm ISetULT +: ISET_FORMAT<"setp.lt.u", "set.lt.u32.u", setult, "u16", "cvt.u16.u8">; +defm ISetSGE +: ISET_FORMAT<"setp.ge.s", "set.ge.u32.s", setge, "s16", "cvt.s16.s8">; +defm ISetUGE +: ISET_FORMAT<"setp.ge.u", "set.ge.u32.u", setuge, "u16", "cvt.u16.u8">; +defm ISetSLE +: ISET_FORMAT<"setp.le.s", "set.le.u32.s", setle, "s16", "cvt.s16.s8">; +defm ISetULE +: ISET_FORMAT<"setp.le.u", "set.le.u32.u", setule, "u16", "cvt.u16.u8">; +defm ISetSEQ +: ISET_FORMAT<"setp.eq.s", "set.eq.u32.s", seteq, "s16", "cvt.s16.s8">; +defm ISetUEQ +: ISET_FORMAT<"setp.eq.u", 
"set.eq.u32.u", setueq, "u16", "cvt.u16.u8">; +defm ISetSNE +: ISET_FORMAT<"setp.ne.s", "set.ne.u32.s", setne, "s16", "cvt.s16.s8">; +defm ISetUNE +: ISET_FORMAT<"setp.ne.u", "set.ne.u32.u", setune, "u16", "cvt.u16.u8">; + +def ISetSNEi1rr_p : NVPTXInst<(outs Int1Regs:$dst), + (ins Int1Regs:$a, Int1Regs:$b), + "xor.pred \t$dst, $a, $b;", + [(set Int1Regs:$dst, (setne Int1Regs:$a, Int1Regs:$b))]>; +def ISetUNEi1rr_p : NVPTXInst<(outs Int1Regs:$dst), + (ins Int1Regs:$a, Int1Regs:$b), + "xor.pred \t$dst, $a, $b;", + [(set Int1Regs:$dst, (setune Int1Regs:$a, Int1Regs:$b))]>; +def ISetSEQi1rr_p : NVPTXInst<(outs Int1Regs:$dst), + (ins Int1Regs:$a, Int1Regs:$b), + !strconcat("{{\n\t", + !strconcat(".reg .pred temp;\n\t", + !strconcat("xor.pred \ttemp, $a, $b;\n\t", + !strconcat("not.pred \t$dst, temp;\n\t}}","")))), + [(set Int1Regs:$dst, (seteq Int1Regs:$a, Int1Regs:$b))]>; +def ISetUEQi1rr_p : NVPTXInst<(outs Int1Regs:$dst), + (ins Int1Regs:$a, Int1Regs:$b), + !strconcat("{{\n\t", + !strconcat(".reg .pred temp;\n\t", + !strconcat("xor.pred \ttemp, $a, $b;\n\t", + !strconcat("not.pred \t$dst, temp;\n\t}}","")))), + [(set Int1Regs:$dst, (setueq Int1Regs:$a, Int1Regs:$b))]>; + +// Compare 2 i1's and produce a u32 +def ISETSNEi1rr_u32 : NVPTXInst<(outs Int32Regs:$dst), + (ins Int1Regs:$a, Int1Regs:$b), + !strconcat("{{\n\t", + !strconcat(".reg .pred temp;\n\t", + !strconcat("xor.pred \ttemp, $a, $b;\n\t", + !strconcat("selp.u32 \t$dst, -1, 0, temp;", "\n\t}}")))), + [(set Int32Regs:$dst, (setne Int1Regs:$a, Int1Regs:$b))]>; +def ISETSEQi1rr_u32 : NVPTXInst<(outs Int32Regs:$dst), + (ins Int1Regs:$a, Int1Regs:$b), + !strconcat("{{\n\t", + !strconcat(".reg .pred temp;\n\t", + !strconcat("xor.pred \ttemp, $a, $b;\n\t", + !strconcat("selp.u32 \t$dst, 0, -1, temp;", "\n\t}}")))), + [(set Int32Regs:$dst, (seteq Int1Regs:$a, Int1Regs:$b))]>; + +defm FSetGT : FSET_FORMAT<"setp.gt.", "set.gt.u32.", setogt>; +defm FSetLT : FSET_FORMAT<"setp.lt.", "set.lt.u32.", setolt>; +defm FSetGE : FSET_FORMAT<"setp.ge.", "set.ge.u32.", setoge>; +defm FSetLE : FSET_FORMAT<"setp.le.", "set.le.u32.", setole>; +defm FSetEQ : FSET_FORMAT<"setp.eq.", "set.eq.u32.", setoeq>; +defm FSetNE : FSET_FORMAT<"setp.ne.", "set.ne.u32.", setone>; + +defm FSetUGT : FSET_FORMAT<"setp.gtu.", "set.gtu.u32.", setugt>; +defm FSetULT : FSET_FORMAT<"setp.ltu.", "set.ltu.u32.",setult>; +defm FSetUGE : FSET_FORMAT<"setp.geu.", "set.geu.u32.",setuge>; +defm FSetULE : FSET_FORMAT<"setp.leu.", "set.leu.u32.",setule>; +defm FSetUEQ : FSET_FORMAT<"setp.equ.", "set.equ.u32.",setueq>; +defm FSetUNE : FSET_FORMAT<"setp.neu.", "set.neu.u32.",setune>; + +defm FSetNUM : FSET_FORMAT<"setp.num.", "set.num.u32.",seto>; +defm FSetNAN : FSET_FORMAT<"setp.nan.", "set.nan.u32.",setuo>; + +def SELECTi1rr : Pat<(i1 (select Int1Regs:$p, Int1Regs:$a, Int1Regs:$b)), + (ORb1rr (ANDb1rr Int1Regs:$p, Int1Regs:$a), + (ANDb1rr (NOT1 Int1Regs:$p), Int1Regs:$b))>; +def SELECTi8rr : NVPTXInst<(outs Int8Regs:$dst), + (ins Int8Regs:$a, Int8Regs:$b, Int1Regs:$p), + "selp.b16 \t$dst, $a, $b, $p;", + [(set Int8Regs:$dst, (select Int1Regs:$p, Int8Regs:$a, Int8Regs:$b))]>; +def SELECTi8ri : NVPTXInst<(outs Int8Regs:$dst), + (ins Int8Regs:$a, i8imm:$b, Int1Regs:$p), + "selp.b16 \t$dst, $a, $b, $p;", + [(set Int8Regs:$dst, (select Int1Regs:$p, Int8Regs:$a, imm:$b))]>; +def SELECTi8ir : NVPTXInst<(outs Int8Regs:$dst), + (ins i8imm:$a, Int8Regs:$b, Int1Regs:$p), + "selp.b16 \t$dst, $a, $b, $p;", + [(set Int8Regs:$dst, (select Int1Regs:$p, imm:$a, Int8Regs:$b))]>; +def SELECTi8ii : 
NVPTXInst<(outs Int8Regs:$dst), + (ins i8imm:$a, i8imm:$b, Int1Regs:$p), + "selp.b16 \t$dst, $a, $b, $p;", + [(set Int8Regs:$dst, (select Int1Regs:$p, imm:$a, imm:$b))]>; + +def SELECTi16rr : NVPTXInst<(outs Int16Regs:$dst), + (ins Int16Regs:$a, Int16Regs:$b, Int1Regs:$p), + "selp.b16 \t$dst, $a, $b, $p;", + [(set Int16Regs:$dst, (select Int1Regs:$p, Int16Regs:$a, Int16Regs:$b))]>; +def SELECTi16ri : NVPTXInst<(outs Int16Regs:$dst), + (ins Int16Regs:$a, i16imm:$b, Int1Regs:$p), + "selp.b16 \t$dst, $a, $b, $p;", + [(set Int16Regs:$dst, (select Int1Regs:$p, Int16Regs:$a, imm:$b))]>; +def SELECTi16ir : NVPTXInst<(outs Int16Regs:$dst), + (ins i16imm:$a, Int16Regs:$b, Int1Regs:$p), + "selp.b16 \t$dst, $a, $b, $p;", + [(set Int16Regs:$dst, (select Int1Regs:$p, imm:$a, Int16Regs:$b))]>; +def SELECTi16ii : NVPTXInst<(outs Int16Regs:$dst), + (ins i16imm:$a, i16imm:$b, Int1Regs:$p), + "selp.b16 \t$dst, $a, $b, $p;", + [(set Int16Regs:$dst, (select Int1Regs:$p, imm:$a, imm:$b))]>; + +def SELECTi32rr : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$a, Int32Regs:$b, Int1Regs:$p), + "selp.b32 \t$dst, $a, $b, $p;", + [(set Int32Regs:$dst, (select Int1Regs:$p, Int32Regs:$a, Int32Regs:$b))]>; +def SELECTi32ri : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$a, i32imm:$b, Int1Regs:$p), + "selp.b32 \t$dst, $a, $b, $p;", + [(set Int32Regs:$dst, (select Int1Regs:$p, Int32Regs:$a, imm:$b))]>; +def SELECTi32ir : NVPTXInst<(outs Int32Regs:$dst), + (ins i32imm:$a, Int32Regs:$b, Int1Regs:$p), + "selp.b32 \t$dst, $a, $b, $p;", + [(set Int32Regs:$dst, (select Int1Regs:$p, imm:$a, Int32Regs:$b))]>; +def SELECTi32ii : NVPTXInst<(outs Int32Regs:$dst), + (ins i32imm:$a, i32imm:$b, Int1Regs:$p), + "selp.b32 \t$dst, $a, $b, $p;", + [(set Int32Regs:$dst, (select Int1Regs:$p, imm:$a, imm:$b))]>; + +def SELECTi64rr : NVPTXInst<(outs Int64Regs:$dst), + (ins Int64Regs:$a, Int64Regs:$b, Int1Regs:$p), + "selp.b64 \t$dst, $a, $b, $p;", + [(set Int64Regs:$dst, (select Int1Regs:$p, Int64Regs:$a, Int64Regs:$b))]>; +def SELECTi64ri : NVPTXInst<(outs Int64Regs:$dst), + (ins Int64Regs:$a, i64imm:$b, Int1Regs:$p), + "selp.b64 \t$dst, $a, $b, $p;", + [(set Int64Regs:$dst, (select Int1Regs:$p, Int64Regs:$a, imm:$b))]>; +def SELECTi64ir : NVPTXInst<(outs Int64Regs:$dst), + (ins i64imm:$a, Int64Regs:$b, Int1Regs:$p), + "selp.b64 \t$dst, $a, $b, $p;", + [(set Int64Regs:$dst, (select Int1Regs:$p, imm:$a, Int64Regs:$b))]>; +def SELECTi64ii : NVPTXInst<(outs Int64Regs:$dst), + (ins i64imm:$a, i64imm:$b, Int1Regs:$p), + "selp.b64 \t$dst, $a, $b, $p;", + [(set Int64Regs:$dst, (select Int1Regs:$p, imm:$a, imm:$b))]>; + +def SELECTf32rr : NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b, Int1Regs:$p), + "selp.f32 \t$dst, $a, $b, $p;", + [(set Float32Regs:$dst, + (select Int1Regs:$p, Float32Regs:$a, Float32Regs:$b))]>; +def SELECTf32ri : NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b, Int1Regs:$p), + "selp.f32 \t$dst, $a, $b, $p;", + [(set Float32Regs:$dst, (select Int1Regs:$p, Float32Regs:$a, fpimm:$b))]>; +def SELECTf32ir : NVPTXInst<(outs Float32Regs:$dst), + (ins f32imm:$a, Float32Regs:$b, Int1Regs:$p), + "selp.f32 \t$dst, $a, $b, $p;", + [(set Float32Regs:$dst, (select Int1Regs:$p, fpimm:$a, Float32Regs:$b))]>; +def SELECTf32ii : NVPTXInst<(outs Float32Regs:$dst), + (ins f32imm:$a, f32imm:$b, Int1Regs:$p), + "selp.f32 \t$dst, $a, $b, $p;", + [(set Float32Regs:$dst, (select Int1Regs:$p, fpimm:$a, fpimm:$b))]>; + +def SELECTf64rr : NVPTXInst<(outs Float64Regs:$dst), + (ins Float64Regs:$a, 
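// Illustration (hypothetical registers): these SELECT patterns lower a
// generic 'select' to a single predicated move, e.g. 'c = p ? a : b' on i32
// becomes
//   selp.b32 \t%r3, %r1, %r2, %p1;
// The i8 forms reuse selp.b16 because PTX models i8 values in 16-bit
// registers and provides no selp.b8.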
Float64Regs:$b, Int1Regs:$p), + "selp.f64 \t$dst, $a, $b, $p;", + [(set Float64Regs:$dst, + (select Int1Regs:$p, Float64Regs:$a, Float64Regs:$b))]>; +def SELECTf64ri : NVPTXInst<(outs Float64Regs:$dst), + (ins Float64Regs:$a, f64imm:$b, Int1Regs:$p), + "selp.f64 \t$dst, $a, $b, $p;", + [(set Float64Regs:$dst, (select Int1Regs:$p, Float64Regs:$a, fpimm:$b))]>; +def SELECTf64ir : NVPTXInst<(outs Float64Regs:$dst), + (ins f64imm:$a, Float64Regs:$b, Int1Regs:$p), + "selp.f64 \t$dst, $a, $b, $p;", + [(set Float64Regs:$dst, (select Int1Regs:$p, fpimm:$a, Float64Regs:$b))]>; +def SELECTf64ii : NVPTXInst<(outs Float64Regs:$dst), + (ins f64imm:$a, f64imm:$b, Int1Regs:$p), + "selp.f64 \t $dst, $a, $b, $p;", + [(set Float64Regs:$dst, (select Int1Regs:$p, fpimm:$a, fpimm:$b))]>; + +//def ld_param : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad, +// [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +def SDTDeclareParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, + SDTCisInt<2>]>; +def SDTDeclareScalarParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, + SDTCisInt<1>, SDTCisInt<2>]>; +def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>; +def SDTPrintCallProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def SDTPrintCallUniProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>; +def SDTStoreParam32Profile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>; +def SDTCallArgProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>; +def SDTCallArgMarkProfile : SDTypeProfile<0, 0, []>; +def SDTCallVoidProfile : SDTypeProfile<0, 1, []>; +def SDTCallValProfile : SDTypeProfile<1, 0, []>; +def SDTMoveParamProfile : SDTypeProfile<1, 1, []>; +def SDTMoveRetvalProfile : SDTypeProfile<0, 1, []>; +def SDTStoreRetvalProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>; +def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>; + +def DeclareParam : SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def DeclareScalarParam : SDNode<"NVPTXISD::DeclareScalarParam", + SDTDeclareScalarParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def DeclareRetParam : SDNode<"NVPTXISD::DeclareRetParam", + SDTDeclareParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def DeclareRet : SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def LoadParam : SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile, + [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; +def PrintCall : SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def PrintCallUni : SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def StoreParam : SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def StoreParamU32 : SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def StoreParamS32 : SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def MoveToParam : SDNode<"NVPTXISD::MoveToParam", SDTStoreParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def CallArgBegin : SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; 
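// Sketch (editorial; symbol, offsets and register names are hypothetical) of
// the PTX call sequence these nodes print once the printer instructions below
// are glued together in order:
//   .param .b32 param0;                  // DeclareScalarParam
//   st.param.b32 \t[param0+0], %r1;      // StoreParam
//   .param .b32 retval0;                 // DeclareRet
//   call.uni (retval0), foo, (param0);   // PrintCallUni + CallVoid + CallArg*
//   ld.param.b32 \t%r2, [retval0+0];     // LoadParam
// The SDNPOutGlue/SDNPInGlue flags keep the fragments contiguous in emission
// order.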
+def CallArg : SDNode<"NVPTXISD::CallArg", SDTCallArgProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def LastCallArg : SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def CallArgEnd : SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def CallVoid : SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def Prototype : SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def CallVal : SDNode<"NVPTXISD::CallVal", SDTCallValProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def MoveParam : SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, + []>; +def MoveRetval : SDNode<"NVPTXISD::MoveRetval", SDTMoveRetvalProfile, + [SDNPHasChain, SDNPSideEffect]>; +def StoreRetval : SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile, + [SDNPHasChain, SDNPSideEffect]>; +def MoveToRetval : SDNode<"NVPTXISD::MoveToRetval", SDTStoreRetvalProfile, + [SDNPHasChain, SDNPSideEffect]>; +def PseudoUseParam : SDNode<"NVPTXISD::PseudoUseParam", + SDTPseudoUseParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def RETURNNode : SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile, + [SDNPHasChain, SDNPSideEffect]>; + +class LoadParamMemInst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs regclass:$dst), (ins i32imm:$b), + !strconcat(!strconcat("ld.param", opstr), + "\t$dst, [retval0+$b];"), + [(set regclass:$dst, (LoadParam (i32 1), (i32 imm:$b)))]>; + +class LoadParamRegInst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs regclass:$dst), (ins i32imm:$b), + !strconcat(!strconcat("mov", opstr), + "\t$dst, retval$b;"), + [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>; + +class StoreParamInst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b), + !strconcat(!strconcat("st.param", opstr), + "\t[param$a+$b], $val;"), + [(StoreParam (i32 imm:$a), (i32 imm:$b), regclass:$val)]>; + +class MoveToParamInst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b), + !strconcat(!strconcat("mov", opstr), + "\tparam$a, $val;"), + [(MoveToParam (i32 imm:$a), (i32 imm:$b), regclass:$val)]>; + +class StoreRetvalInst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs), (ins regclass:$val, i32imm:$a), + !strconcat(!strconcat("st.param", opstr), + "\t[func_retval0+$a], $val;"), + [(StoreRetval (i32 imm:$a), regclass:$val)]>; + +class MoveToRetvalInst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs), (ins i32imm:$num, regclass:$val), + !strconcat(!strconcat("mov", opstr), + "\tfunc_retval$num, $val;"), + [(MoveToRetval (i32 imm:$num), regclass:$val)]>; + +class MoveRetvalInst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs), (ins regclass:$val), + !strconcat(!strconcat("mov", opstr), + "\tfunc_retval0, $val;"), + [(MoveRetval regclass:$val)]>; + +def PrintCallRetInst1 : NVPTXInst<(outs), (ins), +"call (retval0), ", + [(PrintCall (i32 1))]>; +def PrintCallRetInst2 : NVPTXInst<(outs), (ins), +"call (retval0, retval1), ", + [(PrintCall (i32 2))]>; +def PrintCallRetInst3 : NVPTXInst<(outs), (ins), +"call (retval0, retval1, retval2), ", + [(PrintCall (i32 3))]>; +def PrintCallRetInst4 : NVPTXInst<(outs), (ins), +"call (retval0, retval1, retval2, retval3), ", + [(PrintCall (i32 4))]>; +def 
PrintCallRetInst5 : NVPTXInst<(outs), (ins), +"call (retval0, retval1, retval2, retval3, retval4), ", + [(PrintCall (i32 5))]>; +def PrintCallRetInst6 : NVPTXInst<(outs), (ins), +"call (retval0, retval1, retval2, retval3, retval4, retval5), ", + [(PrintCall (i32 6))]>; +def PrintCallRetInst7 : NVPTXInst<(outs), (ins), +"call (retval0, retval1, retval2, retval3, retval4, retval5, retval6), ", + [(PrintCall (i32 7))]>; +def PrintCallRetInst8 : NVPTXInst<(outs), (ins), +!strconcat("call (retval0, retval1, retval2, retval3, retval4", + ", retval5, retval6, retval7), "), + [(PrintCall (i32 8))]>; + +def PrintCallNoRetInst : NVPTXInst<(outs), (ins), "call ", + [(PrintCall (i32 0))]>; + +def PrintCallUniRetInst1 : NVPTXInst<(outs), (ins), +"call.uni (retval0), ", + [(PrintCallUni (i32 1))]>; +def PrintCallUniRetInst2 : NVPTXInst<(outs), (ins), +"call.uni (retval0, retval1), ", + [(PrintCallUni (i32 2))]>; +def PrintCallUniRetInst3 : NVPTXInst<(outs), (ins), +"call.uni (retval0, retval1, retval2), ", + [(PrintCallUni (i32 3))]>; +def PrintCallUniRetInst4 : NVPTXInst<(outs), (ins), +"call.uni (retval0, retval1, retval2, retval3), ", + [(PrintCallUni (i32 4))]>; +def PrintCallUniRetInst5 : NVPTXInst<(outs), (ins), +"call.uni (retval0, retval1, retval2, retval3, retval4), ", + [(PrintCallUni (i32 5))]>; +def PrintCallUniRetInst6 : NVPTXInst<(outs), (ins), +"call.uni (retval0, retval1, retval2, retval3, retval4, retval5), ", + [(PrintCallUni (i32 6))]>; +def PrintCallUniRetInst7 : NVPTXInst<(outs), (ins), +"call.uni (retval0, retval1, retval2, retval3, retval4, retval5, retval6), ", + [(PrintCallUni (i32 7))]>; +def PrintCallUniRetInst8 : NVPTXInst<(outs), (ins), +!strconcat("call.uni (retval0, retval1, retval2, retval3, retval4", + ", retval5, retval6, retval7), "), + [(PrintCallUni (i32 8))]>; + +def PrintCallUniNoRetInst : NVPTXInst<(outs), (ins), "call.uni ", + [(PrintCallUni (i32 0))]>; + +def LoadParamMemI64 : LoadParamMemInst<Int64Regs, ".b64">; +def LoadParamMemI32 : LoadParamMemInst<Int32Regs, ".b32">; +def LoadParamMemI16 : LoadParamMemInst<Int16Regs, ".b16">; +def LoadParamMemI8 : LoadParamMemInst<Int8Regs, ".b8">; + +//def LoadParamMemI16 : NVPTXInst<(outs Int16Regs:$dst), (ins i32imm:$b), +// !strconcat("ld.param.b32\ttemp_param_reg, [retval0+$b];\n\t", +// "cvt.u16.u32\t$dst, temp_param_reg;"), +// [(set Int16Regs:$dst, (LoadParam (i32 1), (i32 imm:$b)))]>; +//def LoadParamMemI8 : NVPTXInst<(outs Int8Regs:$dst), (ins i32imm:$b), +// !strconcat("ld.param.b32\ttemp_param_reg, [retval0+$b];\n\t", +// "cvt.u16.u32\t$dst, temp_param_reg;"), +// [(set Int8Regs:$dst, (LoadParam (i32 1), (i32 imm:$b)))]>; + +def LoadParamMemF32 : LoadParamMemInst<Float32Regs, ".f32">; +def LoadParamMemF64 : LoadParamMemInst<Float64Regs, ".f64">; + +def LoadParamRegI64 : LoadParamRegInst<Int64Regs, ".b64">; +def LoadParamRegI32 : LoadParamRegInst<Int32Regs, ".b32">; +def LoadParamRegI16 : NVPTXInst<(outs Int16Regs:$dst), (ins i32imm:$b), + "cvt.u16.u32\t$dst, retval$b;", + [(set Int16Regs:$dst, + (LoadParam (i32 0), (i32 imm:$b)))]>; +def LoadParamRegI8 : NVPTXInst<(outs Int8Regs:$dst), (ins i32imm:$b), + "cvt.u16.u32\t$dst, retval$b;", + [(set Int8Regs:$dst, + (LoadParam (i32 0), (i32 imm:$b)))]>; + +def LoadParamRegF32 : LoadParamRegInst<Float32Regs, ".f32">; +def LoadParamRegF64 : LoadParamRegInst<Float64Regs, ".f64">; + +def StoreParamI64 : StoreParamInst<Int64Regs, ".b64">; +def StoreParamI32 : StoreParamInst<Int32Regs, ".b32">; + +def StoreParamI16 : NVPTXInst<(outs), + (ins Int16Regs:$val, i32imm:$a, 
i32imm:$b), + "st.param.b16\t[param$a+$b], $val;", + [(StoreParam (i32 imm:$a), (i32 imm:$b), Int16Regs:$val)]>; + +def StoreParamI8 : NVPTXInst<(outs), + (ins Int8Regs:$val, i32imm:$a, i32imm:$b), + "st.param.b8\t[param$a+$b], $val;", + [(StoreParam + (i32 imm:$a), (i32 imm:$b), Int8Regs:$val)]>; + +def StoreParamS32I16 : NVPTXInst<(outs), + (ins Int16Regs:$val, i32imm:$a, i32imm:$b), + !strconcat("cvt.s32.s16\ttemp_param_reg, $val;\n\t", + "st.param.b32\t[param$a+$b], temp_param_reg;"), + [(StoreParamS32 (i32 imm:$a), (i32 imm:$b), Int16Regs:$val)]>; +def StoreParamU32I16 : NVPTXInst<(outs), + (ins Int16Regs:$val, i32imm:$a, i32imm:$b), + !strconcat("cvt.u32.u16\ttemp_param_reg, $val;\n\t", + "st.param.b32\t[param$a+$b], temp_param_reg;"), + [(StoreParamU32 (i32 imm:$a), (i32 imm:$b), Int16Regs:$val)]>; + +def StoreParamU32I8 : NVPTXInst<(outs), + (ins Int8Regs:$val, i32imm:$a, i32imm:$b), + !strconcat("cvt.u32.u8\ttemp_param_reg, $val;\n\t", + "st.param.b32\t[param$a+$b], temp_param_reg;"), + [(StoreParamU32 (i32 imm:$a), (i32 imm:$b), Int8Regs:$val)]>; +def StoreParamS32I8 : NVPTXInst<(outs), + (ins Int8Regs:$val, i32imm:$a, i32imm:$b), + !strconcat("cvt.s32.s8\ttemp_param_reg, $val;\n\t", + "st.param.b32\t[param$a+$b], temp_param_reg;"), + [(StoreParamS32 (i32 imm:$a), (i32 imm:$b), Int8Regs:$val)]>; + +def StoreParamF32 : StoreParamInst<Float32Regs, ".f32">; +def StoreParamF64 : StoreParamInst<Float64Regs, ".f64">; + +def MoveToParamI64 : MoveToParamInst<Int64Regs, ".b64">; +def MoveToParamI32 : MoveToParamInst<Int32Regs, ".b32">; +def MoveToParamF64 : MoveToParamInst<Float64Regs, ".f64">; +def MoveToParamF32 : MoveToParamInst<Float32Regs, ".f32">; +def MoveToParamI16 : NVPTXInst<(outs), + (ins Int16Regs:$val, i32imm:$a, i32imm:$b), + !strconcat("cvt.u32.u16\ttemp_param_reg, $val;\n\t", + "mov.b32\tparam$a, temp_param_reg;"), + [(MoveToParam (i32 imm:$a), (i32 imm:$b), Int16Regs:$val)]>; +def MoveToParamI8 : NVPTXInst<(outs), + (ins Int8Regs:$val, i32imm:$a, i32imm:$b), + !strconcat("cvt.u32.u16\ttemp_param_reg, $val;\n\t", + "mov.b32\tparam$a, temp_param_reg;"), + [(MoveToParam (i32 imm:$a), (i32 imm:$b), Int8Regs:$val)]>; + +def StoreRetvalI64 : StoreRetvalInst<Int64Regs, ".b64">; +def StoreRetvalI32 : StoreRetvalInst<Int32Regs, ".b32">; +def StoreRetvalI16 : StoreRetvalInst<Int16Regs, ".b16">; +def StoreRetvalI8 : StoreRetvalInst<Int8Regs, ".b8">; + +//def StoreRetvalI16 : NVPTXInst<(outs), (ins Int16Regs:$val, i32imm:$a), +// !strconcat("\{\n\t", +// !strconcat(".reg .b32 temp_retval_reg;\n\t", +// !strconcat("cvt.u32.u16\ttemp_retval_reg, $val;\n\t", +// "st.param.b32\t[func_retval0+$a], temp_retval_reg;\n\t\}"))), +// [(StoreRetval (i32 imm:$a), Int16Regs:$val)]>; +//def StoreRetvalI8 : NVPTXInst<(outs), (ins Int8Regs:$val, i32imm:$a), +// !strconcat("\{\n\t", +// !strconcat(".reg .b32 temp_retval_reg;\n\t", +// !strconcat("cvt.u32.u16\ttemp_retval_reg, $val;\n\t", +// "st.param.b32\t[func_retval0+$a], temp_retval_reg;\n\t\}"))), +// [(StoreRetval (i32 imm:$a), Int8Regs:$val)]>; + +def StoreRetvalF64 : StoreRetvalInst<Float64Regs, ".f64">; +def StoreRetvalF32 : StoreRetvalInst<Float32Regs, ".f32">; + +def MoveRetvalI64 : MoveRetvalInst<Int64Regs, ".b64">; +def MoveRetvalI32 : MoveRetvalInst<Int32Regs, ".b32">; +def MoveRetvalI16 : MoveRetvalInst<Int16Regs, ".b16">; +def MoveRetvalI8 : MoveRetvalInst<Int8Regs, ".b8">; +def MoveRetvalF64 : MoveRetvalInst<Float64Regs, ".f64">; +def MoveRetvalF32 : MoveRetvalInst<Float32Regs, ".f32">; + +def MoveToRetvalI64 : 
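// Note (editorial): the *S32/*U32 store-param variants above serve arguments
// that the calling convention promotes to 32 bits; they widen through the
// temp_param_reg scratch register (declared by Callseq_Start further below):
//   cvt.s32.s16 \ttemp_param_reg, %rs1;        // sign-extending case
//   st.param.b32 \t[param0+0], temp_param_reg;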
MoveToRetvalInst<Int64Regs, ".b64">; +def MoveToRetvalI32 : MoveToRetvalInst<Int32Regs, ".b32">; +def MoveToRetvalF64 : MoveToRetvalInst<Float64Regs, ".f64">; +def MoveToRetvalF32 : MoveToRetvalInst<Float32Regs, ".f32">; +def MoveToRetvalI16 : NVPTXInst<(outs), (ins i32imm:$num, Int16Regs:$val), + "cvt.u32.u16\tfunc_retval$num, $val;", + [(MoveToRetval (i32 imm:$num), Int16Regs:$val)]>; +def MoveToRetvalI8 : NVPTXInst<(outs), (ins i32imm:$num, Int8Regs:$val), + "cvt.u32.u16\tfunc_retval$num, $val;", + [(MoveToRetval (i32 imm:$num), Int8Regs:$val)]>; + +def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>; +def CallArgEndInst1 : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>; +def CallArgEndInst0 : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>; +def RETURNInst : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>; + +class CallArgInst<NVPTXRegClass regclass> : + NVPTXInst<(outs), (ins regclass:$a), "$a, ", + [(CallArg (i32 0), regclass:$a)]>; + +class LastCallArgInst<NVPTXRegClass regclass> : + NVPTXInst<(outs), (ins regclass:$a), "$a", + [(LastCallArg (i32 0), regclass:$a)]>; + +def CallArgI64 : CallArgInst<Int64Regs>; +def CallArgI32 : CallArgInst<Int32Regs>; +def CallArgI16 : CallArgInst<Int16Regs>; +def CallArgI8 : CallArgInst<Int8Regs>; + +def CallArgF64 : CallArgInst<Float64Regs>; +def CallArgF32 : CallArgInst<Float32Regs>; + +def LastCallArgI64 : LastCallArgInst<Int64Regs>; +def LastCallArgI32 : LastCallArgInst<Int32Regs>; +def LastCallArgI16 : LastCallArgInst<Int16Regs>; +def LastCallArgI8 : LastCallArgInst<Int8Regs>; + +def LastCallArgF64 : LastCallArgInst<Float64Regs>; +def LastCallArgF32 : LastCallArgInst<Float32Regs>; + +def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ", + [(CallArg (i32 0), (i32 imm:$a))]>; +def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a", + [(LastCallArg (i32 0), (i32 imm:$a))]>; + +def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ", + [(CallArg (i32 1), (i32 imm:$a))]>; +def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a", + [(LastCallArg (i32 1), (i32 imm:$a))]>; + +def CallVoidInst : NVPTXInst<(outs), (ins imem:$addr), + "$addr, ", + [(CallVoid (Wrapper tglobaladdr:$addr))]>; +def CallVoidInstReg : NVPTXInst<(outs), (ins Int32Regs:$addr), + "$addr, ", + [(CallVoid Int32Regs:$addr)]>; +def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), + "$addr, ", + [(CallVoid Int64Regs:$addr)]>; +def PrototypeInst : NVPTXInst<(outs), (ins i32imm:$val), + ", prototype_$val;", + [(Prototype (i32 imm:$val))]>; + +def DeclareRetMemInst : NVPTXInst<(outs), + (ins i32imm:$align, i32imm:$size, i32imm:$num), + ".param .align $align .b8 retval$num[$size];", + [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>; +def DeclareRetScalarInst : NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num), + ".param .b$size retval$num;", + [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>; +def DeclareRetRegInst : NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num), + ".reg .b$size retval$num;", + [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>; + +def DeclareParamInst : NVPTXInst<(outs), + (ins i32imm:$align, i32imm:$a, i32imm:$size), + ".param .align $align .b8 param$a[$size];", + [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>; +def DeclareScalarParamInst : NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size), + ".param .b$size param$a;", + [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>; +def DeclareScalarRegInst : NVPTXInst<(outs), 
(ins i32imm:$a, i32imm:$size), + ".reg .b$size param$a;", + [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>; + +class MoveParamInst<NVPTXRegClass regclass, string asmstr> : + NVPTXInst<(outs regclass:$dst), (ins regclass:$src), + !strconcat(!strconcat("mov", asmstr), "\t$dst, $src;"), + [(set regclass:$dst, (MoveParam regclass:$src))]>; + +def MoveParamI64 : MoveParamInst<Int64Regs, ".b64">; +def MoveParamI32 : MoveParamInst<Int32Regs, ".b32">; +def MoveParamI16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), + "cvt.u16.u32\t$dst, $src;", + [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>; +def MoveParamI8 : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$src), + "cvt.u16.u32\t$dst, $src;", + [(set Int8Regs:$dst, (MoveParam Int8Regs:$src))]>; +def MoveParamF64 : MoveParamInst<Float64Regs, ".f64">; +def MoveParamF32 : MoveParamInst<Float32Regs, ".f32">; + +class PseudoUseParamInst<NVPTXRegClass regclass> : + NVPTXInst<(outs), (ins regclass:$src), + "// Pseudo use of $src", + [(PseudoUseParam regclass:$src)]>; + +def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>; +def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>; +def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs>; +def PseudoUseParamI8 : PseudoUseParamInst<Int8Regs>; +def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs>; +def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>; + + +// +// Load / Store Handling +// +multiclass LD<NVPTXRegClass regclass> { + def _avar : NVPTXInst<(outs regclass:$dst), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr), +!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t$dst, [$addr];"), []>; + def _areg : NVPTXInst<(outs regclass:$dst), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr), +!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t$dst, [$addr];"), []>; + def _ari : NVPTXInst<(outs regclass:$dst), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), +!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t$dst, [$addr+$offset];"), []>; + def _asi : NVPTXInst<(outs regclass:$dst), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr, i32imm:$offset), +!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t$dst, [$addr+$offset];"), []>; +} + +let mayLoad=1, neverHasSideEffects=1 in { +defm LD_i8 : LD<Int8Regs>; +defm LD_i16 : LD<Int16Regs>; +defm LD_i32 : LD<Int32Regs>; +defm LD_i64 : LD<Int64Regs>; +defm LD_f32 : LD<Float32Regs>; +defm LD_f64 : LD<Float64Regs>; +} + +let VecInstType=isVecLD.Value, mayLoad=1, neverHasSideEffects=1 in { +defm LD_v2i8 : LD<V2I8Regs>; +defm LD_v4i8 : LD<V4I8Regs>; +defm LD_v2i16 : LD<V2I16Regs>; +defm LD_v4i16 : LD<V4I16Regs>; +defm LD_v2i32 : LD<V2I32Regs>; +defm LD_v4i32 : LD<V4I32Regs>; +defm LD_v2f32 : LD<V2F32Regs>; +defm LD_v4f32 : LD<V4F32Regs>; +defm LD_v2i64 : LD<V2I64Regs>; +defm LD_v2f64 : LD<V2F64Regs>; +} + +multiclass ST<NVPTXRegClass regclass> { + def _avar : NVPTXInst<(outs), + (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$toWidth, imem:$addr), +!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth", + " \t[$addr], $src;"), []>; + def _areg : NVPTXInst<(outs), + (ins 
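// Illustration (editorial; 'gbuf' and register names are hypothetical): the
// four address forms of the LD/ST multiclasses print as
//   ld.global.u32 \t%r1, [gbuf];       // _avar: symbol
//   ld.global.u32 \t%r1, [gbuf+4];     // _asi:  symbol + immediate offset
//   ld.global.u32 \t%r1, [%r2];        // _areg: register
//   ld.global.u32 \t%r1, [%r2+4];      // _ari:  register + immediate offset
// with the LdStCode operands filling in the volatile / address-space /
// vector / sign-and-width pieces at print time.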
regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr), +!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth", + " \t[$addr], $src;"), []>; + def _ari : NVPTXInst<(outs), + (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset), +!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth", + " \t[$addr+$offset], $src;"), []>; + def _asi : NVPTXInst<(outs), + (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset), +!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth", + " \t[$addr+$offset], $src;"), []>; +} + +let mayStore=1, neverHasSideEffects=1 in { +defm ST_i8 : ST<Int8Regs>; +defm ST_i16 : ST<Int16Regs>; +defm ST_i32 : ST<Int32Regs>; +defm ST_i64 : ST<Int64Regs>; +defm ST_f32 : ST<Float32Regs>; +defm ST_f64 : ST<Float64Regs>; +} + +let VecInstType=isVecST.Value, mayStore=1, neverHasSideEffects=1 in { +defm ST_v2i8 : ST<V2I8Regs>; +defm ST_v4i8 : ST<V4I8Regs>; +defm ST_v2i16 : ST<V2I16Regs>; +defm ST_v4i16 : ST<V4I16Regs>; +defm ST_v2i32 : ST<V2I32Regs>; +defm ST_v4i32 : ST<V4I32Regs>; +defm ST_v2f32 : ST<V2F32Regs>; +defm ST_v4f32 : ST<V4F32Regs>; +defm ST_v2i64 : ST<V2I64Regs>; +defm ST_v2f64 : ST<V2F64Regs>; +} + +// The following is used only in and after vector elementizations. +// Vector elementization happens at the machine instruction level, so the +// following instruction +// never appears in the DAG. +multiclass LD_VEC<NVPTXRegClass regclass> { + def _v2_avar : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr), + !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t{{$dst1, $dst2}}, [$addr];"), []>; + def _v2_areg : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr), + !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t{{$dst1, $dst2}}, [$addr];"), []>; + def _v2_ari : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), + !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];"), []>; + def _v2_asi : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr, i32imm:$offset), + !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];"), []>; + def _v4_avar : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, + regclass:$dst3, regclass:$dst4), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr), + !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>; + def _v4_areg : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, + regclass:$dst4), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr), + !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth 
\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>; + def _v4_ari : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, + regclass:$dst4), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), + !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"), + []>; + def _v4_asi : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, + regclass:$dst4), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr, i32imm:$offset), + !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"), + []>; +} +let mayLoad=1, neverHasSideEffects=1 in { +defm LDV_i8 : LD_VEC<Int8Regs>; +defm LDV_i16 : LD_VEC<Int16Regs>; +defm LDV_i32 : LD_VEC<Int32Regs>; +defm LDV_i64 : LD_VEC<Int64Regs>; +defm LDV_f32 : LD_VEC<Float32Regs>; +defm LDV_f64 : LD_VEC<Float64Regs>; +} + +multiclass ST_VEC<NVPTXRegClass regclass> { + def _v2_avar : NVPTXInst<(outs), + (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), + !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t[$addr], {{$src1, $src2}};"), []>; + def _v2_areg : NVPTXInst<(outs), + (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), + !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t[$addr], {{$src1, $src2}};"), []>; + def _v2_ari : NVPTXInst<(outs), + (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, + i32imm:$offset), + !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t[$addr+$offset], {{$src1, $src2}};"), []>; + def _v2_asi : NVPTXInst<(outs), + (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, + i32imm:$offset), + !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t[$addr+$offset], {{$src1, $src2}};"), []>; + def _v4_avar : NVPTXInst<(outs), + (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, + LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr), + !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};"), []>; + def _v4_areg : NVPTXInst<(outs), + (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, + LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr), + !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};"), []>; + def _v4_ari : NVPTXInst<(outs), + (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, + LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), + !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};"), + []>; + def _v4_asi : NVPTXInst<(outs), + (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, + LdStCode:$isVol, 
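// Illustration (registers hypothetical): the vector forms print brace-wrapped
// register tuples, e.g.
//   ld.global.v4.f32 \t{%f1, %f2, %f3, %f4}, [%r1+16];
//   st.global.v2.f64 \t[%r1], {%fd1, %fd2};
// The doubled {{ }} in the templates escape to a single literal brace pair in
// the emitted PTX.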
LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr, i32imm:$offset), + !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};"), + []>; +} +let mayStore=1, neverHasSideEffects=1 in { +defm STV_i8 : ST_VEC<Int8Regs>; +defm STV_i16 : ST_VEC<Int16Regs>; +defm STV_i32 : ST_VEC<Int32Regs>; +defm STV_i64 : ST_VEC<Int64Regs>; +defm STV_f32 : ST_VEC<Float32Regs>; +defm STV_f64 : ST_VEC<Float64Regs>; +} + + +//---- Conversion ---- + +multiclass CVT_INT_TO_FP <string OpStr, SDNode OpNode> { +// FIXME: need to add f16 support +// def CVTf16i8 : +// NVPTXInst<(outs Float16Regs:$d), (ins Int8Regs:$a), +// !strconcat(!strconcat("cvt.rn.f16.", OpStr), "8 \t$d, $a;"), +// [(set Float16Regs:$d, (OpNode Int8Regs:$a))]>; +// def CVTf16i16 : +// NVPTXInst<(outs Float16Regs:$d), (ins Int16Regs:$a), +// !strconcat(!strconcat("cvt.rn.f16.", OpStr), "16 \t$d, $a;"), +// [(set Float16Regs:$d, (OpNode Int16Regs:$a))]>; +// def CVTf16i32 : +// NVPTXInst<(outs Float16Regs:$d), (ins Int32Regs:$a), +// !strconcat(!strconcat("cvt.rn.f16.", OpStr), "32 \t$d, $a;"), +// [(set Float16Regs:$d, (OpNode Int32Regs:$a))]>; +// def CVTf16i64: +// NVPTXInst<(outs Float16Regs:$d), (ins Int64Regs:$a), +// !strconcat(!strconcat("cvt.rn.f32.", OpStr), "64 \t$d, $a;"), +// [(set Float32Regs:$d, (OpNode Int64Regs:$a))]>; + + def CVTf32i1 : + NVPTXInst<(outs Float32Regs:$d), (ins Int1Regs:$a), + "selp.f32 \t$d, 1.0, 0.0, $a;", + [(set Float32Regs:$d, (OpNode Int1Regs:$a))]>; + def CVTf32i8 : + NVPTXInst<(outs Float32Regs:$d), (ins Int8Regs:$a), + !strconcat(!strconcat("cvt.rn.f32.", OpStr), "8 \t$d, $a;"), + [(set Float32Regs:$d, (OpNode Int8Regs:$a))]>; + def CVTf32i16 : + NVPTXInst<(outs Float32Regs:$d), (ins Int16Regs:$a), + !strconcat(!strconcat("cvt.rn.f32.", OpStr), "16 \t$d, $a;"), + [(set Float32Regs:$d, (OpNode Int16Regs:$a))]>; + def CVTf32i32 : + NVPTXInst<(outs Float32Regs:$d), (ins Int32Regs:$a), + !strconcat(!strconcat("cvt.rn.f32.", OpStr), "32 \t$d, $a;"), + [(set Float32Regs:$d, (OpNode Int32Regs:$a))]>; + def CVTf32i64: + NVPTXInst<(outs Float32Regs:$d), (ins Int64Regs:$a), + !strconcat(!strconcat("cvt.rn.f32.", OpStr), "64 \t$d, $a;"), + [(set Float32Regs:$d, (OpNode Int64Regs:$a))]>; + + def CVTf64i1 : + NVPTXInst<(outs Float64Regs:$d), (ins Int1Regs:$a), + "selp.f64 \t$d, 1.0, 0.0, $a;", + [(set Float64Regs:$d, (OpNode Int1Regs:$a))]>; + def CVTf64i8 : + NVPTXInst<(outs Float64Regs:$d), (ins Int8Regs:$a), + !strconcat(!strconcat("cvt.rn.f64.", OpStr), "8 \t$d, $a;"), + [(set Float64Regs:$d, (OpNode Int8Regs:$a))]>; + def CVTf64i16 : + NVPTXInst<(outs Float64Regs:$d), (ins Int16Regs:$a), + !strconcat(!strconcat("cvt.rn.f64.", OpStr), "16 \t$d, $a;"), + [(set Float64Regs:$d, (OpNode Int16Regs:$a))]>; + def CVTf64i32 : + NVPTXInst<(outs Float64Regs:$d), (ins Int32Regs:$a), + !strconcat(!strconcat("cvt.rn.f64.", OpStr), "32 \t$d, $a;"), + [(set Float64Regs:$d, (OpNode Int32Regs:$a))]>; + def CVTf64i64: + NVPTXInst<(outs Float64Regs:$d), (ins Int64Regs:$a), + !strconcat(!strconcat("cvt.rn.f64.", OpStr), "64 \t$d, $a;"), + [(set Float64Regs:$d, (OpNode Int64Regs:$a))]>; +} + +defm Sint_to_fp : CVT_INT_TO_FP <"s", sint_to_fp>; +defm Uint_to_fp : CVT_INT_TO_FP <"u", uint_to_fp>; + +multiclass CVT_FP_TO_INT <string OpStr, SDNode OpNode> { +// FIXME: need to add f16 support +// def CVTi8f16: +// NVPTXInst<(outs Int8Regs:$d), (ins Float16Regs:$a), +// !strconcat(!strconcat("cvt.rzi.", OpStr), "8.f16 $d, $a;"), +// 
[(set Int8Regs:$d, (OpNode Float16Regs:$a))]>; + def CVTi8f32_ftz: + NVPTXInst<(outs Int8Regs:$d), (ins Float32Regs:$a), + !strconcat(!strconcat("cvt.rzi.ftz.", OpStr), "16.f32 \t$d, $a;"), + [(set Int8Regs:$d, (OpNode Float32Regs:$a))]>, Requires<[doF32FTZ]>; + def CVTi8f32: + NVPTXInst<(outs Int8Regs:$d), (ins Float32Regs:$a), + !strconcat(!strconcat("cvt.rzi.", OpStr), "16.f32 \t$d, $a;"), + [(set Int8Regs:$d, (OpNode Float32Regs:$a))]>; + def CVTi8f64: + NVPTXInst<(outs Int8Regs:$d), (ins Float64Regs:$a), + !strconcat(!strconcat("cvt.rzi.", OpStr), "16.f64 \t$d, $a;"), + [(set Int8Regs:$d, (OpNode Float64Regs:$a))]>; + +// FIXME: need to add f16 support +// def CVTi16f16: +// NVPTXInst<(outs Int16Regs:$d), (ins Float16Regs:$a), +// !strconcat(!strconcat("cvt.rzi.", OpStr), "16.f16 \t$d, $a;"), +// [(set Int16Regs:$d, (OpNode Float16Regs:$a))]>; + def CVTi16f32_ftz: + NVPTXInst<(outs Int16Regs:$d), (ins Float32Regs:$a), + !strconcat(!strconcat("cvt.rzi.ftz.", OpStr), "16.f32 \t$d, $a;"), + [(set Int16Regs:$d, (OpNode Float32Regs:$a))]>, Requires<[doF32FTZ]>; + def CVTi16f32: + NVPTXInst<(outs Int16Regs:$d), (ins Float32Regs:$a), + !strconcat(!strconcat("cvt.rzi.", OpStr), "16.f32 \t$d, $a;"), + [(set Int16Regs:$d, (OpNode Float32Regs:$a))]>; + def CVTi16f64: + NVPTXInst<(outs Int16Regs:$d), (ins Float64Regs:$a), + !strconcat(!strconcat("cvt.rzi.", OpStr), "16.f64 \t$d, $a;"), + [(set Int16Regs:$d, (OpNode Float64Regs:$a))]>; + +// FIXME: need to add f16 support +// def CVTi32f16: def CVTi32f16: +// NVPTXInst<(outs Int32Regs:$d), (ins Float16Regs:$a), +// !strconcat(!strconcat("cvt.rzi.", OpStr), "32.f16 \t$d, $a;"), +// [(set Int32Regs:$d, (OpNode Float16Regs:$a))]>; + def CVTi32f32_ftz: + NVPTXInst<(outs Int32Regs:$d), (ins Float32Regs:$a), + !strconcat(!strconcat("cvt.rzi.ftz.", OpStr), "32.f32 \t$d, $a;"), + [(set Int32Regs:$d, (OpNode Float32Regs:$a))]>, Requires<[doF32FTZ]>; + def CVTi32f32: + NVPTXInst<(outs Int32Regs:$d), (ins Float32Regs:$a), + !strconcat(!strconcat("cvt.rzi.", OpStr), "32.f32 \t$d, $a;"), + [(set Int32Regs:$d, (OpNode Float32Regs:$a))]>; + def CVTi32f64: + NVPTXInst<(outs Int32Regs:$d), (ins Float64Regs:$a), + !strconcat(!strconcat("cvt.rzi.", OpStr), "32.f64 \t$d, $a;"), + [(set Int32Regs:$d, (OpNode Float64Regs:$a))]>; + +// FIXME: need to add f16 support +// def CVTi64f16: +// NVPTXInst<(outs Int64Regs:$d), (ins Float16Regs:$a), +// !strconcat(!strconcat("cvt.rzi.", OpStr), "64.f16 \t$d, $a;"), +// [(set Int64Regs:$d, (OpNode Float16Regs:$a))]>; + def CVTi64f32_ftz: + NVPTXInst<(outs Int64Regs:$d), (ins Float32Regs:$a), + !strconcat(!strconcat("cvt.rzi.ftz.", OpStr), "64.f32 \t$d, $a;"), + [(set Int64Regs:$d, (OpNode Float32Regs:$a))]>, Requires<[doF32FTZ]>; + def CVTi64f32: + NVPTXInst<(outs Int64Regs:$d), (ins Float32Regs:$a), + !strconcat(!strconcat("cvt.rzi.", OpStr), "64.f32 \t$d, $a;"), + [(set Int64Regs:$d, (OpNode Float32Regs:$a))]>; + def CVTi64f64: + NVPTXInst<(outs Int64Regs:$d), (ins Float64Regs:$a), + !strconcat(!strconcat("cvt.rzi.", OpStr), "64.f64 \t$d, $a;"), + [(set Int64Regs:$d, (OpNode Float64Regs:$a))]>; +} + +defm Fp_to_sint : CVT_FP_TO_INT <"s", fp_to_sint>; +defm Fp_to_uint : CVT_FP_TO_INT <"u", fp_to_uint>; + +multiclass INT_EXTEND_UNSIGNED_1 <SDNode OpNode> { + def ext1to8: + NVPTXInst<(outs Int8Regs:$d), (ins Int1Regs:$a), + "selp.u16 \t$d, 1, 0, $a;", + [(set Int8Regs:$d, (OpNode Int1Regs:$a))]>; + def ext1to16: + NVPTXInst<(outs Int16Regs:$d), (ins Int1Regs:$a), + "selp.u16 \t$d, 1, 0, $a;", + [(set Int16Regs:$d, (OpNode 
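// Note (editorial): int-to-fp conversions above round to nearest even (.rn),
// while fp-to-int truncates toward zero (.rzi), matching C cast semantics:
//   cvt.rn.f32.s32 \t%f1, %r1;    // (float)i
//   cvt.rzi.s32.f32 \t%r2, %f1;   // (int)f
// i8 results are produced at 16-bit width (the "16.f32" suffixes above) since
// PTX keeps 8-bit values in 16-bit registers.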
Int1Regs:$a))]>;
+  def ext1to32:
+    NVPTXInst<(outs Int32Regs:$d), (ins Int1Regs:$a),
+              "selp.u32 \t$d, 1, 0, $a;",
+              [(set Int32Regs:$d, (OpNode Int1Regs:$a))]>;
+  def ext1to64:
+    NVPTXInst<(outs Int64Regs:$d), (ins Int1Regs:$a),
+              "selp.u64 \t$d, 1, 0, $a;",
+              [(set Int64Regs:$d, (OpNode Int1Regs:$a))]>;
+}
+
+multiclass INT_EXTEND_SIGNED_1 <SDNode OpNode> {
+  def ext1to8:
+    NVPTXInst<(outs Int8Regs:$d), (ins Int1Regs:$a),
+              "selp.s16 \t$d, -1, 0, $a;",
+              [(set Int8Regs:$d, (OpNode Int1Regs:$a))]>;
+  def ext1to16:
+    NVPTXInst<(outs Int16Regs:$d), (ins Int1Regs:$a),
+              "selp.s16 \t$d, -1, 0, $a;",
+              [(set Int16Regs:$d, (OpNode Int1Regs:$a))]>;
+  def ext1to32:
+    NVPTXInst<(outs Int32Regs:$d), (ins Int1Regs:$a),
+              "selp.s32 \t$d, -1, 0, $a;",
+              [(set Int32Regs:$d, (OpNode Int1Regs:$a))]>;
+  def ext1to64:
+    NVPTXInst<(outs Int64Regs:$d), (ins Int1Regs:$a),
+              "selp.s64 \t$d, -1, 0, $a;",
+              [(set Int64Regs:$d, (OpNode Int1Regs:$a))]>;
+}
+
+multiclass INT_EXTEND <string OpStr, SDNode OpNode> {
+  // All Int8Regs are emitted as 16-bit registers in PTX,
+  // and there is no selp.u8 in PTX.
+  def ext8to16:
+    NVPTXInst<(outs Int16Regs:$d), (ins Int8Regs:$a),
+              !strconcat("cvt.", !strconcat(OpStr, !strconcat("16.",
+              !strconcat(OpStr, "8 \t$d, $a;")))),
+              [(set Int16Regs:$d, (OpNode Int8Regs:$a))]>;
+  def ext8to32:
+    NVPTXInst<(outs Int32Regs:$d), (ins Int8Regs:$a),
+              !strconcat("cvt.", !strconcat(OpStr, !strconcat("32.",
+              !strconcat(OpStr, "8 \t$d, $a;")))),
+              [(set Int32Regs:$d, (OpNode Int8Regs:$a))]>;
+  def ext8to64:
+    NVPTXInst<(outs Int64Regs:$d), (ins Int8Regs:$a),
+              !strconcat("cvt.", !strconcat(OpStr, !strconcat("64.",
+              !strconcat(OpStr, "8 \t$d, $a;")))),
+              [(set Int64Regs:$d, (OpNode Int8Regs:$a))]>;
+  def ext16to32:
+    NVPTXInst<(outs Int32Regs:$d), (ins Int16Regs:$a),
+              !strconcat("cvt.", !strconcat(OpStr, !strconcat("32.",
+              !strconcat(OpStr, "16 \t$d, $a;")))),
+              [(set Int32Regs:$d, (OpNode Int16Regs:$a))]>;
+  def ext16to64:
+    NVPTXInst<(outs Int64Regs:$d), (ins Int16Regs:$a),
+              !strconcat("cvt.", !strconcat(OpStr, !strconcat("64.",
+              !strconcat(OpStr, "16 \t$d, $a;")))),
+              [(set Int64Regs:$d, (OpNode Int16Regs:$a))]>;
+  def ext32to64:
+    NVPTXInst<(outs Int64Regs:$d), (ins Int32Regs:$a),
+              !strconcat("cvt.", !strconcat(OpStr, !strconcat("64.",
+              !strconcat(OpStr, "32 \t$d, $a;")))),
+              [(set Int64Regs:$d, (OpNode Int32Regs:$a))]>;
+}
+
+defm Sint_extend_1 : INT_EXTEND_SIGNED_1<sext>;
+defm Zint_extend_1 : INT_EXTEND_UNSIGNED_1<zext>;
+defm Aint_extend_1 : INT_EXTEND_UNSIGNED_1<anyext>;
+
+defm Sint_extend : INT_EXTEND <"s", sext>;
+defm Zint_extend : INT_EXTEND <"u", zext>;
+defm Aint_extend : INT_EXTEND <"u", anyext>;
+
+class TRUNC_to1_asm<string sz> {
+  string s = !strconcat("{{\n\t",
+             !strconcat(".reg ",
+             !strconcat(sz,
+             !strconcat(" temp;\n\t",
+             !strconcat("and",
+             !strconcat(sz,
+             !strconcat("\t temp, $a, 1;\n\t",
+             !strconcat("setp",
+             !strconcat(sz, ".eq \t $d, temp, 1;\n\t}}")))))))));
+}
+
+def TRUNC_64to32 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+                             "cvt.u32.u64 \t$d, $a;",
+                             [(set Int32Regs:$d, (trunc Int64Regs:$a))]>;
+def TRUNC_64to16 : NVPTXInst<(outs Int16Regs:$d), (ins Int64Regs:$a),
+                             "cvt.u16.u64 \t$d, $a;",
+                             [(set Int16Regs:$d, (trunc Int64Regs:$a))]>;
+def TRUNC_64to8 : NVPTXInst<(outs Int8Regs:$d), (ins Int64Regs:$a),
+                            "cvt.u8.u64 \t$d, $a;",
+                            [(set Int8Regs:$d, (trunc Int64Regs:$a))]>;
+def TRUNC_32to16 : NVPTXInst<(outs Int16Regs:$d), (ins Int32Regs:$a),
+                             "cvt.u16.u32 \t$d, $a;",
+                             [(set Int16Regs:$d, (trunc Int32Regs:$a))]>;
+def TRUNC_32to8 :
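// Illustration: truncation to i1 cannot be a plain cvt; TRUNC_to1_asm tests
// the low bit instead. For example, TRUNC_32to1 expands to
//   {{ .reg .b32 temp;
//      and.b32 \t temp, $a, 1;
//      setp.b32.eq \t $d, temp, 1; }}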
NVPTXInst<(outs Int8Regs:$d), (ins Int32Regs:$a), + "cvt.u8.u32 \t$d, $a;", + [(set Int8Regs:$d, (trunc Int32Regs:$a))]>; +def TRUNC_16to8 : NVPTXInst<(outs Int8Regs:$d), (ins Int16Regs:$a), + "cvt.u8.u16 \t$d, $a;", + [(set Int8Regs:$d, (trunc Int16Regs:$a))]>; +def TRUNC_64to1 : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + TRUNC_to1_asm<".b64">.s, + [(set Int1Regs:$d, (trunc Int64Regs:$a))]>; +def TRUNC_32to1 : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a), + TRUNC_to1_asm<".b32">.s, + [(set Int1Regs:$d, (trunc Int32Regs:$a))]>; +def TRUNC_16to1 : NVPTXInst<(outs Int1Regs:$d), (ins Int16Regs:$a), + TRUNC_to1_asm<".b16">.s, + [(set Int1Regs:$d, (trunc Int16Regs:$a))]>; +def TRUNC_8to1 : NVPTXInst<(outs Int1Regs:$d), (ins Int8Regs:$a), + TRUNC_to1_asm<".b16">.s, + [(set Int1Regs:$d, (trunc Int8Regs:$a))]>; + +// Select instructions +def : Pat<(select Int32Regs:$pred, Int8Regs:$a, Int8Regs:$b), + (SELECTi8rr Int8Regs:$a, Int8Regs:$b, (TRUNC_32to1 Int32Regs:$pred))>; +def : Pat<(select Int32Regs:$pred, Int16Regs:$a, Int16Regs:$b), + (SELECTi16rr Int16Regs:$a, Int16Regs:$b, + (TRUNC_32to1 Int32Regs:$pred))>; +def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b), + (SELECTi32rr Int32Regs:$a, Int32Regs:$b, + (TRUNC_32to1 Int32Regs:$pred))>; +def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b), + (SELECTi64rr Int64Regs:$a, Int64Regs:$b, + (TRUNC_32to1 Int32Regs:$pred))>; +def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b), + (SELECTf32rr Float32Regs:$a, Float32Regs:$b, + (TRUNC_32to1 Int32Regs:$pred))>; +def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b), + (SELECTf64rr Float64Regs:$a, Float64Regs:$b, + (TRUNC_32to1 Int32Regs:$pred))>; + +class F_BITCONVERT<string SzStr, NVPTXRegClass regclassIn, + NVPTXRegClass regclassOut> : + NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a), + !strconcat("mov.b", !strconcat(SzStr, " \t $d, $a;")), + [(set regclassOut:$d, (bitconvert regclassIn:$a))]>; + +def BITCONVERT_32_I2F : F_BITCONVERT<"32", Int32Regs, Float32Regs>; +def BITCONVERT_32_F2I : F_BITCONVERT<"32", Float32Regs, Int32Regs>; +def BITCONVERT_64_I2F : F_BITCONVERT<"64", Int64Regs, Float64Regs>; +def BITCONVERT_64_F2I : F_BITCONVERT<"64", Float64Regs, Int64Regs>; + +// pack a set of smaller int registers to a larger int register +def V4I8toI32 : NVPTXInst<(outs Int32Regs:$d), + (ins Int8Regs:$s1, Int8Regs:$s2, + Int8Regs:$s3, Int8Regs:$s4), + !strconcat("{{\n\t.reg .b8\t%t<4>;", + !strconcat("\n\tcvt.u8.u8\t%t0, $s1;", + !strconcat("\n\tcvt.u8.u8\t%t1, $s2;", + !strconcat("\n\tcvt.u8.u8\t%t2, $s3;", + !strconcat("\n\tcvt.u8.u8\t%t3, $s4;", + "\n\tmov.b32\t$d, {%t0, %t1, %t2, %t3};\n\t}}"))))), + []>; +def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d), + (ins Int16Regs:$s1, Int16Regs:$s2, + Int16Regs:$s3, Int16Regs:$s4), + "mov.b64\t$d, {{$s1, $s2, $s3, $s4}};", + []>; +def V2I8toI16 : NVPTXInst<(outs Int16Regs:$d), + (ins Int8Regs:$s1, Int8Regs:$s2), + !strconcat("{{\n\t.reg .b8\t%t<2>;", + !strconcat("\n\tcvt.u8.u8\t%t0, $s1;", + !strconcat("\n\tcvt.u8.u8\t%t1, $s2;", + "\n\tmov.b16\t$d, {%t0, %t1};\n\t}}"))), + []>; +def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d), + (ins Int16Regs:$s1, Int16Regs:$s2), + "mov.b32\t$d, {{$s1, $s2}};", + []>; +def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d), + (ins Int32Regs:$s1, Int32Regs:$s2), + "mov.b64\t$d, {{$s1, $s2}};", + []>; +def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d), + (ins Float32Regs:$s1, Float32Regs:$s2), + "mov.b64\t$d, {{$s1, $s2}};", + []>; + +// unpack a larger int register to a 
set of smaller int registers
+def I32toV4I8 : NVPTXInst<(outs Int8Regs:$d1, Int8Regs:$d2,
+                                 Int8Regs:$d3, Int8Regs:$d4),
+                          (ins Int32Regs:$s),
+                !strconcat("{{\n\t.reg .b8\t%t<4>;",
+                !strconcat("\n\tmov.b32\t{%t0, %t1, %t2, %t3}, $s;",
+                !strconcat("\n\tcvt.u8.u8\t$d1, %t0;",
+                !strconcat("\n\tcvt.u8.u8\t$d2, %t1;",
+                !strconcat("\n\tcvt.u8.u8\t$d3, %t2;",
+                           "\n\tcvt.u8.u8\t$d4, %t3;\n\t}}"))))),
+                []>;
+def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
+                                  Int16Regs:$d3, Int16Regs:$d4),
+                           (ins Int64Regs:$s),
+                           "mov.b64\t{{$d1, $d2, $d3, $d4}}, $s;",
+                           []>;
+def I16toV2I8 : NVPTXInst<(outs Int8Regs:$d1, Int8Regs:$d2),
+                          (ins Int16Regs:$s),
+                !strconcat("{{\n\t.reg .b8\t%t<2>;",
+                !strconcat("\n\tmov.b16\t{%t0, %t1}, $s;",
+                !strconcat("\n\tcvt.u8.u8\t$d1, %t0;",
+                           "\n\tcvt.u8.u8\t$d2, %t1;\n\t}}"))),
+                []>;
+def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
+                           (ins Int32Regs:$s),
+                           "mov.b32\t{{$d1, $d2}}, $s;",
+                           []>;
+def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
+                           (ins Int64Regs:$s),
+                           "mov.b64\t{{$d1, $d2}}, $s;",
+                           []>;
+def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
+                           (ins Float64Regs:$s),
+                           "mov.b64\t{{$d1, $d2}}, $s;",
+                           []>;
+
+def FPRound_ftz : NVPTXInst<(outs Float32Regs:$d), (ins Float64Regs:$a),
+                            "cvt.rn.ftz.f32.f64 \t$d, $a;",
+                            [(set Float32Regs:$d, (fround Float64Regs:$a))]>,
+                  Requires<[doF32FTZ]>;
+
+def FPRound : NVPTXInst<(outs Float32Regs:$d), (ins Float64Regs:$a),
+                        "cvt.rn.f32.f64 \t$d, $a;",
+                        [(set Float32Regs:$d, (fround Float64Regs:$a))]>;
+
+def FPExtend_ftz : NVPTXInst<(outs Float64Regs:$d), (ins Float32Regs:$a),
+                             "cvt.ftz.f64.f32 \t$d, $a;",
+                             [(set Float64Regs:$d, (fextend Float32Regs:$a))]>,
+                   Requires<[doF32FTZ]>;
+
+def FPExtend : NVPTXInst<(outs Float64Regs:$d), (ins Float32Regs:$a),
+                         "cvt.f64.f32 \t$d, $a;",
+                         [(set Float64Regs:$d, (fextend Float32Regs:$a))]>;
+
+def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone,
+                     [SDNPHasChain, SDNPOptInGlue]>;
+
+//-----------------------------------
+// Control-flow
+//-----------------------------------
+
+let isTerminator=1 in {
+  let isReturn=1, isBarrier=1 in
+  def Return : NVPTXInst<(outs), (ins), "ret;", [(retflag)]>;
+
+  let isBranch=1 in
+  def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
+                          "@$a bra \t$target;",
+                          [(brcond Int1Regs:$a, bb:$target)]>;
+  let isBranch=1 in
+  def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
+                               "@!$a bra \t$target;",
+                               []>;
+
+  let isBranch=1, isBarrier=1 in
+  def GOTO : NVPTXInst<(outs), (ins brtarget:$target),
+             "bra.uni \t$target;",
+             [(br bb:$target)]>;
+}
+
+def : Pat<(brcond Int32Regs:$a, bb:$target), (CBranch
+          (ISetUNEi32ri_p Int32Regs:$a, 0), bb:$target)>;
+
+// SelectionDAGBuilder::visitSwitchCase() will invert the condition of a
+// conditional branch if the target block is the next block, so that the code
+// can fall through to the target block.
+// The inversion is done by 'xor condition, 1', which will be translated to
+// (setne condition, -1).
+// Since PTX supports '@!pred bra target', we should use it.
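// Illustration (block label hypothetical): with the pattern below, the
// inverted condition folds into the predicate guard, emitting
//   @!%p1 bra \tLBB0_2;
// directly, instead of first materializing the negated predicate in a fresh
// register.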
+def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
+          (CBranchOther Int1Regs:$a, bb:$target)>;
+
+// Call
+def SDT_NVPTXCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+                                         SDTCisVT<1, i32> ]>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
+                           [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
+                         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                          SDNPSideEffect]>;
+
+def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+def call : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
+                  [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def calltarget : Operand<i32>;
+let isCall=1 in {
+  def CALL : NVPTXInst<(outs), (ins calltarget:$dst),
+                       "call \t$dst, (1);", []>;
+}
+
+def : Pat<(call tglobaladdr:$dst),
+          (CALL tglobaladdr:$dst)>;
+def : Pat<(call texternalsym:$dst),
+          (CALL texternalsym:$dst)>;
+
+// Pseudo instructions.
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : NVPTXInst<outs, ins, asmstr, pattern>;
+
+// @TODO: We use some tricks here to emit curly braces. Can we clean this up
+// a bit without TableGen modifications?
+def Callseq_Start : NVPTXInst<(outs), (ins i32imm:$amt),
+  "// Callseq Start $amt\n\t{{\n\t.reg .b32 temp_param_reg;\n\t// <end>}}",
+  [(callseq_start timm:$amt)]>;
+def Callseq_End : NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+  "\n\t//{{\n\t}}// Callseq End $amt1",
+  [(callseq_end timm:$amt1, timm:$amt2)]>;
+
+// trap instruction
+
+def trapinst : NVPTXInst<(outs), (ins),
+                         "trap;",
+                         [(trap)]>;
+
+include "NVPTXVector.td"
+
+include "NVPTXIntrinsics.td"
+
+
+//-----------------------------------
+// Notes
+//-----------------------------------
+// BSWAP is currently expanded. The following would be a more efficient
+// lowering:
+// - for < sm_20, use vector scalar mov, as Tesla supports native 16-bit
+//   registers
+// - for sm_20, use prmt (use vector scalar mov to get the pack and
+//   unpack). sm_20 supports native 32-bit registers, but not native 16-bit
+//   registers.
diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td
new file mode 100644
index 0000000..028a94b
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -0,0 +1,1675 @@
+//===- NVPTXIntrinsics.td - PTX Intrinsics Instructions -------*- tblgen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// + +def immFloat0 : PatLeaf<(fpimm), [{ + float f = (float)N->getValueAPF().convertToFloat(); + return (f==0.0f); +}]>; + +def immFloat1 : PatLeaf<(fpimm), [{ + float f = (float)N->getValueAPF().convertToFloat(); + return (f==1.0f); +}]>; + +def immDouble0 : PatLeaf<(fpimm), [{ + double d = (double)N->getValueAPF().convertToDouble(); + return (d==0.0); +}]>; + +def immDouble1 : PatLeaf<(fpimm), [{ + double d = (double)N->getValueAPF().convertToDouble(); + return (d==1.0); +}]>; + + + +//----------------------------------- +// Synchronization Functions +//----------------------------------- +def INT_CUDA_SYNCTHREADS : NVPTXInst<(outs), (ins), + "bar.sync \t0;", + [(int_cuda_syncthreads)]>; +def INT_BARRIER0 : NVPTXInst<(outs), (ins), + "bar.sync \t0;", + [(int_nvvm_barrier0)]>; +def INT_BARRIER0_POPC : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred), + !strconcat("{{ \n\t", + !strconcat(".reg .pred \t%p1; \n\t", + !strconcat("setp.ne.u32 \t%p1, $pred, 0; \n\t", + !strconcat("bar.red.popc.u32 \t$dst, 0, %p1; \n\t", + !strconcat("}}", ""))))), + [(set Int32Regs:$dst, (int_nvvm_barrier0_popc Int32Regs:$pred))]>; +def INT_BARRIER0_AND : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred), + !strconcat("{{ \n\t", + !strconcat(".reg .pred \t%p1; \n\t", + !strconcat(".reg .pred \t%p2; \n\t", + !strconcat("setp.ne.u32 \t%p1, $pred, 0; \n\t", + !strconcat("bar.red.and.pred \t%p2, 0, %p1; \n\t", + !strconcat("selp.u32 \t$dst, 1, 0, %p2; \n\t", + !strconcat("}}", ""))))))), + [(set Int32Regs:$dst, (int_nvvm_barrier0_and Int32Regs:$pred))]>; +def INT_BARRIER0_OR : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred), + !strconcat("{{ \n\t", + !strconcat(".reg .pred \t%p1; \n\t", + !strconcat(".reg .pred \t%p2; \n\t", + !strconcat("setp.ne.u32 \t%p1, $pred, 0; \n\t", + !strconcat("bar.red.or.pred \t%p2, 0, %p1; \n\t", + !strconcat("selp.u32 \t$dst, 1, 0, %p2; \n\t", + !strconcat("}}", ""))))))), + [(set Int32Regs:$dst, (int_nvvm_barrier0_or Int32Regs:$pred))]>; + + +//----------------------------------- +// Explicit Memory Fence Functions +//----------------------------------- +class MEMBAR<string StrOp, Intrinsic IntOP> : + NVPTXInst<(outs), (ins), + StrOp, [(IntOP)]>; + +def INT_MEMBAR_CTA : MEMBAR<"membar.cta;", int_nvvm_membar_cta>; +def INT_MEMBAR_GL : MEMBAR<"membar.gl;", int_nvvm_membar_gl>; +def INT_MEMBAR_SYS : MEMBAR<"membar.sys;", int_nvvm_membar_sys>; + + +//----------------------------------- +// Math Functions +//----------------------------------- + +// Map min(1.0, max(0.0, x)) to sat(x) +multiclass SAT<NVPTXRegClass regclass, Operand fimm, Intrinsic IntMinOp, + Intrinsic IntMaxOp, PatLeaf f0, PatLeaf f1, string OpStr> { + + // fmin(1.0, fmax(0.0, x)) => sat(x) + def SAT11 : NVPTXInst<(outs regclass:$dst), + (ins fimm:$srcf0, fimm:$srcf1, regclass:$src), + OpStr, + [(set regclass:$dst, (IntMinOp f1:$srcf0 , + (IntMaxOp f0:$srcf1, regclass:$src)))]>; + + // fmin(1.0, fmax(x, 0.0)) => sat(x) + def SAT12 : NVPTXInst<(outs regclass:$dst), + (ins fimm:$srcf0, fimm:$srcf1, regclass:$src), + OpStr, + [(set regclass:$dst, (IntMinOp f1:$srcf0 , + (IntMaxOp regclass:$src, f0:$srcf1)))]>; + + // fmin(fmax(0.0, x), 1.0) => sat(x) + def SAT13 : NVPTXInst<(outs regclass:$dst), + (ins fimm:$srcf0, fimm:$srcf1, regclass:$src), + OpStr, + [(set regclass:$dst, (IntMinOp + (IntMaxOp f0:$srcf0, regclass:$src), f1:$srcf1))]>; + + // fmin(fmax(x, 0.0), 1.0) => sat(x) + def SAT14 : NVPTXInst<(outs regclass:$dst), + 
(ins fimm:$srcf0, fimm:$srcf1, regclass:$src),
+ OpStr,
+ [(set regclass:$dst, (IntMinOp
+ (IntMaxOp regclass:$src, f0:$srcf0), f1:$srcf1))]>;
+
+}
+// Note that max(0.0, min(x, 1.0)) cannot be mapped to sat(x), because when x
+// is NaN, max(0.0, min(x, 1.0)) is 1.0 while sat(x) is 0.0.
+// The same holds for fmax/fmin.
+
+defm SAT_fmin_fmax_f : SAT<Float32Regs, f32imm, int_nvvm_fmin_f,
+ int_nvvm_fmax_f, immFloat0, immFloat1,
+ "cvt.sat.f32.f32 \t$dst, $src; \n">;
+defm SAT_fmin_fmax_d : SAT<Float64Regs, f64imm, int_nvvm_fmin_d,
+ int_nvvm_fmax_d, immDouble0, immDouble1,
+ "cvt.sat.f64.f64 \t$dst, $src; \n">;
+
+
+// We need a full string for OpcStr here because we need to deal with cases
+// like INT_PTX_RECIP.
+class F_MATH_1<string OpcStr, NVPTXRegClass target_regclass,
+ NVPTXRegClass src_regclass, Intrinsic IntOP>
+ : NVPTXInst<(outs target_regclass:$dst), (ins src_regclass:$src0),
+ OpcStr,
+ [(set target_regclass:$dst, (IntOP src_regclass:$src0))]>;
+
+// We need a full string for OpcStr here because we need to deal with cases
+// like INT_PTX_NATIVE_POWR_F.
+class F_MATH_2<string OpcStr, NVPTXRegClass t_regclass,
+ NVPTXRegClass s0_regclass, NVPTXRegClass s1_regclass, Intrinsic IntOP>
+ : NVPTXInst<(outs t_regclass:$dst),
+ (ins s0_regclass:$src0, s1_regclass:$src1),
+ OpcStr,
+ [(set t_regclass:$dst, (IntOP s0_regclass:$src0, s1_regclass:$src1))]>;
+
+class F_MATH_3<string OpcStr, NVPTXRegClass t_regclass,
+ NVPTXRegClass s0_regclass, NVPTXRegClass s1_regclass,
+ NVPTXRegClass s2_regclass, Intrinsic IntOP>
+ : NVPTXInst<(outs t_regclass:$dst),
+ (ins s0_regclass:$src0, s1_regclass:$src1, s2_regclass:$src2),
+ OpcStr,
+ [(set t_regclass:$dst,
+ (IntOP s0_regclass:$src0, s1_regclass:$src1, s2_regclass:$src2))]>;
+
+//
+// MISC
+//
+
+def INT_NVVM_CLZ_I : F_MATH_1<"clz.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
+ int_nvvm_clz_i>;
+def INT_NVVM_CLZ_LL : F_MATH_1<"clz.b64 \t$dst, $src0;", Int32Regs, Int64Regs,
+ int_nvvm_clz_ll>;
+
+def INT_NVVM_POPC_I : F_MATH_1<"popc.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
+ int_nvvm_popc_i>;
+def INT_NVVM_POPC_LL : F_MATH_1<"popc.b64 \t$dst, $src0;", Int32Regs, Int64Regs,
+ int_nvvm_popc_ll>;
+
+def INT_NVVM_PRMT : F_MATH_3<"prmt.b32 \t$dst, $src0, $src1, $src2;", Int32Regs,
+ Int32Regs, Int32Regs, Int32Regs, int_nvvm_prmt>;
+
+//
+// Min Max
+//
+
+def INT_NVVM_MIN_I : F_MATH_2<"min.s32 \t$dst, $src0, $src1;", Int32Regs,
+ Int32Regs, Int32Regs, int_nvvm_min_i>;
+def INT_NVVM_MIN_UI : F_MATH_2<"min.u32 \t$dst, $src0, $src1;", Int32Regs,
+ Int32Regs, Int32Regs, int_nvvm_min_ui>;
+
+def INT_NVVM_MIN_LL : F_MATH_2<"min.s64 \t$dst, $src0, $src1;", Int64Regs,
+ Int64Regs, Int64Regs, int_nvvm_min_ll>;
+def INT_NVVM_MIN_ULL : F_MATH_2<"min.u64 \t$dst, $src0, $src1;", Int64Regs,
+ Int64Regs, Int64Regs, int_nvvm_min_ull>;
+
+def INT_NVVM_MAX_I : F_MATH_2<"max.s32 \t$dst, $src0, $src1;", Int32Regs,
+ Int32Regs, Int32Regs, int_nvvm_max_i>;
+def INT_NVVM_MAX_UI : F_MATH_2<"max.u32 \t$dst, $src0, $src1;", Int32Regs,
+ Int32Regs, Int32Regs, int_nvvm_max_ui>;
+
+def INT_NVVM_MAX_LL : F_MATH_2<"max.s64 \t$dst, $src0, $src1;", Int64Regs,
+ Int64Regs, Int64Regs, int_nvvm_max_ll>;
+def INT_NVVM_MAX_ULL : F_MATH_2<"max.u64 \t$dst, $src0, $src1;", Int64Regs,
+ Int64Regs, Int64Regs, int_nvvm_max_ull>;
+
+def INT_NVVM_FMIN_F : F_MATH_2<"min.f32 \t$dst, $src0, $src1;", Float32Regs,
+ Float32Regs, Float32Regs, int_nvvm_fmin_f>;
+def INT_NVVM_FMIN_FTZ_F : F_MATH_2<"min.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_ftz_f>;
+
+def
INT_NVVM_FMAX_F : F_MATH_2<"max.f32 \t$dst, $src0, $src1;", Float32Regs, + Float32Regs, Float32Regs, int_nvvm_fmax_f>; +def INT_NVVM_FMAX_FTZ_F : F_MATH_2<"max.ftz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_ftz_f>; + +def INT_NVVM_FMIN_D : F_MATH_2<"min.f64 \t$dst, $src0, $src1;", Float64Regs, + Float64Regs, Float64Regs, int_nvvm_fmin_d>; +def INT_NVVM_FMAX_D : F_MATH_2<"max.f64 \t$dst, $src0, $src1;", Float64Regs, + Float64Regs, Float64Regs, int_nvvm_fmax_d>; + +// +// Multiplication +// + +def INT_NVVM_MULHI_I : F_MATH_2<"mul.hi.s32 \t$dst, $src0, $src1;", Int32Regs, + Int32Regs, Int32Regs, int_nvvm_mulhi_i>; +def INT_NVVM_MULHI_UI : F_MATH_2<"mul.hi.u32 \t$dst, $src0, $src1;", Int32Regs, + Int32Regs, Int32Regs, int_nvvm_mulhi_ui>; + +def INT_NVVM_MULHI_LL : F_MATH_2<"mul.hi.s64 \t$dst, $src0, $src1;", Int64Regs, + Int64Regs, Int64Regs, int_nvvm_mulhi_ll>; +def INT_NVVM_MULHI_ULL : F_MATH_2<"mul.hi.u64 \t$dst, $src0, $src1;", Int64Regs, + Int64Regs, Int64Regs, int_nvvm_mulhi_ull>; + +def INT_NVVM_MUL_RN_FTZ_F : F_MATH_2<"mul.rn.ftz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rn_ftz_f>; +def INT_NVVM_MUL_RN_F : F_MATH_2<"mul.rn.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rn_f>; +def INT_NVVM_MUL_RZ_FTZ_F : F_MATH_2<"mul.rz.ftz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rz_ftz_f>; +def INT_NVVM_MUL_RZ_F : F_MATH_2<"mul.rz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rz_f>; +def INT_NVVM_MUL_RM_FTZ_F : F_MATH_2<"mul.rm.ftz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rm_ftz_f>; +def INT_NVVM_MUL_RM_F : F_MATH_2<"mul.rm.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rm_f>; +def INT_NVVM_MUL_RP_FTZ_F : F_MATH_2<"mul.rp.ftz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rp_ftz_f>; +def INT_NVVM_MUL_RP_F : F_MATH_2<"mul.rp.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rp_f>; + +def INT_NVVM_MUL_RN_D : F_MATH_2<"mul.rn.f64 \t$dst, $src0, $src1;", + Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rn_d>; +def INT_NVVM_MUL_RZ_D : F_MATH_2<"mul.rz.f64 \t$dst, $src0, $src1;", + Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rz_d>; +def INT_NVVM_MUL_RM_D : F_MATH_2<"mul.rm.f64 \t$dst, $src0, $src1;", + Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rm_d>; +def INT_NVVM_MUL_RP_D : F_MATH_2<"mul.rp.f64 \t$dst, $src0, $src1;", + Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rp_d>; + +def INT_NVVM_MUL24_I : F_MATH_2<"mul24.lo.s32 \t$dst, $src0, $src1;", + Int32Regs, Int32Regs, Int32Regs, int_nvvm_mul24_i>; +def INT_NVVM_MUL24_UI : F_MATH_2<"mul24.lo.u32 \t$dst, $src0, $src1;", + Int32Regs, Int32Regs, Int32Regs, int_nvvm_mul24_ui>; + +// +// Div +// + +def INT_NVVM_DIV_APPROX_FTZ_F + : F_MATH_2<"div.approx.ftz.f32 \t$dst, $src0, $src1;", Float32Regs, + Float32Regs, Float32Regs, int_nvvm_div_approx_ftz_f>; +def INT_NVVM_DIV_APPROX_F : F_MATH_2<"div.approx.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_approx_f>; + +def INT_NVVM_DIV_RN_FTZ_F : F_MATH_2<"div.rn.ftz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rn_ftz_f>; +def INT_NVVM_DIV_RN_F : F_MATH_2<"div.rn.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rn_f>; +def INT_NVVM_DIV_RZ_FTZ_F : 
F_MATH_2<"div.rz.ftz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rz_ftz_f>; +def INT_NVVM_DIV_RZ_F : F_MATH_2<"div.rz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rz_f>; +def INT_NVVM_DIV_RM_FTZ_F : F_MATH_2<"div.rm.ftz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rm_ftz_f>; +def INT_NVVM_DIV_RM_F : F_MATH_2<"div.rm.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rm_f>; +def INT_NVVM_DIV_RP_FTZ_F : F_MATH_2<"div.rp.ftz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rp_ftz_f>; +def INT_NVVM_DIV_RP_F : F_MATH_2<"div.rp.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rp_f>; + +def INT_NVVM_DIV_RN_D : F_MATH_2<"div.rn.f64 \t$dst, $src0, $src1;", + Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rn_d>; +def INT_NVVM_DIV_RZ_D : F_MATH_2<"div.rz.f64 \t$dst, $src0, $src1;", + Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rz_d>; +def INT_NVVM_DIV_RM_D : F_MATH_2<"div.rm.f64 \t$dst, $src0, $src1;", + Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rm_d>; +def INT_NVVM_DIV_RP_D : F_MATH_2<"div.rp.f64 \t$dst, $src0, $src1;", + Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rp_d>; + +// +// Brev +// + +def INT_NVVM_BREV32 : F_MATH_1<"brev.b32 \t$dst, $src0;", Int32Regs, Int32Regs, + int_nvvm_brev32>; +def INT_NVVM_BREV64 : F_MATH_1<"brev.b64 \t$dst, $src0;", Int64Regs, Int64Regs, + int_nvvm_brev64>; + +// +// Sad +// + +def INT_NVVM_SAD_I : F_MATH_3<"sad.s32 \t$dst, $src0, $src1, $src2;", + Int32Regs, Int32Regs, Int32Regs, Int32Regs, int_nvvm_sad_i>; +def INT_NVVM_SAD_UI : F_MATH_3<"sad.u32 \t$dst, $src0, $src1, $src2;", + Int32Regs, Int32Regs, Int32Regs, Int32Regs, int_nvvm_sad_ui>; + +// +// Floor Ceil +// + +def INT_NVVM_FLOOR_FTZ_F : F_MATH_1<"cvt.rmi.ftz.f32.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_floor_ftz_f>; +def INT_NVVM_FLOOR_F : F_MATH_1<"cvt.rmi.f32.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_floor_f>; +def INT_NVVM_FLOOR_D : F_MATH_1<"cvt.rmi.f64.f64 \t$dst, $src0;", + Float64Regs, Float64Regs, int_nvvm_floor_d>; + +def INT_NVVM_CEIL_FTZ_F : F_MATH_1<"cvt.rpi.ftz.f32.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_ceil_ftz_f>; +def INT_NVVM_CEIL_F : F_MATH_1<"cvt.rpi.f32.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_ceil_f>; +def INT_NVVM_CEIL_D : F_MATH_1<"cvt.rpi.f64.f64 \t$dst, $src0;", + Float64Regs, Float64Regs, int_nvvm_ceil_d>; + +// +// Abs +// + +def INT_NVVM_ABS_I : F_MATH_1<"abs.s32 \t$dst, $src0;", Int32Regs, Int32Regs, + int_nvvm_abs_i>; +def INT_NVVM_ABS_LL : F_MATH_1<"abs.s64 \t$dst, $src0;", Int64Regs, Int64Regs, + int_nvvm_abs_ll>; + +def INT_NVVM_FABS_FTZ_F : F_MATH_1<"abs.ftz.f32 \t$dst, $src0;", Float32Regs, + Float32Regs, int_nvvm_fabs_ftz_f>; +def INT_NVVM_FABS_F : F_MATH_1<"abs.f32 \t$dst, $src0;", Float32Regs, + Float32Regs, int_nvvm_fabs_f>; + +def INT_NVVM_FABS_D : F_MATH_1<"abs.f64 \t$dst, $src0;", Float64Regs, + Float64Regs, int_nvvm_fabs_d>; + +// +// Round +// + +def INT_NVVM_ROUND_FTZ_F : F_MATH_1<"cvt.rni.ftz.f32.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_round_ftz_f>; +def INT_NVVM_ROUND_F : F_MATH_1<"cvt.rni.f32.f32 \t$dst, $src0;", Float32Regs, + Float32Regs, int_nvvm_round_f>; + +def INT_NVVM_ROUND_D : F_MATH_1<"cvt.rni.f64.f64 \t$dst, $src0;", Float64Regs, + Float64Regs, int_nvvm_round_d>; + +// +// Trunc +// + +def INT_NVVM_TRUNC_FTZ_F : 
F_MATH_1<"cvt.rzi.ftz.f32.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_trunc_ftz_f>; +def INT_NVVM_TRUNC_F : F_MATH_1<"cvt.rzi.f32.f32 \t$dst, $src0;", Float32Regs, + Float32Regs, int_nvvm_trunc_f>; + +def INT_NVVM_TRUNC_D : F_MATH_1<"cvt.rzi.f64.f64 \t$dst, $src0;", Float64Regs, + Float64Regs, int_nvvm_trunc_d>; + +// +// Saturate +// + +def INT_NVVM_SATURATE_FTZ_F : F_MATH_1<"cvt.sat.ftz.f32.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_saturate_ftz_f>; +def INT_NVVM_SATURATE_F : F_MATH_1<"cvt.sat.f32.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_saturate_f>; + +def INT_NVVM_SATURATE_D : F_MATH_1<"cvt.sat.f64.f64 \t$dst, $src0;", + Float64Regs, Float64Regs, int_nvvm_saturate_d>; + +// +// Exp2 Log2 +// + +def INT_NVVM_EX2_APPROX_FTZ_F : F_MATH_1<"ex2.approx.ftz.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_ex2_approx_ftz_f>; +def INT_NVVM_EX2_APPROX_F : F_MATH_1<"ex2.approx.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_ex2_approx_f>; +def INT_NVVM_EX2_APPROX_D : F_MATH_1<"ex2.approx.f64 \t$dst, $src0;", + Float64Regs, Float64Regs, int_nvvm_ex2_approx_d>; + +def INT_NVVM_LG2_APPROX_FTZ_F : F_MATH_1<"lg2.approx.ftz.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_lg2_approx_ftz_f>; +def INT_NVVM_LG2_APPROX_F : F_MATH_1<"lg2.approx.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_lg2_approx_f>; +def INT_NVVM_LG2_APPROX_D : F_MATH_1<"lg2.approx.f64 \t$dst, $src0;", + Float64Regs, Float64Regs, int_nvvm_lg2_approx_d>; + +// +// Sin Cos +// + +def INT_NVVM_SIN_APPROX_FTZ_F : F_MATH_1<"sin.approx.ftz.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_sin_approx_ftz_f>; +def INT_NVVM_SIN_APPROX_F : F_MATH_1<"sin.approx.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_sin_approx_f>; + +def INT_NVVM_COS_APPROX_FTZ_F : F_MATH_1<"cos.approx.ftz.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_cos_approx_ftz_f>; +def INT_NVVM_COS_APPROX_F : F_MATH_1<"cos.approx.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_cos_approx_f>; + +// +// Fma +// + +def INT_NVVM_FMA_RN_FTZ_F + : F_MATH_3<"fma.rn.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs, + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rn_ftz_f>; +def INT_NVVM_FMA_RN_F : F_MATH_3<"fma.rn.f32 \t$dst, $src0, $src1, $src2;", + Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rn_f>; +def INT_NVVM_FMA_RZ_FTZ_F + : F_MATH_3<"fma.rz.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs, + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rz_ftz_f>; +def INT_NVVM_FMA_RZ_F : F_MATH_3<"fma.rz.f32 \t$dst, $src0, $src1, $src2;", + Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rz_f>; +def INT_NVVM_FMA_RM_FTZ_F + : F_MATH_3<"fma.rm.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs, + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rm_ftz_f>; +def INT_NVVM_FMA_RM_F : F_MATH_3<"fma.rm.f32 \t$dst, $src0, $src1, $src2;", + Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rm_f>; +def INT_NVVM_FMA_RP_FTZ_F + : F_MATH_3<"fma.rp.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs, + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rp_ftz_f>; +def INT_NVVM_FMA_RP_F : F_MATH_3<"fma.rp.f32 \t$dst, $src0, $src1, $src2;", + Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rp_f>; + +def INT_NVVM_FMA_RN_D : F_MATH_3<"fma.rn.f64 \t$dst, $src0, $src1, $src2;", + Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rn_d>; +def INT_NVVM_FMA_RZ_D : F_MATH_3<"fma.rz.f64 \t$dst, 
$src0, $src1, $src2;", + Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rz_d>; +def INT_NVVM_FMA_RM_D : F_MATH_3<"fma.rm.f64 \t$dst, $src0, $src1, $src2;", + Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rm_d>; +def INT_NVVM_FMA_RP_D : F_MATH_3<"fma.rp.f64 \t$dst, $src0, $src1, $src2;", + Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rp_d>; + +// +// Rcp +// + +def INT_NVVM_RCP_RN_FTZ_F : F_MATH_1<"rcp.rn.ftz.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_rcp_rn_ftz_f>; +def INT_NVVM_RCP_RN_F : F_MATH_1<"rcp.rn.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_rcp_rn_f>; +def INT_NVVM_RCP_RZ_FTZ_F : F_MATH_1<"rcp.rz.ftz.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_rcp_rz_ftz_f>; +def INT_NVVM_RCP_RZ_F : F_MATH_1<"rcp.rz.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_rcp_rz_f>; +def INT_NVVM_RCP_RM_FTZ_F : F_MATH_1<"rcp.rm.ftz.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_rcp_rm_ftz_f>; +def INT_NVVM_RCP_RM_F : F_MATH_1<"rcp.rm.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_rcp_rm_f>; +def INT_NVVM_RCP_RP_FTZ_F : F_MATH_1<"rcp.rp.ftz.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_rcp_rp_ftz_f>; +def INT_NVVM_RCP_RP_F : F_MATH_1<"rcp.rp.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_rcp_rp_f>; + +def INT_NVVM_RCP_RN_D : F_MATH_1<"rcp.rn.f64 \t$dst, $src0;", Float64Regs, + Float64Regs, int_nvvm_rcp_rn_d>; +def INT_NVVM_RCP_RZ_D : F_MATH_1<"rcp.rz.f64 \t$dst, $src0;", Float64Regs, + Float64Regs, int_nvvm_rcp_rz_d>; +def INT_NVVM_RCP_RM_D : F_MATH_1<"rcp.rm.f64 \t$dst, $src0;", Float64Regs, + Float64Regs, int_nvvm_rcp_rm_d>; +def INT_NVVM_RCP_RP_D : F_MATH_1<"rcp.rp.f64 \t$dst, $src0;", Float64Regs, + Float64Regs, int_nvvm_rcp_rp_d>; + +def INT_NVVM_RCP_APPROX_FTZ_D : F_MATH_1<"rcp.approx.ftz.f64 \t$dst, $src0;", + Float64Regs, Float64Regs, int_nvvm_rcp_approx_ftz_d>; + +// +// Sqrt +// + +def INT_NVVM_SQRT_RN_FTZ_F : F_MATH_1<"sqrt.rn.ftz.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_sqrt_rn_ftz_f>; +def INT_NVVM_SQRT_RN_F : F_MATH_1<"sqrt.rn.f32 \t$dst, $src0;", Float32Regs, + Float32Regs, int_nvvm_sqrt_rn_f>; +def INT_NVVM_SQRT_RZ_FTZ_F : F_MATH_1<"sqrt.rz.ftz.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_sqrt_rz_ftz_f>; +def INT_NVVM_SQRT_RZ_F : F_MATH_1<"sqrt.rz.f32 \t$dst, $src0;", Float32Regs, + Float32Regs, int_nvvm_sqrt_rz_f>; +def INT_NVVM_SQRT_RM_FTZ_F : F_MATH_1<"sqrt.rm.ftz.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_sqrt_rm_ftz_f>; +def INT_NVVM_SQRT_RM_F : F_MATH_1<"sqrt.rm.f32 \t$dst, $src0;", Float32Regs, + Float32Regs, int_nvvm_sqrt_rm_f>; +def INT_NVVM_SQRT_RP_FTZ_F : F_MATH_1<"sqrt.rp.ftz.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_sqrt_rp_ftz_f>; +def INT_NVVM_SQRT_RP_F : F_MATH_1<"sqrt.rp.f32 \t$dst, $src0;", Float32Regs, + Float32Regs, int_nvvm_sqrt_rp_f>; +def INT_NVVM_SQRT_APPROX_FTZ_F : F_MATH_1<"sqrt.approx.ftz.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_sqrt_approx_ftz_f>; +def INT_NVVM_SQRT_APPROX_F : F_MATH_1<"sqrt.approx.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_sqrt_approx_f>; + +def INT_NVVM_SQRT_RN_D : F_MATH_1<"sqrt.rn.f64 \t$dst, $src0;", Float64Regs, + Float64Regs, int_nvvm_sqrt_rn_d>; +def INT_NVVM_SQRT_RZ_D : F_MATH_1<"sqrt.rz.f64 \t$dst, $src0;", Float64Regs, + Float64Regs, int_nvvm_sqrt_rz_d>; +def INT_NVVM_SQRT_RM_D : F_MATH_1<"sqrt.rm.f64 \t$dst, $src0;", Float64Regs, + Float64Regs, int_nvvm_sqrt_rm_d>; +def INT_NVVM_SQRT_RP_D : 
F_MATH_1<"sqrt.rp.f64 \t$dst, $src0;", Float64Regs, + Float64Regs, int_nvvm_sqrt_rp_d>; + +// +// Rsqrt +// + +def INT_NVVM_RSQRT_APPROX_FTZ_F + : F_MATH_1<"rsqrt.approx.ftz.f32 \t$dst, $src0;", Float32Regs, Float32Regs, + int_nvvm_rsqrt_approx_ftz_f>; +def INT_NVVM_RSQRT_APPROX_F : F_MATH_1<"rsqrt.approx.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_rsqrt_approx_f>; +def INT_NVVM_RSQRT_APPROX_D : F_MATH_1<"rsqrt.approx.f64 \t$dst, $src0;", + Float64Regs, Float64Regs, int_nvvm_rsqrt_approx_d>; + +// +// Add +// + +def INT_NVVM_ADD_RN_FTZ_F : F_MATH_2<"add.rn.ftz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rn_ftz_f>; +def INT_NVVM_ADD_RN_F : F_MATH_2<"add.rn.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rn_f>; +def INT_NVVM_ADD_RZ_FTZ_F : F_MATH_2<"add.rz.ftz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rz_ftz_f>; +def INT_NVVM_ADD_RZ_F : F_MATH_2<"add.rz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rz_f>; +def INT_NVVM_ADD_RM_FTZ_F : F_MATH_2<"add.rm.ftz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rm_ftz_f>; +def INT_NVVM_ADD_RM_F : F_MATH_2<"add.rm.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rm_f>; +def INT_NVVM_ADD_RP_FTZ_F : F_MATH_2<"add.rp.ftz.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rp_ftz_f>; +def INT_NVVM_ADD_RP_F : F_MATH_2<"add.rp.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rp_f>; + +def INT_NVVM_ADD_RN_D : F_MATH_2<"add.rn.f64 \t$dst, $src0, $src1;", + Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rn_d>; +def INT_NVVM_ADD_RZ_D : F_MATH_2<"add.rz.f64 \t$dst, $src0, $src1;", + Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rz_d>; +def INT_NVVM_ADD_RM_D : F_MATH_2<"add.rm.f64 \t$dst, $src0, $src1;", + Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rm_d>; +def INT_NVVM_ADD_RP_D : F_MATH_2<"add.rp.f64 \t$dst, $src0, $src1;", + Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rp_d>; + +// +// Convert +// + +def INT_NVVM_D2F_RN_FTZ : F_MATH_1<"cvt.rn.ftz.f32.f64 \t$dst, $src0;", + Float32Regs, Float64Regs, int_nvvm_d2f_rn_ftz>; +def INT_NVVM_D2F_RN : F_MATH_1<"cvt.rn.f32.f64 \t$dst, $src0;", + Float32Regs, Float64Regs, int_nvvm_d2f_rn>; +def INT_NVVM_D2F_RZ_FTZ : F_MATH_1<"cvt.rz.ftz.f32.f64 \t$dst, $src0;", + Float32Regs, Float64Regs, int_nvvm_d2f_rz_ftz>; +def INT_NVVM_D2F_RZ : F_MATH_1<"cvt.rz.f32.f64 \t$dst, $src0;", + Float32Regs, Float64Regs, int_nvvm_d2f_rz>; +def INT_NVVM_D2F_RM_FTZ : F_MATH_1<"cvt.rm.ftz.f32.f64 \t$dst, $src0;", + Float32Regs, Float64Regs, int_nvvm_d2f_rm_ftz>; +def INT_NVVM_D2F_RM : F_MATH_1<"cvt.rm.f32.f64 \t$dst, $src0;", + Float32Regs, Float64Regs, int_nvvm_d2f_rm>; +def INT_NVVM_D2F_RP_FTZ : F_MATH_1<"cvt.rp.ftz.f32.f64 \t$dst, $src0;", + Float32Regs, Float64Regs, int_nvvm_d2f_rp_ftz>; +def INT_NVVM_D2F_RP : F_MATH_1<"cvt.rp.f32.f64 \t$dst, $src0;", + Float32Regs, Float64Regs, int_nvvm_d2f_rp>; + +def INT_NVVM_D2I_RN : F_MATH_1<"cvt.rni.s32.f64 \t$dst, $src0;", + Int32Regs, Float64Regs, int_nvvm_d2i_rn>; +def INT_NVVM_D2I_RZ : F_MATH_1<"cvt.rzi.s32.f64 \t$dst, $src0;", + Int32Regs, Float64Regs, int_nvvm_d2i_rz>; +def INT_NVVM_D2I_RM : F_MATH_1<"cvt.rmi.s32.f64 \t$dst, $src0;", + Int32Regs, Float64Regs, int_nvvm_d2i_rm>; +def INT_NVVM_D2I_RP : F_MATH_1<"cvt.rpi.s32.f64 \t$dst, $src0;", + Int32Regs, Float64Regs, int_nvvm_d2i_rp>; 
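+// As an illustration of the cvt rounding suffixes used above and below (a
+// sketch of the standard IEEE rounding modes; the sample values are ours,
+// not from this patch):
+//   cvt.rni.s32.f64:  2.5 ->  2   -2.5 -> -2   (round to nearest even)
+//   cvt.rzi.s32.f64:  2.5 ->  2   -2.5 -> -2   (round toward zero)
+//   cvt.rmi.s32.f64:  2.5 ->  2   -2.5 -> -3   (round toward -infinity)
+//   cvt.rpi.s32.f64:  2.5 ->  3   -2.5 -> -2   (round toward +infinity)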
+ +def INT_NVVM_D2UI_RN : F_MATH_1<"cvt.rni.u32.f64 \t$dst, $src0;", + Int32Regs, Float64Regs, int_nvvm_d2ui_rn>; +def INT_NVVM_D2UI_RZ : F_MATH_1<"cvt.rzi.u32.f64 \t$dst, $src0;", + Int32Regs, Float64Regs, int_nvvm_d2ui_rz>; +def INT_NVVM_D2UI_RM : F_MATH_1<"cvt.rmi.u32.f64 \t$dst, $src0;", + Int32Regs, Float64Regs, int_nvvm_d2ui_rm>; +def INT_NVVM_D2UI_RP : F_MATH_1<"cvt.rpi.u32.f64 \t$dst, $src0;", + Int32Regs, Float64Regs, int_nvvm_d2ui_rp>; + +def INT_NVVM_I2D_RN : F_MATH_1<"cvt.rn.f64.s32 \t$dst, $src0;", + Float64Regs, Int32Regs, int_nvvm_i2d_rn>; +def INT_NVVM_I2D_RZ : F_MATH_1<"cvt.rz.f64.s32 \t$dst, $src0;", + Float64Regs, Int32Regs, int_nvvm_i2d_rz>; +def INT_NVVM_I2D_RM : F_MATH_1<"cvt.rm.f64.s32 \t$dst, $src0;", + Float64Regs, Int32Regs, int_nvvm_i2d_rm>; +def INT_NVVM_I2D_RP : F_MATH_1<"cvt.rp.f64.s32 \t$dst, $src0;", + Float64Regs, Int32Regs, int_nvvm_i2d_rp>; + +def INT_NVVM_UI2D_RN : F_MATH_1<"cvt.rn.f64.u32 \t$dst, $src0;", + Float64Regs, Int32Regs, int_nvvm_ui2d_rn>; +def INT_NVVM_UI2D_RZ : F_MATH_1<"cvt.rz.f64.u32 \t$dst, $src0;", + Float64Regs, Int32Regs, int_nvvm_ui2d_rz>; +def INT_NVVM_UI2D_RM : F_MATH_1<"cvt.rm.f64.u32 \t$dst, $src0;", + Float64Regs, Int32Regs, int_nvvm_ui2d_rm>; +def INT_NVVM_UI2D_RP : F_MATH_1<"cvt.rp.f64.u32 \t$dst, $src0;", + Float64Regs, Int32Regs, int_nvvm_ui2d_rp>; + +def INT_NVVM_F2I_RN_FTZ : F_MATH_1<"cvt.rni.ftz.s32.f32 \t$dst, $src0;", + Int32Regs, Float32Regs, int_nvvm_f2i_rn_ftz>; +def INT_NVVM_F2I_RN : F_MATH_1<"cvt.rni.s32.f32 \t$dst, $src0;", Int32Regs, + Float32Regs, int_nvvm_f2i_rn>; +def INT_NVVM_F2I_RZ_FTZ : F_MATH_1<"cvt.rzi.ftz.s32.f32 \t$dst, $src0;", + Int32Regs, Float32Regs, int_nvvm_f2i_rz_ftz>; +def INT_NVVM_F2I_RZ : F_MATH_1<"cvt.rzi.s32.f32 \t$dst, $src0;", Int32Regs, + Float32Regs, int_nvvm_f2i_rz>; +def INT_NVVM_F2I_RM_FTZ : F_MATH_1<"cvt.rmi.ftz.s32.f32 \t$dst, $src0;", + Int32Regs, Float32Regs, int_nvvm_f2i_rm_ftz>; +def INT_NVVM_F2I_RM : F_MATH_1<"cvt.rmi.s32.f32 \t$dst, $src0;", Int32Regs, + Float32Regs, int_nvvm_f2i_rm>; +def INT_NVVM_F2I_RP_FTZ : F_MATH_1<"cvt.rpi.ftz.s32.f32 \t$dst, $src0;", + Int32Regs, Float32Regs, int_nvvm_f2i_rp_ftz>; +def INT_NVVM_F2I_RP : F_MATH_1<"cvt.rpi.s32.f32 \t$dst, $src0;", Int32Regs, + Float32Regs, int_nvvm_f2i_rp>; + +def INT_NVVM_F2UI_RN_FTZ : F_MATH_1<"cvt.rni.ftz.u32.f32 \t$dst, $src0;", + Int32Regs, Float32Regs, int_nvvm_f2ui_rn_ftz>; +def INT_NVVM_F2UI_RN : F_MATH_1<"cvt.rni.u32.f32 \t$dst, $src0;", Int32Regs, + Float32Regs, int_nvvm_f2ui_rn>; +def INT_NVVM_F2UI_RZ_FTZ : F_MATH_1<"cvt.rzi.ftz.u32.f32 \t$dst, $src0;", + Int32Regs, Float32Regs, int_nvvm_f2ui_rz_ftz>; +def INT_NVVM_F2UI_RZ : F_MATH_1<"cvt.rzi.u32.f32 \t$dst, $src0;", Int32Regs, + Float32Regs, int_nvvm_f2ui_rz>; +def INT_NVVM_F2UI_RM_FTZ : F_MATH_1<"cvt.rmi.ftz.u32.f32 \t$dst, $src0;", + Int32Regs, Float32Regs, int_nvvm_f2ui_rm_ftz>; +def INT_NVVM_F2UI_RM : F_MATH_1<"cvt.rmi.u32.f32 \t$dst, $src0;", Int32Regs, + Float32Regs, int_nvvm_f2ui_rm>; +def INT_NVVM_F2UI_RP_FTZ : F_MATH_1<"cvt.rpi.ftz.u32.f32 \t$dst, $src0;", + Int32Regs, Float32Regs, int_nvvm_f2ui_rp_ftz>; +def INT_NVVM_F2UI_RP : F_MATH_1<"cvt.rpi.u32.f32 \t$dst, $src0;", Int32Regs, + Float32Regs, int_nvvm_f2ui_rp>; + +def INT_NVVM_I2F_RN : F_MATH_1<"cvt.rn.f32.s32 \t$dst, $src0;", Float32Regs, + Int32Regs, int_nvvm_i2f_rn>; +def INT_NVVM_I2F_RZ : F_MATH_1<"cvt.rz.f32.s32 \t$dst, $src0;", Float32Regs, + Int32Regs, int_nvvm_i2f_rz>; +def INT_NVVM_I2F_RM : F_MATH_1<"cvt.rm.f32.s32 \t$dst, $src0;", Float32Regs, + Int32Regs, int_nvvm_i2f_rm>; +def 
INT_NVVM_I2F_RP : F_MATH_1<"cvt.rp.f32.s32 \t$dst, $src0;", Float32Regs, + Int32Regs, int_nvvm_i2f_rp>; + +def INT_NVVM_UI2F_RN : F_MATH_1<"cvt.rn.f32.u32 \t$dst, $src0;", Float32Regs, + Int32Regs, int_nvvm_ui2f_rn>; +def INT_NVVM_UI2F_RZ : F_MATH_1<"cvt.rz.f32.u32 \t$dst, $src0;", Float32Regs, + Int32Regs, int_nvvm_ui2f_rz>; +def INT_NVVM_UI2F_RM : F_MATH_1<"cvt.rm.f32.u32 \t$dst, $src0;", Float32Regs, + Int32Regs, int_nvvm_ui2f_rm>; +def INT_NVVM_UI2F_RP : F_MATH_1<"cvt.rp.f32.u32 \t$dst, $src0;", Float32Regs, + Int32Regs, int_nvvm_ui2f_rp>; + +def INT_NVVM_LOHI_I2D : F_MATH_2<"mov.b64 \t$dst, {{$src0, $src1}};", + Float64Regs, Int32Regs, Int32Regs, int_nvvm_lohi_i2d>; + +def INT_NVVM_D2I_LO : F_MATH_1<!strconcat("{{\n\t", + !strconcat(".reg .b32 %temp; \n\t", + !strconcat("mov.b64 \t{$dst, %temp}, $src0;\n\t", + "}}"))), + Int32Regs, Float64Regs, int_nvvm_d2i_lo>; +def INT_NVVM_D2I_HI : F_MATH_1<!strconcat("{{\n\t", + !strconcat(".reg .b32 %temp; \n\t", + !strconcat("mov.b64 \t{%temp, $dst}, $src0;\n\t", + "}}"))), + Int32Regs, Float64Regs, int_nvvm_d2i_hi>; + +def INT_NVVM_F2LL_RN_FTZ : F_MATH_1<"cvt.rni.ftz.s64.f32 \t$dst, $src0;", + Int64Regs, Float32Regs, int_nvvm_f2ll_rn_ftz>; +def INT_NVVM_F2LL_RN : F_MATH_1<"cvt.rni.s64.f32 \t$dst, $src0;", Int64Regs, + Float32Regs, int_nvvm_f2ll_rn>; +def INT_NVVM_F2LL_RZ_FTZ : F_MATH_1<"cvt.rzi.ftz.s64.f32 \t$dst, $src0;", + Int64Regs, Float32Regs, int_nvvm_f2ll_rz_ftz>; +def INT_NVVM_F2LL_RZ : F_MATH_1<"cvt.rzi.s64.f32 \t$dst, $src0;", Int64Regs, + Float32Regs, int_nvvm_f2ll_rz>; +def INT_NVVM_F2LL_RM_FTZ : F_MATH_1<"cvt.rmi.ftz.s64.f32 \t$dst, $src0;", + Int64Regs, Float32Regs, int_nvvm_f2ll_rm_ftz>; +def INT_NVVM_F2LL_RM : F_MATH_1<"cvt.rmi.s64.f32 \t$dst, $src0;", Int64Regs, + Float32Regs, int_nvvm_f2ll_rm>; +def INT_NVVM_F2LL_RP_FTZ : F_MATH_1<"cvt.rpi.ftz.s64.f32 \t$dst, $src0;", + Int64Regs, Float32Regs, int_nvvm_f2ll_rp_ftz>; +def INT_NVVM_F2LL_RP : F_MATH_1<"cvt.rpi.s64.f32 \t$dst, $src0;", Int64Regs, + Float32Regs, int_nvvm_f2ll_rp>; + +def INT_NVVM_F2ULL_RN_FTZ : F_MATH_1<"cvt.rni.ftz.u64.f32 \t$dst, $src0;", + Int64Regs, Float32Regs, int_nvvm_f2ull_rn_ftz>; +def INT_NVVM_F2ULL_RN : F_MATH_1<"cvt.rni.u64.f32 \t$dst, $src0;", Int64Regs, + Float32Regs, int_nvvm_f2ull_rn>; +def INT_NVVM_F2ULL_RZ_FTZ : F_MATH_1<"cvt.rzi.ftz.u64.f32 \t$dst, $src0;", + Int64Regs, Float32Regs, int_nvvm_f2ull_rz_ftz>; +def INT_NVVM_F2ULL_RZ : F_MATH_1<"cvt.rzi.u64.f32 \t$dst, $src0;", Int64Regs, + Float32Regs, int_nvvm_f2ull_rz>; +def INT_NVVM_F2ULL_RM_FTZ : F_MATH_1<"cvt.rmi.ftz.u64.f32 \t$dst, $src0;", + Int64Regs, Float32Regs, int_nvvm_f2ull_rm_ftz>; +def INT_NVVM_F2ULL_RM : F_MATH_1<"cvt.rmi.u64.f32 \t$dst, $src0;", Int64Regs, + Float32Regs, int_nvvm_f2ull_rm>; +def INT_NVVM_F2ULL_RP_FTZ : F_MATH_1<"cvt.rpi.ftz.u64.f32 \t$dst, $src0;", + Int64Regs, Float32Regs, int_nvvm_f2ull_rp_ftz>; +def INT_NVVM_F2ULL_RP : F_MATH_1<"cvt.rpi.u64.f32 \t$dst, $src0;", Int64Regs, + Float32Regs, int_nvvm_f2ull_rp>; + +def INT_NVVM_D2LL_RN : F_MATH_1<"cvt.rni.s64.f64 \t$dst, $src0;", Int64Regs, + Float64Regs, int_nvvm_d2ll_rn>; +def INT_NVVM_D2LL_RZ : F_MATH_1<"cvt.rzi.s64.f64 \t$dst, $src0;", Int64Regs, + Float64Regs, int_nvvm_d2ll_rz>; +def INT_NVVM_D2LL_RM : F_MATH_1<"cvt.rmi.s64.f64 \t$dst, $src0;", Int64Regs, + Float64Regs, int_nvvm_d2ll_rm>; +def INT_NVVM_D2LL_RP : F_MATH_1<"cvt.rpi.s64.f64 \t$dst, $src0;", Int64Regs, + Float64Regs, int_nvvm_d2ll_rp>; + +def INT_NVVM_D2ULL_RN : F_MATH_1<"cvt.rni.u64.f64 \t$dst, $src0;", Int64Regs, + Float64Regs, int_nvvm_d2ull_rn>; 
+def INT_NVVM_D2ULL_RZ : F_MATH_1<"cvt.rzi.u64.f64 \t$dst, $src0;", Int64Regs, + Float64Regs, int_nvvm_d2ull_rz>; +def INT_NVVM_D2ULL_RM : F_MATH_1<"cvt.rmi.u64.f64 \t$dst, $src0;", Int64Regs, + Float64Regs, int_nvvm_d2ull_rm>; +def INT_NVVM_D2ULL_RP : F_MATH_1<"cvt.rpi.u64.f64 \t$dst, $src0;", Int64Regs, + Float64Regs, int_nvvm_d2ull_rp>; + +def INT_NVVM_LL2F_RN : F_MATH_1<"cvt.rn.f32.s64 \t$dst, $src0;", Float32Regs, + Int64Regs, int_nvvm_ll2f_rn>; +def INT_NVVM_LL2F_RZ : F_MATH_1<"cvt.rz.f32.s64 \t$dst, $src0;", Float32Regs, + Int64Regs, int_nvvm_ll2f_rz>; +def INT_NVVM_LL2F_RM : F_MATH_1<"cvt.rm.f32.s64 \t$dst, $src0;", Float32Regs, + Int64Regs, int_nvvm_ll2f_rm>; +def INT_NVVM_LL2F_RP : F_MATH_1<"cvt.rp.f32.s64 \t$dst, $src0;", Float32Regs, + Int64Regs, int_nvvm_ll2f_rp>; +def INT_NVVM_ULL2F_RN : F_MATH_1<"cvt.rn.f32.u64 \t$dst, $src0;", Float32Regs, + Int64Regs, int_nvvm_ull2f_rn>; +def INT_NVVM_ULL2F_RZ : F_MATH_1<"cvt.rz.f32.u64 \t$dst, $src0;", Float32Regs, + Int64Regs, int_nvvm_ull2f_rz>; +def INT_NVVM_ULL2F_RM : F_MATH_1<"cvt.rm.f32.u64 \t$dst, $src0;", Float32Regs, + Int64Regs, int_nvvm_ull2f_rm>; +def INT_NVVM_ULL2F_RP : F_MATH_1<"cvt.rp.f32.u64 \t$dst, $src0;", Float32Regs, + Int64Regs, int_nvvm_ull2f_rp>; + +def INT_NVVM_LL2D_RN : F_MATH_1<"cvt.rn.f64.s64 \t$dst, $src0;", Float64Regs, + Int64Regs, int_nvvm_ll2d_rn>; +def INT_NVVM_LL2D_RZ : F_MATH_1<"cvt.rz.f64.s64 \t$dst, $src0;", Float64Regs, + Int64Regs, int_nvvm_ll2d_rz>; +def INT_NVVM_LL2D_RM : F_MATH_1<"cvt.rm.f64.s64 \t$dst, $src0;", Float64Regs, + Int64Regs, int_nvvm_ll2d_rm>; +def INT_NVVM_LL2D_RP : F_MATH_1<"cvt.rp.f64.s64 \t$dst, $src0;", Float64Regs, + Int64Regs, int_nvvm_ll2d_rp>; +def INT_NVVM_ULL2D_RN : F_MATH_1<"cvt.rn.f64.u64 \t$dst, $src0;", Float64Regs, + Int64Regs, int_nvvm_ull2d_rn>; +def INT_NVVM_ULL2D_RZ : F_MATH_1<"cvt.rz.f64.u64 \t$dst, $src0;", Float64Regs, + Int64Regs, int_nvvm_ull2d_rz>; +def INT_NVVM_ULL2D_RM : F_MATH_1<"cvt.rm.f64.u64 \t$dst, $src0;", Float64Regs, + Int64Regs, int_nvvm_ull2d_rm>; +def INT_NVVM_ULL2D_RP : F_MATH_1<"cvt.rp.f64.u64 \t$dst, $src0;", Float64Regs, + Int64Regs, int_nvvm_ull2d_rp>; + +def INT_NVVM_F2H_RN_FTZ : F_MATH_1<!strconcat("{{\n\t", + !strconcat(".reg .b16 %temp;\n\t", + !strconcat("cvt.rn.ftz.f16.f32 \t%temp, $src0;\n\t", + !strconcat("mov.b16 \t$dst, %temp;\n", + "}}")))), + Int16Regs, Float32Regs, int_nvvm_f2h_rn_ftz>; +def INT_NVVM_F2H_RN : F_MATH_1<!strconcat("{{\n\t", + !strconcat(".reg .b16 %temp;\n\t", + !strconcat("cvt.rn.f16.f32 \t%temp, $src0;\n\t", + !strconcat("mov.b16 \t$dst, %temp;\n", + "}}")))), + Int16Regs, Float32Regs, int_nvvm_f2h_rn>; + +def INT_NVVM_H2F : F_MATH_1<!strconcat("{{\n\t", + !strconcat(".reg .b16 %temp;\n\t", + !strconcat("mov.b16 \t%temp, $src0;\n\t", + !strconcat("cvt.f32.f16 \t$dst, %temp;\n\t", + "}}")))), + Float32Regs, Int16Regs, int_nvvm_h2f>; + +// +// Bitcast +// + +def INT_NVVM_BITCAST_F2I : F_MATH_1<"mov.b32 \t$dst, $src0;", Int32Regs, + Float32Regs, int_nvvm_bitcast_f2i>; +def INT_NVVM_BITCAST_I2F : F_MATH_1<"mov.b32 \t$dst, $src0;", Float32Regs, + Int32Regs, int_nvvm_bitcast_i2f>; + +def INT_NVVM_BITCAST_LL2D : F_MATH_1<"mov.b64 \t$dst, $src0;", Float64Regs, + Int64Regs, int_nvvm_bitcast_ll2d>; +def INT_NVVM_BITCAST_D2LL : F_MATH_1<"mov.b64 \t$dst, $src0;", Int64Regs, + Float64Regs, int_nvvm_bitcast_d2ll>; + +//----------------------------------- +// Atomic Functions +//----------------------------------- + +class ATOMIC_GLOBAL_CHK <dag ops, dag frag> + : PatFrag<ops, frag, [{ + return ChkMemSDNodeAddressSpace(N, 
llvm::ADDRESS_SPACE_GLOBAL); +}]>; +class ATOMIC_SHARED_CHK <dag ops, dag frag> + : PatFrag<ops, frag, [{ + return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED); +}]>; +class ATOMIC_GENERIC_CHK <dag ops, dag frag> + : PatFrag<ops, frag, [{ + return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GENERIC); +}]>; + +multiclass F_ATOMIC_2_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass, + string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp, + Operand IMMType, SDNode IMM, Predicate Pred> { + def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b), + !strconcat("atom", + !strconcat(SpaceStr, + !strconcat(OpcStr, + !strconcat(TypeStr, + !strconcat(" \t$dst, [$addr], $b;", ""))))), + [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>, + Requires<[Pred]>; + def imm : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, IMMType:$b), + !strconcat("atom", + !strconcat(SpaceStr, + !strconcat(OpcStr, + !strconcat(TypeStr, + !strconcat(" \t$dst, [$addr], $b;", ""))))), + [(set regclass:$dst, (IntOp ptrclass:$addr, IMM:$b))]>, + Requires<[Pred]>; +} +multiclass F_ATOMIC_2<NVPTXRegClass regclass, string SpaceStr, string TypeStr, + string OpcStr, PatFrag IntOp, Operand IMMType, SDNode IMM, Predicate Pred> { + defm p32 : F_ATOMIC_2_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr, + IntOp, IMMType, IMM, Pred>; + defm p64 : F_ATOMIC_2_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr, + IntOp, IMMType, IMM, Pred>; +} + +// has 2 operands, neg the second one +multiclass F_ATOMIC_2_NEG_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass, + string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp, + Operand IMMType, Predicate Pred> { + def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b), + !strconcat("{{ \n\t", + !strconcat(".reg \t.s", + !strconcat(TypeStr, + !strconcat(" temp; \n\t", + !strconcat("neg.s", + !strconcat(TypeStr, + !strconcat(" \ttemp, $b; \n\t", + !strconcat("atom", + !strconcat(SpaceStr, + !strconcat(OpcStr, + !strconcat(".u", + !strconcat(TypeStr, + !strconcat(" \t$dst, [$addr], temp; \n\t", + !strconcat("}}", "")))))))))))))), + [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>, + Requires<[Pred]>; +} +multiclass F_ATOMIC_2_NEG<NVPTXRegClass regclass, string SpaceStr, + string TypeStr, string OpcStr, PatFrag IntOp, Operand IMMType, + Predicate Pred> { + defm p32: F_ATOMIC_2_NEG_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr, + IntOp, IMMType, Pred> ; + defm p64: F_ATOMIC_2_NEG_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr, + IntOp, IMMType, Pred> ; +} + +// has 3 operands +multiclass F_ATOMIC_3_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass, + string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp, + Operand IMMType, Predicate Pred> { + def reg : NVPTXInst<(outs regclass:$dst), + (ins ptrclass:$addr, regclass:$b, regclass:$c), + !strconcat("atom", + !strconcat(SpaceStr, + !strconcat(OpcStr, + !strconcat(TypeStr, + !strconcat(" \t$dst, [$addr], $b, $c;", ""))))), + [(set regclass:$dst, + (IntOp ptrclass:$addr, regclass:$b, regclass:$c))]>, + Requires<[Pred]>; + def imm1 : NVPTXInst<(outs regclass:$dst), + (ins ptrclass:$addr, IMMType:$b, regclass:$c), + !strconcat("atom", + !strconcat(SpaceStr, + !strconcat(OpcStr, + !strconcat(TypeStr, + !strconcat(" \t$dst, [$addr], $b, $c;", ""))))), + [(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, regclass:$c))]>, + Requires<[Pred]>; + def imm2 : NVPTXInst<(outs regclass:$dst), + (ins ptrclass:$addr, regclass:$b, IMMType:$c), + 
!strconcat("atom", + !strconcat(SpaceStr, + !strconcat(OpcStr, + !strconcat(TypeStr, + !strconcat(" \t$dst, [$addr], $b, $c;", ""))))), + [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b, imm:$c))]>, + Requires<[Pred]>; + def imm3 : NVPTXInst<(outs regclass:$dst), + (ins ptrclass:$addr, IMMType:$b, IMMType:$c), + !strconcat("atom", + !strconcat(SpaceStr, + !strconcat(OpcStr, + !strconcat(TypeStr, + !strconcat(" \t$dst, [$addr], $b, $c;", ""))))), + [(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, imm:$c))]>, + Requires<[Pred]>; +} +multiclass F_ATOMIC_3<NVPTXRegClass regclass, string SpaceStr, string TypeStr, + string OpcStr, PatFrag IntOp, Operand IMMType, Predicate Pred> { + defm p32 : F_ATOMIC_3_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr, + IntOp, IMMType, Pred>; + defm p64 : F_ATOMIC_3_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr, + IntOp, IMMType, Pred>; +} + +// atom_add + +def atomic_load_add_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_add_32 node:$a, node:$b)>; +def atomic_load_add_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_add_32 node:$a, node:$b)>; +def atomic_load_add_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_add_32 node:$a, node:$b)>; +def atomic_load_add_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_add_64 node:$a, node:$b)>; +def atomic_load_add_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_add_64 node:$a, node:$b)>; +def atomic_load_add_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_add_64 node:$a, node:$b)>; +def atomic_load_add_f32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>; +def atomic_load_add_f32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>; +def atomic_load_add_f32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>; + +defm INT_PTX_ATOM_ADD_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".add", + atomic_load_add_32_g, i32imm, imm, hasAtomRedG32>; +defm INT_PTX_ATOM_ADD_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".add", + atomic_load_add_32_s, i32imm, imm, hasAtomRedS32>; +defm INT_PTX_ATOM_ADD_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".add", + atomic_load_add_32_gen, i32imm, imm, hasAtomRedGen32>; +defm INT_PTX_ATOM_ADD_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32", + ".add", atomic_load_add_32_gen, i32imm, imm, useAtomRedG32forGen32>; + +defm INT_PTX_ATOM_ADD_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".u64", ".add", + atomic_load_add_64_g, i64imm, imm, hasAtomRedG64>; +defm INT_PTX_ATOM_ADD_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".u64", ".add", + atomic_load_add_64_s, i64imm, imm, hasAtomRedS64>; +defm INT_PTX_ATOM_ADD_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".u64", ".add", + atomic_load_add_64_gen, i64imm, imm, hasAtomRedGen64>; +defm INT_PTX_ATOM_ADD_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".u64", + ".add", atomic_load_add_64_gen, i64imm, imm, useAtomRedG64forGen64>; + +defm INT_PTX_ATOM_ADD_G_F32 : F_ATOMIC_2<Float32Regs, ".global", ".f32", ".add", + atomic_load_add_f32_g, f32imm, fpimm, hasAtomAddF32>; +defm INT_PTX_ATOM_ADD_S_F32 : F_ATOMIC_2<Float32Regs, ".shared", ".f32", ".add", + atomic_load_add_f32_s, f32imm, fpimm, hasAtomAddF32>; +defm INT_PTX_ATOM_ADD_GEN_F32 : F_ATOMIC_2<Float32Regs, "", ".f32", ".add", + atomic_load_add_f32_gen, f32imm, fpimm, hasAtomAddF32>; + +// atom_sub + +def atomic_load_sub_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + 
(atomic_load_sub_32 node:$a, node:$b)>; +def atomic_load_sub_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_sub_32 node:$a, node:$b)>; +def atomic_load_sub_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_sub_32 node:$a, node:$b)>; +def atomic_load_sub_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_sub_64 node:$a, node:$b)>; +def atomic_load_sub_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_sub_64 node:$a, node:$b)>; +def atomic_load_sub_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_sub_64 node:$a, node:$b)>; + +defm INT_PTX_ATOM_SUB_G_32 : F_ATOMIC_2_NEG<Int32Regs, ".global", "32", ".add", + atomic_load_sub_32_g, i32imm, hasAtomRedG32>; +defm INT_PTX_ATOM_SUB_G_64 : F_ATOMIC_2_NEG<Int64Regs, ".global", "64", ".add", + atomic_load_sub_64_g, i64imm, hasAtomRedG64>; +defm INT_PTX_ATOM_SUB_GEN_32 : F_ATOMIC_2_NEG<Int32Regs, "", "32", ".add", + atomic_load_sub_32_gen, i32imm, hasAtomRedGen32>; +defm INT_PTX_ATOM_SUB_GEN_32_USE_G : F_ATOMIC_2_NEG<Int32Regs, ".global", "32", + ".add", atomic_load_sub_32_gen, i32imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_SUB_S_32 : F_ATOMIC_2_NEG<Int32Regs, ".shared", "32", ".add", + atomic_load_sub_32_s, i32imm, hasAtomRedS32>; +defm INT_PTX_ATOM_SUB_S_64 : F_ATOMIC_2_NEG<Int64Regs, ".shared", "64", ".add", + atomic_load_sub_64_s, i64imm, hasAtomRedS64>; +defm INT_PTX_ATOM_SUB_GEN_64 : F_ATOMIC_2_NEG<Int64Regs, "", "64", ".add", + atomic_load_sub_64_gen, i64imm, hasAtomRedGen64>; +defm INT_PTX_ATOM_SUB_GEN_64_USE_G : F_ATOMIC_2_NEG<Int64Regs, ".global", "64", + ".add", atomic_load_sub_64_gen, i64imm, useAtomRedG64forGen64>; + +// atom_swap + +def atomic_swap_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_swap_32 node:$a, node:$b)>; +def atomic_swap_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_swap_32 node:$a, node:$b)>; +def atomic_swap_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_swap_32 node:$a, node:$b)>; +def atomic_swap_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_swap_64 node:$a, node:$b)>; +def atomic_swap_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_swap_64 node:$a, node:$b)>; +def atomic_swap_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_swap_64 node:$a, node:$b)>; + +defm INT_PTX_ATOM_SWAP_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".exch", + atomic_swap_32_g, i32imm, imm, hasAtomRedG32>; +defm INT_PTX_ATOM_SWAP_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".exch", + atomic_swap_32_s, i32imm, imm, hasAtomRedS32>; +defm INT_PTX_ATOM_SWAP_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".exch", + atomic_swap_32_gen, i32imm, imm, hasAtomRedGen32>; +defm INT_PTX_ATOM_SWAP_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32", + ".exch", atomic_swap_32_gen, i32imm, imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_SWAP_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".exch", + atomic_swap_64_g, i64imm, imm, hasAtomRedG64>; +defm INT_PTX_ATOM_SWAP_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".exch", + atomic_swap_64_s, i64imm, imm, hasAtomRedS64>; +defm INT_PTX_ATOM_SWAP_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".exch", + atomic_swap_64_gen, i64imm, imm, hasAtomRedGen64>; +defm INT_PTX_ATOM_SWAP_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64", + ".exch", atomic_swap_64_gen, i64imm, imm, useAtomRedG64forGen64>; + +// atom_max + +def atomic_load_max_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b) + , (atomic_load_max_32 node:$a, node:$b)>; +def atomic_load_max_32_s: 
ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_max_32 node:$a, node:$b)>; +def atomic_load_max_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_max_32 node:$a, node:$b)>; +def atomic_load_umax_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_umax_32 node:$a, node:$b)>; +def atomic_load_umax_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_umax_32 node:$a, node:$b)>; +def atomic_load_umax_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_umax_32 node:$a, node:$b)>; + +defm INT_PTX_ATOM_LOAD_MAX_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".s32", + ".max", atomic_load_max_32_g, i32imm, imm, hasAtomRedG32>; +defm INT_PTX_ATOM_LOAD_MAX_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".s32", + ".max", atomic_load_max_32_s, i32imm, imm, hasAtomRedS32>; +defm INT_PTX_ATOM_LOAD_MAX_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".s32", ".max", + atomic_load_max_32_gen, i32imm, imm, hasAtomRedGen32>; +defm INT_PTX_ATOM_LOAD_MAX_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", + ".s32", ".max", atomic_load_max_32_gen, i32imm, imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_LOAD_UMAX_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", + ".max", atomic_load_umax_32_g, i32imm, imm, hasAtomRedG32>; +defm INT_PTX_ATOM_LOAD_UMAX_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", + ".max", atomic_load_umax_32_s, i32imm, imm, hasAtomRedS32>; +defm INT_PTX_ATOM_LOAD_UMAX_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".max", + atomic_load_umax_32_gen, i32imm, imm, hasAtomRedGen32>; +defm INT_PTX_ATOM_LOAD_UMAX_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", + ".u32", ".max", atomic_load_umax_32_gen, i32imm, imm, useAtomRedG32forGen32>; + +// atom_min + +def atomic_load_min_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_min_32 node:$a, node:$b)>; +def atomic_load_min_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_min_32 node:$a, node:$b)>; +def atomic_load_min_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_min_32 node:$a, node:$b)>; +def atomic_load_umin_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_umin_32 node:$a, node:$b)>; +def atomic_load_umin_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_umin_32 node:$a, node:$b)>; +def atomic_load_umin_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_umin_32 node:$a, node:$b)>; + +defm INT_PTX_ATOM_LOAD_MIN_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".s32", + ".min", atomic_load_min_32_g, i32imm, imm, hasAtomRedG32>; +defm INT_PTX_ATOM_LOAD_MIN_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".s32", + ".min", atomic_load_min_32_s, i32imm, imm, hasAtomRedS32>; +defm INT_PTX_ATOM_LOAD_MIN_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".s32", ".min", + atomic_load_min_32_gen, i32imm, imm, hasAtomRedGen32>; +defm INT_PTX_ATOM_LOAD_MIN_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", + ".s32", ".min", atomic_load_min_32_gen, i32imm, imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_LOAD_UMIN_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", + ".min", atomic_load_umin_32_g, i32imm, imm, hasAtomRedG32>; +defm INT_PTX_ATOM_LOAD_UMIN_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", + ".min", atomic_load_umin_32_s, i32imm, imm, hasAtomRedS32>; +defm INT_PTX_ATOM_LOAD_UMIN_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".min", + atomic_load_umin_32_gen, i32imm, imm, hasAtomRedGen32>; +defm INT_PTX_ATOM_LOAD_UMIN_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", + ".u32", ".min", atomic_load_umin_32_gen, i32imm, imm, useAtomRedG32forGen32>; + +// atom_inc atom_dec + +def 
atomic_load_inc_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>; +def atomic_load_inc_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>; +def atomic_load_inc_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>; +def atomic_load_dec_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>; +def atomic_load_dec_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>; +def atomic_load_dec_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>; + +defm INT_PTX_ATOM_INC_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".inc", + atomic_load_inc_32_g, i32imm, imm, hasAtomRedG32>; +defm INT_PTX_ATOM_INC_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".inc", + atomic_load_inc_32_s, i32imm, imm, hasAtomRedS32>; +defm INT_PTX_ATOM_INC_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".inc", + atomic_load_inc_32_gen, i32imm, imm, hasAtomRedGen32>; +defm INT_PTX_ATOM_INC_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32", + ".inc", atomic_load_inc_32_gen, i32imm, imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_DEC_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".dec", + atomic_load_dec_32_g, i32imm, imm, hasAtomRedG32>; +defm INT_PTX_ATOM_DEC_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".dec", + atomic_load_dec_32_s, i32imm, imm, hasAtomRedS32>; +defm INT_PTX_ATOM_DEC_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".dec", + atomic_load_dec_32_gen, i32imm, imm, hasAtomRedGen32>; +defm INT_PTX_ATOM_DEC_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32", + ".dec", atomic_load_dec_32_gen, i32imm, imm, useAtomRedG32forGen32>; + +// atom_and + +def atomic_load_and_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_and_32 node:$a, node:$b)>; +def atomic_load_and_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_and_32 node:$a, node:$b)>; +def atomic_load_and_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_and_32 node:$a, node:$b)>; + +defm INT_PTX_ATOM_AND_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".and", + atomic_load_and_32_g, i32imm, imm, hasAtomRedG32>; +defm INT_PTX_ATOM_AND_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".and", + atomic_load_and_32_s, i32imm, imm, hasAtomRedS32>; +defm INT_PTX_ATOM_AND_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".and", + atomic_load_and_32_gen, i32imm, imm, hasAtomRedGen32>; +defm INT_PTX_ATOM_AND_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32", + ".and", atomic_load_and_32_gen, i32imm, imm, useAtomRedG32forGen32>; + +// atom_or + +def atomic_load_or_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_or_32 node:$a, node:$b)>; +def atomic_load_or_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_or_32 node:$a, node:$b)>; +def atomic_load_or_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_or_32 node:$a, node:$b)>; + +defm INT_PTX_ATOM_OR_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".or", + atomic_load_or_32_g, i32imm, imm, hasAtomRedG32>; +defm INT_PTX_ATOM_OR_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".or", + atomic_load_or_32_gen, i32imm, imm, hasAtomRedGen32>; +defm INT_PTX_ATOM_OR_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32", + ".or", atomic_load_or_32_gen, i32imm, imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_OR_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".or", + 
atomic_load_or_32_s, i32imm, imm, hasAtomRedS32>; + +// atom_xor + +def atomic_load_xor_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_xor_32 node:$a, node:$b)>; +def atomic_load_xor_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_xor_32 node:$a, node:$b)>; +def atomic_load_xor_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_xor_32 node:$a, node:$b)>; + +defm INT_PTX_ATOM_XOR_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".xor", + atomic_load_xor_32_g, i32imm, imm, hasAtomRedG32>; +defm INT_PTX_ATOM_XOR_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".xor", + atomic_load_xor_32_s, i32imm, imm, hasAtomRedS32>; +defm INT_PTX_ATOM_XOR_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".xor", + atomic_load_xor_32_gen, i32imm, imm, hasAtomRedGen32>; +defm INT_PTX_ATOM_XOR_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32", + ".xor", atomic_load_xor_32_gen, i32imm, imm, useAtomRedG32forGen32>; + +// atom_cas + +def atomic_cmp_swap_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c), + (atomic_cmp_swap_32 node:$a, node:$b, node:$c)>; +def atomic_cmp_swap_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c), + (atomic_cmp_swap_32 node:$a, node:$b, node:$c)>; +def atomic_cmp_swap_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c), + (atomic_cmp_swap_32 node:$a, node:$b, node:$c)>; +def atomic_cmp_swap_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c), + (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>; +def atomic_cmp_swap_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c), + (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>; +def atomic_cmp_swap_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c), + (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>; + +defm INT_PTX_ATOM_CAS_G_32 : F_ATOMIC_3<Int32Regs, ".global", ".b32", ".cas", + atomic_cmp_swap_32_g, i32imm, hasAtomRedG32>; +defm INT_PTX_ATOM_CAS_S_32 : F_ATOMIC_3<Int32Regs, ".shared", ".b32", ".cas", + atomic_cmp_swap_32_s, i32imm, hasAtomRedS32>; +defm INT_PTX_ATOM_CAS_GEN_32 : F_ATOMIC_3<Int32Regs, "", ".b32", ".cas", + atomic_cmp_swap_32_gen, i32imm, hasAtomRedGen32>; +defm INT_PTX_ATOM_CAS_GEN_32_USE_G : F_ATOMIC_3<Int32Regs, ".global", ".b32", + ".cas", atomic_cmp_swap_32_gen, i32imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_CAS_G_64 : F_ATOMIC_3<Int64Regs, ".global", ".b64", ".cas", + atomic_cmp_swap_64_g, i64imm, hasAtomRedG64>; +defm INT_PTX_ATOM_CAS_S_64 : F_ATOMIC_3<Int64Regs, ".shared", ".b64", ".cas", + atomic_cmp_swap_64_s, i64imm, hasAtomRedS64>; +defm INT_PTX_ATOM_CAS_GEN_64 : F_ATOMIC_3<Int64Regs, "", ".b64", ".cas", + atomic_cmp_swap_64_gen, i64imm, hasAtomRedGen64>; +defm INT_PTX_ATOM_CAS_GEN_64_USE_G : F_ATOMIC_3<Int64Regs, ".global", ".b64", + ".cas", atomic_cmp_swap_64_gen, i64imm, useAtomRedG64forGen64>; + + +//----------------------------------- +// Read Special Registers +//----------------------------------- +class F_SREG<string OpStr, NVPTXRegClass regclassOut, Intrinsic IntOp> : + NVPTXInst<(outs regclassOut:$dst), (ins), + OpStr, + [(set regclassOut:$dst, (IntOp))]>; + +def INT_PTX_SREG_TID_X : F_SREG<"mov.u32 \t$dst, %tid.x;", Int32Regs, + int_nvvm_read_ptx_sreg_tid_x>; +def INT_PTX_SREG_TID_Y : F_SREG<"mov.u32 \t$dst, %tid.y;", Int32Regs, + int_nvvm_read_ptx_sreg_tid_y>; +def INT_PTX_SREG_TID_Z : F_SREG<"mov.u32 \t$dst, %tid.z;", Int32Regs, + int_nvvm_read_ptx_sreg_tid_z>; + +def INT_PTX_SREG_NTID_X : F_SREG<"mov.u32 \t$dst, %ntid.x;", Int32Regs, + int_nvvm_read_ptx_sreg_ntid_x>; +def INT_PTX_SREG_NTID_Y : F_SREG<"mov.u32 \t$dst, 
%ntid.y;", Int32Regs, + int_nvvm_read_ptx_sreg_ntid_y>; +def INT_PTX_SREG_NTID_Z : F_SREG<"mov.u32 \t$dst, %ntid.z;", Int32Regs, + int_nvvm_read_ptx_sreg_ntid_z>; + +def INT_PTX_SREG_CTAID_X : F_SREG<"mov.u32 \t$dst, %ctaid.x;", Int32Regs, + int_nvvm_read_ptx_sreg_ctaid_x>; +def INT_PTX_SREG_CTAID_Y : F_SREG<"mov.u32 \t$dst, %ctaid.y;", Int32Regs, + int_nvvm_read_ptx_sreg_ctaid_y>; +def INT_PTX_SREG_CTAID_Z : F_SREG<"mov.u32 \t$dst, %ctaid.z;", Int32Regs, + int_nvvm_read_ptx_sreg_ctaid_z>; + +def INT_PTX_SREG_NCTAID_X : F_SREG<"mov.u32 \t$dst, %nctaid.x;", Int32Regs, + int_nvvm_read_ptx_sreg_nctaid_x>; +def INT_PTX_SREG_NCTAID_Y : F_SREG<"mov.u32 \t$dst, %nctaid.y;", Int32Regs, + int_nvvm_read_ptx_sreg_nctaid_y>; +def INT_PTX_SREG_NCTAID_Z : F_SREG<"mov.u32 \t$dst, %nctaid.z;", Int32Regs, + int_nvvm_read_ptx_sreg_nctaid_z>; + +def INT_PTX_SREG_WARPSIZE : F_SREG<"mov.u32 \t$dst, WARP_SZ;", Int32Regs, + int_nvvm_read_ptx_sreg_warpsize>; + + +//----------------------------------- +// Support for ldu on sm_20 or later +//----------------------------------- + +// Scalar +// @TODO: Revisit this, Changed imemAny to imem +multiclass LDU_G<string TyStr, NVPTXRegClass regclass, Intrinsic IntOp> { + def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src), + !strconcat("ldu.global.", TyStr), + [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDU]>; + def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src), + !strconcat("ldu.global.", TyStr), + [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDU]>; + def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src), + !strconcat("ldu.global.", TyStr), + [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>, + Requires<[hasLDU]>; + def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src), + !strconcat("ldu.global.", TyStr), + [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDU]>; + def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src), + !strconcat("ldu.global.", TyStr), + [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDU]>; +} + +defm INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8 \t$result, [$src];", Int8Regs, +int_nvvm_ldu_global_i>; +defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs, +int_nvvm_ldu_global_i>; +defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs, +int_nvvm_ldu_global_i>; +defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs, +int_nvvm_ldu_global_i>; +defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs, +int_nvvm_ldu_global_f>; +defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs, +int_nvvm_ldu_global_f>; +defm INT_PTX_LDU_GLOBAL_p32 : LDU_G<"u32 \t$result, [$src];", Int32Regs, +int_nvvm_ldu_global_p>; +defm INT_PTX_LDU_GLOBAL_p64 : LDU_G<"u64 \t$result, [$src];", Int64Regs, +int_nvvm_ldu_global_p>; + +// vector + +// Elementized vector ldu +multiclass VLDU_G_ELE_V2<string TyStr, NVPTXRegClass regclass> { + def _32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2), + (ins Int32Regs:$src), + !strconcat("ldu.global.", TyStr), []>; + def _64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2), + (ins Int64Regs:$src), + !strconcat("ldu.global.", TyStr), []>; +} + +multiclass VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass> { + def _32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, + regclass:$dst4), (ins Int32Regs:$src), + !strconcat("ldu.global.", TyStr), []>; + def _64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, 
regclass:$dst3, + regclass:$dst4), (ins Int64Regs:$src), + !strconcat("ldu.global.", TyStr), []>; +} + +defm INT_PTX_LDU_G_v2i8_ELE + : VLDU_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];", Int8Regs>; +defm INT_PTX_LDU_G_v2i16_ELE + : VLDU_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>; +defm INT_PTX_LDU_G_v2i32_ELE + : VLDU_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>; +defm INT_PTX_LDU_G_v2f32_ELE + : VLDU_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>; +defm INT_PTX_LDU_G_v2i64_ELE + : VLDU_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>; +defm INT_PTX_LDU_G_v2f64_ELE + : VLDU_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>; +defm INT_PTX_LDU_G_v4i8_ELE + : VLDU_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int8Regs>; +defm INT_PTX_LDU_G_v4i16_ELE + : VLDU_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", + Int16Regs>; +defm INT_PTX_LDU_G_v4i32_ELE + : VLDU_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", + Int32Regs>; +defm INT_PTX_LDU_G_v4f32_ELE + : VLDU_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", + Float32Regs>; + +// Vector ldu +multiclass VLDU_G<string TyStr, NVPTXRegClass regclass, Intrinsic IntOp, + NVPTXInst eleInst, NVPTXInst eleInst64> { + def _32: NVPTXVecInst<(outs regclass:$result), (ins Int32Regs:$src), + !strconcat("ldu.global.", TyStr), + [(set regclass:$result, (IntOp Int32Regs:$src))], eleInst>, + Requires<[hasLDU]>; + def _64: NVPTXVecInst<(outs regclass:$result), (ins Int64Regs:$src), + !strconcat("ldu.global.", TyStr), + [(set regclass:$result, (IntOp Int64Regs:$src))], eleInst64>, + Requires<[hasLDU]>; +} + +let VecInstType=isVecLD.Value in { +defm INT_PTX_LDU_G_v2i8 : VLDU_G<"v2.u8 \t${result:vecfull}, [$src];", + V2I8Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v2i8_ELE_32, + INT_PTX_LDU_G_v2i8_ELE_64>; +defm INT_PTX_LDU_G_v4i8 : VLDU_G<"v4.u8 \t${result:vecfull}, [$src];", + V4I8Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v4i8_ELE_32, + INT_PTX_LDU_G_v4i8_ELE_64>; +defm INT_PTX_LDU_G_v2i16 : VLDU_G<"v2.u16 \t${result:vecfull}, [$src];", + V2I16Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v2i16_ELE_32, + INT_PTX_LDU_G_v2i16_ELE_64>; +defm INT_PTX_LDU_G_v4i16 : VLDU_G<"v4.u16 \t${result:vecfull}, [$src];", + V4I16Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v4i16_ELE_32, + INT_PTX_LDU_G_v4i16_ELE_64>; +defm INT_PTX_LDU_G_v2i32 : VLDU_G<"v2.u32 \t${result:vecfull}, [$src];", + V2I32Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v2i32_ELE_32, + INT_PTX_LDU_G_v2i32_ELE_64>; +defm INT_PTX_LDU_G_v4i32 : VLDU_G<"v4.u32 \t${result:vecfull}, [$src];", + V4I32Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v4i32_ELE_32, + INT_PTX_LDU_G_v4i32_ELE_64>; +defm INT_PTX_LDU_G_v2f32 : VLDU_G<"v2.f32 \t${result:vecfull}, [$src];", + V2F32Regs, int_nvvm_ldu_global_f, INT_PTX_LDU_G_v2f32_ELE_32, + INT_PTX_LDU_G_v2f32_ELE_64>; +defm INT_PTX_LDU_G_v4f32 : VLDU_G<"v4.f32 \t${result:vecfull}, [$src];", + V4F32Regs, int_nvvm_ldu_global_f, INT_PTX_LDU_G_v4f32_ELE_32, + INT_PTX_LDU_G_v4f32_ELE_64>; +defm INT_PTX_LDU_G_v2i64 : VLDU_G<"v2.u64 \t${result:vecfull}, [$src];", + V2I64Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v2i64_ELE_32, + INT_PTX_LDU_G_v2i64_ELE_64>; +defm INT_PTX_LDU_G_v2f64 : VLDU_G<"v2.f64 \t${result:vecfull}, [$src];", + V2F64Regs, int_nvvm_ldu_global_f, INT_PTX_LDU_G_v2f64_ELE_32, + INT_PTX_LDU_G_v2f64_ELE_64>; +} + + + +multiclass NG_TO_G<string Str, Intrinsic Intrin> { + def _yes : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), + !strconcat("cvta.", !strconcat(Str, 
".u32 \t$result, $src;")), + [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>, + Requires<[hasGenericLdSt]>; + def _yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), + !strconcat("cvta.", !strconcat(Str, ".u64 \t$result, $src;")), + [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>, + Requires<[hasGenericLdSt]>; + +// @TODO: Are these actually needed? I believe global addresses will be copied +// to register values anyway. + /*def __addr_yes : NVPTXInst<(outs Int32Regs:$result), (ins imemAny:$src), + !strconcat("cvta.", !strconcat(Str, ".u32 \t$result, $src;")), + [(set Int32Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>, + Requires<[hasGenericLdSt]>; + def __addr_yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins imemAny:$src), + !strconcat("cvta.", !strconcat(Str, ".u64 \t$result, $src;")), + [(set Int64Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>, + Requires<[hasGenericLdSt]>;*/ + + def _no : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), + "mov.u32 \t$result, $src;", + [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>; + def _no_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), + "mov.u64 \t$result, $src;", + [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>; + +// @TODO: Are these actually needed? I believe global addresses will be copied +// to register values anyway. + /*def _addr_no : NVPTXInst<(outs Int32Regs:$result), (ins imem:$src), + "mov.u32 \t$result, $src;", + [(set Int32Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>; + def _addr_no_64 : NVPTXInst<(outs Int64Regs:$result), (ins imem:$src), + "mov.u64 \t$result, $src;", + [(set Int64Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>;*/ +} + +multiclass G_TO_NG<string Str, Intrinsic Intrin> { + def _yes : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), + !strconcat("cvta.to.", !strconcat(Str, ".u32 \t$result, $src;")), + [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>, + Requires<[hasGenericLdSt]>; + def _yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), + !strconcat("cvta.to.", !strconcat(Str, ".u64 \t$result, $src;")), + [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>, + Requires<[hasGenericLdSt]>; + def _no : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), + "mov.u32 \t$result, $src;", + [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>; + def _no_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), + "mov.u64 \t$result, $src;", + [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>; +} + +defm cvta_local : NG_TO_G<"local", int_nvvm_ptr_local_to_gen>; +defm cvta_shared : NG_TO_G<"shared", int_nvvm_ptr_shared_to_gen>; +defm cvta_global : NG_TO_G<"global", int_nvvm_ptr_global_to_gen>; + +defm cvta_to_local : G_TO_NG<"local", int_nvvm_ptr_gen_to_local>; +defm cvta_to_shared : G_TO_NG<"shared", int_nvvm_ptr_gen_to_shared>; +defm cvta_to_global : G_TO_NG<"global", int_nvvm_ptr_gen_to_global>; + +def cvta_const : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), + "mov.u32 \t$result, $src;", + [(set Int32Regs:$result, (int_nvvm_ptr_constant_to_gen Int32Regs:$src))]>; +def cvta_const_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), + "mov.u64 \t$result, $src;", + [(set Int64Regs:$result, (int_nvvm_ptr_constant_to_gen Int64Regs:$src))]>; + + + +// @TODO: Revisit this. There is a type +// contradiction between iPTRAny and iPTR for the def. 
+/*def cvta_const_addr : NVPTXInst<(outs Int32Regs:$result), (ins imemAny:$src), + "mov.u32 \t$result, $src;", + [(set Int32Regs:$result, (int_nvvm_ptr_constant_to_gen + (Wrapper tglobaladdr:$src)))]>; +def cvta_const_addr_64 : NVPTXInst<(outs Int64Regs:$result), (ins imemAny:$src), + "mov.u64 \t$result, $src;", + [(set Int64Regs:$result, (int_nvvm_ptr_constant_to_gen + (Wrapper tglobaladdr:$src)))]>;*/ + + +def cvta_to_const : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), + "mov.u32 \t$result, $src;", + [(set Int32Regs:$result, (int_nvvm_ptr_gen_to_constant Int32Regs:$src))]>; +def cvta_to_const_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), + "mov.u64 \t$result, $src;", + [(set Int64Regs:$result, (int_nvvm_ptr_gen_to_constant Int64Regs:$src))]>; + + +// nvvm.ptr.gen.to.param +def nvvm_ptr_gen_to_param : NVPTXInst<(outs Int32Regs:$result), + (ins Int32Regs:$src), + "mov.u32 \t$result, $src;", + [(set Int32Regs:$result, + (int_nvvm_ptr_gen_to_param Int32Regs:$src))]>; +def nvvm_ptr_gen_to_param_64 : NVPTXInst<(outs Int64Regs:$result), + (ins Int64Regs:$src), + "mov.u64 \t$result, $src;", + [(set Int64Regs:$result, + (int_nvvm_ptr_gen_to_param Int64Regs:$src))]>; + + +// nvvm.move intrinsicc +def nvvm_move_i8 : NVPTXInst<(outs Int8Regs:$r), (ins Int8Regs:$s), + "mov.b16 \t$r, $s;", + [(set Int8Regs:$r, + (int_nvvm_move_i8 Int8Regs:$s))]>; +def nvvm_move_i16 : NVPTXInst<(outs Int16Regs:$r), (ins Int16Regs:$s), + "mov.b16 \t$r, $s;", + [(set Int16Regs:$r, + (int_nvvm_move_i16 Int16Regs:$s))]>; +def nvvm_move_i32 : NVPTXInst<(outs Int32Regs:$r), (ins Int32Regs:$s), + "mov.b32 \t$r, $s;", + [(set Int32Regs:$r, + (int_nvvm_move_i32 Int32Regs:$s))]>; +def nvvm_move_i64 : NVPTXInst<(outs Int64Regs:$r), (ins Int64Regs:$s), + "mov.b64 \t$r, $s;", + [(set Int64Regs:$r, + (int_nvvm_move_i64 Int64Regs:$s))]>; +def nvvm_move_float : NVPTXInst<(outs Float32Regs:$r), (ins Float32Regs:$s), + "mov.f32 \t$r, $s;", + [(set Float32Regs:$r, + (int_nvvm_move_float Float32Regs:$s))]>; +def nvvm_move_double : NVPTXInst<(outs Float64Regs:$r), (ins Float64Regs:$s), + "mov.f64 \t$r, $s;", + [(set Float64Regs:$r, + (int_nvvm_move_double Float64Regs:$s))]>; +def nvvm_move_ptr32 : NVPTXInst<(outs Int32Regs:$r), (ins Int32Regs:$s), + "mov.u32 \t$r, $s;", + [(set Int32Regs:$r, + (int_nvvm_move_ptr Int32Regs:$s))]>; +def nvvm_move_ptr64 : NVPTXInst<(outs Int64Regs:$r), (ins Int64Regs:$s), + "mov.u64 \t$r, $s;", + [(set Int64Regs:$r, + (int_nvvm_move_ptr Int64Regs:$s))]>; + +// @TODO: Are these actually needed, or will we always just see symbols +// copied to registers first? +/*def nvvm_move_sym32 : NVPTXInst<(outs Int32Regs:$r), (ins imem:$s), + "mov.u32 \t$r, $s;", + [(set Int32Regs:$r, + (int_nvvm_move_ptr texternalsym:$s))]>; +def nvvm_move_sym64 : NVPTXInst<(outs Int64Regs:$r), (ins imem:$s), + "mov.u64 \t$r, $s;", + [(set Int64Regs:$r, + (int_nvvm_move_ptr texternalsym:$s))]>;*/ + + +// MoveParam %r1, param +// ptr_local_to_gen %r2, %r1 +// ptr_gen_to_local %r3, %r2 +// -> +// mov %r1, param + +// @TODO: Revisit this. There is a type +// contradiction between iPTRAny and iPTR for the addr defs, so the move_sym +// instructions are not currently defined. However, we can use the ptr +// variants and the asm printer will do the right thing. 
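+// The two patterns below implement that fold for 64-bit and 32-bit pointers:
+// a local->generic->local round trip on a parameter symbol collapses into a
+// single mov of the symbol, so no cvta pair is emitted for it.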
+def : Pat<(i64 (int_nvvm_ptr_gen_to_local (int_nvvm_ptr_local_to_gen + (MoveParam texternalsym:$src)))), + (nvvm_move_ptr64 texternalsym:$src)>; +def : Pat<(i32 (int_nvvm_ptr_gen_to_local (int_nvvm_ptr_local_to_gen + (MoveParam texternalsym:$src)))), + (nvvm_move_ptr32 texternalsym:$src)>; + + +//----------------------------------- +// Compiler Error Warn +// - Just ignore them in codegen +//----------------------------------- + +def INT_NVVM_COMPILER_WARN_32 : NVPTXInst<(outs), (ins Int32Regs:$a), + "// llvm.nvvm.compiler.warn()", + [(int_nvvm_compiler_warn Int32Regs:$a)]>; +def INT_NVVM_COMPILER_WARN_64 : NVPTXInst<(outs), (ins Int64Regs:$a), + "// llvm.nvvm.compiler.warn()", + [(int_nvvm_compiler_warn Int64Regs:$a)]>; +def INT_NVVM_COMPILER_ERROR_32 : NVPTXInst<(outs), (ins Int32Regs:$a), + "// llvm.nvvm.compiler.error()", + [(int_nvvm_compiler_error Int32Regs:$a)]>; +def INT_NVVM_COMPILER_ERROR_64 : NVPTXInst<(outs), (ins Int64Regs:$a), + "// llvm.nvvm.compiler.error()", + [(int_nvvm_compiler_error Int64Regs:$a)]>; + + + +//===-- Old PTX Back-end Intrinsics ---------------------------------------===// + +// These intrinsics are handled to retain compatibility with the old backend. + +// PTX Special Purpose Register Accessor Intrinsics + +class PTX_READ_SPECIAL_REGISTER_R64<string regname, Intrinsic intop> + : NVPTXInst<(outs Int64Regs:$d), (ins), + !strconcat(!strconcat("mov.u64\t$d, %", regname), ";"), + [(set Int64Regs:$d, (intop))]>; + +class PTX_READ_SPECIAL_REGISTER_R32<string regname, Intrinsic intop> + : NVPTXInst<(outs Int32Regs:$d), (ins), + !strconcat(!strconcat("mov.u32\t$d, %", regname), ";"), + [(set Int32Regs:$d, (intop))]>; + +// TODO Add read vector-version of special registers + +def PTX_READ_TID_X : PTX_READ_SPECIAL_REGISTER_R32<"tid.x", + int_ptx_read_tid_x>; +def PTX_READ_TID_Y : PTX_READ_SPECIAL_REGISTER_R32<"tid.y", + int_ptx_read_tid_y>; +def PTX_READ_TID_Z : PTX_READ_SPECIAL_REGISTER_R32<"tid.z", + int_ptx_read_tid_z>; +def PTX_READ_TID_W : PTX_READ_SPECIAL_REGISTER_R32<"tid.w", + int_ptx_read_tid_w>; + +def PTX_READ_NTID_X : PTX_READ_SPECIAL_REGISTER_R32<"ntid.x", + int_ptx_read_ntid_x>; +def PTX_READ_NTID_Y : PTX_READ_SPECIAL_REGISTER_R32<"ntid.y", + int_ptx_read_ntid_y>; +def PTX_READ_NTID_Z : PTX_READ_SPECIAL_REGISTER_R32<"ntid.z", + int_ptx_read_ntid_z>; +def PTX_READ_NTID_W : PTX_READ_SPECIAL_REGISTER_R32<"ntid.w", + int_ptx_read_ntid_w>; + +def PTX_READ_LANEID : PTX_READ_SPECIAL_REGISTER_R32<"laneid", + int_ptx_read_laneid>; +def PTX_READ_WARPID : PTX_READ_SPECIAL_REGISTER_R32<"warpid", + int_ptx_read_warpid>; +def PTX_READ_NWARPID : PTX_READ_SPECIAL_REGISTER_R32<"nwarpid", + int_ptx_read_nwarpid>; + +def PTX_READ_CTAID_X : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.x", + int_ptx_read_ctaid_x>; +def PTX_READ_CTAID_Y : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.y", + int_ptx_read_ctaid_y>; +def PTX_READ_CTAID_Z : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.z", + int_ptx_read_ctaid_z>; +def PTX_READ_CTAID_W : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.w", + int_ptx_read_ctaid_w>; + +def PTX_READ_NCTAID_X : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.x", + int_ptx_read_nctaid_x>; +def PTX_READ_NCTAID_Y : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.y", + int_ptx_read_nctaid_y>; +def PTX_READ_NCTAID_Z : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.z", + int_ptx_read_nctaid_z>; +def PTX_READ_NCTAID_W : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.w", + int_ptx_read_nctaid_w>; + +def PTX_READ_SMID : PTX_READ_SPECIAL_REGISTER_R32<"smid", + int_ptx_read_smid>; +def PTX_READ_NSMID : 
PTX_READ_SPECIAL_REGISTER_R32<"nsmid", + int_ptx_read_nsmid>; +def PTX_READ_GRIDID : PTX_READ_SPECIAL_REGISTER_R32<"gridid", + int_ptx_read_gridid>; + +def PTX_READ_LANEMASK_EQ + : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_eq", int_ptx_read_lanemask_eq>; +def PTX_READ_LANEMASK_LE + : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_le", int_ptx_read_lanemask_le>; +def PTX_READ_LANEMASK_LT + : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_lt", int_ptx_read_lanemask_lt>; +def PTX_READ_LANEMASK_GE + : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_ge", int_ptx_read_lanemask_ge>; +def PTX_READ_LANEMASK_GT + : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_gt", int_ptx_read_lanemask_gt>; + +def PTX_READ_CLOCK + : PTX_READ_SPECIAL_REGISTER_R32<"clock", int_ptx_read_clock>; +def PTX_READ_CLOCK64 + : PTX_READ_SPECIAL_REGISTER_R64<"clock64", int_ptx_read_clock64>; + +def PTX_READ_PM0 : PTX_READ_SPECIAL_REGISTER_R32<"pm0", int_ptx_read_pm0>; +def PTX_READ_PM1 : PTX_READ_SPECIAL_REGISTER_R32<"pm1", int_ptx_read_pm1>; +def PTX_READ_PM2 : PTX_READ_SPECIAL_REGISTER_R32<"pm2", int_ptx_read_pm2>; +def PTX_READ_PM3 : PTX_READ_SPECIAL_REGISTER_R32<"pm3", int_ptx_read_pm3>; + +// PTX Parallel Synchronization and Communication Intrinsics + +def PTX_BAR_SYNC : NVPTXInst<(outs), (ins i32imm:$i), "bar.sync\t$i;", + [(int_ptx_bar_sync imm:$i)]>; diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp new file mode 100644 index 0000000..56b2372 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp @@ -0,0 +1,208 @@ +//===- NVPTXLowerAggrCopies.cpp - ------------------------------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// Lower aggregate copies, memset, memcpy, memmov intrinsics into loops when +// the size is large or is not a compile-time constant. +// +//===----------------------------------------------------------------------===// + +#include "NVPTXLowerAggrCopies.h" +#include "llvm/Constants.h" +#include "llvm/Function.h" +#include "llvm/IRBuilder.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Intrinsics.h" +#include "llvm/LLVMContext.h" +#include "llvm/Module.h" +#include "llvm/Support/InstIterator.h" +#include "llvm/Target/TargetData.h" + +using namespace llvm; + +namespace llvm { +FunctionPass *createLowerAggrCopies(); +} + +char NVPTXLowerAggrCopies::ID = 0; + +// Lower MemTransferInst or load-store pair to loop +static void convertTransferToLoop(Instruction *splitAt, Value *srcAddr, + Value *dstAddr, Value *len, + //unsigned numLoads, + bool srcVolatile, bool dstVolatile, + LLVMContext &Context, Function &F) { + Type *indType = len->getType(); + + BasicBlock *origBB = splitAt->getParent(); + BasicBlock *newBB = splitAt->getParent()->splitBasicBlock(splitAt, "split"); + BasicBlock *loopBB = BasicBlock::Create(Context, "loadstoreloop", &F, newBB); + + origBB->getTerminator()->setSuccessor(0, loopBB); + IRBuilder<> builder(origBB, origBB->getTerminator()); + + // srcAddr and dstAddr are expected to be pointer types, + // so no check is made here. 
+  unsigned srcAS = cast<PointerType>(srcAddr->getType())->getAddressSpace();
+  unsigned dstAS = cast<PointerType>(dstAddr->getType())->getAddressSpace();
+
+  // Cast pointers to (char *)
+  srcAddr = builder.CreateBitCast(srcAddr, Type::getInt8PtrTy(Context, srcAS));
+  dstAddr = builder.CreateBitCast(dstAddr, Type::getInt8PtrTy(Context, dstAS));
+
+  IRBuilder<> loop(loopBB);
+  // The loop index (ind) is a phi node.
+  PHINode *ind = loop.CreatePHI(indType, 0);
+  // Incoming value for ind is 0
+  ind->addIncoming(ConstantInt::get(indType, 0), origBB);
+
+  // load from srcAddr+ind
+  Value *val = loop.CreateLoad(loop.CreateGEP(srcAddr, ind), srcVolatile);
+  // store at dstAddr+ind
+  loop.CreateStore(val, loop.CreateGEP(dstAddr, ind), dstVolatile);
+
+  // The value for ind coming from backedge is (ind + 1)
+  Value *newind = loop.CreateAdd(ind, ConstantInt::get(indType, 1));
+  ind->addIncoming(newind, loopBB);
+
+  loop.CreateCondBr(loop.CreateICmpULT(newind, len), loopBB, newBB);
+}
+
+// Lower MemSetInst to loop
+static void convertMemSetToLoop(Instruction *splitAt, Value *dstAddr,
+                                Value *len, Value *val, LLVMContext &Context,
+                                Function &F) {
+  BasicBlock *origBB = splitAt->getParent();
+  BasicBlock *newBB = splitAt->getParent()->splitBasicBlock(splitAt, "split");
+  BasicBlock *loopBB = BasicBlock::Create(Context, "loadstoreloop", &F, newBB);
+
+  origBB->getTerminator()->setSuccessor(0, loopBB);
+  IRBuilder<> builder(origBB, origBB->getTerminator());
+
+  unsigned dstAS = cast<PointerType>(dstAddr->getType())->getAddressSpace();
+
+  // Cast pointer to the type of value getting stored
+  dstAddr = builder.CreateBitCast(dstAddr,
+                                  PointerType::get(val->getType(), dstAS));
+
+  IRBuilder<> loop(loopBB);
+  PHINode *ind = loop.CreatePHI(len->getType(), 0);
+  ind->addIncoming(ConstantInt::get(len->getType(), 0), origBB);
+
+  loop.CreateStore(val, loop.CreateGEP(dstAddr, ind), false);
+
+  Value *newind = loop.CreateAdd(ind, ConstantInt::get(len->getType(), 1));
+  ind->addIncoming(newind, loopBB);
+
+  loop.CreateCondBr(loop.CreateICmpULT(newind, len), loopBB, newBB);
+}
+
+bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
+  SmallVector<LoadInst *, 4> aggrLoads;
+  SmallVector<MemTransferInst *, 4> aggrMemcpys;
+  SmallVector<MemSetInst *, 4> aggrMemsets;
+
+  TargetData *TD = &getAnalysis<TargetData>();
+  LLVMContext &Context = F.getParent()->getContext();
+
+  //
+  // Collect all the aggrLoads, aggrMemcpys and aggrMemsets.
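+  // A load qualifies only when it is at least MaxAggrCopySize bytes wide,
+  // has a single use, and that use is a store of the loaded value;
+  // memcpy/memmove and memset calls qualify when their length is either
+  // non-constant or at least MaxAggrCopySize.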
+  //
+  //const BasicBlock *firstBB = &F.front();  // first BB in F
+  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
+    //BasicBlock *bb = BI;
+    for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;
+         ++II) {
+      if (LoadInst *load = dyn_cast<LoadInst>(II)) {
+
+        if (!load->hasOneUse()) continue;
+
+        if (TD->getTypeStoreSize(load->getType()) < MaxAggrCopySize) continue;
+
+        User *use = *(load->use_begin());
+        if (StoreInst *store = dyn_cast<StoreInst>(use)) {
+          if (store->getOperand(0) != load) //getValueOperand
+            continue;
+          aggrLoads.push_back(load);
+        }
+      } else if (MemTransferInst *intr = dyn_cast<MemTransferInst>(II)) {
+        Value *len = intr->getLength();
+        // If the number of elements being copied is at least
+        // MaxAggrCopySize, lower it to a loop
+        if (ConstantInt *len_int = dyn_cast<ConstantInt>(len)) {
+          if (len_int->getZExtValue() >= MaxAggrCopySize) {
+            aggrMemcpys.push_back(intr);
+          }
+        } else {
+          // turn variable length memcpy/memmove into loop
+          aggrMemcpys.push_back(intr);
+        }
+      } else if (MemSetInst *memsetintr = dyn_cast<MemSetInst>(II)) {
+        Value *len = memsetintr->getLength();
+        if (ConstantInt *len_int = dyn_cast<ConstantInt>(len)) {
+          if (len_int->getZExtValue() >= MaxAggrCopySize) {
+            aggrMemsets.push_back(memsetintr);
+          }
+        } else {
+          // turn variable length memset into loop
+          aggrMemsets.push_back(memsetintr);
+        }
+      }
+    }
+  }
+  if (aggrLoads.empty() && aggrMemcpys.empty() && aggrMemsets.empty())
+    return false;
+
+  //
+  // Do the transformation of an aggr load/copy/set to a loop
+  //
+  for (unsigned i = 0, e = aggrLoads.size(); i != e; ++i) {
+    LoadInst *load = aggrLoads[i];
+    StoreInst *store = dyn_cast<StoreInst>(*load->use_begin());
+    Value *srcAddr = load->getOperand(0);
+    Value *dstAddr = store->getOperand(1);
+    unsigned numLoads = TD->getTypeStoreSize(load->getType());
+    Value *len = ConstantInt::get(Type::getInt32Ty(Context), numLoads);
+
+    convertTransferToLoop(store, srcAddr, dstAddr, len, load->isVolatile(),
+                          store->isVolatile(), Context, F);
+
+    store->eraseFromParent();
+    load->eraseFromParent();
+  }
+
+  for (unsigned i = 0, e = aggrMemcpys.size(); i != e; ++i) {
+    MemTransferInst *cpy = aggrMemcpys[i];
+    Value *len = cpy->getLength();
+    // The LLVM 2.7 form of memcpy does not carry a volatile operand yet,
+    // so we optimistically treat every transfer as non-volatile to avoid
+    // emitting unnecessary st.volatile in the PTX output.
+    convertTransferToLoop(cpy, cpy->getSource(), cpy->getDest(), len, false,
+                          false, Context, F);
+    cpy->eraseFromParent();
+  }
+
+  for (unsigned i = 0, e = aggrMemsets.size(); i != e; ++i) {
+    MemSetInst *memsetinst = aggrMemsets[i];
+    Value *len = memsetinst->getLength();
+    Value *val = memsetinst->getValue();
+    convertMemSetToLoop(memsetinst, memsetinst->getDest(), len, val, Context,
+                        F);
+    memsetinst->eraseFromParent();
+  }
+
+  return true;
+}
+
+FunctionPass *llvm::createLowerAggrCopies() {
+  return new NVPTXLowerAggrCopies();
+}
diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.h b/lib/Target/NVPTX/NVPTXLowerAggrCopies.h new file mode 100644 index 0000000..ac7f150 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.h @@ -0,0 +1,47 @@
+//===-- llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the NVIDIA specific lowering of +// aggregate copies +// +//===----------------------------------------------------------------------===// + +#ifndef NVPTX_LOWER_AGGR_COPIES_H +#define NVPTX_LOWER_AGGR_COPIES_H + +#include "llvm/Pass.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/Target/TargetData.h" + +namespace llvm { + +// actual analysis class, which is a functionpass +struct NVPTXLowerAggrCopies : public FunctionPass { + static char ID; + + NVPTXLowerAggrCopies() : FunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetData>(); + AU.addPreserved<MachineFunctionAnalysis>(); + } + + virtual bool runOnFunction(Function &F); + + static const unsigned MaxAggrCopySize = 128; + + virtual const char *getPassName() const { + return "Lower aggregate copies/intrinsics into loops"; + } +}; + +extern FunctionPass *createLowerAggrCopies(); +} + +#endif diff --git a/lib/Target/PTX/PTXMachineFunctionInfo.cpp b/lib/Target/NVPTX/NVPTXNumRegisters.h index 60acfc7..b4a4dbc 100644 --- a/lib/Target/PTX/PTXMachineFunctionInfo.cpp +++ b/lib/Target/NVPTX/NVPTXNumRegisters.h @@ -1,4 +1,5 @@ -//===-- PTXMachineFuctionInfo.cpp - PTX machine function info -------------===// + +//===-- NVPTXNumRegisters.h - PTX Register Info ---------------------------===// // // The LLVM Compiler Infrastructure // @@ -7,8 +8,13 @@ // //===----------------------------------------------------------------------===// -#include "PTXMachineFunctionInfo.h" +#ifndef NVPTX_NUM_REGISTERS_H +#define NVPTX_NUM_REGISTERS_H + +namespace llvm { + +const unsigned NVPTXNumRegisters = 396; -using namespace llvm; +} -void PTXMachineFunctionInfo::anchor() { } +#endif diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp new file mode 100644 index 0000000..e3cd46f --- /dev/null +++ b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp @@ -0,0 +1,325 @@ +//===- NVPTXRegisterInfo.cpp - NVPTX Register Information -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the NVPTX implementation of the TargetRegisterInfo class. 
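+// It also provides helpers that map a register class to its PTX type string
+// (e.g. ".f32") and to its register-name prefix (e.g. "%f").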
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "nvptx-reg-info" + +#include "NVPTX.h" +#include "NVPTXRegisterInfo.h" +#include "NVPTXSubtarget.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/MC/MachineLocation.h" +#include "llvm/Target/TargetInstrInfo.h" + + +using namespace llvm; + +namespace llvm +{ +std::string getNVPTXRegClassName (TargetRegisterClass const *RC) { + if (RC == &NVPTX::Float32RegsRegClass) { + return ".f32"; + } + if (RC == &NVPTX::Float64RegsRegClass) { + return ".f64"; + } + else if (RC == &NVPTX::Int64RegsRegClass) { + return ".s64"; + } + else if (RC == &NVPTX::Int32RegsRegClass) { + return ".s32"; + } + else if (RC == &NVPTX::Int16RegsRegClass) { + return ".s16"; + } + // Int8Regs become 16-bit registers in PTX + else if (RC == &NVPTX::Int8RegsRegClass) { + return ".s16"; + } + else if (RC == &NVPTX::Int1RegsRegClass) { + return ".pred"; + } + else if (RC == &NVPTX::SpecialRegsRegClass) { + return "!Special!"; + } + else if (RC == &NVPTX::V2F32RegsRegClass) { + return ".v2.f32"; + } + else if (RC == &NVPTX::V4F32RegsRegClass) { + return ".v4.f32"; + } + else if (RC == &NVPTX::V2I32RegsRegClass) { + return ".v2.s32"; + } + else if (RC == &NVPTX::V4I32RegsRegClass) { + return ".v4.s32"; + } + else if (RC == &NVPTX::V2F64RegsRegClass) { + return ".v2.f64"; + } + else if (RC == &NVPTX::V2I64RegsRegClass) { + return ".v2.s64"; + } + else if (RC == &NVPTX::V2I16RegsRegClass) { + return ".v2.s16"; + } + else if (RC == &NVPTX::V4I16RegsRegClass) { + return ".v4.s16"; + } + else if (RC == &NVPTX::V2I8RegsRegClass) { + return ".v2.s16"; + } + else if (RC == &NVPTX::V4I8RegsRegClass) { + return ".v4.s16"; + } + else { + return "INTERNAL"; + } + return ""; +} + +std::string getNVPTXRegClassStr (TargetRegisterClass const *RC) { + if (RC == &NVPTX::Float32RegsRegClass) { + return "%f"; + } + if (RC == &NVPTX::Float64RegsRegClass) { + return "%fd"; + } + else if (RC == &NVPTX::Int64RegsRegClass) { + return "%rd"; + } + else if (RC == &NVPTX::Int32RegsRegClass) { + return "%r"; + } + else if (RC == &NVPTX::Int16RegsRegClass) { + return "%rs"; + } + else if (RC == &NVPTX::Int8RegsRegClass) { + return "%rc"; + } + else if (RC == &NVPTX::Int1RegsRegClass) { + return "%p"; + } + else if (RC == &NVPTX::SpecialRegsRegClass) { + return "!Special!"; + } + else if (RC == &NVPTX::V2F32RegsRegClass) { + return "%v2f"; + } + else if (RC == &NVPTX::V4F32RegsRegClass) { + return "%v4f"; + } + else if (RC == &NVPTX::V2I32RegsRegClass) { + return "%v2r"; + } + else if (RC == &NVPTX::V4I32RegsRegClass) { + return "%v4r"; + } + else if (RC == &NVPTX::V2F64RegsRegClass) { + return "%v2fd"; + } + else if (RC == &NVPTX::V2I64RegsRegClass) { + return "%v2rd"; + } + else if (RC == &NVPTX::V2I16RegsRegClass) { + return "%v2s"; + } + else if (RC == &NVPTX::V4I16RegsRegClass) { + return "%v4rs"; + } + else if (RC == &NVPTX::V2I8RegsRegClass) { + return "%v2rc"; + } + else if (RC == &NVPTX::V4I8RegsRegClass) { + return "%v4rc"; + } + else { + return "INTERNAL"; + } + return ""; +} + +bool isNVPTXVectorRegClass(TargetRegisterClass const *RC) { + if (RC->getID() == NVPTX::V2F32RegsRegClassID) + return true; + if (RC->getID() == NVPTX::V2F64RegsRegClassID) + return true; + if (RC->getID() == NVPTX::V2I16RegsRegClassID) + return true; + if (RC->getID() == NVPTX::V2I32RegsRegClassID) + return true; + if (RC->getID() 
== NVPTX::V2I64RegsRegClassID) + return true; + if (RC->getID() == NVPTX::V2I8RegsRegClassID) + return true; + if (RC->getID() == NVPTX::V4F32RegsRegClassID) + return true; + if (RC->getID() == NVPTX::V4I16RegsRegClassID) + return true; + if (RC->getID() == NVPTX::V4I32RegsRegClassID) + return true; + if (RC->getID() == NVPTX::V4I8RegsRegClassID) + return true; + return false; +} + +std::string getNVPTXElemClassName(TargetRegisterClass const *RC) { + if (RC->getID() == NVPTX::V2F32RegsRegClassID) + return getNVPTXRegClassName(&NVPTX::Float32RegsRegClass); + if (RC->getID() == NVPTX::V2F64RegsRegClassID) + return getNVPTXRegClassName(&NVPTX::Float64RegsRegClass); + if (RC->getID() == NVPTX::V2I16RegsRegClassID) + return getNVPTXRegClassName(&NVPTX::Int16RegsRegClass); + if (RC->getID() == NVPTX::V2I32RegsRegClassID) + return getNVPTXRegClassName(&NVPTX::Int32RegsRegClass); + if (RC->getID() == NVPTX::V2I64RegsRegClassID) + return getNVPTXRegClassName(&NVPTX::Int64RegsRegClass); + if (RC->getID() == NVPTX::V2I8RegsRegClassID) + return getNVPTXRegClassName(&NVPTX::Int8RegsRegClass); + if (RC->getID() == NVPTX::V4F32RegsRegClassID) + return getNVPTXRegClassName(&NVPTX::Float32RegsRegClass); + if (RC->getID() == NVPTX::V4I16RegsRegClassID) + return getNVPTXRegClassName(&NVPTX::Int16RegsRegClass); + if (RC->getID() == NVPTX::V4I32RegsRegClassID) + return getNVPTXRegClassName(&NVPTX::Int32RegsRegClass); + if (RC->getID() == NVPTX::V4I8RegsRegClassID) + return getNVPTXRegClassName(&NVPTX::Int8RegsRegClass); + llvm_unreachable("Not a vector register class"); +} + +const TargetRegisterClass *getNVPTXElemClass(TargetRegisterClass const *RC) { + if (RC->getID() == NVPTX::V2F32RegsRegClassID) + return (&NVPTX::Float32RegsRegClass); + if (RC->getID() == NVPTX::V2F64RegsRegClassID) + return (&NVPTX::Float64RegsRegClass); + if (RC->getID() == NVPTX::V2I16RegsRegClassID) + return (&NVPTX::Int16RegsRegClass); + if (RC->getID() == NVPTX::V2I32RegsRegClassID) + return (&NVPTX::Int32RegsRegClass); + if (RC->getID() == NVPTX::V2I64RegsRegClassID) + return (&NVPTX::Int64RegsRegClass); + if (RC->getID() == NVPTX::V2I8RegsRegClassID) + return (&NVPTX::Int8RegsRegClass); + if (RC->getID() == NVPTX::V4F32RegsRegClassID) + return (&NVPTX::Float32RegsRegClass); + if (RC->getID() == NVPTX::V4I16RegsRegClassID) + return (&NVPTX::Int16RegsRegClass); + if (RC->getID() == NVPTX::V4I32RegsRegClassID) + return (&NVPTX::Int32RegsRegClass); + if (RC->getID() == NVPTX::V4I8RegsRegClassID) + return (&NVPTX::Int8RegsRegClass); + llvm_unreachable("Not a vector register class"); +} + +int getNVPTXVectorSize(TargetRegisterClass const *RC) { + if (RC->getID() == NVPTX::V2F32RegsRegClassID) + return 2; + if (RC->getID() == NVPTX::V2F64RegsRegClassID) + return 2; + if (RC->getID() == NVPTX::V2I16RegsRegClassID) + return 2; + if (RC->getID() == NVPTX::V2I32RegsRegClassID) + return 2; + if (RC->getID() == NVPTX::V2I64RegsRegClassID) + return 2; + if (RC->getID() == NVPTX::V2I8RegsRegClassID) + return 2; + if (RC->getID() == NVPTX::V4F32RegsRegClassID) + return 4; + if (RC->getID() == NVPTX::V4I16RegsRegClassID) + return 4; + if (RC->getID() == NVPTX::V4I32RegsRegClassID) + return 4; + if (RC->getID() == NVPTX::V4I8RegsRegClassID) + return 4; + llvm_unreachable("Not a vector register class"); +} +} + +NVPTXRegisterInfo::NVPTXRegisterInfo(const TargetInstrInfo &tii, + const NVPTXSubtarget &st) + : NVPTXGenRegisterInfo(0), + Is64Bit(st.is64Bit()) {} + +#define GET_REGINFO_TARGET_DESC +#include "NVPTXGenRegisterInfo.inc" + +/// NVPTX Callee 
Saved Registers
+const uint16_t* NVPTXRegisterInfo::
+getCalleeSavedRegs(const MachineFunction *MF) const {
+  static const uint16_t CalleeSavedRegs[] = { 0 };
+  return CalleeSavedRegs;
+}
+
+// NVPTX Callee Saved Reg Classes
+const TargetRegisterClass* const*
+NVPTXRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+  static const TargetRegisterClass * const CalleeSavedRegClasses[] = { 0 };
+  return CalleeSavedRegClasses;
+}
+
+BitVector NVPTXRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  return Reserved;
+}
+
+void NVPTXRegisterInfo::
+eliminateFrameIndex(MachineBasicBlock::iterator II,
+                    int SPAdj,
+                    RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  unsigned i = 0;
+  MachineInstr &MI = *II;
+  while (!MI.getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI.getNumOperands() &&
+           "Instr doesn't have FrameIndex operand!");
+  }
+
+  int FrameIndex = MI.getOperand(i).getIndex();
+
+  MachineFunction &MF = *MI.getParent()->getParent();
+  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
+               MI.getOperand(i+1).getImm();
+
+  // Use VRFrame (printed as %SP) as the frame pointer
+  MI.getOperand(i).ChangeToRegister(NVPTX::VRFrame, false);
+  MI.getOperand(i+1).ChangeToImmediate(Offset);
+}
+
+
+int NVPTXRegisterInfo::
+getDwarfRegNum(unsigned RegNum, bool isEH) const {
+  return 0;
+}
+
+unsigned NVPTXRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  return NVPTX::VRFrame;
+}
+
+unsigned NVPTXRegisterInfo::getRARegister() const {
+  return 0;
+}
+
+// This function eliminates ADJCALLSTACKDOWN,
+// ADJCALLSTACKUP pseudo instructions
+void NVPTXRegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  // Simply discard ADJCALLSTACKDOWN,
+  // ADJCALLSTACKUP instructions.
+  MBB.erase(I);
+}
diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.h b/lib/Target/NVPTX/NVPTXRegisterInfo.h new file mode 100644 index 0000000..5951783 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXRegisterInfo.h @@ -0,0 +1,92 @@
+//===- NVPTXRegisterInfo.h - NVPTX Register Information Impl ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXREGISTERINFO_H
+#define NVPTXREGISTERINFO_H
+
+#include "ManagedStringPool.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+
+#define GET_REGINFO_HEADER
+#include "NVPTXGenRegisterInfo.inc"
+#include <sstream>
+
+namespace llvm {
+
+// Forward Declarations.
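+// (TargetInstrInfo and NVPTXSubtarget are referenced only by the constructor
+// signature below, so forward declarations are sufficient here.)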
+class TargetInstrInfo; +class NVPTXSubtarget; + +class NVPTXRegisterInfo : public NVPTXGenRegisterInfo { +private: + bool Is64Bit; + // Hold Strings that can be free'd all together with NVPTXRegisterInfo + ManagedStringPool ManagedStrPool; + +public: + NVPTXRegisterInfo(const TargetInstrInfo &tii, + const NVPTXSubtarget &st); + + + //------------------------------------------------------ + // Pure virtual functions from TargetRegisterInfo + //------------------------------------------------------ + + // NVPTX callee saved registers + virtual const uint16_t* + getCalleeSavedRegs(const MachineFunction *MF = 0) const; + + // NVPTX callee saved register classes + virtual const TargetRegisterClass* const * + getCalleeSavedRegClasses(const MachineFunction *MF) const; + + virtual BitVector getReservedRegs(const MachineFunction &MF) const; + + virtual void eliminateFrameIndex(MachineBasicBlock::iterator MI, + int SPAdj, + RegScavenger *RS=NULL) const; + + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + + virtual int getDwarfRegNum(unsigned RegNum, bool isEH) const; + virtual unsigned getFrameRegister(const MachineFunction &MF) const; + virtual unsigned getRARegister() const; + + ManagedStringPool *getStrPool() const { + return const_cast<ManagedStringPool *>(&ManagedStrPool); + } + + const char *getName(unsigned RegNo) const { + std::stringstream O; + O << "reg" << RegNo; + return getStrPool()->getManagedString(O.str().c_str())->c_str(); + } + +}; + + +std::string getNVPTXRegClassName (const TargetRegisterClass *RC); +std::string getNVPTXRegClassStr (const TargetRegisterClass *RC); +bool isNVPTXVectorRegClass (const TargetRegisterClass *RC); +std::string getNVPTXElemClassName (const TargetRegisterClass *RC); +int getNVPTXVectorSize (const TargetRegisterClass *RC); +const TargetRegisterClass *getNVPTXElemClass(const TargetRegisterClass *RC); + +} // end namespace llvm + + +#endif diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.td b/lib/Target/NVPTX/NVPTXRegisterInfo.td new file mode 100644 index 0000000..ba15825 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXRegisterInfo.td @@ -0,0 +1,108 @@ +//===-- NVPTXRegisterInfo.td - NVPTX Register defs ---------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Declarations that describe the PTX register file +//===----------------------------------------------------------------------===// + +class NVPTXReg<string n> : Register<n> { + let Namespace = "NVPTX"; +} + +class NVPTXRegClass<list<ValueType> regTypes, int alignment, dag regList> + : RegisterClass <"NVPTX", regTypes, alignment, regList>; + +//===----------------------------------------------------------------------===// +// Registers +//===----------------------------------------------------------------------===// + +// Special Registers used as stack pointer +def VRFrame : NVPTXReg<"%SP">; +def VRFrameLocal : NVPTXReg<"%SPL">; + +// Special Registers used as the stack +def VRDepot : NVPTXReg<"%Depot">; + +foreach i = 0-395 in { + def P#i : NVPTXReg<"%p"#i>; // Predicate + def RC#i : NVPTXReg<"%rc"#i>; // 8-bit + def RS#i : NVPTXReg<"%rs"#i>; // 16-bit + def R#i : NVPTXReg<"%r"#i>; // 32-bit + def RL#i : NVPTXReg<"%rl"#i>; // 64-bit + def F#i : NVPTXReg<"%f"#i>; // 32-bit float + def FL#i : NVPTXReg<"%fl"#i>; // 64-bit float + // Vectors + foreach s = [ "2b8", "2b16", "2b32", "2b64", "4b8", "4b16", "4b32" ] in + def v#s#_#i : NVPTXReg<"%v"#s#"_"#i>; + + // Arguments + def ia#i : NVPTXReg<"%ia"#i>; + def la#i : NVPTXReg<"%la"#i>; + def fa#i : NVPTXReg<"%fa"#i>; + def da#i : NVPTXReg<"%da"#i>; +} + +//===----------------------------------------------------------------------===// +// Register classes +//===----------------------------------------------------------------------===// +def Int1Regs : NVPTXRegClass<[i1], 8, (add (sequence "P%u", 0, 395))>; +def Int8Regs : NVPTXRegClass<[i8], 8, (add (sequence "RC%u", 0, 395))>; +def Int16Regs : NVPTXRegClass<[i16], 16, (add (sequence "RS%u", 0, 395))>; +def Int32Regs : NVPTXRegClass<[i32], 32, (add (sequence "R%u", 0, 395))>; +def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 395))>; +def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%u", 0, 395))>; +def Float64Regs : NVPTXRegClass<[f64], 64, (add (sequence "FL%u", 0, 395))>; +def Int32ArgRegs : NVPTXRegClass<[i32], 32, (add (sequence "ia%u", 0, 395))>; +def Int64ArgRegs : NVPTXRegClass<[i64], 64, (add (sequence "la%u", 0, 395))>; +def Float32ArgRegs : NVPTXRegClass<[f32], 32, (add (sequence "fa%u", 0, 395))>; +def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%u", 0, 395))>; + +// Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used. 
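+// (eliminateFrameIndex there rewrites a frame-index operand to VRFrame plus
+// an immediate offset, so stack accesses are addressed off %SP.)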
+def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot)>; + +class NVPTXVecRegClass<list<ValueType> regTypes, int alignment, dag regList, + NVPTXRegClass sClass, + int e, + string n> + : NVPTXRegClass<regTypes, alignment, regList> +{ + NVPTXRegClass scalarClass=sClass; + int elems=e; + string name=n; +} +def V2F32Regs + : NVPTXVecRegClass<[v2f32], 64, (add (sequence "v2b32_%u", 0, 395)), + Float32Regs, 2, ".v2.f32">; +def V4F32Regs + : NVPTXVecRegClass<[v4f32], 128, (add (sequence "v4b32_%u", 0, 395)), + Float32Regs, 4, ".v4.f32">; +def V2I32Regs + : NVPTXVecRegClass<[v2i32], 64, (add (sequence "v2b32_%u", 0, 395)), + Int32Regs, 2, ".v2.u32">; +def V4I32Regs + : NVPTXVecRegClass<[v4i32], 128, (add (sequence "v4b32_%u", 0, 395)), + Int32Regs, 4, ".v4.u32">; +def V2F64Regs + : NVPTXVecRegClass<[v2f64], 128, (add (sequence "v2b64_%u", 0, 395)), + Float64Regs, 2, ".v2.f64">; +def V2I64Regs + : NVPTXVecRegClass<[v2i64], 128, (add (sequence "v2b64_%u", 0, 395)), + Int64Regs, 2, ".v2.u64">; +def V2I16Regs + : NVPTXVecRegClass<[v2i16], 32, (add (sequence "v2b16_%u", 0, 395)), + Int16Regs, 2, ".v2.u16">; +def V4I16Regs + : NVPTXVecRegClass<[v4i16], 64, (add (sequence "v4b16_%u", 0, 395)), + Int16Regs, 4, ".v4.u16">; +def V2I8Regs + : NVPTXVecRegClass<[v2i8], 16, (add (sequence "v2b8_%u", 0, 395)), + Int8Regs, 2, ".v2.u8">; +def V4I8Regs + : NVPTXVecRegClass<[v4i8], 32, (add (sequence "v4b8_%u", 0, 395)), + Int8Regs, 4, ".v4.u8">; diff --git a/lib/Target/NVPTX/NVPTXSection.h b/lib/Target/NVPTX/NVPTXSection.h new file mode 100644 index 0000000..f1ca466 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXSection.h @@ -0,0 +1,45 @@ +//===- NVPTXSection.h - NVPTX-specific section representation -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the NVPTXSection class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_NVPTXSECTION_H +#define LLVM_NVPTXSECTION_H + +#include "llvm/MC/MCSection.h" +#include "llvm/GlobalVariable.h" +#include <vector> + +namespace llvm { +/// NVPTXSection - Represents a section in PTX +/// PTX does not have sections. We create this class in order to use +/// the ASMPrint interface. +/// +class NVPTXSection : public MCSection { + +public: + NVPTXSection(SectionVariant V, SectionKind K) : MCSection(V, K) {} + ~NVPTXSection() {} + + /// Override this as NVPTX has its own way of printing switching + /// to a section. + virtual void PrintSwitchToSection(const MCAsmInfo &MAI, + raw_ostream &OS) const {} + + /// Base address of PTX sections is zero. + virtual bool isBaseAddressKnownZero() const { return true; } + virtual bool UseCodeAlign() const { return false; } + virtual bool isVirtualSection() const { return false; } +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp b/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp new file mode 100644 index 0000000..2836cad --- /dev/null +++ b/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp @@ -0,0 +1,77 @@ +//===- NVPTXSplitBBatBar.cpp - Split BB at Barrier --*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+// Split basic blocks so that a basic block that contains a barrier instruction
+// only contains the barrier instruction.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Support/InstIterator.h"
+#include "NVPTXUtilities.h"
+#include "NVPTXSplitBBatBar.h"
+
+using namespace llvm;
+
+namespace llvm {
+FunctionPass *createSplitBBatBarPass();
+}
+
+char NVPTXSplitBBatBar::ID = 0;
+
+bool NVPTXSplitBBatBar::runOnFunction(Function &F) {
+
+  SmallVector<Instruction *, 4> SplitPoints;
+  bool changed = false;
+
+  // Collect all the split points in SplitPoints
+  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
+    BasicBlock::iterator IB = BI->begin();
+    BasicBlock::iterator II = IB;
+    BasicBlock::iterator IE = BI->end();
+
+    // Skip the first instruction. No splitting is needed at this
+    // point even if this is a bar.
+    while (II != IE) {
+      if (IntrinsicInst *inst = dyn_cast<IntrinsicInst>(II)) {
+        Intrinsic::ID id = inst->getIntrinsicID();
+        // If this is a barrier, split at this instruction
+        // and the next instruction.
+        if (llvm::isBarrierIntrinsic(id)) {
+          if (II != IB)
+            SplitPoints.push_back(II);
+          II++;
+          if ((II != IE) && (!II->isTerminator())) {
+            SplitPoints.push_back(II);
+            II++;
+          }
+          continue;
+        }
+      }
+      II++;
+    }
+  }
+
+  for (unsigned i = 0; i != SplitPoints.size(); i++) {
+    changed = true;
+    Instruction *inst = SplitPoints[i];
+    inst->getParent()->splitBasicBlock(inst, "bar_split");
+  }
+
+  return changed;
+}
+
+// This interface will most likely not be necessary, because this pass will
+// not be invoked by the driver, but will be used as a prerequisite to
+// another pass.
+FunctionPass *llvm::createSplitBBatBarPass() {
+  return new NVPTXSplitBBatBar();
+}
diff --git a/lib/Target/NVPTX/NVPTXSplitBBatBar.h b/lib/Target/NVPTX/NVPTXSplitBBatBar.h new file mode 100644 index 0000000..9e4d5a0 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXSplitBBatBar.h @@ -0,0 +1,41 @@
+//===-- llvm/lib/Target/NVPTX/NVPTXSplitBBatBar.h ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVIDIA-specific declarations for splitting basic
+// blocks at barrier instructions.
+// +//===----------------------------------------------------------------------===// + +#ifndef NVPTX_SPLIT_BB_AT_BAR_H +#define NVPTX_SPLIT_BB_AT_BAR_H + +#include "llvm/Pass.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" + +namespace llvm { + +// actual analysis class, which is a functionpass +struct NVPTXSplitBBatBar : public FunctionPass { + static char ID; + + NVPTXSplitBBatBar() : FunctionPass(ID) {} + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved<MachineFunctionAnalysis>(); + } + virtual bool runOnFunction(Function &F); + + virtual const char *getPassName() const { + return "Split basic blocks at barrier"; + } +}; + +extern FunctionPass *createSplitBBatBarPass(); +} + +#endif //NVPTX_SPLIT_BB_AT_BAR_H diff --git a/lib/Target/NVPTX/NVPTXSubtarget.cpp b/lib/Target/NVPTX/NVPTXSubtarget.cpp new file mode 100644 index 0000000..6aadd43 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXSubtarget.cpp @@ -0,0 +1,57 @@ +//===- NVPTXSubtarget.cpp - NVPTX Subtarget Information -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the NVPTX specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#include "NVPTXSubtarget.h" +#define GET_SUBTARGETINFO_ENUM +#define GET_SUBTARGETINFO_TARGET_DESC +#define GET_SUBTARGETINFO_CTOR +#include "NVPTXGenSubtargetInfo.inc" + +using namespace llvm; + +// Select Driver Interface +#include "llvm/Support/CommandLine.h" +namespace { +cl::opt<NVPTX::DrvInterface> +DriverInterface(cl::desc("Choose driver interface:"), + cl::values( + clEnumValN(NVPTX::NVCL, "drvnvcl", "Nvidia OpenCL driver"), + clEnumValN(NVPTX::CUDA, "drvcuda", "Nvidia CUDA driver"), + clEnumValN(NVPTX::TEST, "drvtest", "Plain Test"), + clEnumValEnd), + cl::init(NVPTX::NVCL)); +} + +NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU, + const std::string &FS, bool is64Bit) +:NVPTXGenSubtargetInfo(TT, "", FS), // Don't pass CPU to subtarget, + // because we don't register all + // nvptx targets. + Is64Bit(is64Bit) { + + drvInterface = DriverInterface; + + // Provide the default CPU if none + std::string defCPU = "sm_10"; + + // Get the TargetName from the FS if available + if (FS.empty() && CPU.empty()) + TargetName = defCPU; + else if (!CPU.empty()) + TargetName = CPU; + else + llvm_unreachable("we are not using FeatureStr"); + + // Set up the SmVersion + SmVersion = atoi(TargetName.c_str()+3); +} diff --git a/lib/Target/NVPTX/NVPTXSubtarget.h b/lib/Target/NVPTX/NVPTXSubtarget.h new file mode 100644 index 0000000..8f2a629 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXSubtarget.h @@ -0,0 +1,92 @@ +//=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the NVPTX specific subclass of TargetSubtarget. 
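+// The SM version parsed from the CPU name (e.g. "sm_20" -> 20) drives all of
+// the feature predicates below; hasLDU() and hasGenericLdSt(), for example,
+// both require sm_20 or later.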
+// +//===----------------------------------------------------------------------===// + +#ifndef NVPTXSUBTARGET_H +#define NVPTXSUBTARGET_H + +#include "llvm/Target/TargetSubtargetInfo.h" +#include "NVPTX.h" + +#define GET_SUBTARGETINFO_HEADER +#include "NVPTXGenSubtargetInfo.inc" + +#include <string> + +namespace llvm { + +class NVPTXSubtarget : public NVPTXGenSubtargetInfo { + + unsigned int SmVersion; + std::string TargetName; + NVPTX::DrvInterface drvInterface; + bool dummy; // For the 'dummy' feature, see NVPTX.td + bool Is64Bit; + +public: + /// This constructor initializes the data members to match that + /// of the specified module. + /// + NVPTXSubtarget(const std::string &TT, const std::string &CPU, + const std::string &FS, bool is64Bit); + + bool hasBrkPt() const { return SmVersion >= 11; } + bool hasAtomRedG32() const { return SmVersion >= 11; } + bool hasAtomRedS32() const { return SmVersion >= 12; } + bool hasAtomRedG64() const { return SmVersion >= 12; } + bool hasAtomRedS64() const { return SmVersion >= 20; } + bool hasAtomRedGen32() const { return SmVersion >= 20; } + bool hasAtomRedGen64() const { return SmVersion >= 20; } + bool hasAtomAddF32() const { return SmVersion >= 20; } + bool hasVote() const { return SmVersion >= 12; } + bool hasDouble() const { return SmVersion >= 13; } + bool reqPTX20() const { return SmVersion >= 20; } + bool hasF32FTZ() const { return SmVersion >= 20; } + bool hasFMAF32() const { return SmVersion >= 20; } + bool hasFMAF64() const { return SmVersion >= 13; } + bool hasLDU() const { return SmVersion >= 20; } + bool hasGenericLdSt() const { return SmVersion >= 20; } + inline bool hasHWROT32() const { return false; } + inline bool hasSWROT32() const { + return true; + } + inline bool hasROT32() const { return hasHWROT32() || hasSWROT32() ; } + inline bool hasROT64() const { return SmVersion >= 20; } + + + bool is64Bit() const { return Is64Bit; } + + unsigned int getSmVersion() const { return SmVersion; } + NVPTX::DrvInterface getDrvInterface() const { return drvInterface; } + std::string getTargetName() const { return TargetName; } + + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + + std::string getDataLayout() const { + const char *p; + if (is64Bit()) + p = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-" + "f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-" + "n16:32:64"; + else + p = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-" + "f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-" + "n16:32:64"; + + return std::string(p); + } + +}; + +} // End llvm namespace + +#endif // NVPTXSUBTARGET_H diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp new file mode 100644 index 0000000..433f415 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -0,0 +1,133 @@ +//===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Top-level implementation for the NVPTX target. 
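+// It registers the 32-bit and 64-bit target machines and schedules the
+// NVPTX-specific IR passes (aggregate-copy lowering, splitting basic blocks
+// at barriers, alloca hoisting) to run ahead of instruction selection.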
+// +//===----------------------------------------------------------------------===// + +#include "NVPTXTargetMachine.h" +#include "NVPTX.h" +#include "NVPTXSplitBBatBar.h" +#include "NVPTXLowerAggrCopies.h" +#include "MCTargetDesc/NVPTXMCAsmInfo.h" +#include "NVPTXAllocaHoisting.h" +#include "llvm/PassManager.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/Verifier.h" +#include "llvm/Assembly/PrintModulePass.h" +#include "llvm/ADT/OwningPtr.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Support/TargetRegistry.h" + + +using namespace llvm; + + +extern "C" void LLVMInitializeNVPTXTarget() { + // Register the target. + RegisterTargetMachine<NVPTXTargetMachine32> X(TheNVPTXTarget32); + RegisterTargetMachine<NVPTXTargetMachine64> Y(TheNVPTXTarget64); + + RegisterMCAsmInfo<NVPTXMCAsmInfo> A(TheNVPTXTarget32); + RegisterMCAsmInfo<NVPTXMCAsmInfo> B(TheNVPTXTarget64); + +} + +NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, + StringRef TT, + StringRef CPU, + StringRef FS, + const TargetOptions& Options, + Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OL, + bool is64bit) +: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + Subtarget(TT, CPU, FS, is64bit), + DataLayout(Subtarget.getDataLayout()), + InstrInfo(*this), TLInfo(*this), TSInfo(*this), FrameLowering(*this,is64bit) +/*FrameInfo(TargetFrameInfo::StackGrowsUp, 8, 0)*/ { +} + + + +void NVPTXTargetMachine32::anchor() {} + +NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) +: NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) { +} + +void NVPTXTargetMachine64::anchor() {} + +NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) +: NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) { +} + + +namespace llvm { +class NVPTXPassConfig : public TargetPassConfig { +public: + NVPTXPassConfig(NVPTXTargetMachine *TM, PassManagerBase &PM) + : TargetPassConfig(TM, PM) {} + + NVPTXTargetMachine &getNVPTXTargetMachine() const { + return getTM<NVPTXTargetMachine>(); + } + + virtual bool addInstSelector(); + virtual bool addPreRegAlloc(); +}; +} + +TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) { + NVPTXPassConfig *PassConfig = new NVPTXPassConfig(this, PM); + return PassConfig; +} + +bool NVPTXPassConfig::addInstSelector() { + addPass(createLowerAggrCopies()); + addPass(createSplitBBatBarPass()); + addPass(createAllocaHoisting()); + 
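+  // Note the ordering: the three IR-level passes above must run before the
+  // selector created below, so that instruction selection sees the
+  // normalized form.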
addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel())); + addPass(createVectorElementizePass(getNVPTXTargetMachine())); + return false; +} + +bool NVPTXPassConfig::addPreRegAlloc() { + return false; +} diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h new file mode 100644 index 0000000..b3f9cac --- /dev/null +++ b/lib/Target/NVPTX/NVPTXTargetMachine.h @@ -0,0 +1,125 @@ +//===-- NVPTXTargetMachine.h - Define TargetMachine for NVPTX ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the NVPTX specific subclass of TargetMachine. +// +//===----------------------------------------------------------------------===// + + +#ifndef NVPTX_TARGETMACHINE_H +#define NVPTX_TARGETMACHINE_H + +#include "NVPTXInstrInfo.h" +#include "NVPTXISelLowering.h" +#include "NVPTXRegisterInfo.h" +#include "NVPTXSubtarget.h" +#include "NVPTXFrameLowering.h" +#include "ManagedStringPool.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetSelectionDAGInfo.h" + +namespace llvm { + +/// NVPTXTargetMachine +/// +class NVPTXTargetMachine : public LLVMTargetMachine { + NVPTXSubtarget Subtarget; + const TargetData DataLayout; // Calculates type size & alignment + NVPTXInstrInfo InstrInfo; + NVPTXTargetLowering TLInfo; + TargetSelectionDAGInfo TSInfo; + + // NVPTX does not have any call stack frame, but need a NVPTX specific + // FrameLowering class because TargetFrameLowering is abstract. + NVPTXFrameLowering FrameLowering; + + // Hold Strings that can be free'd all together with NVPTXTargetMachine + ManagedStringPool ManagedStrPool; + + //bool addCommonCodeGenPasses(PassManagerBase &, CodeGenOpt::Level, + // bool DisableVerify, MCContext *&OutCtx); + +public: + NVPTXTargetMachine(const Target &T, StringRef TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OP, + bool is64bit); + + virtual const TargetFrameLowering *getFrameLowering() const { + return &FrameLowering; + } + virtual const NVPTXInstrInfo *getInstrInfo() const { return &InstrInfo; } + virtual const TargetData *getTargetData() const { return &DataLayout;} + virtual const NVPTXSubtarget *getSubtargetImpl() const { return &Subtarget;} + + virtual const NVPTXRegisterInfo *getRegisterInfo() const { + return &(InstrInfo.getRegisterInfo()); + } + + virtual NVPTXTargetLowering *getTargetLowering() const { + return const_cast<NVPTXTargetLowering*>(&TLInfo); + } + + virtual const TargetSelectionDAGInfo *getSelectionDAGInfo() const { + return &TSInfo; + } + + //virtual bool addInstSelector(PassManagerBase &PM, + // CodeGenOpt::Level OptLevel); + + //virtual bool addPreRegAlloc(PassManagerBase &, CodeGenOpt::Level); + + ManagedStringPool *getManagedStrPool() const { + return const_cast<ManagedStringPool*>(&ManagedStrPool); + } + + virtual TargetPassConfig *createPassConfig(PassManagerBase &PM); + + // Emission of machine code through JITCodeEmitter is not supported. + virtual bool addPassesToEmitMachineCode(PassManagerBase &, + JITCodeEmitter &, + bool = true) { + return true; + } + + // Emission of machine code through MCJIT is not supported. 
+  virtual bool addPassesToEmitMC(PassManagerBase &,
+                                 MCContext *&,
+                                 raw_ostream &,
+                                 bool = true) {
+    return true;
+  }
+
+}; // NVPTXTargetMachine.
+
+class NVPTXTargetMachine32 : public NVPTXTargetMachine {
+  virtual void anchor();
+public:
+  NVPTXTargetMachine32(const Target &T, StringRef TT, StringRef CPU,
+                       StringRef FS, const TargetOptions &Options,
+                       Reloc::Model RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+};
+
+class NVPTXTargetMachine64 : public NVPTXTargetMachine {
+  virtual void anchor();
+public:
+  NVPTXTargetMachine64(const Target &T, StringRef TT, StringRef CPU,
+                       StringRef FS, const TargetOptions &Options,
+                       Reloc::Model RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/lib/Target/NVPTX/NVPTXTargetObjectFile.h
new file mode 100644
index 0000000..b5698a2
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXTargetObjectFile.h
@@ -0,0 +1,105 @@
+//===-- NVPTXTargetObjectFile.h - NVPTX Object Info -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_NVPTX_TARGETOBJECTFILE_H
+#define LLVM_TARGET_NVPTX_TARGETOBJECTFILE_H
+
+#include "NVPTXSection.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include <string>
+
+namespace llvm {
+class GlobalVariable;
+class Module;
+
+class NVPTXTargetObjectFile : public TargetLoweringObjectFile {
+
+public:
+  NVPTXTargetObjectFile() {}
+  ~NVPTXTargetObjectFile() {
+    delete TextSection;
+    delete DataSection;
+    delete BSSSection;
+    delete ReadOnlySection;
+
+    delete StaticCtorSection;
+    delete StaticDtorSection;
+    delete LSDASection;
+    delete EHFrameSection;
+    delete DwarfAbbrevSection;
+    delete DwarfInfoSection;
+    delete DwarfLineSection;
+    delete DwarfFrameSection;
+    delete DwarfPubTypesSection;
+    delete DwarfDebugInlineSection;
+    delete DwarfStrSection;
+    delete DwarfLocSection;
+    delete DwarfARangesSection;
+    delete DwarfRangesSection;
+    delete DwarfMacroInfoSection;
+  }
+
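+  // Initialize() below replaces the base class's usual object-file sections
+  // with dummy NVPTXSection placeholders: PTX output is plain text with no
+  // real sections, but common AsmPrinter code still expects these pointers
+  // to be non-null.  The destructor above then frees the placeholders.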
+  virtual void Initialize(MCContext &ctx, const TargetMachine &TM) {
+    TextSection = new NVPTXSection(MCSection::SV_ELF,
+                                   SectionKind::getText());
+    DataSection = new NVPTXSection(MCSection::SV_ELF,
+                                   SectionKind::getDataRel());
+    BSSSection = new NVPTXSection(MCSection::SV_ELF,
+                                  SectionKind::getBSS());
+    ReadOnlySection = new NVPTXSection(MCSection::SV_ELF,
+                                       SectionKind::getReadOnly());
+
+    StaticCtorSection = new NVPTXSection(MCSection::SV_ELF,
+                                         SectionKind::getMetadata());
+    StaticDtorSection = new NVPTXSection(MCSection::SV_ELF,
+                                         SectionKind::getMetadata());
+    LSDASection = new NVPTXSection(MCSection::SV_ELF,
+                                   SectionKind::getMetadata());
+    EHFrameSection = new NVPTXSection(MCSection::SV_ELF,
+                                      SectionKind::getMetadata());
+    DwarfAbbrevSection = new NVPTXSection(MCSection::SV_ELF,
+                                          SectionKind::getMetadata());
+    DwarfInfoSection = new NVPTXSection(MCSection::SV_ELF,
+                                        SectionKind::getMetadata());
+    DwarfLineSection = new NVPTXSection(MCSection::SV_ELF,
+                                        SectionKind::getMetadata());
+    DwarfFrameSection = new NVPTXSection(MCSection::SV_ELF,
+                                         SectionKind::getMetadata());
+    DwarfPubTypesSection = new NVPTXSection(MCSection::SV_ELF,
+                                            SectionKind::getMetadata());
+    DwarfDebugInlineSection = new NVPTXSection(MCSection::SV_ELF,
+                                               SectionKind::getMetadata());
+    DwarfStrSection = new NVPTXSection(MCSection::SV_ELF,
+                                       SectionKind::getMetadata());
+    DwarfLocSection = new NVPTXSection(MCSection::SV_ELF,
+                                       SectionKind::getMetadata());
+    DwarfARangesSection = new NVPTXSection(MCSection::SV_ELF,
+                                           SectionKind::getMetadata());
+    DwarfRangesSection = new NVPTXSection(MCSection::SV_ELF,
+                                          SectionKind::getMetadata());
+    DwarfMacroInfoSection = new NVPTXSection(MCSection::SV_ELF,
+                                             SectionKind::getMetadata());
+  }
+
+  virtual const MCSection *getSectionForConstant(SectionKind Kind) const {
+    return ReadOnlySection;
+  }
+
+  virtual const MCSection *
+  getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind,
+                           Mangler *Mang,
+                           const TargetMachine &TM) const {
+    return DataSection;
+  }
+
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/NVPTX/NVPTXUtilities.cpp b/lib/Target/NVPTX/NVPTXUtilities.cpp
new file mode 100644
index 0000000..3f52251
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -0,0 +1,514 @@
+//===- NVPTXUtilities.cpp - Utility Functions -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains miscellaneous utility functions.
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXUtilities.h"
+#include "NVPTX.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Function.h"
+#include "llvm/Module.h"
+#include "llvm/Constants.h"
+#include "llvm/Operator.h"
+#include <algorithm>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+//#include <iostream>
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/InstIterator.h"
+
+using namespace llvm;
+
+typedef std::map<std::string, std::vector<unsigned> > key_val_pair_t;
+typedef std::map<const GlobalValue *, key_val_pair_t> global_val_annot_t;
+typedef std::map<const Module *, global_val_annot_t> per_module_annot_t;
+
+ManagedStatic<per_module_annot_t> annotationCache;
+
+static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) {
+  assert(md && "Invalid mdnode for annotation");
+  assert((md->getNumOperands() % 2) == 1 && "Invalid number of operands");
+  // start index = 1, to skip the global variable key
+  // increment = 2, to skip the value of each property-value pair
+  for (unsigned i = 1, e = md->getNumOperands(); i != e; i += 2) {
+    // property
+    const MDString *prop = dyn_cast<MDString>(md->getOperand(i));
+    assert(prop && "Annotation property not a string");
+
+    // value
+    ConstantInt *Val = dyn_cast<ConstantInt>(md->getOperand(i+1));
+    assert(Val && "Value operand not a constant int");
+
+    std::string keyname = prop->getString().str();
+    if (retval.find(keyname) != retval.end())
+      retval[keyname].push_back(Val->getZExtValue());
+    else {
+      std::vector<unsigned> tmp;
+      tmp.push_back(Val->getZExtValue());
+      retval[keyname] = tmp;
+    }
+  }
+}
+
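+// For reference, an nvvm.annotations entry has the shape decoded above: the
+// annotated entity first, then property/value pairs.  A hand-written sketch
+// (the kernel name is made up):
+//
+//   !nvvm.annotations = !{!0}
+//   !0 = metadata !{void (float*)* @my_kernel, metadata !"kernel", i32 1}
+//
+// For @my_kernel this would be cached as {"kernel" -> [1]}.
+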
+static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) {
+  NamedMDNode *NMD = m->getNamedMetadata(llvm::NamedMDForAnnotations);
+  if (!NMD)
+    return;
+  key_val_pair_t tmp;
+  for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
+    const MDNode *elem = NMD->getOperand(i);
+
+    Value *entity = elem->getOperand(0);
+    // entity may be null due to DCE
+    if (!entity)
+      continue;
+    if (entity != gv)
+      continue;
+
+    // accumulate annotations for entity in tmp
+    cacheAnnotationFromMD(elem, tmp);
+  }
+
+  if (tmp.empty())   // no annotations for this gv
+    return;
+
+  if ((*annotationCache).find(m) != (*annotationCache).end())
+    (*annotationCache)[m][gv] = tmp;
+  else {
+    global_val_annot_t tmp1;
+    tmp1[gv] = tmp;
+    (*annotationCache)[m] = tmp1;
+  }
+}
+
+bool llvm::findOneNVVMAnnotation(const GlobalValue *gv, std::string prop,
+                                 unsigned &retval) {
+  const Module *m = gv->getParent();
+  if ((*annotationCache).find(m) == (*annotationCache).end())
+    cacheAnnotationFromMD(m, gv);
+  else if ((*annotationCache)[m].find(gv) == (*annotationCache)[m].end())
+    cacheAnnotationFromMD(m, gv);
+  if ((*annotationCache)[m][gv].find(prop) == (*annotationCache)[m][gv].end())
+    return false;
+  retval = (*annotationCache)[m][gv][prop][0];
+  return true;
+}
+
+bool llvm::findAllNVVMAnnotation(const GlobalValue *gv, std::string prop,
+                                 std::vector<unsigned> &retval) {
+  const Module *m = gv->getParent();
+  if ((*annotationCache).find(m) == (*annotationCache).end())
+    cacheAnnotationFromMD(m, gv);
+  else if ((*annotationCache)[m].find(gv) == (*annotationCache)[m].end())
+    cacheAnnotationFromMD(m, gv);
+  if ((*annotationCache)[m][gv].find(prop) == (*annotationCache)[m][gv].end())
+    return false;
+  retval = (*annotationCache)[m][gv][prop];
+  return true;
+}
+
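+// A minimal usage sketch for the two queries above (hypothetical caller; the
+// real callers below obtain the strings via llvm::PropertyAnnotationNames):
+//
+//   unsigned MaxX = 0;
+//   if (llvm::findOneNVVMAnnotation(F, "maxntidx", MaxX))
+//     ;  // use the first recorded value
+//
+//   std::vector<unsigned> Aligns;
+//   if (llvm::findAllNVVMAnnotation(F, "align", Aligns))
+//     ;  // scan every recorded value
+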
+bool llvm::isTexture(const llvm::Value &val) {
+  if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
+    unsigned annot;
+    if (llvm::findOneNVVMAnnotation(gv,
+                   llvm::PropertyAnnotationNames[llvm::PROPERTY_ISTEXTURE],
+                                    annot)) {
+      assert((annot == 1) && "Unexpected annotation on a texture symbol");
+      return true;
+    }
+  }
+  return false;
+}
+
+bool llvm::isSurface(const llvm::Value &val) {
+  if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
+    unsigned annot;
+    if (llvm::findOneNVVMAnnotation(gv,
+                   llvm::PropertyAnnotationNames[llvm::PROPERTY_ISSURFACE],
+                                    annot)) {
+      assert((annot == 1) && "Unexpected annotation on a surface symbol");
+      return true;
+    }
+  }
+  return false;
+}
+
+bool llvm::isSampler(const llvm::Value &val) {
+  if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
+    unsigned annot;
+    if (llvm::findOneNVVMAnnotation(gv,
+                   llvm::PropertyAnnotationNames[llvm::PROPERTY_ISSAMPLER],
+                                    annot)) {
+      assert((annot == 1) && "Unexpected annotation on a sampler symbol");
+      return true;
+    }
+  }
+  if (const Argument *arg = dyn_cast<Argument>(&val)) {
+    const Function *func = arg->getParent();
+    std::vector<unsigned> annot;
+    if (llvm::findAllNVVMAnnotation(func,
+                   llvm::PropertyAnnotationNames[llvm::PROPERTY_ISSAMPLER],
+                                    annot)) {
+      if (std::find(annot.begin(), annot.end(), arg->getArgNo()) != annot.end())
+        return true;
+    }
+  }
+  return false;
+}
+
+bool llvm::isImageReadOnly(const llvm::Value &val) {
+  if (const Argument *arg = dyn_cast<Argument>(&val)) {
+    const Function *func = arg->getParent();
+    std::vector<unsigned> annot;
+    if (llvm::findAllNVVMAnnotation(func,
+          llvm::PropertyAnnotationNames[llvm::PROPERTY_ISREADONLY_IMAGE_PARAM],
+                                    annot)) {
+      if (std::find(annot.begin(), annot.end(), arg->getArgNo()) != annot.end())
+        return true;
+    }
+  }
+  return false;
+}
+
+bool llvm::isImageWriteOnly(const llvm::Value &val) {
+  if (const Argument *arg = dyn_cast<Argument>(&val)) {
+    const Function *func = arg->getParent();
+    std::vector<unsigned> annot;
+    if (llvm::findAllNVVMAnnotation(func,
+          llvm::PropertyAnnotationNames[llvm::PROPERTY_ISWRITEONLY_IMAGE_PARAM],
+                                    annot)) {
+      if (std::find(annot.begin(), annot.end(), arg->getArgNo()) != annot.end())
+        return true;
+    }
+  }
+  return false;
+}
+
+bool llvm::isImage(const llvm::Value &val) {
+  return llvm::isImageReadOnly(val) || llvm::isImageWriteOnly(val);
+}
+
+std::string llvm::getTextureName(const llvm::Value &val) {
+  assert(val.hasName() && "Found texture variable with no name");
+  return val.getName();
+}
+
+std::string llvm::getSurfaceName(const llvm::Value &val) {
+  assert(val.hasName() && "Found surface variable with no name");
+  return val.getName();
+}
+
+std::string llvm::getSamplerName(const llvm::Value &val) {
+  assert(val.hasName() && "Found sampler variable with no name");
+  return val.getName();
+}
+
+bool llvm::getMaxNTIDx(const Function &F, unsigned &x) {
+  return (llvm::findOneNVVMAnnotation(&F,
+              llvm::PropertyAnnotationNames[llvm::PROPERTY_MAXNTID_X], x));
+}
+
+bool llvm::getMaxNTIDy(const Function &F, unsigned &y) {
+  return (llvm::findOneNVVMAnnotation(&F,
+              llvm::PropertyAnnotationNames[llvm::PROPERTY_MAXNTID_Y], y));
+}
+
+bool llvm::getMaxNTIDz(const Function &F, unsigned &z) {
+  return (llvm::findOneNVVMAnnotation(&F,
+              llvm::PropertyAnnotationNames[llvm::PROPERTY_MAXNTID_Z], z));
+}
+
+bool llvm::getReqNTIDx(const Function &F, unsigned &x) {
+  return (llvm::findOneNVVMAnnotation(&F,
+              llvm::PropertyAnnotationNames[llvm::PROPERTY_REQNTID_X], x));
+}
+
+bool llvm::getReqNTIDy(const Function &F, unsigned &y) {
+  return (llvm::findOneNVVMAnnotation(&F,
+              llvm::PropertyAnnotationNames[llvm::PROPERTY_REQNTID_Y], y));
+}
+
+bool llvm::getReqNTIDz(const Function &F, unsigned &z) {
+  return (llvm::findOneNVVMAnnotation(&F,
+              llvm::PropertyAnnotationNames[llvm::PROPERTY_REQNTID_Z], z));
+}
+
+bool llvm::getMinCTASm(const Function &F, unsigned &x) {
+  return (llvm::findOneNVVMAnnotation(&F,
+              llvm::PropertyAnnotationNames[llvm::PROPERTY_MINNCTAPERSM], x));
+}
+
+bool llvm::isKernelFunction(const Function &F) {
+  unsigned x = 0;
+  bool retval = llvm::findOneNVVMAnnotation(&F,
+          llvm::PropertyAnnotationNames[llvm::PROPERTY_ISKERNEL_FUNCTION], x);
+  if (!retval) {
+    // There is no NVVM metadata, check the calling convention
+    if (F.getCallingConv() == llvm::CallingConv::PTX_Kernel)
+      return true;
+    else
+      return false;
+  }
+  return (x == 1);
+}
+
+bool llvm::getAlign(const Function &F, unsigned index, unsigned &align) {
+  std::vector<unsigned> Vs;
+  bool retval = llvm::findAllNVVMAnnotation(&F,
+                    llvm::PropertyAnnotationNames[llvm::PROPERTY_ALIGN], Vs);
+  if (!retval)
+    return false;
+  for (int i = 0, e = Vs.size(); i < e; i++) {
+    unsigned v = Vs[i];
+    if ((v >> 16) == index) {
+      align = v & 0xFFFF;
+      return true;
+    }
+  }
+  return false;
+}
+
+bool llvm::getAlign(const CallInst &I, unsigned index, unsigned &align) {
+  if (MDNode *alignNode = I.getMetadata("callalign")) {
+    for (int i = 0, n = alignNode->getNumOperands(); i < n; i++) {
+      if (const ConstantInt *CI =
+              dyn_cast<ConstantInt>(alignNode->getOperand(i))) {
+        unsigned v = CI->getZExtValue();
+        if ((v >> 16) == index) {
+          align = v & 0xFFFF;
+          return true;
+        }
+        if ((v >> 16) > index) {
+          return false;
+        }
+      }
+    }
+  }
+  return false;
+}
+
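+// Note on the packed encoding decoded by the two getAlign() overloads above:
+// each annotation value is (index << 16) | alignment.  For example, the
+// value 0x00020008 encodes index 2 with alignment 8, so getAlign(F, 2, A)
+// sets A = 8 when it finds that entry.
+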
+bool llvm::isBarrierIntrinsic(Intrinsic::ID id) {
+  if ((id == Intrinsic::nvvm_barrier0) ||
+      (id == Intrinsic::nvvm_barrier0_popc) ||
+      (id == Intrinsic::nvvm_barrier0_and) ||
+      (id == Intrinsic::nvvm_barrier0_or) ||
+      (id == Intrinsic::cuda_syncthreads))
+    return true;
+  return false;
+}
+
+// Interface for checking all memory space transfer related intrinsics
+bool llvm::isMemorySpaceTransferIntrinsic(Intrinsic::ID id) {
+  if (id == Intrinsic::nvvm_ptr_local_to_gen ||
+      id == Intrinsic::nvvm_ptr_shared_to_gen ||
+      id == Intrinsic::nvvm_ptr_global_to_gen ||
+      id == Intrinsic::nvvm_ptr_constant_to_gen ||
+      id == Intrinsic::nvvm_ptr_gen_to_global ||
+      id == Intrinsic::nvvm_ptr_gen_to_shared ||
+      id == Intrinsic::nvvm_ptr_gen_to_local ||
+      id == Intrinsic::nvvm_ptr_gen_to_constant ||
+      id == Intrinsic::nvvm_ptr_gen_to_param) {
+    return true;
+  }
+
+  return false;
+}
+
+// Consider several special intrinsics when stripping pointer casts, and
+// provide an option to ignore GEP indices so that only the base address is
+// found; this can be used in simple alias disambiguation.
+const Value *llvm::skipPointerTransfer(const Value *V,
+                                       bool ignore_GEP_indices) {
+  V = V->stripPointerCasts();
+  while (true) {
+    if (const IntrinsicInst *IS = dyn_cast<IntrinsicInst>(V)) {
+      if (isMemorySpaceTransferIntrinsic(IS->getIntrinsicID())) {
+        V = IS->getArgOperand(0)->stripPointerCasts();
+        continue;
+      }
+    } else if (ignore_GEP_indices)
+      if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+        V = GEP->getPointerOperand()->stripPointerCasts();
+        continue;
+      }
+    break;
+  }
+  return V;
+}
+
+// Consider several special intrinsics when stripping pointer casts, and
+// - ignore GEP indices so that only the base address is found, and
+// - track through PHINodes;
+// this can be used in simple alias disambiguation.
+const Value *llvm::skipPointerTransfer(const Value *V,
+                                       std::set<const Value *> &processed) {
+  if (processed.find(V) != processed.end())
+    return NULL;
+  processed.insert(V);
+
+  const Value *V2 = V->stripPointerCasts();
+  if (V2 != V && processed.find(V2) != processed.end())
+    return NULL;
+  processed.insert(V2);
+
+  V = V2;
+
+  while (true) {
+    if (const IntrinsicInst *IS = dyn_cast<IntrinsicInst>(V)) {
+      if (isMemorySpaceTransferIntrinsic(IS->getIntrinsicID())) {
+        V = IS->getArgOperand(0)->stripPointerCasts();
+        continue;
+      }
+    } else if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+      V = GEP->getPointerOperand()->stripPointerCasts();
+      continue;
+    } else if (const PHINode *PN = dyn_cast<PHINode>(V)) {
+      if (V != V2 && processed.find(V) != processed.end())
+        return NULL;
+      processed.insert(PN);
+      const Value *common = 0;
+      for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) {
+        const Value *pv = PN->getIncomingValue(i);
+        const Value *base = skipPointerTransfer(pv, processed);
+        if (base) {
+          if (common == 0)
+            common = base;
+          else if (common != base)
+            return PN;
+        }
+      }
+      if (common == 0)
+        return PN;
+      V = common;
+    }
+    break;
+  }
+  return V;
+}
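+
+// Illustrative walk (hand-written): given a value built roughly as
+//
+//   %gen = call i8* @llvm.nvvm.ptr.global.to.gen...(i8 addrspace(1)* @g)
+//   %p   = getelementptr i8* %gen, i32 4
+//
+// skipPointerTransfer(%p, /*ignore_GEP_indices=*/true) strips the GEP down
+// to its pointer operand and then walks through the space-transfer intrinsic
+// to its argument, returning @g as the underlying base object.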
+
+// The following are some useful utilities for debugging
+
+BasicBlock *llvm::getParentBlock(Value *v) {
+  if (BasicBlock *B = dyn_cast<BasicBlock>(v))
+    return B;
+
+  if (Instruction *I = dyn_cast<Instruction>(v))
+    return I->getParent();
+
+  return 0;
+}
+
+Function *llvm::getParentFunction(Value *v) {
+  if (Function *F = dyn_cast<Function>(v))
+    return F;
+
+  if (Instruction *I = dyn_cast<Instruction>(v))
+    return I->getParent()->getParent();
+
+  if (BasicBlock *B = dyn_cast<BasicBlock>(v))
+    return B->getParent();
+
+  return 0;
+}
+
+// Dump a block by name
+void llvm::dumpBlock(Value *v, char *blockName) {
+  Function *F = getParentFunction(v);
+  if (F == 0)
+    return;
+
+  for (Function::iterator it = F->begin(), ie = F->end(); it != ie; ++it) {
+    BasicBlock *B = it;
+    if (strcmp(B->getName().data(), blockName) == 0) {
+      B->dump();
+      return;
+    }
+  }
+}
+
+// Find an instruction by name
+Instruction *llvm::getInst(Value *base, char *instName) {
+  Function *F = getParentFunction(base);
+  if (F == 0)
+    return 0;
+
+  for (inst_iterator it = inst_begin(F), ie = inst_end(F); it != ie; ++it) {
+    Instruction *I = &*it;
+    if (strcmp(I->getName().data(), instName) == 0) {
+      return I;
+    }
+  }
+
+  return 0;
+}
+
+// Dump an instruction by name
+void llvm::dumpInst(Value *base, char *instName) {
+  Instruction *I = getInst(base, instName);
+  if (I)
+    I->dump();
+}
+
+// Dump an instruction and all dependent instructions
+void llvm::dumpInstRec(Value *v, std::set<Instruction *> *visited) {
+  if (Instruction *I = dyn_cast<Instruction>(v)) {
+
+    if (visited->find(I) != visited->end())
+      return;
+
+    visited->insert(I);
+
+    for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+      dumpInstRec(I->getOperand(i), visited);
+
+    I->dump();
+  }
+}
+
+// Dump an instruction and all dependent instructions
+void llvm::dumpInstRec(Value *v) {
+  std::set<Instruction *> visited;
+
+  //BasicBlock *B = getParentBlock(v);
+
+  dumpInstRec(v, &visited);
+}
+
+// Dump the parent for Instruction, block or function
+void llvm::dumpParent(Value *v) {
+  if (Instruction *I = dyn_cast<Instruction>(v)) {
+    I->getParent()->dump();
+    return;
+  }
+
+  if (BasicBlock *B = dyn_cast<BasicBlock>(v)) {
+    B->getParent()->dump();
+    return;
+  }
+
+  if (Function *F = dyn_cast<Function>(v)) {
+    F->getParent()->dump();
+    return;
+  }
+}
diff --git a/lib/Target/NVPTX/NVPTXUtilities.h b/lib/Target/NVPTX/NVPTXUtilities.h
new file mode 100644
index 0000000..fe6ad55
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXUtilities.h
@@ -0,0 +1,94 @@
+//===-- NVPTXUtilities - Utilities -----------------------------*- C++ -*-====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the NVVM-specific utility functions.
+// +//===----------------------------------------------------------------------===// + +#ifndef NVPTXUTILITIES_H +#define NVPTXUTILITIES_H + +#include "llvm/Value.h" +#include "llvm/GlobalVariable.h" +#include "llvm/Function.h" +#include "llvm/IntrinsicInst.h" +#include <cstdarg> +#include <set> +#include <string> +#include <vector> + +namespace llvm +{ + +#define NVCL_IMAGE2D_READONLY_FUNCNAME "__is_image2D_readonly" +#define NVCL_IMAGE3D_READONLY_FUNCNAME "__is_image3D_readonly" + +bool findOneNVVMAnnotation(const llvm::GlobalValue *, std::string, unsigned &); +bool findAllNVVMAnnotation(const llvm::GlobalValue *, std::string, + std::vector<unsigned> &); + +bool isTexture(const llvm::Value &); +bool isSurface(const llvm::Value &); +bool isSampler(const llvm::Value &); +bool isImage(const llvm::Value &); +bool isImageReadOnly(const llvm::Value &); +bool isImageWriteOnly(const llvm::Value &); + +std::string getTextureName(const llvm::Value &); +std::string getSurfaceName(const llvm::Value &); +std::string getSamplerName(const llvm::Value &); + +bool getMaxNTIDx(const llvm::Function &, unsigned &); +bool getMaxNTIDy(const llvm::Function &, unsigned &); +bool getMaxNTIDz(const llvm::Function &, unsigned &); + +bool getReqNTIDx(const llvm::Function &, unsigned &); +bool getReqNTIDy(const llvm::Function &, unsigned &); +bool getReqNTIDz(const llvm::Function &, unsigned &); + +bool getMinCTASm(const llvm::Function &, unsigned &); +bool isKernelFunction(const llvm::Function &); + +bool getAlign(const llvm::Function &, unsigned index, unsigned &); +bool getAlign(const llvm::CallInst &, unsigned index, unsigned &); + +bool isBarrierIntrinsic(llvm::Intrinsic::ID); + +/// make_vector - Helper function which is useful for building temporary vectors +/// to pass into type construction of CallInst ctors. This turns a null +/// terminated list of pointers (or other value types) into a real live vector. +/// +template<typename T> +inline std::vector<T> make_vector(T A, ...) { + va_list Args; + va_start(Args, A); + std::vector<T> Result; + Result.push_back(A); + while (T Val = va_arg(Args, T)) + Result.push_back(Val); + va_end(Args); + return Result; +} + +bool isMemorySpaceTransferIntrinsic(Intrinsic::ID id); +const Value *skipPointerTransfer(const Value *V, bool ignore_GEP_indices); +const Value *skipPointerTransfer(const Value *V, + std::set<const Value *> &processed); +BasicBlock *getParentBlock(Value *v); +Function *getParentFunction(Value *v); +void dumpBlock(Value *v, char *blockName); +Instruction *getInst(Value *base, char *instName); +void dumpInst(Value *base, char *instName); +void dumpInstRec(Value *v, std::set<Instruction *> *visited); +void dumpInstRec(Value *v); +void dumpParent(Value *v); + +} + +#endif diff --git a/lib/Target/NVPTX/NVPTXVector.td b/lib/Target/NVPTX/NVPTXVector.td new file mode 100644 index 0000000..775df19 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXVector.td @@ -0,0 +1,1481 @@ +//===- NVPTXVector.td - NVPTX Vector Specific Instruction defs -*- tblgen-*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +//----------------------------------- +// Vector Specific +//----------------------------------- + +// +// All vector instructions derive from NVPTXVecInst +// + +class NVPTXVecInst<dag outs, dag ins, string asmstr, list<dag> pattern, + NVPTXInst sInst=NOP> + : NVPTXInst<outs, ins, asmstr, pattern> { + NVPTXInst scalarInst=sInst; +} + +let isAsCheapAsAMove=1, VecInstType=isVecExtract.Value in { +// Extract v2i16 +def V2i16Extract : NVPTXVecInst<(outs Int16Regs:$dst), + (ins V2I16Regs:$src, i8imm:$c), + "mov.u16 \t$dst, $src${c:vecelem};", + [(set Int16Regs:$dst, (vector_extract + (v2i16 V2I16Regs:$src), imm:$c))], + IMOV16rr>; + +// Extract v4i16 +def V4i16Extract : NVPTXVecInst<(outs Int16Regs:$dst), + (ins V4I16Regs:$src, i8imm:$c), + "mov.u16 \t$dst, $src${c:vecelem};", + [(set Int16Regs:$dst, (vector_extract + (v4i16 V4I16Regs:$src), imm:$c))], + IMOV16rr>; + +// Extract v2i8 +def V2i8Extract : NVPTXVecInst<(outs Int8Regs:$dst), + (ins V2I8Regs:$src, i8imm:$c), + "mov.u16 \t$dst, $src${c:vecelem};", + [(set Int8Regs:$dst, (vector_extract + (v2i8 V2I8Regs:$src), imm:$c))], + IMOV8rr>; + +// Extract v4i8 +def V4i8Extract : NVPTXVecInst<(outs Int8Regs:$dst), + (ins V4I8Regs:$src, i8imm:$c), + "mov.u16 \t$dst, $src${c:vecelem};", + [(set Int8Regs:$dst, (vector_extract + (v4i8 V4I8Regs:$src), imm:$c))], + IMOV8rr>; + +// Extract v2i32 +def V2i32Extract : NVPTXVecInst<(outs Int32Regs:$dst), + (ins V2I32Regs:$src, i8imm:$c), + "mov.u32 \t$dst, $src${c:vecelem};", + [(set Int32Regs:$dst, (vector_extract + (v2i32 V2I32Regs:$src), imm:$c))], + IMOV32rr>; + +// Extract v2f32 +def V2f32Extract : NVPTXVecInst<(outs Float32Regs:$dst), + (ins V2F32Regs:$src, i8imm:$c), + "mov.f32 \t$dst, $src${c:vecelem};", + [(set Float32Regs:$dst, (vector_extract + (v2f32 V2F32Regs:$src), imm:$c))], + FMOV32rr>; + +// Extract v2i64 +def V2i64Extract : NVPTXVecInst<(outs Int64Regs:$dst), + (ins V2I64Regs:$src, i8imm:$c), + "mov.u64 \t$dst, $src${c:vecelem};", + [(set Int64Regs:$dst, (vector_extract + (v2i64 V2I64Regs:$src), imm:$c))], + IMOV64rr>; + +// Extract v2f64 +def V2f64Extract : NVPTXVecInst<(outs Float64Regs:$dst), + (ins V2F64Regs:$src, i8imm:$c), + "mov.f64 \t$dst, $src${c:vecelem};", + [(set Float64Regs:$dst, (vector_extract + (v2f64 V2F64Regs:$src), imm:$c))], + FMOV64rr>; + +// Extract v4i32 +def V4i32Extract : NVPTXVecInst<(outs Int32Regs:$dst), + (ins V4I32Regs:$src, i8imm:$c), + "mov.u32 \t$dst, $src${c:vecelem};", + [(set Int32Regs:$dst, (vector_extract + (v4i32 V4I32Regs:$src), imm:$c))], + IMOV32rr>; + +// Extract v4f32 +def V4f32Extract : NVPTXVecInst<(outs Float32Regs:$dst), + (ins V4F32Regs:$src, i8imm:$c), + "mov.f32 \t$dst, $src${c:vecelem};", + [(set Float32Regs:$dst, (vector_extract + (v4f32 V4F32Regs:$src), imm:$c))], + FMOV32rr>; +} + +let isAsCheapAsAMove=1, VecInstType=isVecInsert.Value in { +// Insert v2i8 +def V2i8Insert : NVPTXVecInst<(outs V2I8Regs:$dst), + (ins V2I8Regs:$src, Int8Regs:$val, i8imm:$c), + "mov.v2.u16 \t${dst:vecfull}, ${src:vecfull};" + "\n\tmov.u16 \t$dst${c:vecelem}, $val;", + [(set V2I8Regs:$dst, + (vector_insert V2I8Regs:$src, Int8Regs:$val, imm:$c))], + IMOV8rr>; + +// Insert v4i8 +def V4i8Insert : NVPTXVecInst<(outs V4I8Regs:$dst), + (ins V4I8Regs:$src, Int8Regs:$val, i8imm:$c), + "mov.v4.u16 \t${dst:vecfull}, ${src:vecfull};" + "\n\tmov.u16 \t$dst${c:vecelem}, $val;", + [(set V4I8Regs:$dst, + (vector_insert V4I8Regs:$src, Int8Regs:$val, imm:$c))], + IMOV8rr>; + +// Insert 
v2i16 +def V2i16Insert : NVPTXVecInst<(outs V2I16Regs:$dst), + (ins V2I16Regs:$src, Int16Regs:$val, i8imm:$c), + "mov.v2.u16 \t${dst:vecfull}, ${src:vecfull};" + "\n\tmov.u16 \t$dst${c:vecelem}, $val;", + [(set V2I16Regs:$dst, + (vector_insert V2I16Regs:$src, Int16Regs:$val, imm:$c))], + IMOV16rr>; + +// Insert v4i16 +def V4i16Insert : NVPTXVecInst<(outs V4I16Regs:$dst), + (ins V4I16Regs:$src, Int16Regs:$val, i8imm:$c), + "mov.v4.u16 \t${dst:vecfull}, ${src:vecfull};" + "\n\tmov.u16 \t$dst${c:vecelem}, $val;", + [(set V4I16Regs:$dst, + (vector_insert V4I16Regs:$src, Int16Regs:$val, imm:$c))], + IMOV16rr>; + +// Insert v2i32 +def V2i32Insert : NVPTXVecInst<(outs V2I32Regs:$dst), + (ins V2I32Regs:$src, Int32Regs:$val, i8imm:$c), + "mov.v2.u32 \t${dst:vecfull}, ${src:vecfull};" + "\n\tmov.u32 \t$dst${c:vecelem}, $val;", + [(set V2I32Regs:$dst, + (vector_insert V2I32Regs:$src, Int32Regs:$val, imm:$c))], + IMOV32rr>; + +// Insert v2f32 +def V2f32Insert : NVPTXVecInst<(outs V2F32Regs:$dst), + (ins V2F32Regs:$src, Float32Regs:$val, i8imm:$c), + "mov.v2.f32 \t${dst:vecfull}, ${src:vecfull};" + "\n\tmov.f32 \t$dst${c:vecelem}, $val;", + [(set V2F32Regs:$dst, + (vector_insert V2F32Regs:$src, Float32Regs:$val, imm:$c))], + FMOV32rr>; + +// Insert v2i64 +def V2i64Insert : NVPTXVecInst<(outs V2I64Regs:$dst), + (ins V2I64Regs:$src, Int64Regs:$val, i8imm:$c), + "mov.v2.u64 \t${dst:vecfull}, ${src:vecfull};" + "\n\tmov.u64 \t$dst${c:vecelem}, $val;", + [(set V2I64Regs:$dst, + (vector_insert V2I64Regs:$src, Int64Regs:$val, imm:$c))], + IMOV64rr>; + +// Insert v2f64 +def V2f64Insert : NVPTXVecInst<(outs V2F64Regs:$dst), + (ins V2F64Regs:$src, Float64Regs:$val, i8imm:$c), + "mov.v2.f64 \t${dst:vecfull}, ${src:vecfull};" + "\n\tmov.f64 \t$dst${c:vecelem}, $val;", + [(set V2F64Regs:$dst, + (vector_insert V2F64Regs:$src, Float64Regs:$val, imm:$c))], + FMOV64rr>; + +// Insert v4i32 +def V4i32Insert : NVPTXVecInst<(outs V4I32Regs:$dst), + (ins V4I32Regs:$src, Int32Regs:$val, i8imm:$c), + "mov.v4.u32 \t${dst:vecfull}, ${src:vecfull};" + "\n\tmov.u32 \t$dst${c:vecelem}, $val;", + [(set V4I32Regs:$dst, + (vector_insert V4I32Regs:$src, Int32Regs:$val, imm:$c))], + IMOV32rr>; + +// Insert v4f32 +def V4f32Insert : NVPTXVecInst<(outs V4F32Regs:$dst), + (ins V4F32Regs:$src, Float32Regs:$val, i8imm:$c), + "mov.v4.f32 \t${dst:vecfull}, ${src:vecfull};" + "\n\tmov.f32 \t$dst${c:vecelem}, $val;", + [(set V4F32Regs:$dst, + (vector_insert V4F32Regs:$src, Float32Regs:$val, imm:$c))], + FMOV32rr>; +} + +class BinOpAsmString<string c> { + string s = c; +} + +class V4AsmStr<string opcode> : BinOpAsmString< + !strconcat(!strconcat(!strconcat(!strconcat( + !strconcat(!strconcat(!strconcat( + opcode, " \t${dst}_0, ${a}_0, ${b}_0;\n\t"), + opcode), " \t${dst}_1, ${a}_1, ${b}_1;\n\t"), + opcode), " \t${dst}_2, ${a}_2, ${b}_2;\n\t"), + opcode), " \t${dst}_3, ${a}_3, ${b}_3;")>; + +class V2AsmStr<string opcode> : BinOpAsmString< + !strconcat(!strconcat(!strconcat( + opcode, " \t${dst}_0, ${a}_0, ${b}_0;\n\t"), + opcode), " \t${dst}_1, ${a}_1, ${b}_1;")>; + +class V4MADStr<string opcode> : BinOpAsmString< + !strconcat(!strconcat(!strconcat(!strconcat( + !strconcat(!strconcat(!strconcat( + opcode, " \t${dst}_0, ${a}_0, ${b}_0, ${c}_0;\n\t"), + opcode), " \t${dst}_1, ${a}_1, ${b}_1, ${c}_1;\n\t"), + opcode), " \t${dst}_2, ${a}_2, ${b}_2, ${c}_2;\n\t"), + opcode), " \t${dst}_3, ${a}_3, ${b}_3, ${c}_3;")>; + +class V2MADStr<string opcode> : BinOpAsmString< + !strconcat(!strconcat(!strconcat( + opcode, " \t${dst}_0, ${a}_0, ${b}_0, 
${c}_0;\n\t"), + opcode), " \t${dst}_1, ${a}_1, ${b}_1, ${c}_1;")>; + +class V4UnaryStr<string opcode> : BinOpAsmString< + !strconcat(!strconcat(!strconcat(!strconcat( + !strconcat(!strconcat(!strconcat( + opcode, " \t${dst}_0, ${a}_0;\n\t"), + opcode), " \t${dst}_1, ${a}_1;\n\t"), + opcode), " \t${dst}_2, ${a}_2;\n\t"), + opcode), " \t${dst}_3, ${a}_3;")>; + +class V2UnaryStr<string opcode> : BinOpAsmString< + !strconcat(!strconcat(!strconcat( + opcode, " \t${dst}_0, ${a}_0;\n\t"), + opcode), " \t${dst}_1, ${a}_1;")>; + +class VecBinaryOp<BinOpAsmString asmstr, SDNode OpNode, NVPTXRegClass regclass, + NVPTXInst sInst=NOP> : + NVPTXVecInst<(outs regclass:$dst), (ins regclass:$a, regclass:$b), + asmstr.s, + [(set regclass:$dst, (OpNode regclass:$a, regclass:$b))], + sInst>; + +class VecShiftOp<BinOpAsmString asmstr, SDNode OpNode, NVPTXRegClass regclass1, + NVPTXRegClass regclass2, NVPTXInst sInst=NOP> : + NVPTXVecInst<(outs regclass1:$dst), (ins regclass1:$a, regclass2:$b), + asmstr.s, + [(set regclass1:$dst, (OpNode regclass1:$a, regclass2:$b))], + sInst>; + +class VecUnaryOp<BinOpAsmString asmstr, PatFrag OpNode, NVPTXRegClass regclass, + NVPTXInst sInst=NOP> : + NVPTXVecInst<(outs regclass:$dst), (ins regclass:$a), + asmstr.s, + [(set regclass:$dst, (OpNode regclass:$a))], sInst>; + +multiclass IntBinVOp<string asmstr, SDNode OpNode, + NVPTXInst i64op=NOP, NVPTXInst i32op=NOP, NVPTXInst + i16op=NOP, NVPTXInst i8op=NOP> { + def V2I64 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "64")>, OpNode, V2I64Regs, + i64op>; + def V4I32 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "32")>, OpNode, V4I32Regs, + i32op>; + def V2I32 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "32")>, OpNode, V2I32Regs, + i32op>; + def V4I16 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "16")>, OpNode, V4I16Regs, + i16op>; + def V2I16 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "16")>, OpNode, V2I16Regs, + i16op>; + def V4I8 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "16")>, OpNode, V4I8Regs, + i8op>; + def V2I8 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "16")>, OpNode, V2I8Regs, + i8op>; +} + +multiclass FloatBinVOp<string asmstr, SDNode OpNode, + NVPTXInst f64=NOP, NVPTXInst f32=NOP, + NVPTXInst f32_ftz=NOP> { + def V2F64 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "f64")>, OpNode, + V2F64Regs, f64>; + def V4F32_ftz : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "ftz.f32")>, OpNode, + V4F32Regs, f32_ftz>, Requires<[doF32FTZ]>; + def V2F32_ftz : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "ftz.f32")>, OpNode, + V2F32Regs, f32_ftz>, Requires<[doF32FTZ]>; + def V4F32 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "f32")>, OpNode, + V4F32Regs, f32>; + def V2F32 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "f32")>, OpNode, + V2F32Regs, f32>; +} + +multiclass IntUnaryVOp<string asmstr, PatFrag OpNode, + NVPTXInst i64op=NOP, NVPTXInst i32op=NOP, + NVPTXInst i16op=NOP, NVPTXInst i8op=NOP> { + def V2I64 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "64")>, OpNode, + V2I64Regs, i64op>; + def V4I32 : VecUnaryOp<V4UnaryStr<!strconcat(asmstr, "32")>, OpNode, + V4I32Regs, i32op>; + def V2I32 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "32")>, OpNode, + V2I32Regs, i32op>; + def V4I16 : VecUnaryOp<V4UnaryStr<!strconcat(asmstr, "16")>, OpNode, + V4I16Regs, i16op>; + def V2I16 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "16")>, OpNode, + V2I16Regs, i16op>; + def V4I8 : VecUnaryOp<V4UnaryStr<!strconcat(asmstr, "16")>, OpNode, + V4I8Regs, i8op>; + def V2I8 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "16")>, OpNode, + V2I8Regs, i8op>; +} + + +// Integer Arithmetic +let 
VecInstType=isVecOther.Value in { +defm VAdd : IntBinVOp<"add.s", add, ADDi64rr, ADDi32rr, ADDi16rr, ADDi8rr>; +defm VSub : IntBinVOp<"sub.s", sub, SUBi64rr, SUBi32rr, SUBi16rr, SUBi8rr>; + +def AddCCV4I32 : VecBinaryOp<V4AsmStr<"add.cc.s32">, addc, V4I32Regs, + ADDCCi32rr>; +def AddCCV2I32 : VecBinaryOp<V2AsmStr<"add.cc.s32">, addc, V2I32Regs, + ADDCCi32rr>; +def SubCCV4I32 : VecBinaryOp<V4AsmStr<"sub.cc.s32">, subc, V4I32Regs, + SUBCCi32rr>; +def SubCCV2I32 : VecBinaryOp<V2AsmStr<"sub.cc.s32">, subc, V2I32Regs, + SUBCCi32rr>; +def AddCCCV4I32 : VecBinaryOp<V4AsmStr<"addc.cc.s32">, adde, V4I32Regs, + ADDCCCi32rr>; +def AddCCCV2I32 : VecBinaryOp<V2AsmStr<"addc.cc.s32">, adde, V2I32Regs, + ADDCCCi32rr>; +def SubCCCV4I32 : VecBinaryOp<V4AsmStr<"subc.cc.s32">, sube, V4I32Regs, + SUBCCCi32rr>; +def SubCCCV2I32 : VecBinaryOp<V2AsmStr<"subc.cc.s32">, sube, V2I32Regs, + SUBCCCi32rr>; + +def ShiftLV2I64 : VecShiftOp<V2AsmStr<"shl.b64">, shl, V2I64Regs, V2I32Regs, + SHLi64rr>; +def ShiftLV2I32 : VecShiftOp<V2AsmStr<"shl.b32">, shl, V2I32Regs, V2I32Regs, + SHLi32rr>; +def ShiftLV4I32 : VecShiftOp<V4AsmStr<"shl.b32">, shl, V4I32Regs, V4I32Regs, + SHLi32rr>; +def ShiftLV2I16 : VecShiftOp<V2AsmStr<"shl.b16">, shl, V2I16Regs, V2I32Regs, + SHLi16rr>; +def ShiftLV4I16 : VecShiftOp<V4AsmStr<"shl.b16">, shl, V4I16Regs, V4I32Regs, + SHLi16rr>; +def ShiftLV2I8 : VecShiftOp<V2AsmStr<"shl.b16">, shl, V2I8Regs, V2I32Regs, + SHLi8rr>; +def ShiftLV4I8 : VecShiftOp<V4AsmStr<"shl.b16">, shl, V4I8Regs, V4I32Regs, + SHLi8rr>; +} + +// cvt to v*i32, helpers for shift +class CVTtoVeci32<NVPTXRegClass inclass, NVPTXRegClass outclass, string asmstr, + NVPTXInst sInst=NOP> : + NVPTXVecInst<(outs outclass:$d), (ins inclass:$s), asmstr, [], sInst>; + +class VecCVTStrHelper<string op, string dest, string src> { + string s=!strconcat(op, !strconcat("\t", + !strconcat(dest, !strconcat(", ", !strconcat(src, ";"))))); +} + +class Vec2CVTStr<string op> { + string s=!strconcat(VecCVTStrHelper<op, "${d}_0", "${s}_0">.s, + !strconcat("\n\t", VecCVTStrHelper<op, "${d}_1", "${s}_1">.s)); +} + +class Vec4CVTStr<string op> { + string s=!strconcat(VecCVTStrHelper<op, "${d}_0", "${s}_0">.s, + !strconcat("\n\t", + !strconcat(VecCVTStrHelper<op, "${d}_1", "${s}_1">.s, + !strconcat("\n\t", + !strconcat(VecCVTStrHelper<op, "${d}_2", "${s}_2">.s, + !strconcat("\n\t", VecCVTStrHelper<op, "${d}_3", "${s}_3">.s)))))); +} + +let VecInstType=isVecOther.Value in { +def CVTv2i8tov2i32 : CVTtoVeci32<V2I8Regs, V2I32Regs, + Vec2CVTStr<"cvt.u32.u16">.s, Zint_extendext8to32>; +def CVTv2i16tov2i32 : CVTtoVeci32<V2I16Regs, V2I32Regs, + Vec2CVTStr<"cvt.u32.u16">.s, Zint_extendext16to32>; +def CVTv4i8tov4i32 : CVTtoVeci32<V4I8Regs, V4I32Regs, + Vec4CVTStr<"cvt.u32.u16">.s, Zint_extendext8to32>; +def CVTv4i16tov4i32 : CVTtoVeci32<V4I16Regs, V4I32Regs, + Vec4CVTStr<"cvt.u32.u16">.s, Zint_extendext16to32>; +def CVTv2i64tov2i32 : CVTtoVeci32<V2I64Regs, V2I32Regs, + Vec2CVTStr<"cvt.u32.u64">.s, TRUNC_64to32>; +} + +def : Pat<(shl V2I16Regs:$src1, V2I16Regs:$src2), + (ShiftLV2I16 V2I16Regs:$src1, (CVTv2i16tov2i32 V2I16Regs:$src2))>; +def : Pat<(shl V2I8Regs:$src1, V2I8Regs:$src2), + (ShiftLV2I8 V2I8Regs:$src1, (CVTv2i8tov2i32 V2I8Regs:$src2))>; +def : Pat<(shl V2I64Regs:$src1, V2I64Regs:$src2), + (ShiftLV2I64 V2I64Regs:$src1, (CVTv2i64tov2i32 V2I64Regs:$src2))>; + +def : Pat<(shl V4I16Regs:$src1, V4I16Regs:$src2), + (ShiftLV4I16 V4I16Regs:$src1, (CVTv4i16tov4i32 V4I16Regs:$src2))>; +def : Pat<(shl V4I8Regs:$src1, V4I8Regs:$src2), + (ShiftLV4I8 V4I8Regs:$src1, 
(CVTv4i8tov4i32 V4I8Regs:$src2))>; + +let VecInstType=isVecOther.Value in { +def ShiftRAV2I64 : VecShiftOp<V2AsmStr<"shr.s64">, sra, V2I64Regs, V2I32Regs, + SRAi64rr>; +def ShiftRAV2I32 : VecShiftOp<V2AsmStr<"shr.s32">, sra, V2I32Regs, V2I32Regs, + SRAi32rr>; +def ShiftRAV4I32 : VecShiftOp<V4AsmStr<"shr.s32">, sra, V4I32Regs, V4I32Regs, + SRAi32rr>; +def ShiftRAV2I16 : VecShiftOp<V2AsmStr<"shr.s16">, sra, V2I16Regs, V2I32Regs, + SRAi16rr>; +def ShiftRAV4I16 : VecShiftOp<V4AsmStr<"shr.s16">, sra, V4I16Regs, V4I32Regs, + SRAi16rr>; +def ShiftRAV2I8 : VecShiftOp<V2AsmStr<"shr.s16">, sra, V2I8Regs, V2I32Regs, + SRAi8rr>; +def ShiftRAV4I8 : VecShiftOp<V4AsmStr<"shr.s16">, sra, V4I8Regs, V4I32Regs, + SRAi8rr>; + +def ShiftRLV2I64 : VecShiftOp<V2AsmStr<"shr.u64">, srl, V2I64Regs, V2I32Regs, + SRLi64rr>; +def ShiftRLV2I32 : VecShiftOp<V2AsmStr<"shr.u32">, srl, V2I32Regs, V2I32Regs, + SRLi32rr>; +def ShiftRLV4I32 : VecShiftOp<V4AsmStr<"shr.u32">, srl, V4I32Regs, V4I32Regs, + SRLi32rr>; +def ShiftRLV2I16 : VecShiftOp<V2AsmStr<"shr.u16">, srl, V2I16Regs, V2I32Regs, + SRLi16rr>; +def ShiftRLV4I16 : VecShiftOp<V4AsmStr<"shr.u16">, srl, V4I16Regs, V4I32Regs, + SRLi16rr>; +def ShiftRLV2I8 : VecShiftOp<V2AsmStr<"shr.u16">, srl, V2I8Regs, V2I32Regs, + SRLi8rr>; +def ShiftRLV4I8 : VecShiftOp<V4AsmStr<"shr.u16">, srl, V4I8Regs, V4I32Regs, + SRLi8rr>; + +defm VMult : IntBinVOp<"mul.lo.s", mul, MULTi64rr, MULTi32rr, MULTi16rr, + MULTi8rr>; +defm VMultHS : IntBinVOp<"mul.hi.s", mulhs, MULTHSi64rr, MULTHSi32rr, + MULTHSi16rr, + MULTHSi8rr>; +defm VMultHU : IntBinVOp<"mul.hi.u", mulhu, MULTHUi64rr, MULTHUi32rr, + MULTHUi16rr, + MULTHUi8rr>; +defm VSDiv : IntBinVOp<"div.s", sdiv, SDIVi64rr, SDIVi32rr, SDIVi16rr, + SDIVi8rr>; +defm VUDiv : IntBinVOp<"div.u", udiv, UDIVi64rr, UDIVi32rr, UDIVi16rr, + UDIVi8rr>; +defm VSRem : IntBinVOp<"rem.s", srem, SREMi64rr, SREMi32rr, SREMi16rr, + SREMi8rr>; +defm VURem : IntBinVOp<"rem.u", urem, UREMi64rr, UREMi32rr, UREMi16rr, + UREMi8rr>; +} + +def : Pat<(sra V2I16Regs:$src1, V2I16Regs:$src2), + (ShiftRAV2I16 V2I16Regs:$src1, (CVTv2i16tov2i32 V2I16Regs:$src2))>; +def : Pat<(sra V2I8Regs:$src1, V2I8Regs:$src2), + (ShiftRAV2I8 V2I8Regs:$src1, (CVTv2i8tov2i32 V2I8Regs:$src2))>; +def : Pat<(sra V2I64Regs:$src1, V2I64Regs:$src2), + (ShiftRAV2I64 V2I64Regs:$src1, (CVTv2i64tov2i32 V2I64Regs:$src2))>; + +def : Pat<(sra V4I16Regs:$src1, V4I16Regs:$src2), + (ShiftRAV4I16 V4I16Regs:$src1, (CVTv4i16tov4i32 V4I16Regs:$src2))>; +def : Pat<(sra V4I8Regs:$src1, V4I8Regs:$src2), + (ShiftRAV4I8 V4I8Regs:$src1, (CVTv4i8tov4i32 V4I8Regs:$src2))>; + +def : Pat<(srl V2I16Regs:$src1, V2I16Regs:$src2), + (ShiftRLV2I16 V2I16Regs:$src1, (CVTv2i16tov2i32 V2I16Regs:$src2))>; +def : Pat<(srl V2I8Regs:$src1, V2I8Regs:$src2), + (ShiftRLV2I8 V2I8Regs:$src1, (CVTv2i8tov2i32 V2I8Regs:$src2))>; +def : Pat<(srl V2I64Regs:$src1, V2I64Regs:$src2), + (ShiftRLV2I64 V2I64Regs:$src1, (CVTv2i64tov2i32 V2I64Regs:$src2))>; + +def : Pat<(srl V4I16Regs:$src1, V4I16Regs:$src2), + (ShiftRLV4I16 V4I16Regs:$src1, (CVTv4i16tov4i32 V4I16Regs:$src2))>; +def : Pat<(srl V4I8Regs:$src1, V4I8Regs:$src2), + (ShiftRLV4I8 V4I8Regs:$src1, (CVTv4i8tov4i32 V4I8Regs:$src2))>; + +multiclass VMAD<string asmstr, NVPTXRegClass regclassv4, + NVPTXRegClass regclassv2, + SDNode an=add, SDNode mn=mul, NVPTXInst sop=NOP, + Predicate Pred> { + def V4 : NVPTXVecInst<(outs regclassv4:$dst), + (ins regclassv4:$a, regclassv4:$b, regclassv4:$c), + V4MADStr<asmstr>.s, + [(set regclassv4:$dst, + (an (mn regclassv4:$a, regclassv4:$b), regclassv4:$c))], + 
sop>, + Requires<[Pred]>; + def V2 : NVPTXVecInst<(outs regclassv2:$dst), + (ins regclassv2:$a, regclassv2:$b, regclassv2:$c), + V2MADStr<asmstr>.s, + [(set regclassv2:$dst, + (an (mn regclassv2:$a, regclassv2:$b), regclassv2:$c))], + sop>, + Requires<[Pred]>; +} + +multiclass VMADV2Only<string asmstr, NVPTXRegClass regclass, NVPTXInst sop=NOP, + Predicate Pred> { + def V2 : NVPTXVecInst<(outs regclass:$dst), + (ins regclass:$a, regclass:$b, regclass:$c), + V2MADStr<asmstr>.s, + [(set regclass:$dst, (add + (mul regclass:$a, regclass:$b), regclass:$c))], sop>, + Requires<[Pred]>; +} +multiclass VFMADV2Only<string asmstr, NVPTXRegClass regclass, NVPTXInst sop=NOP, + Predicate Pred> { + def V2 : NVPTXVecInst<(outs regclass:$dst), + (ins regclass:$a, regclass:$b, regclass:$c), + V2MADStr<asmstr>.s, + [(set regclass:$dst, (fadd + (fmul regclass:$a, regclass:$b), regclass:$c))], sop>, + Requires<[Pred]>; +} + +let VecInstType=isVecOther.Value in { +defm I8MAD : VMAD<"mad.lo.s16", V4I8Regs, V2I8Regs, add, mul, MAD8rrr, true>; +defm I16MAD : VMAD<"mad.lo.s16", V4I16Regs, V2I16Regs, add, mul, MAD16rrr, + true>; +defm I32MAD : VMAD<"mad.lo.s32", V4I32Regs, V2I32Regs, add, mul, MAD32rrr, + true>; +defm I64MAD : VMADV2Only<"mad.lo.s64", V2I64Regs, MAD64rrr, true>; + +defm VNeg : IntUnaryVOp<"neg.s", ineg, INEG64, INEG32, INEG16, INEG8>; + +defm VAddf : FloatBinVOp<"add.", fadd, FADDf64rr, FADDf32rr, FADDf32rr_ftz>; +defm VSubf : FloatBinVOp<"sub.", fsub, FSUBf64rr, FSUBf32rr, FSUBf32rr_ftz>; +defm VMulf : FloatBinVOp<"mul.", fmul, FMULf64rr, FMULf32rr, FMULf32rr_ftz>; + +defm F32MAD_ftz : VMAD<"mad.ftz.f32", V4F32Regs, V2F32Regs, fadd, fmul, + FMAD32_ftzrrr, doFMADF32_ftz>; +defm F32FMA_ftz : VMAD<"fma.rn.ftz.f32", V4F32Regs, V2F32Regs, fadd, fmul, + FMA32_ftzrrr, doFMAF32_ftz>; +defm F32MAD : VMAD<"mad.f32", V4F32Regs, V2F32Regs, fadd, fmul, FMAD32rrr, + doFMADF32>; +defm F32FMA : VMAD<"fma.rn.f32", V4F32Regs, V2F32Regs, fadd, fmul, FMA32rrr, + doFMAF32>; +defm F64FMA : VFMADV2Only<"fma.rn.f64", V2F64Regs, FMA64rrr, doFMAF64>; +} + +let VecInstType=isVecOther.Value in { +def V4F32Div_prec_ftz : VecBinaryOp<V4AsmStr<"div.rn.ftz.f32">, fdiv, V4F32Regs, + FDIV32rr_prec_ftz>, Requires<[doF32FTZ, reqPTX20]>; +def V2F32Div_prec_ftz : VecBinaryOp<V2AsmStr<"div.rn.ftz.f32">, fdiv, V2F32Regs, + FDIV32rr_prec_ftz>, Requires<[doF32FTZ, reqPTX20]>; +def V4F32Div_prec : VecBinaryOp<V4AsmStr<"div.rn.f32">, fdiv, V4F32Regs, + FDIV32rr_prec>, Requires<[reqPTX20]>; +def V2F32Div_prec : VecBinaryOp<V2AsmStr<"div.rn.f32">, fdiv, V2F32Regs, + FDIV32rr_prec>, Requires<[reqPTX20]>; +def V2F32Div_ftz : VecBinaryOp<V2AsmStr<"div.full.ftz.f32">, fdiv, V2F32Regs, + FDIV32rr_ftz>, Requires<[doF32FTZ]>; +def V4F32Div_ftz : VecBinaryOp<V4AsmStr<"div.full.ftz.f32">, fdiv, V4F32Regs, + FDIV32rr_ftz>, Requires<[doF32FTZ]>; +def V2F32Div : VecBinaryOp<V2AsmStr<"div.full.f32">, fdiv, V2F32Regs, FDIV32rr>; +def V4F32Div : VecBinaryOp<V4AsmStr<"div.full.f32">, fdiv, V4F32Regs, FDIV32rr>; +def V2F64Div : VecBinaryOp<V2AsmStr<"div.rn.f64">, fdiv, V2F64Regs, FDIV64rr>; +} + +def fnegpat : PatFrag<(ops node:$in), (fneg node:$in)>; + +let VecInstType=isVecOther.Value in { +def VNegv2f32_ftz : VecUnaryOp<V2UnaryStr<"neg.ftz.f32">, fnegpat, V2F32Regs, + FNEGf32_ftz>, Requires<[doF32FTZ]>; +def VNegv4f32_ftz : VecUnaryOp<V4UnaryStr<"neg.ftz.f32">, fnegpat, V4F32Regs, + FNEGf32_ftz>, Requires<[doF32FTZ]>; +def VNegv2f32 : VecUnaryOp<V2UnaryStr<"neg.f32">, fnegpat, V2F32Regs, FNEGf32>; +def VNegv4f32 : VecUnaryOp<V4UnaryStr<"neg.f32">, fnegpat, 
V4F32Regs, FNEGf32>; +def VNegv2f64 : VecUnaryOp<V2UnaryStr<"neg.f64">, fnegpat, V2F64Regs, FNEGf64>; + +// Logical Arithmetic +defm VAnd : IntBinVOp<"and.b", and, ANDb64rr, ANDb32rr, ANDb16rr, ANDb8rr>; +defm VOr : IntBinVOp<"or.b", or, ORb64rr, ORb32rr, ORb16rr, ORb8rr>; +defm VXor : IntBinVOp<"xor.b", xor, XORb64rr, XORb32rr, XORb16rr, XORb8rr>; + +defm VNot : IntUnaryVOp<"not.b", not, NOT64, NOT32, NOT16, NOT8>; +} + + +multiclass V2FPCONTRACT32_SUB_PAT<NVPTXInst Inst, Predicate Pred> { + def : Pat<(fsub V2F32Regs:$a, (fmul V2F32Regs:$b, V2F32Regs:$c)), + (Inst (VNegv2f32 V2F32Regs:$b), V2F32Regs:$c, V2F32Regs:$a)>, + Requires<[Pred]>; + + def : Pat<(fsub (fmul V2F32Regs:$a, V2F32Regs:$b), V2F32Regs:$c), + (Inst V2F32Regs:$a, V2F32Regs:$b, (VNegv2f32 V2F32Regs:$c))>, + Requires<[Pred]>; +} + +defm V2FMAF32ext_ftz : V2FPCONTRACT32_SUB_PAT<F32FMA_ftzV2, doFMAF32AGG_ftz>; +defm V2FMADF32ext_ftz : V2FPCONTRACT32_SUB_PAT<F32MAD_ftzV2, doFMADF32_ftz>; +defm V2FMAF32ext : V2FPCONTRACT32_SUB_PAT<F32FMAV2, doFMAF32AGG>; +defm V2FMADF32ext : V2FPCONTRACT32_SUB_PAT<F32MADV2, doFMADF32>; + +multiclass V4FPCONTRACT32_SUB_PAT<NVPTXInst Inst, Predicate Pred> { + def : Pat<(fsub V4F32Regs:$a, (fmul V4F32Regs:$b, V4F32Regs:$c)), + (Inst (VNegv4f32 V4F32Regs:$b), V4F32Regs:$c, V4F32Regs:$a)>, + Requires<[Pred]>; + + def : Pat<(fsub (fmul V4F32Regs:$a, V4F32Regs:$b), V4F32Regs:$c), + (Inst V4F32Regs:$a, V4F32Regs:$b, (VNegv4f32 V4F32Regs:$c))>, + Requires<[Pred]>; +} + +defm V4FMAF32ext_ftz : V4FPCONTRACT32_SUB_PAT<F32FMA_ftzV4, doFMAF32AGG_ftz>; +defm V4FMADF32ext_ftz : V4FPCONTRACT32_SUB_PAT<F32MAD_ftzV4, doFMADF32_ftz>; +defm V4FMAF32ext : V4FPCONTRACT32_SUB_PAT<F32FMAV4, doFMAF32AGG>; +defm V4FMADF32ext : V4FPCONTRACT32_SUB_PAT<F32MADV4, doFMADF32>; + +multiclass V2FPCONTRACT64_SUB_PAT<NVPTXInst Inst, Predicate Pred> { + def : Pat<(fsub V2F64Regs:$a, (fmul V2F64Regs:$b, V2F64Regs:$c)), + (Inst (VNegv2f64 V2F64Regs:$b), V2F64Regs:$c, V2F64Regs:$a)>, + Requires<[Pred]>; + + def : Pat<(fsub (fmul V2F64Regs:$a, V2F64Regs:$b), V2F64Regs:$c), + (Inst V2F64Regs:$a, V2F64Regs:$b, (VNegv2f64 V2F64Regs:$c))>, + Requires<[Pred]>; +} + +defm V2FMAF64ext : V2FPCONTRACT64_SUB_PAT<F64FMAV2, doFMAF64AGG>; + +class VecModStr<string vecsize, string elem, string extra, string l=""> +{ + string t1 = !strconcat("${c", elem); + string t2 = !strconcat(t1, ":vecv"); + string t3 = !strconcat(t2, vecsize); + string t4 = !strconcat(t3, extra); + string t5 = !strconcat(t4, l); + string s = !strconcat(t5, "}"); +} +class ShuffleOneLine<string vecsize, string elem, string type> +{ + string t1 = VecModStr<vecsize, elem, "comm", "1">.s; + string t2 = !strconcat(t1, "mov."); + string t3 = !strconcat(t2, type); + string t4 = !strconcat(t3, " \t${dst}_"); + string t5 = !strconcat(t4, elem); + string t6 = !strconcat(t5, ", $src1"); + string t7 = !strconcat(t6, VecModStr<vecsize, elem, "pos">.s); + string t8 = !strconcat(t7, ";\n\t"); + string t9 = !strconcat(t8, VecModStr<vecsize, elem, "comm", "2">.s); + string t10 = !strconcat(t9, "mov."); + string t11 = !strconcat(t10, type); + string t12 = !strconcat(t11, " \t${dst}_"); + string t13 = !strconcat(t12, elem); + string t14 = !strconcat(t13, ", $src2"); + string t15 = !strconcat(t14, VecModStr<vecsize, elem, "pos">.s); + string s = !strconcat(t15, ";"); +} +class ShuffleAsmStr2<string type> +{ + string t1 = ShuffleOneLine<"2", "0", type>.s; + string t2 = !strconcat(t1, "\n\t"); + string s = !strconcat(t2, ShuffleOneLine<"2", "1", type>.s); +} +class ShuffleAsmStr4<string type> +{ + 
string t1 = ShuffleOneLine<"4", "0", type>.s; + string t2 = !strconcat(t1, "\n\t"); + string t3 = !strconcat(t2, ShuffleOneLine<"4", "1", type>.s); + string t4 = !strconcat(t3, "\n\t"); + string t5 = !strconcat(t4, ShuffleOneLine<"4", "2", type>.s); + string t6 = !strconcat(t5, "\n\t"); + string s = !strconcat(t6, ShuffleOneLine<"4", "3", type>.s); +} + +let neverHasSideEffects=1, VecInstType=isVecShuffle.Value in { +def VecShuffle_v4f32 : NVPTXVecInst<(outs V4F32Regs:$dst), + (ins V4F32Regs:$src1, V4F32Regs:$src2, + i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3), + !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t", + ShuffleAsmStr4<"f32">.s), + [], FMOV32rr>; + +def VecShuffle_v4i32 : NVPTXVecInst<(outs V4I32Regs:$dst), + (ins V4I32Regs:$src1, V4I32Regs:$src2, + i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3), + !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t", + ShuffleAsmStr4<"u32">.s), + [], IMOV32rr>; + +def VecShuffle_v4i16 : NVPTXVecInst<(outs V4I16Regs:$dst), + (ins V4I16Regs:$src1, V4I16Regs:$src2, + i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3), + !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t", + ShuffleAsmStr4<"u16">.s), + [], IMOV16rr>; + +def VecShuffle_v4i8 : NVPTXVecInst<(outs V4I8Regs:$dst), + (ins V4I8Regs:$src1, V4I8Regs:$src2, + i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3), + !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t", + ShuffleAsmStr4<"u16">.s), + [], IMOV8rr>; + +def VecShuffle_v2f32 : NVPTXVecInst<(outs V2F32Regs:$dst), + (ins V2F32Regs:$src1, V2F32Regs:$src2, + i8imm:$c0, i8imm:$c1), + !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t", + ShuffleAsmStr2<"f32">.s), + [], FMOV32rr>; + +def VecShuffle_v2i32 : NVPTXVecInst<(outs V2I32Regs:$dst), + (ins V2I32Regs:$src1, V2I32Regs:$src2, + i8imm:$c0, i8imm:$c1), + !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t", + ShuffleAsmStr2<"u32">.s), + [], IMOV32rr>; + +def VecShuffle_v2i8 : NVPTXVecInst<(outs V2I8Regs:$dst), + (ins V2I8Regs:$src1, V2I8Regs:$src2, + i8imm:$c0, i8imm:$c1), + !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t", + ShuffleAsmStr2<"u16">.s), + [], IMOV8rr>; + +def VecShuffle_v2i16 : NVPTXVecInst<(outs V2I16Regs:$dst), + (ins V2I16Regs:$src1, V2I16Regs:$src2, + i8imm:$c0, i8imm:$c1), + !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t", + ShuffleAsmStr2<"u16">.s), + [], IMOV16rr>; + +def VecShuffle_v2f64 : NVPTXVecInst<(outs V2F64Regs:$dst), + (ins V2F64Regs:$src1, V2F64Regs:$src2, + i8imm:$c0, i8imm:$c1), + !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t", + ShuffleAsmStr2<"f64">.s), + [], FMOV64rr>; + +def VecShuffle_v2i64 : NVPTXVecInst<(outs V2I64Regs:$dst), + (ins V2I64Regs:$src1, V2I64Regs:$src2, + i8imm:$c0, i8imm:$c1), + !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t", + ShuffleAsmStr2<"u64">.s), + [], IMOV64rr>; +} + +def ShuffleMask0 : SDNodeXForm<vector_shuffle, [{ + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + return CurDAG->getTargetConstant(SVOp->getMaskElt(0), MVT::i32); +}]>; +def ShuffleMask1 : SDNodeXForm<vector_shuffle, [{ + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + return CurDAG->getTargetConstant(SVOp->getMaskElt(1), MVT::i32); +}]>; +def ShuffleMask2 : SDNodeXForm<vector_shuffle, [{ + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + return CurDAG->getTargetConstant(SVOp->getMaskElt(2), MVT::i32); +}]>; +def ShuffleMask3 : SDNodeXForm<vector_shuffle, [{ + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + return CurDAG->getTargetConstant(SVOp->getMaskElt(3), 
MVT::i32); +}]>; + +// The spurious call is here to silence a compiler warning about N being +// unused. +def vec_shuf : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), + [{ N->getGluedNode(); return true; }]>; + +def : Pat<(v2f64 (vec_shuf:$op V2F64Regs:$src1, V2F64Regs:$src2)), + (VecShuffle_v2f64 V2F64Regs:$src1, V2F64Regs:$src2, + (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>; + +def : Pat<(v4f32 (vec_shuf:$op V4F32Regs:$src1, V4F32Regs:$src2)), + (VecShuffle_v4f32 V4F32Regs:$src1, V4F32Regs:$src2, + (ShuffleMask0 node:$op), (ShuffleMask1 node:$op), + (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>; + +def : Pat<(v2f32 (vec_shuf:$op V2F32Regs:$src1, V2F32Regs:$src2)), + (VecShuffle_v2f32 V2F32Regs:$src1, V2F32Regs:$src2, + (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>; + +def : Pat<(v2i64 (vec_shuf:$op V2I64Regs:$src1, V2I64Regs:$src2)), + (VecShuffle_v2i64 V2I64Regs:$src1, V2I64Regs:$src2, + (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>; + +def : Pat<(v4i32 (vec_shuf:$op V4I32Regs:$src1, V4I32Regs:$src2)), + (VecShuffle_v4i32 V4I32Regs:$src1, V4I32Regs:$src2, + (ShuffleMask0 node:$op), (ShuffleMask1 node:$op), + (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>; + +def : Pat<(v2i32 (vec_shuf:$op V2I32Regs:$src1, V2I32Regs:$src2)), + (VecShuffle_v2i32 V2I32Regs:$src1, V2I32Regs:$src2, + (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>; + +def : Pat<(v4i16 (vec_shuf:$op V4I16Regs:$src1, V4I16Regs:$src2)), + (VecShuffle_v4i16 V4I16Regs:$src1, V4I16Regs:$src2, + (ShuffleMask0 node:$op), (ShuffleMask1 node:$op), + (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>; + +def : Pat<(v2i16 (vec_shuf:$op V2I16Regs:$src1, V2I16Regs:$src2)), + (VecShuffle_v2i16 V2I16Regs:$src1, V2I16Regs:$src2, + (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>; + +def : Pat<(v4i8 (vec_shuf:$op V4I8Regs:$src1, V4I8Regs:$src2)), + (VecShuffle_v4i8 V4I8Regs:$src1, V4I8Regs:$src2, + (ShuffleMask0 node:$op), (ShuffleMask1 node:$op), + (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>; + +def : Pat<(v2i8 (vec_shuf:$op V2I8Regs:$src1, V2I8Regs:$src2)), + (VecShuffle_v2i8 V2I8Regs:$src1, V2I8Regs:$src2, + (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>; + +class Build_Vector2<string asmstr, NVPTXRegClass vclass, NVPTXRegClass sclass, + NVPTXInst si> + : NVPTXVecInst<(outs vclass:$dst), + (ins sclass:$a1, sclass:$a2), + !strconcat(asmstr, "\t${dst:vecfull}, {{$a1, $a2}};"), + [(set vclass:$dst, (build_vector sclass:$a1, sclass:$a2))], + si>; +class Build_Vector4<string asmstr, NVPTXRegClass vclass, NVPTXRegClass sclass, + NVPTXInst si> + : NVPTXVecInst<(outs vclass:$dst), + (ins sclass:$a1, sclass:$a2, sclass:$a3, sclass:$a4), + !strconcat(asmstr, "\t${dst:vecfull}, {{$a1, $a2, $a3, $a4}};"), + [(set vclass:$dst, + (build_vector sclass:$a1, sclass:$a2, + sclass:$a3, sclass:$a4))], si>; + +let isAsCheapAsAMove=1, VecInstType=isVecBuild.Value in { +def Build_Vector2_f32 : Build_Vector2<"mov.v2.f32", V2F32Regs, Float32Regs, + FMOV32rr>; +def Build_Vector2_f64 : Build_Vector2<"mov.v2.f64", V2F64Regs, Float64Regs, + FMOV64rr>; + +def Build_Vector2_i32 : Build_Vector2<"mov.v2.u32", V2I32Regs, Int32Regs, + IMOV32rr>; +def Build_Vector2_i64 : Build_Vector2<"mov.v2.u64", V2I64Regs, Int64Regs, + IMOV64rr>; +def Build_Vector2_i16 : Build_Vector2<"mov.v2.u16", V2I16Regs, Int16Regs, + IMOV16rr>; +def Build_Vector2_i8 : Build_Vector2<"mov.v2.u16", V2I8Regs, Int8Regs, + IMOV8rr>; + +def Build_Vector4_f32 : Build_Vector4<"mov.v4.f32", V4F32Regs, Float32Regs, + FMOV32rr>; + +def 
Build_Vector4_i32 : Build_Vector4<"mov.v4.u32", V4I32Regs, Int32Regs, + IMOV32rr>; +def Build_Vector4_i16 : Build_Vector4<"mov.v4.u16", V4I16Regs, Int16Regs, + IMOV16rr>; +def Build_Vector4_i8 : Build_Vector4<"mov.v4.u16", V4I8Regs, Int8Regs, + IMOV8rr>; +} + +class Vec_Move<string asmstr, NVPTXRegClass vclass, NVPTXInst sop=NOP> + : NVPTXVecInst<(outs vclass:$dst), (ins vclass:$src), + !strconcat(asmstr, "\t${dst:vecfull}, ${src:vecfull};"), + [], sop>; + +let isAsCheapAsAMove=1, neverHasSideEffects=1, IsSimpleMove=1, + VecInstType=isVecOther.Value in { +def V4f32Mov : Vec_Move<"mov.v4.f32", V4F32Regs, FMOV32rr>; +def V2f32Mov : Vec_Move<"mov.v2.f32", V2F32Regs, FMOV32rr>; + +def V4i32Mov : Vec_Move<"mov.v4.u32", V4I32Regs, IMOV32rr>; +def V2i32Mov : Vec_Move<"mov.v2.u32", V2I32Regs, IMOV32rr>; + +def V4i16Mov : Vec_Move<"mov.v4.u16", V4I16Regs, IMOV16rr>; +def V2i16Mov : Vec_Move<"mov.v2.u16", V2I16Regs, IMOV16rr>; + +def V4i8Mov : Vec_Move<"mov.v4.u16", V4I8Regs, IMOV8rr>; +def V2i8Mov : Vec_Move<"mov.v2.u16", V2I8Regs, IMOV8rr>; + +def V2f64Mov : Vec_Move<"mov.v2.f64", V2F64Regs, FMOV64rr>; +def V2i64Mov : Vec_Move<"mov.v2.u64", V2I64Regs, IMOV64rr>; +} + +// extract subvector patterns +def extract_subvec : SDNode<"ISD::EXTRACT_SUBVECTOR", + SDTypeProfile<1, 2, [SDTCisPtrTy<2>]>>; + +def : Pat<(v2f32 (extract_subvec V4F32Regs:$src, 0)), + (Build_Vector2_f32 (V4f32Extract V4F32Regs:$src, 0), + (V4f32Extract V4F32Regs:$src, 1))>; +def : Pat<(v2f32 (extract_subvec V4F32Regs:$src, 2)), + (Build_Vector2_f32 (V4f32Extract V4F32Regs:$src, 2), + (V4f32Extract V4F32Regs:$src, 3))>; +def : Pat<(v2i32 (extract_subvec V4I32Regs:$src, 0)), + (Build_Vector2_i32 (V4i32Extract V4I32Regs:$src, 0), + (V4i32Extract V4I32Regs:$src, 1))>; +def : Pat<(v2i32 (extract_subvec V4I32Regs:$src, 2)), + (Build_Vector2_i32 (V4i32Extract V4I32Regs:$src, 2), + (V4i32Extract V4I32Regs:$src, 3))>; +def : Pat<(v2i16 (extract_subvec V4I16Regs:$src, 0)), + (Build_Vector2_i16 (V4i16Extract V4I16Regs:$src, 0), + (V4i16Extract V4I16Regs:$src, 1))>; +def : Pat<(v2i16 (extract_subvec V4I16Regs:$src, 2)), + (Build_Vector2_i16 (V4i16Extract V4I16Regs:$src, 2), + (V4i16Extract V4I16Regs:$src, 3))>; +def : Pat<(v2i8 (extract_subvec V4I8Regs:$src, 0)), + (Build_Vector2_i8 (V4i8Extract V4I8Regs:$src, 0), + (V4i8Extract V4I8Regs:$src, 1))>; +def : Pat<(v2i8 (extract_subvec V4I8Regs:$src, 2)), + (Build_Vector2_i8 (V4i8Extract V4I8Regs:$src, 2), + (V4i8Extract V4I8Regs:$src, 3))>; + +// Select instructions +class Select_OneLine<string type, string pos> { + string t1 = !strconcat("selp.", type); + string t2 = !strconcat(t1, " \t${dst}_"); + string t3 = !strconcat(t2, pos); + string t4 = !strconcat(t3, ", ${src1}_"); + string t5 = !strconcat(t4, pos); + string t6 = !strconcat(t5, ", ${src2}_"); + string t7 = !strconcat(t6, pos); + string s = !strconcat(t7, ", $p;"); +} + +class Select_Str2<string type> { + string t1 = Select_OneLine<type, "0">.s; + string t2 = !strconcat(t1, "\n\t"); + string s = !strconcat(t2, Select_OneLine<type, "1">.s); +} + +class Select_Str4<string type> { + string t1 = Select_OneLine<type, "0">.s; + string t2 = !strconcat(t1, "\n\t"); + string t3 = !strconcat(t2, Select_OneLine<type, "1">.s); + string t4 = !strconcat(t3, "\n\t"); + string t5 = !strconcat(t4, Select_OneLine<type, "2">.s); + string t6 = !strconcat(t5, "\n\t"); + string s = !strconcat(t6, Select_OneLine<type, "3">.s); + +} + +class Vec_Select<NVPTXRegClass vclass, string asmstr, NVPTXInst sop> + : NVPTXVecInst<(outs vclass:$dst), + (ins vclass:$src1, 
vclass:$src2, Int1Regs:$p), + asmstr, + [(set vclass:$dst, (select Int1Regs:$p, vclass:$src1, + vclass:$src2))], + sop>; + +let VecInstType=isVecOther.Value in { +def V2I64_Select : Vec_Select<V2I64Regs, Select_Str2<"b64">.s, SELECTi64rr>; +def V4I32_Select : Vec_Select<V4I32Regs, Select_Str4<"b32">.s, SELECTi32rr>; +def V2I32_Select : Vec_Select<V2I32Regs, Select_Str2<"b32">.s, SELECTi32rr>; +def V4I16_Select : Vec_Select<V4I16Regs, Select_Str4<"b16">.s, SELECTi16rr>; +def V2I16_Select : Vec_Select<V2I16Regs, Select_Str2<"b16">.s, SELECTi16rr>; +def V4I8_Select : Vec_Select<V4I8Regs, Select_Str4<"b16">.s, SELECTi8rr>; +def V2I8_Select : Vec_Select<V2I8Regs, Select_Str2<"b16">.s, SELECTi8rr>; + +def V2F64_Select : Vec_Select<V2F64Regs, Select_Str2<"f64">.s, SELECTf64rr>; +def V4F32_Select : Vec_Select<V4F32Regs, Select_Str4<"f32">.s, SELECTf32rr>; +def V2F32_Select : Vec_Select<V2F32Regs, Select_Str2<"f32">.s, SELECTf32rr>; +} + +// Comparison instructions + +// setcc convenience fragments. +def vsetoeq : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETOEQ)>; +def vsetogt : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETOGT)>; +def vsetoge : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETOGE)>; +def vsetolt : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETOLT)>; +def vsetole : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETOLE)>; +def vsetone : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETONE)>; +def vseto : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETO)>; +def vsetuo : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETUO)>; +def vsetueq : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETUEQ)>; +def vsetugt : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETUGT)>; +def vsetuge : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETUGE)>; +def vsetult : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETULT)>; +def vsetule : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETULE)>; +def vsetune : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETUNE)>; +def vseteq : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETEQ)>; +def vsetgt : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETGT)>; +def vsetge : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETGE)>; +def vsetlt : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETLT)>; +def vsetle : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETLE)>; +def vsetne : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETNE)>; + +class Vec_Compare<PatFrag op, NVPTXRegClass outrclass, NVPTXRegClass inrclass, + NVPTXInst sop> + : NVPTXVecInst<(outs outrclass:$dst), + (ins inrclass:$a, inrclass:$b), + "Unsupported", + [(set outrclass:$dst, (op inrclass:$a, inrclass:$b))], + sop>; + +multiclass Vec_Compare_All<PatFrag op, + NVPTXInst inst8, + NVPTXInst inst16, + NVPTXInst inst32, + NVPTXInst inst64> +{ + def V2I8 : Vec_Compare<op, V2I8Regs, V2I8Regs, inst8>; + def V4I8 : Vec_Compare<op, V4I8Regs, V4I8Regs, inst8>; + def V2I16 : Vec_Compare<op, V2I16Regs, V2I16Regs, inst16>; + def V4I16 : Vec_Compare<op, V4I16Regs, V4I16Regs, inst16>; + def V2I32 : Vec_Compare<op, V2I32Regs, V2I32Regs, inst32>; + def V4I32 : Vec_Compare<op, V4I32Regs, 
V4I32Regs, inst32>; + def V2I64 : Vec_Compare<op, V2I64Regs, V2I64Regs, inst64>; +} + +let VecInstType=isVecOther.Value in { + defm VecSGT : Vec_Compare_All<vsetgt, ISetSGTi8rr_toi8, ISetSGTi16rr_toi16, + ISetSGTi32rr_toi32, ISetSGTi64rr_toi64>; + defm VecUGT : Vec_Compare_All<vsetugt, ISetUGTi8rr_toi8, ISetUGTi16rr_toi16, + ISetUGTi32rr_toi32, ISetUGTi64rr_toi64>; + defm VecSLT : Vec_Compare_All<vsetlt, ISetSLTi8rr_toi8, ISetSLTi16rr_toi16, + ISetSLTi32rr_toi32, ISetSLTi64rr_toi64>; + defm VecULT : Vec_Compare_All<vsetult, ISetULTi8rr_toi8, ISetULTi16rr_toi16, + ISetULTi32rr_toi32, ISetULTi64rr_toi64>; + defm VecSGE : Vec_Compare_All<vsetge, ISetSGEi8rr_toi8, ISetSGEi16rr_toi16, + ISetSGEi32rr_toi32, ISetSGEi64rr_toi64>; + defm VecUGE : Vec_Compare_All<vsetuge, ISetUGEi8rr_toi8, ISetUGEi16rr_toi16, + ISetUGEi32rr_toi32, ISetUGEi64rr_toi64>; + defm VecSLE : Vec_Compare_All<vsetle, ISetSLEi8rr_toi8, ISetSLEi16rr_toi16, + ISetSLEi32rr_toi32, ISetSLEi64rr_toi64>; + defm VecULE : Vec_Compare_All<vsetule, ISetULEi8rr_toi8, ISetULEi16rr_toi16, + ISetULEi32rr_toi32, ISetULEi64rr_toi64>; + defm VecSEQ : Vec_Compare_All<vseteq, ISetSEQi8rr_toi8, ISetSEQi16rr_toi16, + ISetSEQi32rr_toi32, ISetSEQi64rr_toi64>; + defm VecUEQ : Vec_Compare_All<vsetueq, ISetUEQi8rr_toi8, ISetUEQi16rr_toi16, + ISetUEQi32rr_toi32, ISetUEQi64rr_toi64>; + defm VecSNE : Vec_Compare_All<vsetne, ISetSNEi8rr_toi8, ISetSNEi16rr_toi16, + ISetSNEi32rr_toi32, ISetSNEi64rr_toi64>; + defm VecUNE : Vec_Compare_All<vsetune, ISetUNEi8rr_toi8, ISetUNEi16rr_toi16, + ISetUNEi32rr_toi32, ISetUNEi64rr_toi64>; +} + +multiclass FVec_Compare_All<PatFrag op, + NVPTXInst instf32, + NVPTXInst instf64> +{ + def V2F32 : Vec_Compare<op, V2I32Regs, V2F32Regs, instf32>; + def V4F32 : Vec_Compare<op, V4I32Regs, V4F32Regs, instf32>; + def V2F64 : Vec_Compare<op, V2I64Regs, V2F64Regs, instf64>; +} + +let VecInstType=isVecOther.Value in { + defm FVecGT : FVec_Compare_All<vsetogt, FSetGTf32rr_toi32, + FSetGTf64rr_toi64>; + defm FVecLT : FVec_Compare_All<vsetolt, FSetLTf32rr_toi32, + FSetLTf64rr_toi64>; + defm FVecGE : FVec_Compare_All<vsetoge, FSetGEf32rr_toi32, + FSetGEf64rr_toi64>; + defm FVecLE : FVec_Compare_All<vsetole, FSetLEf32rr_toi32, + FSetLEf64rr_toi64>; + defm FVecEQ : FVec_Compare_All<vsetoeq, FSetEQf32rr_toi32, + FSetEQf64rr_toi64>; + defm FVecNE : FVec_Compare_All<vsetone, FSetNEf32rr_toi32, + FSetNEf64rr_toi64>; + + defm FVecUGT : FVec_Compare_All<vsetugt, FSetUGTf32rr_toi32, + FSetUGTf64rr_toi64>; + defm FVecULT : FVec_Compare_All<vsetult, FSetULTf32rr_toi32, + FSetULTf64rr_toi64>; + defm FVecUGE : FVec_Compare_All<vsetuge, FSetUGEf32rr_toi32, + FSetUGEf64rr_toi64>; + defm FVecULE : FVec_Compare_All<vsetule, FSetULEf32rr_toi32, + FSetULEf64rr_toi64>; + defm FVecUEQ : FVec_Compare_All<vsetueq, FSetUEQf32rr_toi32, + FSetUEQf64rr_toi64>; + defm FVecUNE : FVec_Compare_All<vsetune, FSetUNEf32rr_toi32, + FSetUNEf64rr_toi64>; + + defm FVecNUM : FVec_Compare_All<vseto, FSetNUMf32rr_toi32, + FSetNUMf64rr_toi64>; + defm FVecNAN : FVec_Compare_All<vsetuo, FSetNANf32rr_toi32, + FSetNANf64rr_toi64>; +} + +class LoadParamScalar4Inst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs regclass:$d1, regclass:$d2, regclass:$d3, regclass:$d4), + (ins i32imm:$a, i32imm:$b), + !strconcat(!strconcat("ld.param", opstr), + "\t{{$d1, $d2, $d3, $d4}}, [retval0+$b];"), []>; + +class LoadParamScalar2Inst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs regclass:$d1, regclass:$d2), + (ins i32imm:$a, i32imm:$b), + !strconcat(!strconcat("ld.param", opstr), 
+ "\t{{$d1, $d2}}, [retval0+$b];"), []>; + + +class StoreParamScalar4Inst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs), + (ins regclass:$s1, regclass:$s2, regclass:$s3, regclass:$s4, + i32imm:$a, i32imm:$b), + !strconcat(!strconcat("st.param", opstr), + "\t[param$a+$b], {{$s1, $s2, $s3, $s4}};"), []>; + +class StoreParamScalar2Inst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs), + (ins regclass:$s1, regclass:$s2, i32imm:$a, i32imm:$b), + !strconcat(!strconcat("st.param", opstr), + "\t[param$a+$b], {{$s1, $s2}};"), []>; + +class StoreRetvalScalar4Inst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs), + (ins regclass:$s1, regclass:$s2, regclass:$s3, regclass:$s4, + i32imm:$a), + !strconcat(!strconcat("st.param", opstr), + "\t[func_retval+$a], {{$s1, $s2, $s3, $s4}};"), []>; + +class StoreRetvalScalar2Inst<NVPTXRegClass regclass, string opstr> : + NVPTXInst<(outs), + (ins regclass:$s1, regclass:$s2, i32imm:$a), + !strconcat(!strconcat("st.param", opstr), + "\t[func_retval+$a], {{$s1, $s2}};"), []>; + +def LoadParamScalar4I32 : LoadParamScalar4Inst<Int32Regs, ".v4.b32">; +def LoadParamScalar4I16 : LoadParamScalar4Inst<Int16Regs, ".v4.b16">; +def LoadParamScalar4I8 : LoadParamScalar4Inst<Int8Regs, ".v4.b8">; + +def LoadParamScalar2I64 : LoadParamScalar2Inst<Int32Regs, ".v2.b64">; +def LoadParamScalar2I32 : LoadParamScalar2Inst<Int32Regs, ".v2.b32">; +def LoadParamScalar2I16 : LoadParamScalar2Inst<Int32Regs, ".v2.b16">; +def LoadParamScalar2I8 : LoadParamScalar2Inst<Int32Regs, ".v2.b8">; + +def LoadParamScalar4F32 : LoadParamScalar4Inst<Float32Regs, ".v4.f32">; +def LoadParamScalar2F32 : LoadParamScalar2Inst<Float32Regs, ".v2.f32">; +def LoadParamScalar2F64 : LoadParamScalar2Inst<Float64Regs, ".v2.f64">; + +def StoreParamScalar4I32 : StoreParamScalar4Inst<Int32Regs, ".v4.b32">; +def StoreParamScalar4I16 : StoreParamScalar4Inst<Int16Regs, ".v4.b16">; +def StoreParamScalar4I8 : StoreParamScalar4Inst<Int8Regs, ".v4.b8">; + +def StoreParamScalar2I64 : StoreParamScalar2Inst<Int64Regs, ".v2.b64">; +def StoreParamScalar2I32 : StoreParamScalar2Inst<Int32Regs, ".v2.b32">; +def StoreParamScalar2I16 : StoreParamScalar2Inst<Int16Regs, ".v2.b16">; +def StoreParamScalar2I8 : StoreParamScalar2Inst<Int8Regs, ".v2.b8">; + +def StoreParamScalar4F32 : StoreParamScalar4Inst<Float32Regs, ".v4.f32">; +def StoreParamScalar2F32 : StoreParamScalar2Inst<Float32Regs, ".v2.f32">; +def StoreParamScalar2F64 : StoreParamScalar2Inst<Float64Regs, ".v2.f64">; + +def StoreRetvalScalar4I32 : StoreRetvalScalar4Inst<Int32Regs, ".v4.b32">; +def StoreRetvalScalar4I16 : StoreRetvalScalar4Inst<Int16Regs, ".v4.b16">; +def StoreRetvalScalar4I8 : StoreRetvalScalar4Inst<Int8Regs, ".v4.b8">; + +def StoreRetvalScalar2I64 : StoreRetvalScalar2Inst<Int64Regs, ".v2.b64">; +def StoreRetvalScalar2I32 : StoreRetvalScalar2Inst<Int32Regs, ".v2.b32">; +def StoreRetvalScalar2I16 : StoreRetvalScalar2Inst<Int16Regs, ".v2.b16">; +def StoreRetvalScalar2I8 : StoreRetvalScalar2Inst<Int8Regs, ".v2.b8">; + +def StoreRetvalScalar4F32 : StoreRetvalScalar4Inst<Float32Regs, ".v4.f32">; +def StoreRetvalScalar2F32 : StoreRetvalScalar2Inst<Float32Regs, ".v2.f32">; +def StoreRetvalScalar2F64 : StoreRetvalScalar2Inst<Float64Regs, ".v2.f64">; + +class LoadParamVecInst<NVPTXRegClass regclass, string opstr, NVPTXInst sop=NOP>: + NVPTXVecInst<(outs regclass:$dst), (ins i32imm:$a, i32imm:$b), + "loadparam : $dst <- [$a, $b]", + [(set regclass:$dst, (LoadParam (i32 imm:$a), (i32 imm:$b)))], + sop>; + +class StoreParamVecInst<NVPTXRegClass 
regclass, string opstr, NVPTXInst sop=NOP> + : NVPTXVecInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b), + "storeparam : [$a, $b] <- $val", + [(StoreParam (i32 imm:$a), (i32 imm:$b), regclass:$val)], sop>; + +class StoreRetvalVecInst<NVPTXRegClass regclass, string opstr, + NVPTXInst sop=NOP> + : NVPTXVecInst<(outs), (ins regclass:$val, i32imm:$a), + "storeretval : retval[$a] <- $val", + [(StoreRetval (i32 imm:$a), regclass:$val)], sop>; + +let VecInstType=isVecLD.Value in { +def LoadParamV4I32 : LoadParamVecInst<V4I32Regs, ".v4.b32", + LoadParamScalar4I32>; +def LoadParamV4I16 : LoadParamVecInst<V4I16Regs, ".v4.b16", + LoadParamScalar4I16>; +def LoadParamV4I8 : LoadParamVecInst<V4I8Regs, ".v4.b8", + LoadParamScalar4I8>; + +def LoadParamV2I64 : LoadParamVecInst<V2I64Regs, ".v2.b64", + LoadParamScalar2I64>; +def LoadParamV2I32 : LoadParamVecInst<V2I32Regs, ".v2.b32", + LoadParamScalar2I32>; +def LoadParamV2I16 : LoadParamVecInst<V2I16Regs, ".v2.b16", + LoadParamScalar2I16>; +def LoadParamV2I8 : LoadParamVecInst<V2I8Regs, ".v2.b8", + LoadParamScalar2I8>; + +def LoadParamV4F32 : LoadParamVecInst<V4F32Regs, ".v4.f32", + LoadParamScalar4F32>; +def LoadParamV2F32 : LoadParamVecInst<V2F32Regs, ".v2.f32", + LoadParamScalar2F32>; +def LoadParamV2F64 : LoadParamVecInst<V2F64Regs, ".v2.f64", + LoadParamScalar2F64>; +} + +let VecInstType=isVecST.Value in { +def StoreParamV4I32 : StoreParamVecInst<V4I32Regs, ".v4.b32", + StoreParamScalar4I32>; +def StoreParamV4I16 : StoreParamVecInst<V4I16Regs, ".v4.b16", + StoreParamScalar4I16>; +def StoreParamV4I8 : StoreParamVecInst<V4I8Regs, ".v4.b8", + StoreParamScalar4I8>; + +def StoreParamV2I64 : StoreParamVecInst<V2I64Regs, ".v2.b64", + StoreParamScalar2I64>; +def StoreParamV2I32 : StoreParamVecInst<V2I32Regs, ".v2.b32", + StoreParamScalar2I32>; +def StoreParamV2I16 : StoreParamVecInst<V2I16Regs, ".v2.b16", + StoreParamScalar2I16>; +def StoreParamV2I8 : StoreParamVecInst<V2I8Regs, ".v2.b8", + StoreParamScalar2I8>; + +def StoreParamV4F32 : StoreParamVecInst<V4F32Regs, ".v4.f32", + StoreParamScalar4F32>; +def StoreParamV2F32 : StoreParamVecInst<V2F32Regs, ".v2.f32", + StoreParamScalar2F32>; +def StoreParamV2F64 : StoreParamVecInst<V2F64Regs, ".v2.f64", + StoreParamScalar2F64>; + +def StoreRetvalV4I32 : StoreRetvalVecInst<V4I32Regs, ".v4.b32", + StoreRetvalScalar4I32>; +def StoreRetvalV4I16 : StoreRetvalVecInst<V4I16Regs, ".v4.b16", + StoreRetvalScalar4I16>; +def StoreRetvalV4I8 : StoreRetvalVecInst<V4I8Regs, ".v4.b8", + StoreRetvalScalar4I8>; + +def StoreRetvalV2I64 : StoreRetvalVecInst<V2I64Regs, ".v2.b64", + StoreRetvalScalar2I64>; +def StoreRetvalV2I32 : StoreRetvalVecInst<V2I32Regs, ".v2.b32", + StoreRetvalScalar2I32>; +def StoreRetvalV2I16 : StoreRetvalVecInst<V2I16Regs, ".v2.b16", + StoreRetvalScalar2I16>; +def StoreRetvalV2I8 : StoreRetvalVecInst<V2I8Regs, ".v2.b8", + StoreRetvalScalar2I8>; + +def StoreRetvalV4F32 : StoreRetvalVecInst<V4F32Regs, ".v4.f32", + StoreRetvalScalar4F32>; +def StoreRetvalV2F32 : StoreRetvalVecInst<V2F32Regs, ".v2.f32", + StoreRetvalScalar2F32>; +def StoreRetvalV2F64 : StoreRetvalVecInst<V2F64Regs, ".v2.f64", + StoreRetvalScalar2F64>; + +} + + +// Int vector to int scalar bit convert +// v4i8 -> i32 +def : Pat<(i32 (bitconvert V4I8Regs:$s)), + (V4I8toI32 (V4i8Extract V4I8Regs:$s,0), (V4i8Extract V4I8Regs:$s,1), + (V4i8Extract V4I8Regs:$s,2), (V4i8Extract V4I8Regs:$s,3))>; +// v4i16 -> i64 +def : Pat<(i64 (bitconvert V4I16Regs:$s)), + (V4I16toI64 (V4i16Extract V4I16Regs:$s,0), + (V4i16Extract V4I16Regs:$s,1), + (V4i16Extract 
V4I16Regs:$s,2), + (V4i16Extract V4I16Regs:$s,3))>; +// v2i8 -> i16 +def : Pat<(i16 (bitconvert V2I8Regs:$s)), + (V2I8toI16 (V2i8Extract V2I8Regs:$s,0), (V2i8Extract V2I8Regs:$s,1))>; +// v2i16 -> i32 +def : Pat<(i32 (bitconvert V2I16Regs:$s)), + (V2I16toI32 (V2i16Extract V2I16Regs:$s,0), + (V2i16Extract V2I16Regs:$s,1))>; +// v2i32 -> i64 +def : Pat<(i64 (bitconvert V2I32Regs:$s)), + (V2I32toI64 (V2i32Extract V2I32Regs:$s,0), + (V2i32Extract V2I32Regs:$s,1))>; + +// Int scalar to int vector bit convert +let VecInstType=isVecDest.Value in { +// i32 -> v4i8 +def VecI32toV4I8 : NVPTXVecInst<(outs V4I8Regs:$d), (ins Int32Regs:$s), + "Error!", + [(set V4I8Regs:$d, (bitconvert Int32Regs:$s))], + I32toV4I8>; +// i64 -> v4i16 +def VecI64toV4I16 : NVPTXVecInst<(outs V4I16Regs:$d), (ins Int64Regs:$s), + "Error!", + [(set V4I16Regs:$d, (bitconvert Int64Regs:$s))], + I64toV4I16>; +// i16 -> v2i8 +def VecI16toV2I8 : NVPTXVecInst<(outs V2I8Regs:$d), (ins Int16Regs:$s), + "Error!", + [(set V2I8Regs:$d, (bitconvert Int16Regs:$s))], + I16toV2I8>; +// i32 -> v2i16 +def VecI32toV2I16 : NVPTXVecInst<(outs V2I16Regs:$d), (ins Int32Regs:$s), + "Error!", + [(set V2I16Regs:$d, (bitconvert Int32Regs:$s))], + I32toV2I16>; +// i64 -> v2i32 +def VecI64toV2I32 : NVPTXVecInst<(outs V2I32Regs:$d), (ins Int64Regs:$s), + "Error!", + [(set V2I32Regs:$d, (bitconvert Int64Regs:$s))], + I64toV2I32>; +} + +// Int vector to int vector bit convert +// v4i8 -> v2i16 +def : Pat<(v2i16 (bitconvert V4I8Regs:$s)), + (VecI32toV2I16 + (V4I8toI32 (V4i8Extract V4I8Regs:$s,0), (V4i8Extract V4I8Regs:$s,1), + (V4i8Extract V4I8Regs:$s,2), (V4i8Extract V4I8Regs:$s,3)))>; +// v4i16 -> v2i32 +def : Pat<(v2i32 (bitconvert V4I16Regs:$s)), + (VecI64toV2I32 + (V4I16toI64 (V4i16Extract V4I16Regs:$s,0), (V4i16Extract V4I16Regs:$s,1), + (V4i16Extract V4I16Regs:$s,2), (V4i16Extract V4I16Regs:$s,3)))>; +// v2i16 -> v4i8 +def : Pat<(v4i8 (bitconvert V2I16Regs:$s)), + (VecI32toV4I8 + (V2I16toI32 (V2i16Extract V2I16Regs:$s,0), (V2i16Extract V2I16Regs:$s,1)))>; +// v2i32 -> v4i16 +def : Pat<(v4i16 (bitconvert V2I32Regs:$s)), + (VecI64toV4I16 + (V2I32toI64 (V2i32Extract V2I32Regs:$s,0), (V2i32Extract V2I32Regs:$s,1)))>; +// v2i64 -> v4i32 +def : Pat<(v4i32 (bitconvert V2I64Regs:$s)), + (Build_Vector4_i32 + (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 0)), 0), + (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 0)), 1), + (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 1)), 0), + (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 1)), 1))>; +// v4i32 -> v2i64 +def : Pat<(v2i64 (bitconvert V4I32Regs:$s)), + (Build_Vector2_i64 + (V2I32toI64 (V4i32Extract V4I32Regs:$s,0), (V4i32Extract V4I32Regs:$s,1)), + (V2I32toI64 (V4i32Extract V4I32Regs:$s,2), (V4i32Extract V4I32Regs:$s,3)))>; + +// Fp scalar to fp vector convert +// f64 -> v2f32 +let VecInstType=isVecDest.Value in { +def VecF64toV2F32 : NVPTXVecInst<(outs V2F32Regs:$d), (ins Float64Regs:$s), + "Error!", + [(set V2F32Regs:$d, (bitconvert Float64Regs:$s))], + F64toV2F32>; +} + +// Fp vector to fp scalar convert +// v2f32 -> f64 +def : Pat<(f64 (bitconvert V2F32Regs:$s)), + (V2F32toF64 (V2f32Extract V2F32Regs:$s,0), (V2f32Extract V2F32Regs:$s,1))>; + +// Fp scalar to int vector convert +// f32 -> v4i8 +def : Pat<(v4i8 (bitconvert Float32Regs:$s)), + (VecI32toV4I8 (BITCONVERT_32_F2I Float32Regs:$s))>; +// f32 -> v2i16 +def : Pat<(v2i16 (bitconvert Float32Regs:$s)), + (VecI32toV2I16 (BITCONVERT_32_F2I Float32Regs:$s))>; +// f64 -> v4i16 +def : Pat<(v4i16 (bitconvert 
Float64Regs:$s)), + (VecI64toV4I16 (BITCONVERT_64_F2I Float64Regs:$s))>; +// f64 -> v2i32 +def : Pat<(v2i32 (bitconvert Float64Regs:$s)), + (VecI64toV2I32 (BITCONVERT_64_F2I Float64Regs:$s))>; + +// Int vector to fp scalar convert +// v4i8 -> f32 +def : Pat<(f32 (bitconvert V4I8Regs:$s)), + (BITCONVERT_32_I2F + (V4I8toI32 (V4i8Extract V4I8Regs:$s,0), (V4i8Extract V4I8Regs:$s,1), + (V4i8Extract V4I8Regs:$s,2), (V4i8Extract V4I8Regs:$s,3)))>; +// v4i16 -> f64 +def : Pat<(f64 (bitconvert V4I16Regs:$s)), + (BITCONVERT_64_I2F + (V4I16toI64 (V4i16Extract V4I16Regs:$s,0), (V4i16Extract V4I16Regs:$s,1), + (V4i16Extract V4I16Regs:$s,2), (V4i16Extract V4I16Regs:$s,3)))>; +// v2i16 -> f32 +def : Pat<(f32 (bitconvert V2I16Regs:$s)), + (BITCONVERT_32_I2F + (V2I16toI32 (V2i16Extract V2I16Regs:$s,0), (V2i16Extract V2I16Regs:$s,1)))>; +// v2i32 -> f64 +def : Pat<(f64 (bitconvert V2I32Regs:$s)), + (BITCONVERT_64_I2F + (V2I32toI64 (V2i32Extract V2I32Regs:$s,0), (V2i32Extract V2I32Regs:$s,1)))>; + +// Int scalar to fp vector convert +// i64 -> v2f32 +def : Pat<(v2f32 (bitconvert Int64Regs:$s)), + (VecF64toV2F32 (BITCONVERT_64_I2F Int64Regs:$s))>; + +// Fp vector to int scalar convert +// v2f32 -> i64 +def : Pat<(i64 (bitconvert V2F32Regs:$s)), + (BITCONVERT_64_F2I + (V2F32toF64 (V2f32Extract V2F32Regs:$s,0), (V2f32Extract V2F32Regs:$s,1)))>; + +// Int vector to fp vector convert +// v2i64 -> v4f32 +def : Pat<(v4f32 (bitconvert V2I64Regs:$s)), + (Build_Vector4_f32 + (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32 + (V2i64Extract V2I64Regs:$s, 0)), 0)), + (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32 + (V2i64Extract V2I64Regs:$s, 0)), 1)), + (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32 + (V2i64Extract V2I64Regs:$s, 1)), 0)), + (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32 + (V2i64Extract V2I64Regs:$s, 1)), 1)))>; +// v2i64 -> v2f64 +def : Pat<(v2f64 (bitconvert V2I64Regs:$s)), + (Build_Vector2_f64 + (BITCONVERT_64_I2F (V2i64Extract V2I64Regs:$s,0)), + (BITCONVERT_64_I2F (V2i64Extract V2I64Regs:$s,1)))>; +// v2i32 -> v2f32 +def : Pat<(v2f32 (bitconvert V2I32Regs:$s)), + (Build_Vector2_f32 + (BITCONVERT_32_I2F (V2i32Extract V2I32Regs:$s,0)), + (BITCONVERT_32_I2F (V2i32Extract V2I32Regs:$s,1)))>; +// v4i32 -> v2f64 +def : Pat<(v2f64 (bitconvert V4I32Regs:$s)), + (Build_Vector2_f64 + (BITCONVERT_64_I2F (V2I32toI64 (V4i32Extract V4I32Regs:$s,0), + (V4i32Extract V4I32Regs:$s,1))), + (BITCONVERT_64_I2F (V2I32toI64 (V4i32Extract V4I32Regs:$s,2), + (V4i32Extract V4I32Regs:$s,3))))>; +// v4i32 -> v4f32 +def : Pat<(v4f32 (bitconvert V4I32Regs:$s)), + (Build_Vector4_f32 + (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,0)), + (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,1)), + (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,2)), + (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,3)))>; +// v4i16 -> v2f32 +def : Pat<(v2f32 (bitconvert V4I16Regs:$s)), + (VecF64toV2F32 (BITCONVERT_64_I2F + (V4I16toI64 (V4i16Extract V4I16Regs:$s,0), + (V4i16Extract V4I16Regs:$s,1), + (V4i16Extract V4I16Regs:$s,2), + (V4i16Extract V4I16Regs:$s,3))))>; + +// Fp vector to int vector convert +// v2i64 <- v4f32 +def : Pat<(v2i64 (bitconvert V4F32Regs:$s)), + (Build_Vector2_i64 + (BITCONVERT_64_F2I (V2F32toF64 (V4f32Extract V4F32Regs:$s,0), + (V4f32Extract V4F32Regs:$s,1))), + (BITCONVERT_64_F2I (V2F32toF64 (V4f32Extract V4F32Regs:$s,2), + (V4f32Extract V4F32Regs:$s,3))))>; +// v2i64 <- v2f64 +def : Pat<(v2i64 (bitconvert V2F64Regs:$s)), + (Build_Vector2_i64 + (BITCONVERT_64_F2I (V2f64Extract V2F64Regs:$s,0)), + (BITCONVERT_64_F2I (V2f64Extract 
V2F64Regs:$s,1)))>; +// v2i32 <- v2f32 +def : Pat<(v2i32 (bitconvert V2F32Regs:$s)), + (Build_Vector2_i32 + (BITCONVERT_32_F2I (V2f32Extract V2F32Regs:$s,0)), + (BITCONVERT_32_F2I (V2f32Extract V2F32Regs:$s,1)))>; +// v4i32 <- v2f64 +def : Pat<(v4i32 (bitconvert V2F64Regs:$s)), + (Build_Vector4_i32 + (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32 + (V2f64Extract V2F64Regs:$s, 0)), 0)), + (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32 + (V2f64Extract V2F64Regs:$s, 0)), 1)), + (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32 + (V2f64Extract V2F64Regs:$s, 1)), 0)), + (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32 + (V2f64Extract V2F64Regs:$s, 1)), 1)))>; +// v4i32 <- v4f32 +def : Pat<(v4i32 (bitconvert V4F32Regs:$s)), + (Build_Vector4_i32 + (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,0)), + (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,1)), + (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,2)), + (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,3)))>; +// v4i16 <- v2f32 +def : Pat<(v4i16 (bitconvert V2F32Regs:$s)), + (VecI64toV4I16 (BITCONVERT_64_F2I + (V2F32toF64 (V2f32Extract V2F32Regs:$s,0), + (V2f32Extract V2F32Regs:$s,1))))>; diff --git a/lib/Target/NVPTX/NVPTXutil.cpp b/lib/Target/NVPTX/NVPTXutil.cpp new file mode 100644 index 0000000..6a0e532 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXutil.cpp @@ -0,0 +1,92 @@ +//===-- NVPTXutil.cpp - Functions exported to CodeGen --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the functions that can be used in CodeGen. +// +//===----------------------------------------------------------------------===// + +#include "NVPTXutil.h" +#include "NVPTX.h" + +using namespace llvm; + +namespace llvm { + +bool isParamLoad(const MachineInstr *MI) +{ + if ((MI->getOpcode() != NVPTX::LD_i32_avar) && + (MI->getOpcode() != NVPTX::LD_i64_avar)) + return false; + if (MI->getOperand(2).isImm() == false) + return false; + if (MI->getOperand(2).getImm() != NVPTX::PTXLdStInstCode::PARAM) + return false; + return true; +} + +#define DATA_MASK 0x7f +#define DIGIT_WIDTH 7 +#define MORE_BYTES 0x80 + +static int encode_leb128(uint64_t val, int *nbytes, + char *space, int splen) +{ + char *a; + char *end = space + splen; + + a = space; + do { + unsigned char uc; + + if (a >= end) + return 1; + uc = val & DATA_MASK; + val >>= DIGIT_WIDTH; + if (val != 0) + uc |= MORE_BYTES; + *a = uc; + a++; + } while (val); + *nbytes = a - space; + return 0; +} + +#undef DATA_MASK +#undef DIGIT_WIDTH +#undef MORE_BYTES + +uint64_t encode_leb128(const char *str) +{ + union { uint64_t x; char a[8]; } temp64; + + temp64.x = 0; + + for (unsigned i=0,e=strlen(str); i!=e; ++i) + temp64.a[i] = str[e-1-i]; + + char encoded[16]; + int nbytes; + + int retval = encode_leb128(temp64.x, &nbytes, encoded, 16); + + (void)retval; + assert(retval == 0 && + "Encoding to leb128 failed"); + + assert(nbytes <= 8 && + "Cannot support register names with leb128 encoding > 8 bytes"); + + temp64.x = 0; + for (int i=0; i<nbytes; ++i) + temp64.a[i] = encoded[i]; + + return temp64.x; +} + +} // end namespace llvm diff --git a/lib/Target/NVPTX/NVPTXutil.h b/lib/Target/NVPTX/NVPTXutil.h new file mode 100644 index 0000000..d1d1171 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXutil.h @@ -0,0 +1,25 @@ +//===-- NVPTXutil.h - Functions exported to CodeGen --*- C++ -*-===// +// +// The LLVM 
Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the functions that can be used in CodeGen. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_NVPTX_UTIL_H +#define LLVM_TARGET_NVPTX_UTIL_H + +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" + +namespace llvm { +bool isParamLoad(const MachineInstr *); +uint64_t encode_leb128(const char *str); +} + +#endif diff --git a/lib/Target/NVPTX/TargetInfo/CMakeLists.txt b/lib/Target/NVPTX/TargetInfo/CMakeLists.txt new file mode 100644 index 0000000..0bf1334 --- /dev/null +++ b/lib/Target/NVPTX/TargetInfo/CMakeLists.txt @@ -0,0 +1,7 @@ +#include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +add_llvm_library(LLVMNVPTXInfo + NVPTXTargetInfo.cpp + ) + +add_dependencies(LLVMNVPTXInfo NVPTXCommonTableGen) diff --git a/lib/Target/PTX/TargetInfo/LLVMBuild.txt b/lib/Target/NVPTX/TargetInfo/LLVMBuild.txt index 2cc30c4..ef12b0e 100644 --- a/lib/Target/PTX/TargetInfo/LLVMBuild.txt +++ b/lib/Target/NVPTX/TargetInfo/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/PTX/TargetInfo/LLVMBuild.txt ----------------*- Conf -*--===; +;===- ./lib/Target/NVPTX/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; @@ -17,7 +17,7 @@ [component_0] type = Library -name = PTXInfo -parent = PTX +name = NVPTXInfo +parent = NVPTX required_libraries = MC Support Target -add_to_library_groups = PTX +add_to_library_groups = NVPTX diff --git a/lib/Target/PTX/TargetInfo/Makefile b/lib/Target/NVPTX/TargetInfo/Makefile index 8619785..8622315 100644 --- a/lib/Target/PTX/TargetInfo/Makefile +++ b/lib/Target/NVPTX/TargetInfo/Makefile @@ -1,4 +1,4 @@ -##===- lib/Target/PTX/TargetInfo/Makefile ------------------*- Makefile -*-===## +##===- lib/Target/NVPTX/TargetInfo/Makefile ----------------*- Makefile -*-===## # # The LLVM Compiler Infrastructure # @@ -7,7 +7,7 @@ # ##===----------------------------------------------------------------------===## LEVEL = ../../../.. -LIBRARYNAME = LLVMPTXInfo +LIBRARYNAME = LLVMNVPTXInfo # Hack: we need to include 'main' target directory to grab private headers CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. diff --git a/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp b/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp new file mode 100644 index 0000000..f3624b9 --- /dev/null +++ b/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp @@ -0,0 +1,23 @@ +//===-- NVPTXTargetInfo.cpp - NVPTX Target Implementation -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheNVPTXTarget32;
+Target llvm::TheNVPTXTarget64;
+
+extern "C" void LLVMInitializeNVPTXTargetInfo() {
+  RegisterTarget<Triple::nvptx> X(TheNVPTXTarget32, "nvptx",
+                                  "NVIDIA PTX 32-bit");
+  RegisterTarget<Triple::nvptx64> Y(TheNVPTXTarget64, "nvptx64",
+                                    "NVIDIA PTX 64-bit");
+}
diff --git a/lib/Target/NVPTX/VectorElementize.cpp b/lib/Target/NVPTX/VectorElementize.cpp
new file mode 100644
index 0000000..8043e2d
--- /dev/null
+++ b/lib/Target/NVPTX/VectorElementize.cpp
@@ -0,0 +1,1248 @@
+//===-- VectorElementize.cpp - Convert vector ops to scalar ops ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass converts operations on vector types to operations on their
+// element types.
+//
+// For generic binary and unary vector instructions, the conversion is simple.
+// Suppose we have
+// av = bv Vop cv
+// where av, bv, and cv are vector virtual registers, and Vop is a vector op.
+// This gets converted to the following:
+// a1 = b1 Sop c1
+// a2 = b2 Sop c2
+//
+// VectorToScalarMap maintains the vector vreg to scalar vreg mapping.
+// For the above example, the map will look as follows:
+// av => [a1, a2]
+// bv => [b1, b2]
+//
+// In addition, getScalarVersion (below) implements the opcode->opcode map.
+// Vop => Sop
+// OtherVop => OtherSop
+// ...
+//
+// For vector-specific instructions like vecbuild, vecshuffle, etc., the
+// conversion is different. Look at the comments near the functions with
+// prefix createVec<...>.
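+//
+// As a concrete instance of the opcode map, a v2i32 add
+// av = VAddV2I32 bv, cv
+// is scalarized (see getScalarVersion) into two ADDi32rr instructions:
+// a1 = ADDi32rr b1, c1
+// a2 = ADDi32rr b2, c2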
+// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/Passes.h" +#include "llvm/Constant.h" +#include "llvm/Instructions.h" +#include "llvm/Function.h" +#include "llvm/Pass.h" +#include "llvm/Type.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/CFG.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "NVPTX.h" +#include "NVPTXTargetMachine.h" + +using namespace llvm; + +namespace { + +class LLVM_LIBRARY_VISIBILITY VectorElementize : public MachineFunctionPass { + virtual bool runOnMachineFunction(MachineFunction &F); + + NVPTXTargetMachine &TM; + MachineRegisterInfo *MRI; + const NVPTXRegisterInfo *RegInfo; + const NVPTXInstrInfo *InstrInfo; + + llvm::DenseMap<const TargetRegisterClass *, const TargetRegisterClass *> + RegClassMap; + llvm::DenseMap<unsigned, bool> SimpleMoveMap; + + llvm::DenseMap<unsigned, SmallVector<unsigned, 4> > VectorToScalarMap; + + bool isVectorInstr(MachineInstr *); + + SmallVector<unsigned, 4> getScalarRegisters(unsigned); + unsigned getScalarVersion(unsigned); + unsigned getScalarVersion(MachineInstr *); + + bool isVectorRegister(unsigned); + const TargetRegisterClass *getScalarRegClass(const TargetRegisterClass *RC); + unsigned numCopiesNeeded(MachineInstr *); + + void createLoadCopy(MachineFunction&, MachineInstr *, + std::vector<MachineInstr *>&); + void createStoreCopy(MachineFunction&, MachineInstr *, + std::vector<MachineInstr *>&); + + void createVecDest(MachineFunction&, MachineInstr *, + std::vector<MachineInstr *>&); + + void createCopies(MachineFunction&, MachineInstr *, + std::vector<MachineInstr *>&); + + unsigned copyProp(MachineFunction&); + unsigned removeDeadMoves(MachineFunction&); + + void elementize(MachineFunction&); + + bool isSimpleMove(MachineInstr *); + + void createVecShuffle(MachineFunction& F, MachineInstr *Instr, + std::vector<MachineInstr *>& copies); + + void createVecExtract(MachineFunction& F, MachineInstr *Instr, + std::vector<MachineInstr *>& copies); + + void createVecInsert(MachineFunction& F, MachineInstr *Instr, + std::vector<MachineInstr *>& copies); + + void createVecBuild(MachineFunction& F, MachineInstr *Instr, + std::vector<MachineInstr *>& copies); + +public: + + static char ID; // Pass identification, replacement for typeid + VectorElementize(NVPTXTargetMachine &tm) + : MachineFunctionPass(ID), TM(tm) {} + + virtual const char *getPassName() const { + return "Convert LLVM vector types to their element types"; + } +}; + +char VectorElementize::ID = 1; +} + +static cl::opt<bool> +RemoveRedundantMoves("nvptx-remove-redundant-moves", + cl::desc("NVPTX: Remove redundant moves introduced by vector lowering"), + cl::init(true)); + +#define VECINST(x) ((((x)->getDesc().TSFlags) & NVPTX::VecInstTypeMask) \ + >> NVPTX::VecInstTypeShift) +#define ISVECINST(x) (VECINST(x) != NVPTX::VecNOP) +#define ISVECLOAD(x) (VECINST(x) == NVPTX::VecLoad) +#define ISVECSTORE(x) (VECINST(x) == NVPTX::VecStore) +#define ISVECBUILD(x) (VECINST(x) == NVPTX::VecBuild) +#define ISVECSHUFFLE(x) (VECINST(x) == NVPTX::VecShuffle) +#define ISVECEXTRACT(x) (VECINST(x) == NVPTX::VecExtract) +#define ISVECINSERT(x) (VECINST(x) == NVPTX::VecInsert) +#define ISVECDEST(x) (VECINST(x) == 
NVPTX::VecDest)
+
+bool VectorElementize::isSimpleMove(MachineInstr *mi) {
+  if (mi->isCopy())
+    return true;
+  unsigned TSFlags = (mi->getDesc().TSFlags & NVPTX::SimpleMoveMask)
+      >> NVPTX::SimpleMoveShift;
+  return (TSFlags == 1);
+}
+
+bool VectorElementize::isVectorInstr(MachineInstr *mi) {
+  if ((mi->getOpcode() == NVPTX::PHI) ||
+      (mi->getOpcode() == NVPTX::IMPLICIT_DEF) || mi->isCopy()) {
+    MachineOperand dest = mi->getOperand(0);
+    return isVectorRegister(dest.getReg());
+  }
+  return ISVECINST(mi);
+}
+
+unsigned VectorElementize::getScalarVersion(MachineInstr *mi) {
+  return getScalarVersion(mi->getOpcode());
+}
+
+///=============================================================================
+///Instr is assumed to be a vector instruction. For most vector instructions,
+///the size of the destination vector register gives the number of scalar
+///copies needed. For VecStore, the size of getOperand(0), the stored vector,
+///gives the number of scalar copies needed. For VecExtract, the dest is a
+///scalar, so the size of getOperand(1), the source vector, gives the number
+///of scalar copies needed.
+///=============================================================================
+unsigned VectorElementize::numCopiesNeeded(MachineInstr *Instr) {
+  unsigned numDefs=0;
+  unsigned def;
+  for (unsigned i=0, e=Instr->getNumOperands(); i!=e; ++i) {
+    MachineOperand oper = Instr->getOperand(i);
+
+    if (!oper.isReg()) continue;
+    if (!oper.isDef()) continue;
+    def = i;
+    numDefs++;
+  }
+  assert((numDefs <= 1) && "Only 0 or 1 defs supported");
+
+  if (numDefs == 1) {
+    unsigned regnum = Instr->getOperand(def).getReg();
+    if (ISVECEXTRACT(Instr))
+      regnum = Instr->getOperand(1).getReg();
+    return getNVPTXVectorSize(MRI->getRegClass(regnum));
+  }
+  else if (numDefs == 0) {
+    assert(ISVECSTORE(Instr)
+           && "Only 0 def instruction supported is vector store");
+
+    unsigned regnum = Instr->getOperand(0).getReg();
+    return getNVPTXVectorSize(MRI->getRegClass(regnum));
+  }
+  return 1;
+}
+
+const TargetRegisterClass *VectorElementize::
+getScalarRegClass(const TargetRegisterClass *RC) {
+  assert(isNVPTXVectorRegClass(RC) &&
+         "Not a vector register class");
+  return getNVPTXElemClass(RC);
+}
+
+bool VectorElementize::isVectorRegister(unsigned reg) {
+  const TargetRegisterClass *RC=MRI->getRegClass(reg);
+  return isNVPTXVectorRegClass(RC);
+}
+
+///=============================================================================
+///For every vector register 'v' that is not already in the VectorToScalarMap,
+///create n scalar registers of the corresponding element type, where n
+///is 2 or 4 (getNVPTXVectorSize), and add them to VectorToScalarMap.
+///=============================================================================
+SmallVector<unsigned, 4> VectorElementize::getScalarRegisters(unsigned regnum) {
+  assert(isVectorRegister(regnum) && "Expecting a vector register here");
+  // Create the scalar registers and put them in the map, if not already there.
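+  // For example, the first time a V4F32Regs vreg reaches this point, four
+  // fresh Float32Regs vregs are created and cached; any later query for the
+  // same vector vreg returns the same four scalars, so every user of the
+  // vector sees a consistent mapping.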
+ if (VectorToScalarMap.find(regnum) == VectorToScalarMap.end()) { + const TargetRegisterClass *vecClass = MRI->getRegClass(regnum); + const TargetRegisterClass *scalarClass = getScalarRegClass(vecClass); + + SmallVector<unsigned, 4> temp; + + for (unsigned i=0, e=getNVPTXVectorSize(vecClass); i!=e; ++i) + temp.push_back(MRI->createVirtualRegister(scalarClass)); + + VectorToScalarMap[regnum] = temp; + } + return VectorToScalarMap[regnum]; +} + +///============================================================================= +///For a vector load of the form +///va <= ldv2 [addr] +///the following multi output instruction is created : +///[v1, v2] <= LD [addr] +///Look at NVPTXVector.td for the definitions of multi output loads. +///============================================================================= +void VectorElementize::createLoadCopy(MachineFunction& F, MachineInstr *Instr, + std::vector<MachineInstr *>& copies) { + copies.push_back(F.CloneMachineInstr(Instr)); + + MachineInstr *copy=copies[0]; + copy->setDesc(InstrInfo->get(getScalarVersion(copy))); + + // Remove the dest, that should be a vector operand. + MachineOperand dest = copy->getOperand(0); + unsigned regnum = dest.getReg(); + + SmallVector<unsigned, 4> scalarRegs = getScalarRegisters(regnum); + copy->RemoveOperand(0); + + std::vector<MachineOperand> otherOperands; + for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i) + otherOperands.push_back(copy->getOperand(i)); + + for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i) + copy->RemoveOperand(0); + + for (unsigned i=0, e=scalarRegs.size(); i!=e; ++i) { + copy->addOperand(MachineOperand::CreateReg(scalarRegs[i], true)); + } + + for (unsigned i=0, e=otherOperands.size(); i!=e; ++i) + copy->addOperand(otherOperands[i]); + +} + +///============================================================================= +///For a vector store of the form +///stv2 va, [addr] +///the following multi input instruction is created : +///ST v1, v2, [addr] +///Look at NVPTXVector.td for the definitions of multi input stores. +///============================================================================= +void VectorElementize::createStoreCopy(MachineFunction& F, MachineInstr *Instr, + std::vector<MachineInstr *>& copies) { + copies.push_back(F.CloneMachineInstr(Instr)); + + MachineInstr *copy=copies[0]; + copy->setDesc(InstrInfo->get(getScalarVersion(copy))); + + MachineOperand src = copy->getOperand(0); + unsigned regnum = src.getReg(); + + SmallVector<unsigned, 4> scalarRegs = getScalarRegisters(regnum); + copy->RemoveOperand(0); + + std::vector<MachineOperand> otherOperands; + for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i) + otherOperands.push_back(copy->getOperand(i)); + + for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i) + copy->RemoveOperand(0); + + for (unsigned i=0, e=scalarRegs.size(); i!=e; ++i) + copy->addOperand(MachineOperand::CreateReg(scalarRegs[i], false)); + + for (unsigned i=0, e=otherOperands.size(); i!=e; ++i) + copy->addOperand(otherOperands[i]); +} + +///============================================================================= +///va <= shufflev2 vb, vc, <i1>, <i2> +///gets converted to 2 moves into a1 and a2. The source of the moves depend on +///i1 and i2. i1, i2 can belong to the set {0, 1, 2, 3} for shufflev2. For +///shufflev4 the set is {0,..7}. 
For example, if i1=3, i2=0, the move +///instructions will be +///a1 <= c2 +///a2 <= b1 +///============================================================================= +void VectorElementize::createVecShuffle(MachineFunction& F, MachineInstr *Instr, + std::vector<MachineInstr *>& copies) { + unsigned numcopies=numCopiesNeeded(Instr); + + unsigned destregnum = Instr->getOperand(0).getReg(); + unsigned src1regnum = Instr->getOperand(1).getReg(); + unsigned src2regnum = Instr->getOperand(2).getReg(); + + SmallVector<unsigned, 4> dest = getScalarRegisters(destregnum); + SmallVector<unsigned, 4> src1 = getScalarRegisters(src1regnum); + SmallVector<unsigned, 4> src2 = getScalarRegisters(src2regnum); + + DebugLoc DL = Instr->getDebugLoc(); + + for (unsigned i=0; i<numcopies; i++) { + MachineInstr *copy = BuildMI(F, DL, + InstrInfo->get(getScalarVersion(Instr)), dest[i]); + MachineOperand which=Instr->getOperand(3+i); + assert(which.isImm() && "Shuffle operand not a constant"); + + int src=which.getImm(); + int elem=src%numcopies; + + if (which.getImm() < numcopies) + copy->addOperand(MachineOperand::CreateReg(src1[elem], false)); + else + copy->addOperand(MachineOperand::CreateReg(src2[elem], false)); + copies.push_back(copy); + } +} + +///============================================================================= +///a <= extractv2 va, <i1> +///gets turned into a simple move to the scalar register a. The source depends +///on i1. +///============================================================================= +void VectorElementize::createVecExtract(MachineFunction& F, MachineInstr *Instr, + std::vector<MachineInstr *>& copies) { + unsigned srcregnum = Instr->getOperand(1).getReg(); + + SmallVector<unsigned, 4> src = getScalarRegisters(srcregnum); + + MachineOperand which = Instr->getOperand(2); + assert(which.isImm() && "Extract operand not a constant"); + + DebugLoc DL = Instr->getDebugLoc(); + + MachineInstr *copy = BuildMI(F, DL, InstrInfo->get(getScalarVersion(Instr)), + Instr->getOperand(0).getReg()); + copy->addOperand(MachineOperand::CreateReg(src[which.getImm()], false)); + + copies.push_back(copy); +} + +///============================================================================= +///va <= vecinsertv2 vb, c, <i1> +///This instruction copies all elements of vb to va, except the 'i1'th element. +///The scalar value c becomes the 'i1'th element of va. +///This gets translated to 2 (4 for vecinsertv4) moves. 
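+///For example,
+///va <= vecinsertv2 vb, c, <1>
+///becomes
+///a1 <= b1
+///a2 <= c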
+///============================================================================= +void VectorElementize::createVecInsert(MachineFunction& F, MachineInstr *Instr, + std::vector<MachineInstr *>& copies) { + unsigned numcopies=numCopiesNeeded(Instr); + + unsigned destregnum = Instr->getOperand(0).getReg(); + unsigned srcregnum = Instr->getOperand(1).getReg(); + + SmallVector<unsigned, 4> dest = getScalarRegisters(destregnum); + SmallVector<unsigned, 4> src = getScalarRegisters(srcregnum); + + MachineOperand which=Instr->getOperand(3); + assert(which.isImm() && "Insert operand not a constant"); + unsigned int elem=which.getImm(); + + DebugLoc DL = Instr->getDebugLoc(); + + for (unsigned i=0; i<numcopies; i++) { + MachineInstr *copy = BuildMI(F, DL, + InstrInfo->get(getScalarVersion(Instr)), dest[i]); + + if (i != elem) + copy->addOperand(MachineOperand::CreateReg(src[i], false)); + else + copy->addOperand(Instr->getOperand(2)); + + copies.push_back(copy); + } + +} + +///============================================================================= +///va <= buildv2 b1, b2 +///gets translated to +///a1 <= b1 +///a2 <= b2 +///============================================================================= +void VectorElementize::createVecBuild(MachineFunction& F, MachineInstr *Instr, + std::vector<MachineInstr *>& copies) { + unsigned numcopies=numCopiesNeeded(Instr); + + unsigned destregnum = Instr->getOperand(0).getReg(); + + SmallVector<unsigned, 4> dest = getScalarRegisters(destregnum); + + DebugLoc DL = Instr->getDebugLoc(); + + for (unsigned i=0; i<numcopies; i++) { + MachineInstr *copy = BuildMI(F, DL, + InstrInfo->get(getScalarVersion(Instr)), dest[i]); + + copy->addOperand(Instr->getOperand(1+i)); + + copies.push_back(copy); + } + +} + +///============================================================================= +///For a tex inst of the form +///va <= op [scalar operands] +///the following multi output instruction is created : +///[v1, v2] <= op' [scalar operands] +///============================================================================= +void VectorElementize::createVecDest(MachineFunction& F, MachineInstr *Instr, + std::vector<MachineInstr *>& copies) { + copies.push_back(F.CloneMachineInstr(Instr)); + + MachineInstr *copy=copies[0]; + copy->setDesc(InstrInfo->get(getScalarVersion(copy))); + + // Remove the dest, that should be a vector operand. + MachineOperand dest = copy->getOperand(0); + unsigned regnum = dest.getReg(); + + SmallVector<unsigned, 4> scalarRegs = getScalarRegisters(regnum); + copy->RemoveOperand(0); + + std::vector<MachineOperand> otherOperands; + for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i) + otherOperands.push_back(copy->getOperand(i)); + + for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i) + copy->RemoveOperand(0); + + for (unsigned i=0, e=scalarRegs.size(); i!=e; ++i) + copy->addOperand(MachineOperand::CreateReg(scalarRegs[i], true)); + + for (unsigned i=0, e=otherOperands.size(); i!=e; ++i) + copy->addOperand(otherOperands[i]); +} + +///============================================================================= +///Look at the vector instruction type and dispatch to the createVec<...> +///function that creates the scalar copies. 
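+///All other vector instructions take the generic path at the end: the
+///instruction is cloned numCopiesNeeded times, each clone is switched to the
+///scalar opcode, and every vector register operand is replaced by the
+///matching scalar register from getScalarRegisters.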
+///============================================================================= +void VectorElementize::createCopies(MachineFunction& F, MachineInstr *Instr, + std::vector<MachineInstr *>& copies) { + if (ISVECLOAD(Instr)) { + createLoadCopy(F, Instr, copies); + return; + } + if (ISVECSTORE(Instr)) { + createStoreCopy(F, Instr, copies); + return; + } + if (ISVECSHUFFLE(Instr)) { + createVecShuffle(F, Instr, copies); + return; + } + if (ISVECEXTRACT(Instr)) { + createVecExtract(F, Instr, copies); + return; + } + if (ISVECINSERT(Instr)) { + createVecInsert(F, Instr, copies); + return; + } + if (ISVECDEST(Instr)) { + createVecDest(F, Instr, copies); + return; + } + if (ISVECBUILD(Instr)) { + createVecBuild(F, Instr, copies); + return; + } + + unsigned numcopies=numCopiesNeeded(Instr); + + for (unsigned i=0; i<numcopies; ++i) + copies.push_back(F.CloneMachineInstr(Instr)); + + for (unsigned i=0; i<numcopies; ++i) { + MachineInstr *copy = copies[i]; + + std::vector<MachineOperand> allOperands; + std::vector<bool> isDef; + + for (unsigned j=0, e=copy->getNumOperands(); j!=e; ++j) { + MachineOperand oper = copy->getOperand(j); + allOperands.push_back(oper); + if (oper.isReg()) + isDef.push_back(oper.isDef()); + else + isDef.push_back(false); + } + + for (unsigned j=0, e=copy->getNumOperands(); j!=e; ++j) + copy->RemoveOperand(0); + + copy->setDesc(InstrInfo->get(getScalarVersion(Instr))); + + for (unsigned j=0, e=allOperands.size(); j!=e; ++j) { + MachineOperand oper=allOperands[j]; + if (oper.isReg()) { + unsigned regnum = oper.getReg(); + if (isVectorRegister(regnum)) { + + SmallVector<unsigned, 4> scalarRegs = getScalarRegisters(regnum); + copy->addOperand(MachineOperand::CreateReg(scalarRegs[i], isDef[j])); + } + else + copy->addOperand(oper); + } + else + copy->addOperand(oper); + } + } +} + +///============================================================================= +///Scan through all basic blocks, looking for vector instructions. +///For each vector instruction I, insert the scalar copies before I, and +///add I into toRemove vector. Finally remove all instructions in toRemove. +///============================================================================= +void VectorElementize::elementize(MachineFunction &F) { + for (MachineFunction::reverse_iterator BI=F.rbegin(), BE=F.rend(); + BI!=BE; ++BI) { + MachineBasicBlock *BB = &*BI; + + std::vector<MachineInstr *> copies; + std::vector<MachineInstr *> toRemove; + + for (MachineBasicBlock::iterator II=BB->begin(), IE=BB->end(); + II!=IE; ++II) { + MachineInstr *Instr = &*II; + + if (!isVectorInstr(Instr)) + continue; + + copies.clear(); + createCopies(F, Instr, copies); + for (unsigned i=0, e=copies.size(); i!=e; ++i) + BB->insert(II, copies[i]); + + assert((copies.size() > 0) && "Problem in createCopies"); + toRemove.push_back(Instr); + } + for (unsigned i=0, e=toRemove.size(); i!=e; ++i) + F.DeleteMachineInstr(toRemove[i]->getParent()->remove(toRemove[i])); + } +} + +///============================================================================= +///a <= b +///... +///... +///x <= op(a, ...) +///gets converted to +/// +///x <= op(b, ...) +///The original move is still present. This works on SSA form machine code. +///Note that a <= b should be a simple vreg-to-vreg move instruction. +///TBD : I didn't find a function that can do replaceOperand, so I remove +///all operands and add all of them again, replacing the one while adding. 
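+///Once all uses have been rewritten, the original move becomes dead and is
+///cleaned up by removeDeadMoves, so copyProp plus removeDeadMoves together
+///act as copy propagation followed by dead-move elimination.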
+///=============================================================================
+unsigned VectorElementize::copyProp(MachineFunction &F) {
+  unsigned numReplacements = 0;
+
+  for (MachineFunction::reverse_iterator BI=F.rbegin(), BE=F.rend(); BI!=BE;
+      ++BI) {
+    MachineBasicBlock *BB = &*BI;
+
+    for (MachineBasicBlock::iterator II=BB->begin(), IE=BB->end(); II!=IE;
+        ++II) {
+      MachineInstr *Instr = &*II;
+
+      // Don't do copy propagation on PHI as it will cause unnecessary
+      // live range overlap.
+      if ((Instr->getOpcode() == TargetOpcode::PHI) ||
+          (Instr->getOpcode() == TargetOpcode::DBG_VALUE))
+        continue;
+
+      bool needsReplacement = false;
+
+      for (unsigned i=0, e=Instr->getNumOperands(); i!=e; ++i) {
+        MachineOperand oper = Instr->getOperand(i);
+        if (!oper.isReg()) continue;
+        if (oper.isDef()) continue;
+        if (!RegInfo->isVirtualRegister(oper.getReg())) continue;
+
+        MachineInstr *defInstr = MRI->getVRegDef(oper.getReg());
+
+        if (!defInstr) continue;
+
+        if (!isSimpleMove(defInstr)) continue;
+
+        MachineOperand defSrc = defInstr->getOperand(1);
+        if (!defSrc.isReg()) continue;
+        if (!RegInfo->isVirtualRegister(defSrc.getReg())) continue;
+
+        needsReplacement = true;
+
+      }
+      if (!needsReplacement) continue;
+
+      numReplacements++;
+
+      std::vector<MachineOperand> operands;
+
+      for (unsigned i=0, e=Instr->getNumOperands(); i!=e; ++i) {
+        MachineOperand oper = Instr->getOperand(i);
+        bool flag = false;
+        do {
+          if (!(oper.isReg()))
+            break;
+          if (oper.isDef())
+            break;
+          if (!(RegInfo->isVirtualRegister(oper.getReg())))
+            break;
+          MachineInstr *defInstr = MRI->getVRegDef(oper.getReg());
+          // Guard against uses without a def, as in the scan loop above.
+          if (!defInstr)
+            break;
+          if (!(isSimpleMove(defInstr)))
+            break;
+          MachineOperand defSrc = defInstr->getOperand(1);
+          if (!(defSrc.isReg()))
+            break;
+          if (!(RegInfo->isVirtualRegister(defSrc.getReg())))
+            break;
+          operands.push_back(defSrc);
+          flag = true;
+        } while (0);
+        if (flag == false)
+          operands.push_back(oper);
+      }
+
+      for (unsigned i=0, e=Instr->getNumOperands(); i!=e; ++i)
+        Instr->RemoveOperand(0);
+      for (unsigned i=0, e=operands.size(); i!=e; ++i)
+        Instr->addOperand(operands[i]);
+
+    }
+  }
+  return numReplacements;
+}
+
+///=============================================================================
+///Look for simple vreg-to-vreg moves whose destination register has no uses
+///(use_empty()), add them to the deadMoves vector, then remove all
+///instructions in deadMoves.
+///=============================================================================
+unsigned VectorElementize::removeDeadMoves(MachineFunction &F) {
+  std::vector<MachineInstr *> deadMoves;
+  for (MachineFunction::reverse_iterator BI=F.rbegin(), BE=F.rend(); BI!=BE;
+      ++BI) {
+    MachineBasicBlock *BB = &*BI;
+
+    for (MachineBasicBlock::iterator II=BB->begin(), IE=BB->end(); II!=IE;
+        ++II) {
+      MachineInstr *Instr = &*II;
+
+      if (!isSimpleMove(Instr)) continue;
+
+      MachineOperand dest = Instr->getOperand(0);
+      assert(dest.isReg() && "dest of move not a register");
+      assert(RegInfo->isVirtualRegister(dest.getReg()) &&
+             "dest of move not a virtual register");
+
+      if (MRI->use_empty(dest.getReg())) {
+        deadMoves.push_back(Instr);
+      }
+    }
+  }
+
+  for (unsigned i=0, e=deadMoves.size(); i!=e; ++i)
+    F.DeleteMachineInstr(deadMoves[i]->getParent()->remove(deadMoves[i]));
+
+  return deadMoves.size();
+}
+
+///=============================================================================
+///Main function for this pass.
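+///elementize() scalarizes all vector instructions. Then, if the default-on
+///-nvptx-remove-redundant-moves option is enabled, copyProp() and
+///removeDeadMoves() alternate until copyProp() finds nothing to rewrite.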
+///============================================================================= +bool VectorElementize::runOnMachineFunction(MachineFunction &F) { + MRI = &F.getRegInfo(); + + RegInfo = TM.getRegisterInfo(); + InstrInfo = TM.getInstrInfo(); + + VectorToScalarMap.clear(); + + elementize(F); + + if (RemoveRedundantMoves) + while (1) { + if (copyProp(F) == 0) break; + removeDeadMoves(F); + } + + return true; +} + +FunctionPass *llvm::createVectorElementizePass(NVPTXTargetMachine &tm) { + return new VectorElementize(tm); +} + +unsigned VectorElementize::getScalarVersion(unsigned opcode) { + if (opcode == NVPTX::PHI) + return opcode; + if (opcode == NVPTX::IMPLICIT_DEF) + return opcode; + switch(opcode) { + default: llvm_unreachable("Scalar version not set, fix NVPTXVector.td"); + case TargetOpcode::COPY: return TargetOpcode::COPY; + case NVPTX::AddCCCV2I32: return NVPTX::ADDCCCi32rr; + case NVPTX::AddCCCV4I32: return NVPTX::ADDCCCi32rr; + case NVPTX::AddCCV2I32: return NVPTX::ADDCCi32rr; + case NVPTX::AddCCV4I32: return NVPTX::ADDCCi32rr; + case NVPTX::Build_Vector2_f32: return NVPTX::FMOV32rr; + case NVPTX::Build_Vector2_f64: return NVPTX::FMOV64rr; + case NVPTX::Build_Vector2_i16: return NVPTX::IMOV16rr; + case NVPTX::Build_Vector2_i32: return NVPTX::IMOV32rr; + case NVPTX::Build_Vector2_i64: return NVPTX::IMOV64rr; + case NVPTX::Build_Vector2_i8: return NVPTX::IMOV8rr; + case NVPTX::Build_Vector4_f32: return NVPTX::FMOV32rr; + case NVPTX::Build_Vector4_i16: return NVPTX::IMOV16rr; + case NVPTX::Build_Vector4_i32: return NVPTX::IMOV32rr; + case NVPTX::Build_Vector4_i8: return NVPTX::IMOV8rr; + case NVPTX::CVTv2i16tov2i32: return NVPTX::Zint_extendext16to32; + case NVPTX::CVTv2i64tov2i32: return NVPTX::TRUNC_64to32; + case NVPTX::CVTv2i8tov2i32: return NVPTX::Zint_extendext8to32; + case NVPTX::CVTv4i16tov4i32: return NVPTX::Zint_extendext16to32; + case NVPTX::CVTv4i8tov4i32: return NVPTX::Zint_extendext8to32; + case NVPTX::F32MAD_ftzV2: return NVPTX::FMAD32_ftzrrr; + case NVPTX::F32MADV2: return NVPTX::FMAD32rrr; + case NVPTX::F32MAD_ftzV4: return NVPTX::FMAD32_ftzrrr; + case NVPTX::F32MADV4: return NVPTX::FMAD32rrr; + case NVPTX::F32FMA_ftzV2: return NVPTX::FMA32_ftzrrr; + case NVPTX::F32FMAV2: return NVPTX::FMA32rrr; + case NVPTX::F32FMA_ftzV4: return NVPTX::FMA32_ftzrrr; + case NVPTX::F32FMAV4: return NVPTX::FMA32rrr; + case NVPTX::F64FMAV2: return NVPTX::FMA64rrr; + case NVPTX::FVecEQV2F32: return NVPTX::FSetEQf32rr_toi32; + case NVPTX::FVecEQV2F64: return NVPTX::FSetEQf64rr_toi64; + case NVPTX::FVecEQV4F32: return NVPTX::FSetEQf32rr_toi32; + case NVPTX::FVecGEV2F32: return NVPTX::FSetGEf32rr_toi32; + case NVPTX::FVecGEV2F64: return NVPTX::FSetGEf64rr_toi64; + case NVPTX::FVecGEV4F32: return NVPTX::FSetGEf32rr_toi32; + case NVPTX::FVecGTV2F32: return NVPTX::FSetGTf32rr_toi32; + case NVPTX::FVecGTV2F64: return NVPTX::FSetGTf64rr_toi64; + case NVPTX::FVecGTV4F32: return NVPTX::FSetGTf32rr_toi32; + case NVPTX::FVecLEV2F32: return NVPTX::FSetLEf32rr_toi32; + case NVPTX::FVecLEV2F64: return NVPTX::FSetLEf64rr_toi64; + case NVPTX::FVecLEV4F32: return NVPTX::FSetLEf32rr_toi32; + case NVPTX::FVecLTV2F32: return NVPTX::FSetLTf32rr_toi32; + case NVPTX::FVecLTV2F64: return NVPTX::FSetLTf64rr_toi64; + case NVPTX::FVecLTV4F32: return NVPTX::FSetLTf32rr_toi32; + case NVPTX::FVecNANV2F32: return NVPTX::FSetNANf32rr_toi32; + case NVPTX::FVecNANV2F64: return NVPTX::FSetNANf64rr_toi64; + case NVPTX::FVecNANV4F32: return NVPTX::FSetNANf32rr_toi32; + case NVPTX::FVecNEV2F32: return 
NVPTX::FSetNEf32rr_toi32; + case NVPTX::FVecNEV2F64: return NVPTX::FSetNEf64rr_toi64; + case NVPTX::FVecNEV4F32: return NVPTX::FSetNEf32rr_toi32; + case NVPTX::FVecNUMV2F32: return NVPTX::FSetNUMf32rr_toi32; + case NVPTX::FVecNUMV2F64: return NVPTX::FSetNUMf64rr_toi64; + case NVPTX::FVecNUMV4F32: return NVPTX::FSetNUMf32rr_toi32; + case NVPTX::FVecUEQV2F32: return NVPTX::FSetUEQf32rr_toi32; + case NVPTX::FVecUEQV2F64: return NVPTX::FSetUEQf64rr_toi64; + case NVPTX::FVecUEQV4F32: return NVPTX::FSetUEQf32rr_toi32; + case NVPTX::FVecUGEV2F32: return NVPTX::FSetUGEf32rr_toi32; + case NVPTX::FVecUGEV2F64: return NVPTX::FSetUGEf64rr_toi64; + case NVPTX::FVecUGEV4F32: return NVPTX::FSetUGEf32rr_toi32; + case NVPTX::FVecUGTV2F32: return NVPTX::FSetUGTf32rr_toi32; + case NVPTX::FVecUGTV2F64: return NVPTX::FSetUGTf64rr_toi64; + case NVPTX::FVecUGTV4F32: return NVPTX::FSetUGTf32rr_toi32; + case NVPTX::FVecULEV2F32: return NVPTX::FSetULEf32rr_toi32; + case NVPTX::FVecULEV2F64: return NVPTX::FSetULEf64rr_toi64; + case NVPTX::FVecULEV4F32: return NVPTX::FSetULEf32rr_toi32; + case NVPTX::FVecULTV2F32: return NVPTX::FSetULTf32rr_toi32; + case NVPTX::FVecULTV2F64: return NVPTX::FSetULTf64rr_toi64; + case NVPTX::FVecULTV4F32: return NVPTX::FSetULTf32rr_toi32; + case NVPTX::FVecUNEV2F32: return NVPTX::FSetUNEf32rr_toi32; + case NVPTX::FVecUNEV2F64: return NVPTX::FSetUNEf64rr_toi64; + case NVPTX::FVecUNEV4F32: return NVPTX::FSetUNEf32rr_toi32; + case NVPTX::I16MADV2: return NVPTX::MAD16rrr; + case NVPTX::I16MADV4: return NVPTX::MAD16rrr; + case NVPTX::I32MADV2: return NVPTX::MAD32rrr; + case NVPTX::I32MADV4: return NVPTX::MAD32rrr; + case NVPTX::I64MADV2: return NVPTX::MAD64rrr; + case NVPTX::I8MADV2: return NVPTX::MAD8rrr; + case NVPTX::I8MADV4: return NVPTX::MAD8rrr; + case NVPTX::ShiftLV2I16: return NVPTX::SHLi16rr; + case NVPTX::ShiftLV2I32: return NVPTX::SHLi32rr; + case NVPTX::ShiftLV2I64: return NVPTX::SHLi64rr; + case NVPTX::ShiftLV2I8: return NVPTX::SHLi8rr; + case NVPTX::ShiftLV4I16: return NVPTX::SHLi16rr; + case NVPTX::ShiftLV4I32: return NVPTX::SHLi32rr; + case NVPTX::ShiftLV4I8: return NVPTX::SHLi8rr; + case NVPTX::ShiftRAV2I16: return NVPTX::SRAi16rr; + case NVPTX::ShiftRAV2I32: return NVPTX::SRAi32rr; + case NVPTX::ShiftRAV2I64: return NVPTX::SRAi64rr; + case NVPTX::ShiftRAV2I8: return NVPTX::SRAi8rr; + case NVPTX::ShiftRAV4I16: return NVPTX::SRAi16rr; + case NVPTX::ShiftRAV4I32: return NVPTX::SRAi32rr; + case NVPTX::ShiftRAV4I8: return NVPTX::SRAi8rr; + case NVPTX::ShiftRLV2I16: return NVPTX::SRLi16rr; + case NVPTX::ShiftRLV2I32: return NVPTX::SRLi32rr; + case NVPTX::ShiftRLV2I64: return NVPTX::SRLi64rr; + case NVPTX::ShiftRLV2I8: return NVPTX::SRLi8rr; + case NVPTX::ShiftRLV4I16: return NVPTX::SRLi16rr; + case NVPTX::ShiftRLV4I32: return NVPTX::SRLi32rr; + case NVPTX::ShiftRLV4I8: return NVPTX::SRLi8rr; + case NVPTX::SubCCCV2I32: return NVPTX::SUBCCCi32rr; + case NVPTX::SubCCCV4I32: return NVPTX::SUBCCCi32rr; + case NVPTX::SubCCV2I32: return NVPTX::SUBCCi32rr; + case NVPTX::SubCCV4I32: return NVPTX::SUBCCi32rr; + case NVPTX::V2F32Div_prec_ftz: return NVPTX::FDIV32rr_prec_ftz; + case NVPTX::V2F32Div_prec: return NVPTX::FDIV32rr_prec; + case NVPTX::V2F32Div_ftz: return NVPTX::FDIV32rr_ftz; + case NVPTX::V2F32Div: return NVPTX::FDIV32rr; + case NVPTX::V2F32_Select: return NVPTX::SELECTf32rr; + case NVPTX::V2F64Div: return NVPTX::FDIV64rr; + case NVPTX::V2F64_Select: return NVPTX::SELECTf64rr; + case NVPTX::V2I16_Select: return NVPTX::SELECTi16rr; + case NVPTX::V2I32_Select: return 
NVPTX::SELECTi32rr; + case NVPTX::V2I64_Select: return NVPTX::SELECTi64rr; + case NVPTX::V2I8_Select: return NVPTX::SELECTi8rr; + case NVPTX::V2f32Extract: return NVPTX::FMOV32rr; + case NVPTX::V2f32Insert: return NVPTX::FMOV32rr; + case NVPTX::V2f32Mov: return NVPTX::FMOV32rr; + case NVPTX::V2f64Extract: return NVPTX::FMOV64rr; + case NVPTX::V2f64Insert: return NVPTX::FMOV64rr; + case NVPTX::V2f64Mov: return NVPTX::FMOV64rr; + case NVPTX::V2i16Extract: return NVPTX::IMOV16rr; + case NVPTX::V2i16Insert: return NVPTX::IMOV16rr; + case NVPTX::V2i16Mov: return NVPTX::IMOV16rr; + case NVPTX::V2i32Extract: return NVPTX::IMOV32rr; + case NVPTX::V2i32Insert: return NVPTX::IMOV32rr; + case NVPTX::V2i32Mov: return NVPTX::IMOV32rr; + case NVPTX::V2i64Extract: return NVPTX::IMOV64rr; + case NVPTX::V2i64Insert: return NVPTX::IMOV64rr; + case NVPTX::V2i64Mov: return NVPTX::IMOV64rr; + case NVPTX::V2i8Extract: return NVPTX::IMOV8rr; + case NVPTX::V2i8Insert: return NVPTX::IMOV8rr; + case NVPTX::V2i8Mov: return NVPTX::IMOV8rr; + case NVPTX::V4F32Div_prec_ftz: return NVPTX::FDIV32rr_prec_ftz; + case NVPTX::V4F32Div_prec: return NVPTX::FDIV32rr_prec; + case NVPTX::V4F32Div_ftz: return NVPTX::FDIV32rr_ftz; + case NVPTX::V4F32Div: return NVPTX::FDIV32rr; + case NVPTX::V4F32_Select: return NVPTX::SELECTf32rr; + case NVPTX::V4I16_Select: return NVPTX::SELECTi16rr; + case NVPTX::V4I32_Select: return NVPTX::SELECTi32rr; + case NVPTX::V4I8_Select: return NVPTX::SELECTi8rr; + case NVPTX::V4f32Extract: return NVPTX::FMOV32rr; + case NVPTX::V4f32Insert: return NVPTX::FMOV32rr; + case NVPTX::V4f32Mov: return NVPTX::FMOV32rr; + case NVPTX::V4i16Extract: return NVPTX::IMOV16rr; + case NVPTX::V4i16Insert: return NVPTX::IMOV16rr; + case NVPTX::V4i16Mov: return NVPTX::IMOV16rr; + case NVPTX::V4i32Extract: return NVPTX::IMOV32rr; + case NVPTX::V4i32Insert: return NVPTX::IMOV32rr; + case NVPTX::V4i32Mov: return NVPTX::IMOV32rr; + case NVPTX::V4i8Extract: return NVPTX::IMOV8rr; + case NVPTX::V4i8Insert: return NVPTX::IMOV8rr; + case NVPTX::V4i8Mov: return NVPTX::IMOV8rr; + case NVPTX::VAddV2I16: return NVPTX::ADDi16rr; + case NVPTX::VAddV2I32: return NVPTX::ADDi32rr; + case NVPTX::VAddV2I64: return NVPTX::ADDi64rr; + case NVPTX::VAddV2I8: return NVPTX::ADDi8rr; + case NVPTX::VAddV4I16: return NVPTX::ADDi16rr; + case NVPTX::VAddV4I32: return NVPTX::ADDi32rr; + case NVPTX::VAddV4I8: return NVPTX::ADDi8rr; + case NVPTX::VAddfV2F32: return NVPTX::FADDf32rr; + case NVPTX::VAddfV2F32_ftz: return NVPTX::FADDf32rr_ftz; + case NVPTX::VAddfV2F64: return NVPTX::FADDf64rr; + case NVPTX::VAddfV4F32: return NVPTX::FADDf32rr; + case NVPTX::VAddfV4F32_ftz: return NVPTX::FADDf32rr_ftz; + case NVPTX::VAndV2I16: return NVPTX::ANDb16rr; + case NVPTX::VAndV2I32: return NVPTX::ANDb32rr; + case NVPTX::VAndV2I64: return NVPTX::ANDb64rr; + case NVPTX::VAndV2I8: return NVPTX::ANDb8rr; + case NVPTX::VAndV4I16: return NVPTX::ANDb16rr; + case NVPTX::VAndV4I32: return NVPTX::ANDb32rr; + case NVPTX::VAndV4I8: return NVPTX::ANDb8rr; + case NVPTX::VMulfV2F32_ftz: return NVPTX::FMULf32rr_ftz; + case NVPTX::VMulfV2F32: return NVPTX::FMULf32rr; + case NVPTX::VMulfV2F64: return NVPTX::FMULf64rr; + case NVPTX::VMulfV4F32_ftz: return NVPTX::FMULf32rr_ftz; + case NVPTX::VMulfV4F32: return NVPTX::FMULf32rr; + case NVPTX::VMultHSV2I16: return NVPTX::MULTHSi16rr; + case NVPTX::VMultHSV2I32: return NVPTX::MULTHSi32rr; + case NVPTX::VMultHSV2I64: return NVPTX::MULTHSi64rr; + case NVPTX::VMultHSV2I8: return NVPTX::MULTHSi8rr; + case NVPTX::VMultHSV4I16: return 
NVPTX::MULTHSi16rr; + case NVPTX::VMultHSV4I32: return NVPTX::MULTHSi32rr; + case NVPTX::VMultHSV4I8: return NVPTX::MULTHSi8rr; + case NVPTX::VMultHUV2I16: return NVPTX::MULTHUi16rr; + case NVPTX::VMultHUV2I32: return NVPTX::MULTHUi32rr; + case NVPTX::VMultHUV2I64: return NVPTX::MULTHUi64rr; + case NVPTX::VMultHUV2I8: return NVPTX::MULTHUi8rr; + case NVPTX::VMultHUV4I16: return NVPTX::MULTHUi16rr; + case NVPTX::VMultHUV4I32: return NVPTX::MULTHUi32rr; + case NVPTX::VMultHUV4I8: return NVPTX::MULTHUi8rr; + case NVPTX::VMultV2I16: return NVPTX::MULTi16rr; + case NVPTX::VMultV2I32: return NVPTX::MULTi32rr; + case NVPTX::VMultV2I64: return NVPTX::MULTi64rr; + case NVPTX::VMultV2I8: return NVPTX::MULTi8rr; + case NVPTX::VMultV4I16: return NVPTX::MULTi16rr; + case NVPTX::VMultV4I32: return NVPTX::MULTi32rr; + case NVPTX::VMultV4I8: return NVPTX::MULTi8rr; + case NVPTX::VNegV2I16: return NVPTX::INEG16; + case NVPTX::VNegV2I32: return NVPTX::INEG32; + case NVPTX::VNegV2I64: return NVPTX::INEG64; + case NVPTX::VNegV2I8: return NVPTX::INEG8; + case NVPTX::VNegV4I16: return NVPTX::INEG16; + case NVPTX::VNegV4I32: return NVPTX::INEG32; + case NVPTX::VNegV4I8: return NVPTX::INEG8; + case NVPTX::VNegv2f32: return NVPTX::FNEGf32; + case NVPTX::VNegv2f32_ftz: return NVPTX::FNEGf32_ftz; + case NVPTX::VNegv2f64: return NVPTX::FNEGf64; + case NVPTX::VNegv4f32: return NVPTX::FNEGf32; + case NVPTX::VNegv4f32_ftz: return NVPTX::FNEGf32_ftz; + case NVPTX::VNotV2I16: return NVPTX::NOT16; + case NVPTX::VNotV2I32: return NVPTX::NOT32; + case NVPTX::VNotV2I64: return NVPTX::NOT64; + case NVPTX::VNotV2I8: return NVPTX::NOT8; + case NVPTX::VNotV4I16: return NVPTX::NOT16; + case NVPTX::VNotV4I32: return NVPTX::NOT32; + case NVPTX::VNotV4I8: return NVPTX::NOT8; + case NVPTX::VOrV2I16: return NVPTX::ORb16rr; + case NVPTX::VOrV2I32: return NVPTX::ORb32rr; + case NVPTX::VOrV2I64: return NVPTX::ORb64rr; + case NVPTX::VOrV2I8: return NVPTX::ORb8rr; + case NVPTX::VOrV4I16: return NVPTX::ORb16rr; + case NVPTX::VOrV4I32: return NVPTX::ORb32rr; + case NVPTX::VOrV4I8: return NVPTX::ORb8rr; + case NVPTX::VSDivV2I16: return NVPTX::SDIVi16rr; + case NVPTX::VSDivV2I32: return NVPTX::SDIVi32rr; + case NVPTX::VSDivV2I64: return NVPTX::SDIVi64rr; + case NVPTX::VSDivV2I8: return NVPTX::SDIVi8rr; + case NVPTX::VSDivV4I16: return NVPTX::SDIVi16rr; + case NVPTX::VSDivV4I32: return NVPTX::SDIVi32rr; + case NVPTX::VSDivV4I8: return NVPTX::SDIVi8rr; + case NVPTX::VSRemV2I16: return NVPTX::SREMi16rr; + case NVPTX::VSRemV2I32: return NVPTX::SREMi32rr; + case NVPTX::VSRemV2I64: return NVPTX::SREMi64rr; + case NVPTX::VSRemV2I8: return NVPTX::SREMi8rr; + case NVPTX::VSRemV4I16: return NVPTX::SREMi16rr; + case NVPTX::VSRemV4I32: return NVPTX::SREMi32rr; + case NVPTX::VSRemV4I8: return NVPTX::SREMi8rr; + case NVPTX::VSubV2I16: return NVPTX::SUBi16rr; + case NVPTX::VSubV2I32: return NVPTX::SUBi32rr; + case NVPTX::VSubV2I64: return NVPTX::SUBi64rr; + case NVPTX::VSubV2I8: return NVPTX::SUBi8rr; + case NVPTX::VSubV4I16: return NVPTX::SUBi16rr; + case NVPTX::VSubV4I32: return NVPTX::SUBi32rr; + case NVPTX::VSubV4I8: return NVPTX::SUBi8rr; + case NVPTX::VSubfV2F32_ftz: return NVPTX::FSUBf32rr_ftz; + case NVPTX::VSubfV2F32: return NVPTX::FSUBf32rr; + case NVPTX::VSubfV2F64: return NVPTX::FSUBf64rr; + case NVPTX::VSubfV4F32_ftz: return NVPTX::FSUBf32rr_ftz; + case NVPTX::VSubfV4F32: return NVPTX::FSUBf32rr; + case NVPTX::VUDivV2I16: return NVPTX::UDIVi16rr; + case NVPTX::VUDivV2I32: return NVPTX::UDIVi32rr; + case NVPTX::VUDivV2I64: return NVPTX::UDIVi64rr; 
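The switch above, which continues below, is a pure opcode-to-opcode table: every vector pseudo-instruction names the scalar opcode used for each extracted element, so the v2 and v4 variants of one operation collapse onto the same scalar instruction. Below is a minimal, self-contained sketch of the same table shape; the enum values are hypothetical stand-ins, since the real NVPTX opcode numbers are assigned by TableGen. The mapping table itself resumes immediately after the sketch.

#include <cassert>

// Hypothetical opcode numbers; the real NVPTX opcodes are TableGen-generated
// and are not stable constants.
enum Opcode { VAddV2I32, VAddV4I32, ADDi32rr, Unknown };

// Same structure as getScalarVersion(): both vector widths of an operation
// map to one scalar opcode, which the pass then emits once per element.
static unsigned scalarVersion(unsigned Op) {
  switch (Op) {
  case VAddV2I32:
  case VAddV4I32: return ADDi32rr;
  default:        return Unknown;
  }
}

int main() {
  // v2 and v4 integer adds both scalarize to the same 32-bit add.
  assert(scalarVersion(VAddV2I32) == ADDi32rr);
  assert(scalarVersion(VAddV4I32) == ADDi32rr);
  return 0;
}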
+ case NVPTX::VUDivV2I8: return NVPTX::UDIVi8rr; + case NVPTX::VUDivV4I16: return NVPTX::UDIVi16rr; + case NVPTX::VUDivV4I32: return NVPTX::UDIVi32rr; + case NVPTX::VUDivV4I8: return NVPTX::UDIVi8rr; + case NVPTX::VURemV2I16: return NVPTX::UREMi16rr; + case NVPTX::VURemV2I32: return NVPTX::UREMi32rr; + case NVPTX::VURemV2I64: return NVPTX::UREMi64rr; + case NVPTX::VURemV2I8: return NVPTX::UREMi8rr; + case NVPTX::VURemV4I16: return NVPTX::UREMi16rr; + case NVPTX::VURemV4I32: return NVPTX::UREMi32rr; + case NVPTX::VURemV4I8: return NVPTX::UREMi8rr; + case NVPTX::VXorV2I16: return NVPTX::XORb16rr; + case NVPTX::VXorV2I32: return NVPTX::XORb32rr; + case NVPTX::VXorV2I64: return NVPTX::XORb64rr; + case NVPTX::VXorV2I8: return NVPTX::XORb8rr; + case NVPTX::VXorV4I16: return NVPTX::XORb16rr; + case NVPTX::VXorV4I32: return NVPTX::XORb32rr; + case NVPTX::VXorV4I8: return NVPTX::XORb8rr; + case NVPTX::VecSEQV2I16: return NVPTX::ISetSEQi16rr_toi16; + case NVPTX::VecSEQV2I32: return NVPTX::ISetSEQi32rr_toi32; + case NVPTX::VecSEQV2I64: return NVPTX::ISetSEQi64rr_toi64; + case NVPTX::VecSEQV2I8: return NVPTX::ISetSEQi8rr_toi8; + case NVPTX::VecSEQV4I16: return NVPTX::ISetSEQi16rr_toi16; + case NVPTX::VecSEQV4I32: return NVPTX::ISetSEQi32rr_toi32; + case NVPTX::VecSEQV4I8: return NVPTX::ISetSEQi8rr_toi8; + case NVPTX::VecSGEV2I16: return NVPTX::ISetSGEi16rr_toi16; + case NVPTX::VecSGEV2I32: return NVPTX::ISetSGEi32rr_toi32; + case NVPTX::VecSGEV2I64: return NVPTX::ISetSGEi64rr_toi64; + case NVPTX::VecSGEV2I8: return NVPTX::ISetSGEi8rr_toi8; + case NVPTX::VecSGEV4I16: return NVPTX::ISetSGEi16rr_toi16; + case NVPTX::VecSGEV4I32: return NVPTX::ISetSGEi32rr_toi32; + case NVPTX::VecSGEV4I8: return NVPTX::ISetSGEi8rr_toi8; + case NVPTX::VecSGTV2I16: return NVPTX::ISetSGTi16rr_toi16; + case NVPTX::VecSGTV2I32: return NVPTX::ISetSGTi32rr_toi32; + case NVPTX::VecSGTV2I64: return NVPTX::ISetSGTi64rr_toi64; + case NVPTX::VecSGTV2I8: return NVPTX::ISetSGTi8rr_toi8; + case NVPTX::VecSGTV4I16: return NVPTX::ISetSGTi16rr_toi16; + case NVPTX::VecSGTV4I32: return NVPTX::ISetSGTi32rr_toi32; + case NVPTX::VecSGTV4I8: return NVPTX::ISetSGTi8rr_toi8; + case NVPTX::VecSLEV2I16: return NVPTX::ISetSLEi16rr_toi16; + case NVPTX::VecSLEV2I32: return NVPTX::ISetSLEi32rr_toi32; + case NVPTX::VecSLEV2I64: return NVPTX::ISetSLEi64rr_toi64; + case NVPTX::VecSLEV2I8: return NVPTX::ISetSLEi8rr_toi8; + case NVPTX::VecSLEV4I16: return NVPTX::ISetSLEi16rr_toi16; + case NVPTX::VecSLEV4I32: return NVPTX::ISetSLEi32rr_toi32; + case NVPTX::VecSLEV4I8: return NVPTX::ISetSLEi8rr_toi8; + case NVPTX::VecSLTV2I16: return NVPTX::ISetSLTi16rr_toi16; + case NVPTX::VecSLTV2I32: return NVPTX::ISetSLTi32rr_toi32; + case NVPTX::VecSLTV2I64: return NVPTX::ISetSLTi64rr_toi64; + case NVPTX::VecSLTV2I8: return NVPTX::ISetSLTi8rr_toi8; + case NVPTX::VecSLTV4I16: return NVPTX::ISetSLTi16rr_toi16; + case NVPTX::VecSLTV4I32: return NVPTX::ISetSLTi32rr_toi32; + case NVPTX::VecSLTV4I8: return NVPTX::ISetSLTi8rr_toi8; + case NVPTX::VecSNEV2I16: return NVPTX::ISetSNEi16rr_toi16; + case NVPTX::VecSNEV2I32: return NVPTX::ISetSNEi32rr_toi32; + case NVPTX::VecSNEV2I64: return NVPTX::ISetSNEi64rr_toi64; + case NVPTX::VecSNEV2I8: return NVPTX::ISetSNEi8rr_toi8; + case NVPTX::VecSNEV4I16: return NVPTX::ISetSNEi16rr_toi16; + case NVPTX::VecSNEV4I32: return NVPTX::ISetSNEi32rr_toi32; + case NVPTX::VecSNEV4I8: return NVPTX::ISetSNEi8rr_toi8; + case NVPTX::VecShuffle_v2f32: return NVPTX::FMOV32rr; + case NVPTX::VecShuffle_v2f64: return NVPTX::FMOV64rr; + case 
NVPTX::VecShuffle_v2i16: return NVPTX::IMOV16rr; + case NVPTX::VecShuffle_v2i32: return NVPTX::IMOV32rr; + case NVPTX::VecShuffle_v2i64: return NVPTX::IMOV64rr; + case NVPTX::VecShuffle_v2i8: return NVPTX::IMOV8rr; + case NVPTX::VecShuffle_v4f32: return NVPTX::FMOV32rr; + case NVPTX::VecShuffle_v4i16: return NVPTX::IMOV16rr; + case NVPTX::VecShuffle_v4i32: return NVPTX::IMOV32rr; + case NVPTX::VecShuffle_v4i8: return NVPTX::IMOV8rr; + case NVPTX::VecUEQV2I16: return NVPTX::ISetUEQi16rr_toi16; + case NVPTX::VecUEQV2I32: return NVPTX::ISetUEQi32rr_toi32; + case NVPTX::VecUEQV2I64: return NVPTX::ISetUEQi64rr_toi64; + case NVPTX::VecUEQV2I8: return NVPTX::ISetUEQi8rr_toi8; + case NVPTX::VecUEQV4I16: return NVPTX::ISetUEQi16rr_toi16; + case NVPTX::VecUEQV4I32: return NVPTX::ISetUEQi32rr_toi32; + case NVPTX::VecUEQV4I8: return NVPTX::ISetUEQi8rr_toi8; + case NVPTX::VecUGEV2I16: return NVPTX::ISetUGEi16rr_toi16; + case NVPTX::VecUGEV2I32: return NVPTX::ISetUGEi32rr_toi32; + case NVPTX::VecUGEV2I64: return NVPTX::ISetUGEi64rr_toi64; + case NVPTX::VecUGEV2I8: return NVPTX::ISetUGEi8rr_toi8; + case NVPTX::VecUGEV4I16: return NVPTX::ISetUGEi16rr_toi16; + case NVPTX::VecUGEV4I32: return NVPTX::ISetUGEi32rr_toi32; + case NVPTX::VecUGEV4I8: return NVPTX::ISetUGEi8rr_toi8; + case NVPTX::VecUGTV2I16: return NVPTX::ISetUGTi16rr_toi16; + case NVPTX::VecUGTV2I32: return NVPTX::ISetUGTi32rr_toi32; + case NVPTX::VecUGTV2I64: return NVPTX::ISetUGTi64rr_toi64; + case NVPTX::VecUGTV2I8: return NVPTX::ISetUGTi8rr_toi8; + case NVPTX::VecUGTV4I16: return NVPTX::ISetUGTi16rr_toi16; + case NVPTX::VecUGTV4I32: return NVPTX::ISetUGTi32rr_toi32; + case NVPTX::VecUGTV4I8: return NVPTX::ISetUGTi8rr_toi8; + case NVPTX::VecULEV2I16: return NVPTX::ISetULEi16rr_toi16; + case NVPTX::VecULEV2I32: return NVPTX::ISetULEi32rr_toi32; + case NVPTX::VecULEV2I64: return NVPTX::ISetULEi64rr_toi64; + case NVPTX::VecULEV2I8: return NVPTX::ISetULEi8rr_toi8; + case NVPTX::VecULEV4I16: return NVPTX::ISetULEi16rr_toi16; + case NVPTX::VecULEV4I32: return NVPTX::ISetULEi32rr_toi32; + case NVPTX::VecULEV4I8: return NVPTX::ISetULEi8rr_toi8; + case NVPTX::VecULTV2I16: return NVPTX::ISetULTi16rr_toi16; + case NVPTX::VecULTV2I32: return NVPTX::ISetULTi32rr_toi32; + case NVPTX::VecULTV2I64: return NVPTX::ISetULTi64rr_toi64; + case NVPTX::VecULTV2I8: return NVPTX::ISetULTi8rr_toi8; + case NVPTX::VecULTV4I16: return NVPTX::ISetULTi16rr_toi16; + case NVPTX::VecULTV4I32: return NVPTX::ISetULTi32rr_toi32; + case NVPTX::VecULTV4I8: return NVPTX::ISetULTi8rr_toi8; + case NVPTX::VecUNEV2I16: return NVPTX::ISetUNEi16rr_toi16; + case NVPTX::VecUNEV2I32: return NVPTX::ISetUNEi32rr_toi32; + case NVPTX::VecUNEV2I64: return NVPTX::ISetUNEi64rr_toi64; + case NVPTX::VecUNEV2I8: return NVPTX::ISetUNEi8rr_toi8; + case NVPTX::VecUNEV4I16: return NVPTX::ISetUNEi16rr_toi16; + case NVPTX::VecUNEV4I32: return NVPTX::ISetUNEi32rr_toi32; + case NVPTX::VecUNEV4I8: return NVPTX::ISetUNEi8rr_toi8; + case NVPTX::INT_PTX_LDU_G_v2i8_32: return NVPTX::INT_PTX_LDU_G_v2i8_ELE_32; + case NVPTX::INT_PTX_LDU_G_v4i8_32: return NVPTX::INT_PTX_LDU_G_v4i8_ELE_32; + case NVPTX::INT_PTX_LDU_G_v2i16_32: return NVPTX::INT_PTX_LDU_G_v2i16_ELE_32; + case NVPTX::INT_PTX_LDU_G_v4i16_32: return NVPTX::INT_PTX_LDU_G_v4i16_ELE_32; + case NVPTX::INT_PTX_LDU_G_v2i32_32: return NVPTX::INT_PTX_LDU_G_v2i32_ELE_32; + case NVPTX::INT_PTX_LDU_G_v4i32_32: return NVPTX::INT_PTX_LDU_G_v4i32_ELE_32; + case NVPTX::INT_PTX_LDU_G_v2f32_32: return NVPTX::INT_PTX_LDU_G_v2f32_ELE_32; + case 
NVPTX::INT_PTX_LDU_G_v4f32_32: return NVPTX::INT_PTX_LDU_G_v4f32_ELE_32; + case NVPTX::INT_PTX_LDU_G_v2i64_32: return NVPTX::INT_PTX_LDU_G_v2i64_ELE_32; + case NVPTX::INT_PTX_LDU_G_v2f64_32: return NVPTX::INT_PTX_LDU_G_v2f64_ELE_32; + case NVPTX::INT_PTX_LDU_G_v2i8_64: return NVPTX::INT_PTX_LDU_G_v2i8_ELE_64; + case NVPTX::INT_PTX_LDU_G_v4i8_64: return NVPTX::INT_PTX_LDU_G_v4i8_ELE_64; + case NVPTX::INT_PTX_LDU_G_v2i16_64: return NVPTX::INT_PTX_LDU_G_v2i16_ELE_64; + case NVPTX::INT_PTX_LDU_G_v4i16_64: return NVPTX::INT_PTX_LDU_G_v4i16_ELE_64; + case NVPTX::INT_PTX_LDU_G_v2i32_64: return NVPTX::INT_PTX_LDU_G_v2i32_ELE_64; + case NVPTX::INT_PTX_LDU_G_v4i32_64: return NVPTX::INT_PTX_LDU_G_v4i32_ELE_64; + case NVPTX::INT_PTX_LDU_G_v2f32_64: return NVPTX::INT_PTX_LDU_G_v2f32_ELE_64; + case NVPTX::INT_PTX_LDU_G_v4f32_64: return NVPTX::INT_PTX_LDU_G_v4f32_ELE_64; + case NVPTX::INT_PTX_LDU_G_v2i64_64: return NVPTX::INT_PTX_LDU_G_v2i64_ELE_64; + case NVPTX::INT_PTX_LDU_G_v2f64_64: return NVPTX::INT_PTX_LDU_G_v2f64_ELE_64; + + case NVPTX::LoadParamV4I32: return NVPTX::LoadParamScalar4I32; + case NVPTX::LoadParamV4I16: return NVPTX::LoadParamScalar4I16; + case NVPTX::LoadParamV4I8: return NVPTX::LoadParamScalar4I8; + case NVPTX::LoadParamV2I64: return NVPTX::LoadParamScalar2I64; + case NVPTX::LoadParamV2I32: return NVPTX::LoadParamScalar2I32; + case NVPTX::LoadParamV2I16: return NVPTX::LoadParamScalar2I16; + case NVPTX::LoadParamV2I8: return NVPTX::LoadParamScalar2I8; + case NVPTX::LoadParamV4F32: return NVPTX::LoadParamScalar4F32; + case NVPTX::LoadParamV2F32: return NVPTX::LoadParamScalar2F32; + case NVPTX::LoadParamV2F64: return NVPTX::LoadParamScalar2F64; + case NVPTX::StoreParamV4I32: return NVPTX::StoreParamScalar4I32; + case NVPTX::StoreParamV4I16: return NVPTX::StoreParamScalar4I16; + case NVPTX::StoreParamV4I8: return NVPTX::StoreParamScalar4I8; + case NVPTX::StoreParamV2I64: return NVPTX::StoreParamScalar2I64; + case NVPTX::StoreParamV2I32: return NVPTX::StoreParamScalar2I32; + case NVPTX::StoreParamV2I16: return NVPTX::StoreParamScalar2I16; + case NVPTX::StoreParamV2I8: return NVPTX::StoreParamScalar2I8; + case NVPTX::StoreParamV4F32: return NVPTX::StoreParamScalar4F32; + case NVPTX::StoreParamV2F32: return NVPTX::StoreParamScalar2F32; + case NVPTX::StoreParamV2F64: return NVPTX::StoreParamScalar2F64; + case NVPTX::StoreRetvalV4I32: return NVPTX::StoreRetvalScalar4I32; + case NVPTX::StoreRetvalV4I16: return NVPTX::StoreRetvalScalar4I16; + case NVPTX::StoreRetvalV4I8: return NVPTX::StoreRetvalScalar4I8; + case NVPTX::StoreRetvalV2I64: return NVPTX::StoreRetvalScalar2I64; + case NVPTX::StoreRetvalV2I32: return NVPTX::StoreRetvalScalar2I32; + case NVPTX::StoreRetvalV2I16: return NVPTX::StoreRetvalScalar2I16; + case NVPTX::StoreRetvalV2I8: return NVPTX::StoreRetvalScalar2I8; + case NVPTX::StoreRetvalV4F32: return NVPTX::StoreRetvalScalar4F32; + case NVPTX::StoreRetvalV2F32: return NVPTX::StoreRetvalScalar2F32; + case NVPTX::StoreRetvalV2F64: return NVPTX::StoreRetvalScalar2F64; + case NVPTX::VecI32toV4I8: return NVPTX::I32toV4I8; + case NVPTX::VecI64toV4I16: return NVPTX::I64toV4I16; + case NVPTX::VecI16toV2I8: return NVPTX::I16toV2I8; + case NVPTX::VecI32toV2I16: return NVPTX::I32toV2I16; + case NVPTX::VecI64toV2I32: return NVPTX::I64toV2I32; + case NVPTX::VecF64toV2F32: return NVPTX::F64toV2F32; + + case NVPTX::LD_v2i8_avar: return NVPTX::LDV_i8_v2_avar; + case NVPTX::LD_v2i8_areg: return NVPTX::LDV_i8_v2_areg; + case NVPTX::LD_v2i8_ari: return NVPTX::LDV_i8_v2_ari; + case 
NVPTX::LD_v2i8_asi: return NVPTX::LDV_i8_v2_asi; + case NVPTX::LD_v4i8_avar: return NVPTX::LDV_i8_v4_avar; + case NVPTX::LD_v4i8_areg: return NVPTX::LDV_i8_v4_areg; + case NVPTX::LD_v4i8_ari: return NVPTX::LDV_i8_v4_ari; + case NVPTX::LD_v4i8_asi: return NVPTX::LDV_i8_v4_asi; + + case NVPTX::LD_v2i16_avar: return NVPTX::LDV_i16_v2_avar; + case NVPTX::LD_v2i16_areg: return NVPTX::LDV_i16_v2_areg; + case NVPTX::LD_v2i16_ari: return NVPTX::LDV_i16_v2_ari; + case NVPTX::LD_v2i16_asi: return NVPTX::LDV_i16_v2_asi; + case NVPTX::LD_v4i16_avar: return NVPTX::LDV_i16_v4_avar; + case NVPTX::LD_v4i16_areg: return NVPTX::LDV_i16_v4_areg; + case NVPTX::LD_v4i16_ari: return NVPTX::LDV_i16_v4_ari; + case NVPTX::LD_v4i16_asi: return NVPTX::LDV_i16_v4_asi; + + case NVPTX::LD_v2i32_avar: return NVPTX::LDV_i32_v2_avar; + case NVPTX::LD_v2i32_areg: return NVPTX::LDV_i32_v2_areg; + case NVPTX::LD_v2i32_ari: return NVPTX::LDV_i32_v2_ari; + case NVPTX::LD_v2i32_asi: return NVPTX::LDV_i32_v2_asi; + case NVPTX::LD_v4i32_avar: return NVPTX::LDV_i32_v4_avar; + case NVPTX::LD_v4i32_areg: return NVPTX::LDV_i32_v4_areg; + case NVPTX::LD_v4i32_ari: return NVPTX::LDV_i32_v4_ari; + case NVPTX::LD_v4i32_asi: return NVPTX::LDV_i32_v4_asi; + + case NVPTX::LD_v2f32_avar: return NVPTX::LDV_f32_v2_avar; + case NVPTX::LD_v2f32_areg: return NVPTX::LDV_f32_v2_areg; + case NVPTX::LD_v2f32_ari: return NVPTX::LDV_f32_v2_ari; + case NVPTX::LD_v2f32_asi: return NVPTX::LDV_f32_v2_asi; + case NVPTX::LD_v4f32_avar: return NVPTX::LDV_f32_v4_avar; + case NVPTX::LD_v4f32_areg: return NVPTX::LDV_f32_v4_areg; + case NVPTX::LD_v4f32_ari: return NVPTX::LDV_f32_v4_ari; + case NVPTX::LD_v4f32_asi: return NVPTX::LDV_f32_v4_asi; + + case NVPTX::LD_v2i64_avar: return NVPTX::LDV_i64_v2_avar; + case NVPTX::LD_v2i64_areg: return NVPTX::LDV_i64_v2_areg; + case NVPTX::LD_v2i64_ari: return NVPTX::LDV_i64_v2_ari; + case NVPTX::LD_v2i64_asi: return NVPTX::LDV_i64_v2_asi; + case NVPTX::LD_v2f64_avar: return NVPTX::LDV_f64_v2_avar; + case NVPTX::LD_v2f64_areg: return NVPTX::LDV_f64_v2_areg; + case NVPTX::LD_v2f64_ari: return NVPTX::LDV_f64_v2_ari; + case NVPTX::LD_v2f64_asi: return NVPTX::LDV_f64_v2_asi; + + case NVPTX::ST_v2i8_avar: return NVPTX::STV_i8_v2_avar; + case NVPTX::ST_v2i8_areg: return NVPTX::STV_i8_v2_areg; + case NVPTX::ST_v2i8_ari: return NVPTX::STV_i8_v2_ari; + case NVPTX::ST_v2i8_asi: return NVPTX::STV_i8_v2_asi; + case NVPTX::ST_v4i8_avar: return NVPTX::STV_i8_v4_avar; + case NVPTX::ST_v4i8_areg: return NVPTX::STV_i8_v4_areg; + case NVPTX::ST_v4i8_ari: return NVPTX::STV_i8_v4_ari; + case NVPTX::ST_v4i8_asi: return NVPTX::STV_i8_v4_asi; + + case NVPTX::ST_v2i16_avar: return NVPTX::STV_i16_v2_avar; + case NVPTX::ST_v2i16_areg: return NVPTX::STV_i16_v2_areg; + case NVPTX::ST_v2i16_ari: return NVPTX::STV_i16_v2_ari; + case NVPTX::ST_v2i16_asi: return NVPTX::STV_i16_v2_asi; + case NVPTX::ST_v4i16_avar: return NVPTX::STV_i16_v4_avar; + case NVPTX::ST_v4i16_areg: return NVPTX::STV_i16_v4_areg; + case NVPTX::ST_v4i16_ari: return NVPTX::STV_i16_v4_ari; + case NVPTX::ST_v4i16_asi: return NVPTX::STV_i16_v4_asi; + + case NVPTX::ST_v2i32_avar: return NVPTX::STV_i32_v2_avar; + case NVPTX::ST_v2i32_areg: return NVPTX::STV_i32_v2_areg; + case NVPTX::ST_v2i32_ari: return NVPTX::STV_i32_v2_ari; + case NVPTX::ST_v2i32_asi: return NVPTX::STV_i32_v2_asi; + case NVPTX::ST_v4i32_avar: return NVPTX::STV_i32_v4_avar; + case NVPTX::ST_v4i32_areg: return NVPTX::STV_i32_v4_areg; + case NVPTX::ST_v4i32_ari: return NVPTX::STV_i32_v4_ari; + case NVPTX::ST_v4i32_asi: 
return NVPTX::STV_i32_v4_asi; + + case NVPTX::ST_v2f32_avar: return NVPTX::STV_f32_v2_avar; + case NVPTX::ST_v2f32_areg: return NVPTX::STV_f32_v2_areg; + case NVPTX::ST_v2f32_ari: return NVPTX::STV_f32_v2_ari; + case NVPTX::ST_v2f32_asi: return NVPTX::STV_f32_v2_asi; + case NVPTX::ST_v4f32_avar: return NVPTX::STV_f32_v4_avar; + case NVPTX::ST_v4f32_areg: return NVPTX::STV_f32_v4_areg; + case NVPTX::ST_v4f32_ari: return NVPTX::STV_f32_v4_ari; + case NVPTX::ST_v4f32_asi: return NVPTX::STV_f32_v4_asi; + + case NVPTX::ST_v2i64_avar: return NVPTX::STV_i64_v2_avar; + case NVPTX::ST_v2i64_areg: return NVPTX::STV_i64_v2_areg; + case NVPTX::ST_v2i64_ari: return NVPTX::STV_i64_v2_ari; + case NVPTX::ST_v2i64_asi: return NVPTX::STV_i64_v2_asi; + case NVPTX::ST_v2f64_avar: return NVPTX::STV_f64_v2_avar; + case NVPTX::ST_v2f64_areg: return NVPTX::STV_f64_v2_areg; + case NVPTX::ST_v2f64_ari: return NVPTX::STV_f64_v2_ari; + case NVPTX::ST_v2f64_asi: return NVPTX::STV_f64_v2_asi; + } + return 0; +} diff --git a/lib/Target/NVPTX/cl_common_defines.h b/lib/Target/NVPTX/cl_common_defines.h new file mode 100644 index 0000000..a7347ef --- /dev/null +++ b/lib/Target/NVPTX/cl_common_defines.h @@ -0,0 +1,125 @@ +#ifndef __CL_COMMON_DEFINES_H__ +#define __CL_COMMON_DEFINES_H__ +// This file includes defines that are common to both kernel code and +// the NVPTX back-end. + +// +// Common defines for Image intrinsics +// Channel order +enum { + CLK_R = 0x10B0, + CLK_A = 0x10B1, + CLK_RG = 0x10B2, + CLK_RA = 0x10B3, + CLK_RGB = 0x10B4, + CLK_RGBA = 0x10B5, + CLK_BGRA = 0x10B6, + CLK_ARGB = 0x10B7, + +#if (__NV_CL_C_VERSION == __NV_CL_C_VERSION_1_0) + CLK_xRGB = 0x10B7, +#endif + + CLK_INTENSITY = 0x10B8, + CLK_LUMINANCE = 0x10B9 + +#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1) + , + CLK_Rx = 0x10BA, + CLK_RGx = 0x10BB, + CLK_RGBx = 0x10BC +#endif +}; + + +typedef enum clk_channel_type { + // valid formats for float return types + CLK_SNORM_INT8 = 0x10D0, // four channel RGBA snorm8 + CLK_SNORM_INT16 = 0x10D1, // four channel RGBA snorm16 + CLK_UNORM_INT8 = 0x10D2, // four channel RGBA unorm8 + CLK_UNORM_INT16 = 0x10D3, // four channel RGBA unorm16 + CLK_HALF_FLOAT = 0x10DD, // four channel RGBA half + CLK_FLOAT = 0x10DE, // four channel RGBA float + +#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1) + CLK_UNORM_SHORT_565 = 0x10D4, + CLK_UNORM_SHORT_555 = 0x10D5, + CLK_UNORM_INT_101010 = 0x10D6, +#endif + + // valid only for integer return types + CLK_SIGNED_INT8 = 0x10D7, + CLK_SIGNED_INT16 = 0x10D8, + CLK_SIGNED_INT32 = 0x10D9, + CLK_UNSIGNED_INT8 = 0x10DA, + CLK_UNSIGNED_INT16 = 0x10DB, + CLK_UNSIGNED_INT32 = 0x10DC, + + // CI SPI for CPU + __CLK_UNORM_INT8888 , // four channel ARGB unorm8 + __CLK_UNORM_INT8888R, // four channel BGRA unorm8 + + __CLK_VALID_IMAGE_TYPE_COUNT, + __CLK_INVALID_IMAGE_TYPE = __CLK_VALID_IMAGE_TYPE_COUNT, + __CLK_VALID_IMAGE_TYPE_MASK_BITS = 4, // number of bits required to + // represent any image type + __CLK_VALID_IMAGE_TYPE_MASK = ( 1 << __CLK_VALID_IMAGE_TYPE_MASK_BITS ) - 1 +} clk_channel_type; + +typedef enum clk_sampler_type { + __CLK_ADDRESS_BASE = 0, + CLK_ADDRESS_NONE = 0 << __CLK_ADDRESS_BASE, + CLK_ADDRESS_CLAMP = 1 << __CLK_ADDRESS_BASE, + CLK_ADDRESS_CLAMP_TO_EDGE = 2 << __CLK_ADDRESS_BASE, + CLK_ADDRESS_REPEAT = 3 << __CLK_ADDRESS_BASE, + CLK_ADDRESS_MIRROR = 4 << __CLK_ADDRESS_BASE, + +#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1) + CLK_ADDRESS_MIRRORED_REPEAT = CLK_ADDRESS_MIRROR, +#endif + __CLK_ADDRESS_MASK = CLK_ADDRESS_NONE | CLK_ADDRESS_CLAMP | 
CLK_ADDRESS_CLAMP_TO_EDGE | + CLK_ADDRESS_REPEAT | CLK_ADDRESS_MIRROR, + __CLK_ADDRESS_BITS = 3, // number of bits required to + // represent address info + + __CLK_NORMALIZED_BASE = __CLK_ADDRESS_BITS, + CLK_NORMALIZED_COORDS_FALSE = 0, + CLK_NORMALIZED_COORDS_TRUE = 1 << __CLK_NORMALIZED_BASE, + __CLK_NORMALIZED_MASK = CLK_NORMALIZED_COORDS_FALSE | + CLK_NORMALIZED_COORDS_TRUE, + __CLK_NORMALIZED_BITS = 1, // number of bits required to + // represent normalization + + __CLK_FILTER_BASE = __CLK_NORMALIZED_BASE + + __CLK_NORMALIZED_BITS, + CLK_FILTER_NEAREST = 0 << __CLK_FILTER_BASE, + CLK_FILTER_LINEAR = 1 << __CLK_FILTER_BASE, + CLK_FILTER_ANISOTROPIC = 2 << __CLK_FILTER_BASE, + __CLK_FILTER_MASK = CLK_FILTER_NEAREST | CLK_FILTER_LINEAR | + CLK_FILTER_ANISOTROPIC, + __CLK_FILTER_BITS = 2, // number of bits required to + // represent filter info + + __CLK_MIP_BASE = __CLK_FILTER_BASE + __CLK_FILTER_BITS, + CLK_MIP_NEAREST = 0 << __CLK_MIP_BASE, + CLK_MIP_LINEAR = 1 << __CLK_MIP_BASE, + CLK_MIP_ANISOTROPIC = 2 << __CLK_MIP_BASE, + __CLK_MIP_MASK = CLK_MIP_NEAREST | CLK_MIP_LINEAR | + CLK_MIP_ANISOTROPIC, + __CLK_MIP_BITS = 2, + + __CLK_SAMPLER_BITS = __CLK_MIP_BASE + __CLK_MIP_BITS, + __CLK_SAMPLER_MASK = __CLK_MIP_MASK | __CLK_FILTER_MASK | + __CLK_NORMALIZED_MASK | __CLK_ADDRESS_MASK, + + __CLK_ANISOTROPIC_RATIO_BITS = 5, + __CLK_ANISOTROPIC_RATIO_MASK = (int) 0x80000000 >> + (__CLK_ANISOTROPIC_RATIO_BITS-1) +} clk_sampler_type; + +// Memory synchronization +#define CLK_LOCAL_MEM_FENCE (1 << 0) +#define CLK_GLOBAL_MEM_FENCE (1 << 1) + +#endif // __CL_COMMON_DEFINES_H__ diff --git a/lib/Target/NVPTX/gen-register-defs.py b/lib/Target/NVPTX/gen-register-defs.py new file mode 100644 index 0000000..ed06668 --- /dev/null +++ b/lib/Target/NVPTX/gen-register-defs.py @@ -0,0 +1,202 @@ +#!/usr/bin/env python + +num_regs = 396 + +outFile = open('NVPTXRegisterInfo.td', 'w') + +outFile.write(''' +//===-- NVPTXRegisterInfo.td - NVPTX Register defs ---------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details.
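The clk_sampler_type enum in cl_common_defines.h above packs three independent fields into one integer: the address mode in bits [2:0], the normalized-coordinates flag in bit 3, and the filter mode in bits [5:4], with the mip mode and anisotropic ratio sitting above those. A self-contained check of that layout, with the constants copied from the header above:

#include <cassert>

// Constants reproduced from cl_common_defines.h.
enum {
  CLK_ADDRESS_CLAMP          = 1,      // bits [2:0]
  __CLK_ADDRESS_MASK         = 0x7,
  CLK_NORMALIZED_COORDS_TRUE = 1 << 3, // bit 3
  CLK_FILTER_LINEAR          = 1 << 4, // bits [5:4]
  __CLK_FILTER_MASK          = 0x30
};

int main() {
  // A sampler word is built by OR-ing one value from each field.
  unsigned Sampler = CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE |
                     CLK_FILTER_LINEAR;              // == 0x19

  // Each field decodes independently through its mask.
  assert((Sampler & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP);
  assert((Sampler & __CLK_FILTER_MASK) == CLK_FILTER_LINEAR);
  assert(Sampler & CLK_NORMALIZED_COORDS_TRUE);
  return 0;
}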
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Declarations that describe the PTX register file +//===----------------------------------------------------------------------===// + +class NVPTXReg<string n> : Register<n> { + let Namespace = "NVPTX"; +} + +class NVPTXRegClass<list<ValueType> regTypes, int alignment, dag regList> + : RegisterClass <"NVPTX", regTypes, alignment, regList>; + +//===----------------------------------------------------------------------===// +// Registers +//===----------------------------------------------------------------------===// + +// Special Registers used as stack pointer +def VRFrame : NVPTXReg<"%SP">; +def VRFrameLocal : NVPTXReg<"%SPL">; + +// Special Registers used as the stack +def VRDepot : NVPTXReg<"%Depot">; +''') + +# Predicates +outFile.write(''' +//===--- Predicate --------------------------------------------------------===// +''') +for i in range(0, num_regs): + outFile.write('def P%d : NVPTXReg<"%%p%d">;\n' % (i, i)) + +# Int8 +outFile.write(''' +//===--- 8-bit ------------------------------------------------------------===// +''') +for i in range(0, num_regs): + outFile.write('def RC%d : NVPTXReg<"%%rc%d">;\n' % (i, i)) + +# Int16 +outFile.write(''' +//===--- 16-bit -----------------------------------------------------------===// +''') +for i in range(0, num_regs): + outFile.write('def RS%d : NVPTXReg<"%%rs%d">;\n' % (i, i)) + +# Int32 +outFile.write(''' +//===--- 32-bit -----------------------------------------------------------===// +''') +for i in range(0, num_regs): + outFile.write('def R%d : NVPTXReg<"%%r%d">;\n' % (i, i)) + +# Int64 +outFile.write(''' +//===--- 64-bit -----------------------------------------------------------===// +''') +for i in range(0, num_regs): + outFile.write('def RL%d : NVPTXReg<"%%rl%d">;\n' % (i, i)) + +# F32 +outFile.write(''' +//===--- 32-bit float -----------------------------------------------------===// +''') +for i in range(0, num_regs): + outFile.write('def F%d : NVPTXReg<"%%f%d">;\n' % (i, i)) + +# F64 +outFile.write(''' +//===--- 64-bit float -----------------------------------------------------===// +''') +for i in range(0, num_regs): + outFile.write('def FL%d : NVPTXReg<"%%fl%d">;\n' % (i, i)) + +# Vector registers +outFile.write(''' +//===--- Vector -----------------------------------------------------------===// +''') +for i in range(0, num_regs): + outFile.write('def v2b8_%d : NVPTXReg<"%%v2b8_%d">;\n' % (i, i)) +for i in range(0, num_regs): + outFile.write('def v2b16_%d : NVPTXReg<"%%v2b16_%d">;\n' % (i, i)) +for i in range(0, num_regs): + outFile.write('def v2b32_%d : NVPTXReg<"%%v2b32_%d">;\n' % (i, i)) +for i in range(0, num_regs): + outFile.write('def v2b64_%d : NVPTXReg<"%%v2b64_%d">;\n' % (i, i)) + +for i in range(0, num_regs): + outFile.write('def v4b8_%d : NVPTXReg<"%%v4b8_%d">;\n' % (i, i)) +for i in range(0, num_regs): + outFile.write('def v4b16_%d : NVPTXReg<"%%v4b16_%d">;\n' % (i, i)) +for i in range(0, num_regs): + outFile.write('def v4b32_%d : NVPTXReg<"%%v4b32_%d">;\n' % (i, i)) + +# Argument registers +outFile.write(''' +//===--- Arguments --------------------------------------------------------===// +''') +for i in range(0, num_regs): + outFile.write('def ia%d : NVPTXReg<"%%ia%d">;\n' % (i, i)) +for i in range(0, num_regs): + outFile.write('def la%d : NVPTXReg<"%%la%d">;\n' % (i, i)) +for i in range(0, num_regs): + outFile.write('def fa%d : 
NVPTXReg<"%%fa%d">;\n' % (i, i)) +for i in range(0, num_regs): + outFile.write('def da%d : NVPTXReg<"%%da%d">;\n' % (i, i)) + +outFile.write(''' +//===----------------------------------------------------------------------===// +// Register classes +//===----------------------------------------------------------------------===// +''') + +outFile.write('def Int1Regs : NVPTXRegClass<[i1], 8, (add (sequence "P%%u", 0, %d))>;\n' % (num_regs-1)) +outFile.write('def Int8Regs : NVPTXRegClass<[i8], 8, (add (sequence "RC%%u", 0, %d))>;\n' % (num_regs-1)) +outFile.write('def Int16Regs : NVPTXRegClass<[i16], 16, (add (sequence "RS%%u", 0, %d))>;\n' % (num_regs-1)) +outFile.write('def Int32Regs : NVPTXRegClass<[i32], 32, (add (sequence "R%%u", 0, %d))>;\n' % (num_regs-1)) +outFile.write('def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%%u", 0, %d))>;\n' % (num_regs-1)) + +outFile.write('def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%%u", 0, %d))>;\n' % (num_regs-1)) +outFile.write('def Float64Regs : NVPTXRegClass<[f64], 64, (add (sequence "FL%%u", 0, %d))>;\n' % (num_regs-1)) + +outFile.write('def Int32ArgRegs : NVPTXRegClass<[i32], 32, (add (sequence "ia%%u", 0, %d))>;\n' % (num_regs-1)) +outFile.write('def Int64ArgRegs : NVPTXRegClass<[i64], 64, (add (sequence "la%%u", 0, %d))>;\n' % (num_regs-1)) +outFile.write('def Float32ArgRegs : NVPTXRegClass<[f32], 32, (add (sequence "fa%%u", 0, %d))>;\n' % (num_regs-1)) +outFile.write('def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%%u", 0, %d))>;\n' % (num_regs-1)) + +outFile.write(''' +// Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used. +def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot)>; +''') + +outFile.write(''' +class NVPTXVecRegClass<list<ValueType> regTypes, int alignment, dag regList, + NVPTXRegClass sClass, + int e, + string n> + : NVPTXRegClass<regTypes, alignment, regList> +{ + NVPTXRegClass scalarClass=sClass; + int elems=e; + string name=n; +} +''') + + +outFile.write('def V2F32Regs\n : NVPTXVecRegClass<[v2f32], 64, (add (sequence "v2b32_%%u", 0, %d)),\n Float32Regs, 2, ".v2.f32">;\n' % (num_regs-1)) +outFile.write('def V4F32Regs\n : NVPTXVecRegClass<[v4f32], 128, (add (sequence "v4b32_%%u", 0, %d)),\n Float32Regs, 4, ".v4.f32">;\n' % (num_regs-1)) + +outFile.write('def V2I32Regs\n : NVPTXVecRegClass<[v2i32], 64, (add (sequence "v2b32_%%u", 0, %d)),\n Int32Regs, 2, ".v2.u32">;\n' % (num_regs-1)) +outFile.write('def V4I32Regs\n : NVPTXVecRegClass<[v4i32], 128, (add (sequence "v4b32_%%u", 0, %d)),\n Int32Regs, 4, ".v4.u32">;\n' % (num_regs-1)) + +outFile.write('def V2F64Regs\n : NVPTXVecRegClass<[v2f64], 128, (add (sequence "v2b64_%%u", 0, %d)),\n Float64Regs, 2, ".v2.f64">;\n' % (num_regs-1)) +outFile.write('def V2I64Regs\n : NVPTXVecRegClass<[v2i64], 128, (add (sequence "v2b64_%%u", 0, %d)),\n Int64Regs, 2, ".v2.u64">;\n' % (num_regs-1)) + +outFile.write('def V2I16Regs\n : NVPTXVecRegClass<[v2i16], 32, (add (sequence "v2b16_%%u", 0, %d)),\n Int16Regs, 2, ".v2.u16">;\n' % (num_regs-1)) +outFile.write('def V4I16Regs\n : NVPTXVecRegClass<[v4i16], 64, (add (sequence "v4b16_%%u", 0, %d)),\n Int16Regs, 4, ".v4.u16">;\n' % (num_regs-1)) + +outFile.write('def V2I8Regs\n : NVPTXVecRegClass<[v2i8], 16, (add (sequence "v2b8_%%u", 0, %d)),\n Int8Regs, 2, ".v2.u8">;\n' % (num_regs-1)) +outFile.write('def V4I8Regs\n : NVPTXVecRegClass<[v4i8], 32, (add (sequence "v4b8_%%u", 0, %d)),\n Int8Regs, 4, ".v4.u8">;\n' % (num_regs-1)) + +outFile.close() + + +outFile = 
open('NVPTXNumRegisters.h', 'w') +outFile.write(''' +//===-- NVPTXNumRegisters.h - PTX Register Info ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef NVPTX_NUM_REGISTERS_H +#define NVPTX_NUM_REGISTERS_H + +namespace llvm { + +const unsigned NVPTXNumRegisters = %d; + +} + +#endif +''' % num_regs) + +outFile.close() diff --git a/lib/Target/PTX/CMakeLists.txt b/lib/Target/PTX/CMakeLists.txt deleted file mode 100644 index a3be342..0000000 --- a/lib/Target/PTX/CMakeLists.txt +++ /dev/null @@ -1,32 +0,0 @@ -set(LLVM_TARGET_DEFINITIONS PTX.td) - -tablegen(LLVM PTXGenAsmWriter.inc -gen-asm-writer) -tablegen(LLVM PTXGenDAGISel.inc -gen-dag-isel) -tablegen(LLVM PTXGenInstrInfo.inc -gen-instr-info) -tablegen(LLVM PTXGenRegisterInfo.inc -gen-register-info) -tablegen(LLVM PTXGenSubtargetInfo.inc -gen-subtarget) -add_public_tablegen_target(PTXCommonTableGen) - -add_llvm_target(PTXCodeGen - PTXAsmPrinter.cpp - PTXISelDAGToDAG.cpp - PTXISelLowering.cpp - PTXInstrInfo.cpp - PTXFPRoundingModePass.cpp - PTXFrameLowering.cpp - PTXMCAsmStreamer.cpp - PTXMCInstLower.cpp - PTXMFInfoExtract.cpp - PTXMachineFunctionInfo.cpp - PTXParamManager.cpp - PTXRegAlloc.cpp - PTXRegisterInfo.cpp - PTXSelectionDAGInfo.cpp - PTXSubtarget.cpp - PTXTargetMachine.cpp - ) - -add_subdirectory(TargetInfo) -add_subdirectory(InstPrinter) -add_subdirectory(MCTargetDesc) - diff --git a/lib/Target/PTX/InstPrinter/CMakeLists.txt b/lib/Target/PTX/InstPrinter/CMakeLists.txt deleted file mode 100644 index b252893..0000000 --- a/lib/Target/PTX/InstPrinter/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) - -add_llvm_library(LLVMPTXAsmPrinter - PTXInstPrinter.cpp - ) - -add_dependencies(LLVMPTXAsmPrinter PTXCommonTableGen) - diff --git a/lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp b/lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp deleted file mode 100644 index 1830213..0000000 --- a/lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp +++ /dev/null @@ -1,249 +0,0 @@ -//===-- PTXInstPrinter.cpp - Convert PTX MCInst to assembly syntax --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints a PTX MCInst to a .ptx file. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "asm-printer" -#include "PTXInstPrinter.h" -#include "MCTargetDesc/PTXBaseInfo.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/ADT/APFloat.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -using namespace llvm; - -#include "PTXGenAsmWriter.inc" - -PTXInstPrinter::PTXInstPrinter(const MCAsmInfo &MAI, - const MCInstrInfo &MII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI) : - MCInstPrinter(MAI, MII, MRI) { - // Initialize the set of available features. 
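A detail of gen-register-defs.py above that is easy to misread: TableGen's sequence operator itself takes a format string such as "R%u", so the script has to emit a literal percent sign and therefore doubles it as %%u inside its own %-formatted strings. The same two-layer escaping, sketched in C++ (printf stands in for the script's outFile.write; num_regs is the script's own constant):

#include <cstdio>

int main() {
  int num_regs = 396;
  // "%%u" survives formatting as the literal "%u" that TableGen's
  // (sequence "R%u", 0, N) expects; "%d" is expanded here, to 395.
  std::printf("def Int32Regs : NVPTXRegClass<[i32], 32, "
              "(add (sequence \"R%%u\", 0, %d))>;\n",
              num_regs - 1);
  return 0;
}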
- setAvailableFeatures(STI.getFeatureBits()); -} - -void PTXInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { - // Decode the register number into type and offset - unsigned RegSpace = RegNo & 0x7; - unsigned RegType = (RegNo >> 3) & 0x7; - unsigned RegOffset = RegNo >> 6; - - // Print the register - OS << "%"; - - switch (RegSpace) { - default: - llvm_unreachable("Unknown register space!"); - case PTXRegisterSpace::Reg: - switch (RegType) { - default: - llvm_unreachable("Unknown register type!"); - case PTXRegisterType::Pred: - OS << "p"; - break; - case PTXRegisterType::B16: - OS << "rh"; - break; - case PTXRegisterType::B32: - OS << "r"; - break; - case PTXRegisterType::B64: - OS << "rd"; - break; - case PTXRegisterType::F32: - OS << "f"; - break; - case PTXRegisterType::F64: - OS << "fd"; - break; - } - break; - case PTXRegisterSpace::Return: - OS << "ret"; - break; - case PTXRegisterSpace::Argument: - OS << "arg"; - break; - } - - OS << RegOffset; -} - -void PTXInstPrinter::printInst(const MCInst *MI, raw_ostream &O, - StringRef Annot) { - printPredicate(MI, O); - switch (MI->getOpcode()) { - default: - printInstruction(MI, O); - break; - case PTX::CALL: - printCall(MI, O); - } - O << ";"; - printAnnotation(O, Annot); -} - -void PTXInstPrinter::printPredicate(const MCInst *MI, raw_ostream &O) { - // The last two operands are the predicate operands - int RegIndex; - int OpIndex; - - if (MI->getOpcode() == PTX::CALL) { - RegIndex = 0; - OpIndex = 1; - } else { - RegIndex = MI->getNumOperands()-2; - OpIndex = MI->getNumOperands()-1; - } - - int PredOp = MI->getOperand(OpIndex).getImm(); - if (PredOp == PTXPredicate::None) - return; - - if (PredOp == PTXPredicate::Negate) - O << '!'; - else - O << '@'; - - printOperand(MI, RegIndex, O); -} - -void PTXInstPrinter::printCall(const MCInst *MI, raw_ostream &O) { - O << "\tcall.uni\t"; - // The first two operands are the predicate slot - unsigned Index = 2; - unsigned NumRets = MI->getOperand(Index++).getImm(); - - if (NumRets > 0) { - O << "("; - printOperand(MI, Index++, O); - for (unsigned i = 1; i < NumRets; ++i) { - O << ", "; - printOperand(MI, Index++, O); - } - O << "), "; - } - - const MCExpr* Expr = MI->getOperand(Index++).getExpr(); - unsigned NumArgs = MI->getOperand(Index++).getImm(); - - // if the function call is to printf or puts, change to vprintf - if (const MCSymbolRefExpr *SymRefExpr = dyn_cast<MCSymbolRefExpr>(Expr)) { - const MCSymbol &Sym = SymRefExpr->getSymbol(); - if (Sym.getName() == "printf" || Sym.getName() == "puts") { - O << "vprintf"; - } else { - O << Sym.getName(); - } - } else { - O << *Expr; - } - - O << ", ("; - - if (NumArgs > 0) { - printOperand(MI, Index++, O); - for (unsigned i = 1; i < NumArgs; ++i) { - O << ", "; - printOperand(MI, Index++, O); - } - } - O << ")"; -} - -void PTXInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isImm()) { - O << Op.getImm(); - } else if (Op.isFPImm()) { - double Imm = Op.getFPImm(); - APFloat FPImm(Imm); - APInt FPIntImm = FPImm.bitcastToAPInt(); - O << "0D"; - // PTX requires us to output the full 64 bits, even if the number is zero - if (FPIntImm.getZExtValue() > 0) { - O << FPIntImm.toString(16, false); - } else { - O << "0000000000000000"; - } - } else if (Op.isReg()) { - printRegName(O, Op.getReg()); - } else { - assert(Op.isExpr() && "unknown operand kind in printOperand"); - const MCExpr *Expr = Op.getExpr(); - if (const MCSymbolRefExpr *SymRefExpr = 
dyn_cast<MCSymbolRefExpr>(Expr)) { - const MCSymbol &Sym = SymRefExpr->getSymbol(); - O << Sym.getName(); - } else { - O << *Op.getExpr(); - } - } -} - -void PTXInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - // By definition, operand OpNo+1 is an i32imm - const MCOperand &Op2 = MI->getOperand(OpNo+1); - printOperand(MI, OpNo, O); - if (Op2.getImm() == 0) - return; // don't print "+0" - O << "+" << Op2.getImm(); -} - -void PTXInstPrinter::printRoundingMode(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - assert (Op.isImm() && "Rounding modes must be immediate values"); - switch (Op.getImm()) { - default: - llvm_unreachable("Unknown rounding mode!"); - case PTXRoundingMode::RndDefault: - llvm_unreachable("FP rounding-mode pass did not handle instruction!"); - case PTXRoundingMode::RndNone: - // Do not print anything. - break; - case PTXRoundingMode::RndNearestEven: - O << ".rn"; - break; - case PTXRoundingMode::RndTowardsZero: - O << ".rz"; - break; - case PTXRoundingMode::RndNegInf: - O << ".rm"; - break; - case PTXRoundingMode::RndPosInf: - O << ".rp"; - break; - case PTXRoundingMode::RndApprox: - O << ".approx"; - break; - case PTXRoundingMode::RndNearestEvenInt: - O << ".rni"; - break; - case PTXRoundingMode::RndTowardsZeroInt: - O << ".rzi"; - break; - case PTXRoundingMode::RndNegInfInt: - O << ".rmi"; - break; - case PTXRoundingMode::RndPosInfInt: - O << ".rpi"; - break; - } -} - diff --git a/lib/Target/PTX/InstPrinter/PTXInstPrinter.h b/lib/Target/PTX/InstPrinter/PTXInstPrinter.h deleted file mode 100644 index ea4d504..0000000 --- a/lib/Target/PTX/InstPrinter/PTXInstPrinter.h +++ /dev/null @@ -1,45 +0,0 @@ -//===- PTXInstPrinter.h - Convert PTX MCInst to assembly syntax -*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints n PTX MCInst to a .ptx file. -// -//===----------------------------------------------------------------------===// - -#ifndef PTXINSTPRINTER_H -#define PTXINSTPRINTER_H - -#include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCSubtargetInfo.h" - -namespace llvm { - -class MCOperand; - -class PTXInstPrinter : public MCInstPrinter { -public: - PTXInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI, const MCSubtargetInfo &STI); - - virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot); - virtual void printRegName(raw_ostream &OS, unsigned RegNo) const; - - // Autogenerated by tblgen. 
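printOperand() above prints a double-precision immediate as "0D" followed by the full 64-bit IEEE-754 bit pattern in hex, special-casing zero because APInt::toString() emits no leading zeros. A self-contained sketch of the same encoding; zero-padded formatting also covers bit patterns whose leading hex digits are zero (e.g. denormals), which the toString() path above would print short:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Print a double the way the PTX printer above does: '0D' plus all
// sixteen hex digits of its bit pattern.
static void printPTXDouble(double D) {
  std::uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof Bits);  // bit-cast, no value conversion
  std::printf("0D%016llX\n", static_cast<unsigned long long>(Bits));
}

int main() {
  printPTXDouble(1.0);  // 0D3FF0000000000000
  printPTXDouble(0.0);  // 0D0000000000000000, no special case needed
  return 0;
}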
- void printInstruction(const MCInst *MI, raw_ostream &O); - static const char *getRegisterName(unsigned RegNo); - - void printPredicate(const MCInst *MI, raw_ostream &O); - void printCall(const MCInst *MI, raw_ostream &O); - void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printRoundingMode(const MCInst *MI, unsigned OpNo, raw_ostream &O); -}; -} - -#endif - diff --git a/lib/Target/PTX/MCTargetDesc/CMakeLists.txt b/lib/Target/PTX/MCTargetDesc/CMakeLists.txt deleted file mode 100644 index d1fd74c..0000000 --- a/lib/Target/PTX/MCTargetDesc/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -add_llvm_library(LLVMPTXDesc - PTXMCTargetDesc.cpp - PTXMCAsmInfo.cpp - ) - -add_dependencies(LLVMPTXDesc PTXCommonTableGen) diff --git a/lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h b/lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h deleted file mode 100644 index a3e0f32..0000000 --- a/lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h +++ /dev/null @@ -1,134 +0,0 @@ -//===-- PTXBaseInfo.h - Top level definitions for PTX -------- --*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains small standalone helper functions and enum definitions for -// the PTX target useful for the compiler back-end and the MC libraries. -// As such, it deliberately does not include references to LLVM core -// code gen types, passes, etc.. -// -//===----------------------------------------------------------------------===// - -#ifndef PTXBASEINFO_H -#define PTXBASEINFO_H - -#include "PTXMCTargetDesc.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" - -namespace llvm { - namespace PTXStateSpace { - enum { - Global = 0, // default to global state space - Constant = 1, - Local = 2, - Parameter = 3, - Shared = 4 - }; - } // namespace PTXStateSpace - - namespace PTXPredicate { - enum { - Normal = 0, - Negate = 1, - None = 2 - }; - } // namespace PTXPredicate - - /// Namespace to hold all target-specific flags. 
- namespace PTXRoundingMode { - // Instruction Flags - enum { - // Rounding Mode Flags - RndMask = 15, - RndDefault = 0, // --- - RndNone = 1, // <NONE> - RndNearestEven = 2, // .rn - RndTowardsZero = 3, // .rz - RndNegInf = 4, // .rm - RndPosInf = 5, // .rp - RndApprox = 6, // .approx - RndNearestEvenInt = 7, // .rni - RndTowardsZeroInt = 8, // .rzi - RndNegInfInt = 9, // .rmi - RndPosInfInt = 10 // .rpi - }; - } // namespace PTXII - - namespace PTXRegisterType { - // Register type encoded in MCOperands - enum { - Pred = 0, - B16, - B32, - B64, - F32, - F64 - }; - } // namespace PTXRegisterType - - namespace PTXRegisterSpace { - // Register space encoded in MCOperands - enum { - Reg = 0, - Local, - Param, - Argument, - Return - }; - } - - inline static void decodeRegisterName(raw_ostream &OS, - unsigned EncodedReg) { - OS << "%"; - - unsigned RegSpace = EncodedReg & 0x7; - unsigned RegType = (EncodedReg >> 3) & 0x7; - unsigned RegOffset = EncodedReg >> 6; - - switch (RegSpace) { - default: - llvm_unreachable("Unknown register space!"); - case PTXRegisterSpace::Reg: - switch (RegType) { - default: - llvm_unreachable("Unknown register type!"); - case PTXRegisterType::Pred: - OS << "p"; - break; - case PTXRegisterType::B16: - OS << "rh"; - break; - case PTXRegisterType::B32: - OS << "r"; - break; - case PTXRegisterType::B64: - OS << "rd"; - break; - case PTXRegisterType::F32: - OS << "f"; - break; - case PTXRegisterType::F64: - OS << "fd"; - break; - } - break; - case PTXRegisterSpace::Return: - OS << "ret"; - break; - case PTXRegisterSpace::Argument: - OS << "arg"; - break; - } - - OS << RegOffset; - } -} // namespace llvm - -#endif - diff --git a/lib/Target/PTX/MCTargetDesc/PTXMCAsmInfo.cpp b/lib/Target/PTX/MCTargetDesc/PTXMCAsmInfo.cpp deleted file mode 100644 index cdfbc80..0000000 --- a/lib/Target/PTX/MCTargetDesc/PTXMCAsmInfo.cpp +++ /dev/null @@ -1,37 +0,0 @@ -//===-- PTXMCAsmInfo.cpp - PTX asm properties -----------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the declarations of the PTXMCAsmInfo properties. -// -//===----------------------------------------------------------------------===// - -#include "PTXMCAsmInfo.h" -#include "llvm/ADT/Triple.h" - -using namespace llvm; - -void PTXMCAsmInfo::anchor() { } - -PTXMCAsmInfo::PTXMCAsmInfo(const Target &T, const StringRef &TT) { - Triple TheTriple(TT); - if (TheTriple.getArch() == Triple::ptx64) - PointerSize = 8; - - CommentString = "//"; - - PrivateGlobalPrefix = "$L__"; - - AllowPeriodsInName = false; - - HasSetDirective = false; - - HasDotTypeDotSizeDirective = false; - - HasSingleParameterDotFile = false; -} diff --git a/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp b/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp deleted file mode 100644 index 08fb970..0000000 --- a/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp +++ /dev/null @@ -1,98 +0,0 @@ -//===-- PTXMCTargetDesc.cpp - PTX Target Descriptions ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file provides PTX specific target descriptions. 
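decodeRegisterName() above, like printRegName() in the deleted PTXInstPrinter.cpp, unpacks one encoding: register space in bits [2:0], register type in bits [5:3], and the register index in the remaining high bits. A sketch of the matching encoder; the patch only ever shows the decode side, so encodeReg() here is a hypothetical helper:

#include <cassert>

// Pack space/type/offset the way the PTX decoders above expect.
static unsigned encodeReg(unsigned Space, unsigned Type, unsigned Offset) {
  return (Space & 0x7) | ((Type & 0x7) << 3) | (Offset << 6);
}

int main() {
  // Space 0 (Reg), type 2 (B32), offset 7 decodes as "%r7".
  unsigned R = encodeReg(0, 2, 7);
  assert((R & 0x7) == 0);         // RegSpace
  assert(((R >> 3) & 0x7) == 2);  // RegType
  assert((R >> 6) == 7);          // RegOffset
  return 0;
}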
-// -//===----------------------------------------------------------------------===// - -#include "PTXMCTargetDesc.h" -#include "PTXMCAsmInfo.h" -#include "InstPrinter/PTXInstPrinter.h" -#include "llvm/MC/MCCodeGenInfo.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/TargetRegistry.h" - -#define GET_INSTRINFO_MC_DESC -#include "PTXGenInstrInfo.inc" - -#define GET_SUBTARGETINFO_MC_DESC -#include "PTXGenSubtargetInfo.inc" - -#define GET_REGINFO_MC_DESC -#include "PTXGenRegisterInfo.inc" - -using namespace llvm; - -static MCInstrInfo *createPTXMCInstrInfo() { - MCInstrInfo *X = new MCInstrInfo(); - InitPTXMCInstrInfo(X); - return X; -} - -static MCRegisterInfo *createPTXMCRegisterInfo(StringRef TT) { - MCRegisterInfo *X = new MCRegisterInfo(); - // PTX does not have a return address register. - InitPTXMCRegisterInfo(X, 0); - return X; -} - -static MCSubtargetInfo *createPTXMCSubtargetInfo(StringRef TT, StringRef CPU, - StringRef FS) { - MCSubtargetInfo *X = new MCSubtargetInfo(); - InitPTXMCSubtargetInfo(X, TT, CPU, FS); - return X; -} - -static MCCodeGenInfo *createPTXMCCodeGenInfo(StringRef TT, Reloc::Model RM, - CodeModel::Model CM, - CodeGenOpt::Level OL) { - MCCodeGenInfo *X = new MCCodeGenInfo(); - X->InitMCCodeGenInfo(RM, CM, OL); - return X; -} - -static MCInstPrinter *createPTXMCInstPrinter(const Target &T, - unsigned SyntaxVariant, - const MCAsmInfo &MAI, - const MCInstrInfo &MII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI) { - assert(SyntaxVariant == 0 && "We only have one syntax variant"); - return new PTXInstPrinter(MAI, MII, MRI, STI); -} - -extern "C" void LLVMInitializePTXTargetMC() { - // Register the MC asm info. - RegisterMCAsmInfo<PTXMCAsmInfo> X(ThePTX32Target); - RegisterMCAsmInfo<PTXMCAsmInfo> Y(ThePTX64Target); - - // Register the MC codegen info. - TargetRegistry::RegisterMCCodeGenInfo(ThePTX32Target, createPTXMCCodeGenInfo); - TargetRegistry::RegisterMCCodeGenInfo(ThePTX64Target, createPTXMCCodeGenInfo); - - // Register the MC instruction info. - TargetRegistry::RegisterMCInstrInfo(ThePTX32Target, createPTXMCInstrInfo); - TargetRegistry::RegisterMCInstrInfo(ThePTX64Target, createPTXMCInstrInfo); - - // Register the MC register info. - TargetRegistry::RegisterMCRegInfo(ThePTX32Target, createPTXMCRegisterInfo); - TargetRegistry::RegisterMCRegInfo(ThePTX64Target, createPTXMCRegisterInfo); - - // Register the MC subtarget info. - TargetRegistry::RegisterMCSubtargetInfo(ThePTX32Target, - createPTXMCSubtargetInfo); - TargetRegistry::RegisterMCSubtargetInfo(ThePTX64Target, - createPTXMCSubtargetInfo); - - // Register the MCInstPrinter. - TargetRegistry::RegisterMCInstPrinter(ThePTX32Target, createPTXMCInstPrinter); - TargetRegistry::RegisterMCInstPrinter(ThePTX64Target, createPTXMCInstPrinter); -} diff --git a/lib/Target/PTX/PTX.h b/lib/Target/PTX/PTX.h deleted file mode 100644 index ffb92cb..0000000 --- a/lib/Target/PTX/PTX.h +++ /dev/null @@ -1,43 +0,0 @@ -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the entry points for global functions defined in the LLVM -// PTX back-end. 
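LLVMInitializePTXTargetMC() above registers one set of factory functions under both target handles (ThePTX32Target and ThePTX64Target), so the 32- and 64-bit triples share every MC component. A toy model of that registration pattern; the map-based registry below is illustrative and is not LLVM's TargetRegistry API:

#include <cassert>
#include <map>
#include <string>

using Factory = int (*)();
static int createMCInstrInfo() { return 42; }  // stand-in factory

int main() {
  std::map<std::string, Factory> Registry;
  // One factory, registered under two target keys.
  Registry["ptx32"] = createMCInstrInfo;
  Registry["ptx64"] = createMCInstrInfo;
  assert(Registry["ptx32"]() == Registry["ptx64"]());
  return 0;
}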
-// -//===----------------------------------------------------------------------===// - -#ifndef PTX_H -#define PTX_H - -#include "MCTargetDesc/PTXBaseInfo.h" -#include "llvm/Target/TargetMachine.h" - -namespace llvm { - class MachineInstr; - class MCInst; - class PTXAsmPrinter; - class PTXTargetMachine; - class FunctionPass; - - FunctionPass *createPTXISelDag(PTXTargetMachine &TM, - CodeGenOpt::Level OptLevel); - - FunctionPass *createPTXMFInfoExtract(PTXTargetMachine &TM, - CodeGenOpt::Level OptLevel); - - FunctionPass *createPTXFPRoundingModePass(PTXTargetMachine &TM, - CodeGenOpt::Level OptLevel); - - FunctionPass *createPTXRegisterAllocator(); - - void LowerPTXMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, - PTXAsmPrinter &AP); - -} // namespace llvm; - -#endif // PTX_H diff --git a/lib/Target/PTX/PTX.td b/lib/Target/PTX/PTX.td deleted file mode 100644 index 994a68e..0000000 --- a/lib/Target/PTX/PTX.td +++ /dev/null @@ -1,141 +0,0 @@ -//===-- PTX.td - Describe the PTX Target Machine -----------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// This is the top level entry point for the PTX target. -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// Target-independent interfaces -//===----------------------------------------------------------------------===// - -include "llvm/Target/Target.td" - -//===----------------------------------------------------------------------===// -// Subtarget Features -//===----------------------------------------------------------------------===// - -//===- Architectural Features ---------------------------------------------===// - -def FeatureDouble : SubtargetFeature<"double", "SupportsDouble", "true", - "Do not demote .f64 to .f32">; - -def FeatureNoFMA : SubtargetFeature<"no-fma","SupportsFMA", "false", - "Disable Fused-Multiply Add">; - -//===- PTX Version --------------------------------------------------------===// - -def FeaturePTX20 : SubtargetFeature<"ptx20", "PTXVersion", "PTX_VERSION_2_0", - "Use PTX Language Version 2.0">; - -def FeaturePTX21 : SubtargetFeature<"ptx21", "PTXVersion", "PTX_VERSION_2_1", - "Use PTX Language Version 2.1">; - -def FeaturePTX22 : SubtargetFeature<"ptx22", "PTXVersion", "PTX_VERSION_2_2", - "Use PTX Language Version 2.2">; - -def FeaturePTX23 : SubtargetFeature<"ptx23", "PTXVersion", "PTX_VERSION_2_3", - "Use PTX Language Version 2.3">; - -//===- PTX Target ---------------------------------------------------------===// - -def FeatureSM10 : SubtargetFeature<"sm10", "PTXTarget", "PTX_SM_1_0", - "Use Shader Model 1.0">; -def FeatureSM11 : SubtargetFeature<"sm11", "PTXTarget", "PTX_SM_1_1", - "Use Shader Model 1.1">; -def FeatureSM12 : SubtargetFeature<"sm12", "PTXTarget", "PTX_SM_1_2", - "Use Shader Model 1.2">; -def FeatureSM13 : SubtargetFeature<"sm13", "PTXTarget", "PTX_SM_1_3", - "Use Shader Model 1.3">; -def FeatureSM20 : SubtargetFeature<"sm20", "PTXTarget", "PTX_SM_2_0", - "Use Shader Model 2.0", [FeatureDouble]>; -def FeatureSM21 : SubtargetFeature<"sm21", "PTXTarget", "PTX_SM_2_1", - "Use Shader Model 2.1", [FeatureDouble]>; -def FeatureSM22 : SubtargetFeature<"sm22", "PTXTarget", "PTX_SM_2_2", - "Use Shader Model 2.2", [FeatureDouble]>; -def FeatureSM23 : 
SubtargetFeature<"sm23", "PTXTarget", "PTX_SM_2_3", - "Use Shader Model 2.3", [FeatureDouble]>; - -def FeatureCOMPUTE10 : SubtargetFeature<"compute10", "PTXTarget", - "PTX_COMPUTE_1_0", - "Use Compute Compatibility 1.0">; -def FeatureCOMPUTE11 : SubtargetFeature<"compute11", "PTXTarget", - "PTX_COMPUTE_1_1", - "Use Compute Compatibility 1.1">; -def FeatureCOMPUTE12 : SubtargetFeature<"compute12", "PTXTarget", - "PTX_COMPUTE_1_2", - "Use Compute Compatibility 1.2">; -def FeatureCOMPUTE13 : SubtargetFeature<"compute13", "PTXTarget", - "PTX_COMPUTE_1_3", - "Use Compute Compatibility 1.3">; -def FeatureCOMPUTE20 : SubtargetFeature<"compute20", "PTXTarget", - "PTX_COMPUTE_2_0", - "Use Compute Compatibility 2.0", - [FeatureDouble]>; - -//===----------------------------------------------------------------------===// -// PTX supported processors -//===----------------------------------------------------------------------===// - -class Proc<string Name, list<SubtargetFeature> Features> - : Processor<Name, NoItineraries, Features>; - -def : Proc<"generic", []>; - -// Processor definitions for compute/shader models -def : Proc<"compute_10", [FeatureCOMPUTE10]>; -def : Proc<"compute_11", [FeatureCOMPUTE11]>; -def : Proc<"compute_12", [FeatureCOMPUTE12]>; -def : Proc<"compute_13", [FeatureCOMPUTE13]>; -def : Proc<"compute_20", [FeatureCOMPUTE20]>; -def : Proc<"sm_10", [FeatureSM10]>; -def : Proc<"sm_11", [FeatureSM11]>; -def : Proc<"sm_12", [FeatureSM12]>; -def : Proc<"sm_13", [FeatureSM13]>; -def : Proc<"sm_20", [FeatureSM20]>; -def : Proc<"sm_21", [FeatureSM21]>; -def : Proc<"sm_22", [FeatureSM22]>; -def : Proc<"sm_23", [FeatureSM23]>; - -// Processor definitions for common GPU architectures -def : Proc<"g80", [FeatureSM10]>; -def : Proc<"gt200", [FeatureSM13]>; -def : Proc<"gf100", [FeatureSM20, FeatureDouble]>; -def : Proc<"fermi", [FeatureSM20, FeatureDouble]>; - -//===----------------------------------------------------------------------===// -// Register File Description -//===----------------------------------------------------------------------===// - -include "PTXRegisterInfo.td" - -//===----------------------------------------------------------------------===// -// Instruction Descriptions -//===----------------------------------------------------------------------===// - -include "PTXInstrInfo.td" - -def PTXInstrInfo : InstrInfo; - -//===----------------------------------------------------------------------===// -// Assembly printer -//===----------------------------------------------------------------------===// -// PTX uses the MC printer for asm output, so make sure the TableGen -// AsmWriter bits get associated with the correct class. -def PTXAsmWriter : AsmWriter { - string AsmWriterClassName = "InstPrinter"; - bit isMCAsmWriter = 1; -} - -//===----------------------------------------------------------------------===// -// Target Declaration -//===----------------------------------------------------------------------===// - -def PTX : Target { - let InstructionSet = PTXInstrInfo; - let AssemblyWriters = [PTXAsmWriter]; -} diff --git a/lib/Target/PTX/PTXAsmPrinter.cpp b/lib/Target/PTX/PTXAsmPrinter.cpp deleted file mode 100644 index 0b6ac7b..0000000 --- a/lib/Target/PTX/PTXAsmPrinter.cpp +++ /dev/null @@ -1,561 +0,0 @@ -//===-- PTXAsmPrinter.cpp - PTX LLVM assembly writer ----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
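In PTX.td above, FeatureSM20 and the later shader-model features list [FeatureDouble], so selecting sm_20 or newer implicitly enables double support; on older targets .f64 is demoted to .f32. A minimal model of implied-feature resolution; the real logic is generated by TableGen from those SubtargetFeature records, and the bit names here are illustrative:

#include <cassert>

enum : unsigned {
  FeatureDouble = 1u << 0,
  FeatureSM13   = 1u << 1,
  FeatureSM20   = 1u << 2
};

// Enable every feature implied by the requested set, as the
// [FeatureDouble] list on FeatureSM20 does in the .td file above.
static unsigned resolveFeatures(unsigned Bits) {
  if (Bits & FeatureSM20)
    Bits |= FeatureDouble;
  return Bits;
}

int main() {
  assert(resolveFeatures(FeatureSM20) & FeatureDouble);    // implied
  assert(!(resolveFeatures(FeatureSM13) & FeatureDouble)); // sm13 demotes f64
  return 0;
}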
-// -//===----------------------------------------------------------------------===// -// -// This file contains a printer that converts from our internal representation -// of machine-dependent LLVM code to PTX assembly language. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "ptx-asm-printer" - -#include "PTXAsmPrinter.h" -#include "PTX.h" -#include "PTXMachineFunctionInfo.h" -#include "PTXParamManager.h" -#include "PTXRegisterInfo.h" -#include "PTXTargetMachine.h" -#include "llvm/Argument.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Function.h" -#include "llvm/Module.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/Twine.h" -#include "llvm/Analysis/DebugInfo.h" -#include "llvm/CodeGen/AsmPrinter.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/Target/Mangler.h" -#include "llvm/Target/TargetLoweringObjectFile.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/Path.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -static const char PARAM_PREFIX[] = "__param_"; -static const char RETURN_PREFIX[] = "__ret_"; - -static const char *getRegisterTypeName(unsigned RegType) { - switch (RegType) { - default: - llvm_unreachable("Unknown register type"); - case PTXRegisterType::Pred: - return ".pred"; - case PTXRegisterType::B16: - return ".b16"; - case PTXRegisterType::B32: - return ".b32"; - case PTXRegisterType::B64: - return ".b64"; - case PTXRegisterType::F32: - return ".f32"; - case PTXRegisterType::F64: - return ".f64"; - } -} - -static const char *getStateSpaceName(unsigned addressSpace) { - switch (addressSpace) { - default: llvm_unreachable("Unknown state space"); - case PTXStateSpace::Global: return "global"; - case PTXStateSpace::Constant: return "const"; - case PTXStateSpace::Local: return "local"; - case PTXStateSpace::Parameter: return "param"; - case PTXStateSpace::Shared: return "shared"; - } -} - -static const char *getTypeName(Type* type) { - while (true) { - switch (type->getTypeID()) { - default: llvm_unreachable("Unknown type"); - case Type::FloatTyID: return ".f32"; - case Type::DoubleTyID: return ".f64"; - case Type::IntegerTyID: - switch (type->getPrimitiveSizeInBits()) { - default: llvm_unreachable("Unknown integer bit-width"); - case 16: return ".u16"; - case 32: return ".u32"; - case 64: return ".u64"; - } - case Type::ArrayTyID: - case Type::PointerTyID: - type = dyn_cast<SequentialType>(type)->getElementType(); - break; - } - } - return NULL; -} - -bool PTXAsmPrinter::doFinalization(Module &M) { - // XXX Temproarily remove global variables so that doFinalization() will not - // emit them again (global variables are emitted at beginning). 
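The getTypeName() helper deleted above loops until it reaches a scalar: arrays and pointers are repeatedly stripped to their element type, and only scalar types map to a PTX suffix. A minimal standalone sketch of that shape, where the Ty struct is an invented stand-in for llvm::Type:

#include <cstdio>
#include <cstdlib>

struct Ty {
  enum Kind { F32, F64, Int, Array, Pointer } K;
  unsigned Bits;      // valid for Int
  const Ty *Element;  // valid for Array/Pointer
};

const char *getTypeName(const Ty *T) {
  for (;;) {
    switch (T->K) {
    case Ty::F32: return ".f32";
    case Ty::F64: return ".f64";
    case Ty::Int:
      if (T->Bits == 16) return ".u16";
      if (T->Bits == 32) return ".u32";
      if (T->Bits == 64) return ".u64";
      std::fprintf(stderr, "unknown integer bit-width\n");
      std::abort();
    case Ty::Array:
    case Ty::Pointer:
      T = T->Element;  // strip one level and retry
      break;
    }
  }
}

int main() {
  Ty I32{Ty::Int, 32, nullptr};
  Ty Arr{Ty::Array, 0, &I32};
  std::printf("%s\n", getTypeName(&Arr));  // prints .u32
}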
- - Module::GlobalListType &global_list = M.getGlobalList(); - int i, n = global_list.size(); - GlobalVariable **gv_array = new GlobalVariable* [n]; - - // first, back-up GlobalVariable in gv_array - i = 0; - for (Module::global_iterator I = global_list.begin(), E = global_list.end(); - I != E; ++I) - gv_array[i++] = &*I; - - // second, empty global_list - while (!global_list.empty()) - global_list.remove(global_list.begin()); - - // call doFinalization - bool ret = AsmPrinter::doFinalization(M); - - // now we restore global variables - for (i = 0; i < n; i ++) - global_list.insert(global_list.end(), gv_array[i]); - - delete[] gv_array; - return ret; -} - -void PTXAsmPrinter::EmitStartOfAsmFile(Module &M) -{ - const PTXSubtarget& ST = TM.getSubtarget<PTXSubtarget>(); - - // Emit the PTX .version and .target attributes - OutStreamer.EmitRawText(Twine("\t.version ") + ST.getPTXVersionString()); - OutStreamer.EmitRawText(Twine("\t.target ") + ST.getTargetString() + - (ST.supportsDouble() ? "" - : ", map_f64_to_f32")); - // .address_size directive is optional, but it must immediately follow - // the .target directive if present within a module - if (ST.supportsPTX23()) { - const char *addrSize = ST.is64Bit() ? "64" : "32"; - OutStreamer.EmitRawText(Twine("\t.address_size ") + addrSize); - } - - OutStreamer.AddBlankLine(); - - // Define any .file directives - DebugInfoFinder DbgFinder; - DbgFinder.processModule(M); - - for (DebugInfoFinder::iterator I = DbgFinder.compile_unit_begin(), - E = DbgFinder.compile_unit_end(); I != E; ++I) { - DICompileUnit DIUnit(*I); - StringRef FN = DIUnit.getFilename(); - StringRef Dir = DIUnit.getDirectory(); - GetOrCreateSourceID(FN, Dir); - } - - OutStreamer.AddBlankLine(); - - // declare external functions - for (Module::const_iterator i = M.begin(), e = M.end(); - i != e; ++i) - EmitFunctionDeclaration(i); - - // declare global variables - for (Module::const_global_iterator i = M.global_begin(), e = M.global_end(); - i != e; ++i) - EmitVariableDeclaration(i); -} - -void PTXAsmPrinter::EmitFunctionBodyStart() { - OutStreamer.EmitRawText(Twine("{")); - - const PTXMachineFunctionInfo *MFI = MF->getInfo<PTXMachineFunctionInfo>(); - const PTXParamManager &PM = MFI->getParamManager(); - - // Print register definitions - SmallString<128> regDefs; - raw_svector_ostream os(regDefs); - unsigned numRegs; - - // pred - numRegs = MFI->countRegisters(PTXRegisterType::Pred, PTXRegisterSpace::Reg); - if(numRegs > 0) - os << "\t.reg .pred %p<" << numRegs << ">;\n"; - - // i16 - numRegs = MFI->countRegisters(PTXRegisterType::B16, PTXRegisterSpace::Reg); - if(numRegs > 0) - os << "\t.reg .b16 %rh<" << numRegs << ">;\n"; - - // i32 - numRegs = MFI->countRegisters(PTXRegisterType::B32, PTXRegisterSpace::Reg); - if(numRegs > 0) - os << "\t.reg .b32 %r<" << numRegs << ">;\n"; - - // i64 - numRegs = MFI->countRegisters(PTXRegisterType::B64, PTXRegisterSpace::Reg); - if(numRegs > 0) - os << "\t.reg .b64 %rd<" << numRegs << ">;\n"; - - // f32 - numRegs = MFI->countRegisters(PTXRegisterType::F32, PTXRegisterSpace::Reg); - if(numRegs > 0) - os << "\t.reg .f32 %f<" << numRegs << ">;\n"; - - // f64 - numRegs = MFI->countRegisters(PTXRegisterType::F64, PTXRegisterSpace::Reg); - if(numRegs > 0) - os << "\t.reg .f64 %fd<" << numRegs << ">;\n"; - - // Local params - for (PTXParamManager::param_iterator i = PM.local_begin(), e = PM.local_end(); - i != e; ++i) - os << "\t.param .b" << PM.getParamSize(*i) << ' ' << PM.getParamName(*i) - << ";\n"; - - OutStreamer.EmitRawText(os.str()); - - - 
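The block above emits one declaration per register bank actually used, in the PTX "%prefix<N>" form, and skips empty banks. A self-contained sketch of the same loop, with the Bank struct standing in for the MFI->countRegisters() queries and the output stream for OutStreamer:

#include <iostream>
#include <sstream>
#include <string>

struct Bank { const char *Type; const char *Prefix; unsigned Count; };

std::string emitRegisterDecls(const Bank *Banks, unsigned N) {
  std::ostringstream OS;
  for (unsigned i = 0; i != N; ++i)
    if (Banks[i].Count > 0)  // skip banks with no virtual registers
      OS << "\t.reg " << Banks[i].Type << " %" << Banks[i].Prefix
         << '<' << Banks[i].Count << ">;\n";
  return OS.str();
}

int main() {
  Bank Banks[] = {
    { ".pred", "p", 2 },  // predicate registers
    { ".b32",  "r", 5 },  // 32-bit integer registers
    { ".f32",  "f", 0 },  // unused bank: emits nothing
  };
  std::cout << emitRegisterDecls(Banks, 3);
}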
const MachineFrameInfo* FrameInfo = MF->getFrameInfo(); - DEBUG(dbgs() << "Have " << FrameInfo->getNumObjects() - << " frame object(s)\n"); - for (unsigned i = 0, e = FrameInfo->getNumObjects(); i != e; ++i) { - DEBUG(dbgs() << "Size of object: " << FrameInfo->getObjectSize(i) << "\n"); - if (FrameInfo->getObjectSize(i) > 0) { - OutStreamer.EmitRawText("\t.local .align " + - Twine(FrameInfo->getObjectAlignment(i)) + - " .b8 __local" + - Twine(i) + - "[" + - Twine(FrameInfo->getObjectSize(i)) + - "];"); - } - } - - //unsigned Index = 1; - // Print parameter passing params - //for (PTXMachineFunctionInfo::param_iterator - // i = MFI->paramBegin(), e = MFI->paramEnd(); i != e; ++i) { - // std::string def = "\t.param .b"; - // def += utostr(*i); - // def += " __ret_"; - // def += utostr(Index); - // Index++; - // def += ";"; - // OutStreamer.EmitRawText(Twine(def)); - //} -} - -void PTXAsmPrinter::EmitFunctionBodyEnd() { - OutStreamer.EmitRawText(Twine("}")); -} - -void PTXAsmPrinter::EmitInstruction(const MachineInstr *MI) { - MCInst TmpInst; - LowerPTXMachineInstrToMCInst(MI, TmpInst, *this); - OutStreamer.EmitInstruction(TmpInst); -} - -void PTXAsmPrinter::EmitVariableDeclaration(const GlobalVariable *gv) { - // Check to see if this is a special global used by LLVM, if so, emit it. - if (EmitSpecialLLVMGlobal(gv)) - return; - - MCSymbol *gvsym = Mang->getSymbol(gv); - - assert(gvsym->isUndefined() && "Cannot define a symbol twice!"); - - SmallString<128> decl; - raw_svector_ostream os(decl); - - // check if it is defined in some other translation unit - if (gv->isDeclaration()) - os << ".extern "; - - // state space: e.g., .global - os << '.' << getStateSpaceName(gv->getType()->getAddressSpace()) << ' '; - - // alignment (optional) - unsigned alignment = gv->getAlignment(); - if (alignment != 0) - os << ".align " << gv->getAlignment() << ' '; - - - if (PointerType::classof(gv->getType())) { - PointerType* pointerTy = dyn_cast<PointerType>(gv->getType()); - Type* elementTy = pointerTy->getElementType(); - - if (elementTy->isArrayTy()) { - assert(elementTy->isArrayTy() && "Only pointers to arrays are supported"); - - ArrayType* arrayTy = dyn_cast<ArrayType>(elementTy); - elementTy = arrayTy->getElementType(); - - unsigned numElements = arrayTy->getNumElements(); - - while (elementTy->isArrayTy()) { - arrayTy = dyn_cast<ArrayType>(elementTy); - elementTy = arrayTy->getElementType(); - - numElements *= arrayTy->getNumElements(); - } - - // FIXME: isPrimitiveType() == false for i16? - assert(elementTy->isSingleValueType() && - "Non-primitive types are not handled"); - - // Find the size of the element in bits - unsigned elementSize = elementTy->getPrimitiveSizeInBits(); - - os << ".b" << elementSize << ' ' << gvsym->getName() - << '[' << numElements << ']'; - } else { - os << ".b8" << gvsym->getName() << "[]"; - } - - // handle string constants (assume ConstantArray means string) - if (gv->hasInitializer()) { - const Constant *C = gv->getInitializer(); - if (const ConstantArray *CA = dyn_cast<ConstantArray>(C)) { - os << " = {"; - - for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i) { - if (i > 0) - os << ','; - - os << "0x"; - os.write_hex(cast<ConstantInt>(CA->getOperand(i))->getZExtValue()); - } - - os << '}'; - } - } - } else { - // Note: this is currently the fall-through case and most likely generates - // incorrect code. 
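EmitVariableDeclaration() above flattens nested array types by multiplying out every dimension, so a [4 x [8 x i32]] global is printed as a single .b32 array of 32 cells. A standalone sketch of the flattening, where ArrTy is an invented stand-in for llvm::ArrayType:

#include <cstdio>

struct ArrTy { unsigned NumElements; const ArrTy *Inner; unsigned ElemBits; };

void flatten(const ArrTy *T, unsigned &NumElements, unsigned &ElemBits) {
  NumElements = T->NumElements;
  while (T->Inner) {  // multiply out every nested dimension
    T = T->Inner;
    NumElements *= T->NumElements;
  }
  ElemBits = T->ElemBits;
}

int main() {
  ArrTy Inner{8, nullptr, 32};   // innermost: [8 x i32]
  ArrTy Outer{4, &Inner, 0};     // outer: [4 x [8 x i32]]
  unsigned N, Bits;
  flatten(&Outer, N, Bits);
  std::printf(".b%u arr[%u];\n", Bits, N);  // prints .b32 arr[32];
}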
- os << getTypeName(gv->getType()) << ' ' << gvsym->getName(); - - if (isa<ArrayType>(gv->getType()) || isa<PointerType>(gv->getType())) - os << "[]"; - } - - os << ';'; - - OutStreamer.EmitRawText(os.str()); - OutStreamer.AddBlankLine(); -} - -void PTXAsmPrinter::EmitFunctionEntryLabel() { - // The function label could have already been emitted if two symbols end up - // conflicting due to asm renaming. Detect this and emit an error. - if (!CurrentFnSym->isUndefined()) - report_fatal_error("'" + Twine(CurrentFnSym->getName()) + - "' label emitted multiple times to assembly file"); - - const PTXMachineFunctionInfo *MFI = MF->getInfo<PTXMachineFunctionInfo>(); - const PTXParamManager &PM = MFI->getParamManager(); - const bool isKernel = MFI->isKernel(); - const PTXSubtarget& ST = TM.getSubtarget<PTXSubtarget>(); - - SmallString<128> decl; - raw_svector_ostream os(decl); - os << (isKernel ? ".entry" : ".func"); - - if (!isKernel) { - os << " ("; - if (ST.useParamSpaceForDeviceArgs()) { - for (PTXParamManager::param_iterator i = PM.ret_begin(), e = PM.ret_end(), - b = i; i != e; ++i) { - if (i != b) - os << ", "; - - os << ".param .b" << PM.getParamSize(*i) << ' ' << PM.getParamName(*i); - } - } else { - for (PTXMachineFunctionInfo::reg_iterator - i = MFI->retreg_begin(), e = MFI->retreg_end(), b = i; - i != e; ++i) { - if (i != b) - os << ", "; - - os << ".reg " << getRegisterTypeName(MFI->getRegisterType(*i)) << ' ' - << MFI->getRegisterName(*i); - } - } - os << ')'; - } - - // Print function name - os << ' ' << CurrentFnSym->getName() << " ("; - - const Function *F = MF->getFunction(); - - // Print parameters - if (isKernel || ST.useParamSpaceForDeviceArgs()) { - /*for (PTXParamManager::param_iterator i = PM.arg_begin(), e = PM.arg_end(), - b = i; i != e; ++i) { - if (i != b) - os << ", "; - - os << ".param .b" << PM.getParamSize(*i) << ' ' << PM.getParamName(*i); - }*/ - int Counter = 1; - for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end(), - b = i; i != e; ++i) { - if (i != b) - os << ", "; - const Type *ArgType = (*i).getType(); - os << ".param .b"; - if (ArgType->isPointerTy()) { - if (ST.is64Bit()) - os << "64"; - else - os << "32"; - } else { - os << ArgType->getPrimitiveSizeInBits(); - } - if (ArgType->isPointerTy() && ST.emitPtrAttribute()) { - const PointerType *PtrType = dyn_cast<const PointerType>(ArgType); - os << " .ptr"; - switch (PtrType->getAddressSpace()) { - default: - llvm_unreachable("Unknown address space in argument"); - case PTXStateSpace::Global: - os << " .global"; - break; - case PTXStateSpace::Shared: - os << " .shared"; - break; - } - } - os << " __param_" << Counter++; - } - } else { - for (PTXMachineFunctionInfo::reg_iterator - i = MFI->argreg_begin(), e = MFI->argreg_end(), b = i; - i != e; ++i) { - if (i != b) - os << ", "; - - os << ".reg " << getRegisterTypeName(MFI->getRegisterType(*i)) << ' ' - << MFI->getRegisterName(*i); - } - } - os << ')'; - - OutStreamer.EmitRawText(os.str()); -} - -void PTXAsmPrinter::EmitFunctionDeclaration(const Function* func) -{ - const PTXSubtarget& ST = TM.getSubtarget<PTXSubtarget>(); - - std::string decl = ""; - - // hard-coded emission of extern vprintf function - - if (func->getName() == "printf" || func->getName() == "puts") { - decl += ".extern .func (.param .b32 __param_1) vprintf (.param .b"; - if (ST.is64Bit()) - decl += "64"; - else - decl += "32"; - decl += " __param_2, .param .b"; - if (ST.is64Bit()) - decl += "64"; - else - decl += "32"; - decl += " __param_3)\n"; - } - - 
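The entry-label code above sizes each .param by the argument type: pointer arguments take the subtarget's address size (32 or 64 bits), everything else its primitive bit width. A sketch of that selection, assuming an invented ArgInfo in place of the Function argument iterator and a bool in place of PTXSubtarget::is64Bit():

#include <iostream>
#include <sstream>
#include <string>

struct ArgInfo { bool IsPointer; unsigned PrimitiveBits; };

std::string buildParamList(const ArgInfo *Args, unsigned N, bool Is64Bit) {
  std::ostringstream OS;
  for (unsigned i = 0; i != N; ++i) {
    if (i) OS << ", ";
    unsigned Bits = Args[i].IsPointer ? (Is64Bit ? 64u : 32u)
                                      : Args[i].PrimitiveBits;
    OS << ".param .b" << Bits << " __param_" << (i + 1);
  }
  return OS.str();
}

int main() {
  ArgInfo Args[] = { { true, 0 }, { false, 32 } };
  std::cout << buildParamList(Args, 2, /*Is64Bit=*/true) << "\n";
  // prints: .param .b64 __param_1, .param .b32 __param_2
}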
OutStreamer.EmitRawText(Twine(decl)); -} - -unsigned PTXAsmPrinter::GetOrCreateSourceID(StringRef FileName, - StringRef DirName) { - // If FE did not provide a file name, then assume stdin. - if (FileName.empty()) - return GetOrCreateSourceID("<stdin>", StringRef()); - - // MCStream expects full path name as filename. - if (!DirName.empty() && !sys::path::is_absolute(FileName)) { - SmallString<128> FullPathName = DirName; - sys::path::append(FullPathName, FileName); - // Here FullPathName will be copied into StringMap by GetOrCreateSourceID. - return GetOrCreateSourceID(StringRef(FullPathName), StringRef()); - } - - StringMapEntry<unsigned> &Entry = SourceIdMap.GetOrCreateValue(FileName); - if (Entry.getValue()) - return Entry.getValue(); - - unsigned SrcId = SourceIdMap.size(); - Entry.setValue(SrcId); - - // Print out a .file directive to specify files for .loc directives. - OutStreamer.EmitDwarfFileDirective(SrcId, "", Entry.getKey()); - - return SrcId; -} - -MCOperand PTXAsmPrinter::GetSymbolRef(const MachineOperand &MO, - const MCSymbol *Symbol) { - const MCExpr *Expr; - Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None, OutContext); - return MCOperand::CreateExpr(Expr); -} - -MCOperand PTXAsmPrinter::lowerOperand(const MachineOperand &MO) { - MCOperand MCOp; - const PTXMachineFunctionInfo *MFI = MF->getInfo<PTXMachineFunctionInfo>(); - unsigned EncodedReg; - switch (MO.getType()) { - default: - llvm_unreachable("Unknown operand type"); - case MachineOperand::MO_Register: - if (MO.getReg() > 0) { - // Encode the register - EncodedReg = MFI->getEncodedRegister(MO.getReg()); - } else { - EncodedReg = 0; - } - MCOp = MCOperand::CreateReg(EncodedReg); - break; - case MachineOperand::MO_Immediate: - MCOp = MCOperand::CreateImm(MO.getImm()); - break; - case MachineOperand::MO_MachineBasicBlock: - MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create( - MO.getMBB()->getSymbol(), OutContext)); - break; - case MachineOperand::MO_GlobalAddress: - MCOp = GetSymbolRef(MO, Mang->getSymbol(MO.getGlobal())); - break; - case MachineOperand::MO_ExternalSymbol: - MCOp = GetSymbolRef(MO, GetExternalSymbolSymbol(MO.getSymbolName())); - break; - case MachineOperand::MO_FPImmediate: - APFloat Val = MO.getFPImm()->getValueAPF(); - bool ignored; - Val.convert(APFloat::IEEEdouble, APFloat::rmTowardZero, &ignored); - MCOp = MCOperand::CreateFPImm(Val.convertToDouble()); - break; - } - - return MCOp; -} - -// Force static initialization. -extern "C" void LLVMInitializePTXAsmPrinter() { - RegisterAsmPrinter<PTXAsmPrinter> X(ThePTX32Target); - RegisterAsmPrinter<PTXAsmPrinter> Y(ThePTX64Target); -} diff --git a/lib/Target/PTX/PTXAsmPrinter.h b/lib/Target/PTX/PTXAsmPrinter.h deleted file mode 100644 index 74c8d58..0000000 --- a/lib/Target/PTX/PTXAsmPrinter.h +++ /dev/null @@ -1,57 +0,0 @@ -//===-- PTXAsmPrinter.h - Print machine code to a PTX file ------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// PTX Assembly printer class. 
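GetOrCreateSourceID() above interns file names into dense 1-based IDs so each .file directive is emitted exactly once per file. The same scheme in a standalone sketch, with std::map standing in for llvm::StringMap:

#include <cstdio>
#include <map>
#include <string>

unsigned getOrCreateSourceID(std::map<std::string, unsigned> &Ids,
                             const std::string &FileName) {
  unsigned &Id = Ids[FileName];  // inserts 0 on first sight
  if (Id == 0)
    Id = static_cast<unsigned>(Ids.size());  // first unseen file: new ID
  return Id;
}

int main() {
  std::map<std::string, unsigned> Ids;
  std::printf("%u\n", getOrCreateSourceID(Ids, "a.cu"));  // 1
  std::printf("%u\n", getOrCreateSourceID(Ids, "b.cu"));  // 2
  std::printf("%u\n", getOrCreateSourceID(Ids, "a.cu"));  // 1 again
}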
-// -//===----------------------------------------------------------------------===// - -#ifndef PTXASMPRINTER_H -#define PTXASMPRINTER_H - -#include "PTX.h" -#include "PTXTargetMachine.h" -#include "llvm/ADT/StringMap.h" -#include "llvm/CodeGen/AsmPrinter.h" -#include "llvm/Support/Compiler.h" - -namespace llvm { - -class MCOperand; - -class LLVM_LIBRARY_VISIBILITY PTXAsmPrinter : public AsmPrinter { -public: - explicit PTXAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) - : AsmPrinter(TM, Streamer) {} - - const char *getPassName() const { return "PTX Assembly Printer"; } - - bool doFinalization(Module &M); - - virtual void EmitStartOfAsmFile(Module &M); - virtual void EmitFunctionBodyStart(); - virtual void EmitFunctionBodyEnd(); - virtual void EmitFunctionEntryLabel(); - virtual void EmitInstruction(const MachineInstr *MI); - - unsigned GetOrCreateSourceID(StringRef FileName, - StringRef DirName); - - MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol); - MCOperand lowerOperand(const MachineOperand &MO); - -private: - void EmitVariableDeclaration(const GlobalVariable *gv); - void EmitFunctionDeclaration(const Function* func); - - StringMap<unsigned> SourceIdMap; -}; // class PTXAsmPrinter -} // namespace llvm - -#endif - diff --git a/lib/Target/PTX/PTXFPRoundingModePass.cpp b/lib/Target/PTX/PTXFPRoundingModePass.cpp deleted file mode 100644 index a21d172..0000000 --- a/lib/Target/PTX/PTXFPRoundingModePass.cpp +++ /dev/null @@ -1,181 +0,0 @@ -//===-- PTXFPRoundingModePass.cpp - Assign rounding modes pass ------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines a machine function pass that sets appropriate FP rounding -// modes for all relevant instructions. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "ptx-fp-rounding-mode" - -#include "PTX.h" -#include "PTXTargetMachine.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -// NOTE: PTXFPRoundingModePass should be executed just before emission. - -namespace { - /// PTXFPRoundingModePass - Pass to assign appropriate FP rounding modes to - /// all FP instructions. Essentially, this pass just looks for all FP - /// instructions that have a rounding mode set to RndDefault, and sets an - /// appropriate rounding mode based on the target device. 
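As the comment above describes, the pass is table-driven: initializeMap() records, per FP opcode, which operand holds the rounding mode and which concrete mode should replace the RndDefault sentinel. A compact standalone sketch of that mechanism; opcode values and the Instr type are invented stand-ins for MachineInstr:

#include <cstdio>
#include <unordered_map>
#include <utility>
#include <vector>

enum Mode { RndDefault = 0, RndNearestEven = 1, RndNone = 2 };
struct Instr { unsigned Opcode; std::vector<int> Operands; };

using RndModeDesc = std::pair<unsigned, Mode>;  // (operand index, mode)

void processInstruction(Instr &MI,
                        const std::unordered_map<unsigned, RndModeDesc> &Map) {
  auto It = Map.find(MI.Opcode);
  if (It == Map.end())
    return;                // instruction needs no rounding mode
  int &Op = MI.Operands[It->second.first];
  if (Op == RndDefault)    // only patch the sentinel, keep explicit modes
    Op = It->second.second;
}

int main() {
  std::unordered_map<unsigned, RndModeDesc> Map;
  enum { FADD = 100 };
  Map[FADD] = { 1, RndNearestEven };  // operand 1 holds the mode
  Instr MI{ FADD, { /*dst*/ 0, RndDefault } };
  processInstruction(MI, Map);
  std::printf("mode operand = %d\n", MI.Operands[1]);  // 1 (RndNearestEven)
}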
- /// - class PTXFPRoundingModePass : public MachineFunctionPass { - private: - static char ID; - - typedef std::pair<unsigned, unsigned> RndModeDesc; - - PTXTargetMachine& TargetMachine; - DenseMap<unsigned, RndModeDesc> Instrs; - - public: - PTXFPRoundingModePass(PTXTargetMachine &TM, CodeGenOpt::Level OptLevel) - : MachineFunctionPass(ID), - TargetMachine(TM) { - initializeMap(); - } - - virtual bool runOnMachineFunction(MachineFunction &MF); - - virtual const char *getPassName() const { - return "PTX FP Rounding Mode Pass"; - } - - private: - - void initializeMap(); - void processInstruction(MachineInstr &MI); - }; // class PTXFPRoundingModePass -} // end anonymous namespace - -using namespace llvm; - -char PTXFPRoundingModePass::ID = 0; - -bool PTXFPRoundingModePass::runOnMachineFunction(MachineFunction &MF) { - // Look at each basic block - for (MachineFunction::iterator bbi = MF.begin(), bbe = MF.end(); bbi != bbe; - ++bbi) { - MachineBasicBlock &MBB = *bbi; - // Look at each instruction - for (MachineBasicBlock::iterator ii = MBB.begin(), ie = MBB.end(); - ii != ie; ++ii) { - MachineInstr &MI = *ii; - processInstruction(MI); - } - } - return false; -} - -void PTXFPRoundingModePass::initializeMap() { - using namespace PTXRoundingMode; - const PTXSubtarget& ST = TargetMachine.getSubtarget<PTXSubtarget>(); - - // Build a map of default rounding mode for all instructions that need a - // rounding mode. - Instrs[PTX::FADDrr32] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::FADDri32] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::FADDrr64] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::FADDri64] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::FSUBrr32] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::FSUBri32] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::FSUBrr64] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::FSUBri64] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::FMULrr32] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::FMULri32] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::FMULrr64] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::FMULri64] = std::make_pair(1U, (unsigned)RndNearestEven); - - Instrs[PTX::FNEGrr32] = std::make_pair(1U, (unsigned)RndNone); - Instrs[PTX::FNEGri32] = std::make_pair(1U, (unsigned)RndNone); - Instrs[PTX::FNEGrr64] = std::make_pair(1U, (unsigned)RndNone); - Instrs[PTX::FNEGri64] = std::make_pair(1U, (unsigned)RndNone); - - unsigned FDivRndMode = ST.fdivNeedsRoundingMode() ? RndNearestEven : RndNone; - Instrs[PTX::FDIVrr32] = std::make_pair(1U, FDivRndMode); - Instrs[PTX::FDIVri32] = std::make_pair(1U, FDivRndMode); - Instrs[PTX::FDIVrr64] = std::make_pair(1U, FDivRndMode); - Instrs[PTX::FDIVri64] = std::make_pair(1U, FDivRndMode); - - unsigned FMADRndMode = ST.fmadNeedsRoundingMode() ? 
RndNearestEven : RndNone; - Instrs[PTX::FMADrrr32] = std::make_pair(1U, FMADRndMode); - Instrs[PTX::FMADrri32] = std::make_pair(1U, FMADRndMode); - Instrs[PTX::FMADrii32] = std::make_pair(1U, FMADRndMode); - Instrs[PTX::FMADrrr64] = std::make_pair(1U, FMADRndMode); - Instrs[PTX::FMADrri64] = std::make_pair(1U, FMADRndMode); - Instrs[PTX::FMADrii64] = std::make_pair(1U, FMADRndMode); - - Instrs[PTX::FSQRTrr32] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::FSQRTri32] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::FSQRTrr64] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::FSQRTri64] = std::make_pair(1U, (unsigned)RndNearestEven); - - Instrs[PTX::FSINrr32] = std::make_pair(1U, (unsigned)RndApprox); - Instrs[PTX::FSINri32] = std::make_pair(1U, (unsigned)RndApprox); - Instrs[PTX::FSINrr64] = std::make_pair(1U, (unsigned)RndApprox); - Instrs[PTX::FSINri64] = std::make_pair(1U, (unsigned)RndApprox); - Instrs[PTX::FCOSrr32] = std::make_pair(1U, (unsigned)RndApprox); - Instrs[PTX::FCOSri32] = std::make_pair(1U, (unsigned)RndApprox); - Instrs[PTX::FCOSrr64] = std::make_pair(1U, (unsigned)RndApprox); - Instrs[PTX::FCOSri64] = std::make_pair(1U, (unsigned)RndApprox); - - Instrs[PTX::CVTu16f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt); - Instrs[PTX::CVTs16f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt); - Instrs[PTX::CVTu16f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt); - Instrs[PTX::CVTs16f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt); - Instrs[PTX::CVTu32f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt); - Instrs[PTX::CVTs32f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt); - Instrs[PTX::CVTu32f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt); - Instrs[PTX::CVTs32f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt); - Instrs[PTX::CVTu64f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt); - Instrs[PTX::CVTs64f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt); - Instrs[PTX::CVTu64f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt); - Instrs[PTX::CVTs64f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt); - - Instrs[PTX::CVTf32u16] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::CVTf32s16] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::CVTf32u32] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::CVTf32s32] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::CVTf32u64] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::CVTf32s64] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::CVTf32f64] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::CVTf64u16] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::CVTf64s16] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::CVTf64u32] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::CVTf64s32] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::CVTf64u64] = std::make_pair(1U, (unsigned)RndNearestEven); - Instrs[PTX::CVTf64s64] = std::make_pair(1U, (unsigned)RndNearestEven); -} - -void PTXFPRoundingModePass::processInstruction(MachineInstr &MI) { - // Is this an instruction that needs a rounding mode? 
- if (Instrs.count(MI.getOpcode())) { - const RndModeDesc &Desc = Instrs[MI.getOpcode()]; - // Get the rounding mode operand - MachineOperand &Op = MI.getOperand(Desc.first); - // Update the rounding mode if needed - if (Op.getImm() == PTXRoundingMode::RndDefault) { - Op.setImm(Desc.second); - } - } -} - -FunctionPass *llvm::createPTXFPRoundingModePass(PTXTargetMachine &TM, - CodeGenOpt::Level OptLevel) { - return new PTXFPRoundingModePass(TM, OptLevel); -} - diff --git a/lib/Target/PTX/PTXFrameLowering.cpp b/lib/Target/PTX/PTXFrameLowering.cpp deleted file mode 100644 index e6e268e..0000000 --- a/lib/Target/PTX/PTXFrameLowering.cpp +++ /dev/null @@ -1,24 +0,0 @@ -//===-- PTXFrameLowering.cpp - PTX Frame Information ----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the PTX implementation of TargetFrameLowering class. -// -//===----------------------------------------------------------------------===// - -#include "PTXFrameLowering.h" -#include "llvm/CodeGen/MachineFunction.h" - -using namespace llvm; - -void PTXFrameLowering::emitPrologue(MachineFunction &MF) const { -} - -void PTXFrameLowering::emitEpilogue(MachineFunction &MF, - MachineBasicBlock &MBB) const { -} diff --git a/lib/Target/PTX/PTXFrameLowering.h b/lib/Target/PTX/PTXFrameLowering.h deleted file mode 100644 index 831e818..0000000 --- a/lib/Target/PTX/PTXFrameLowering.h +++ /dev/null @@ -1,44 +0,0 @@ -//===-- PTXFrameLowering.h - Define frame lowering for PTX -----*- C++ -*--===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// -// -//===----------------------------------------------------------------------===// - -#ifndef PTX_FRAMEINFO_H -#define PTX_FRAMEINFO_H - -#include "PTX.h" -#include "PTXSubtarget.h" -#include "llvm/Target/TargetFrameLowering.h" - -namespace llvm { - class PTXSubtarget; - -class PTXFrameLowering : public TargetFrameLowering { -protected: - const PTXSubtarget &STI; - -public: - explicit PTXFrameLowering(const PTXSubtarget &sti) - : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 2, -2), - STI(sti) { - } - - /// emitProlog/emitEpilog - These methods insert prolog and epilog code into - /// the function. - void emitPrologue(MachineFunction &MF) const; - void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; - - bool hasFP(const MachineFunction &MF) const { return false; } -}; - -} // End llvm namespace - -#endif diff --git a/lib/Target/PTX/PTXISelDAGToDAG.cpp b/lib/Target/PTX/PTXISelDAGToDAG.cpp deleted file mode 100644 index 5c7ee29..0000000 --- a/lib/Target/PTX/PTXISelDAGToDAG.cpp +++ /dev/null @@ -1,356 +0,0 @@ -//===-- PTXISelDAGToDAG.cpp - A dag to dag inst selector for PTX ----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines an instruction selector for the PTX target. 
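The frame-lowering hooks deleted above are deliberately empty: PTX has no conventional register stack, so there is no prologue or epilogue code to insert, and frame objects instead surface as the .local arrays printed earlier by the asm printer. A minimal sketch of that shape, where FrameLoweringBase is an invented stand-in for TargetFrameLowering:

#include <cstdio>

struct FrameLoweringBase {
  virtual ~FrameLoweringBase() = default;
  virtual void emitPrologue() const = 0;
  virtual void emitEpilogue() const = 0;
  virtual bool hasFP() const = 0;
};

struct PTXLikeFrameLowering : FrameLoweringBase {
  void emitPrologue() const override {}          // nothing to set up
  void emitEpilogue() const override {}          // nothing to tear down
  bool hasFP() const override { return false; }  // no frame pointer exists
};

int main() {
  PTXLikeFrameLowering FL;
  FL.emitPrologue();
  std::printf("hasFP: %d\n", FL.hasFP());  // 0
}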
-// -//===----------------------------------------------------------------------===// - -#include "PTX.h" -#include "PTXMachineFunctionInfo.h" -#include "PTXTargetMachine.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/CodeGen/SelectionDAGISel.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -namespace { -// PTXDAGToDAGISel - PTX specific code to select PTX machine -// instructions for SelectionDAG operations. -class PTXDAGToDAGISel : public SelectionDAGISel { - public: - PTXDAGToDAGISel(PTXTargetMachine &TM, CodeGenOpt::Level OptLevel); - - virtual const char *getPassName() const { - return "PTX DAG->DAG Pattern Instruction Selection"; - } - - SDNode *Select(SDNode *Node); - - // Complex Pattern Selectors. - bool SelectADDRrr(SDValue &Addr, SDValue &R1, SDValue &R2); - bool SelectADDRri(SDValue &Addr, SDValue &Base, SDValue &Offset); - bool SelectADDRii(SDValue &Addr, SDValue &Base, SDValue &Offset); - bool SelectADDRlocal(SDValue &Addr, SDValue &Base, SDValue &Offset); - - // Include the pieces auto'gened from the target description -#include "PTXGenDAGISel.inc" - - private: - // We need this only because we can't match intruction BRAdp - // pattern (PTXbrcond bb:$d, ...) in PTXInstrInfo.td - SDNode *SelectBRCOND(SDNode *Node); - - SDNode *SelectREADPARAM(SDNode *Node); - SDNode *SelectWRITEPARAM(SDNode *Node); - SDNode *SelectFrameIndex(SDNode *Node); - - bool isImm(const SDValue &operand); - bool SelectImm(const SDValue &operand, SDValue &imm); - - const PTXSubtarget& getSubtarget() const; -}; // class PTXDAGToDAGISel -} // namespace - -// createPTXISelDag - This pass converts a legalized DAG into a -// PTX-specific DAG, ready for instruction scheduling -FunctionPass *llvm::createPTXISelDag(PTXTargetMachine &TM, - CodeGenOpt::Level OptLevel) { - return new PTXDAGToDAGISel(TM, OptLevel); -} - -PTXDAGToDAGISel::PTXDAGToDAGISel(PTXTargetMachine &TM, - CodeGenOpt::Level OptLevel) - : SelectionDAGISel(TM, OptLevel) {} - -SDNode *PTXDAGToDAGISel::Select(SDNode *Node) { - switch (Node->getOpcode()) { - case ISD::BRCOND: - return SelectBRCOND(Node); - case PTXISD::READ_PARAM: - return SelectREADPARAM(Node); - case PTXISD::WRITE_PARAM: - return SelectWRITEPARAM(Node); - case ISD::FrameIndex: - return SelectFrameIndex(Node); - default: - return SelectCode(Node); - } -} - -SDNode *PTXDAGToDAGISel::SelectBRCOND(SDNode *Node) { - assert(Node->getNumOperands() >= 3); - - SDValue Chain = Node->getOperand(0); - SDValue Pred = Node->getOperand(1); - SDValue Target = Node->getOperand(2); // branch target - SDValue PredOp = CurDAG->getTargetConstant(PTXPredicate::Normal, MVT::i32); - DebugLoc dl = Node->getDebugLoc(); - - assert(Target.getOpcode() == ISD::BasicBlock); - assert(Pred.getValueType() == MVT::i1); - - // Emit BRAdp - SDValue Ops[] = { Target, Pred, PredOp, Chain }; - return CurDAG->getMachineNode(PTX::BRAdp, dl, MVT::Other, Ops, 4); -} - -SDNode *PTXDAGToDAGISel::SelectREADPARAM(SDNode *Node) { - SDValue Chain = Node->getOperand(0); - SDValue Index = Node->getOperand(1); - - int OpCode; - - // Get the type of parameter we are reading - EVT VT = Node->getValueType(0); - assert(VT.isSimple() && "READ_PARAM only implemented for MVT types"); - - MVT Type = VT.getSimpleVT(); - - if (Type == MVT::i1) - OpCode = PTX::READPARAMPRED; - else if (Type == MVT::i16) - OpCode = PTX::READPARAMI16; - else if (Type == MVT::i32) - OpCode = PTX::READPARAMI32; - else if (Type == MVT::i64) - OpCode = PTX::READPARAMI64; - 
else if (Type == MVT::f32) - OpCode = PTX::READPARAMF32; - else { - assert(Type == MVT::f64 && "Unexpected type!"); - OpCode = PTX::READPARAMF64; - } - - SDValue Pred = CurDAG->getRegister(PTX::NoRegister, MVT::i1); - SDValue PredOp = CurDAG->getTargetConstant(PTXPredicate::None, MVT::i32); - DebugLoc dl = Node->getDebugLoc(); - - SDValue Ops[] = { Index, Pred, PredOp, Chain }; - return CurDAG->getMachineNode(OpCode, dl, VT, Ops, 4); -} - -SDNode *PTXDAGToDAGISel::SelectWRITEPARAM(SDNode *Node) { - - SDValue Chain = Node->getOperand(0); - SDValue Value = Node->getOperand(1); - - int OpCode; - - //Node->dumpr(CurDAG); - - // Get the type of parameter we are writing - EVT VT = Value->getValueType(0); - assert(VT.isSimple() && "WRITE_PARAM only implemented for MVT types"); - - MVT Type = VT.getSimpleVT(); - - if (Type == MVT::i1) - OpCode = PTX::WRITEPARAMPRED; - else if (Type == MVT::i16) - OpCode = PTX::WRITEPARAMI16; - else if (Type == MVT::i32) - OpCode = PTX::WRITEPARAMI32; - else if (Type == MVT::i64) - OpCode = PTX::WRITEPARAMI64; - else if (Type == MVT::f32) - OpCode = PTX::WRITEPARAMF32; - else if (Type == MVT::f64) - OpCode = PTX::WRITEPARAMF64; - else - llvm_unreachable("Invalid type in SelectWRITEPARAM"); - - SDValue Pred = CurDAG->getRegister(PTX::NoRegister, MVT::i1); - SDValue PredOp = CurDAG->getTargetConstant(PTXPredicate::None, MVT::i32); - DebugLoc dl = Node->getDebugLoc(); - - SDValue Ops[] = { Value, Pred, PredOp, Chain }; - SDNode* Ret = CurDAG->getMachineNode(OpCode, dl, MVT::Other, Ops, 4); - - //dbgs() << "SelectWRITEPARAM produced:\n\t"; - //Ret->dumpr(CurDAG); - - return Ret; -} - -SDNode *PTXDAGToDAGISel::SelectFrameIndex(SDNode *Node) { - int FI = cast<FrameIndexSDNode>(Node)->getIndex(); - //dbgs() << "Selecting FrameIndex at index " << FI << "\n"; - //SDValue TFI = CurDAG->getTargetFrameIndex(FI, Node->getValueType(0)); - - PTXMachineFunctionInfo *MFI = MF->getInfo<PTXMachineFunctionInfo>(); - - SDValue FrameSymbol = CurDAG->getTargetExternalSymbol(MFI->getFrameSymbol(FI), - Node->getValueType(0)); - - return FrameSymbol.getNode(); -} - -// Match memory operand of the form [reg+reg] -bool PTXDAGToDAGISel::SelectADDRrr(SDValue &Addr, SDValue &R1, SDValue &R2) { - if (Addr.getOpcode() != ISD::ADD || Addr.getNumOperands() < 2 || - isImm(Addr.getOperand(0)) || isImm(Addr.getOperand(1))) - return false; - - assert(Addr.getValueType().isSimple() && "Type must be simple"); - - R1 = Addr; - R2 = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT()); - - return true; -} - -// Match memory operand of the form [reg], [imm+reg], and [reg+imm] -bool PTXDAGToDAGISel::SelectADDRri(SDValue &Addr, SDValue &Base, - SDValue &Offset) { - // FrameIndex addresses are handled separately - //errs() << "SelectADDRri: "; - //Addr.getNode()->dumpr(); - if (isa<FrameIndexSDNode>(Addr)) { - //errs() << "Failure\n"; - return false; - } - - if (CurDAG->isBaseWithConstantOffset(Addr)) { - Base = Addr.getOperand(0); - if (isa<FrameIndexSDNode>(Base)) { - //errs() << "Failure\n"; - return false; - } - ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)); - Offset = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32); - //errs() << "Success\n"; - return true; - } - - /*if (Addr.getNumOperands() == 1) { - Base = Addr; - Offset = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT()); - errs() << "Success\n"; - return true; - }*/ - - //errs() << "SelectADDRri fails on: "; - //Addr.getNode()->dumpr(); - - if (isImm(Addr)) { - //errs() << "Failure\n"; - return 
false; - } - - Base = Addr; - Offset = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT()); - - //errs() << "Success\n"; - return true; - - /*if (Addr.getOpcode() != ISD::ADD) { - // let SelectADDRii handle the [imm] case - if (isImm(Addr)) - return false; - // it is [reg] - - assert(Addr.getValueType().isSimple() && "Type must be simple"); - Base = Addr; - Offset = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT()); - - return true; - } - - if (Addr.getNumOperands() < 2) - return false; - - // let SelectADDRii handle the [imm+imm] case - if (isImm(Addr.getOperand(0)) && isImm(Addr.getOperand(1))) - return false; - - // try [reg+imm] and [imm+reg] - for (int i = 0; i < 2; i ++) - if (SelectImm(Addr.getOperand(1-i), Offset)) { - Base = Addr.getOperand(i); - return true; - } - - // neither [reg+imm] nor [imm+reg] - return false;*/ -} - -// Match memory operand of the form [imm+imm] and [imm] -bool PTXDAGToDAGISel::SelectADDRii(SDValue &Addr, SDValue &Base, - SDValue &Offset) { - // is [imm+imm]? - if (Addr.getOpcode() == ISD::ADD) { - return SelectImm(Addr.getOperand(0), Base) && - SelectImm(Addr.getOperand(1), Offset); - } - - // is [imm]? - if (SelectImm(Addr, Base)) { - assert(Addr.getValueType().isSimple() && "Type must be simple"); - - Offset = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT()); - - return true; - } - - return false; -} - -// Match memory operand of the form [reg], [imm+reg], and [reg+imm] -bool PTXDAGToDAGISel::SelectADDRlocal(SDValue &Addr, SDValue &Base, - SDValue &Offset) { - //errs() << "SelectADDRlocal: "; - //Addr.getNode()->dumpr(); - if (isa<FrameIndexSDNode>(Addr)) { - Base = Addr; - Offset = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT()); - //errs() << "Success\n"; - return true; - } - - if (CurDAG->isBaseWithConstantOffset(Addr)) { - Base = Addr.getOperand(0); - if (!isa<FrameIndexSDNode>(Base)) { - //errs() << "Failure\n"; - return false; - } - ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)); - Offset = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32); - //errs() << "Offset: "; - //Offset.getNode()->dumpr(); - //errs() << "Success\n"; - return true; - } - - //errs() << "Failure\n"; - return false; -} - -bool PTXDAGToDAGISel::isImm(const SDValue &operand) { - return ConstantSDNode::classof(operand.getNode()); -} - -bool PTXDAGToDAGISel::SelectImm(const SDValue &operand, SDValue &imm) { - SDNode *node = operand.getNode(); - if (!ConstantSDNode::classof(node)) - return false; - - ConstantSDNode *CN = cast<ConstantSDNode>(node); - imm = CurDAG->getTargetConstant(*CN->getConstantIntValue(), - operand.getValueType()); - return true; -} - -const PTXSubtarget& PTXDAGToDAGISel::getSubtarget() const -{ - return TM.getSubtarget<PTXSubtarget>(); -} - diff --git a/lib/Target/PTX/PTXISelLowering.cpp b/lib/Target/PTX/PTXISelLowering.cpp deleted file mode 100644 index ef4455b..0000000 --- a/lib/Target/PTX/PTXISelLowering.cpp +++ /dev/null @@ -1,522 +0,0 @@ -//===-- PTXISelLowering.cpp - PTX DAG Lowering Implementation -------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the PTXTargetLowering class. 
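The three selectors above partition address expressions: SelectADDRii claims pure-immediate forms, SelectADDRlocal claims frame-index forms (with or without a constant offset), and SelectADDRri takes the remaining register-based forms. A standalone sketch of that split, with an invented Addr struct in place of SDValue:

#include <cstdio>

struct Addr {
  bool IsImm;                 // a bare constant
  bool IsFrameIndex;          // a stack slot
  bool IsAddOfBasePlusConst;  // base + constant offset
  const Addr *Base;           // valid when IsAddOfBasePlusConst
};

const char *classify(const Addr &A) {
  if (A.IsImm) return "ADDRii";                // [imm]
  if (A.IsFrameIndex) return "ADDRlocal";      // [frame index]
  if (A.IsAddOfBasePlusConst)
    return A.Base->IsFrameIndex ? "ADDRlocal"  // [fi + imm]
                                : "ADDRri";    // [reg + imm]
  return "ADDRri";                             // plain [reg]
}

int main() {
  Addr FI{false, true, false, nullptr};
  Addr Off{false, false, true, &FI};
  std::printf("%s\n", classify(Off));  // ADDRlocal
}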
-// -//===----------------------------------------------------------------------===// - -#include "PTXISelLowering.h" -#include "PTX.h" -#include "PTXMachineFunctionInfo.h" -#include "PTXRegisterInfo.h" -#include "PTXSubtarget.h" -#include "llvm/Function.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -//===----------------------------------------------------------------------===// -// TargetLowering Implementation -//===----------------------------------------------------------------------===// - -PTXTargetLowering::PTXTargetLowering(TargetMachine &TM) - : TargetLowering(TM, new TargetLoweringObjectFileELF()) { - // Set up the register classes. - addRegisterClass(MVT::i1, PTX::RegPredRegisterClass); - addRegisterClass(MVT::i16, PTX::RegI16RegisterClass); - addRegisterClass(MVT::i32, PTX::RegI32RegisterClass); - addRegisterClass(MVT::i64, PTX::RegI64RegisterClass); - addRegisterClass(MVT::f32, PTX::RegF32RegisterClass); - addRegisterClass(MVT::f64, PTX::RegF64RegisterClass); - - setBooleanContents(ZeroOrOneBooleanContent); - setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct? - setMinFunctionAlignment(2); - - // Let LLVM use loads/stores for all mem* operations - maxStoresPerMemcpy = 4096; - maxStoresPerMemmove = 4096; - maxStoresPerMemset = 4096; - - //////////////////////////////////// - /////////// Expansion ////////////// - //////////////////////////////////// - - // (any/zero/sign) extload => load + (any/zero/sign) extend - - setLoadExtAction(ISD::EXTLOAD, MVT::i16, Expand); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Expand); - - // f32 extload => load + fextend - - setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); - - // f64 truncstore => trunc + store - - setTruncStoreAction(MVT::f64, MVT::f32, Expand); - - // sign_extend_inreg => sign_extend - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); - - // br_cc => brcond - - setOperationAction(ISD::BR_CC, MVT::Other, Expand); - - // select_cc => setcc - - setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); - setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); - setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); - - //////////////////////////////////// - //////////// Legal ///////////////// - //////////////////////////////////// - - setOperationAction(ISD::ConstantFP, MVT::f32, Legal); - setOperationAction(ISD::ConstantFP, MVT::f64, Legal); - - //////////////////////////////////// - //////////// Custom //////////////// - //////////////////////////////////// - - // customise setcc to use bitwise logic if possible - - //setOperationAction(ISD::SETCC, MVT::i1, Custom); - setOperationAction(ISD::SETCC, MVT::i1, Legal); - - // customize translation of memory addresses - - setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); - setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); - - // Compute derived properties from the register classes - computeRegisterProperties(); -} - -EVT PTXTargetLowering::getSetCCResultType(EVT VT) const { - return MVT::i1; -} - -SDValue PTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { - switch (Op.getOpcode()) { - default: 
- llvm_unreachable("Unimplemented operand"); - case ISD::SETCC: - return LowerSETCC(Op, DAG); - case ISD::GlobalAddress: - return LowerGlobalAddress(Op, DAG); - } -} - -const char *PTXTargetLowering::getTargetNodeName(unsigned Opcode) const { - switch (Opcode) { - default: - llvm_unreachable("Unknown opcode"); - case PTXISD::COPY_ADDRESS: - return "PTXISD::COPY_ADDRESS"; - case PTXISD::LOAD_PARAM: - return "PTXISD::LOAD_PARAM"; - case PTXISD::STORE_PARAM: - return "PTXISD::STORE_PARAM"; - case PTXISD::READ_PARAM: - return "PTXISD::READ_PARAM"; - case PTXISD::WRITE_PARAM: - return "PTXISD::WRITE_PARAM"; - case PTXISD::EXIT: - return "PTXISD::EXIT"; - case PTXISD::RET: - return "PTXISD::RET"; - case PTXISD::CALL: - return "PTXISD::CALL"; - } -} - -//===----------------------------------------------------------------------===// -// Custom Lower Operation -//===----------------------------------------------------------------------===// - -SDValue PTXTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { - assert(Op.getValueType() == MVT::i1 && "SetCC type must be 1-bit integer"); - SDValue Op0 = Op.getOperand(0); - SDValue Op1 = Op.getOperand(1); - SDValue Op2 = Op.getOperand(2); - DebugLoc dl = Op.getDebugLoc(); - //ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); - - // Look for X == 0, X == 1, X != 0, or X != 1 - // We can simplify these to bitwise logic - - //if (Op1.getOpcode() == ISD::Constant && - // (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || - // cast<ConstantSDNode>(Op1)->isNullValue()) && - // (CC == ISD::SETEQ || CC == ISD::SETNE)) { - // - // return DAG.getNode(ISD::AND, dl, MVT::i1, Op0, Op1); - //} - - //ConstantSDNode* COp1 = cast<ConstantSDNode>(Op1); - //if(COp1 && COp1->getZExtValue() == 1) { - // if(CC == ISD::SETNE) { - // return DAG.getNode(PTX::XORripreds, dl, MVT::i1, Op0); - // } - //} - - llvm_unreachable("setcc was not matched by a pattern!"); - - return DAG.getNode(ISD::SETCC, dl, MVT::i1, Op0, Op1, Op2); -} - -SDValue PTXTargetLowering:: -LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { - EVT PtrVT = getPointerTy(); - DebugLoc dl = Op.getDebugLoc(); - const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); - - assert(PtrVT.isSimple() && "Pointer must be to primitive type."); - - SDValue targetGlobal = DAG.getTargetGlobalAddress(GV, dl, PtrVT); - SDValue movInstr = DAG.getNode(PTXISD::COPY_ADDRESS, - dl, - PtrVT.getSimpleVT(), - targetGlobal); - - return movInstr; -} - -//===----------------------------------------------------------------------===// -// Calling Convention Implementation -//===----------------------------------------------------------------------===// - -SDValue PTXTargetLowering:: - LowerFormalArguments(SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, - SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const { - if (isVarArg) llvm_unreachable("PTX does not support varargs"); - - MachineFunction &MF = DAG.getMachineFunction(); - const PTXSubtarget& ST = getTargetMachine().getSubtarget<PTXSubtarget>(); - PTXMachineFunctionInfo *MFI = MF.getInfo<PTXMachineFunctionInfo>(); - PTXParamManager &PM = MFI->getParamManager(); - - switch (CallConv) { - default: - llvm_unreachable("Unsupported calling convention"); - case CallingConv::PTX_Kernel: - MFI->setKernel(true); - break; - case CallingConv::PTX_Device: - MFI->setKernel(false); - break; - } - - // We do one of two things here: - // IsKernel || SM >= 2.0 -> Use param space 
for arguments - // SM < 2.0 -> Use registers for arguments - if (MFI->isKernel() || ST.useParamSpaceForDeviceArgs()) { - // We just need to emit the proper LOAD_PARAM ISDs - for (unsigned i = 0, e = Ins.size(); i != e; ++i) { - assert((!MFI->isKernel() || Ins[i].VT != MVT::i1) && - "Kernels cannot take pred operands"); - - unsigned ParamSize = Ins[i].VT.getStoreSizeInBits(); - unsigned Param = PM.addArgumentParam(ParamSize); - const std::string &ParamName = PM.getParamName(Param); - SDValue ParamValue = DAG.getTargetExternalSymbol(ParamName.c_str(), - MVT::Other); - SDValue ArgValue = DAG.getNode(PTXISD::LOAD_PARAM, dl, Ins[i].VT, Chain, - ParamValue); - InVals.push_back(ArgValue); - } - } - else { - for (unsigned i = 0, e = Ins.size(); i != e; ++i) { - EVT RegVT = Ins[i].VT; - const TargetRegisterClass* TRC = getRegClassFor(RegVT); - unsigned RegType; - - // Determine which register class we need - if (RegVT == MVT::i1) - RegType = PTXRegisterType::Pred; - else if (RegVT == MVT::i16) - RegType = PTXRegisterType::B16; - else if (RegVT == MVT::i32) - RegType = PTXRegisterType::B32; - else if (RegVT == MVT::i64) - RegType = PTXRegisterType::B64; - else if (RegVT == MVT::f32) - RegType = PTXRegisterType::F32; - else if (RegVT == MVT::f64) - RegType = PTXRegisterType::F64; - else - llvm_unreachable("Unknown parameter type"); - - // Use a unique index in the instruction to prevent instruction folding. - // Yes, this is a hack. - SDValue Index = DAG.getTargetConstant(i, MVT::i32); - unsigned Reg = MF.getRegInfo().createVirtualRegister(TRC); - SDValue ArgValue = DAG.getNode(PTXISD::READ_PARAM, dl, RegVT, Chain, - Index); - - InVals.push_back(ArgValue); - - MFI->addRegister(Reg, RegType, PTXRegisterSpace::Argument); - } - } - - return Chain; -} - -SDValue PTXTargetLowering:: - LowerReturn(SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - DebugLoc dl, - SelectionDAG &DAG) const { - if (isVarArg) llvm_unreachable("PTX does not support varargs"); - - switch (CallConv) { - default: - llvm_unreachable("Unsupported calling convention."); - case CallingConv::PTX_Kernel: - assert(Outs.size() == 0 && "Kernel must return void."); - return DAG.getNode(PTXISD::EXIT, dl, MVT::Other, Chain); - case CallingConv::PTX_Device: - assert(Outs.size() <= 1 && "Can at most return one value."); - break; - } - - MachineFunction& MF = DAG.getMachineFunction(); - PTXMachineFunctionInfo *MFI = MF.getInfo<PTXMachineFunctionInfo>(); - PTXParamManager &PM = MFI->getParamManager(); - - SDValue Flag; - const PTXSubtarget& ST = getTargetMachine().getSubtarget<PTXSubtarget>(); - - if (ST.useParamSpaceForDeviceArgs()) { - assert(Outs.size() < 2 && "Device functions can return at most one value"); - - if (Outs.size() == 1) { - unsigned ParamSize = OutVals[0].getValueType().getSizeInBits(); - unsigned Param = PM.addReturnParam(ParamSize); - const std::string &ParamName = PM.getParamName(Param); - SDValue ParamValue = DAG.getTargetExternalSymbol(ParamName.c_str(), - MVT::Other); - Chain = DAG.getNode(PTXISD::STORE_PARAM, dl, MVT::Other, Chain, - ParamValue, OutVals[0]); - } - } else { - for (unsigned i = 0, e = Outs.size(); i != e; ++i) { - EVT RegVT = Outs[i].VT; - const TargetRegisterClass* TRC; - unsigned RegType; - - // Determine which register class we need - if (RegVT == MVT::i1) { - TRC = PTX::RegPredRegisterClass; - RegType = PTXRegisterType::Pred; - } - else if (RegVT == MVT::i16) { - TRC = PTX::RegI16RegisterClass; - RegType 
= PTXRegisterType::B16; - } - else if (RegVT == MVT::i32) { - TRC = PTX::RegI32RegisterClass; - RegType = PTXRegisterType::B32; - } - else if (RegVT == MVT::i64) { - TRC = PTX::RegI64RegisterClass; - RegType = PTXRegisterType::B64; - } - else if (RegVT == MVT::f32) { - TRC = PTX::RegF32RegisterClass; - RegType = PTXRegisterType::F32; - } - else if (RegVT == MVT::f64) { - TRC = PTX::RegF64RegisterClass; - RegType = PTXRegisterType::F64; - } - else { - llvm_unreachable("Unknown parameter type"); - } - - unsigned Reg = MF.getRegInfo().createVirtualRegister(TRC); - - SDValue Copy = DAG.getCopyToReg(Chain, dl, Reg, OutVals[i]/*, Flag*/); - SDValue OutReg = DAG.getRegister(Reg, RegVT); - - Chain = DAG.getNode(PTXISD::WRITE_PARAM, dl, MVT::Other, Copy, OutReg); - - MFI->addRegister(Reg, RegType, PTXRegisterSpace::Return); - } - } - - if (Flag.getNode() == 0) { - return DAG.getNode(PTXISD::RET, dl, MVT::Other, Chain); - } - else { - return DAG.getNode(PTXISD::RET, dl, MVT::Other, Chain, Flag); - } -} - -SDValue -PTXTargetLowering::LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, - bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const { - - MachineFunction& MF = DAG.getMachineFunction(); - PTXMachineFunctionInfo *PTXMFI = MF.getInfo<PTXMachineFunctionInfo>(); - PTXParamManager &PM = PTXMFI->getParamManager(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - - assert(getTargetMachine().getSubtarget<PTXSubtarget>().callsAreHandled() && - "Calls are not handled for the target device"); - - // Identify the callee function - const GlobalValue *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal(); - const Function *function = cast<Function>(GV); - - // allow non-device calls only for printf - bool isPrintf = function->getName() == "printf" || function->getName() == "puts"; - - assert((isPrintf || function->getCallingConv() == CallingConv::PTX_Device) && - "PTX function calls must be to PTX device functions"); - - unsigned outSize = isPrintf ? 
2 : Outs.size(); - - std::vector<SDValue> Ops; - // The layout of the ops will be [Chain, #Ins, Ins, Callee, #Outs, Outs] - Ops.resize(outSize + Ins.size() + 4); - - Ops[0] = Chain; - - // Identify the callee function - Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy()); - Ops[Ins.size()+2] = Callee; - - // #Outs - Ops[Ins.size()+3] = DAG.getTargetConstant(outSize, MVT::i32); - - if (isPrintf) { - // first argument is the address of the global string variable in memory - unsigned Param0 = PM.addLocalParam(getPointerTy().getSizeInBits()); - SDValue ParamValue0 = DAG.getTargetExternalSymbol(PM.getParamName(Param0).c_str(), - MVT::Other); - Chain = DAG.getNode(PTXISD::STORE_PARAM, dl, MVT::Other, Chain, - ParamValue0, OutVals[0]); - Ops[Ins.size()+4] = ParamValue0; - - // alignment is the maximum size of all the arguments - unsigned alignment = 0; - for (unsigned i = 1; i < OutVals.size(); ++i) { - alignment = std::max(alignment, - OutVals[i].getValueType().getSizeInBits()); - } - - // size is the alignment multiplied by the number of arguments - unsigned size = alignment * (OutVals.size() - 1); - - // second argument is the address of the stack object (unless no arguments) - unsigned Param1 = PM.addLocalParam(getPointerTy().getSizeInBits()); - SDValue ParamValue1 = DAG.getTargetExternalSymbol(PM.getParamName(Param1).c_str(), - MVT::Other); - Ops[Ins.size()+5] = ParamValue1; - - if (size > 0) - { - // create a local stack object to store the arguments - unsigned StackObject = MFI->CreateStackObject(size / 8, alignment / 8, false); - SDValue FrameIndex = DAG.getFrameIndex(StackObject, getPointerTy()); - - // store each of the arguments to the stack in turn - for (unsigned int i = 1; i != OutVals.size(); i++) { - SDValue FrameAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FrameIndex, DAG.getTargetConstant((i - 1) * 8, getPointerTy())); - Chain = DAG.getStore(Chain, dl, OutVals[i], FrameAddr, - MachinePointerInfo(), - false, false, 0); - } - - // copy the address of the local frame index to get the address in non-local space - SDValue genericAddr = DAG.getNode(PTXISD::COPY_ADDRESS, dl, getPointerTy(), FrameIndex); - - // store this address in the second argument - Chain = DAG.getNode(PTXISD::STORE_PARAM, dl, MVT::Other, Chain, ParamValue1, genericAddr); - } - } - else - { - // Generate STORE_PARAM nodes for each function argument. In PTX, function - // arguments are explicitly stored into .param variables and passed as - // arguments. There is no register/stack-based calling convention in PTX. - for (unsigned i = 0; i != OutVals.size(); ++i) { - unsigned Size = OutVals[i].getValueType().getSizeInBits(); - unsigned Param = PM.addLocalParam(Size); - const std::string &ParamName = PM.getParamName(Param); - SDValue ParamValue = DAG.getTargetExternalSymbol(ParamName.c_str(), - MVT::Other); - Chain = DAG.getNode(PTXISD::STORE_PARAM, dl, MVT::Other, Chain, - ParamValue, OutVals[i]); - Ops[i+Ins.size()+4] = ParamValue; - } - } - - std::vector<SDValue> InParams; - - // Generate list of .param variables to hold the return value(s). - Ops[1] = DAG.getTargetConstant(Ins.size(), MVT::i32); - for (unsigned i = 0; i < Ins.size(); ++i) { - unsigned Size = Ins[i].VT.getStoreSizeInBits(); - unsigned Param = PM.addLocalParam(Size); - const std::string &ParamName = PM.getParamName(Param); - SDValue ParamValue = DAG.getTargetExternalSymbol(ParamName.c_str(), - MVT::Other); - Ops[i+2] = ParamValue; - InParams.push_back(ParamValue); - } - - Ops[0] = Chain; - - // Create the CALL node. 
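Before the CALL node below is built, the printf path above has packed the variadic arguments into one local buffer: alignment is the widest argument, the buffer holds one alignment-sized slot per variadic argument, and each value is then stored at an 8-byte stride. A sketch of the size computation only (sizes in bits, mirroring the code; the helper name and types are invented):

#include <algorithm>
#include <cstdio>
#include <vector>

struct BufferLayout { unsigned AlignBytes; unsigned SizeBytes; };

// ArgBits holds the widths of the variadic arguments, i.e. everything
// after the format string (which is passed separately as __param_1).
BufferLayout layoutPrintfArgs(const std::vector<unsigned> &ArgBits) {
  unsigned AlignBits = 0;
  for (unsigned Bits : ArgBits)  // widest argument sets the alignment
    AlignBits = std::max(AlignBits, Bits);
  unsigned SizeBits = AlignBits * (unsigned)ArgBits.size();
  return { AlignBits / 8, SizeBits / 8 };
}

int main() {
  // Two variadic args after the format string: an i32 and an f64.
  BufferLayout L = layoutPrintfArgs({32, 64});
  std::printf("align %u bytes, size %u bytes\n", L.AlignBytes, L.SizeBytes);
  // Each argument is then stored at offset i * 8 bytes, as in the code above.
}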
- Chain = DAG.getNode(PTXISD::CALL, dl, MVT::Other, &Ops[0], Ops.size()); - - // Create the LOAD_PARAM nodes that retrieve the function return value(s). - for (unsigned i = 0; i < Ins.size(); ++i) { - SDValue Load = DAG.getNode(PTXISD::LOAD_PARAM, dl, Ins[i].VT, Chain, - InParams[i]); - InVals.push_back(Load); - } - - return Chain; -} - -unsigned PTXTargetLowering::getNumRegisters(LLVMContext &Context, EVT VT) { - // All arguments consist of one "register," regardless of the type. - return 1; -} - diff --git a/lib/Target/PTX/PTXISelLowering.h b/lib/Target/PTX/PTXISelLowering.h deleted file mode 100644 index 33220f4..0000000 --- a/lib/Target/PTX/PTXISelLowering.h +++ /dev/null @@ -1,82 +0,0 @@ -//===-- PTXISelLowering.h - PTX DAG Lowering Interface ----------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the interfaces that PTX uses to lower LLVM code into a -// selection DAG. -// -//===----------------------------------------------------------------------===// - -#ifndef PTX_ISEL_LOWERING_H -#define PTX_ISEL_LOWERING_H - -#include "llvm/Target/TargetLowering.h" - -namespace llvm { - -namespace PTXISD { - enum NodeType { - FIRST_NUMBER = ISD::BUILTIN_OP_END, - LOAD_PARAM, - STORE_PARAM, - READ_PARAM, - WRITE_PARAM, - EXIT, - RET, - COPY_ADDRESS, - CALL - }; -} // namespace PTXISD - -class PTXTargetLowering : public TargetLowering { - public: - explicit PTXTargetLowering(TargetMachine &TM); - - virtual const char *getTargetNodeName(unsigned Opcode) const; - - virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; - - virtual SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; - - virtual SDValue - LowerFormalArguments(SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, - SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const; - - virtual SDValue - LowerReturn(SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - DebugLoc dl, - SelectionDAG &DAG) const; - - virtual SDValue - LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, - bool isVarArg, bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const; - - virtual EVT getSetCCResultType(EVT VT) const; - - virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT); - - private: - SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; -}; // class PTXTargetLowering -} // namespace llvm - -#endif // PTX_ISEL_LOWERING_H diff --git a/lib/Target/PTX/PTXInstrFormats.td b/lib/Target/PTX/PTXInstrFormats.td deleted file mode 100644 index 267e834..0000000 --- a/lib/Target/PTX/PTXInstrFormats.td +++ /dev/null @@ -1,51 +0,0 @@ -//===-- PTXInstrFormats.td - PTX Instruction Formats -------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - - -// Rounding Mode Specifier -/*class RoundingMode<bits<3> val> { - bits<3> Value = val; -} - -def RndDefault : RoundingMode<0>; -def RndNearestEven : RoundingMode<1>; -def RndNearestZero : RoundingMode<2>; -def RndNegInf : RoundingMode<3>; -def RndPosInf : RoundingMode<4>; -def RndApprox : RoundingMode<5>;*/ - - -// Rounding Mode Operand -def RndMode : Operand<i32> { - let PrintMethod = "printRoundingMode"; -} - -def RndDefault : PatLeaf<(i32 0)>; - -// PTX Predicate operand, default to (0, 0) = (zero-reg, none). -// Leave PrintMethod empty; predicate printing is defined elsewhere. -def pred : PredicateOperand<OtherVT, (ops RegPred, i32imm), - (ops (i1 zero_reg), (i32 2))>; - -def RndModeOperand : Operand<OtherVT> { - let MIOperandInfo = (ops i32imm); -} - -// Instruction Types -let Namespace = "PTX" in { - - class InstPTX<dag oops, dag iops, string asmstr, list<dag> pattern> - : Instruction { - dag OutOperandList = oops; - dag InOperandList = !con(iops, (ins pred:$_p)); - let AsmString = asmstr; // Predicate printing is defined elsewhere. - let Pattern = pattern; - let isPredicable = 1; - } -} diff --git a/lib/Target/PTX/PTXInstrInfo.cpp b/lib/Target/PTX/PTXInstrInfo.cpp deleted file mode 100644 index 443cd54..0000000 --- a/lib/Target/PTX/PTXInstrInfo.cpp +++ /dev/null @@ -1,359 +0,0 @@ -//===-- PTXInstrInfo.cpp - PTX Instruction Information --------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the PTX implementation of the TargetInstrInfo class. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "ptx-instrinfo" - -#include "PTXInstrInfo.h" -#include "PTX.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/CodeGen/SelectionDAGNodes.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/raw_ostream.h" - -#define GET_INSTRINFO_CTOR -#include "PTXGenInstrInfo.inc" - -using namespace llvm; - -PTXInstrInfo::PTXInstrInfo(PTXTargetMachine &_TM) - : PTXGenInstrInfo(), - RI(_TM, *this), TM(_TM) {} - -static const struct map_entry { - const TargetRegisterClass *cls; - const int opcode; -} map[] = { - { &PTX::RegI16RegClass, PTX::MOVU16rr }, - { &PTX::RegI32RegClass, PTX::MOVU32rr }, - { &PTX::RegI64RegClass, PTX::MOVU64rr }, - { &PTX::RegF32RegClass, PTX::MOVF32rr }, - { &PTX::RegF64RegClass, PTX::MOVF64rr }, - { &PTX::RegPredRegClass, PTX::MOVPREDrr } -}; - -void PTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DstReg, unsigned SrcReg, - bool KillSrc) const { - - const MachineRegisterInfo& MRI = MBB.getParent()->getRegInfo(); - //assert(MRI.getRegClass(SrcReg) == MRI.getRegClass(DstReg) && - // "Invalid register copy between two register classes"); - - for (int i = 0, e = sizeof(map)/sizeof(map[0]); i != e; ++i) { - if (map[i].cls == MRI.getRegClass(DstReg)) { - const MCInstrDesc &MCID = get(map[i].opcode); - MachineInstr *MI = BuildMI(MBB, I, DL, MCID, DstReg). 
- addReg(SrcReg, getKillRegState(KillSrc)); - AddDefaultPredicate(MI); - return; - } - } - - llvm_unreachable("Impossible reg-to-reg copy"); -} - -bool PTXInstrInfo::copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DstReg, unsigned SrcReg, - const TargetRegisterClass *DstRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const { - if (DstRC != SrcRC) - return false; - - for (int i = 0, e = sizeof(map)/sizeof(map[0]); i != e; ++i) - if (DstRC == map[i].cls) { - const MCInstrDesc &MCID = get(map[i].opcode); - MachineInstr *MI = BuildMI(MBB, I, DL, MCID, DstReg).addReg(SrcReg); - AddDefaultPredicate(MI); - return true; - } - - return false; -} - -bool PTXInstrInfo::isMoveInstr(const MachineInstr& MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SrcSubIdx, unsigned &DstSubIdx) const { - switch (MI.getOpcode()) { - default: - return false; - case PTX::MOVU16rr: - case PTX::MOVU32rr: - case PTX::MOVU64rr: - case PTX::MOVF32rr: - case PTX::MOVF64rr: - case PTX::MOVPREDrr: - assert(MI.getNumOperands() >= 2 && - MI.getOperand(0).isReg() && MI.getOperand(1).isReg() && - "Invalid register-register move instruction"); - SrcSubIdx = DstSubIdx = 0; // No sub-registers - DstReg = MI.getOperand(0).getReg(); - SrcReg = MI.getOperand(1).getReg(); - return true; - } -} - -// predicate support - -bool PTXInstrInfo::isPredicated(const MachineInstr *MI) const { - int i = MI->findFirstPredOperandIdx(); - return i != -1 && MI->getOperand(i).getReg() != PTX::NoRegister; -} - -bool PTXInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { - return !isPredicated(MI) && MI->isTerminator(); -} - -bool PTXInstrInfo:: -PredicateInstruction(MachineInstr *MI, - const SmallVectorImpl<MachineOperand> &Pred) const { - if (Pred.size() < 2) - llvm_unreachable("fewer than 2 predicate operands provided"); - - int i = MI->findFirstPredOperandIdx(); - if (i == -1) - llvm_unreachable("missing predicate operand"); - - MI->getOperand(i).setReg(Pred[0].getReg()); - MI->getOperand(i+1).setImm(Pred[1].getImm()); - - return true; -} - -bool PTXInstrInfo:: -SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, - const SmallVectorImpl<MachineOperand> &Pred2) const { - const MachineOperand &PredReg1 = Pred1[0]; - const MachineOperand &PredReg2 = Pred2[0]; - if (PredReg1.getReg() != PredReg2.getReg()) - return false; - - const MachineOperand &PredOp1 = Pred1[1]; - const MachineOperand &PredOp2 = Pred2[1]; - if (PredOp1.getImm() != PredOp2.getImm()) - return false; - - return true; -} - -bool PTXInstrInfo:: -DefinesPredicate(MachineInstr *MI, - std::vector<MachineOperand> &Pred) const { - // If an instruction sets a predicate register, it defines a predicate.
- - // TODO: support the 5-operand format of the setp instruction - - if (MI->getNumOperands() < 1) - return false; - - const MachineOperand &MO = MI->getOperand(0); - - if (!MO.isReg() || RI.getRegClass(MO.getReg()) != &PTX::RegPredRegClass) - return false; - - Pred.push_back(MO); - Pred.push_back(MachineOperand::CreateImm(PTXPredicate::None)); - return true; -} - -// branch support - -bool PTXInstrInfo:: -AnalyzeBranch(MachineBasicBlock &MBB, - MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl<MachineOperand> &Cond, - bool AllowModify) const { - // TODO: implement the cases where AllowModify is true - - if (MBB.empty()) - return true; - - MachineBasicBlock::iterator iter = MBB.end(); - const MachineInstr& instLast1 = *--iter; - // special case: MBB contains only 1 instruction - const bool IsSizeOne = MBB.size() == 1; - // if IsSizeOne is true, *--iter would be invalid, so we put a dummy - // value in instLast2, which is still read below - const MachineInstr& instLast2 = IsSizeOne ? instLast1 : *--iter; - - DEBUG(dbgs() << "\n"); - DEBUG(dbgs() << "AnalyzeBranch: opcode: " << instLast1.getOpcode() << "\n"); - DEBUG(dbgs() << "AnalyzeBranch: MBB: " << MBB.getName().str() << "\n"); - DEBUG(dbgs() << "AnalyzeBranch: TBB: " << TBB << "\n"); - DEBUG(dbgs() << "AnalyzeBranch: FBB: " << FBB << "\n"); - - // this block ends with no branches - if (!IsAnyKindOfBranch(instLast1)) { - DEBUG(dbgs() << "AnalyzeBranch: ends with no branch\n"); - return false; - } - - // this block ends with only an unconditional branch - if (instLast1.isUnconditionalBranch() && - // when IsSizeOne is true, it "absorbs" the evaluation of instLast2 - (IsSizeOne || !IsAnyKindOfBranch(instLast2))) { - DEBUG(dbgs() << "AnalyzeBranch: ends with only uncond branch\n"); - TBB = GetBranchTarget(instLast1); - return false; - } - - // this block ends with a conditional branch and - // it falls through to a successor block - if (instLast1.isConditionalBranch() && - IsAnySuccessorAlsoLayoutSuccessor(MBB)) { - DEBUG(dbgs() << "AnalyzeBranch: ends with cond branch and fall through\n"); - TBB = GetBranchTarget(instLast1); - int i = instLast1.findFirstPredOperandIdx(); - Cond.push_back(instLast1.getOperand(i)); - Cond.push_back(instLast1.getOperand(i+1)); - return false; - } - - // when IsSizeOne is true, we are done - if (IsSizeOne) - return true; - - // this block ends with a conditional branch - // followed by an unconditional branch - if (instLast2.isConditionalBranch() && - instLast1.isUnconditionalBranch()) { - DEBUG(dbgs() << "AnalyzeBranch: ends with cond and uncond branch\n"); - TBB = GetBranchTarget(instLast2); - FBB = GetBranchTarget(instLast1); - int i = instLast2.findFirstPredOperandIdx(); - Cond.push_back(instLast2.getOperand(i)); - Cond.push_back(instLast2.getOperand(i+1)); - return false; - } - - // branch cannot be understood - DEBUG(dbgs() << "AnalyzeBranch: cannot be understood\n"); - return true; -} - -unsigned PTXInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { - unsigned count = 0; - while (!MBB.empty()) - if (IsAnyKindOfBranch(MBB.back())) { - MBB.pop_back(); - ++count; - } else - break; - DEBUG(dbgs() << "RemoveBranch: MBB: " << MBB.getName().str() << "\n"); - DEBUG(dbgs() << "RemoveBranch: removed " << count << " branch instructions\n"); - return count; -} - -unsigned PTXInstrInfo:: -InsertBranch(MachineBasicBlock &MBB, - MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond, - DebugLoc DL) const { - DEBUG(dbgs() << "InsertBranch: MBB: " << 
MBB.getName().str() << "\n"); - DEBUG(if (TBB) dbgs() << "InsertBranch: TBB: " << TBB->getName().str() - << "\n"; - else dbgs() << "InsertBranch: TBB: (NULL)\n"); - DEBUG(if (FBB) dbgs() << "InsertBranch: FBB: " << FBB->getName().str() - << "\n"; - else dbgs() << "InsertBranch: FBB: (NULL)\n"); - DEBUG(dbgs() << "InsertBranch: Cond size: " << Cond.size() << "\n"); - - assert(TBB && "TBB is NULL"); - - if (FBB) { - BuildMI(&MBB, DL, get(PTX::BRAdp)) - .addMBB(TBB).addReg(Cond[0].getReg()).addImm(Cond[1].getImm()); - BuildMI(&MBB, DL, get(PTX::BRAd)) - .addMBB(FBB).addReg(PTX::NoRegister).addImm(PTXPredicate::None); - return 2; - } else if (Cond.size()) { - BuildMI(&MBB, DL, get(PTX::BRAdp)) - .addMBB(TBB).addReg(Cond[0].getReg()).addImm(Cond[1].getImm()); - return 1; - } else { - BuildMI(&MBB, DL, get(PTX::BRAd)) - .addMBB(TBB).addReg(PTX::NoRegister).addImm(PTXPredicate::None); - return 1; - } -} - -// Memory operand folding for spills -void PTXInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MII, - unsigned SrcReg, bool isKill, int FrameIdx, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - llvm_unreachable("storeRegToStackSlot should not be called for PTX"); -} - -void PTXInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MII, - unsigned DestReg, int FrameIdx, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - llvm_unreachable("loadRegFromStackSlot should not be called for PTX"); -} - -// static helper routines - -MachineSDNode *PTXInstrInfo:: -GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode, - DebugLoc dl, EVT VT, SDValue Op1) { - SDValue predReg = DAG->getRegister(PTX::NoRegister, MVT::i1); - SDValue predOp = DAG->getTargetConstant(PTXPredicate::None, MVT::i32); - SDValue ops[] = { Op1, predReg, predOp }; - return DAG->getMachineNode(Opcode, dl, VT, ops, array_lengthof(ops)); -} - -MachineSDNode *PTXInstrInfo:: -GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode, - DebugLoc dl, EVT VT, SDValue Op1, SDValue Op2) { - SDValue predReg = DAG->getRegister(PTX::NoRegister, MVT::i1); - SDValue predOp = DAG->getTargetConstant(PTXPredicate::None, MVT::i32); - SDValue ops[] = { Op1, Op2, predReg, predOp }; - return DAG->getMachineNode(Opcode, dl, VT, ops, array_lengthof(ops)); -} - -void PTXInstrInfo::AddDefaultPredicate(MachineInstr *MI) { - if (MI->findFirstPredOperandIdx() == -1) { - MI->addOperand(MachineOperand::CreateReg(PTX::NoRegister, /*IsDef=*/false)); - MI->addOperand(MachineOperand::CreateImm(PTXPredicate::None)); - } -} - -bool PTXInstrInfo::IsAnyKindOfBranch(const MachineInstr& inst) { - return inst.isTerminator() || inst.isBranch() || inst.isIndirectBranch(); -} - -bool PTXInstrInfo:: -IsAnySuccessorAlsoLayoutSuccessor(const MachineBasicBlock& MBB) { - for (MachineBasicBlock::const_succ_iterator - i = MBB.succ_begin(), e = MBB.succ_end(); i != e; ++i) - if (MBB.isLayoutSuccessor((const MachineBasicBlock*) &*i)) - return true; - return false; -} - -MachineBasicBlock *PTXInstrInfo::GetBranchTarget(const MachineInstr& inst) { - // FIXME So far all branch instructions put destination in 1st operand - const MachineOperand& target = inst.getOperand(0); - assert(target.isMBB() && "FIXME: detect branch target operand"); - return target.getMBB(); -} diff --git a/lib/Target/PTX/PTXInstrInfo.h b/lib/Target/PTX/PTXInstrInfo.h deleted file mode 100644 index fba89c0..0000000 --- a/lib/Target/PTX/PTXInstrInfo.h +++ /dev/null @@ -1,133 +0,0 @@ -//===-- PTXInstrInfo.h - 
PTX Instruction Information ------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the PTX implementation of the TargetInstrInfo class. -// -//===----------------------------------------------------------------------===// - -#ifndef PTX_INSTR_INFO_H -#define PTX_INSTR_INFO_H - -#include "PTXRegisterInfo.h" -#include "llvm/Target/TargetInstrInfo.h" - -#define GET_INSTRINFO_HEADER -#include "PTXGenInstrInfo.inc" - -namespace llvm { -class PTXTargetMachine; - -class MachineSDNode; -class SDValue; -class SelectionDAG; - -class PTXInstrInfo : public PTXGenInstrInfo { -private: - const PTXRegisterInfo RI; - PTXTargetMachine &TM; - -public: - explicit PTXInstrInfo(PTXTargetMachine &_TM); - - virtual const PTXRegisterInfo &getRegisterInfo() const { return RI; } - - virtual void copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DstReg, unsigned SrcReg, - bool KillSrc) const; - - virtual bool copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DstReg, unsigned SrcReg, - const TargetRegisterClass *DstRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const; - - virtual bool isMoveInstr(const MachineInstr& MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SrcSubIdx, unsigned &DstSubIdx) const; - - // predicate support - - virtual bool isPredicated(const MachineInstr *MI) const; - - virtual bool isUnpredicatedTerminator(const MachineInstr *MI) const; - - virtual - bool PredicateInstruction(MachineInstr *MI, - const SmallVectorImpl<MachineOperand> &Pred) const; - - virtual - bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, - const SmallVectorImpl<MachineOperand> &Pred2) const; - - virtual bool DefinesPredicate(MachineInstr *MI, - std::vector<MachineOperand> &Pred) const; - - // PTX is fully-predicable - virtual bool isPredicable(MachineInstr *MI) const { return true; } - - // branch support - - virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl<MachineOperand> &Cond, - bool AllowModify = false) const; - - virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; - - virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond, - DebugLoc DL) const; - - // Memory operand folding for spills - // TODO: Implement this eventually and get rid of storeRegToStackSlot and - // loadRegFromStackSlot. Doing so will get rid of the "stack" registers - // we currently use to spill, though I doubt the overall effect on ptxas - // output will be large. I have yet to see a case where ptxas is unable - // to see through the "stack" register usage and hence generates - // efficient code anyway. 
- // virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - // MachineInstr* MI, - // const SmallVectorImpl<unsigned> &Ops, - // int FrameIndex) const; - - virtual void storeRegToStackSlot(MachineBasicBlock& MBB, - MachineBasicBlock::iterator MII, - unsigned SrcReg, bool isKill, int FrameIndex, - const TargetRegisterClass* RC, - const TargetRegisterInfo* TRI) const; - virtual void loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MII, - unsigned DestReg, int FrameIdx, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; - - // static helper routines - - static MachineSDNode *GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode, - DebugLoc dl, EVT VT, - SDValue Op1); - - static MachineSDNode *GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode, - DebugLoc dl, EVT VT, - SDValue Op1, SDValue Op2); - - static void AddDefaultPredicate(MachineInstr *MI); - - static bool IsAnyKindOfBranch(const MachineInstr& inst); - - static bool IsAnySuccessorAlsoLayoutSuccessor(const MachineBasicBlock& MBB); - - static MachineBasicBlock *GetBranchTarget(const MachineInstr& inst); -}; // class PTXInstrInfo -} // namespace llvm - -#endif // PTX_INSTR_INFO_H diff --git a/lib/Target/PTX/PTXInstrInfo.td b/lib/Target/PTX/PTXInstrInfo.td deleted file mode 100644 index bead428..0000000 --- a/lib/Target/PTX/PTXInstrInfo.td +++ /dev/null @@ -1,1031 +0,0 @@ -//===-- PTXInstrInfo.td - PTX Instruction defs --------------*- tablegen-*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file describes the PTX instructions in TableGen format. 
-// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// Instruction format superclass -//===----------------------------------------------------------------------===// - -include "PTXInstrFormats.td" - -//===----------------------------------------------------------------------===// -// Code Generation Predicates -//===----------------------------------------------------------------------===// - -// Shader Model Support -def FDivNeedsRoundingMode : Predicate<"getSubtarget().fdivNeedsRoundingMode()">; -def FDivNoRoundingMode : Predicate<"!getSubtarget().fdivNeedsRoundingMode()">; -def FMadNeedsRoundingMode : Predicate<"getSubtarget().fmadNeedsRoundingMode()">; -def FMadNoRoundingMode : Predicate<"!getSubtarget().fmadNeedsRoundingMode()">; - -// PTX Version Support -def SupportsPTX21 : Predicate<"getSubtarget().supportsPTX21()">; -def DoesNotSupportPTX21 : Predicate<"!getSubtarget().supportsPTX21()">; -def SupportsPTX22 : Predicate<"getSubtarget().supportsPTX22()">; -def DoesNotSupportPTX22 : Predicate<"!getSubtarget().supportsPTX22()">; -def SupportsPTX23 : Predicate<"getSubtarget().supportsPTX23()">; -def DoesNotSupportPTX23 : Predicate<"!getSubtarget().supportsPTX23()">; - -// Fused Multiply-Add -def SupportsFMA : Predicate<"getSubtarget().supportsFMA()">; -def DoesNotSupportFMA : Predicate<"!getSubtarget().supportsFMA()">; - - - -// def SDT_PTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>; -// def SDT_PTXCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; - -// def PTXcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_PTXCallSeqStart, -// [SDNPHasChain, SDNPOutGlue]>; -// def PTXcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_PTXCallSeqEnd, -// [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; - -def PTXcall : SDNode<"PTXISD::CALL", SDTNone, - [SDNPHasChain, SDNPVariadic, SDNPOptInGlue, SDNPOutGlue]>; - - -// Branch & call targets have OtherVT type. -def brtarget : Operand<OtherVT>; -def calltarget : Operand<i32>; - -//===----------------------------------------------------------------------===// -// PTX Specific Node Definitions -//===----------------------------------------------------------------------===// - -// PTX allows generic 3-reg shifts like shl r0, r1, r2 -def PTXshl : SDNode<"ISD::SHL", SDTIntBinOp>; -def PTXsrl : SDNode<"ISD::SRL", SDTIntBinOp>; -def PTXsra : SDNode<"ISD::SRA", SDTIntBinOp>; - -def PTXexit - : SDNode<"PTXISD::EXIT", SDTNone, [SDNPHasChain]>; -def PTXret - : SDNode<"PTXISD::RET", SDTNone, - [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; -def PTXcopyaddress - : SDNode<"PTXISD::COPY_ADDRESS", SDTypeProfile<1, 1, []>, []>; - - - -//===----------------------------------------------------------------------===// -// Instruction Class Templates -//===----------------------------------------------------------------------===// - -// For floating-point instructions, we cannot just embed the pattern into the -// instruction definition since we need to muck around with the rounding mode, -// and I do not know how to insert constants into instructions directly from -// pattern matches.
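Concretely, the split described in the comment above pairs an instruction def that takes an explicit rounding-mode operand (and carries an empty pattern list) with a separate selection pattern that injects the default mode as a constant. A condensed expansion of the FADD case, which the multiclasses below define and the patterns further down in this file match:

// Instruction form: explicit RndMode operand, no embedded pattern.
def FADDrr32 : InstPTX<(outs RegF32:$d),
                       (ins RndMode:$r, RegF32:$a, RegF32:$b),
                       "add$r.f32\t$d, $a, $b", []>;
// Selection pattern, written separately so the default rounding mode
// can be supplied as a constant operand.
def : Pat<(f32 (fadd RegF32:$a, RegF32:$b)),
          (FADDrr32 RndDefault, RegF32:$a, RegF32:$b)>;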
- -//===- Floating-Point Instructions - 2 Operand Form -----------------------===// -multiclass PTX_FLOAT_2OP<string opcstr> { - def rr32 : InstPTX<(outs RegF32:$d), - (ins RndMode:$r, RegF32:$a), - !strconcat(opcstr, "$r.f32\t$d, $a"), []>; - def ri32 : InstPTX<(outs RegF32:$d), - (ins RndMode:$r, f32imm:$a), - !strconcat(opcstr, "$r.f32\t$d, $a"), []>; - def rr64 : InstPTX<(outs RegF64:$d), - (ins RndMode:$r, RegF64:$a), - !strconcat(opcstr, "$r.f64\t$d, $a"), []>; - def ri64 : InstPTX<(outs RegF64:$d), - (ins RndMode:$r, f64imm:$a), - !strconcat(opcstr, "$r.f64\t$d, $a"), []>; -} - -//===- Floating-Point Instructions - 3 Operand Form -----------------------===// -multiclass PTX_FLOAT_3OP<string opcstr> { - def rr32 : InstPTX<(outs RegF32:$d), - (ins RndMode:$r, RegF32:$a, RegF32:$b), - !strconcat(opcstr, "$r.f32\t$d, $a, $b"), []>; - def ri32 : InstPTX<(outs RegF32:$d), - (ins RndMode:$r, RegF32:$a, f32imm:$b), - !strconcat(opcstr, "$r.f32\t$d, $a, $b"), []>; - def rr64 : InstPTX<(outs RegF64:$d), - (ins RndMode:$r, RegF64:$a, RegF64:$b), - !strconcat(opcstr, "$r.f64\t$d, $a, $b"), []>; - def ri64 : InstPTX<(outs RegF64:$d), - (ins RndMode:$r, RegF64:$a, f64imm:$b), - !strconcat(opcstr, "$r.f64\t$d, $a, $b"), []>; -} - -//===- Floating-Point Instructions - 4 Operand Form -----------------------===// -multiclass PTX_FLOAT_4OP<string opcstr> { - def rrr32 : InstPTX<(outs RegF32:$d), - (ins RndMode:$r, RegF32:$a, RegF32:$b, RegF32:$c), - !strconcat(opcstr, "$r.f32\t$d, $a, $b, $c"), []>; - def rri32 : InstPTX<(outs RegF32:$d), - (ins RndMode:$r, RegF32:$a, RegF32:$b, f32imm:$c), - !strconcat(opcstr, "$r.f32\t$d, $a, $b, $c"), []>; - def rii32 : InstPTX<(outs RegF32:$d), - (ins RndMode:$r, RegF32:$a, f32imm:$b, f32imm:$c), - !strconcat(opcstr, "$r.f32\t$d, $a, $b, $c"), []>; - def rrr64 : InstPTX<(outs RegF64:$d), - (ins RndMode:$r, RegF64:$a, RegF64:$b, RegF64:$c), - !strconcat(opcstr, "$r.f64\t$d, $a, $b, $c"), []>; - def rri64 : InstPTX<(outs RegF64:$d), - (ins RndMode:$r, RegF64:$a, RegF64:$b, f64imm:$c), - !strconcat(opcstr, "$r.f64\t$d, $a, $b, $c"), []>; - def rii64 : InstPTX<(outs RegF64:$d), - (ins RndMode:$r, RegF64:$a, f64imm:$b, f64imm:$c), - !strconcat(opcstr, "$r.f64\t$d, $a, $b, $c"), []>; -} - -//===- Integer Instructions - 3 Operand Form ------------------------------===// -multiclass PTX_INT3<string opcstr, SDNode opnode> { - def rr16 : InstPTX<(outs RegI16:$d), - (ins RegI16:$a, RegI16:$b), - !strconcat(opcstr, ".u16\t$d, $a, $b"), - [(set RegI16:$d, (opnode RegI16:$a, RegI16:$b))]>; - def ri16 : InstPTX<(outs RegI16:$d), - (ins RegI16:$a, i16imm:$b), - !strconcat(opcstr, ".u16\t$d, $a, $b"), - [(set RegI16:$d, (opnode RegI16:$a, imm:$b))]>; - def rr32 : InstPTX<(outs RegI32:$d), - (ins RegI32:$a, RegI32:$b), - !strconcat(opcstr, ".u32\t$d, $a, $b"), - [(set RegI32:$d, (opnode RegI32:$a, RegI32:$b))]>; - def ri32 : InstPTX<(outs RegI32:$d), - (ins RegI32:$a, i32imm:$b), - !strconcat(opcstr, ".u32\t$d, $a, $b"), - [(set RegI32:$d, (opnode RegI32:$a, imm:$b))]>; - def rr64 : InstPTX<(outs RegI64:$d), - (ins RegI64:$a, RegI64:$b), - !strconcat(opcstr, ".u64\t$d, $a, $b"), - [(set RegI64:$d, (opnode RegI64:$a, RegI64:$b))]>; - def ri64 : InstPTX<(outs RegI64:$d), - (ins RegI64:$a, i64imm:$b), - !strconcat(opcstr, ".u64\t$d, $a, $b"), - [(set RegI64:$d, (opnode RegI64:$a, imm:$b))]>; -} - -//===- Integer Instructions - 3 Operand Form (Signed) ---------------------===// -multiclass PTX_INT3_SIGNED<string opcstr, SDNode opnode> { - def rr16 : InstPTX<(outs RegI16:$d), - (ins 
RegI16:$a, RegI16:$b), - !strconcat(opcstr, ".s16\t$d, $a, $b"), - [(set RegI16:$d, (opnode RegI16:$a, RegI16:$b))]>; - def ri16 : InstPTX<(outs RegI16:$d), - (ins RegI16:$a, i16imm:$b), - !strconcat(opcstr, ".s16\t$d, $a, $b"), - [(set RegI16:$d, (opnode RegI16:$a, imm:$b))]>; - def rr32 : InstPTX<(outs RegI32:$d), - (ins RegI32:$a, RegI32:$b), - !strconcat(opcstr, ".s32\t$d, $a, $b"), - [(set RegI32:$d, (opnode RegI32:$a, RegI32:$b))]>; - def ri32 : InstPTX<(outs RegI32:$d), - (ins RegI32:$a, i32imm:$b), - !strconcat(opcstr, ".s32\t$d, $a, $b"), - [(set RegI32:$d, (opnode RegI32:$a, imm:$b))]>; - def rr64 : InstPTX<(outs RegI64:$d), - (ins RegI64:$a, RegI64:$b), - !strconcat(opcstr, ".s64\t$d, $a, $b"), - [(set RegI64:$d, (opnode RegI64:$a, RegI64:$b))]>; - def ri64 : InstPTX<(outs RegI64:$d), - (ins RegI64:$a, i64imm:$b), - !strconcat(opcstr, ".s64\t$d, $a, $b"), - [(set RegI64:$d, (opnode RegI64:$a, imm:$b))]>; -} - -//===- Bitwise Logic Instructions - 3 Operand Form ------------------------===// -multiclass PTX_LOGIC<string opcstr, SDNode opnode> { - def ripreds : InstPTX<(outs RegPred:$d), - (ins RegPred:$a, i1imm:$b), - !strconcat(opcstr, ".pred\t$d, $a, $b"), - [(set RegPred:$d, (opnode RegPred:$a, imm:$b))]>; - def rrpreds : InstPTX<(outs RegPred:$d), - (ins RegPred:$a, RegPred:$b), - !strconcat(opcstr, ".pred\t$d, $a, $b"), - [(set RegPred:$d, (opnode RegPred:$a, RegPred:$b))]>; - def rr16 : InstPTX<(outs RegI16:$d), - (ins RegI16:$a, RegI16:$b), - !strconcat(opcstr, ".b16\t$d, $a, $b"), - [(set RegI16:$d, (opnode RegI16:$a, RegI16:$b))]>; - def ri16 : InstPTX<(outs RegI16:$d), - (ins RegI16:$a, i16imm:$b), - !strconcat(opcstr, ".b16\t$d, $a, $b"), - [(set RegI16:$d, (opnode RegI16:$a, imm:$b))]>; - def rr32 : InstPTX<(outs RegI32:$d), - (ins RegI32:$a, RegI32:$b), - !strconcat(opcstr, ".b32\t$d, $a, $b"), - [(set RegI32:$d, (opnode RegI32:$a, RegI32:$b))]>; - def ri32 : InstPTX<(outs RegI32:$d), - (ins RegI32:$a, i32imm:$b), - !strconcat(opcstr, ".b32\t$d, $a, $b"), - [(set RegI32:$d, (opnode RegI32:$a, imm:$b))]>; - def rr64 : InstPTX<(outs RegI64:$d), - (ins RegI64:$a, RegI64:$b), - !strconcat(opcstr, ".b64\t$d, $a, $b"), - [(set RegI64:$d, (opnode RegI64:$a, RegI64:$b))]>; - def ri64 : InstPTX<(outs RegI64:$d), - (ins RegI64:$a, i64imm:$b), - !strconcat(opcstr, ".b64\t$d, $a, $b"), - [(set RegI64:$d, (opnode RegI64:$a, imm:$b))]>; -} - -//===- Integer Shift Instructions - 3 Operand Form ------------------------===// -multiclass PTX_INT3ntnc<string opcstr, SDNode opnode> { - def rr16 : InstPTX<(outs RegI16:$d), - (ins RegI16:$a, RegI16:$b), - !strconcat(opcstr, "16\t$d, $a, $b"), - [(set RegI16:$d, (opnode RegI16:$a, RegI16:$b))]>; - def rr32 : InstPTX<(outs RegI32:$d), - (ins RegI32:$a, RegI32:$b), - !strconcat(opcstr, "32\t$d, $a, $b"), - [(set RegI32:$d, (opnode RegI32:$a, RegI32:$b))]>; - def rr64 : InstPTX<(outs RegI64:$d), - (ins RegI64:$a, RegI64:$b), - !strconcat(opcstr, "64\t$d, $a, $b"), - [(set RegI64:$d, (opnode RegI64:$a, RegI64:$b))]>; - def ri16 : InstPTX<(outs RegI16:$d), - (ins RegI16:$a, i16imm:$b), - !strconcat(opcstr, "16\t$d, $a, $b"), - [(set RegI16:$d, (opnode RegI16:$a, imm:$b))]>; - def ri32 : InstPTX<(outs RegI32:$d), - (ins RegI32:$a, i32imm:$b), - !strconcat(opcstr, "32\t$d, $a, $b"), - [(set RegI32:$d, (opnode RegI32:$a, imm:$b))]>; - def ri64 : InstPTX<(outs RegI64:$d), - (ins RegI64:$a, i64imm:$b), - !strconcat(opcstr, "64\t$d, $a, $b"), - [(set RegI64:$d, (opnode RegI64:$a, imm:$b))]>; - def ir16 : InstPTX<(outs RegI16:$d), - (ins i16imm:$a, 
RegI16:$b), - !strconcat(opcstr, "16\t$d, $a, $b"), - [(set RegI16:$d, (opnode imm:$a, RegI16:$b))]>; - def ir32 : InstPTX<(outs RegI32:$d), - (ins i32imm:$a, RegI32:$b), - !strconcat(opcstr, "32\t$d, $a, $b"), - [(set RegI32:$d, (opnode imm:$a, RegI32:$b))]>; - def ir64 : InstPTX<(outs RegI64:$d), - (ins i64imm:$a, RegI64:$b), - !strconcat(opcstr, "64\t$d, $a, $b"), - [(set RegI64:$d, (opnode imm:$a, RegI64:$b))]>; -} - -//===- Set Predicate Instructions (Int) - 3/4 Operand Forms ---------------===// -multiclass PTX_SETP_I<RegisterClass RC, string regclsname, Operand immcls, - CondCode cmp, string cmpstr> { - // TODO support 5-operand format: p|q, a, b, c - - def rr - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b), - !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"), - [(set RegPred:$p, (setcc RC:$a, RC:$b, cmp))]>; - def ri - : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b), - !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"), - [(set RegPred:$p, (setcc RC:$a, imm:$b, cmp))]>; - - def rr_and_r - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".and.", regclsname, - "\t$p, $a, $b, $c"), - [(set RegPred:$p, (and (setcc RC:$a, RC:$b, cmp), RegPred:$c))]>; - def ri_and_r - : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".and.", regclsname, - "\t$p, $a, $b, $c"), - [(set RegPred:$p, (and (setcc RC:$a, imm:$b, cmp), - RegPred:$c))]>; - def rr_or_r - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".or.", regclsname, - "\t$p, $a, $b, $c"), - [(set RegPred:$p, (or (setcc RC:$a, RC:$b, cmp), RegPred:$c))]>; - def ri_or_r - : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".or.", regclsname, - "\t$p, $a, $b, $c"), - [(set RegPred:$p, (or (setcc RC:$a, imm:$b, cmp), RegPred:$c))]>; - def rr_xor_r - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".xor.", regclsname, - "\t$p, $a, $b, $c"), - [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, cmp), RegPred:$c))]>; - def ri_xor_r - : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".xor.", regclsname, - "\t$p, $a, $b, $c"), - [(set RegPred:$p, (xor (setcc RC:$a, imm:$b, cmp), - RegPred:$c))]>; - - def rr_and_not_r - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".and.", regclsname, - "\t$p, $a, $b, !$c"), - [(set RegPred:$p, (and (setcc RC:$a, RC:$b, cmp), - (not RegPred:$c)))]>; - def ri_and_not_r - : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".and.", regclsname, - "\t$p, $a, $b, !$c"), - [(set RegPred:$p, (and (setcc RC:$a, imm:$b, cmp), - (not RegPred:$c)))]>; - def rr_or_not_r - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".or.", regclsname, - "\t$p, $a, $b, !$c"), - [(set RegPred:$p, (or (setcc RC:$a, RC:$b, cmp), - (not RegPred:$c)))]>; - def ri_or_not_r - : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".or.", regclsname, - "\t$p, $a, $b, !$c"), - [(set RegPred:$p, (or (setcc RC:$a, imm:$b, cmp), - (not RegPred:$c)))]>; - def rr_xor_not_r - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".xor.", regclsname, - "\t$p, $a, $b, !$c"), - [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, cmp), - (not RegPred:$c)))]>; - def ri_xor_not_r - : 
InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".xor.", regclsname, - "\t$p, $a, $b, !$c"), - [(set RegPred:$p, (xor (setcc RC:$a, imm:$b, cmp), - (not RegPred:$c)))]>; -} - -//===- Set Predicate Instructions (FP) - 3/4 Operand Form -----------------===// -multiclass PTX_SETP_FP<RegisterClass RC, string regclsname, Operand immcls, - CondCode ucmp, CondCode ocmp, string cmpstr> { - // TODO support 5-operand format: p|q, a, b, c - - def rr_u - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b), - !strconcat("setp.", cmpstr, "u.", regclsname, "\t$p, $a, $b"), - [(set RegPred:$p, (setcc RC:$a, RC:$b, ucmp))]>; - def rr_o - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b), - !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"), - [(set RegPred:$p, (setcc RC:$a, RC:$b, ocmp))]>; - - def ri_u - : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b), - !strconcat("setp.", cmpstr, "u.", regclsname, "\t$p, $a, $b"), - [(set RegPred:$p, (setcc RC:$a, fpimm:$b, ucmp))]>; - def ri_o - : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b), - !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"), - [(set RegPred:$p, (setcc RC:$a, fpimm:$b, ocmp))]>; - - def rr_and_r_u - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, "u.and.", regclsname, - "\t$p, $a, $b, $c"), - [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ucmp), - RegPred:$c))]>; - def rr_and_r_o - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".and.", regclsname, - "\t$p, $a, $b, $c"), - [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ocmp), - RegPred:$c))]>; - - def rr_or_r_u - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, "u.or.", regclsname, - "\t$p, $a, $b, $c"), - [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ucmp), RegPred:$c))]>; - def rr_or_r_o - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".or.", regclsname, - "\t$p, $a, $b, $c"), - [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ocmp), RegPred:$c))]>; - - def rr_xor_r_u - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, "u.xor.", regclsname, - "\t$p, $a, $b, $c"), - [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ucmp), - RegPred:$c))]>; - def rr_xor_r_o - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".xor.", regclsname, - "\t$p, $a, $b, $c"), - [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ocmp), - RegPred:$c))]>; - - def rr_and_not_r_u - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, "u.and.", regclsname, - "\t$p, $a, $b, !$c"), - [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ucmp), - (not RegPred:$c)))]>; - def rr_and_not_r_o - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".and.", regclsname, - "\t$p, $a, $b, !$c"), - [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ocmp), - (not RegPred:$c)))]>; - - def rr_or_not_r_u - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, "u.or.", regclsname, - "\t$p, $a, $b, !$c"), - [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ucmp), - (not RegPred:$c)))]>; - def rr_or_not_r_o - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".or.", regclsname, - "\t$p, $a, $b, !$c"), - [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ocmp), - (not RegPred:$c)))]>; - - def rr_xor_not_r_u - : InstPTX<(outs RegPred:$p), 
(ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, "u.xor.", regclsname, - "\t$p, $a, $b, !$c"), - [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ucmp), - (not RegPred:$c)))]>; - def rr_xor_not_r_o - : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c), - !strconcat("setp.", cmpstr, ".xor.", regclsname, - "\t$p, $a, $b, !$c"), - [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ocmp), - (not RegPred:$c)))]>; -} - -//===- Select Predicate Instructions - 4 Operand Form ---------------------===// -multiclass PTX_SELP<RegisterClass RC, string regclsname, Operand immcls, - SDNode immnode> { - def rr - : InstPTX<(outs RC:$r), (ins RegPred:$a, RC:$b, RC:$c), - !strconcat("selp.", regclsname, "\t$r, $b, $c, $a"), - [(set RC:$r, (select RegPred:$a, RC:$b, RC:$c))]>; - def ri - : InstPTX<(outs RC:$r), (ins RegPred:$a, RC:$b, immcls:$c), - !strconcat("selp.", regclsname, "\t$r, $b, $c, $a"), - [(set RC:$r, (select RegPred:$a, RC:$b, immnode:$c))]>; - def ii - : InstPTX<(outs RC:$r), (ins RegPred:$a, immcls:$b, immcls:$c), - !strconcat("selp.", regclsname, "\t$r, $b, $c, $a"), - [(set RC:$r, (select RegPred:$a, immnode:$b, immnode:$c))]>; -} - - - -//===----------------------------------------------------------------------===// -// Instructions -//===----------------------------------------------------------------------===// - -///===- Integer Arithmetic Instructions -----------------------------------===// - -defm ADD : PTX_INT3<"add", add>; -defm SUB : PTX_INT3<"sub", sub>; -defm MUL : PTX_INT3<"mul.lo", mul>; // FIXME: Allow 32x32 -> 64 multiplies -defm DIV : PTX_INT3<"div", udiv>; -defm SDIV : PTX_INT3_SIGNED<"div", sdiv>; -defm REM : PTX_INT3<"rem", urem>; - -///===- Floating-Point Arithmetic Instructions ----------------------------===// - -// FNEG -defm FNEG : PTX_FLOAT_2OP<"neg">; - -// Standard Binary Operations -defm FADD : PTX_FLOAT_3OP<"add">; -defm FSUB : PTX_FLOAT_3OP<"sub">; -defm FMUL : PTX_FLOAT_3OP<"mul">; -defm FDIV : PTX_FLOAT_3OP<"div">; - -// Multi-operation hybrid instructions -defm FMAD : PTX_FLOAT_4OP<"mad">, Requires<[SupportsFMA]>; - - -///===- Floating-Point Intrinsic Instructions -----------------------------===// - -// SQRT -def FSQRTrr32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegF32:$a), - "sqrt$r.f32\t$d, $a", []>; -def FSQRTri32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, f32imm:$a), - "sqrt$r.f32\t$d, $a", []>; -def FSQRTrr64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegF64:$a), - "sqrt$r.f64\t$d, $a", []>; -def FSQRTri64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, f64imm:$a), - "sqrt$r.f64\t$d, $a", []>; - -// SIN -def FSINrr32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegF32:$a), - "sin$r.f32\t$d, $a", []>; -def FSINri32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, f32imm:$a), - "sin$r.f32\t$d, $a", []>; -def FSINrr64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegF64:$a), - "sin$r.f64\t$d, $a", []>; -def FSINri64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, f64imm:$a), - "sin$r.f64\t$d, $a", []>; - -// COS -def FCOSrr32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegF32:$a), - "cos$r.f32\t$d, $a", []>; -def FCOSri32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, f32imm:$a), - "cos$r.f32\t$d, $a", []>; -def FCOSrr64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegF64:$a), - "cos$r.f64\t$d, $a", []>; -def FCOSri64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, f64imm:$a), - "cos$r.f64\t$d, $a", []>; - - - - -///===- Comparison and Selection Instructions -----------------------------===// - -// .setp - -// Compare u16 - -defm SETPEQu16 : 
PTX_SETP_I<RegI16, "u16", i16imm, SETEQ, "eq">; -defm SETPNEu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETNE, "ne">; -defm SETPLTu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETULT, "lt">; -defm SETPLEu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETULE, "le">; -defm SETPGTu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETUGT, "gt">; -defm SETPGEu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETUGE, "ge">; -defm SETPLTs16 : PTX_SETP_I<RegI16, "s16", i16imm, SETLT, "lt">; -defm SETPLEs16 : PTX_SETP_I<RegI16, "s16", i16imm, SETLE, "le">; -defm SETPGTs16 : PTX_SETP_I<RegI16, "s16", i16imm, SETGT, "gt">; -defm SETPGEs16 : PTX_SETP_I<RegI16, "s16", i16imm, SETGE, "ge">; - -// Compare u32 - -defm SETPEQu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETEQ, "eq">; -defm SETPNEu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETNE, "ne">; -defm SETPLTu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETULT, "lt">; -defm SETPLEu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETULE, "le">; -defm SETPGTu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETUGT, "gt">; -defm SETPGEu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETUGE, "ge">; -defm SETPLTs32 : PTX_SETP_I<RegI32, "s32", i32imm, SETLT, "lt">; -defm SETPLEs32 : PTX_SETP_I<RegI32, "s32", i32imm, SETLE, "le">; -defm SETPGTs32 : PTX_SETP_I<RegI32, "s32", i32imm, SETGT, "gt">; -defm SETPGEs32 : PTX_SETP_I<RegI32, "s32", i32imm, SETGE, "ge">; - -// Compare u64 - -defm SETPEQu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETEQ, "eq">; -defm SETPNEu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETNE, "ne">; -defm SETPLTu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETULT, "lt">; -defm SETPLEu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETULE, "le">; -defm SETPGTu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETUGT, "gt">; -defm SETPGEu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETUGE, "ge">; -defm SETPLTs64 : PTX_SETP_I<RegI64, "s64", i64imm, SETLT, "lt">; -defm SETPLEs64 : PTX_SETP_I<RegI64, "s64", i64imm, SETLE, "le">; -defm SETPGTs64 : PTX_SETP_I<RegI64, "s64", i64imm, SETGT, "gt">; -defm SETPGEs64 : PTX_SETP_I<RegI64, "s64", i64imm, SETGE, "ge">; - -// Compare f32 - -defm SETPEQf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETUEQ, SETOEQ, "eq">; -defm SETPNEf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETUNE, SETONE, "ne">; -defm SETPLTf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETULT, SETOLT, "lt">; -defm SETPLEf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETULE, SETOLE, "le">; -defm SETPGTf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETUGT, SETOGT, "gt">; -defm SETPGEf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETUGE, SETOGE, "ge">; - -// Compare f64 - -defm SETPEQf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETUEQ, SETOEQ, "eq">; -defm SETPNEf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETUNE, SETONE, "ne">; -defm SETPLTf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETULT, SETOLT, "lt">; -defm SETPLEf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETULE, SETOLE, "le">; -defm SETPGTf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETUGT, SETOGT, "gt">; -defm SETPGEf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETUGE, SETOGE, "ge">; - -// .selp - -defm SELPi16 : PTX_SELP<RegI16, "u16", i16imm, imm>; -defm SELPi32 : PTX_SELP<RegI32, "u32", i32imm, imm>; -defm SELPi64 : PTX_SELP<RegI64, "u64", i64imm, imm>; -defm SELPf32 : PTX_SELP<RegF32, "f32", f32imm, fpimm>; -defm SELPf64 : PTX_SELP<RegF64, "f64", f64imm, fpimm>; - -///===- Logic and Shift Instructions --------------------------------------===// - -defm SHL : PTX_INT3ntnc<"shl.b", PTXshl>; -defm SRL : PTX_INT3ntnc<"shr.u", PTXsrl>; -defm SRA : PTX_INT3ntnc<"shr.s", PTXsra>; - -defm AND : PTX_LOGIC<"and", and>; -defm OR : PTX_LOGIC<"or", or>; -defm XOR : 
PTX_LOGIC<"xor", xor>; - -///===- Data Movement and Conversion Instructions -------------------------===// - -// any_extend -// Implement the anyext instruction in terms of the PTX cvt instructions. -//def : Pat<(i32 (anyext RegI16:$a)), (CVT_u32_u16 RegI16:$a)>; -//def : Pat<(i64 (anyext RegI16:$a)), (CVT_u64_u16 RegI16:$a)>; -//def : Pat<(i64 (anyext RegI32:$a)), (CVT_u64_u32 RegI32:$a)>; - -// bitconvert -// These instructions implement the bit-wise conversion between integer and -// floating-point types. -def MOVi32f32 - : InstPTX<(outs RegI32:$d), (ins RegF32:$a), "mov.b32\t$d, $a", []>; -def MOVf32i32 - : InstPTX<(outs RegF32:$d), (ins RegI32:$a), "mov.b32\t$d, $a", []>; -def MOVi64f64 - : InstPTX<(outs RegI64:$d), (ins RegF64:$a), "mov.b64\t$d, $a", []>; -def MOVf64i64 - : InstPTX<(outs RegF64:$d), (ins RegI64:$a), "mov.b64\t$d, $a", []>; - -let neverHasSideEffects = 1 in { - def MOVPREDrr - : InstPTX<(outs RegPred:$d), (ins RegPred:$a), "mov.pred\t$d, $a", []>; - def MOVU16rr - : InstPTX<(outs RegI16:$d), (ins RegI16:$a), "mov.u16\t$d, $a", []>; - def MOVU32rr - : InstPTX<(outs RegI32:$d), (ins RegI32:$a), "mov.u32\t$d, $a", []>; - def MOVU64rr - : InstPTX<(outs RegI64:$d), (ins RegI64:$a), "mov.u64\t$d, $a", []>; - def MOVF32rr - : InstPTX<(outs RegF32:$d), (ins RegF32:$a), "mov.f32\t$d, $a", []>; - def MOVF64rr - : InstPTX<(outs RegF64:$d), (ins RegF64:$a), "mov.f64\t$d, $a", []>; -} - -let isReMaterializable = 1, isAsCheapAsAMove = 1 in { - def MOVPREDri - : InstPTX<(outs RegPred:$d), (ins i1imm:$a), "mov.pred\t$d, $a", - [(set RegPred:$d, imm:$a)]>; - def MOVU16ri - : InstPTX<(outs RegI16:$d), (ins i16imm:$a), "mov.u16\t$d, $a", - [(set RegI16:$d, imm:$a)]>; - def MOVU32ri - : InstPTX<(outs RegI32:$d), (ins i32imm:$a), "mov.u32\t$d, $a", - [(set RegI32:$d, imm:$a)]>; - def MOVU64ri - : InstPTX<(outs RegI64:$d), (ins i64imm:$a), "mov.u64\t$d, $a", - [(set RegI64:$d, imm:$a)]>; - def MOVF32ri - : InstPTX<(outs RegF32:$d), (ins f32imm:$a), "mov.f32\t$d, $a", - [(set RegF32:$d, fpimm:$a)]>; - def MOVF64ri - : InstPTX<(outs RegF64:$d), (ins f64imm:$a), "mov.f64\t$d, $a", - [(set RegF64:$d, fpimm:$a)]>; -} - -let isReMaterializable = 1, isAsCheapAsAMove = 1 in { - def MOVaddr32 - : InstPTX<(outs RegI32:$d), (ins i32imm:$a), "mov.u32\t$d, $a", - [(set RegI32:$d, (PTXcopyaddress tglobaladdr:$a))]>; - def MOVaddr64 - : InstPTX<(outs RegI64:$d), (ins i64imm:$a), "mov.u64\t$d, $a", - [(set RegI64:$d, (PTXcopyaddress tglobaladdr:$a))]>; - def MOVframe32 - : InstPTX<(outs RegI32:$d), (ins i32imm:$a), "cvta.local.u32\t$d, $a", - [(set RegI32:$d, (PTXcopyaddress frameindex:$a))]>; - def MOVframe64 - : InstPTX<(outs RegI64:$d), (ins i64imm:$a), "cvta.local.u64\t$d, $a", - [(set RegI64:$d, (PTXcopyaddress frameindex:$a))]>; -} - -// PTX cvt instructions -// Note all of these may actually be used, we just define all possible patterns -// here (that make sense). -// FIXME: Can we collapse this somehow into a multiclass def? 
- -// To i16 -def CVTu16u32 - : InstPTX<(outs RegI16:$d), (ins RegI32:$a), "cvt.u16.u32\t$d, $a", []>; -def CVTu16u64 - : InstPTX<(outs RegI16:$d), (ins RegI64:$a), "cvt.u16.u64\t$d, $a", []>; -def CVTu16f32 - : InstPTX<(outs RegI16:$d), (ins RndMode:$r, RegF32:$a), - "cvt$r.u16.f32\t$d, $a", []>; -def CVTs16f32 - : InstPTX<(outs RegI16:$d), (ins RndMode:$r, RegF32:$a), - "cvt$r.s16.f32\t$d, $a", []>; -def CVTu16f64 - : InstPTX<(outs RegI16:$d), (ins RndMode:$r, RegF64:$a), - "cvt$r.u16.f64\t$d, $a", []>; -def CVTs16f64 - : InstPTX<(outs RegI16:$d), (ins RndMode:$r, RegF64:$a), - "cvt$r.s16.f64\t$d, $a", []>; - -// To i32 -def CVTu32u16 - : InstPTX<(outs RegI32:$d), (ins RegI16:$a), "cvt.u32.u16\t$d, $a", []>; -def CVTs32s16 - : InstPTX<(outs RegI32:$d), (ins RegI16:$a), "cvt.s32.s16\t$d, $a", []>; -def CVTu32u64 - : InstPTX<(outs RegI32:$d), (ins RegI64:$a), "cvt.u32.u64\t$d, $a", []>; -def CVTu32f32 - : InstPTX<(outs RegI32:$d), (ins RndMode:$r, RegF32:$a), - "cvt$r.u32.f32\t$d, $a", []>; -def CVTs32f32 - : InstPTX<(outs RegI32:$d), (ins RndMode:$r, RegF32:$a), - "cvt$r.s32.f32\t$d, $a", []>; -def CVTu32f64 - : InstPTX<(outs RegI32:$d), (ins RndMode:$r, RegF64:$a), - "cvt$r.u32.f64\t$d, $a", []>; -def CVTs32f64 - : InstPTX<(outs RegI32:$d), (ins RndMode:$r, RegF64:$a), - "cvt$r.s32.f64\t$d, $a", []>; - -// To i64 -def CVTu64u16 - : InstPTX<(outs RegI64:$d), (ins RegI16:$a), "cvt.u64.u16\t$d, $a", []>; -def CVTs64s16 - : InstPTX<(outs RegI64:$d), (ins RegI16:$a), "cvt.s64.s16\t$d, $a", []>; -def CVTu64u32 - : InstPTX<(outs RegI64:$d), (ins RegI32:$a), "cvt.u64.u32\t$d, $a", []>; -def CVTs64s32 - : InstPTX<(outs RegI64:$d), (ins RegI32:$a), "cvt.s64.s32\t$d, $a", []>; -def CVTu64f32 - : InstPTX<(outs RegI64:$d), (ins RndMode:$r, RegF32:$a), - "cvt$r.u64.f32\t$d, $a", []>; -def CVTs64f32 - : InstPTX<(outs RegI64:$d), (ins RndMode:$r, RegF32:$a), - "cvt$r.s64.f32\t$d, $a", []>; -def CVTu64f64 - : InstPTX<(outs RegI64:$d), (ins RndMode:$r, RegF64:$a), - "cvt$r.u64.f64\t$d, $a", []>; -def CVTs64f64 - : InstPTX<(outs RegI64:$d), (ins RndMode:$r, RegF64:$a), - "cvt$r.s64.f64\t$d, $a", []>; - -// To f32 -def CVTf32u16 - : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI16:$a), - "cvt$r.f32.u16\t$d, $a", []>; -def CVTf32s16 - : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI16:$a), - "cvt$r.f32.s16\t$d, $a", []>; -def CVTf32u32 - : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI32:$a), - "cvt$r.f32.u32\t$d, $a", []>; -def CVTf32s32 - : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI32:$a), - "cvt$r.f32.s32\t$d, $a", []>; -def CVTf32u64 - : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI64:$a), - "cvt$r.f32.u64\t$d, $a", []>; -def CVTf32s64 - : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI64:$a), - "cvt$r.f32.s64\t$d, $a", []>; -def CVTf32f64 - : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegF64:$a), - "cvt$r.f32.f64\t$d, $a", []>; - -// To f64 -def CVTf64u16 - : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI16:$a), - "cvt$r.f64.u16\t$d, $a", []>; -def CVTf64s16 - : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI16:$a), - "cvt$r.f64.s16\t$d, $a", []>; -def CVTf64u32 - : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI32:$a), - "cvt$r.f64.u32\t$d, $a", []>; -def CVTf64s32 - : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI32:$a), - "cvt$r.f64.s32\t$d, $a", []>; -def CVTf64u64 - : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI64:$a), - "cvt$r.f64.u64\t$d, $a", []>; -def CVTf64s64 - : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI64:$a), - "cvt$r.f64.s64\t$d, $a", []>; -def CVTf64f32 - : 
InstPTX<(outs RegF64:$d), (ins RegF32:$a), "cvt.f64.f32\t$d, $a", []>; - - ///===- Control Flow Instructions -----------------------------------------===// - -let isBranch = 1, isTerminator = 1, isBarrier = 1 in { - def BRAd - : InstPTX<(outs), (ins brtarget:$d), "bra\t$d", [(br bb:$d)]>; -} - -let isBranch = 1, isTerminator = 1 in { - // FIXME: The pattern part is blank because I cannot (or do not yet know - // how to) use the first operand of PredicateOperand (a RegPred register) here - // When this is revisited, make sure to also look at LowerSETCC and try to - // fold it into negated predicates, if possible. - def BRAdp - : InstPTX<(outs), (ins brtarget:$d), "bra\t$d", - [/*(brcond pred:$_p, bb:$d)*/]>; -} - -let isReturn = 1, isTerminator = 1, isBarrier = 1 in { - def EXIT : InstPTX<(outs), (ins), "exit", [(PTXexit)]>; - def RET : InstPTX<(outs), (ins), "ret", [(PTXret)]>; -} - -let hasSideEffects = 1 in { - def CALL : InstPTX<(outs), (ins), "call", [(PTXcall)]>; -} - -///===- Parameter Passing Pseudo-Instructions -----------------------------===// - -def READPARAMPRED : InstPTX<(outs RegPred:$a), (ins i32imm:$b), - "mov.pred\t$a, %arg$b", []>; -def READPARAMI16 : InstPTX<(outs RegI16:$a), (ins i32imm:$b), - "mov.b16\t$a, %arg$b", []>; -def READPARAMI32 : InstPTX<(outs RegI32:$a), (ins i32imm:$b), - "mov.b32\t$a, %arg$b", []>; -def READPARAMI64 : InstPTX<(outs RegI64:$a), (ins i32imm:$b), - "mov.b64\t$a, %arg$b", []>; -def READPARAMF32 : InstPTX<(outs RegF32:$a), (ins i32imm:$b), - "mov.f32\t$a, %arg$b", []>; -def READPARAMF64 : InstPTX<(outs RegF64:$a), (ins i32imm:$b), - "mov.f64\t$a, %arg$b", []>; - -def WRITEPARAMPRED : InstPTX<(outs), (ins RegPred:$a), "//w", []>; -def WRITEPARAMI16 : InstPTX<(outs), (ins RegI16:$a), "//w", []>; -def WRITEPARAMI32 : InstPTX<(outs), (ins RegI32:$a), "//w", []>; -def WRITEPARAMI64 : InstPTX<(outs), (ins RegI64:$a), "//w", []>; -def WRITEPARAMF32 : InstPTX<(outs), (ins RegF32:$a), "//w", []>; -def WRITEPARAMF64 : InstPTX<(outs), (ins RegF64:$a), "//w", []>; - - -//===----------------------------------------------------------------------===// -// Instruction Selection Patterns -//===----------------------------------------------------------------------===// - -// FADD -def : Pat<(f32 (fadd RegF32:$a, RegF32:$b)), - (FADDrr32 RndDefault, RegF32:$a, RegF32:$b)>; -def : Pat<(f32 (fadd RegF32:$a, fpimm:$b)), - (FADDri32 RndDefault, RegF32:$a, fpimm:$b)>; -def : Pat<(f64 (fadd RegF64:$a, RegF64:$b)), - (FADDrr64 RndDefault, RegF64:$a, RegF64:$b)>; -def : Pat<(f64 (fadd RegF64:$a, fpimm:$b)), - (FADDri64 RndDefault, RegF64:$a, fpimm:$b)>; - -// FSUB -def : Pat<(f32 (fsub RegF32:$a, RegF32:$b)), - (FSUBrr32 RndDefault, RegF32:$a, RegF32:$b)>; -def : Pat<(f32 (fsub RegF32:$a, fpimm:$b)), - (FSUBri32 RndDefault, RegF32:$a, fpimm:$b)>; -def : Pat<(f64 (fsub RegF64:$a, RegF64:$b)), - (FSUBrr64 RndDefault, RegF64:$a, RegF64:$b)>; -def : Pat<(f64 (fsub RegF64:$a, fpimm:$b)), - (FSUBri64 RndDefault, RegF64:$a, fpimm:$b)>; - -// FMUL -def : Pat<(f32 (fmul RegF32:$a, RegF32:$b)), - (FMULrr32 RndDefault, RegF32:$a, RegF32:$b)>; -def : Pat<(f32 (fmul RegF32:$a, fpimm:$b)), - (FMULri32 RndDefault, RegF32:$a, fpimm:$b)>; -def : Pat<(f64 (fmul RegF64:$a, RegF64:$b)), - (FMULrr64 RndDefault, RegF64:$a, RegF64:$b)>; -def : Pat<(f64 (fmul RegF64:$a, fpimm:$b)), - (FMULri64 RndDefault, RegF64:$a, fpimm:$b)>; - -// FDIV -def : Pat<(f32 (fdiv RegF32:$a, RegF32:$b)), - (FDIVrr32 RndDefault, RegF32:$a, RegF32:$b)>; -def : Pat<(f32 (fdiv RegF32:$a, fpimm:$b)), - (FDIVri32 
RndDefault, RegF32:$a, fpimm:$b)>; -def : Pat<(f64 (fdiv RegF64:$a, RegF64:$b)), - (FDIVrr64 RndDefault, RegF64:$a, RegF64:$b)>; -def : Pat<(f64 (fdiv RegF64:$a, fpimm:$b)), - (FDIVri64 RndDefault, RegF64:$a, fpimm:$b)>; - -// FMUL+FADD -def : Pat<(f32 (fadd (fmul RegF32:$a, RegF32:$b), RegF32:$c)), - (FMADrrr32 RndDefault, RegF32:$a, RegF32:$b, RegF32:$c)>, - Requires<[SupportsFMA]>; -def : Pat<(f32 (fadd (fmul RegF32:$a, RegF32:$b), fpimm:$c)), - (FMADrri32 RndDefault, RegF32:$a, RegF32:$b, fpimm:$c)>, - Requires<[SupportsFMA]>; -def : Pat<(f32 (fadd (fmul RegF32:$a, fpimm:$b), fpimm:$c)), - (FMADrrr32 RndDefault, RegF32:$a, fpimm:$b, fpimm:$c)>, - Requires<[SupportsFMA]>; -def : Pat<(f32 (fadd (fmul RegF32:$a, RegF32:$b), fpimm:$c)), - (FMADrri32 RndDefault, RegF32:$a, RegF32:$b, fpimm:$c)>, - Requires<[SupportsFMA]>; -def : Pat<(f64 (fadd (fmul RegF64:$a, RegF64:$b), RegF64:$c)), - (FMADrrr64 RndDefault, RegF64:$a, RegF64:$b, RegF64:$c)>, - Requires<[SupportsFMA]>; -def : Pat<(f64 (fadd (fmul RegF64:$a, RegF64:$b), fpimm:$c)), - (FMADrri64 RndDefault, RegF64:$a, RegF64:$b, fpimm:$c)>, - Requires<[SupportsFMA]>; -def : Pat<(f64 (fadd (fmul RegF64:$a, fpimm:$b), fpimm:$c)), - (FMADrri64 RndDefault, RegF64:$a, fpimm:$b, fpimm:$c)>, - Requires<[SupportsFMA]>; - -// FNEG -def : Pat<(f32 (fneg RegF32:$a)), (FNEGrr32 RndDefault, RegF32:$a)>; -def : Pat<(f32 (fneg fpimm:$a)), (FNEGri32 RndDefault, fpimm:$a)>; -def : Pat<(f64 (fneg RegF64:$a)), (FNEGrr64 RndDefault, RegF64:$a)>; -def : Pat<(f64 (fneg fpimm:$a)), (FNEGri64 RndDefault, fpimm:$a)>; - -// FSQRT -def : Pat<(f32 (fsqrt RegF32:$a)), (FSQRTrr32 RndDefault, RegF32:$a)>; -def : Pat<(f32 (fsqrt fpimm:$a)), (FSQRTri32 RndDefault, fpimm:$a)>; -def : Pat<(f64 (fsqrt RegF64:$a)), (FSQRTrr64 RndDefault, RegF64:$a)>; -def : Pat<(f64 (fsqrt fpimm:$a)), (FSQRTri64 RndDefault, fpimm:$a)>; - -// FSIN -def : Pat<(f32 (fsin RegF32:$a)), (FSINrr32 RndDefault, RegF32:$a)>; -def : Pat<(f32 (fsin fpimm:$a)), (FSINri32 RndDefault, fpimm:$a)>; -def : Pat<(f64 (fsin RegF64:$a)), (FSINrr64 RndDefault, RegF64:$a)>; -def : Pat<(f64 (fsin fpimm:$a)), (FSINri64 RndDefault, fpimm:$a)>; - -// FCOS -def : Pat<(f32 (fcos RegF32:$a)), (FCOSrr32 RndDefault, RegF32:$a)>; -def : Pat<(f32 (fcos fpimm:$a)), (FCOSri32 RndDefault, fpimm:$a)>; -def : Pat<(f64 (fcos RegF64:$a)), (FCOSrr64 RndDefault, RegF64:$a)>; -def : Pat<(f64 (fcos fpimm:$a)), (FCOSri64 RndDefault, fpimm:$a)>; - -// Type conversion notes: -// - PTX does not directly support converting a predicate to a value, so we -// use a select instruction to select either 0 or 1 (integer or fp) based -// on the truth value of the predicate. -// - PTX does not directly support converting to a predicate type, so we fake it -// by performing a greater-than test between the value and zero. This follows -// the C convention that any non-zero value is equivalent to 'true'. 
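The two idioms condensed, with the PTX each pattern expands to (per the setp and selp asm strings defined earlier) noted alongside; both entries also appear verbatim in the conversion lists that follow:

// value -> pred: compare against zero, e.g. an i32 truncation to i1,
// which emits "setp.gt.u32 $p, $a, 0".
def : Pat<(i1 (trunc RegI32:$a)), (SETPGTu32ri RegI32:$a, 0)>;
// pred -> value: select between two immediates, e.g. an i1 zext to i32,
// which emits "selp.u32 $r, 1, 0, $a".
def : Pat<(i32 (zext RegPred:$a)), (SELPi32ii RegPred:$a, 1, 0)>;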
- -// Conversion to pred -def : Pat<(i1 (trunc RegI16:$a)), (SETPGTu16ri RegI16:$a, 0)>; -def : Pat<(i1 (trunc RegI32:$a)), (SETPGTu32ri RegI32:$a, 0)>; -def : Pat<(i1 (trunc RegI64:$a)), (SETPGTu64ri RegI64:$a, 0)>; -def : Pat<(i1 (fp_to_uint RegF32:$a)), (SETPGTu32ri (MOVi32f32 RegF32:$a), 0)>; -def : Pat<(i1 (fp_to_uint RegF64:$a)), (SETPGTu64ri (MOVi64f64 RegF64:$a), 0)>; - -// Conversion to u16 -def : Pat<(i16 (anyext RegPred:$a)), (SELPi16ii RegPred:$a, 1, 0)>; -def : Pat<(i16 (sext RegPred:$a)), (SELPi16ii RegPred:$a, 0xFFFF, 0)>; -def : Pat<(i16 (zext RegPred:$a)), (SELPi16ii RegPred:$a, 1, 0)>; -def : Pat<(i16 (trunc RegI32:$a)), (CVTu16u32 RegI32:$a)>; -def : Pat<(i16 (trunc RegI64:$a)), (CVTu16u64 RegI64:$a)>; -def : Pat<(i16 (fp_to_uint RegF32:$a)), (CVTu16f32 RndDefault, RegF32:$a)>; -def : Pat<(i16 (fp_to_sint RegF32:$a)), (CVTs16f32 RndDefault, RegF32:$a)>; -def : Pat<(i16 (fp_to_uint RegF64:$a)), (CVTu16f64 RndDefault, RegF64:$a)>; -def : Pat<(i16 (fp_to_sint RegF64:$a)), (CVTs16f64 RndDefault, RegF64:$a)>; - -// Conversion to u32 -def : Pat<(i32 (anyext RegPred:$a)), (SELPi32ii RegPred:$a, 1, 0)>; -def : Pat<(i32 (sext RegPred:$a)), (SELPi32ii RegPred:$a, 0xFFFFFFFF, 0)>; -def : Pat<(i32 (zext RegPred:$a)), (SELPi32ii RegPred:$a, 1, 0)>; -def : Pat<(i32 (anyext RegI16:$a)), (CVTu32u16 RegI16:$a)>; -def : Pat<(i32 (sext RegI16:$a)), (CVTs32s16 RegI16:$a)>; -def : Pat<(i32 (zext RegI16:$a)), (CVTu32u16 RegI16:$a)>; -def : Pat<(i32 (trunc RegI64:$a)), (CVTu32u64 RegI64:$a)>; -def : Pat<(i32 (fp_to_uint RegF32:$a)), (CVTu32f32 RndDefault, RegF32:$a)>; -def : Pat<(i32 (fp_to_sint RegF32:$a)), (CVTs32f32 RndDefault, RegF32:$a)>; -def : Pat<(i32 (fp_to_uint RegF64:$a)), (CVTu32f64 RndDefault, RegF64:$a)>; -def : Pat<(i32 (fp_to_sint RegF64:$a)), (CVTs32f64 RndDefault, RegF64:$a)>; -def : Pat<(i32 (bitconvert RegF32:$a)), (MOVi32f32 RegF32:$a)>; - -// Conversion to u64 -def : Pat<(i64 (anyext RegPred:$a)), (SELPi64ii RegPred:$a, 1, 0)>; -def : Pat<(i64 (sext RegPred:$a)), (SELPi64ii RegPred:$a, - 0xFFFFFFFFFFFFFFFF, 0)>; -def : Pat<(i64 (zext RegPred:$a)), (SELPi64ii RegPred:$a, 1, 0)>; -def : Pat<(i64 (anyext RegI16:$a)), (CVTu64u16 RegI16:$a)>; -def : Pat<(i64 (sext RegI16:$a)), (CVTs64s16 RegI16:$a)>; -def : Pat<(i64 (zext RegI16:$a)), (CVTu64u16 RegI16:$a)>; -def : Pat<(i64 (anyext RegI32:$a)), (CVTu64u32 RegI32:$a)>; -def : Pat<(i64 (sext RegI32:$a)), (CVTs64s32 RegI32:$a)>; -def : Pat<(i64 (zext RegI32:$a)), (CVTu64u32 RegI32:$a)>; -def : Pat<(i64 (fp_to_uint RegF32:$a)), (CVTu64f32 RndDefault, RegF32:$a)>; -def : Pat<(i64 (fp_to_sint RegF32:$a)), (CVTs64f32 RndDefault, RegF32:$a)>; -def : Pat<(i64 (fp_to_uint RegF64:$a)), (CVTu64f64 RndDefault, RegF64:$a)>; -def : Pat<(i64 (fp_to_sint RegF64:$a)), (CVTs64f64 RndDefault, RegF64:$a)>; -def : Pat<(i64 (bitconvert RegF64:$a)), (MOVi64f64 RegF64:$a)>; - -// Conversion to f32 -def : Pat<(f32 (uint_to_fp RegPred:$a)), (SELPf32rr RegPred:$a, - (MOVf32i32 0x3F800000), (MOVf32i32 0))>; -def : Pat<(f32 (uint_to_fp RegI16:$a)), (CVTf32u16 RndDefault, RegI16:$a)>; -def : Pat<(f32 (sint_to_fp RegI16:$a)), (CVTf32s16 RndDefault, RegI16:$a)>; -def : Pat<(f32 (uint_to_fp RegI32:$a)), (CVTf32u32 RndDefault, RegI32:$a)>; -def : Pat<(f32 (sint_to_fp RegI32:$a)), (CVTf32s32 RndDefault, RegI32:$a)>; -def : Pat<(f32 (uint_to_fp RegI64:$a)), (CVTf32u64 RndDefault, RegI64:$a)>; -def : Pat<(f32 (sint_to_fp RegI64:$a)), (CVTf32s64 RndDefault, RegI64:$a)>; -def : Pat<(f32 (fround RegF64:$a)), (CVTf32f64 RndDefault, RegF64:$a)>; -def : Pat<(f32 
(bitconvert RegI32:$a)), (MOVf32i32 RegI32:$a)>; - -// Conversion to f64 -def : Pat<(f64 (uint_to_fp RegPred:$a)), (SELPf64rr RegPred:$a, - (MOVf64i64 0x3F80000000000000), (MOVf64i64 0))>; -def : Pat<(f64 (uint_to_fp RegI16:$a)), (CVTf64u16 RndDefault, RegI16:$a)>; -def : Pat<(f64 (sint_to_fp RegI16:$a)), (CVTf64s16 RndDefault, RegI16:$a)>; -def : Pat<(f64 (uint_to_fp RegI32:$a)), (CVTf64u32 RndDefault, RegI32:$a)>; -def : Pat<(f64 (sint_to_fp RegI32:$a)), (CVTf64s32 RndDefault, RegI32:$a)>; -def : Pat<(f64 (uint_to_fp RegI64:$a)), (CVTf64u64 RndDefault, RegI64:$a)>; -def : Pat<(f64 (sint_to_fp RegI64:$a)), (CVTf64s64 RndDefault, RegI64:$a)>; -def : Pat<(f64 (fextend RegF32:$a)), (CVTf64f32 RegF32:$a)>; -def : Pat<(f64 (bitconvert RegI64:$a)), (MOVf64i64 RegI64:$a)>; - -// setcc - predicate inversion for branch conditions -def : Pat<(i1 (setcc RegPred:$a, imm:$b, SETNE)), - (XORripreds RegPred:$a, imm:$b)>; - -///===- Intrinsic Instructions --------------------------------------------===// -include "PTXIntrinsicInstrInfo.td" - -///===- Load/Store Instructions -------------------------------------------===// -include "PTXInstrLoadStore.td" - diff --git a/lib/Target/PTX/PTXInstrLoadStore.td b/lib/Target/PTX/PTXInstrLoadStore.td deleted file mode 100644 index 7a62684..0000000 --- a/lib/Target/PTX/PTXInstrLoadStore.td +++ /dev/null @@ -1,278 +0,0 @@ -//===- PTXInstrLoadStore.td - PTX Load/Store Instruction Defs -*- tablegen-*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file describes the PTX load/store instructions in TableGen format. -// -//===----------------------------------------------------------------------===// - - -// Addressing Predicates -// We have to differentiate between 32- and 64-bit pointer types -def Use32BitAddresses : Predicate<"!getSubtarget().is64Bit()">; -def Use64BitAddresses : Predicate<"getSubtarget().is64Bit()">; - -//===----------------------------------------------------------------------===// -// Pattern Fragments for Loads/Stores -//===----------------------------------------------------------------------===// - -def load_global : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - const Value *Src; - const PointerType *PT; - if ((Src = cast<LoadSDNode>(N)->getSrcValue()) && - (PT = dyn_cast<PointerType>(Src->getType()))) - return PT->getAddressSpace() == PTXStateSpace::Global; - return false; -}]>; - -def load_constant : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - const Value *Src; - const PointerType *PT; - if ((Src = cast<LoadSDNode>(N)->getSrcValue()) && - (PT = dyn_cast<PointerType>(Src->getType()))) - return PT->getAddressSpace() == PTXStateSpace::Constant; - return false; -}]>; - -def load_shared : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - const Value *Src; - const PointerType *PT; - if ((Src = cast<LoadSDNode>(N)->getSrcValue()) && - (PT = dyn_cast<PointerType>(Src->getType()))) - return PT->getAddressSpace() == PTXStateSpace::Shared; - return false; -}]>; - -def store_global - : PatFrag<(ops node:$d, node:$ptr), (store node:$d, node:$ptr), [{ - const Value *Src; - const PointerType *PT; - if ((Src = cast<StoreSDNode>(N)->getSrcValue()) && - (PT = dyn_cast<PointerType>(Src->getType()))) - return PT->getAddressSpace() == PTXStateSpace::Global; - return false; -}]>; - -def store_shared - : PatFrag<(ops node:$d, node:$ptr), (store 
node:$d, node:$ptr), [{ - const Value *Src; - const PointerType *PT; - if ((Src = cast<StoreSDNode>(N)->getSrcValue()) && - (PT = dyn_cast<PointerType>(Src->getType()))) - return PT->getAddressSpace() == PTXStateSpace::Shared; - return false; -}]>; - -// Addressing modes. -def ADDRrr32 : ComplexPattern<i32, 2, "SelectADDRrr", [], []>; -def ADDRrr64 : ComplexPattern<i64, 2, "SelectADDRrr", [], []>; -def ADDRri32 : ComplexPattern<i32, 2, "SelectADDRri", [], []>; -def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri", [], []>; -def ADDRii32 : ComplexPattern<i32, 2, "SelectADDRii", [], []>; -def ADDRii64 : ComplexPattern<i64, 2, "SelectADDRii", [], []>; -def ADDRlocal32 : ComplexPattern<i32, 2, "SelectADDRlocal", [], []>; -def ADDRlocal64 : ComplexPattern<i64, 2, "SelectADDRlocal", [], []>; - -// Address operands -def MEMri32 : Operand<i32> { - let PrintMethod = "printMemOperand"; - let MIOperandInfo = (ops RegI32, i32imm); -} -def MEMri64 : Operand<i64> { - let PrintMethod = "printMemOperand"; - let MIOperandInfo = (ops RegI64, i64imm); -} -def LOCALri32 : Operand<i32> { - let PrintMethod = "printMemOperand"; - let MIOperandInfo = (ops i32imm, i32imm); -} -def LOCALri64 : Operand<i64> { - let PrintMethod = "printMemOperand"; - let MIOperandInfo = (ops i64imm, i64imm); -} -def MEMii32 : Operand<i32> { - let PrintMethod = "printMemOperand"; - let MIOperandInfo = (ops i32imm, i32imm); -} -def MEMii64 : Operand<i64> { - let PrintMethod = "printMemOperand"; - let MIOperandInfo = (ops i64imm, i64imm); -} -// The operand here does not correspond to an actual address, so we -// can use i32 in 64-bit address modes. -def MEMpi : Operand<i32> { - let PrintMethod = "printParamOperand"; - let MIOperandInfo = (ops i32imm); -} -def MEMret : Operand<i32> { - let PrintMethod = "printReturnOperand"; - let MIOperandInfo = (ops i32imm); -} - - -// Load/store .param space -def PTXloadparam - : SDNode<"PTXISD::LOAD_PARAM", SDTypeProfile<1, 1, [SDTCisPtrTy<1>]>, - [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue]>; -def PTXstoreparam - : SDNode<"PTXISD::STORE_PARAM", SDTypeProfile<0, 2, [SDTCisVT<0, i32>]>, - [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue]>; - -def PTXreadparam - : SDNode<"PTXISD::READ_PARAM", SDTypeProfile<1, 1, [SDTCisVT<1, i32>]>, - [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue]>; -def PTXwriteparam - : SDNode<"PTXISD::WRITE_PARAM", SDTypeProfile<0, 1, []>, - [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue]>; - - - -//===----------------------------------------------------------------------===// -// Classes for loads/stores -//===----------------------------------------------------------------------===// -multiclass PTX_LD<string opstr, string typestr, - RegisterClass RC, PatFrag pat_load> { - def rr32 : InstPTX<(outs RC:$d), - (ins MEMri32:$a), - !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")), - [(set RC:$d, (pat_load ADDRrr32:$a))]>, - Requires<[Use32BitAddresses]>; - def rr64 : InstPTX<(outs RC:$d), - (ins MEMri64:$a), - !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")), - [(set RC:$d, (pat_load ADDRrr64:$a))]>, - Requires<[Use64BitAddresses]>; - def ri32 : InstPTX<(outs RC:$d), - (ins MEMri32:$a), - !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")), - [(set RC:$d, (pat_load ADDRri32:$a))]>, - Requires<[Use32BitAddresses]>; - def ri64 : InstPTX<(outs RC:$d), - (ins MEMri64:$a), - !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")), - [(set RC:$d, (pat_load ADDRri64:$a))]>, - Requires<[Use64BitAddresses]>; - def ii32 : InstPTX<(outs RC:$d), - (ins MEMii32:$a), - !strconcat(opstr, !strconcat(typestr, 
"\t$d, [$a]")), - [(set RC:$d, (pat_load ADDRii32:$a))]>, - Requires<[Use32BitAddresses]>; - def ii64 : InstPTX<(outs RC:$d), - (ins MEMii64:$a), - !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")), - [(set RC:$d, (pat_load ADDRii64:$a))]>, - Requires<[Use64BitAddresses]>; -} - -multiclass PTX_ST<string opstr, string typestr, RegisterClass RC, - PatFrag pat_store> { - def rr32 : InstPTX<(outs), - (ins RC:$d, MEMri32:$a), - !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")), - [(pat_store RC:$d, ADDRrr32:$a)]>, - Requires<[Use32BitAddresses]>; - def rr64 : InstPTX<(outs), - (ins RC:$d, MEMri64:$a), - !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")), - [(pat_store RC:$d, ADDRrr64:$a)]>, - Requires<[Use64BitAddresses]>; - def ri32 : InstPTX<(outs), - (ins RC:$d, MEMri32:$a), - !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")), - [(pat_store RC:$d, ADDRri32:$a)]>, - Requires<[Use32BitAddresses]>; - def ri64 : InstPTX<(outs), - (ins RC:$d, MEMri64:$a), - !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")), - [(pat_store RC:$d, ADDRri64:$a)]>, - Requires<[Use64BitAddresses]>; - def ii32 : InstPTX<(outs), - (ins RC:$d, MEMii32:$a), - !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")), - [(pat_store RC:$d, ADDRii32:$a)]>, - Requires<[Use32BitAddresses]>; - def ii64 : InstPTX<(outs), - (ins RC:$d, MEMii64:$a), - !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")), - [(pat_store RC:$d, ADDRii64:$a)]>, - Requires<[Use64BitAddresses]>; -} - -multiclass PTX_LOCAL_LD_ST<string typestr, RegisterClass RC> { - def LDri32 : InstPTX<(outs RC:$d), (ins LOCALri32:$a), - !strconcat("ld.local", !strconcat(typestr, "\t$d, [$a]")), - [(set RC:$d, (load_global ADDRlocal32:$a))]>; - def LDri64 : InstPTX<(outs RC:$d), (ins LOCALri64:$a), - !strconcat("ld.local", !strconcat(typestr, "\t$d, [$a]")), - [(set RC:$d, (load_global ADDRlocal64:$a))]>; - def STri32 : InstPTX<(outs), (ins RC:$d, LOCALri32:$a), - !strconcat("st.local", !strconcat(typestr, "\t[$a], $d")), - [(store_global RC:$d, ADDRlocal32:$a)]>; - def STri64 : InstPTX<(outs), (ins RC:$d, LOCALri64:$a), - !strconcat("st.local", !strconcat(typestr, "\t[$a], $d")), - [(store_global RC:$d, ADDRlocal64:$a)]>; -} - -multiclass PTX_PARAM_LD_ST<string typestr, RegisterClass RC> { - let hasSideEffects = 1 in { - def LDpi : InstPTX<(outs RC:$d), (ins i32imm:$a), - !strconcat("ld.param", !strconcat(typestr, "\t$d, [$a]")), - [(set RC:$d, (PTXloadparam texternalsym:$a))]>; - def STpi : InstPTX<(outs), (ins i32imm:$d, RC:$a), - !strconcat("st.param", !strconcat(typestr, "\t[$d], $a")), - [(PTXstoreparam texternalsym:$d, RC:$a)]>; - } -} - -multiclass PTX_LD_ALL<string opstr, PatFrag pat_load> { - defm u16 : PTX_LD<opstr, ".u16", RegI16, pat_load>; - defm u32 : PTX_LD<opstr, ".u32", RegI32, pat_load>; - defm u64 : PTX_LD<opstr, ".u64", RegI64, pat_load>; - defm f32 : PTX_LD<opstr, ".f32", RegF32, pat_load>; - defm f64 : PTX_LD<opstr, ".f64", RegF64, pat_load>; -} - -multiclass PTX_ST_ALL<string opstr, PatFrag pat_store> { - defm u16 : PTX_ST<opstr, ".u16", RegI16, pat_store>; - defm u32 : PTX_ST<opstr, ".u32", RegI32, pat_store>; - defm u64 : PTX_ST<opstr, ".u64", RegI64, pat_store>; - defm f32 : PTX_ST<opstr, ".f32", RegF32, pat_store>; - defm f64 : PTX_ST<opstr, ".f64", RegF64, pat_store>; -} - - - -//===----------------------------------------------------------------------===// -// Instruction definitions for loads/stores -//===----------------------------------------------------------------------===// - -// Global/shared stores -defm STg : 
PTX_ST_ALL<"st.global", store_global>; -defm STs : PTX_ST_ALL<"st.shared", store_shared>; - -// Global/shared/constant loads -defm LDg : PTX_LD_ALL<"ld.global", load_global>; -defm LDc : PTX_LD_ALL<"ld.const", load_constant>; -defm LDs : PTX_LD_ALL<"ld.shared", load_shared>; - -// Param loads/stores -defm PARAMPRED : PTX_PARAM_LD_ST<".pred", RegPred>; -defm PARAMU16 : PTX_PARAM_LD_ST<".u16", RegI16>; -defm PARAMU32 : PTX_PARAM_LD_ST<".u32", RegI32>; -defm PARAMU64 : PTX_PARAM_LD_ST<".u64", RegI64>; -defm PARAMF32 : PTX_PARAM_LD_ST<".f32", RegF32>; -defm PARAMF64 : PTX_PARAM_LD_ST<".f64", RegF64>; - -// Local loads/stores -defm LOCALPRED : PTX_LOCAL_LD_ST<".pred", RegPred>; -defm LOCALU16 : PTX_LOCAL_LD_ST<".u16", RegI16>; -defm LOCALU32 : PTX_LOCAL_LD_ST<".u32", RegI32>; -defm LOCALU64 : PTX_LOCAL_LD_ST<".u64", RegI64>; -defm LOCALF32 : PTX_LOCAL_LD_ST<".f32", RegF32>; -defm LOCALF64 : PTX_LOCAL_LD_ST<".f64", RegF64>; - diff --git a/lib/Target/PTX/PTXIntrinsicInstrInfo.td b/lib/Target/PTX/PTXIntrinsicInstrInfo.td deleted file mode 100644 index 3416f1c..0000000 --- a/lib/Target/PTX/PTXIntrinsicInstrInfo.td +++ /dev/null @@ -1,110 +0,0 @@ -//===-- PTXIntrinsicInstrInfo.td - Defines PTX intrinsics --*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines all of the PTX-specific intrinsic instructions. -// -//===----------------------------------------------------------------------===// - -// PTX Special Purpose Register Accessor Intrinsics - -class PTX_READ_SPECIAL_REGISTER_R64<string regname, Intrinsic intop> - : InstPTX<(outs RegI64:$d), (ins), - !strconcat("mov.u64\t$d, %", regname), - [(set RegI64:$d, (intop))]>; - -class PTX_READ_SPECIAL_REGISTER_R32<string regname, Intrinsic intop> - : InstPTX<(outs RegI32:$d), (ins), - !strconcat("mov.u32\t$d, %", regname), - [(set RegI32:$d, (intop))]>; - -// TODO Add read vector-version of special registers - -//def PTX_READ_TID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"tid", -// int_ptx_read_tid_r64>; -def PTX_READ_TID_X : PTX_READ_SPECIAL_REGISTER_R32<"tid.x", - int_ptx_read_tid_x>; -def PTX_READ_TID_Y : PTX_READ_SPECIAL_REGISTER_R32<"tid.y", - int_ptx_read_tid_y>; -def PTX_READ_TID_Z : PTX_READ_SPECIAL_REGISTER_R32<"tid.z", - int_ptx_read_tid_z>; -def PTX_READ_TID_W : PTX_READ_SPECIAL_REGISTER_R32<"tid.w", - int_ptx_read_tid_w>; - -//def PTX_READ_NTID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"ntid", -// int_ptx_read_ntid_r64>; -def PTX_READ_NTID_X : PTX_READ_SPECIAL_REGISTER_R32<"ntid.x", - int_ptx_read_ntid_x>; -def PTX_READ_NTID_Y : PTX_READ_SPECIAL_REGISTER_R32<"ntid.y", - int_ptx_read_ntid_y>; -def PTX_READ_NTID_Z : PTX_READ_SPECIAL_REGISTER_R32<"ntid.z", - int_ptx_read_ntid_z>; -def PTX_READ_NTID_W : PTX_READ_SPECIAL_REGISTER_R32<"ntid.w", - int_ptx_read_ntid_w>; - -def PTX_READ_LANEID : PTX_READ_SPECIAL_REGISTER_R32<"laneid", - int_ptx_read_laneid>; -def PTX_READ_WARPID : PTX_READ_SPECIAL_REGISTER_R32<"warpid", - int_ptx_read_warpid>; -def PTX_READ_NWARPID : PTX_READ_SPECIAL_REGISTER_R32<"nwarpid", - int_ptx_read_nwarpid>; - -//def PTX_READ_CTAID_R64 : -//PTX_READ_SPECIAL_REGISTER_R64<"ctaid", int_ptx_read_ctaid_r64>; -def PTX_READ_CTAID_X : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.x", - int_ptx_read_ctaid_x>; -def PTX_READ_CTAID_Y : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.y", - int_ptx_read_ctaid_y>; -def PTX_READ_CTAID_Z : 
PTX_READ_SPECIAL_REGISTER_R32<"ctaid.z", - int_ptx_read_ctaid_z>; -def PTX_READ_CTAID_W : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.w", - int_ptx_read_ctaid_w>; - -//def PTX_READ_NCTAID_R64 : -//PTX_READ_SPECIAL_REGISTER_R64<"nctaid", int_ptx_read_nctaid_r64>; -def PTX_READ_NCTAID_X : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.x", - int_ptx_read_nctaid_x>; -def PTX_READ_NCTAID_Y : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.y", - int_ptx_read_nctaid_y>; -def PTX_READ_NCTAID_Z : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.z", - int_ptx_read_nctaid_z>; -def PTX_READ_NCTAID_W : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.w", - int_ptx_read_nctaid_w>; - -def PTX_READ_SMID : PTX_READ_SPECIAL_REGISTER_R32<"smid", - int_ptx_read_smid>; -def PTX_READ_NSMID : PTX_READ_SPECIAL_REGISTER_R32<"nsmid", - int_ptx_read_nsmid>; -def PTX_READ_GRIDID : PTX_READ_SPECIAL_REGISTER_R32<"gridid", - int_ptx_read_gridid>; - -def PTX_READ_LANEMASK_EQ - : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_eq", int_ptx_read_lanemask_eq>; -def PTX_READ_LANEMASK_LE - : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_le", int_ptx_read_lanemask_le>; -def PTX_READ_LANEMASK_LT - : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_lt", int_ptx_read_lanemask_lt>; -def PTX_READ_LANEMASK_GE - : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_ge", int_ptx_read_lanemask_ge>; -def PTX_READ_LANEMASK_GT - : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_gt", int_ptx_read_lanemask_gt>; - -def PTX_READ_CLOCK - : PTX_READ_SPECIAL_REGISTER_R32<"clock", int_ptx_read_clock>; -def PTX_READ_CLOCK64 - : PTX_READ_SPECIAL_REGISTER_R64<"clock64", int_ptx_read_clock64>; - -def PTX_READ_PM0 : PTX_READ_SPECIAL_REGISTER_R32<"pm0", int_ptx_read_pm0>; -def PTX_READ_PM1 : PTX_READ_SPECIAL_REGISTER_R32<"pm1", int_ptx_read_pm1>; -def PTX_READ_PM2 : PTX_READ_SPECIAL_REGISTER_R32<"pm2", int_ptx_read_pm2>; -def PTX_READ_PM3 : PTX_READ_SPECIAL_REGISTER_R32<"pm3", int_ptx_read_pm3>; - -// PTX Parallel Synchronization and Communication Intrinsics - -def PTX_BAR_SYNC : InstPTX<(outs), (ins i32imm:$i), "bar.sync\t$i", - [(int_ptx_bar_sync imm:$i)]>; diff --git a/lib/Target/PTX/PTXMCAsmStreamer.cpp b/lib/Target/PTX/PTXMCAsmStreamer.cpp deleted file mode 100644 index 3ed67a6..0000000 --- a/lib/Target/PTX/PTXMCAsmStreamer.cpp +++ /dev/null @@ -1,556 +0,0 @@ -//===-- PTXMCAsmStreamer.cpp - PTX Text Assembly Output -------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/OwningPtr.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/Twine.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/Format.h" -#include "llvm/Support/FormattedStream.h" -#include "llvm/Support/PathV2.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -namespace { -class PTXMCAsmStreamer : public MCStreamer { - formatted_raw_ostream &OS; - const MCAsmInfo &MAI; - OwningPtr<MCInstPrinter> InstPrinter; - OwningPtr<MCCodeEmitter> Emitter; - - SmallString<128> CommentToEmit; - raw_svector_ostream CommentStream; - - unsigned IsVerboseAsm : 1; - unsigned ShowInst : 1; - -public: - PTXMCAsmStreamer(MCContext &Context, - formatted_raw_ostream &os, - bool isVerboseAsm, bool useLoc, - MCInstPrinter *printer, - MCCodeEmitter *emitter, - bool showInst) - : MCStreamer(Context), OS(os), MAI(Context.getAsmInfo()), - InstPrinter(printer), Emitter(emitter), CommentStream(CommentToEmit), - IsVerboseAsm(isVerboseAsm), - ShowInst(showInst) { - if (InstPrinter && IsVerboseAsm) - InstPrinter->setCommentStream(CommentStream); - } - - ~PTXMCAsmStreamer() {} - - inline void EmitEOL() { - // If we don't have any comments, just emit a \n. - if (!IsVerboseAsm) { - OS << '\n'; - return; - } - EmitCommentsAndEOL(); - } - void EmitCommentsAndEOL(); - - /// isVerboseAsm - Return true if this streamer supports verbose assembly at - /// all. - virtual bool isVerboseAsm() const { return IsVerboseAsm; } - - /// hasRawTextSupport - We support EmitRawText. - virtual bool hasRawTextSupport() const { return true; } - - /// AddComment - Add a comment that can be emitted to the generated .s - /// file if applicable as a QoI issue to make the output of the compiler - /// more readable. This only affects the MCAsmStreamer, and only when - /// verbose assembly output is enabled. - virtual void AddComment(const Twine &T); - - /// AddEncodingComment - Add a comment showing the encoding of an instruction. - virtual void AddEncodingComment(const MCInst &Inst); - - /// GetCommentOS - Return a raw_ostream that comments can be written to. - /// Unlike AddComment, you are required to terminate comments with \n if you - /// use this method. - virtual raw_ostream &GetCommentOS() { - if (!IsVerboseAsm) - return nulls(); // Discard comments unless in verbose asm mode. - return CommentStream; - } - - /// AddBlankLine - Emit a blank line to a .s file to pretty it up. 
- virtual void AddBlankLine() { - EmitEOL(); - } - - /// @name MCStreamer Interface - /// @{ - - virtual void ChangeSection(const MCSection *Section); - virtual void InitSections() { /* PTX does not use sections */ } - - virtual void EmitLabel(MCSymbol *Symbol); - - virtual void EmitAssemblerFlag(MCAssemblerFlag Flag); - - virtual void EmitThumbFunc(MCSymbol *Func); - - virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value); - - virtual void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol); - - virtual void EmitDwarfAdvanceLineAddr(int64_t LineDelta, - const MCSymbol *LastLabel, - const MCSymbol *Label, - unsigned PointerSize); - - virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute); - - virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue); - virtual void BeginCOFFSymbolDef(const MCSymbol *Symbol); - virtual void EmitCOFFSymbolStorageClass(int StorageClass); - virtual void EmitCOFFSymbolType(int Type); - virtual void EndCOFFSymbolDef(); - virtual void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value); - virtual void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, - unsigned ByteAlignment); - - /// EmitLocalCommonSymbol - Emit a local common (.lcomm) symbol. - /// - /// @param Symbol - The common symbol to emit. - /// @param Size - The size of the common symbol. - /// @param ByteAlignment - The alignment of the common symbol in bytes. - virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, - unsigned ByteAlignment); - - virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0, - unsigned Size = 0, unsigned ByteAlignment = 0); - - virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol, - uint64_t Size, unsigned ByteAlignment = 0); - - virtual void EmitBytes(StringRef Data, unsigned AddrSpace); - - virtual void EmitValueImpl(const MCExpr *Value, unsigned Size, - unsigned AddrSpace); - virtual void EmitULEB128Value(const MCExpr *Value); - virtual void EmitSLEB128Value(const MCExpr *Value); - virtual void EmitGPRel32Value(const MCExpr *Value); - - - virtual void EmitFill(uint64_t NumBytes, uint8_t FillValue, - unsigned AddrSpace); - - virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value = 0, - unsigned ValueSize = 1, - unsigned MaxBytesToEmit = 0); - - virtual void EmitCodeAlignment(unsigned ByteAlignment, - unsigned MaxBytesToEmit = 0); - - virtual bool EmitValueToOffset(const MCExpr *Offset, - unsigned char Value = 0); - - virtual void EmitFileDirective(StringRef Filename); - virtual bool EmitDwarfFileDirective(unsigned FileNo, StringRef Directory, - StringRef Filename); - - virtual void EmitInstruction(const MCInst &Inst); - - /// EmitRawText - If this file is backed by an assembly streamer, this dumps - /// the specified string in the output .s file. This capability is - /// indicated by the hasRawTextSupport() predicate. - virtual void EmitRawText(StringRef String); - - virtual void FinishImpl(); - - /// @} - -}; // class PTXMCAsmStreamer - -} - -/// TODO: Add appropriate implementation of Emit*() methods when needed - -void PTXMCAsmStreamer::AddComment(const Twine &T) { - if (!IsVerboseAsm) return; - - // Make sure that CommentStream is flushed. - CommentStream.flush(); - - T.toVector(CommentToEmit); - // Each comment goes on its own line. - CommentToEmit.push_back('\n'); - - // Tell the comment stream that the vector changed underneath it. 
- CommentStream.resync(); -} - -void PTXMCAsmStreamer::EmitCommentsAndEOL() { - if (CommentToEmit.empty() && CommentStream.GetNumBytesInBuffer() == 0) { - OS << '\n'; - return; - } - - CommentStream.flush(); - StringRef Comments = CommentToEmit.str(); - - assert(Comments.back() == '\n' && - "Comment array not newline terminated"); - do { - // Emit a line of comments. - OS.PadToColumn(MAI.getCommentColumn()); - size_t Position = Comments.find('\n'); - OS << MAI.getCommentString() << ' ' << Comments.substr(0, Position) << '\n'; - - Comments = Comments.substr(Position+1); - } while (!Comments.empty()); - - CommentToEmit.clear(); - // Tell the comment stream that the vector changed underneath it. - CommentStream.resync(); -} - -static inline int64_t truncateToSize(int64_t Value, unsigned Bytes) { - assert(Bytes && "Invalid size!"); - return Value & ((uint64_t) (int64_t) -1 >> (64 - Bytes * 8)); -} - -void PTXMCAsmStreamer::ChangeSection(const MCSection *Section) { - assert(Section && "Cannot switch to a null section!"); -} - -void PTXMCAsmStreamer::EmitLabel(MCSymbol *Symbol) { - assert(Symbol->isUndefined() && "Cannot define a symbol twice!"); - assert(!Symbol->isVariable() && "Cannot emit a variable symbol!"); - assert(getCurrentSection() && "Cannot emit before setting section!"); - - OS << *Symbol << MAI.getLabelSuffix(); - EmitEOL(); - Symbol->setSection(*getCurrentSection()); -} - -void PTXMCAsmStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {} - -void PTXMCAsmStreamer::EmitThumbFunc(MCSymbol *Func) {} - -void PTXMCAsmStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) { - OS << *Symbol << " = " << *Value; - EmitEOL(); - - // FIXME: Lift context changes into super class. - Symbol->setVariableValue(Value); -} - -void PTXMCAsmStreamer::EmitWeakReference(MCSymbol *Alias, - const MCSymbol *Symbol) { - OS << ".weakref " << *Alias << ", " << *Symbol; - EmitEOL(); -} - -void PTXMCAsmStreamer::EmitDwarfAdvanceLineAddr(int64_t LineDelta, - const MCSymbol *LastLabel, - const MCSymbol *Label, - unsigned PointerSize) { - report_fatal_error("Unimplemented."); -} - -void PTXMCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol, - MCSymbolAttr Attribute) {} - -void PTXMCAsmStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {} - -void PTXMCAsmStreamer::BeginCOFFSymbolDef(const MCSymbol *Symbol) {} - -void PTXMCAsmStreamer::EmitCOFFSymbolStorageClass (int StorageClass) {} - -void PTXMCAsmStreamer::EmitCOFFSymbolType (int Type) {} - -void PTXMCAsmStreamer::EndCOFFSymbolDef() {} - -void PTXMCAsmStreamer::EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) {} - -void PTXMCAsmStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, - unsigned ByteAlignment) {} - -void PTXMCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, - unsigned ByteAlignment) {} - -void PTXMCAsmStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol, - unsigned Size, unsigned ByteAlignment) {} - -void PTXMCAsmStreamer::EmitTBSSSymbol(const MCSection *Section, - MCSymbol *Symbol, - uint64_t Size, unsigned ByteAlignment) {} - -static inline char toOctal(int X) { return (X&7)+'0'; } - -static void PrintQuotedString(StringRef Data, raw_ostream &OS) { - OS << '"'; - - for (unsigned i = 0, e = Data.size(); i != e; ++i) { - unsigned char C = Data[i]; - if (C == '"' || C == '\\') { - OS << '\\' << (char)C; - continue; - } - - if (isprint((unsigned char)C)) { - OS << (char)C; - continue; - } - - switch (C) { - case '\b': OS << "\\b"; break; - case '\f': OS << "\\f"; break; - case '\n': OS << 
"\\n"; break; - case '\r': OS << "\\r"; break; - case '\t': OS << "\\t"; break; - default: - OS << '\\'; - OS << toOctal(C >> 6); - OS << toOctal(C >> 3); - OS << toOctal(C >> 0); - break; - } - } - - OS << '"'; -} - -void PTXMCAsmStreamer::EmitBytes(StringRef Data, unsigned AddrSpace) { - assert(getCurrentSection() && "Cannot emit contents before setting section!"); - if (Data.empty()) return; - - if (Data.size() == 1) { - OS << MAI.getData8bitsDirective(AddrSpace); - OS << (unsigned)(unsigned char)Data[0]; - EmitEOL(); - return; - } - - // If the data ends with 0 and the target supports .asciz, use it, otherwise - // use .ascii - if (MAI.getAscizDirective() && Data.back() == 0) { - OS << MAI.getAscizDirective(); - Data = Data.substr(0, Data.size()-1); - } else { - OS << MAI.getAsciiDirective(); - } - - OS << ' '; - PrintQuotedString(Data, OS); - EmitEOL(); -} - -void PTXMCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, - unsigned AddrSpace) { - assert(getCurrentSection() && "Cannot emit contents before setting section!"); - const char *Directive = 0; - switch (Size) { - default: break; - case 1: Directive = MAI.getData8bitsDirective(AddrSpace); break; - case 2: Directive = MAI.getData16bitsDirective(AddrSpace); break; - case 4: Directive = MAI.getData32bitsDirective(AddrSpace); break; - case 8: - Directive = MAI.getData64bitsDirective(AddrSpace); - // If the target doesn't support 64-bit data, emit as two 32-bit halves. - if (Directive) break; - int64_t IntValue; - if (!Value->EvaluateAsAbsolute(IntValue)) - report_fatal_error("Don't know how to emit this value."); - if (getContext().getAsmInfo().isLittleEndian()) { - EmitIntValue((uint32_t)(IntValue >> 0 ), 4, AddrSpace); - EmitIntValue((uint32_t)(IntValue >> 32), 4, AddrSpace); - } else { - EmitIntValue((uint32_t)(IntValue >> 32), 4, AddrSpace); - EmitIntValue((uint32_t)(IntValue >> 0 ), 4, AddrSpace); - } - return; - } - - assert(Directive && "Invalid size for machine code value!"); - OS << Directive << *Value; - EmitEOL(); -} - -void PTXMCAsmStreamer::EmitULEB128Value(const MCExpr *Value) { - assert(MAI.hasLEB128() && "Cannot print a .uleb"); - OS << ".uleb128 " << *Value; - EmitEOL(); -} - -void PTXMCAsmStreamer::EmitSLEB128Value(const MCExpr *Value) { - assert(MAI.hasLEB128() && "Cannot print a .sleb"); - OS << ".sleb128 " << *Value; - EmitEOL(); -} - -void PTXMCAsmStreamer::EmitGPRel32Value(const MCExpr *Value) { - assert(MAI.getGPRel32Directive() != 0); - OS << MAI.getGPRel32Directive() << *Value; - EmitEOL(); -} - - -/// EmitFill - Emit NumBytes bytes worth of the value specified by -/// FillValue. This implements directives such as '.space'. -void PTXMCAsmStreamer::EmitFill(uint64_t NumBytes, uint8_t FillValue, - unsigned AddrSpace) { - if (NumBytes == 0) return; - - if (AddrSpace == 0) - if (const char *ZeroDirective = MAI.getZeroDirective()) { - OS << ZeroDirective << NumBytes; - if (FillValue != 0) - OS << ',' << (int)FillValue; - EmitEOL(); - return; - } - - // Emit a byte at a time. - MCStreamer::EmitFill(NumBytes, FillValue, AddrSpace); -} - -void PTXMCAsmStreamer::EmitValueToAlignment(unsigned ByteAlignment, - int64_t Value, - unsigned ValueSize, - unsigned MaxBytesToEmit) { - // Some assemblers don't support non-power of two alignments, so we always - // emit alignments as a power of two if possible. - if (isPowerOf2_32(ByteAlignment)) { - switch (ValueSize) { - default: llvm_unreachable("Invalid size for machine code value!"); - case 1: OS << MAI.getAlignDirective(); break; - // FIXME: use MAI for this! 
- case 2: OS << ".p2alignw "; break; - case 4: OS << ".p2alignl "; break; - case 8: llvm_unreachable("Unsupported alignment size!"); - } - - if (MAI.getAlignmentIsInBytes()) - OS << ByteAlignment; - else - OS << Log2_32(ByteAlignment); - - if (Value || MaxBytesToEmit) { - OS << ", 0x"; - OS.write_hex(truncateToSize(Value, ValueSize)); - - if (MaxBytesToEmit) - OS << ", " << MaxBytesToEmit; - } - EmitEOL(); - return; - } - - // Non-power of two alignment. This is not widely supported by assemblers. - // FIXME: Parameterize this based on MAI. - switch (ValueSize) { - default: llvm_unreachable("Invalid size for machine code value!"); - case 1: OS << ".balign"; break; - case 2: OS << ".balignw"; break; - case 4: OS << ".balignl"; break; - case 8: llvm_unreachable("Unsupported alignment size!"); - } - - OS << ' ' << ByteAlignment; - OS << ", " << truncateToSize(Value, ValueSize); - if (MaxBytesToEmit) - OS << ", " << MaxBytesToEmit; - EmitEOL(); -} - -void PTXMCAsmStreamer::EmitCodeAlignment(unsigned ByteAlignment, - unsigned MaxBytesToEmit) {} - -bool PTXMCAsmStreamer::EmitValueToOffset(const MCExpr *Offset, - unsigned char Value) {return false;} - - -void PTXMCAsmStreamer::EmitFileDirective(StringRef Filename) { - assert(MAI.hasSingleParameterDotFile()); - OS << "\t.file\t"; - PrintQuotedString(Filename, OS); - EmitEOL(); -} - -// FIXME: should we inherit from MCAsmStreamer? -bool PTXMCAsmStreamer::EmitDwarfFileDirective(unsigned FileNo, - StringRef Directory, - StringRef Filename) { - if (!Directory.empty()) { - if (sys::path::is_absolute(Filename)) - return EmitDwarfFileDirective(FileNo, "", Filename); - SmallString<128> FullPathName = Directory; - sys::path::append(FullPathName, Filename); - return EmitDwarfFileDirective(FileNo, "", FullPathName); - } - - OS << "\t.file\t" << FileNo << ' '; - PrintQuotedString(Filename, OS); - EmitEOL(); - return this->MCStreamer::EmitDwarfFileDirective(FileNo, Directory, Filename); -} - -void PTXMCAsmStreamer::AddEncodingComment(const MCInst &Inst) {} - -void PTXMCAsmStreamer::EmitInstruction(const MCInst &Inst) { - assert(getCurrentSection() && "Cannot emit contents before setting section!"); - - // Show the encoding in a comment if we have a code emitter. - if (Emitter) - AddEncodingComment(Inst); - - // Show the MCInst if enabled. - if (ShowInst) { - Inst.dump_pretty(GetCommentOS(), &MAI, InstPrinter.get(), "\n "); - GetCommentOS() << "\n"; - } - - // If we have an AsmPrinter, use that to print, otherwise print the MCInst. - if (InstPrinter) - InstPrinter->printInst(&Inst, OS, ""); - else - Inst.print(OS, &MAI); - EmitEOL(); -} - -/// EmitRawText - If this file is backed by an assembly streamer, this dumps -/// the specified string in the output .s file. This capability is -/// indicated by the hasRawTextSupport() predicate. 
-void PTXMCAsmStreamer::EmitRawText(StringRef String) {
-  if (!String.empty() && String.back() == '\n')
-    String = String.substr(0, String.size()-1);
-  OS << String;
-  EmitEOL();
-}
-
-void PTXMCAsmStreamer::FinishImpl() {}
-
-namespace llvm {
-  MCStreamer *createPTXAsmStreamer(MCContext &Context,
-                                   formatted_raw_ostream &OS,
-                                   bool isVerboseAsm, bool useLoc, bool useCFI,
-                                   bool useDwarfDirectory,
-                                   MCInstPrinter *IP,
-                                   MCCodeEmitter *CE, MCAsmBackend *MAB,
-                                   bool ShowInst) {
-    return new PTXMCAsmStreamer(Context, OS, isVerboseAsm, useLoc,
-                                IP, CE, ShowInst);
-  }
-}
diff --git a/lib/Target/PTX/PTXMCInstLower.cpp b/lib/Target/PTX/PTXMCInstLower.cpp
deleted file mode 100644
index 142e639..0000000
--- a/lib/Target/PTX/PTXMCInstLower.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-//===-- PTXMCInstLower.cpp - Convert PTX MachineInstr to an MCInst --------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains code to lower PTX MachineInstrs to their corresponding
-// MCInst records.
-//
-//===----------------------------------------------------------------------===//
-
-#include "PTX.h"
-#include "PTXAsmPrinter.h"
-#include "llvm/Constants.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/Target/Mangler.h"
-
-void llvm::LowerPTXMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
-                                        PTXAsmPrinter &AP) {
-  OutMI.setOpcode(MI->getOpcode());
-  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    const MachineOperand &MO = MI->getOperand(i);
-    MCOperand MCOp;
-    OutMI.addOperand(AP.lowerOperand(MO));
-  }
-}
-
diff --git a/lib/Target/PTX/PTXMFInfoExtract.cpp b/lib/Target/PTX/PTXMFInfoExtract.cpp
deleted file mode 100644
index 172a0e0..0000000
--- a/lib/Target/PTX/PTXMFInfoExtract.cpp
+++ /dev/null
@@ -1,85 +0,0 @@
-//===-- PTXMFInfoExtract.cpp - Extract PTX machine function info ----------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines an information extractor for PTX machine functions.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "ptx-mf-info-extract"
-
-#include "PTX.h"
-#include "PTXTargetMachine.h"
-#include "PTXMachineFunctionInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-// NOTE: PTXMFInfoExtract must be run after register allocation!
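As context for the pass below: the register information it collects is later consumed by the PTX assembly printer when declaring registers. The following C++ sketch shows that consumption using the PTXMachineFunctionInfo interface reproduced further down; the helper function and the exact output format are assumptions for illustration, not the backend's actual emission code:

    #include "PTXMachineFunctionInfo.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    // Hypothetical consumer: declare the 32-bit integer virtual registers
    // that PTXMFInfoExtract recorded, e.g. ".reg .b32 %r<4>;".
    // countRegisters() and the type/space enums come from the real
    // PTXMachineFunctionInfo below; the loop and formatting are illustrative.
    static void emitRegDecls(const PTXMachineFunctionInfo &MFI,
                             raw_ostream &OS) {
      unsigned N = MFI.countRegisters(PTXRegisterType::B32,
                                      PTXRegisterSpace::Reg);
      if (N != 0)
        OS << "\t.reg .b32 %r<" << N << ">;\n";
    }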
-
-namespace {
-  /// PTXMFInfoExtract - PTX-specific code to extract PTX machine
-  /// function information for PTXAsmPrinter
-  ///
-  class PTXMFInfoExtract : public MachineFunctionPass {
-    private:
-      static char ID;
-
-    public:
-      PTXMFInfoExtract(PTXTargetMachine &TM, CodeGenOpt::Level OptLevel)
-        : MachineFunctionPass(ID) {}
-
-      virtual bool runOnMachineFunction(MachineFunction &MF);
-
-      virtual const char *getPassName() const {
-        return "PTX Machine Function Info Extractor";
-      }
-  }; // class PTXMFInfoExtract
-} // end anonymous namespace
-
-using namespace llvm;
-
-char PTXMFInfoExtract::ID = 0;
-
-bool PTXMFInfoExtract::runOnMachineFunction(MachineFunction &MF) {
-  PTXMachineFunctionInfo *MFI = MF.getInfo<PTXMachineFunctionInfo>();
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-
-  // Generate list of all virtual registers used in this function
-  for (unsigned i = 0; i < MRI.getNumVirtRegs(); ++i) {
-    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
-    const TargetRegisterClass *TRC = MRI.getRegClass(Reg);
-    unsigned RegType;
-    if (TRC == PTX::RegPredRegisterClass)
-      RegType = PTXRegisterType::Pred;
-    else if (TRC == PTX::RegI16RegisterClass)
-      RegType = PTXRegisterType::B16;
-    else if (TRC == PTX::RegI32RegisterClass)
-      RegType = PTXRegisterType::B32;
-    else if (TRC == PTX::RegI64RegisterClass)
-      RegType = PTXRegisterType::B64;
-    else if (TRC == PTX::RegF32RegisterClass)
-      RegType = PTXRegisterType::F32;
-    else if (TRC == PTX::RegF64RegisterClass)
-      RegType = PTXRegisterType::F64;
-    else
-      llvm_unreachable("Unknown register class.");
-    MFI->addRegister(Reg, RegType, PTXRegisterSpace::Reg);
-  }
-
-  return false;
-}
-
-FunctionPass *llvm::createPTXMFInfoExtract(PTXTargetMachine &TM,
-                                           CodeGenOpt::Level OptLevel) {
-  return new PTXMFInfoExtract(TM, OptLevel);
-}
diff --git a/lib/Target/PTX/PTXMachineFunctionInfo.h b/lib/Target/PTX/PTXMachineFunctionInfo.h
deleted file mode 100644
index bb7574c..0000000
--- a/lib/Target/PTX/PTXMachineFunctionInfo.h
+++ /dev/null
@@ -1,202 +0,0 @@
-//===-- PTXMachineFunctionInfo.h - PTX machine function info -----*- C++ -*-==//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares PTX-specific per-machine-function information.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef PTX_MACHINE_FUNCTION_INFO_H
-#define PTX_MACHINE_FUNCTION_INFO_H
-
-#include "PTX.h"
-#include "PTXParamManager.h"
-#include "PTXRegisterInfo.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-
-namespace llvm {
-
-/// PTXMachineFunctionInfo - This class is derived from MachineFunctionInfo and
-/// contains private PTX target-specific information for each MachineFunction.
-/// -class PTXMachineFunctionInfo : public MachineFunctionInfo { - virtual void anchor(); - bool IsKernel; - DenseSet<unsigned> RegArgs; - DenseSet<unsigned> RegRets; - - typedef DenseMap<int, std::string> FrameMap; - - FrameMap FrameSymbols; - - struct RegisterInfo { - unsigned Reg; - unsigned Type; - unsigned Space; - unsigned Offset; - unsigned Encoded; - }; - - typedef DenseMap<unsigned, RegisterInfo> RegisterInfoMap; - - RegisterInfoMap RegInfo; - - PTXParamManager ParamManager; - -public: - typedef DenseSet<unsigned>::const_iterator reg_iterator; - - PTXMachineFunctionInfo(MachineFunction &MF) - : IsKernel(false) { - } - - /// getParamManager - Returns the PTXParamManager instance for this function. - PTXParamManager& getParamManager() { return ParamManager; } - const PTXParamManager& getParamManager() const { return ParamManager; } - - /// setKernel/isKernel - Gets/sets a flag that indicates if this function is - /// a PTX kernel function. - void setKernel(bool _IsKernel=true) { IsKernel = _IsKernel; } - bool isKernel() const { return IsKernel; } - - /// argreg_begin/argreg_end - Returns iterators to the set of registers - /// containing function arguments. - reg_iterator argreg_begin() const { return RegArgs.begin(); } - reg_iterator argreg_end() const { return RegArgs.end(); } - - /// retreg_begin/retreg_end - Returns iterators to the set of registers - /// containing the function return values. - reg_iterator retreg_begin() const { return RegRets.begin(); } - reg_iterator retreg_end() const { return RegRets.end(); } - - /// addRegister - Adds a virtual register to the set of all used registers - void addRegister(unsigned Reg, unsigned RegType, unsigned RegSpace) { - if (!RegInfo.count(Reg)) { - RegisterInfo Info; - Info.Reg = Reg; - Info.Type = RegType; - Info.Space = RegSpace; - - // Determine register offset - Info.Offset = 0; - for(RegisterInfoMap::const_iterator i = RegInfo.begin(), - e = RegInfo.end(); i != e; ++i) { - const RegisterInfo& RI = i->second; - if (RI.Space == RegSpace) - if (RI.Space != PTXRegisterSpace::Reg || RI.Type == Info.Type) - Info.Offset++; - } - - // Encode the register data into a single register number - Info.Encoded = (Info.Offset << 6) | (Info.Type << 3) | Info.Space; - - RegInfo[Reg] = Info; - - if (RegSpace == PTXRegisterSpace::Argument) - RegArgs.insert(Reg); - else if (RegSpace == PTXRegisterSpace::Return) - RegRets.insert(Reg); - } - } - - /// countRegisters - Returns the number of registers of the given type and - /// space. - unsigned countRegisters(unsigned RegType, unsigned RegSpace) const { - unsigned Count = 0; - for(RegisterInfoMap::const_iterator i = RegInfo.begin(), e = RegInfo.end(); - i != e; ++i) { - const RegisterInfo& RI = i->second; - if (RI.Type == RegType && RI.Space == RegSpace) - Count++; - } - return Count; - } - - /// getEncodedRegister - Returns the encoded value of the register. - unsigned getEncodedRegister(unsigned Reg) const { - return RegInfo.lookup(Reg).Encoded; - } - - /// addRetReg - Adds a register to the set of return-value registers. - void addRetReg(unsigned Reg) { - if (!RegRets.count(Reg)) { - RegRets.insert(Reg); - } - } - - /// addArgReg - Adds a register to the set of function argument registers. - void addArgReg(unsigned Reg) { - RegArgs.insert(Reg); - } - - /// getRegisterName - Returns the name of the specified virtual register. This - /// name is used during PTX emission. 
- std::string getRegisterName(unsigned Reg) const { - if (RegInfo.count(Reg)) { - const RegisterInfo& RI = RegInfo.lookup(Reg); - std::string Name; - raw_string_ostream NameStr(Name); - decodeRegisterName(NameStr, RI.Encoded); - NameStr.flush(); - return Name; - } - else if (Reg == PTX::NoRegister) - return "%noreg"; - else - llvm_unreachable("Register not in register name map"); - } - - /// getEncodedRegisterName - Returns the name of the encoded register. - std::string getEncodedRegisterName(unsigned EncodedReg) const { - std::string Name; - raw_string_ostream NameStr(Name); - decodeRegisterName(NameStr, EncodedReg); - NameStr.flush(); - return Name; - } - - /// getRegisterType - Returns the type of the specified virtual register. - unsigned getRegisterType(unsigned Reg) const { - if (RegInfo.count(Reg)) - return RegInfo.lookup(Reg).Type; - else - llvm_unreachable("Unknown register"); - } - - /// getOffsetForRegister - Returns the offset of the virtual register - unsigned getOffsetForRegister(unsigned Reg) const { - if (RegInfo.count(Reg)) - return RegInfo.lookup(Reg).Offset; - else - return 0; - } - - /// getFrameSymbol - Returns the symbol name for the given FrameIndex. - const char* getFrameSymbol(int FrameIndex) { - if (FrameSymbols.count(FrameIndex)) { - return FrameSymbols.lookup(FrameIndex).c_str(); - } else { - std::string Name = "__local"; - Name += utostr(FrameIndex); - // The whole point of caching this name is to ensure the pointer we pass - // to any getExternalSymbol() calls will remain valid for the lifetime of - // the back-end instance. This is to work around an issue in SelectionDAG - // where symbol names are expected to be life-long strings. - FrameSymbols[FrameIndex] = Name; - return FrameSymbols[FrameIndex].c_str(); - } - } -}; // class PTXMachineFunctionInfo -} // namespace llvm - -#endif // PTX_MACHINE_FUNCTION_INFO_H diff --git a/lib/Target/PTX/PTXParamManager.cpp b/lib/Target/PTX/PTXParamManager.cpp deleted file mode 100644 index cc1cc71..0000000 --- a/lib/Target/PTX/PTXParamManager.cpp +++ /dev/null @@ -1,73 +0,0 @@ -//===-- PTXParamManager.cpp - Manager for .param variables ----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the PTXParamManager class. 
-// -//===----------------------------------------------------------------------===// - -#include "PTXParamManager.h" -#include "PTX.h" -#include "llvm/ADT/StringExtras.h" - -using namespace llvm; - -PTXParamManager::PTXParamManager() { -} - -unsigned PTXParamManager::addArgumentParam(unsigned Size) { - PTXParam Param; - Param.Type = PTX_PARAM_TYPE_ARGUMENT; - Param.Size = Size; - - std::string Name; - Name = "__param_"; - Name += utostr(ArgumentParams.size()+1); - Param.Name = Name; - - unsigned Index = AllParams.size(); - AllParams[Index] = Param; - ArgumentParams.push_back(Index); - - return Index; -} - -unsigned PTXParamManager::addReturnParam(unsigned Size) { - PTXParam Param; - Param.Type = PTX_PARAM_TYPE_RETURN; - Param.Size = Size; - - std::string Name; - Name = "__ret_"; - Name += utostr(ReturnParams.size()+1); - Param.Name = Name; - - unsigned Index = AllParams.size(); - AllParams[Index] = Param; - ReturnParams.push_back(Index); - - return Index; -} - -unsigned PTXParamManager::addLocalParam(unsigned Size) { - PTXParam Param; - Param.Type = PTX_PARAM_TYPE_LOCAL; - Param.Size = Size; - - std::string Name; - Name = "__localparam_"; - Name += utostr(LocalParams.size()+1); - Param.Name = Name; - - unsigned Index = AllParams.size(); - AllParams[Index] = Param; - LocalParams.push_back(Index); - - return Index; -} - diff --git a/lib/Target/PTX/PTXParamManager.h b/lib/Target/PTX/PTXParamManager.h deleted file mode 100644 index 92e7728..0000000 --- a/lib/Target/PTX/PTXParamManager.h +++ /dev/null @@ -1,87 +0,0 @@ -//===-- PTXParamManager.h - Manager for .param variables --------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the PTXParamManager class, which manages all defined .param -// variables for a particular function. -// -//===----------------------------------------------------------------------===// - -#ifndef PTX_PARAM_MANAGER_H -#define PTX_PARAM_MANAGER_H - -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallVector.h" -#include <string> - -namespace llvm { - -/// PTXParamManager - This class manages all .param variables defined for a -/// particular function. -class PTXParamManager { -private: - - /// PTXParamType - Type of a .param variable - enum PTXParamType { - PTX_PARAM_TYPE_ARGUMENT, - PTX_PARAM_TYPE_RETURN, - PTX_PARAM_TYPE_LOCAL - }; - - /// PTXParam - Definition of a PTX .param variable - struct PTXParam { - PTXParamType Type; - unsigned Size; - std::string Name; - }; - - DenseMap<unsigned, PTXParam> AllParams; - SmallVector<unsigned, 4> ArgumentParams; - SmallVector<unsigned, 4> ReturnParams; - SmallVector<unsigned, 4> LocalParams; - -public: - - typedef SmallVector<unsigned, 4>::const_iterator param_iterator; - - PTXParamManager(); - - param_iterator arg_begin() const { return ArgumentParams.begin(); } - param_iterator arg_end() const { return ArgumentParams.end(); } - param_iterator ret_begin() const { return ReturnParams.begin(); } - param_iterator ret_end() const { return ReturnParams.end(); } - param_iterator local_begin() const { return LocalParams.begin(); } - param_iterator local_end() const { return LocalParams.end(); } - - /// addArgumentParam - Returns a new .param used as an argument. - unsigned addArgumentParam(unsigned Size); - - /// addReturnParam - Returns a new .param used as a return argument. 
- unsigned addReturnParam(unsigned Size); - - /// addLocalParam - Returns a new .param used as a local .param variable. - unsigned addLocalParam(unsigned Size); - - /// getParamName - Returns the name of the parameter as a string. - const std::string &getParamName(unsigned Param) const { - assert(AllParams.count(Param) == 1 && "Param has not been defined!"); - return AllParams.find(Param)->second.Name; - } - - /// getParamSize - Returns the size of the parameter in bits. - unsigned getParamSize(unsigned Param) const { - assert(AllParams.count(Param) == 1 && "Param has not been defined!"); - return AllParams.find(Param)->second.Size; - } - -}; - -} - -#endif - diff --git a/lib/Target/PTX/PTXRegAlloc.cpp b/lib/Target/PTX/PTXRegAlloc.cpp deleted file mode 100644 index 7fd5375..0000000 --- a/lib/Target/PTX/PTXRegAlloc.cpp +++ /dev/null @@ -1,53 +0,0 @@ -//===-- PTXRegAlloc.cpp - PTX Register Allocator --------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains a register allocator for PTX code. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "ptx-reg-alloc" - -#include "PTX.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/RegAllocRegistry.h" - -using namespace llvm; - -namespace { - // Special register allocator for PTX. - class PTXRegAlloc : public MachineFunctionPass { - public: - static char ID; - PTXRegAlloc() : MachineFunctionPass(ID) {} - - virtual const char* getPassName() const { - return "PTX Register Allocator"; - } - - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - virtual bool runOnMachineFunction(MachineFunction &MF) { - // We do not actually do anything (at least not yet). - return false; - } - }; - - char PTXRegAlloc::ID = 0; - - static RegisterRegAlloc - ptxRegAlloc("ptx", "PTX register allocator", createPTXRegisterAllocator); -} - -FunctionPass *llvm::createPTXRegisterAllocator() { - return new PTXRegAlloc(); -} - diff --git a/lib/Target/PTX/PTXRegisterInfo.cpp b/lib/Target/PTX/PTXRegisterInfo.cpp deleted file mode 100644 index b6ffd38..0000000 --- a/lib/Target/PTX/PTXRegisterInfo.cpp +++ /dev/null @@ -1,38 +0,0 @@ -//===-- PTXRegisterInfo.cpp - PTX Register Information --------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the PTX implementation of the TargetRegisterInfo class. -// -//===----------------------------------------------------------------------===// - -#include "PTXRegisterInfo.h" -#include "PTX.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -#define GET_REGINFO_TARGET_DESC -#include "PTXGenRegisterInfo.inc" - -using namespace llvm; - -PTXRegisterInfo::PTXRegisterInfo(PTXTargetMachine &TM, - const TargetInstrInfo &tii) - // PTX does not have a return address register. 
- : PTXGenRegisterInfo(0), TII(tii) { -} - -void PTXRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator /*II*/, - int /*SPAdj*/, - RegScavenger * /*RS*/) const { - llvm_unreachable("FrameIndex should have been previously eliminated!"); -} diff --git a/lib/Target/PTX/PTXRegisterInfo.h b/lib/Target/PTX/PTXRegisterInfo.h deleted file mode 100644 index 5614ce7..0000000 --- a/lib/Target/PTX/PTXRegisterInfo.h +++ /dev/null @@ -1,56 +0,0 @@ -//===-- PTXRegisterInfo.h - PTX Register Information Impl -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the PTX implementation of the MRegisterInfo class. -// -//===----------------------------------------------------------------------===// - -#ifndef PTX_REGISTER_INFO_H -#define PTX_REGISTER_INFO_H - -#include "llvm/Support/ErrorHandling.h" -#include "llvm/ADT/BitVector.h" - -#define GET_REGINFO_HEADER -#include "PTXGenRegisterInfo.inc" - -namespace llvm { -class PTXTargetMachine; -class MachineFunction; - -struct PTXRegisterInfo : public PTXGenRegisterInfo { -private: - const TargetInstrInfo &TII; - -public: - PTXRegisterInfo(PTXTargetMachine &TM, - const TargetInstrInfo &tii); - - virtual const uint16_t - *getCalleeSavedRegs(const MachineFunction *MF = 0) const { - static const uint16_t CalleeSavedRegs[] = { 0 }; - return CalleeSavedRegs; // save nothing - } - - virtual BitVector getReservedRegs(const MachineFunction &MF) const { - BitVector Reserved(getNumRegs()); - return Reserved; // reserve no regs - } - - virtual void eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, - RegScavenger *RS = NULL) const; - - virtual unsigned getFrameRegister(const MachineFunction &MF) const { - llvm_unreachable("PTX does not have a frame register"); - } -}; // struct PTXRegisterInfo -} // namespace llvm - -#endif // PTX_REGISTER_INFO_H diff --git a/lib/Target/PTX/PTXRegisterInfo.td b/lib/Target/PTX/PTXRegisterInfo.td deleted file mode 100644 index e8b262e..0000000 --- a/lib/Target/PTX/PTXRegisterInfo.td +++ /dev/null @@ -1,36 +0,0 @@ -//===-- PTXRegisterInfo.td - PTX Register defs -------------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// Declarations that describe the PTX register file -//===----------------------------------------------------------------------===// - -class PTXReg<string n> : Register<n> { - let Namespace = "PTX"; -} - -//===----------------------------------------------------------------------===// -// Registers -//===----------------------------------------------------------------------===// - -// The generated register info code throws warnings for empty register classes -// (e.g. zero-length arrays), so we use a dummy register here just to prevent -// these warnings. 
-def DUMMY_REG : PTXReg<"R0">; - -//===----------------------------------------------------------------------===// -// Register classes -//===----------------------------------------------------------------------===// -def RegPred : RegisterClass<"PTX", [i1], 8, (add DUMMY_REG)>; -def RegI16 : RegisterClass<"PTX", [i16], 16, (add DUMMY_REG)>; -def RegI32 : RegisterClass<"PTX", [i32], 32, (add DUMMY_REG)>; -def RegI64 : RegisterClass<"PTX", [i64], 64, (add DUMMY_REG)>; -def RegF32 : RegisterClass<"PTX", [f32], 32, (add DUMMY_REG)>; -def RegF64 : RegisterClass<"PTX", [f64], 64, (add DUMMY_REG)>; - diff --git a/lib/Target/PTX/PTXSelectionDAGInfo.cpp b/lib/Target/PTX/PTXSelectionDAGInfo.cpp deleted file mode 100644 index a116fab..0000000 --- a/lib/Target/PTX/PTXSelectionDAGInfo.cpp +++ /dev/null @@ -1,150 +0,0 @@ -//===-- PTXSelectionDAGInfo.cpp - PTX SelectionDAG Info -------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the PTXSelectionDAGInfo class. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "ptx-selectiondag-info" -#include "PTXTargetMachine.h" -#include "llvm/DerivedTypes.h" -#include "llvm/CodeGen/SelectionDAG.h" -using namespace llvm; - -PTXSelectionDAGInfo::PTXSelectionDAGInfo(const TargetMachine &TM) - : TargetSelectionDAGInfo(TM), - Subtarget(&TM.getSubtarget<PTXSubtarget>()) { -} - -PTXSelectionDAGInfo::~PTXSelectionDAGInfo() { -} - -SDValue -PTXSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, - SDValue Chain, - SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, - bool isVolatile, bool AlwaysInline, - MachinePointerInfo DstPtrInfo, - MachinePointerInfo SrcPtrInfo) const { - // Do repeated 4-byte loads and stores. To be improved. - // This requires 4-byte alignment. - if ((Align & 3) != 0) - return SDValue(); - // This requires the copy size to be a constant, preferably - // within a subtarget-specific limit. - ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); - if (!ConstantSize) - return SDValue(); - uint64_t SizeVal = ConstantSize->getZExtValue(); - // Always inline memcpys. In PTX, we do not have a C library that provides - // a memcpy function. - //if (!AlwaysInline) - // return SDValue(); - - unsigned BytesLeft = SizeVal & 3; - unsigned NumMemOps = SizeVal >> 2; - unsigned EmittedNumMemOps = 0; - EVT VT = MVT::i32; - unsigned VTSize = 4; - unsigned i = 0; - const unsigned MAX_LOADS_IN_LDM = 6; - SDValue TFOps[MAX_LOADS_IN_LDM]; - SDValue Loads[MAX_LOADS_IN_LDM]; - uint64_t SrcOff = 0, DstOff = 0; - EVT PointerType = Subtarget->is64Bit() ? MVT::i64 : MVT::i32; - - // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the - // same number of stores. The loads and stores will get combined into - // ldm/stm later on. 
- while (EmittedNumMemOps < NumMemOps) { - for (i = 0; - i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) { - Loads[i] = DAG.getLoad(VT, dl, Chain, - DAG.getNode(ISD::ADD, dl, PointerType, Src, - DAG.getConstant(SrcOff, PointerType)), - SrcPtrInfo.getWithOffset(SrcOff), isVolatile, - false, false, 0); - TFOps[i] = Loads[i].getValue(1); - SrcOff += VTSize; - } - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); - - for (i = 0; - i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) { - TFOps[i] = DAG.getStore(Chain, dl, Loads[i], - DAG.getNode(ISD::ADD, dl, PointerType, Dst, - DAG.getConstant(DstOff, PointerType)), - DstPtrInfo.getWithOffset(DstOff), - isVolatile, false, 0); - DstOff += VTSize; - } - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); - - EmittedNumMemOps += i; - } - - if (BytesLeft == 0) - return Chain; - - // Issue loads / stores for the trailing (1 - 3) bytes. - unsigned BytesLeftSave = BytesLeft; - i = 0; - while (BytesLeft) { - if (BytesLeft >= 2) { - VT = MVT::i16; - VTSize = 2; - } else { - VT = MVT::i8; - VTSize = 1; - } - - Loads[i] = DAG.getLoad(VT, dl, Chain, - DAG.getNode(ISD::ADD, dl, PointerType, Src, - DAG.getConstant(SrcOff, PointerType)), - SrcPtrInfo.getWithOffset(SrcOff), false, false, - false, 0); - TFOps[i] = Loads[i].getValue(1); - ++i; - SrcOff += VTSize; - BytesLeft -= VTSize; - } - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); - - i = 0; - BytesLeft = BytesLeftSave; - while (BytesLeft) { - if (BytesLeft >= 2) { - VT = MVT::i16; - VTSize = 2; - } else { - VT = MVT::i8; - VTSize = 1; - } - - TFOps[i] = DAG.getStore(Chain, dl, Loads[i], - DAG.getNode(ISD::ADD, dl, PointerType, Dst, - DAG.getConstant(DstOff, PointerType)), - DstPtrInfo.getWithOffset(DstOff), false, false, 0); - ++i; - DstOff += VTSize; - BytesLeft -= VTSize; - } - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); -} - -SDValue PTXSelectionDAGInfo:: -EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, - SDValue Chain, SDValue Dst, - SDValue Src, SDValue Size, - unsigned Align, bool isVolatile, - MachinePointerInfo DstPtrInfo) const { - llvm_unreachable("memset lowering not implemented for PTX yet"); -} - diff --git a/lib/Target/PTX/PTXSelectionDAGInfo.h b/lib/Target/PTX/PTXSelectionDAGInfo.h deleted file mode 100644 index e0c7167..0000000 --- a/lib/Target/PTX/PTXSelectionDAGInfo.h +++ /dev/null @@ -1,53 +0,0 @@ -//===-- PTXSelectionDAGInfo.h - PTX SelectionDAG Info -----------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the PTX subclass for TargetSelectionDAGInfo. -// -//===----------------------------------------------------------------------===// - -#ifndef PTXSELECTIONDAGINFO_H -#define PTXSELECTIONDAGINFO_H - -#include "llvm/Target/TargetSelectionDAGInfo.h" - -namespace llvm { - -/// PTXSelectionDAGInfo - TargetSelectionDAGInfo sub-class for the PTX target. -/// At the moment, this is mostly just a copy of ARMSelectionDAGInfo. -class PTXSelectionDAGInfo : public TargetSelectionDAGInfo { - /// Subtarget - Keep a pointer to the PTXSubtarget around so that we can - /// make the right decision when generating code for different targets. 
- const PTXSubtarget *Subtarget; - -public: - explicit PTXSelectionDAGInfo(const TargetMachine &TM); - ~PTXSelectionDAGInfo(); - - virtual - SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, - SDValue Chain, - SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, - bool isVolatile, bool AlwaysInline, - MachinePointerInfo DstPtrInfo, - MachinePointerInfo SrcPtrInfo) const; - - virtual - SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, - SDValue Chain, - SDValue Op1, SDValue Op2, - SDValue Op3, unsigned Align, - bool isVolatile, - MachinePointerInfo DstPtrInfo) const; -}; - -} - -#endif - diff --git a/lib/Target/PTX/PTXSubtarget.cpp b/lib/Target/PTX/PTXSubtarget.cpp deleted file mode 100644 index 454f64e..0000000 --- a/lib/Target/PTX/PTXSubtarget.cpp +++ /dev/null @@ -1,68 +0,0 @@ -//===-- PTXSubtarget.cpp - PTX Subtarget Information ----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the PTX specific subclass of TargetSubtargetInfo. -// -//===----------------------------------------------------------------------===// - -#include "PTXSubtarget.h" -#include "PTX.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/TargetRegistry.h" - -#define GET_SUBTARGETINFO_TARGET_DESC -#define GET_SUBTARGETINFO_CTOR -#include "PTXGenSubtargetInfo.inc" - -using namespace llvm; - -void PTXSubtarget::anchor() { } - -PTXSubtarget::PTXSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool is64Bit) - : PTXGenSubtargetInfo(TT, CPU, FS), - PTXTarget(PTX_COMPUTE_1_0), - PTXVersion(PTX_VERSION_2_0), - SupportsDouble(false), - SupportsFMA(true), - Is64Bit(is64Bit) { - std::string TARGET = CPU; - if (TARGET.empty()) - TARGET = "generic"; - ParseSubtargetFeatures(TARGET, FS); -} - -std::string PTXSubtarget::getTargetString() const { - switch(PTXTarget) { - default: llvm_unreachable("Unknown PTX target"); - case PTX_SM_1_0: return "sm_10"; - case PTX_SM_1_1: return "sm_11"; - case PTX_SM_1_2: return "sm_12"; - case PTX_SM_1_3: return "sm_13"; - case PTX_SM_2_0: return "sm_20"; - case PTX_SM_2_1: return "sm_21"; - case PTX_SM_2_2: return "sm_22"; - case PTX_SM_2_3: return "sm_23"; - case PTX_COMPUTE_1_0: return "compute_10"; - case PTX_COMPUTE_1_1: return "compute_11"; - case PTX_COMPUTE_1_2: return "compute_12"; - case PTX_COMPUTE_1_3: return "compute_13"; - case PTX_COMPUTE_2_0: return "compute_20"; - } -} - -std::string PTXSubtarget::getPTXVersionString() const { - switch(PTXVersion) { - case PTX_VERSION_2_0: return "2.0"; - case PTX_VERSION_2_1: return "2.1"; - case PTX_VERSION_2_2: return "2.2"; - case PTX_VERSION_2_3: return "2.3"; - } - llvm_unreachable("Invalid PTX version"); -} diff --git a/lib/Target/PTX/PTXSubtarget.h b/lib/Target/PTX/PTXSubtarget.h deleted file mode 100644 index ce93fef..0000000 --- a/lib/Target/PTX/PTXSubtarget.h +++ /dev/null @@ -1,131 +0,0 @@ -//===-- PTXSubtarget.h - Define Subtarget for the PTX -----------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file declares the PTX specific subclass of TargetSubtargetInfo. 
-// -//===----------------------------------------------------------------------===// - -#ifndef PTX_SUBTARGET_H -#define PTX_SUBTARGET_H - -#include "llvm/Target/TargetSubtargetInfo.h" - -#define GET_SUBTARGETINFO_HEADER -#include "PTXGenSubtargetInfo.inc" - -namespace llvm { -class StringRef; - - class PTXSubtarget : public PTXGenSubtargetInfo { - virtual void anchor(); - public: - - /** - * Enumeration of Shader Models supported by the back-end. - */ - enum PTXTargetEnum { - PTX_COMPUTE_1_0, /*< Compute Compatibility 1.0 */ - PTX_COMPUTE_1_1, /*< Compute Compatibility 1.1 */ - PTX_COMPUTE_1_2, /*< Compute Compatibility 1.2 */ - PTX_COMPUTE_1_3, /*< Compute Compatibility 1.3 */ - PTX_COMPUTE_2_0, /*< Compute Compatibility 2.0 */ - PTX_LAST_COMPUTE, - - PTX_SM_1_0, /*< Shader Model 1.0 */ - PTX_SM_1_1, /*< Shader Model 1.1 */ - PTX_SM_1_2, /*< Shader Model 1.2 */ - PTX_SM_1_3, /*< Shader Model 1.3 */ - PTX_SM_2_0, /*< Shader Model 2.0 */ - PTX_SM_2_1, /*< Shader Model 2.1 */ - PTX_SM_2_2, /*< Shader Model 2.2 */ - PTX_SM_2_3, /*< Shader Model 2.3 */ - PTX_LAST_SM - }; - - /** - * Enumeration of PTX versions supported by the back-end. - * - * Currently, PTX 2.0 is the minimum supported version. - */ - enum PTXVersionEnum { - PTX_VERSION_2_0, /*< PTX Version 2.0 */ - PTX_VERSION_2_1, /*< PTX Version 2.1 */ - PTX_VERSION_2_2, /*< PTX Version 2.2 */ - PTX_VERSION_2_3 /*< PTX Version 2.3 */ - }; - - private: - - /// Shader Model supported on the target GPU. - PTXTargetEnum PTXTarget; - - /// PTX Language Version. - PTXVersionEnum PTXVersion; - - // The native .f64 type is supported on the hardware. - bool SupportsDouble; - - // Support the fused-multiply add (FMA) and multiply-add (MAD) - // instructions - bool SupportsFMA; - - // Use .u64 instead of .u32 for addresses. 
- bool Is64Bit; - - public: - - PTXSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool is64Bit); - - // Target architecture accessors - std::string getTargetString() const; - - std::string getPTXVersionString() const; - - bool supportsDouble() const { return SupportsDouble; } - - bool is64Bit() const { return Is64Bit; } - - bool supportsFMA() const { return SupportsFMA; } - - bool supportsPTX21() const { return PTXVersion >= PTX_VERSION_2_1; } - - bool supportsPTX22() const { return PTXVersion >= PTX_VERSION_2_2; } - - bool supportsPTX23() const { return PTXVersion >= PTX_VERSION_2_3; } - - bool fdivNeedsRoundingMode() const { - return (PTXTarget >= PTX_SM_1_3 && PTXTarget < PTX_LAST_SM) || - (PTXTarget >= PTX_COMPUTE_1_3 && PTXTarget < PTX_LAST_COMPUTE); - } - - bool fmadNeedsRoundingMode() const { - return (PTXTarget >= PTX_SM_1_3 && PTXTarget < PTX_LAST_SM) || - (PTXTarget >= PTX_COMPUTE_1_3 && PTXTarget < PTX_LAST_COMPUTE); - } - - bool useParamSpaceForDeviceArgs() const { - return (PTXTarget >= PTX_SM_2_0 && PTXTarget < PTX_LAST_SM) || - (PTXTarget >= PTX_COMPUTE_2_0 && PTXTarget < PTX_LAST_COMPUTE); - } - - bool callsAreHandled() const { - return (PTXTarget >= PTX_SM_2_0 && PTXTarget < PTX_LAST_SM) || - (PTXTarget >= PTX_COMPUTE_2_0 && PTXTarget < PTX_LAST_COMPUTE); - } - - bool emitPtrAttribute() const { - return PTXVersion >= PTX_VERSION_2_2; - } - - void ParseSubtargetFeatures(StringRef CPU, StringRef FS); - }; // class PTXSubtarget -} // namespace llvm - -#endif // PTX_SUBTARGET_H diff --git a/lib/Target/PTX/PTXTargetMachine.cpp b/lib/Target/PTX/PTXTargetMachine.cpp deleted file mode 100644 index c55a658..0000000 --- a/lib/Target/PTX/PTXTargetMachine.cpp +++ /dev/null @@ -1,165 +0,0 @@ -//===-- PTXTargetMachine.cpp - Define TargetMachine for PTX ---------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Top-level implementation for the PTX target. 
-// -//===----------------------------------------------------------------------===// - -#include "PTXTargetMachine.h" -#include "PTX.h" -#include "llvm/PassManager.h" -#include "llvm/Analysis/Passes.h" -#include "llvm/Analysis/Verifier.h" -#include "llvm/Assembly/PrintModulePass.h" -#include "llvm/CodeGen/AsmPrinter.h" -#include "llvm/CodeGen/MachineFunctionAnalysis.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetLoweringObjectFile.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetOptions.h" -#include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/Target/TargetSubtargetInfo.h" -#include "llvm/Transforms/Scalar.h" - - -using namespace llvm; - -namespace llvm { - MCStreamer *createPTXAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS, - bool isVerboseAsm, bool useLoc, - bool useCFI, bool useDwarfDirectory, - MCInstPrinter *InstPrint, - MCCodeEmitter *CE, - MCAsmBackend *MAB, - bool ShowInst); -} - -extern "C" void LLVMInitializePTXTarget() { - - RegisterTargetMachine<PTX32TargetMachine> X(ThePTX32Target); - RegisterTargetMachine<PTX64TargetMachine> Y(ThePTX64Target); - - TargetRegistry::RegisterAsmStreamer(ThePTX32Target, createPTXAsmStreamer); - TargetRegistry::RegisterAsmStreamer(ThePTX64Target, createPTXAsmStreamer); -} - -namespace { - const char* DataLayout32 = - "e-p:32:32-i64:32:32-f64:32:32-v128:32:128-v64:32:64-n32:64"; - const char* DataLayout64 = - "e-p:64:64-i64:32:32-f64:32:32-v128:32:128-v64:32:64-n32:64"; -} - -// DataLayout and FrameLowering are filled with dummy data -PTXTargetMachine::PTXTargetMachine(const Target &T, - StringRef TT, StringRef CPU, StringRef FS, - const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, - bool is64Bit) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - DataLayout(is64Bit ? DataLayout64 : DataLayout32), - Subtarget(TT, CPU, FS, is64Bit), - FrameLowering(Subtarget), - InstrInfo(*this), - TSInfo(*this), - TLInfo(*this) { -} - -void PTX32TargetMachine::anchor() { } - -PTX32TargetMachine::PTX32TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) - : PTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) { -} - -void PTX64TargetMachine::anchor() { } - -PTX64TargetMachine::PTX64TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) - : PTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) { -} - -namespace llvm { -/// PTX Code Generator Pass Configuration Options. 
-class PTXPassConfig : public TargetPassConfig { -public: - PTXPassConfig(PTXTargetMachine *TM, PassManagerBase &PM) - : TargetPassConfig(TM, PM) {} - - PTXTargetMachine &getPTXTargetMachine() const { - return getTM<PTXTargetMachine>(); - } - - bool addInstSelector(); - FunctionPass *createTargetRegisterAllocator(bool); - void addOptimizedRegAlloc(FunctionPass *RegAllocPass); - bool addPostRegAlloc(); - void addMachineLateOptimization(); - bool addPreEmitPass(); -}; -} // namespace - -TargetPassConfig *PTXTargetMachine::createPassConfig(PassManagerBase &PM) { - PTXPassConfig *PassConfig = new PTXPassConfig(this, PM); - PassConfig->disablePass(PrologEpilogCodeInserterID); - return PassConfig; -} - -bool PTXPassConfig::addInstSelector() { - PM.add(createPTXISelDag(getPTXTargetMachine(), getOptLevel())); - return false; -} - -FunctionPass *PTXPassConfig::createTargetRegisterAllocator(bool /*Optimized*/) { - return createPTXRegisterAllocator(); -} - -// Modify the optimized compilation path to bypass optimized register alloction. -void PTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { - addFastRegAlloc(RegAllocPass); -} - -bool PTXPassConfig::addPostRegAlloc() { - // PTXMFInfoExtract must after register allocation! - //PM.add(createPTXMFInfoExtract(getPTXTargetMachine())); - return false; -} - -/// Add passes that optimize machine instructions after register allocation. -void PTXPassConfig::addMachineLateOptimization() { - if (addPass(BranchFolderPassID) != &NoPassID) - printAndVerify("After BranchFolding"); - - if (addPass(TailDuplicateID) != &NoPassID) - printAndVerify("After TailDuplicate"); -} - -bool PTXPassConfig::addPreEmitPass() { - PM.add(createPTXMFInfoExtract(getPTXTargetMachine(), getOptLevel())); - PM.add(createPTXFPRoundingModePass(getPTXTargetMachine(), getOptLevel())); - return true; -} diff --git a/lib/Target/PTX/PTXTargetMachine.h b/lib/Target/PTX/PTXTargetMachine.h deleted file mode 100644 index 278d155..0000000 --- a/lib/Target/PTX/PTXTargetMachine.h +++ /dev/null @@ -1,104 +0,0 @@ -//===-- PTXTargetMachine.h - Define TargetMachine for PTX -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file declares the PTX specific subclass of TargetMachine. 
-// -//===----------------------------------------------------------------------===// - -#ifndef PTX_TARGET_MACHINE_H -#define PTX_TARGET_MACHINE_H - -#include "PTXISelLowering.h" -#include "PTXInstrInfo.h" -#include "PTXFrameLowering.h" -#include "PTXSelectionDAGInfo.h" -#include "PTXSubtarget.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetFrameLowering.h" -#include "llvm/Target/TargetMachine.h" - -namespace llvm { -class PTXTargetMachine : public LLVMTargetMachine { - private: - const TargetData DataLayout; - PTXSubtarget Subtarget; // has to be initialized before FrameLowering - PTXFrameLowering FrameLowering; - PTXInstrInfo InstrInfo; - PTXSelectionDAGInfo TSInfo; - PTXTargetLowering TLInfo; - - public: - PTXTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, - bool is64Bit); - - virtual const TargetData *getTargetData() const { return &DataLayout; } - - virtual const TargetFrameLowering *getFrameLowering() const { - return &FrameLowering; - } - - virtual const PTXInstrInfo *getInstrInfo() const { return &InstrInfo; } - virtual const TargetRegisterInfo *getRegisterInfo() const { - return &InstrInfo.getRegisterInfo(); } - - virtual const PTXTargetLowering *getTargetLowering() const { - return &TLInfo; } - - virtual const PTXSelectionDAGInfo* getSelectionDAGInfo() const { - return &TSInfo; - } - - virtual const PTXSubtarget *getSubtargetImpl() const { return &Subtarget; } - - // Emission of machine code through JITCodeEmitter is not supported. - virtual bool addPassesToEmitMachineCode(PassManagerBase &, - JITCodeEmitter &, - bool = true) { - return true; - } - - // Emission of machine code through MCJIT is not supported. - virtual bool addPassesToEmitMC(PassManagerBase &, - MCContext *&, - raw_ostream &, - bool = true) { - return true; - } - - // Pass Pipeline Configuration - virtual TargetPassConfig *createPassConfig(PassManagerBase &PM); -}; // class PTXTargetMachine - - -class PTX32TargetMachine : public PTXTargetMachine { - virtual void anchor(); -public: - - PTX32TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL); -}; // class PTX32TargetMachine - -class PTX64TargetMachine : public PTXTargetMachine { - virtual void anchor(); -public: - - PTX64TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL); -}; // class PTX32TargetMachine - -} // namespace llvm - -#endif // PTX_TARGET_MACHINE_H diff --git a/lib/Target/PTX/TargetInfo/CMakeLists.txt b/lib/Target/PTX/TargetInfo/CMakeLists.txt deleted file mode 100644 index d9a5da3..0000000 --- a/lib/Target/PTX/TargetInfo/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. 
) - -add_llvm_library(LLVMPTXInfo - PTXTargetInfo.cpp - ) - -add_dependencies(LLVMPTXInfo PTXCommonTableGen) diff --git a/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp b/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp deleted file mode 100644 index 09a2735..0000000 --- a/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp +++ /dev/null @@ -1,25 +0,0 @@ -//===-- PTXTargetInfo.cpp - PTX Target Implementation ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "PTX.h" -#include "llvm/Module.h" -#include "llvm/Support/TargetRegistry.h" - -using namespace llvm; - -Target llvm::ThePTX32Target; -Target llvm::ThePTX64Target; - -extern "C" void LLVMInitializePTXTargetInfo() { - // see llvm/ADT/Triple.h - RegisterTarget<Triple::ptx32> X32(ThePTX32Target, "ptx32", - "PTX (32-bit) [Experimental]"); - RegisterTarget<Triple::ptx64> X64(ThePTX64Target, "ptx64", - "PTX (64-bit) [Experimental]"); -} diff --git a/lib/Target/PowerPC/CMakeLists.txt b/lib/Target/PowerPC/CMakeLists.txt index bcd8bd2..192d18d 100644 --- a/lib/Target/PowerPC/CMakeLists.txt +++ b/lib/Target/PowerPC/CMakeLists.txt @@ -14,6 +14,7 @@ add_llvm_target(PowerPCCodeGen PPCAsmPrinter.cpp PPCBranchSelector.cpp PPCCodeEmitter.cpp + PPCCTRLoops.cpp PPCHazardRecognizers.cpp PPCInstrInfo.cpp PPCISelDAGToDAG.cpp @@ -28,6 +29,8 @@ add_llvm_target(PowerPCCodeGen PPCSelectionDAGInfo.cpp ) +add_dependencies(LLVMPowerPCCodeGen intrinsics_gen) + add_subdirectory(InstPrinter) add_subdirectory(TargetInfo) add_subdirectory(MCTargetDesc) diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp index 61d23ce..d175e3e 100644 --- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp +++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp @@ -86,8 +86,33 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O, void PPCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O, const char *Modifier) { - assert(Modifier && "Must specify 'cc' or 'reg' as predicate op modifier!"); unsigned Code = MI->getOperand(OpNo).getImm(); + if (!Modifier) { + unsigned CCReg = MI->getOperand(OpNo+1).getReg(); + unsigned RegNo; + switch (CCReg) { + default: llvm_unreachable("Unknown CR register"); + case PPC::CR0: RegNo = 0; break; + case PPC::CR1: RegNo = 1; break; + case PPC::CR2: RegNo = 2; break; + case PPC::CR3: RegNo = 3; break; + case PPC::CR4: RegNo = 4; break; + case PPC::CR5: RegNo = 5; break; + case PPC::CR6: RegNo = 6; break; + case PPC::CR7: RegNo = 7; break; + } + + // Print the CR bit number. The Code is ((BI << 5) | BO) for a + // BCC, but we must have the positive form here (BO == 12) + unsigned BI = Code >> 5; + assert((Code & 0xF) == 12 && + "BO in predicate bit must have the positive form"); + + unsigned Value = 4*RegNo + BI; + O << Value; + return; + } + if (StringRef(Modifier) == "cc") { switch ((PPC::Predicate)Code) { case PPC::PRED_ALWAYS: return; // Don't print anything for always. 
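The `4*RegNo + BI` arithmetic in the printer hunk above follows the PPC condition-register layout: the CR is eight 4-bit fields (CR0-CR7), so bit BI (LT=0, GT=1, EQ=2, SO=3) of field CRn is flat bit 4*n + BI. A minimal sketch of that mapping; `crBitNumber` is a hypothetical helper, not code from this patch:

#include <cassert>

// Flat CR bit number for bit BI (LT=0, GT=1, EQ=2, SO=3) of CR field RegNo.
unsigned crBitNumber(unsigned RegNo, unsigned BI) {
  assert(RegNo < 8 && BI < 4 && "eight CR fields of four bits each");
  return 4 * RegNo + BI;
}
// e.g. the EQ bit of CR6 is printed as 26.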
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h index 73fd534..8f1e211 100644 --- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h +++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h @@ -42,7 +42,7 @@ public: void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printPredicateOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O, const char *Modifier); + raw_ostream &O, const char *Modifier = 0); void printS5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp index 5a6827f..f652422 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -77,6 +77,7 @@ public: } // end anonymous namespace MCCodeEmitter *llvm::createPPCMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI, MCContext &Ctx) { return new PPCMCCodeEmitter(MCII, STI, Ctx); diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h index b7fa064..7162e15 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h @@ -22,6 +22,7 @@ class MCCodeEmitter; class MCContext; class MCInstrInfo; class MCObjectWriter; +class MCRegisterInfo; class MCSubtargetInfo; class Target; class StringRef; @@ -31,6 +32,7 @@ extern Target ThePPC32Target; extern Target ThePPC64Target; MCCodeEmitter *createPPCMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI, MCContext &Ctx); diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h index 24a7178..9103e12 100644 --- a/lib/Target/PowerPC/PPC.h +++ b/lib/Target/PowerPC/PPC.h @@ -30,6 +30,7 @@ namespace llvm { class AsmPrinter; class MCInst; + FunctionPass *createPPCCTRLoops(); FunctionPass *createPPCBranchSelectionPass(); FunctionPass *createPPCISelDag(PPCTargetMachine &TM); FunctionPass *createPPCJITCodeEmitterPass(PPCTargetMachine &TM, @@ -50,21 +51,27 @@ namespace llvm { /// and jumps to external functions on Tiger and earlier. MO_DARWIN_STUB = 1, - /// MO_LO16, MO_HA16 - lo16(symbol) and ha16(symbol) - MO_LO16 = 4, MO_HA16 = 8, - /// MO_PIC_FLAG - If this bit is set, the symbol reference is relative to /// the function's picbase, e.g. lo16(symbol-picbase). - MO_PIC_FLAG = 16, + MO_PIC_FLAG = 4, /// MO_NLP_FLAG - If this bit is set, the symbol reference is actually to /// the non_lazy_ptr for the global, e.g. lo16(symbol$non_lazy_ptr-picbase). - MO_NLP_FLAG = 32, + MO_NLP_FLAG = 8, /// MO_NLP_HIDDEN_FLAG - If this bit is set, the symbol reference is to a /// symbol with hidden visibility. This causes a different kind of /// non-lazy-pointer to be generated. - MO_NLP_HIDDEN_FLAG = 64 + MO_NLP_HIDDEN_FLAG = 16, + + /// The next are not flags but distinct values. 
+ MO_ACCESS_MASK = 224, + + /// MO_LO16, MO_HA16 - lo16(symbol) and ha16(symbol) + MO_LO16 = 32, MO_HA16 = 64, + + MO_TPREL16_HA = 96, + MO_TPREL16_LO = 128 }; } // end namespace PPCII diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td index c554d39..b7f1688 100644 --- a/lib/Target/PowerPC/PPC.td +++ b/lib/Target/PowerPC/PPC.td @@ -35,6 +35,8 @@ def Directive970 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_970", "">; def Directive32 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_32", "">; def Directive64 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_64", "">; def DirectiveA2 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_A2", "">; +def DirectivePwr6: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6", "">; +def DirectivePwr7: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR7", "">; def Feature64Bit : SubtargetFeature<"64bit","Has64BitSupport", "true", "Enable 64-bit instructions">; @@ -42,12 +44,14 @@ def Feature64BitRegs : SubtargetFeature<"64bitregs","Use64BitRegs", "true", "Enable 64-bit registers usage for ppc32 [beta]">; def FeatureAltivec : SubtargetFeature<"altivec","HasAltivec", "true", "Enable Altivec instructions">; -def FeatureGPUL : SubtargetFeature<"gpul","IsGigaProcessor", "true", - "Enable GPUL instructions">; +def FeatureMFOCRF : SubtargetFeature<"mfocrf","HasMFOCRF", "true", + "Enable the MFOCRF instruction">; def FeatureFSqrt : SubtargetFeature<"fsqrt","HasFSQRT", "true", "Enable the fsqrt instruction">; def FeatureSTFIWX : SubtargetFeature<"stfiwx","HasSTFIWX", "true", "Enable the stfiwx instruction">; +def FeatureISEL : SubtargetFeature<"isel","HasISEL", "true", + "Enable the isel instruction">; def FeatureBookE : SubtargetFeature<"booke", "IsBookE", "true", "Enable Book E instructions">; @@ -64,8 +68,10 @@ include "PPCInstrInfo.td" // def : Processor<"generic", G3Itineraries, [Directive32]>; -def : Processor<"440", PPC440Itineraries, [Directive440, FeatureBookE]>; -def : Processor<"450", PPC440Itineraries, [Directive440, FeatureBookE]>; +def : Processor<"440", PPC440Itineraries, [Directive440, FeatureISEL, + FeatureBookE]>; +def : Processor<"450", PPC440Itineraries, [Directive440, FeatureISEL, + FeatureBookE]>; def : Processor<"601", G3Itineraries, [Directive601]>; def : Processor<"602", G3Itineraries, [Directive602]>; def : Processor<"603", G3Itineraries, [Directive603]>; @@ -74,28 +80,37 @@ def : Processor<"603ev", G3Itineraries, [Directive603]>; def : Processor<"604", G3Itineraries, [Directive604]>; def : Processor<"604e", G3Itineraries, [Directive604]>; def : Processor<"620", G3Itineraries, [Directive620]>; -def : Processor<"g3", G3Itineraries, [Directive7400]>; +def : Processor<"750", G4Itineraries, [Directive750]>; +def : Processor<"g3", G3Itineraries, [Directive750]>; def : Processor<"7400", G4Itineraries, [Directive7400, FeatureAltivec]>; def : Processor<"g4", G4Itineraries, [Directive7400, FeatureAltivec]>; def : Processor<"7450", G4PlusItineraries, [Directive7400, FeatureAltivec]>; -def : Processor<"g4+", G4PlusItineraries, [Directive750, FeatureAltivec]>; -def : Processor<"750", G4Itineraries, [Directive750, FeatureAltivec]>; +def : Processor<"g4+", G4PlusItineraries, [Directive7400, FeatureAltivec]>; def : Processor<"970", G5Itineraries, [Directive970, FeatureAltivec, - FeatureGPUL, FeatureFSqrt, FeatureSTFIWX, + FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX, Feature64Bit /*, Feature64BitRegs */]>; def : Processor<"g5", G5Itineraries, [Directive970, FeatureAltivec, - FeatureGPUL, FeatureFSqrt, FeatureSTFIWX, + 
FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX, Feature64Bit /*, Feature64BitRegs */]>; -def : Processor<"a2", PPCA2Itineraries, [DirectiveA2, FeatureBookE, - FeatureFSqrt, FeatureSTFIWX, - Feature64Bit - /*, Feature64BitRegs */]>; +def : Processor<"a2", PPCA2Itineraries, [DirectiveA2, FeatureBookE, + FeatureMFOCRF, FeatureFSqrt, + FeatureSTFIWX, FeatureISEL, + Feature64Bit + /*, Feature64BitRegs */]>; +def : Processor<"pwr6", G5Itineraries, + [DirectivePwr6, FeatureAltivec, + FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX, + Feature64Bit /*, Feature64BitRegs */]>; +def : Processor<"pwr7", G5Itineraries, + [DirectivePwr7, FeatureAltivec, + FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX, + FeatureISEL, Feature64Bit /*, Feature64BitRegs */]>; def : Processor<"ppc", G3Itineraries, [Directive32]>; def : Processor<"ppc64", G5Itineraries, [Directive64, FeatureAltivec, - FeatureGPUL, FeatureFSqrt, FeatureSTFIWX, + FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX, Feature64Bit /*, Feature64BitRegs */]>; diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp index fb7aa71..f76b89c 100644 --- a/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -22,8 +22,8 @@ #include "PPCSubtarget.h" #include "InstPrinter/PPCInstPrinter.h" #include "MCTargetDesc/PPCPredicates.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" #include "llvm/DerivedTypes.h" #include "llvm/Module.h" #include "llvm/Assembly/Writer.h" @@ -248,7 +248,9 @@ bool PPCAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, if (ExtraCode[1] != 0) return true; // Unknown modifier. switch (ExtraCode[0]) { - default: return true; // Unknown modifier. + default: + // See if this is a generic print operand + return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); case 'c': // Don't print "$" before a global var name or constant. break; // PPC never has a prefix. case 'L': // Write second word of DImode reference. @@ -451,11 +453,13 @@ void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) { "ppc750", "ppc970", "ppcA2", + "power6", + "power7", "ppc64" }; unsigned Directive = Subtarget.getDarwinDirective(); - if (Subtarget.isGigaProcessor() && Directive < PPC::DIR_970) + if (Subtarget.hasMFOCRF() && Directive < PPC::DIR_970) Directive = PPC::DIR_970; if (Subtarget.hasAltivec() && Directive < PPC::DIR_7400) Directive = PPC::DIR_7400; diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp index 5f775e1..21a0fb2 100644 --- a/lib/Target/PowerPC/PPCBranchSelector.cpp +++ b/lib/Target/PowerPC/PPCBranchSelector.cpp @@ -135,21 +135,33 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) { MBBStartOffset += 4; continue; } - + // Otherwise, we have to expand it to a long branch. - // The BCC operands are: - // 0. PPC branch predicate - // 1. CR register - // 2. Target MBB - PPC::Predicate Pred = (PPC::Predicate)I->getOperand(0).getImm(); - unsigned CRReg = I->getOperand(1).getReg(); - MachineInstr *OldBranch = I; DebugLoc dl = OldBranch->getDebugLoc(); - - // Jump over the uncond branch inst (i.e. $PC+8) on opposite condition. - BuildMI(MBB, I, dl, TII->get(PPC::BCC)) - .addImm(PPC::InvertPredicate(Pred)).addReg(CRReg).addImm(2); + + if (I->getOpcode() == PPC::BCC) { + // The BCC operands are: + // 0. PPC branch predicate + // 1. CR register + // 2. 
 Target MBB
+      PPC::Predicate Pred = (PPC::Predicate)I->getOperand(0).getImm();
+      unsigned CRReg = I->getOperand(1).getReg();
+
+      // Jump over the uncond branch inst (i.e. $PC+8) on opposite condition.
+      BuildMI(MBB, I, dl, TII->get(PPC::BCC))
+        .addImm(PPC::InvertPredicate(Pred)).addReg(CRReg).addImm(2);
+    } else if (I->getOpcode() == PPC::BDNZ) {
+      BuildMI(MBB, I, dl, TII->get(PPC::BDZ)).addImm(2);
+    } else if (I->getOpcode() == PPC::BDNZ8) {
+      BuildMI(MBB, I, dl, TII->get(PPC::BDZ8)).addImm(2);
+    } else if (I->getOpcode() == PPC::BDZ) {
+      BuildMI(MBB, I, dl, TII->get(PPC::BDNZ)).addImm(2);
+    } else if (I->getOpcode() == PPC::BDZ8) {
+      BuildMI(MBB, I, dl, TII->get(PPC::BDNZ8)).addImm(2);
+    } else {
+      llvm_unreachable("Unhandled branch type!");
+    }

       // Uncond branch to the real destination.
       I = BuildMI(MBB, I, dl, TII->get(PPC::B)).addMBB(Dest);
diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp
new file mode 100644
index 0000000..f50f9b5
--- /dev/null
+++ b/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -0,0 +1,721 @@
+//===-- PPCCTRLoops.cpp - Identify and generate CTR loops -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass identifies loops where we can generate the PPC branch instructions
+// that decrement and test the count register (CTR) (bdnz and friends).
+// This pass is based on the HexagonHardwareLoops pass.
+//
+// The pattern that defines the induction variable can change depending on
+// prior optimizations. For example, the IndVarSimplify phase run by 'opt'
+// normalizes induction variables, and the Loop Strength Reduction pass
+// run by 'llc' may also make changes to the induction variable.
+// The pattern detected by this phase is due to running Strength Reduction.
+//
+// Criteria for CTR loops:
+//  - Countable loops (w/ ind. var for a trip count).
+//  - Assumes loops are normalized by IndVarSimplify.
+//  - Try inner-most loops first.
+//  - No nested CTR loops.
+//  - No function calls in loops.
+//
+// Note: As with unconverted loops, PPCBranchSelector must be run after this
+// pass in order to convert long-displacement jumps into jump pairs.
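+//
+// For example (an illustrative sketch, not a test case from this patch),
+// a loop like
+//
+//   int sum(const int *a) {
+//     int s = 0;
+//     for (int i = 0; i != 100; ++i)  // countable trip count, no calls
+//       s += a[i];
+//     return s;
+//   }
+//
+// can be emitted as
+//
+//   li    r0, 100     ; materialize the trip count
+//   mtctr r0          ; move it into the count register
+// loop:
+//   ...loop body...
+//   bdnz  loop        ; decrement CTR, branch while non-zero
+//
+// so no general-purpose register is tied up by the induction variable.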
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ctrloops"
+#include "PPC.h"
+#include "PPCTargetMachine.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "llvm/Constants.h"
+#include "llvm/PassSupport.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include <algorithm>
+
+using namespace llvm;
+
+STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops");
+
+namespace {
+  class CountValue;
+  struct PPCCTRLoops : public MachineFunctionPass {
+    MachineLoopInfo *MLI;
+    MachineRegisterInfo *MRI;
+    const TargetInstrInfo *TII;
+
+  public:
+    static char ID;   // Pass identification, replacement for typeid
+
+    PPCCTRLoops() : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    const char *getPassName() const { return "PPC CTR Loops"; }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<MachineDominatorTree>();
+      AU.addPreserved<MachineDominatorTree>();
+      AU.addRequired<MachineLoopInfo>();
+      AU.addPreserved<MachineLoopInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+  private:
+    /// getCanonicalInductionVariable - Check to see if the loop has a
+    /// canonical induction variable.
+    /// Should be defined in MachineLoop. Based upon version in class Loop.
+    void getCanonicalInductionVariable(MachineLoop *L,
+                                       SmallVector<MachineInstr *, 4> &IVars,
+                                       SmallVector<MachineInstr *, 4> &IOps) const;
+
+    /// getTripCount - Return a loop-invariant LLVM register indicating the
+    /// number of times the loop will be executed. If the trip-count cannot
+    /// be determined, this returns null.
+    CountValue *getTripCount(MachineLoop *L,
+                             SmallVector<MachineInstr *, 2> &OldInsts) const;
+
+    /// isInductionOperation - Return true if the instruction matches the
+    /// pattern for an operation that defines an induction variable.
+    bool isInductionOperation(const MachineInstr *MI, unsigned IVReg) const;
+
+    /// isInvalidLoopOperation - Return true if the instruction is not valid
+    /// within a CTR loop.
+    bool isInvalidLoopOperation(const MachineInstr *MI) const;
+
+    /// containsInvalidInstruction - Return true if the loop contains an
+    /// instruction that inhibits using the CTR loop.
+    bool containsInvalidInstruction(MachineLoop *L) const;
+
+    /// convertToCTRLoop - Given a loop, check if we can convert it to a
+    /// CTR loop. If so, then perform the conversion and return true.
+    bool convertToCTRLoop(MachineLoop *L);
+
+    /// isDead - Return true if the instruction is now dead.
+    bool isDead(const MachineInstr *MI,
+                SmallVector<MachineInstr *, 1> &DeadPhis) const;
+
+    /// removeIfDead - Remove the instruction if it is now dead.
+    void removeIfDead(MachineInstr *MI);
+  };
+
+  char PPCCTRLoops::ID = 0;
+
+
+  // CountValue class - Abstraction for a trip count of a loop. A
+  // smaller version of the MachineOperand class without the concerns
+  // of changing the operand representation.
+  class CountValue {
+  public:
+    enum CountValueType {
+      CV_Register,
+      CV_Immediate
+    };
+  private:
+    CountValueType Kind;
+    union Values {
+      unsigned RegNum;
+      int64_t ImmVal;
+      Values(unsigned r) : RegNum(r) {}
+      Values(int64_t i) : ImmVal(i) {}
+    } Contents;
+    bool isNegative;
+
+  public:
+    CountValue(unsigned r, bool neg) : Kind(CV_Register), Contents(r),
+                                       isNegative(neg) {}
+    explicit CountValue(int64_t i) : Kind(CV_Immediate), Contents(i),
+                                     isNegative(i < 0) {}
+    CountValueType getType() const { return Kind; }
+    bool isReg() const { return Kind == CV_Register; }
+    bool isImm() const { return Kind == CV_Immediate; }
+    bool isNeg() const { return isNegative; }
+
+    unsigned getReg() const {
+      assert(isReg() && "Wrong CountValue accessor");
+      return Contents.RegNum;
+    }
+    void setReg(unsigned Val) {
+      Contents.RegNum = Val;
+    }
+    int64_t getImm() const {
+      assert(isImm() && "Wrong CountValue accessor");
+      if (isNegative) {
+        return -Contents.ImmVal;
+      }
+      return Contents.ImmVal;
+    }
+    void setImm(int64_t Val) {
+      Contents.ImmVal = Val;
+    }
+
+    void print(raw_ostream &OS, const TargetMachine *TM = 0) const {
+      if (isReg()) { OS << PrintReg(getReg()); }
+      if (isImm()) { OS << getImm(); }
+    }
+  };
+} // end anonymous namespace
+
+
+/// isCompareEqualsImm - Returns true if the instruction is a compare equals
+/// instruction with an immediate operand.
+static bool isCompareEqualsImm(const MachineInstr *MI, bool &SignedCmp) {
+  if (MI->getOpcode() == PPC::CMPWI || MI->getOpcode() == PPC::CMPDI) {
+    SignedCmp = true;
+    return true;
+  } else if (MI->getOpcode() == PPC::CMPLWI || MI->getOpcode() == PPC::CMPLDI) {
+    SignedCmp = false;
+    return true;
+  }
+
+  return false;
+}
+
+
+/// createPPCCTRLoops - Factory for creating
+/// the CTR loop phase.
+FunctionPass *llvm::createPPCCTRLoops() {
+  return new PPCCTRLoops();
+}
+
+
+bool PPCCTRLoops::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********* PPC CTR Loops *********\n");
+
+  bool Changed = false;
+
+  // get the loop information
+  MLI = &getAnalysis<MachineLoopInfo>();
+  // get the register information
+  MRI = &MF.getRegInfo();
+  // get the target-specific instruction info
+  TII = MF.getTarget().getInstrInfo();
+
+  for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end();
+       I != E; ++I) {
+    MachineLoop *L = *I;
+    if (!L->getParentLoop()) {
+      Changed |= convertToCTRLoop(L);
+    }
+  }
+
+  return Changed;
+}
+
+/// getCanonicalInductionVariable - Check to see if the loop has a canonical
+/// induction variable. We check for a simple recurrence pattern - an
+/// integer recurrence that decrements by one each time through the loop and
+/// ends at zero. If so, return the phi node that corresponds to it.
+///
+/// Based upon the similar code in LoopInfo except this code is specific to
+/// the machine.
+/// This method assumes that the IndVarSimplify pass has been run by 'opt'.
+///
+void
+PPCCTRLoops::getCanonicalInductionVariable(MachineLoop *L,
+                                  SmallVector<MachineInstr *, 4> &IVars,
+                                  SmallVector<MachineInstr *, 4> &IOps) const {
+  MachineBasicBlock *TopMBB = L->getTopBlock();
+  MachineBasicBlock::pred_iterator PI = TopMBB->pred_begin();
+  assert(PI != TopMBB->pred_end() &&
+         "Loop must have more than one incoming edge!");
+  MachineBasicBlock *Backedge = *PI++;
+  if (PI == TopMBB->pred_end()) return;  // dead loop
+  MachineBasicBlock *Incoming = *PI++;
+  if (PI != TopMBB->pred_end()) return;  // multiple backedges?
+
+  // Make sure there is one incoming and one backedge and determine which
+  // is which.
+  if (L->contains(Incoming)) {
+    if (L->contains(Backedge))
+      return;
+    std::swap(Incoming, Backedge);
+  } else if (!L->contains(Backedge))
+    return;
+
+  // Loop over all of the PHI nodes, looking for a canonical induction variable:
+  //   - The PHI node is "reg1 = PHI reg2, BB1, reg3, BB2".
+  //   - The recurrence comes from the backedge.
+  //   - The definition is an induction operation.
+  for (MachineBasicBlock::iterator I = TopMBB->begin(), E = TopMBB->end();
+       I != E && I->isPHI(); ++I) {
+    MachineInstr *MPhi = &*I;
+    unsigned DefReg = MPhi->getOperand(0).getReg();
+    for (unsigned i = 1; i != MPhi->getNumOperands(); i += 2) {
+      // Check each operand for the value from the backedge.
+      MachineBasicBlock *MBB = MPhi->getOperand(i+1).getMBB();
+      if (L->contains(MBB)) { // operand comes from the backedge
+        // Check if the definition is an induction operation.
+        MachineInstr *DI = MRI->getVRegDef(MPhi->getOperand(i).getReg());
+        if (isInductionOperation(DI, DefReg)) {
+          IOps.push_back(DI);
+          IVars.push_back(MPhi);
+        }
+      }
+    }
+  }
+  return;
+}
+
+/// getTripCount - Return a loop-invariant LLVM value indicating the
+/// number of times the loop will be executed. The trip count can
+/// be either a register or a constant value. If the trip-count
+/// cannot be determined, this returns null.
+///
+/// We find the trip count from the phi instruction that defines the
+/// induction variable. We follow the links to the CMP instruction
+/// to get the trip count.
+///
+/// Based upon getTripCount in LoopInfo.
+///
+CountValue *PPCCTRLoops::getTripCount(MachineLoop *L,
+                           SmallVector<MachineInstr *, 2> &OldInsts) const {
+  MachineBasicBlock *LastMBB = L->getExitingBlock();
+  // Don't generate a CTR loop if the loop has more than one exit.
+  if (LastMBB == 0)
+    return 0;
+
+  MachineBasicBlock::iterator LastI = LastMBB->getFirstTerminator();
+  if (LastI->getOpcode() != PPC::BCC)
+    return 0;
+
+  // We need to make sure that this compare is defining the condition
+  // register actually used by the terminating branch.
+
+  unsigned PredReg = LastI->getOperand(1).getReg();
+  DEBUG(dbgs() << "Examining loop with first terminator: " << *LastI);
+
+  unsigned PredCond = LastI->getOperand(0).getImm();
+  if (PredCond != PPC::PRED_EQ && PredCond != PPC::PRED_NE)
+    return 0;
+
+  // Check that the loop has an induction variable.
+  SmallVector<MachineInstr *, 4> IVars, IOps;
+  getCanonicalInductionVariable(L, IVars, IOps);
+  for (unsigned i = 0; i < IVars.size(); ++i) {
+    MachineInstr *IOp = IOps[i];
+    MachineInstr *IV_Inst = IVars[i];
+
+    // Canonical loops will end with a 'cmpwi/cmpdi cr, IV, Imm':
+    //   if Imm is 0, get the count from the PHI operand;
+    //   if Imm is -M, then M is the count;
+    //   otherwise, Imm is the count.
+    MachineOperand *IV_Opnd;
+    const MachineOperand *InitialValue;
+    if (!L->contains(IV_Inst->getOperand(2).getMBB())) {
+      InitialValue = &IV_Inst->getOperand(1);
+      IV_Opnd = &IV_Inst->getOperand(3);
+    } else {
+      InitialValue = &IV_Inst->getOperand(3);
+      IV_Opnd = &IV_Inst->getOperand(1);
+    }
+
+    DEBUG(dbgs() << "Considering:\n");
+    DEBUG(dbgs() << "  induction operation: " << *IOp);
+    DEBUG(dbgs() << "  induction variable: " << *IV_Inst);
+    DEBUG(dbgs() << "  initial value: " << *InitialValue << "\n");
+
+    // Look for the cmp instruction to determine if we
+    // can get a useful trip count. The trip count can
+    // be either a register or an immediate. The location
+    // of the value depends upon the type (reg or imm).
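+    // A worked example (illustrative only, not from a real test case):
+    // given
+    //   %init = LI 0                    ; InitialValue
+    //   %iv   = PHI %init, <ph>, %next, <latch>
+    //   %next = ADDI %iv, 1             ; iv_value == 1
+    //   CMPWI %cr0, %next, 100          ; ImmVal == 100
+    // the loop runs (100 - 0) / 1 = 100 times, so the trip count is the
+    // immediate 100.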
+    while ((IV_Opnd = IV_Opnd->getNextOperandForReg())) {
+      bool SignedCmp;
+      MachineInstr *MI = IV_Opnd->getParent();
+      if (L->contains(MI) && isCompareEqualsImm(MI, SignedCmp) &&
+          MI->getOperand(0).getReg() == PredReg) {
+
+        OldInsts.push_back(MI);
+        OldInsts.push_back(IOp);
+
+        DEBUG(dbgs() << "  compare: " << *MI);
+
+        const MachineOperand &MO = MI->getOperand(2);
+        assert(MO.isImm() && "IV Cmp Operand should be an immediate");
+
+        int64_t ImmVal;
+        if (SignedCmp)
+          ImmVal = (short) MO.getImm();
+        else
+          ImmVal = MO.getImm();
+
+        const MachineInstr *IV_DefInstr = MRI->getVRegDef(IV_Opnd->getReg());
+        assert(L->contains(IV_DefInstr->getParent()) &&
+               "IV definition should occur in loop");
+        int64_t iv_value = (short) IV_DefInstr->getOperand(2).getImm();
+
+        assert(InitialValue->isReg() && "Expecting register for init value");
+        unsigned InitialValueReg = InitialValue->getReg();
+
+        const MachineInstr *DefInstr = MRI->getVRegDef(InitialValueReg);
+
+        // Here we need to look for an immediate load (an li or lis/ori pair).
+        if (DefInstr && (DefInstr->getOpcode() == PPC::ORI8 ||
+                         DefInstr->getOpcode() == PPC::ORI)) {
+          int64_t start = (short) DefInstr->getOperand(2).getImm();
+          const MachineInstr *DefInstr2 =
+            MRI->getVRegDef(DefInstr->getOperand(0).getReg());
+          if (DefInstr2 && (DefInstr2->getOpcode() == PPC::LIS8 ||
+                            DefInstr2->getOpcode() == PPC::LIS)) {
+            DEBUG(dbgs() << "  initial constant: " << *DefInstr);
+            DEBUG(dbgs() << "  initial constant: " << *DefInstr2);
+
+            start |= int64_t(short(DefInstr2->getOperand(1).getImm())) << 16;
+
+            int64_t count = ImmVal - start;
+            if ((count % iv_value) != 0) {
+              return 0;
+            }
+            return new CountValue(count/iv_value);
+          }
+        } else if (DefInstr && (DefInstr->getOpcode() == PPC::LI8 ||
+                                DefInstr->getOpcode() == PPC::LI)) {
+          DEBUG(dbgs() << "  initial constant: " << *DefInstr);
+
+          int64_t count = ImmVal -
+            int64_t(short(DefInstr->getOperand(1).getImm()));
+          if ((count % iv_value) != 0) {
+            return 0;
+          }
+          return new CountValue(count/iv_value);
+        } else if (iv_value == 1 || iv_value == -1) {
+          // We can't determine a constant starting value.
+          if (ImmVal == 0) {
+            return new CountValue(InitialValueReg, iv_value > 0);
+          }
+          // FIXME: handle non-zero end value.
+        }
+        // FIXME: handle non-unit increments (we might not want to introduce
+        // division but we can handle some 2^n cases with shifts).
+
+      }
+    }
+  }
+  return 0;
+}
+
+/// isInductionOperation - Return true if the operation matches the
+/// pattern that defines an induction variable:
+///   addi iv, c
+///
+bool
+PPCCTRLoops::isInductionOperation(const MachineInstr *MI,
+                                  unsigned IVReg) const {
+  return ((MI->getOpcode() == PPC::ADDI || MI->getOpcode() == PPC::ADDI8) &&
+          MI->getOperand(1).isReg() && // could be a frame index instead
+          MI->getOperand(1).getReg() == IVReg);
+}
+
+/// isInvalidLoopOperation - Return true if the operation is invalid within
+/// a CTR loop.
+bool +PPCCTRLoops::isInvalidLoopOperation(const MachineInstr *MI) const { + + // call is not allowed because the callee may use a CTR loop + if (MI->getDesc().isCall()) { + return true; + } + // check if the instruction defines a CTR loop register + // (this will also catch nested CTR loops) + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (MO.isReg() && MO.isDef() && + (MO.getReg() == PPC::CTR || MO.getReg() == PPC::CTR8)) { + return true; + } + } + return false; +} + +/// containsInvalidInstruction - Return true if the loop contains +/// an instruction that inhibits the use of the CTR loop function. +/// +bool PPCCTRLoops::containsInvalidInstruction(MachineLoop *L) const { + const std::vector<MachineBasicBlock*> Blocks = L->getBlocks(); + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + MachineBasicBlock *MBB = Blocks[i]; + for (MachineBasicBlock::iterator + MII = MBB->begin(), E = MBB->end(); MII != E; ++MII) { + const MachineInstr *MI = &*MII; + if (isInvalidLoopOperation(MI)) { + return true; + } + } + } + return false; +} + +/// isDead returns true if the instruction is dead +/// (this was essentially copied from DeadMachineInstructionElim::isDead, but +/// with special cases for inline asm, physical registers and instructions with +/// side effects removed) +bool PPCCTRLoops::isDead(const MachineInstr *MI, + SmallVector<MachineInstr *, 1> &DeadPhis) const { + // Examine each operand. + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (MO.isReg() && MO.isDef()) { + unsigned Reg = MO.getReg(); + if (!MRI->use_nodbg_empty(Reg)) { + // This instruction has users, but if the only user is the phi node for the + // parent block, and the only use of that phi node is this instruction, then + // this instruction is dead: both it (and the phi node) can be removed. + MachineRegisterInfo::use_iterator I = MRI->use_begin(Reg); + if (llvm::next(I) == MRI->use_end() && + I.getOperand().getParent()->isPHI()) { + MachineInstr *OnePhi = I.getOperand().getParent(); + + for (unsigned j = 0, f = OnePhi->getNumOperands(); j != f; ++j) { + const MachineOperand &OPO = OnePhi->getOperand(j); + if (OPO.isReg() && OPO.isDef()) { + unsigned OPReg = OPO.getReg(); + + MachineRegisterInfo::use_iterator nextJ; + for (MachineRegisterInfo::use_iterator J = MRI->use_begin(OPReg), + E = MRI->use_end(); J!=E; J=nextJ) { + nextJ = llvm::next(J); + MachineOperand& Use = J.getOperand(); + MachineInstr *UseMI = Use.getParent(); + + if (MI != UseMI) { + // The phi node has a user that is not MI, bail... + return false; + } + } + } + } + + DeadPhis.push_back(OnePhi); + } else { + // This def has a non-debug use. Don't delete the instruction! + return false; + } + } + } + } + + // If there are no defs with uses, the instruction is dead. + return true; +} + +void PPCCTRLoops::removeIfDead(MachineInstr *MI) { + // This procedure was essentially copied from DeadMachineInstructionElim + + SmallVector<MachineInstr *, 1> DeadPhis; + if (isDead(MI, DeadPhis)) { + DEBUG(dbgs() << "CTR looping will remove: " << *MI); + + // It is possible that some DBG_VALUE instructions refer to this + // instruction. Examine each def operand for such references; + // if found, mark the DBG_VALUE as undef (but don't delete it). 
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg() || !MO.isDef())
+        continue;
+      unsigned Reg = MO.getReg();
+      MachineRegisterInfo::use_iterator nextI;
+      for (MachineRegisterInfo::use_iterator I = MRI->use_begin(Reg),
+           E = MRI->use_end(); I != E; I = nextI) {
+        nextI = llvm::next(I);  // I is invalidated by the setReg
+        MachineOperand &Use = I.getOperand();
+        MachineInstr *UseMI = Use.getParent();
+        if (UseMI == MI)
+          continue;
+        if (Use.isDebug()) // This might also be an instr -> phi -> instr
+                           // case, which can also be removed.
+          UseMI->getOperand(0).setReg(0U);
+      }
+    }
+
+    MI->eraseFromParent();
+    for (unsigned i = 0; i < DeadPhis.size(); ++i) {
+      DeadPhis[i]->eraseFromParent();
+    }
+  }
+}
+
+/// convertToCTRLoop - Check if the loop is a candidate for
+/// converting to a CTR loop. If so, then perform the
+/// transformation.
+///
+/// This function works on innermost loops first. A loop can
+/// be converted if it is a counting loop; the trip count can be
+/// either a register value or an immediate.
+///
+/// The code makes several assumptions about the representation
+/// of the loop in llvm.
+bool PPCCTRLoops::convertToCTRLoop(MachineLoop *L) {
+  bool Changed = false;
+  // Process nested loops first.
+  for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
+    Changed |= convertToCTRLoop(*I);
+  }
+  // If a nested loop has been converted, then we can't convert this loop.
+  if (Changed) {
+    return Changed;
+  }
+
+  SmallVector<MachineInstr *, 2> OldInsts;
+  // Are we able to determine the trip count for the loop?
+  CountValue *TripCount = getTripCount(L, OldInsts);
+  if (TripCount == 0) {
+    DEBUG(dbgs() << "failed to get trip count!\n");
+    return false;
+  }
+  // Does the loop contain any invalid instructions?
+  if (containsInvalidInstruction(L)) {
+    return false;
+  }
+  MachineBasicBlock *Preheader = L->getLoopPreheader();
+  // No preheader means there's no place for the loop instr.
+  if (Preheader == 0) {
+    return false;
+  }
+  MachineBasicBlock::iterator InsertPos = Preheader->getFirstTerminator();
+
+  DebugLoc dl;
+  if (InsertPos != Preheader->end())
+    dl = InsertPos->getDebugLoc();
+
+  MachineBasicBlock *LastMBB = L->getExitingBlock();
+  // Don't generate a CTR loop if the loop has more than one exit.
+  if (LastMBB == 0) {
+    return false;
+  }
+  MachineBasicBlock::iterator LastI = LastMBB->getFirstTerminator();
+
+  // Determine the loop start.
+  MachineBasicBlock *LoopStart = L->getTopBlock();
+  if (L->getLoopLatch() != LastMBB) {
+    // When the exit and latch are not the same, use the latch block as the
+    // start.
+    // The loop start address is used only after the 1st iteration, and the
+    // loop latch may contain instrs. that need to be executed after the
+    // 1st iter.
+    LoopStart = L->getLoopLatch();
+    // Make sure the latch is a successor of the exit, otherwise it won't work.
+    if (!LastMBB->isSuccessor(LoopStart)) {
+      return false;
+    }
+  }
+
+  // Convert the loop to a CTR loop.
+  DEBUG(dbgs() << "Change to CTR loop at "; L->dump());
+
+  MachineFunction *MF = LastMBB->getParent();
+  const PPCSubtarget &Subtarget = MF->getTarget().getSubtarget<PPCSubtarget>();
+  bool isPPC64 = Subtarget.isPPC64();
+
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+  const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+  const TargetRegisterClass *RC = isPPC64 ? G8RC : GPRC;
+
+  unsigned CountReg;
+  if (TripCount->isReg()) {
+    // Create a copy of the loop count register.
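+    // (On PPC64, if the count was computed in a 32-bit GPR, the copy below
+    // is emitted as EXTSW_32_64 so the full 64-bit CTR receives a
+    // sign-extended value.)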
+    const TargetRegisterClass *SrcRC =
+      MF->getRegInfo().getRegClass(TripCount->getReg());
+    CountReg = MF->getRegInfo().createVirtualRegister(RC);
+    unsigned CopyOp = (isPPC64 && SrcRC == GPRC) ?
+                        (unsigned) PPC::EXTSW_32_64 :
+                        (unsigned) TargetOpcode::COPY;
+    BuildMI(*Preheader, InsertPos, dl,
+            TII->get(CopyOp), CountReg).addReg(TripCount->getReg());
+    if (TripCount->isNeg()) {
+      unsigned CountReg1 = CountReg;
+      CountReg = MF->getRegInfo().createVirtualRegister(RC);
+      BuildMI(*Preheader, InsertPos, dl,
+              TII->get(isPPC64 ? PPC::NEG8 : PPC::NEG),
+              CountReg).addReg(CountReg1);
+    }
+  } else {
+    assert(TripCount->isImm() && "Expecting immediate value for trip count");
+    // Put the trip count in a register for transfer into the count register.
+
+    int64_t CountImm = TripCount->getImm();
+    assert(!TripCount->isNeg() && "Constant trip count must be positive");
+
+    CountReg = MF->getRegInfo().createVirtualRegister(RC);
+    if (CountImm > 0xFFFF) {
+      BuildMI(*Preheader, InsertPos, dl,
+              TII->get(isPPC64 ? PPC::LIS8 : PPC::LIS),
+              CountReg).addImm(CountImm >> 16);
+      unsigned CountReg1 = CountReg;
+      CountReg = MF->getRegInfo().createVirtualRegister(RC);
+      BuildMI(*Preheader, InsertPos, dl,
+              TII->get(isPPC64 ? PPC::ORI8 : PPC::ORI),
+              CountReg).addReg(CountReg1).addImm(CountImm & 0xFFFF);
+    } else {
+      BuildMI(*Preheader, InsertPos, dl,
+              TII->get(isPPC64 ? PPC::LI8 : PPC::LI),
+              CountReg).addImm(CountImm);
+    }
+  }
+
+  // Add the mtctr instruction to the beginning of the loop.
+  BuildMI(*Preheader, InsertPos, dl,
+          TII->get(isPPC64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(CountReg,
+            TripCount->isImm() ? RegState::Kill : 0);
+
+  // Make sure the loop start always has a reference in the CFG. We need
+  // to create a BlockAddress operand to get this mechanism to work; both
+  // the MachineBasicBlock and BasicBlock objects need the flag set.
+  LoopStart->setHasAddressTaken();
+  // This line is needed to set the hasAddressTaken flag on the BasicBlock
+  // object.
+  BlockAddress::get(const_cast<BasicBlock *>(LoopStart->getBasicBlock()));
+
+  // Replace the loop branch with a bdnz instruction.
+  dl = LastI->getDebugLoc();
+  const std::vector<MachineBasicBlock*> Blocks = L->getBlocks();
+  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+    MachineBasicBlock *MBB = Blocks[i];
+    if (MBB != Preheader)
+      MBB->addLiveIn(isPPC64 ? PPC::CTR8 : PPC::CTR);
+  }
+
+  // The loop ends with either:
+  //  - a conditional branch followed by an unconditional branch, or
+  //  - a conditional branch to the loop start.
+  assert(LastI->getOpcode() == PPC::BCC &&
+         "loop end must start with a BCC instruction");
+  // Either the BCC branches to the beginning of the loop, or it
+  // branches out of the loop and there is an unconditional branch
+  // to the start of the loop.
+  MachineBasicBlock *BranchTarget = LastI->getOperand(2).getMBB();
+  BuildMI(*LastMBB, LastI, dl,
+          TII->get((BranchTarget == LoopStart) ?
+                   (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
+                   (isPPC64 ? PPC::BDZ8 : PPC::BDZ))).addMBB(BranchTarget);
+
+  // Conditional branch; just delete it.
+  DEBUG(dbgs() << "Removing old branch: " << *LastI);
+  LastMBB->erase(LastI);
+
+  delete TripCount;
+
+  // The induction operation (add) and the comparison (cmpwi) may now be
+  // unneeded. If these are unneeded, then remove them.
+ for (unsigned i = 0; i < OldInsts.size(); ++i) + removeIfDead(OldInsts[i]); + + ++NumCTRLoops; + return true; +} + diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index b77a80b..c24afa9 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -330,6 +330,8 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const { BuildMI(MBB, MBBI, dl, TII.get(PPC::MFLR), PPC::R0); if (HasFP) + // FIXME: On PPC32 SVR4, FPOffset is negative and access to negative + // offsets of R1 is not allowed. BuildMI(MBB, MBBI, dl, TII.get(PPC::STW)) .addReg(PPC::R31) .addImm(FPOffset) @@ -366,9 +368,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const { BuildMI(MBB, MBBI, dl, TII.get(PPC::SUBFIC) ,PPC::R0) .addReg(PPC::R0, RegState::Kill) .addImm(NegFrameSize); - BuildMI(MBB, MBBI, dl, TII.get(PPC::STWUX)) + BuildMI(MBB, MBBI, dl, TII.get(PPC::STWUX), PPC::R1) .addReg(PPC::R1, RegState::Kill) - .addReg(PPC::R1, RegState::Define) + .addReg(PPC::R1) .addReg(PPC::R0); } else if (isInt<16>(NegFrameSize)) { BuildMI(MBB, MBBI, dl, TII.get(PPC::STWU), PPC::R1) @@ -381,9 +383,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const { BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI), PPC::R0) .addReg(PPC::R0, RegState::Kill) .addImm(NegFrameSize & 0xFFFF); - BuildMI(MBB, MBBI, dl, TII.get(PPC::STWUX)) + BuildMI(MBB, MBBI, dl, TII.get(PPC::STWUX), PPC::R1) .addReg(PPC::R1, RegState::Kill) - .addReg(PPC::R1, RegState::Define) + .addReg(PPC::R1) .addReg(PPC::R0); } } else { // PPC64. @@ -399,9 +401,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const { BuildMI(MBB, MBBI, dl, TII.get(PPC::SUBFIC8), PPC::X0) .addReg(PPC::X0) .addImm(NegFrameSize); - BuildMI(MBB, MBBI, dl, TII.get(PPC::STDUX)) + BuildMI(MBB, MBBI, dl, TII.get(PPC::STDUX), PPC::X1) .addReg(PPC::X1, RegState::Kill) - .addReg(PPC::X1, RegState::Define) + .addReg(PPC::X1) .addReg(PPC::X0); } else if (isInt<16>(NegFrameSize)) { BuildMI(MBB, MBBI, dl, TII.get(PPC::STDU), PPC::X1) @@ -414,9 +416,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const { BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI8), PPC::X0) .addReg(PPC::X0, RegState::Kill) .addImm(NegFrameSize & 0xFFFF); - BuildMI(MBB, MBBI, dl, TII.get(PPC::STDUX)) + BuildMI(MBB, MBBI, dl, TII.get(PPC::STDUX), PPC::X1) .addReg(PPC::X1, RegState::Kill) - .addReg(PPC::X1, RegState::Define) + .addReg(PPC::X1) .addReg(PPC::X0); } } @@ -492,7 +494,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const { // This is a bit of a hack: CR2LT, CR2GT, CR2EQ and CR2UN are just // subregisters of CR2. We just need to emit a move of CR2. 
- if (PPC::CRBITRCRegisterClass->contains(Reg)) + if (PPC::CRBITRCRegClass.contains(Reg)) continue; MachineLocation CSDst(MachineLocation::VirtualFP, Offset); @@ -817,7 +819,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF) for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); - if (PPC::GPRCRegisterClass->contains(Reg)) { + if (PPC::GPRCRegClass.contains(Reg)) { HasGPSaveArea = true; GPRegs.push_back(CSI[i]); @@ -825,7 +827,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF) if (Reg < MinGPR) { MinGPR = Reg; } - } else if (PPC::G8RCRegisterClass->contains(Reg)) { + } else if (PPC::G8RCRegClass.contains(Reg)) { HasG8SaveArea = true; G8Regs.push_back(CSI[i]); @@ -833,7 +835,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF) if (Reg < MinG8R) { MinG8R = Reg; } - } else if (PPC::F8RCRegisterClass->contains(Reg)) { + } else if (PPC::F8RCRegClass.contains(Reg)) { HasFPSaveArea = true; FPRegs.push_back(CSI[i]); @@ -842,12 +844,12 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF) MinFPR = Reg; } // FIXME SVR4: Disable CR save area for now. - } else if (PPC::CRBITRCRegisterClass->contains(Reg) - || PPC::CRRCRegisterClass->contains(Reg)) { + } else if (PPC::CRBITRCRegClass.contains(Reg) || + PPC::CRRCRegClass.contains(Reg)) { // HasCRSaveArea = true; - } else if (PPC::VRSAVERCRegisterClass->contains(Reg)) { + } else if (PPC::VRSAVERCRegClass.contains(Reg)) { HasVRSAVESaveArea = true; - } else if (PPC::VRRCRegisterClass->contains(Reg)) { + } else if (PPC::VRRCRegClass.contains(Reg)) { HasVRSaveArea = true; VRegs.push_back(CSI[i]); @@ -932,8 +934,8 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF) for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); - if (PPC::CRBITRCRegisterClass->contains(Reg) || - PPC::CRRCRegisterClass->contains(Reg)) { + if (PPC::CRBITRCRegClass.contains(Reg) || + PPC::CRRCRegClass.contains(Reg)) { int FI = CSI[i].getFrameIdx(); FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI)); @@ -950,7 +952,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF) for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); - if (PPC::VRSAVERCRegisterClass->contains(Reg)) { + if (PPC::VRSAVERCRegClass.contains(Reg)) { int FI = CSI[i].getFrameIdx(); FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI)); diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 5a04888..a00f686 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -111,6 +111,23 @@ namespace { /// immediate field. Because preinc imms have already been validated, just /// accept it. bool SelectAddrImmOffs(SDValue N, SDValue &Out) const { + if (isa<ConstantSDNode>(N) || N.getOpcode() == PPCISD::Lo || + N.getOpcode() == ISD::TargetGlobalAddress) { + Out = N; + return true; + } + + return false; + } + + /// SelectAddrIdxOffs - Return true if the operand is valid for a preinc + /// index field. Because preinc imms have already been validated, just + /// accept it. 
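+ /// This is the complement of SelectAddrImmOffs above: it accepts exactly
+ /// the operands (non-constant, non-Lo, non-global) that the immediate
+ /// form rejects, so the two forms never both match.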
+ bool SelectAddrIdxOffs(SDValue N, SDValue &Out) const { + if (isa<ConstantSDNode>(N) || N.getOpcode() == PPCISD::Lo || + N.getOpcode() == ISD::TargetGlobalAddress) + return false; + Out = N; return true; } @@ -238,11 +255,11 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() { DebugLoc dl; if (PPCLowering.getPointerTy() == MVT::i32) { - GlobalBaseReg = RegInfo->createVirtualRegister(PPC::GPRCRegisterClass); + GlobalBaseReg = RegInfo->createVirtualRegister(&PPC::GPRCRegClass); BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR)); BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg); } else { - GlobalBaseReg = RegInfo->createVirtualRegister(PPC::G8RCRegisterClass); + GlobalBaseReg = RegInfo->createVirtualRegister(&PPC::G8RCRegClass); BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR8)); BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR8), GlobalBaseReg); } @@ -697,7 +714,7 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) { CCReg = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, CR7Reg, CCReg, InFlag).getValue(1); - if (PPCSubTarget.isGigaProcessor() && OtherCondIdx == -1) + if (PPCSubTarget.hasMFOCRF() && OtherCondIdx == -1) IntCR = SDValue(CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, CR7Reg, CCReg), 0); else @@ -833,7 +850,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { case PPCISD::MFCR: { SDValue InFlag = N->getOperand(1); // Use MFOCRF if supported. - if (PPCSubTarget.isGigaProcessor()) + if (PPCSubTarget.hasMFOCRF()) return CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, N->getOperand(0), InFlag); else @@ -915,12 +932,44 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { SDValue Chain = LD->getChain(); SDValue Base = LD->getBasePtr(); SDValue Ops[] = { Offset, Base, Chain }; - // FIXME: PPC64 return CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0), PPCLowering.getPointerTy(), MVT::Other, Ops, 3); } else { - llvm_unreachable("R+R preindex loads not supported yet!"); + unsigned Opcode; + bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD; + if (LD->getValueType(0) != MVT::i64) { + // Handle PPC32 integer and normal FP loads. + assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load"); + switch (LoadedVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Invalid PPC load type!"); + case MVT::f64: Opcode = PPC::LFDUX; break; + case MVT::f32: Opcode = PPC::LFSUX; break; + case MVT::i32: Opcode = PPC::LWZUX; break; + case MVT::i16: Opcode = isSExt ? PPC::LHAUX : PPC::LHZUX; break; + case MVT::i1: + case MVT::i8: Opcode = PPC::LBZUX; break; + } + } else { + assert(LD->getValueType(0) == MVT::i64 && "Unknown load result type!"); + assert((!isSExt || LoadedVT == MVT::i16 || LoadedVT == MVT::i32) && + "Invalid sext update load"); + switch (LoadedVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Invalid PPC load type!"); + case MVT::i64: Opcode = PPC::LDUX; break; + case MVT::i32: Opcode = isSExt ? PPC::LWAUX : PPC::LWZUX8; break; + case MVT::i16: Opcode = isSExt ? 
PPC::LHAUX8 : PPC::LHZUX8; break; + case MVT::i1: + case MVT::i8: Opcode = PPC::LBZUX8; break; + } + } + + SDValue Chain = LD->getChain(); + SDValue Base = LD->getBasePtr(); + SDValue Ops[] = { Offset, Base, Chain }; + return CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0), + PPCLowering.getPointerTy(), + MVT::Other, Ops, 3); } } diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 3b24951..13250b3 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -51,9 +51,11 @@ static bool CC_PPC_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, ISD::ArgFlagsTy &ArgFlags, CCState &State); -static cl::opt<bool> EnablePPCPreinc("enable-ppc-preinc", -cl::desc("enable preincrement load/store generation on PPC (experimental)"), - cl::Hidden); +static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc", +cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden); + +static cl::opt<bool> DisableILPPref("disable-ppc-ilp-pref", +cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hidden); static TargetLoweringObjectFile *CreateTLOF(const PPCTargetMachine &TM) { if (TM.getSubtargetImpl()->isDarwin()) @@ -64,6 +66,7 @@ static TargetLoweringObjectFile *CreateTLOF(const PPCTargetMachine &TM) { PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) : TargetLowering(TM, CreateTLOF(TM)), PPCSubTarget(*TM.getSubtargetImpl()) { + const PPCSubtarget *Subtarget = &TM.getSubtarget<PPCSubtarget>(); setPow2DivIsCheap(); @@ -73,12 +76,13 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // On PPC32/64, arguments smaller than 4/8 bytes are extended, so all // arguments are at least 4/8 bytes aligned. - setMinStackArgumentAlignment(TM.getSubtarget<PPCSubtarget>().isPPC64() ? 8:4); + bool isPPC64 = Subtarget->isPPC64(); + setMinStackArgumentAlignment(isPPC64 ? 8:4); // Set up the register classes. 
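+ // (i32/f32/f64 are registered unconditionally; MVT::i64 is added further
+ // down, only when use64BitRegs() is true.)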
- addRegisterClass(MVT::i32, PPC::GPRCRegisterClass); - addRegisterClass(MVT::f32, PPC::F4RCRegisterClass); - addRegisterClass(MVT::f64, PPC::F8RCRegisterClass); + addRegisterClass(MVT::i32, &PPC::GPRCRegClass); + addRegisterClass(MVT::f32, &PPC::F4RCRegClass); + addRegisterClass(MVT::f64, &PPC::F8RCRegClass); // PowerPC has an i16 but no i8 (or i1) SEXTLOAD setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); @@ -130,17 +134,17 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::FCOS , MVT::f64, Expand); setOperationAction(ISD::FREM , MVT::f64, Expand); setOperationAction(ISD::FPOW , MVT::f64, Expand); - setOperationAction(ISD::FMA , MVT::f64, Expand); + setOperationAction(ISD::FMA , MVT::f64, Legal); setOperationAction(ISD::FSIN , MVT::f32, Expand); setOperationAction(ISD::FCOS , MVT::f32, Expand); setOperationAction(ISD::FREM , MVT::f32, Expand); setOperationAction(ISD::FPOW , MVT::f32, Expand); - setOperationAction(ISD::FMA , MVT::f32, Expand); + setOperationAction(ISD::FMA , MVT::f32, Legal); setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); // If we're enabling GP optimizations, use hardware square root - if (!TM.getSubtarget<PPCSubtarget>().hasFSQRT()) { + if (!Subtarget->hasFSQRT()) { setOperationAction(ISD::FSQRT, MVT::f64, Expand); setOperationAction(ISD::FSQRT, MVT::f32, Expand); } @@ -226,8 +230,8 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); - if (TM.getSubtarget<PPCSubtarget>().isSVR4ABI()) { - if (TM.getSubtarget<PPCSubtarget>().isPPC64()) { + if (Subtarget->isSVR4ABI()) { + if (isPPC64) { // VAARG always uses double-word chunks, so promote anything smaller. setOperationAction(ISD::VAARG, MVT::i1, Promote); AddPromotedToType (ISD::VAARG, MVT::i1, MVT::i64); @@ -271,7 +275,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setCondCodeAction(ISD::SETONE, MVT::f32, Expand); setCondCodeAction(ISD::SETONE, MVT::f64, Expand); - if (TM.getSubtarget<PPCSubtarget>().has64BitSupport()) { + if (Subtarget->has64BitSupport()) { // They also have instructions for converting between i64 and fp. setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand); @@ -290,9 +294,9 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); } - if (TM.getSubtarget<PPCSubtarget>().use64BitRegs()) { + if (Subtarget->use64BitRegs()) { // 64-bit PowerPC implementations can support i64 types directly - addRegisterClass(MVT::i64, PPC::G8RCRegisterClass); + addRegisterClass(MVT::i64, &PPC::G8RCRegClass); // BUILD_PAIR can't be handled natively, and should be expanded to shl/or setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand); // 64-bit PowerPC wants to expand i128 shifts itself. @@ -306,7 +310,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); } - if (TM.getSubtarget<PPCSubtarget>().hasAltivec()) { + if (Subtarget->hasAltivec()) { // First set operation action for all vector types to expand. Then we // will selectively turn on ones that can be effectively codegen'd. 
for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; @@ -370,12 +374,13 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::SELECT, MVT::v4i32, Expand); setOperationAction(ISD::STORE , MVT::v4i32, Legal); - addRegisterClass(MVT::v4f32, PPC::VRRCRegisterClass); - addRegisterClass(MVT::v4i32, PPC::VRRCRegisterClass); - addRegisterClass(MVT::v8i16, PPC::VRRCRegisterClass); - addRegisterClass(MVT::v16i8, PPC::VRRCRegisterClass); + addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass); + addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass); + addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass); + addRegisterClass(MVT::v16i8, &PPC::VRRCRegClass); setOperationAction(ISD::MUL, MVT::v4f32, Legal); + setOperationAction(ISD::FMA, MVT::v4f32, Legal); setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v8i16, Custom); setOperationAction(ISD::MUL, MVT::v16i8, Custom); @@ -389,7 +394,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); } - if (TM.getSubtarget<PPCSubtarget>().has64BitSupport()) + if (Subtarget->has64BitSupport()) setOperationAction(ISD::PREFETCH, MVT::Other, Legal); setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand); @@ -398,7 +403,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setBooleanContents(ZeroOrOneBooleanContent); setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct? - if (TM.getSubtarget<PPCSubtarget>().isPPC64()) { + if (isPPC64) { setStackPointerRegisterToSaveRestore(PPC::X1); setExceptionPointerRegister(PPC::X3); setExceptionSelectorRegister(PPC::X4); @@ -415,7 +420,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setTargetDAGCombine(ISD::BSWAP); // Darwin long double math library functions have $LDBL128 appended. - if (TM.getSubtarget<PPCSubtarget>().isDarwin()) { + if (Subtarget->isDarwin()) { setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128"); setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128"); setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128"); @@ -432,6 +437,11 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) if (PPCSubTarget.isDarwin()) setPrefFunctionAlignment(4); + if (isPPC64 && Subtarget->isJITCodeModel()) + // Temporary workaround for the inability of PPC64 JIT to handle jump + // tables. + setSupportJumpTables(false); + setInsertFencesForAtomic(true); setSchedulingPreference(Sched::Hybrid); @@ -902,10 +912,11 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, return true; // [r+i] } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) { // Match LOAD (ADD (X, Lo(G))). - assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue() + assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue() && "Cannot handle constant offsets yet!"); Disp = N.getOperand(1).getOperand(0); // The global address. 
assert(Disp.getOpcode() == ISD::TargetGlobalAddress || + Disp.getOpcode() == ISD::TargetGlobalTLSAddress || Disp.getOpcode() == ISD::TargetConstantPool || Disp.getOpcode() == ISD::TargetJumpTable); Base = N.getOperand(0); @@ -1006,7 +1017,7 @@ bool PPCTargetLowering::SelectAddressRegImmShift(SDValue N, SDValue &Disp, if (N.getOpcode() == ISD::ADD) { short imm = 0; if (isIntS16Immediate(N.getOperand(1), imm) && (imm & 3) == 0) { - Disp = DAG.getTargetConstant(((int)imm & 0xFFFF) >> 2, MVT::i32); + Disp = DAG.getTargetConstant(((int)imm & 0xFFFF) >> 2, MVT::i32); if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) { Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); } else { @@ -1015,7 +1026,7 @@ bool PPCTargetLowering::SelectAddressRegImmShift(SDValue N, SDValue &Disp, return true; // [r+i] } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) { // Match LOAD (ADD (X, Lo(G))). - assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue() + assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue() && "Cannot handle constant offsets yet!"); Disp = N.getOperand(1).getOperand(0); // The global address. assert(Disp.getOpcode() == ISD::TargetGlobalAddress || @@ -1084,8 +1095,7 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { - // Disabled by default for now. - if (!EnablePPCPreinc) return false; + if (DisablePPCPreinc) return false; SDValue Ptr; EVT VT; @@ -1103,7 +1113,10 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, if (VT.isVector()) return false; - // TODO: Check reg+reg first. + if (SelectAddressRegReg(Ptr, Offset, Base, DAG)) { + AM = ISD::PRE_INC; + return true; + } // LDU/STU use reg+imm*4, others use reg+imm. if (VT != MVT::i64) { @@ -1222,6 +1235,30 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op, return LowerLabelRef(TgtBAHi, TgtBALo, isPIC, DAG); } +SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, + SelectionDAG &DAG) const { + + GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + DebugLoc dl = GA->getDebugLoc(); + const GlobalValue *GV = GA->getGlobal(); + EVT PtrVT = getPointerTy(); + bool is64bit = PPCSubTarget.isPPC64(); + + TLSModel::Model model = getTargetMachine().getTLSModel(GV); + + SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, + PPCII::MO_TPREL16_HA); + SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, + PPCII::MO_TPREL16_LO); + + if (model != TLSModel::LocalExec) + llvm_unreachable("only local-exec TLS mode supported"); + SDValue TLSReg = DAG.getRegister(is64bit ? PPC::X13 : PPC::R2, + is64bit ? 
MVT::i64 : MVT::i32); + SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg); + return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi); +} + SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = Op.getValueType(); @@ -1440,13 +1477,16 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, Entry.Node = Nest; Args.push_back(Entry); // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg) - std::pair<SDValue, SDValue> CallResult = - LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()), - false, false, false, false, 0, CallingConv::C, + TargetLowering::CallLoweringInfo CLI(Chain, + Type::getVoidTy(*DAG.getContext()), + false, false, false, false, 0, + CallingConv::C, /*isTailCall=*/false, - /*doesNotRet=*/false, /*isReturnValueUsed=*/true, + /*doesNotRet=*/false, + /*isReturnValueUsed=*/true, DAG.getExternalSymbol("__trampoline_setup", PtrVT), Args, DAG, dl); + std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); return CallResult.second; } @@ -1702,7 +1742,7 @@ PPCTargetLowering::LowerFormalArguments_SVR4( // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); // Reserve space for the linkage area on the stack. CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize); @@ -1721,19 +1761,19 @@ PPCTargetLowering::LowerFormalArguments_SVR4( default: llvm_unreachable("ValVT not supported by formal arguments Lowering"); case MVT::i32: - RC = PPC::GPRCRegisterClass; + RC = &PPC::GPRCRegClass; break; case MVT::f32: - RC = PPC::F4RCRegisterClass; + RC = &PPC::F4RCRegClass; break; case MVT::f64: - RC = PPC::F8RCRegisterClass; + RC = &PPC::F8RCRegClass; break; case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v4f32: - RC = PPC::VRRCRegisterClass; + RC = &PPC::VRRCRegClass; break; } @@ -1763,7 +1803,7 @@ PPCTargetLowering::LowerFormalArguments_SVR4( // caller's stack frame, right above the parameter list area. SmallVector<CCValAssign, 16> ByValArgLocs; CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ByValArgLocs, *DAG.getContext()); + getTargetMachine(), ByValArgLocs, *DAG.getContext()); // Reserve stack space for the allocations in CCInfo. CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); @@ -2743,7 +2783,7 @@ PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, SmallVector<CCValAssign, 16> RVLocs; CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC); // Copy all of the result registers out of their specified physreg. 
@@ -2800,7 +2840,7 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl, if (DAG.getMachineFunction().getRegInfo().liveout_empty()) { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_PPC); for (unsigned i = 0; i != RVLocs.size(); ++i) DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg()); @@ -2864,14 +2904,19 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl, } SDValue -PPCTargetLowering::LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, - bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, +PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + DebugLoc &dl = CLI.DL; + SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; + SmallVector<SDValue, 32> &OutVals = CLI.OutVals; + SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &isTailCall = CLI.IsTailCall; + CallingConv::ID CallConv = CLI.CallConv; + bool isVarArg = CLI.IsVarArg; + if (isTailCall) isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, Ins, DAG); @@ -2921,7 +2966,7 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee, // Assign locations to all of the outgoing arguments. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); // Reserve space for the linkage area on the stack. CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize); @@ -2961,7 +3006,7 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee, // Assign locations to all of the outgoing aggregate by value arguments. SmallVector<CCValAssign, 16> ByValArgLocs; CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ByValArgLocs, *DAG.getContext()); + getTargetMachine(), ByValArgLocs, *DAG.getContext()); // Reserve stack space for the allocations in CCInfo. 
CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); @@ -3485,7 +3530,7 @@ PPCTargetLowering::LowerReturn(SDValue Chain, SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_PPC); // If this is the first return lowered for this function, add the regs to the @@ -4559,7 +4604,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); - case ISD::GlobalTLSAddress: llvm_unreachable("TLS not implemented for PPC"); + case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); @@ -4899,11 +4944,37 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineFunction *F = BB->getParent(); - if (MI->getOpcode() == PPC::SELECT_CC_I4 || - MI->getOpcode() == PPC::SELECT_CC_I8 || - MI->getOpcode() == PPC::SELECT_CC_F4 || - MI->getOpcode() == PPC::SELECT_CC_F8 || - MI->getOpcode() == PPC::SELECT_CC_VRRC) { + if (PPCSubTarget.hasISEL() && (MI->getOpcode() == PPC::SELECT_CC_I4 || + MI->getOpcode() == PPC::SELECT_CC_I8)) { + unsigned OpCode = MI->getOpcode() == PPC::SELECT_CC_I8 ? + PPC::ISEL8 : PPC::ISEL; + unsigned SelectPred = MI->getOperand(4).getImm(); + DebugLoc dl = MI->getDebugLoc(); + + // The SelectPred is ((BI << 5) | BO) for a BCC + unsigned BO = SelectPred & 0xF; + assert((BO == 12 || BO == 4) && "invalid predicate BO field for isel"); + + unsigned TrueOpNo, FalseOpNo; + if (BO == 12) { + TrueOpNo = 2; + FalseOpNo = 3; + } else { + TrueOpNo = 3; + FalseOpNo = 2; + SelectPred = PPC::InvertPredicate((PPC::Predicate)SelectPred); + } + + BuildMI(*BB, MI, dl, TII->get(OpCode), MI->getOperand(0).getReg()) + .addReg(MI->getOperand(TrueOpNo).getReg()) + .addReg(MI->getOperand(FalseOpNo).getReg()) + .addImm(SelectPred).addReg(MI->getOperand(1).getReg()); + } else if (MI->getOpcode() == PPC::SELECT_CC_I4 || + MI->getOpcode() == PPC::SELECT_CC_I8 || + MI->getOpcode() == PPC::SELECT_CC_F4 || + MI->getOpcode() == PPC::SELECT_CC_F8 || + MI->getOpcode() == PPC::SELECT_CC_VRRC) { + // The incoming instruction knows the destination vreg to set, the // condition code register to branch on, the true/false values to @@ -5612,18 +5683,18 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, case 'b': // R1-R31 case 'r': // R0-R31 if (VT == MVT::i64 && PPCSubTarget.isPPC64()) - return std::make_pair(0U, PPC::G8RCRegisterClass); - return std::make_pair(0U, PPC::GPRCRegisterClass); + return std::make_pair(0U, &PPC::G8RCRegClass); + return std::make_pair(0U, &PPC::GPRCRegClass); case 'f': if (VT == MVT::f32) - return std::make_pair(0U, PPC::F4RCRegisterClass); - else if (VT == MVT::f64) - return std::make_pair(0U, PPC::F8RCRegisterClass); + return std::make_pair(0U, &PPC::F4RCRegClass); + if (VT == MVT::f64) + return std::make_pair(0U, &PPC::F8RCRegClass); break; case 'v': - return std::make_pair(0U, PPC::VRRCRegisterClass); + return std::make_pair(0U, &PPC::VRRCRegClass); case 'y': // crrc - return std::make_pair(0U, PPC::CRRCRegisterClass); + return std::make_pair(0U, &PPC::CRRCRegClass); } } @@ -5839,11 +5910,30 @@ 
EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, } } +/// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than +/// a pair of mul and add instructions. fmuladd intrinsics will be expanded to +/// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd +/// is expanded to mul + add. +bool PPCTargetLowering::isFMAFasterThanMulAndAdd(EVT VT) const { + if (!VT.isSimple()) + return false; + + switch (VT.getSimpleVT().SimpleTy) { + case MVT::f32: + case MVT::f64: + case MVT::v4f32: + return true; + default: + break; + } + + return false; +} + Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const { - unsigned Directive = PPCSubTarget.getDarwinDirective(); - if (Directive == PPC::DIR_440 || Directive == PPC::DIR_A2) - return Sched::ILP; + if (DisableILPPref) + return TargetLowering::getSchedulingPreference(N); - return TargetLowering::getSchedulingPreference(N); + return Sched::ILP; } diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 18eb072..b0a013b 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -366,6 +366,12 @@ namespace llvm { bool IsZeroVal, bool MemcpyStrSrc, MachineFunction &MF) const; + /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than + /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to + /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd + /// is expanded to mul + add. + virtual bool isFMAFasterThanMulAndAdd(EVT VT) const; + private: SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const; SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const; @@ -389,6 +395,7 @@ namespace llvm { SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; @@ -439,12 +446,7 @@ namespace llvm { SmallVectorImpl<SDValue> &InVals) const; virtual SDValue - LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, - bool isVarArg, bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const; virtual bool diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index 7f67a41..91c5366 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -68,15 +68,15 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR8] in { // Convenient aliases for call instructions let Uses = [RM] in { def BL8_Darwin : IForm<18, 0, 1, - (outs), (ins calltarget:$func, variable_ops), + (outs), (ins calltarget:$func), "bl $func", BrB, []>; // See Pat patterns below. 
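+// Note: variable_ops is being dropped from the call and tail-call
+// definitions throughout this file; the variadic operands of a call are
+// attached to the MachineInstr dynamically, so declaring them in the ins
+// dag appears to have been redundant.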
def BLA8_Darwin : IForm<18, 1, 1,
- (outs), (ins aaddr:$func, variable_ops),
+ (outs), (ins aaddr:$func),
"bla $func", BrB, [(PPCcall_Darwin (i64 imm:$func))]>;
}
let Uses = [CTR8, RM] in {
def BCTRL8_Darwin : XLForm_2_ext<19, 528, 20, 0, 1,
- (outs), (ins variable_ops),
+ (outs), (ins),
"bctrl", BrB, [(PPCbctrl_Darwin)]>, Requires<[In64BitMode]>;
}
@@ -88,27 +88,27 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR8] in {
// Convenient aliases for call instructions
let Uses = [RM] in {
def BL8_ELF : IForm<18, 0, 1,
- (outs), (ins calltarget:$func, variable_ops),
+ (outs), (ins calltarget:$func),
"bl $func", BrB, []>; // See Pat patterns below.
let isCodeGenOnly = 1 in
def BL8_NOP_ELF : IForm_and_DForm_4_zero<18, 0, 1, 24,
- (outs), (ins calltarget:$func, variable_ops),
+ (outs), (ins calltarget:$func),
"bl $func\n\tnop", BrB, []>;
def BLA8_ELF : IForm<18, 1, 1,
- (outs), (ins aaddr:$func, variable_ops),
+ (outs), (ins aaddr:$func),
"bla $func", BrB, [(PPCcall_SVR4 (i64 imm:$func))]>;
let isCodeGenOnly = 1 in
def BLA8_NOP_ELF : IForm_and_DForm_4_zero<18, 1, 1, 24,
- (outs), (ins aaddr:$func, variable_ops),
+ (outs), (ins aaddr:$func),
"bla $func\n\tnop", BrB,
[(PPCcall_nop_SVR4 (i64 imm:$func))]>;
}
let Uses = [X11, CTR8, RM] in {
def BCTRL8_ELF : XLForm_2_ext<19, 528, 20, 0, 1,
- (outs), (ins variable_ops),
+ (outs), (ins),
"bctrl", BrB, [(PPCbctrl_SVR4)]>, Requires<[In64BitMode]>;
}
@@ -180,17 +180,17 @@ def STDCX : XForm_1<31, 214, (outs), (ins G8RC:$rS, memrr:$dst),
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
def TCRETURNdi8 :Pseudo< (outs),
- (ins calltarget:$dst, i32imm:$offset, variable_ops),
+ (ins calltarget:$dst, i32imm:$offset),
"#TC_RETURNd8 $dst $offset",
[]>;
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
-def TCRETURNai8 :Pseudo<(outs), (ins aaddr:$func, i32imm:$offset, variable_ops),
+def TCRETURNai8 :Pseudo<(outs), (ins aaddr:$func, i32imm:$offset),
"#TC_RETURNa8 $func $offset",
[(PPCtc_return (i64 imm:$func), imm:$offset)]>;
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
-def TCRETURNri8 : Pseudo<(outs), (ins CTRRC8:$dst, i32imm:$offset, variable_ops),
+def TCRETURNri8 : Pseudo<(outs), (ins CTRRC8:$dst, i32imm:$offset),
"#TC_RETURNr8 $dst $offset",
[]>;
@@ -229,6 +229,15 @@ def : Pat<(PPCtc_return (i64 texternalsym:$dst), imm:$imm),
def : Pat<(PPCtc_return CTRRC8:$dst, imm:$imm),
(TCRETURNri8 CTRRC8:$dst, imm:$imm)>;
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
+ let Defs = [CTR8], Uses = [CTR8] in {
+ def BDZ8 : IForm_ext<16, 18, 0, 0, (outs), (ins condbrtarget:$dst),
+ "bdz $dst", BrB, []>;
+ def BDNZ8 : IForm_ext<16, 16, 0, 0, (outs), (ins condbrtarget:$dst),
+ "bdnz $dst", BrB, []>;
+ }
+}
+
// 64-bit CR instructions
def MTCRF8 : XFXForm_5<31, 144, (outs crbitm:$FXM), (ins G8RC:$rS),
"mtcrf $FXM, $rS", BrMCRX>,
@@ -278,45 +287,37 @@ def MFLR8 : XFXForm_1_ext<31, 339, 8, (outs G8RC:$rT), (ins),
let PPC970_Unit = 1 in { // FXU Operations.
-// Copies, extends, truncates.
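+// The OR-based register-copy idioms below are removed; zext/anyext/trunc
+// between GPRC and G8RC are now expressed with INSERT_SUBREG/EXTRACT_SUBREG
+// patterns (see the Pat rules later in this file).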
-def OR4To8 : XForm_6<31, 444, (outs G8RC:$rA), (ins GPRC:$rS, GPRC:$rB), - "or $rA, $rS, $rB", IntGeneral, - []>; -def OR8To4 : XForm_6<31, 444, (outs GPRC:$rA), (ins G8RC:$rS, G8RC:$rB), - "or $rA, $rS, $rB", IntGeneral, - []>; - def LI8 : DForm_2_r0<14, (outs G8RC:$rD), (ins symbolLo64:$imm), - "li $rD, $imm", IntGeneral, + "li $rD, $imm", IntSimple, [(set G8RC:$rD, immSExt16:$imm)]>; def LIS8 : DForm_2_r0<15, (outs G8RC:$rD), (ins symbolHi64:$imm), - "lis $rD, $imm", IntGeneral, + "lis $rD, $imm", IntSimple, [(set G8RC:$rD, imm16ShiftedSExt:$imm)]>; // Logical ops. def NAND8: XForm_6<31, 476, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB), - "nand $rA, $rS, $rB", IntGeneral, + "nand $rA, $rS, $rB", IntSimple, [(set G8RC:$rA, (not (and G8RC:$rS, G8RC:$rB)))]>; def AND8 : XForm_6<31, 28, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB), - "and $rA, $rS, $rB", IntGeneral, + "and $rA, $rS, $rB", IntSimple, [(set G8RC:$rA, (and G8RC:$rS, G8RC:$rB))]>; def ANDC8: XForm_6<31, 60, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB), - "andc $rA, $rS, $rB", IntGeneral, + "andc $rA, $rS, $rB", IntSimple, [(set G8RC:$rA, (and G8RC:$rS, (not G8RC:$rB)))]>; def OR8 : XForm_6<31, 444, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB), - "or $rA, $rS, $rB", IntGeneral, + "or $rA, $rS, $rB", IntSimple, [(set G8RC:$rA, (or G8RC:$rS, G8RC:$rB))]>; def NOR8 : XForm_6<31, 124, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB), - "nor $rA, $rS, $rB", IntGeneral, + "nor $rA, $rS, $rB", IntSimple, [(set G8RC:$rA, (not (or G8RC:$rS, G8RC:$rB)))]>; def ORC8 : XForm_6<31, 412, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB), - "orc $rA, $rS, $rB", IntGeneral, + "orc $rA, $rS, $rB", IntSimple, [(set G8RC:$rA, (or G8RC:$rS, (not G8RC:$rB)))]>; def EQV8 : XForm_6<31, 284, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB), - "eqv $rA, $rS, $rB", IntGeneral, + "eqv $rA, $rS, $rB", IntSimple, [(set G8RC:$rA, (not (xor G8RC:$rS, G8RC:$rB)))]>; def XOR8 : XForm_6<31, 316, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB), - "xor $rA, $rS, $rB", IntGeneral, + "xor $rA, $rS, $rB", IntSimple, [(set G8RC:$rA, (xor G8RC:$rS, G8RC:$rB))]>; // Logical ops with immediate. 
@@ -329,20 +330,20 @@ def ANDISo8 : DForm_4<29, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2), [(set G8RC:$dst, (and G8RC:$src1,imm16ShiftedZExt:$src2))]>, isDOT; def ORI8 : DForm_4<24, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2), - "ori $dst, $src1, $src2", IntGeneral, + "ori $dst, $src1, $src2", IntSimple, [(set G8RC:$dst, (or G8RC:$src1, immZExt16:$src2))]>; def ORIS8 : DForm_4<25, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2), - "oris $dst, $src1, $src2", IntGeneral, + "oris $dst, $src1, $src2", IntSimple, [(set G8RC:$dst, (or G8RC:$src1, imm16ShiftedZExt:$src2))]>; def XORI8 : DForm_4<26, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2), - "xori $dst, $src1, $src2", IntGeneral, + "xori $dst, $src1, $src2", IntSimple, [(set G8RC:$dst, (xor G8RC:$src1, immZExt16:$src2))]>; def XORIS8 : DForm_4<27, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2), - "xoris $dst, $src1, $src2", IntGeneral, + "xoris $dst, $src1, $src2", IntSimple, [(set G8RC:$dst, (xor G8RC:$src1, imm16ShiftedZExt:$src2))]>; def ADD8 : XOForm_1<31, 266, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB), - "add $rT, $rA, $rB", IntGeneral, + "add $rT, $rA, $rB", IntSimple, [(set G8RC:$rT, (add G8RC:$rA, G8RC:$rB))]>; let Defs = [CARRY] in { @@ -355,10 +356,13 @@ def ADDIC8 : DForm_2<12, (outs G8RC:$rD), (ins G8RC:$rA, s16imm64:$imm), [(set G8RC:$rD, (addc G8RC:$rA, immSExt16:$imm))]>; } def ADDI8 : DForm_2<14, (outs G8RC:$rD), (ins G8RC:$rA, s16imm64:$imm), - "addi $rD, $rA, $imm", IntGeneral, + "addi $rD, $rA, $imm", IntSimple, + [(set G8RC:$rD, (add G8RC:$rA, immSExt16:$imm))]>; +def ADDI8L : DForm_2<14, (outs G8RC:$rD), (ins G8RC:$rA, symbolLo64:$imm), + "addi $rD, $rA, $imm", IntSimple, [(set G8RC:$rD, (add G8RC:$rA, immSExt16:$imm))]>; def ADDIS8 : DForm_2<15, (outs G8RC:$rD), (ins G8RC:$rA, symbolHi64:$imm), - "addis $rD, $rA, $imm", IntGeneral, + "addis $rD, $rA, $imm", IntSimple, [(set G8RC:$rD, (add G8RC:$rA, imm16ShiftedSExt:$imm))]>; let Defs = [CARRY] in { @@ -374,7 +378,7 @@ def SUBF8 : XOForm_1<31, 40, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB), "subf $rT, $rA, $rB", IntGeneral, [(set G8RC:$rT, (sub G8RC:$rB, G8RC:$rA))]>; def NEG8 : XOForm_3<31, 104, 0, (outs G8RC:$rT), (ins G8RC:$rA), - "neg $rT, $rA", IntGeneral, + "neg $rT, $rA", IntSimple, [(set G8RC:$rT, (ineg G8RC:$rA))]>; let Uses = [CARRY], Defs = [CARRY] in { def ADDE8 : XOForm_1<31, 138, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB), @@ -427,21 +431,21 @@ def SRAD : XForm_6<31, 794, (outs G8RC:$rA), (ins G8RC:$rS, GPRC:$rB), } def EXTSB8 : XForm_11<31, 954, (outs G8RC:$rA), (ins G8RC:$rS), - "extsb $rA, $rS", IntGeneral, + "extsb $rA, $rS", IntSimple, [(set G8RC:$rA, (sext_inreg G8RC:$rS, i8))]>; def EXTSH8 : XForm_11<31, 922, (outs G8RC:$rA), (ins G8RC:$rS), - "extsh $rA, $rS", IntGeneral, + "extsh $rA, $rS", IntSimple, [(set G8RC:$rA, (sext_inreg G8RC:$rS, i16))]>; def EXTSW : XForm_11<31, 986, (outs G8RC:$rA), (ins G8RC:$rS), - "extsw $rA, $rS", IntGeneral, + "extsw $rA, $rS", IntSimple, [(set G8RC:$rA, (sext_inreg G8RC:$rS, i32))]>, isPPC64; /// EXTSW_32 - Just like EXTSW, but works on '32-bit' registers. 
def EXTSW_32 : XForm_11<31, 986, (outs GPRC:$rA), (ins GPRC:$rS), - "extsw $rA, $rS", IntGeneral, + "extsw $rA, $rS", IntSimple, [(set GPRC:$rA, (PPCextsw_32 GPRC:$rS))]>, isPPC64; def EXTSW_32_64 : XForm_11<31, 986, (outs G8RC:$rA), (ins GPRC:$rS), - "extsw $rA, $rS", IntGeneral, + "extsw $rA, $rS", IntSimple, [(set G8RC:$rA, (sext GPRC:$rS))]>, isPPC64; let Defs = [CARRY] in { @@ -493,6 +497,10 @@ def RLWINM8 : MForm_2<21, "rlwinm $rA, $rS, $SH, $MB, $ME", IntGeneral, []>; +def ISEL8 : AForm_1<31, 15, + (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB, pred:$cond), + "isel $rT, $rA, $rB, $cond", IntGeneral, + []>; } // End FXU Operations. @@ -529,6 +537,16 @@ def LHAU8 : DForm_1a<43, (outs G8RC:$rD, ptr_rc:$ea_result), (ins symbolLo:$disp NoEncode<"$ea_result">; // NO LWAU! +def LHAUX8 : XForm_1<31, 375, (outs G8RC:$rD, ptr_rc:$ea_result), + (ins memrr:$addr), + "lhaux $rD, $addr", LdStLoad, + []>, RegConstraint<"$addr.offreg = $ea_result">, + NoEncode<"$ea_result">; +def LWAUX : XForm_1<31, 375, (outs G8RC:$rD, ptr_rc:$ea_result), + (ins memrr:$addr), + "lwaux $rD, $addr", LdStLoad, + []>, RegConstraint<"$addr.offreg = $ea_result">, + NoEncode<"$ea_result">, isPPC64; } // Zero extending loads. @@ -568,6 +586,22 @@ def LWZU8 : DForm_1<33, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr), "lwzu $rD, $addr", LdStLoad, []>, RegConstraint<"$addr.reg = $ea_result">, NoEncode<"$ea_result">; + +def LBZUX8 : XForm_1<31, 119, (outs G8RC:$rD, ptr_rc:$ea_result), + (ins memrr:$addr), + "lbzux $rD, $addr", LdStLoad, + []>, RegConstraint<"$addr.offreg = $ea_result">, + NoEncode<"$ea_result">; +def LHZUX8 : XForm_1<31, 331, (outs G8RC:$rD, ptr_rc:$ea_result), + (ins memrr:$addr), + "lhzux $rD, $addr", LdStLoad, + []>, RegConstraint<"$addr.offreg = $ea_result">, + NoEncode<"$ea_result">; +def LWZUX8 : XForm_1<31, 55, (outs G8RC:$rD, ptr_rc:$ea_result), + (ins memrr:$addr), + "lwzux $rD, $addr", LdStLoad, + []>, RegConstraint<"$addr.offreg = $ea_result">, + NoEncode<"$ea_result">; } } @@ -603,6 +637,11 @@ def LDU : DSForm_1<58, 1, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memrix:$addr []>, RegConstraint<"$addr.reg = $ea_result">, isPPC64, NoEncode<"$ea_result">; +def LDUX : XForm_1<31, 53, (outs G8RC:$rD, ptr_rc:$ea_result), + (ins memrr:$addr), + "ldux $rD, $addr", LdStLoad, + []>, RegConstraint<"$addr.offreg = $ea_result">, + NoEncode<"$ea_result">, isPPC64; } def : Pat<(PPCload ixaddr:$src), @@ -660,6 +699,14 @@ def STHU8 : DForm_1a<45, (outs ptr_rc:$ea_res), (ins G8RC:$rS, iaddroff:$ptroff))]>, RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">; +def STWU8 : DForm_1a<37, (outs ptr_rc:$ea_res), (ins G8RC:$rS, + symbolLo:$ptroff, ptr_rc:$ptrreg), + "stwu $rS, $ptroff($ptrreg)", LdStStore, + [(set ptr_rc:$ea_res, + (pre_truncsti32 G8RC:$rS, ptr_rc:$ptrreg, + iaddroff:$ptroff))]>, + RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">; + def STDU : DSForm_1a<62, 1, (outs ptr_rc:$ea_res), (ins G8RC:$rS, s16immX4:$ptroff, ptr_rc:$ptrreg), "stdu $rS, $ptroff($ptrreg)", LdStSTD, @@ -668,10 +715,41 @@ def STDU : DSForm_1a<62, 1, (outs ptr_rc:$ea_res), (ins G8RC:$rS, RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">, isPPC64; -let mayStore = 1 in -def STDUX : XForm_8<31, 181, (outs), (ins G8RC:$rS, memrr:$dst), - "stdux $rS, $dst", LdStSTD, - []>, isPPC64; + +def STBUX8 : XForm_8<31, 247, (outs ptr_rc:$ea_res), + (ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg), + "stbux $rS, $ptroff, $ptrreg", LdStStore, + [(set ptr_rc:$ea_res, + (pre_truncsti8 G8RC:$rS, + ptr_rc:$ptrreg, xaddroff:$ptroff))]>, + 
RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; + +def STHUX8 : XForm_8<31, 439, (outs ptr_rc:$ea_res), + (ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg), + "sthux $rS, $ptroff, $ptrreg", LdStStore, + [(set ptr_rc:$ea_res, + (pre_truncsti16 G8RC:$rS, + ptr_rc:$ptrreg, xaddroff:$ptroff))]>, + RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; + +def STWUX8 : XForm_8<31, 183, (outs ptr_rc:$ea_res), + (ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg), + "stwux $rS, $ptroff, $ptrreg", LdStStore, + [(set ptr_rc:$ea_res, + (pre_truncsti32 G8RC:$rS, + ptr_rc:$ptrreg, xaddroff:$ptroff))]>, + RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; + +def STDUX : XForm_8<31, 181, (outs ptr_rc:$ea_res), + (ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg), + "stdux $rS, $ptroff, $ptrreg", LdStStore, + [(set ptr_rc:$ea_res, + (pre_store G8RC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>, + RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked, isPPC64; // STD_32/STDX_32 - Just like STD/STDX, but uses a '32-bit' input register. def STD_32 : DSForm_1<62, 0, (outs), (ins GPRC:$rT, memrix:$dst), @@ -706,11 +784,12 @@ def FCTIDZ : XForm_26<63, 815, (outs F8RC:$frD), (ins F8RC:$frB), // Extensions and truncates to/from 32-bit regs. def : Pat<(i64 (zext GPRC:$in)), - (RLDICL (OR4To8 GPRC:$in, GPRC:$in), 0, 32)>; + (RLDICL (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPRC:$in, sub_32), + 0, 32)>; def : Pat<(i64 (anyext GPRC:$in)), - (OR4To8 GPRC:$in, GPRC:$in)>; + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPRC:$in, sub_32)>; def : Pat<(i32 (trunc G8RC:$in)), - (OR8To4 G8RC:$in, G8RC:$in)>; + (EXTRACT_SUBREG G8RC:$in, sub_32)>; // Extending loads with i64 targets. def : Pat<(zextloadi1 iaddr:$src), @@ -765,6 +844,10 @@ def : Pat<(PPChi tjumptable:$in , 0), (LIS8 tjumptable:$in)>; def : Pat<(PPClo tjumptable:$in , 0), (LI8 tjumptable:$in)>; def : Pat<(PPChi tblockaddress:$in, 0), (LIS8 tblockaddress:$in)>; def : Pat<(PPClo tblockaddress:$in, 0), (LI8 tblockaddress:$in)>; +def : Pat<(PPChi tglobaltlsaddr:$g, G8RC:$in), + (ADDIS8 G8RC:$in, tglobaltlsaddr:$g)>; +def : Pat<(PPClo tglobaltlsaddr:$g, G8RC:$in), + (ADDI8L G8RC:$in, tglobaltlsaddr:$g)>; def : Pat<(add G8RC:$in, (PPChi tglobaladdr:$g, 0)), (ADDIS8 G8RC:$in, tglobaladdr:$g)>; def : Pat<(add G8RC:$in, (PPChi tconstpool:$g, 0)), diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td index 6c0f3d3..b0b8423 100644 --- a/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/lib/Target/PowerPC/PPCInstrAltivec.td @@ -274,15 +274,11 @@ let PPC970_Unit = 5 in { // VALU Operations. // VA-Form instructions. 3-input AltiVec ops. 
def VMADDFP : VAForm_1<46, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vC, VRRC:$vB), "vmaddfp $vD, $vA, $vC, $vB", VecFP, - [(set VRRC:$vD, (fadd (fmul VRRC:$vA, VRRC:$vC), - VRRC:$vB))]>, - Requires<[FPContractions]>; + [(set VRRC:$vD, (fma VRRC:$vA, VRRC:$vC, VRRC:$vB))]>; def VNMSUBFP: VAForm_1<47, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vC, VRRC:$vB), "vnmsubfp $vD, $vA, $vC, $vB", VecFP, - [(set VRRC:$vD, (fsub V_immneg0, - (fsub (fmul VRRC:$vA, VRRC:$vC), - VRRC:$vB)))]>, - Requires<[FPContractions]>; + [(set VRRC:$vD, (fneg (fma VRRC:$vA, VRRC:$vC, + (fneg VRRC:$vB))))]>; def VMHADDSHS : VA1a_Int<32, "vmhaddshs", int_ppc_altivec_vmhaddshs>; def VMHRADDSHS : VA1a_Int<33, "vmhraddshs", int_ppc_altivec_vmhraddshs>; diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td index d8e4b2b..a41a027 100644 --- a/lib/Target/PowerPC/PPCInstrFormats.td +++ b/lib/Target/PowerPC/PPCInstrFormats.td @@ -94,6 +94,12 @@ class IForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr, let Inst{31} = lk; } +class IForm_ext<bits<6> opcode, bits<5> bo, bit aa, bit lk, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, list<dag> pattern> + : IForm<opcode, aa, lk, OOL, IOL, asmstr, itin, pattern> { + let LI{0-4} = bo; +} + // 1.7.2 B-Form class BForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr> : I<opcode, OOL, IOL, asmstr, BrB> { diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index b45ada9..47f09dc 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -40,6 +40,10 @@ extern cl::opt<bool> DisablePPC64RS; using namespace llvm; +static cl:: +opt<bool> DisableCTRLoopAnal("disable-ppc-ctrloop-analysis", cl::Hidden, + cl::desc("Disable analysis for CTR loops")); + PPCInstrInfo::PPCInstrInfo(PPCTargetMachine &tm) : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP), TM(tm), RI(*TM.getSubtargetImpl(), *this) {} @@ -75,6 +79,22 @@ ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetPostRAHazardRecognizer( return new PPCScoreboardHazardRecognizer(II, DAG); } + +// Detect 32 -> 64-bit extensions where we may reuse the low sub-register. +bool PPCInstrInfo::isCoalescableExtInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SubIdx) const { + switch (MI.getOpcode()) { + default: return false; + case PPC::EXTSW: + case PPC::EXTSW_32_64: + SrcReg = MI.getOperand(1).getReg(); + DstReg = MI.getOperand(0).getReg(); + SubIdx = PPC::sub_32; + return true; + } +} + unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const { switch (MI->getOpcode()) { @@ -186,10 +206,14 @@ void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB, // Branch analysis. +// Note: If the condition register is set to CTR or CTR8 then this is a +// BDNZ (imm == 1) or BDZ (imm == 0) branch. bool PPCInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const { + bool isPPC64 = TM.getSubtargetImpl()->isPPC64(); + // If the block has no terminators, it just falls into the block after it. 
MachineBasicBlock::iterator I = MBB.end(); if (I == MBB.begin()) @@ -221,7 +245,30 @@ bool PPCInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, Cond.push_back(LastInst->getOperand(0)); Cond.push_back(LastInst->getOperand(1)); return false; + } else if (LastInst->getOpcode() == PPC::BDNZ8 || + LastInst->getOpcode() == PPC::BDNZ) { + if (!LastInst->getOperand(0).isMBB()) + return true; + if (DisableCTRLoopAnal) + return true; + TBB = LastInst->getOperand(0).getMBB(); + Cond.push_back(MachineOperand::CreateImm(1)); + Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR, + true)); + return false; + } else if (LastInst->getOpcode() == PPC::BDZ8 || + LastInst->getOpcode() == PPC::BDZ) { + if (!LastInst->getOperand(0).isMBB()) + return true; + if (DisableCTRLoopAnal) + return true; + TBB = LastInst->getOperand(0).getMBB(); + Cond.push_back(MachineOperand::CreateImm(0)); + Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR, + true)); + return false; } + // Otherwise, don't know what this is. return true; } @@ -245,6 +292,34 @@ bool PPCInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, Cond.push_back(SecondLastInst->getOperand(1)); FBB = LastInst->getOperand(0).getMBB(); return false; + } else if ((SecondLastInst->getOpcode() == PPC::BDNZ8 || + SecondLastInst->getOpcode() == PPC::BDNZ) && + LastInst->getOpcode() == PPC::B) { + if (!SecondLastInst->getOperand(0).isMBB() || + !LastInst->getOperand(0).isMBB()) + return true; + if (DisableCTRLoopAnal) + return true; + TBB = SecondLastInst->getOperand(0).getMBB(); + Cond.push_back(MachineOperand::CreateImm(1)); + Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR, + true)); + FBB = LastInst->getOperand(0).getMBB(); + return false; + } else if ((SecondLastInst->getOpcode() == PPC::BDZ8 || + SecondLastInst->getOpcode() == PPC::BDZ) && + LastInst->getOpcode() == PPC::B) { + if (!SecondLastInst->getOperand(0).isMBB() || + !LastInst->getOperand(0).isMBB()) + return true; + if (DisableCTRLoopAnal) + return true; + TBB = SecondLastInst->getOperand(0).getMBB(); + Cond.push_back(MachineOperand::CreateImm(0)); + Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR, + true)); + FBB = LastInst->getOperand(0).getMBB(); + return false; } // If the block ends with two PPC:Bs, handle it. The second one is not @@ -273,7 +348,9 @@ unsigned PPCInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { return 0; --I; } - if (I->getOpcode() != PPC::B && I->getOpcode() != PPC::BCC) + if (I->getOpcode() != PPC::B && I->getOpcode() != PPC::BCC && + I->getOpcode() != PPC::BDNZ8 && I->getOpcode() != PPC::BDNZ && + I->getOpcode() != PPC::BDZ8 && I->getOpcode() != PPC::BDZ) return 0; // Remove the branch. @@ -283,7 +360,9 @@ unsigned PPCInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { if (I == MBB.begin()) return 1; --I; - if (I->getOpcode() != PPC::BCC) + if (I->getOpcode() != PPC::BCC && + I->getOpcode() != PPC::BDNZ8 && I->getOpcode() != PPC::BDNZ && + I->getOpcode() != PPC::BDZ8 && I->getOpcode() != PPC::BDZ) return 1; // Remove the branch. @@ -301,10 +380,16 @@ PPCInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, assert((Cond.size() == 2 || Cond.size() == 0) && "PPC branch conditions have two components!"); + bool isPPC64 = TM.getSubtargetImpl()->isPPC64(); + // One-way branch. 
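+ // (When non-empty, Cond is either [predicate, CR-register] for a BCC, or
+ // [1|0, CTR/CTR8] as produced by AnalyzeBranch for bdnz/bdz branches.)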
if (FBB == 0) { if (Cond.empty()) // Unconditional branch BuildMI(&MBB, DL, get(PPC::B)).addMBB(TBB); + else if (Cond[1].getReg() == PPC::CTR || Cond[1].getReg() == PPC::CTR8) + BuildMI(&MBB, DL, get(Cond[0].getImm() ? + (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) : + (isPPC64 ? PPC::BDZ8 : PPC::BDZ))).addMBB(TBB); else // Conditional branch BuildMI(&MBB, DL, get(PPC::BCC)) .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB); @@ -312,8 +397,13 @@ PPCInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, } // Two-way Conditional Branch. - BuildMI(&MBB, DL, get(PPC::BCC)) - .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB); + if (Cond[1].getReg() == PPC::CTR || Cond[1].getReg() == PPC::CTR8) + BuildMI(&MBB, DL, get(Cond[0].getImm() ? + (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) : + (isPPC64 ? PPC::BDZ8 : PPC::BDZ))).addMBB(TBB); + else + BuildMI(&MBB, DL, get(PPC::BCC)) + .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB); BuildMI(&MBB, DL, get(PPC::B)).addMBB(FBB); return 2; } @@ -354,7 +444,7 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF, const TargetRegisterClass *RC, SmallVectorImpl<MachineInstr*> &NewMIs) const{ DebugLoc DL; - if (PPC::GPRCRegisterClass->hasSubClassEq(RC)) { + if (PPC::GPRCRegClass.hasSubClassEq(RC)) { if (SrcReg != PPC::LR) { NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STW)) .addReg(SrcReg, @@ -370,7 +460,7 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF, getKillRegState(isKill)), FrameIdx)); } - } else if (PPC::G8RCRegisterClass->hasSubClassEq(RC)) { + } else if (PPC::G8RCRegClass.hasSubClassEq(RC)) { if (SrcReg != PPC::LR8) { NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STD)) .addReg(SrcReg, @@ -386,17 +476,17 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF, getKillRegState(isKill)), FrameIdx)); } - } else if (PPC::F8RCRegisterClass->hasSubClassEq(RC)) { + } else if (PPC::F8RCRegClass.hasSubClassEq(RC)) { NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STFD)) .addReg(SrcReg, getKillRegState(isKill)), FrameIdx)); - } else if (PPC::F4RCRegisterClass->hasSubClassEq(RC)) { + } else if (PPC::F4RCRegClass.hasSubClassEq(RC)) { NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STFS)) .addReg(SrcReg, getKillRegState(isKill)), FrameIdx)); - } else if (PPC::CRRCRegisterClass->hasSubClassEq(RC)) { + } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) { if ((!DisablePPC32RS && !TM.getSubtargetImpl()->isPPC64()) || (!DisablePPC64RS && TM.getSubtargetImpl()->isPPC64())) { NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_CR)) @@ -438,7 +528,7 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF, getKillRegState(isKill)), FrameIdx)); } - } else if (PPC::CRBITRCRegisterClass->hasSubClassEq(RC)) { + } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) { // FIXME: We use CRi here because there is no mtcrf on a bit. Since the // backend currently only uses CR1EQ as an individual bit, this should // not cause any bug. If we need other uses of CR bits, the following @@ -470,9 +560,9 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF, Reg = PPC::CR7; return StoreRegToStackSlot(MF, Reg, isKill, FrameIdx, - PPC::CRRCRegisterClass, NewMIs); + &PPC::CRRCRegClass, NewMIs); - } else if (PPC::VRRCRegisterClass->hasSubClassEq(RC)) { + } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) { // We don't have indexed addressing for vector loads. 
Emit: // R0 = ADDI FI# // STVX VAL, 0, R0 @@ -522,7 +612,7 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL, unsigned DestReg, int FrameIdx, const TargetRegisterClass *RC, SmallVectorImpl<MachineInstr*> &NewMIs)const{ - if (PPC::GPRCRegisterClass->hasSubClassEq(RC)) { + if (PPC::GPRCRegClass.hasSubClassEq(RC)) { if (DestReg != PPC::LR) { NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LWZ), DestReg), FrameIdx)); @@ -531,7 +621,7 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL, PPC::R11), FrameIdx)); NewMIs.push_back(BuildMI(MF, DL, get(PPC::MTLR)).addReg(PPC::R11)); } - } else if (PPC::G8RCRegisterClass->hasSubClassEq(RC)) { + } else if (PPC::G8RCRegClass.hasSubClassEq(RC)) { if (DestReg != PPC::LR8) { NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LD), DestReg), FrameIdx)); @@ -540,13 +630,13 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL, PPC::X11), FrameIdx)); NewMIs.push_back(BuildMI(MF, DL, get(PPC::MTLR8)).addReg(PPC::X11)); } - } else if (PPC::F8RCRegisterClass->hasSubClassEq(RC)) { + } else if (PPC::F8RCRegClass.hasSubClassEq(RC)) { NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFD), DestReg), FrameIdx)); - } else if (PPC::F4RCRegisterClass->hasSubClassEq(RC)) { + } else if (PPC::F4RCRegClass.hasSubClassEq(RC)) { NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFS), DestReg), FrameIdx)); - } else if (PPC::CRRCRegisterClass->hasSubClassEq(RC)) { + } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) { if ((!DisablePPC32RS && !TM.getSubtargetImpl()->isPPC64()) || (!DisablePPC64RS && TM.getSubtargetImpl()->isPPC64())) { NewMIs.push_back(addFrameReference(BuildMI(MF, DL, @@ -578,7 +668,7 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL, PPC::MTCRF8 : PPC::MTCRF), DestReg) .addReg(ScratchReg)); } - } else if (PPC::CRBITRCRegisterClass->hasSubClassEq(RC)) { + } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) { unsigned Reg = 0; if (DestReg == PPC::CR0LT || DestReg == PPC::CR0GT || @@ -607,9 +697,9 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL, Reg = PPC::CR7; return LoadRegFromStackSlot(MF, DL, Reg, FrameIdx, - PPC::CRRCRegisterClass, NewMIs); + &PPC::CRRCRegClass, NewMIs); - } else if (PPC::VRRCRegisterClass->hasSubClassEq(RC)) { + } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) { // We don't have indexed addressing for vector loads. Emit: // R0 = ADDI FI# // Dest = LVX 0, R0 @@ -665,8 +755,11 @@ PPCInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, bool PPCInstrInfo:: ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { assert(Cond.size() == 2 && "Invalid PPC branch opcode!"); - // Leave the CR# the same, but invert the condition. - Cond[0].setImm(PPC::InvertPredicate((PPC::Predicate)Cond[0].getImm())); + if (Cond[1].getReg() == PPC::CTR8 || Cond[1].getReg() == PPC::CTR) + Cond[0].setImm(Cond[0].getImm() == 0 ? 1 : 0); + else + // Leave the CR# the same, but invert the condition. 
+ Cond[0].setImm(PPC::InvertPredicate((PPC::Predicate)Cond[0].getImm())); return false; } diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h index 7d49aa1..374213e 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.h +++ b/lib/Target/PowerPC/PPCInstrInfo.h @@ -92,6 +92,9 @@ public: CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const; + bool isCoalescableExtInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SubIdx) const; unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; unsigned isStoreToStackSlot(const MachineInstr *MI, diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index 748486c..f57f0c9 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -323,7 +323,7 @@ def memri : Operand<iPTR> { } def memrr : Operand<iPTR> { let PrintMethod = "printMemRegReg"; - let MIOperandInfo = (ops ptr_rc, ptr_rc); + let MIOperandInfo = (ops ptr_rc:$offreg, ptr_rc:$ptrreg); } def memrix : Operand<iPTR> { // memri where the imm is shifted 2 bits. let PrintMethod = "printMemRegImmShifted"; @@ -349,10 +349,10 @@ def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmShift", [], []>; // "std" /// This is just the offset part of iaddr, used for preinc. def iaddroff : ComplexPattern<iPTR, 1, "SelectAddrImmOffs", [], []>; +def xaddroff : ComplexPattern<iPTR, 1, "SelectAddrIdxOffs", [], []>; //===----------------------------------------------------------------------===// // PowerPC Instruction Predicate Definitions. -def FPContractions : Predicate<"!TM.Options.NoExcessFPPrecision">; def In32BitMode : Predicate<"!PPCSubTarget.isPPC64()">; def In64BitMode : Predicate<"PPCSubTarget.isPPC64()">; def IsBookE : Predicate<"PPCSubTarget.isBookE()">; @@ -438,6 +438,13 @@ let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in { def BCC : BForm<16, 0, 0, (outs), (ins pred:$cond, condbrtarget:$dst), "b${cond:cc} ${cond:reg}, $dst" /*[(PPCcondbranch CRRC:$crS, imm:$opc, bb:$dst)]*/>; + + let Defs = [CTR], Uses = [CTR] in { + def BDZ : IForm_ext<16, 18, 0, 0, (outs), (ins condbrtarget:$dst), + "bdz $dst", BrB, []>; + def BDNZ : IForm_ext<16, 16, 0, 0, (outs), (ins condbrtarget:$dst), + "bdnz $dst", BrB, []>; + } } // Darwin ABI Calls. @@ -445,15 +452,15 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR] in { // Convenient aliases for call instructions let Uses = [RM] in { def BL_Darwin : IForm<18, 0, 1, - (outs), (ins calltarget:$func, variable_ops), + (outs), (ins calltarget:$func), "bl $func", BrB, []>; // See Pat patterns below. def BLA_Darwin : IForm<18, 1, 1, - (outs), (ins aaddr:$func, variable_ops), + (outs), (ins aaddr:$func), "bla $func", BrB, [(PPCcall_Darwin (i32 imm:$func))]>; } let Uses = [CTR, RM] in { def BCTRL_Darwin : XLForm_2_ext<19, 528, 20, 0, 1, - (outs), (ins variable_ops), + (outs), (ins), "bctrl", BrB, [(PPCbctrl_Darwin)]>, Requires<[In32BitMode]>; } @@ -464,16 +471,16 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR] in { // Convenient aliases for call instructions let Uses = [RM] in { def BL_SVR4 : IForm<18, 0, 1, - (outs), (ins calltarget:$func, variable_ops), + (outs), (ins calltarget:$func), "bl $func", BrB, []>; // See Pat patterns below. 
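For context on the BDZ/BDNZ branches defined a few hunks above (under Defs = [CTR], Uses = [CTR]): they decrement CTR and branch on the result, so a counted loop needs no separate compare instruction. A standalone C++ sketch of the control flow a single bdnz implements; nothing here is PPC-specific, it only illustrates the semantics:

#include <cstdio>

int main() {
  unsigned ctr = 4;        // mtctr: the trip count is loaded into CTR
  do {
    std::printf("iteration\n");
  } while (--ctr != 0);    // bdnz: decrement CTR, branch while non-zero;
                           // bdz is the inverted sense, branching on zero
  return 0;
}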
def BLA_SVR4 : IForm<18, 1, 1, - (outs), (ins aaddr:$func, variable_ops), + (outs), (ins aaddr:$func), "bla $func", BrB, [(PPCcall_SVR4 (i32 imm:$func))]>; } let Uses = [CTR, RM] in { def BCTRL_SVR4 : XLForm_2_ext<19, 528, 20, 0, 1, - (outs), (ins variable_ops), + (outs), (ins), "bctrl", BrB, [(PPCbctrl_SVR4)]>, Requires<[In32BitMode]>; } @@ -482,18 +489,18 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR] in { let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in def TCRETURNdi :Pseudo< (outs), - (ins calltarget:$dst, i32imm:$offset, variable_ops), + (ins calltarget:$dst, i32imm:$offset), "#TC_RETURNd $dst $offset", []>; let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in -def TCRETURNai :Pseudo<(outs), (ins aaddr:$func, i32imm:$offset, variable_ops), +def TCRETURNai :Pseudo<(outs), (ins aaddr:$func, i32imm:$offset), "#TC_RETURNa $func $offset", [(PPCtc_return (i32 imm:$func), imm:$offset)]>; let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in -def TCRETURNri : Pseudo<(outs), (ins CTRRC:$dst, i32imm:$offset, variable_ops), +def TCRETURNri : Pseudo<(outs), (ins CTRRC:$dst, i32imm:$offset), "#TC_RETURNr $dst $offset", []>; @@ -704,6 +711,44 @@ def LFDU : DForm_1<51, (outs F8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr), "lfd $rD, $addr", LdStLFD, []>, RegConstraint<"$addr.reg = $ea_result">, NoEncode<"$ea_result">; + + +// Indexed (r+r) Loads with Update (preinc). +def LBZUX : XForm_1<31, 119, (outs GPRC:$rD, ptr_rc:$ea_result), + (ins memrr:$addr), + "lbzux $rD, $addr", LdStLoad, + []>, RegConstraint<"$addr.offreg = $ea_result">, + NoEncode<"$ea_result">; + +def LHAUX : XForm_1<31, 375, (outs GPRC:$rD, ptr_rc:$ea_result), + (ins memrr:$addr), + "lhaux $rD, $addr", LdStLoad, + []>, RegConstraint<"$addr.offreg = $ea_result">, + NoEncode<"$ea_result">; + +def LHZUX : XForm_1<31, 331, (outs GPRC:$rD, ptr_rc:$ea_result), + (ins memrr:$addr), + "lhzux $rD, $addr", LdStLoad, + []>, RegConstraint<"$addr.offreg = $ea_result">, + NoEncode<"$ea_result">; + +def LWZUX : XForm_1<31, 55, (outs GPRC:$rD, ptr_rc:$ea_result), + (ins memrr:$addr), + "lwzux $rD, $addr", LdStLoad, + []>, RegConstraint<"$addr.offreg = $ea_result">, + NoEncode<"$ea_result">; + +def LFSUX : XForm_1<31, 567, (outs F4RC:$rD, ptr_rc:$ea_result), + (ins memrr:$addr), + "lfsux $rD, $addr", LdStLoad, + []>, RegConstraint<"$addr.offreg = $ea_result">, + NoEncode<"$ea_result">; + +def LFDUX : XForm_1<31, 631, (outs F8RC:$rD, ptr_rc:$ea_result), + (ins memrr:$addr), + "lfdux $rD, $addr", LdStLoad, + []>, RegConstraint<"$addr.offreg = $ea_result">, + NoEncode<"$ea_result">; } } @@ -815,12 +860,49 @@ def STWX : XForm_8<31, 151, (outs), (ins GPRC:$rS, memrr:$dst), "stwx $rS, $dst", LdStStore, [(store GPRC:$rS, xaddr:$dst)]>, PPC970_DGroup_Cracked; - -let mayStore = 1 in { -def STWUX : XForm_8<31, 183, (outs), (ins GPRC:$rS, GPRC:$rA, GPRC:$rB), - "stwux $rS, $rA, $rB", LdStStore, - []>; -} + +def STBUX : XForm_8<31, 247, (outs ptr_rc:$ea_res), + (ins GPRC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg), + "stbux $rS, $ptroff, $ptrreg", LdStStore, + [(set ptr_rc:$ea_res, + (pre_truncsti8 GPRC:$rS, + ptr_rc:$ptrreg, xaddroff:$ptroff))]>, + RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; + +def STHUX : XForm_8<31, 439, (outs ptr_rc:$ea_res), + (ins GPRC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg), + "sthux $rS, $ptroff, $ptrreg", LdStStore, + [(set ptr_rc:$ea_res, + (pre_truncsti16 GPRC:$rS, + ptr_rc:$ptrreg, xaddroff:$ptroff))]>, + RegConstraint<"$ptroff 
= $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; + +def STWUX : XForm_8<31, 183, (outs ptr_rc:$ea_res), + (ins GPRC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg), + "stwux $rS, $ptroff, $ptrreg", LdStStore, + [(set ptr_rc:$ea_res, + (pre_store GPRC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>, + RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; + +def STFSUX : XForm_8<31, 695, (outs ptr_rc:$ea_res), + (ins F4RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg), + "stfsux $rS, $ptroff, $ptrreg", LdStStore, + [(set ptr_rc:$ea_res, + (pre_store F4RC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>, + RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; + +def STFDUX : XForm_8<31, 759, (outs ptr_rc:$ea_res), + (ins F8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg), + "stfdux $rS, $ptroff, $ptrreg", LdStStore, + [(set ptr_rc:$ea_res, + (pre_store F8RC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>, + RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; + def STHBRX: XForm_8<31, 918, (outs), (ins GPRC:$rS, memrr:$dst), "sthbrx $rS, $dst", LdStStore, [(PPCstbrx GPRC:$rS, xoaddr:$dst, i16)]>, @@ -852,7 +934,10 @@ def SYNC : XForm_24_sync<31, 598, (outs), (ins), let PPC970_Unit = 1 in { // FXU Operations. def ADDI : DForm_2<14, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm), - "addi $rD, $rA, $imm", IntGeneral, + "addi $rD, $rA, $imm", IntSimple, + [(set GPRC:$rD, (add GPRC:$rA, immSExt16:$imm))]>; +def ADDIL : DForm_2<14, (outs GPRC:$rD), (ins GPRC:$rA, symbolLo:$imm), + "addi $rD, $rA, $imm", IntSimple, [(set GPRC:$rD, (add GPRC:$rA, immSExt16:$imm))]>; let Defs = [CARRY] in { def ADDIC : DForm_2<12, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm), @@ -864,7 +949,7 @@ def ADDICo : DForm_2<13, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm), []>; } def ADDIS : DForm_2<15, (outs GPRC:$rD), (ins GPRC:$rA, symbolHi:$imm), - "addis $rD, $rA, $imm", IntGeneral, + "addis $rD, $rA, $imm", IntSimple, [(set GPRC:$rD, (add GPRC:$rA, imm16ShiftedSExt:$imm))]>; def LA : DForm_2<14, (outs GPRC:$rD), (ins GPRC:$rA, symbolLo:$sym), "la $rD, $sym($rA)", IntGeneral, @@ -881,10 +966,10 @@ def SUBFIC : DForm_2< 8, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm), let isReMaterializable = 1 in { def LI : DForm_2_r0<14, (outs GPRC:$rD), (ins symbolLo:$imm), - "li $rD, $imm", IntGeneral, + "li $rD, $imm", IntSimple, [(set GPRC:$rD, immSExt16:$imm)]>; def LIS : DForm_2_r0<15, (outs GPRC:$rD), (ins symbolHi:$imm), - "lis $rD, $imm", IntGeneral, + "lis $rD, $imm", IntSimple, [(set GPRC:$rD, imm16ShiftedSExt:$imm)]>; } } @@ -899,18 +984,18 @@ def ANDISo : DForm_4<29, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2), [(set GPRC:$dst, (and GPRC:$src1,imm16ShiftedZExt:$src2))]>, isDOT; def ORI : DForm_4<24, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2), - "ori $dst, $src1, $src2", IntGeneral, + "ori $dst, $src1, $src2", IntSimple, [(set GPRC:$dst, (or GPRC:$src1, immZExt16:$src2))]>; def ORIS : DForm_4<25, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2), - "oris $dst, $src1, $src2", IntGeneral, + "oris $dst, $src1, $src2", IntSimple, [(set GPRC:$dst, (or GPRC:$src1, imm16ShiftedZExt:$src2))]>; def XORI : DForm_4<26, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2), - "xori $dst, $src1, $src2", IntGeneral, + "xori $dst, $src1, $src2", IntSimple, [(set GPRC:$dst, (xor GPRC:$src1, immZExt16:$src2))]>; def XORIS : DForm_4<27, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2), - "xoris $dst, $src1, $src2", IntGeneral, + "xoris $dst, $src1, $src2", IntSimple, [(set GPRC:$dst, (xor 
GPRC:$src1,imm16ShiftedZExt:$src2))]>; -def NOP : DForm_4_zero<24, (outs), (ins), "nop", IntGeneral, +def NOP : DForm_4_zero<24, (outs), (ins), "nop", IntSimple, []>; def CMPWI : DForm_5_ext<11, (outs CRRC:$crD), (ins GPRC:$rA, s16imm:$imm), "cmpwi $crD, $rA, $imm", IntCompare>; @@ -921,28 +1006,28 @@ def CMPLWI : DForm_6_ext<10, (outs CRRC:$dst), (ins GPRC:$src1, u16imm:$src2), let PPC970_Unit = 1 in { // FXU Operations. def NAND : XForm_6<31, 476, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB), - "nand $rA, $rS, $rB", IntGeneral, + "nand $rA, $rS, $rB", IntSimple, [(set GPRC:$rA, (not (and GPRC:$rS, GPRC:$rB)))]>; def AND : XForm_6<31, 28, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB), - "and $rA, $rS, $rB", IntGeneral, + "and $rA, $rS, $rB", IntSimple, [(set GPRC:$rA, (and GPRC:$rS, GPRC:$rB))]>; def ANDC : XForm_6<31, 60, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB), - "andc $rA, $rS, $rB", IntGeneral, + "andc $rA, $rS, $rB", IntSimple, [(set GPRC:$rA, (and GPRC:$rS, (not GPRC:$rB)))]>; def OR : XForm_6<31, 444, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB), - "or $rA, $rS, $rB", IntGeneral, + "or $rA, $rS, $rB", IntSimple, [(set GPRC:$rA, (or GPRC:$rS, GPRC:$rB))]>; def NOR : XForm_6<31, 124, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB), - "nor $rA, $rS, $rB", IntGeneral, + "nor $rA, $rS, $rB", IntSimple, [(set GPRC:$rA, (not (or GPRC:$rS, GPRC:$rB)))]>; def ORC : XForm_6<31, 412, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB), - "orc $rA, $rS, $rB", IntGeneral, + "orc $rA, $rS, $rB", IntSimple, [(set GPRC:$rA, (or GPRC:$rS, (not GPRC:$rB)))]>; def EQV : XForm_6<31, 284, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB), - "eqv $rA, $rS, $rB", IntGeneral, + "eqv $rA, $rS, $rB", IntSimple, [(set GPRC:$rA, (not (xor GPRC:$rS, GPRC:$rB)))]>; def XOR : XForm_6<31, 316, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB), - "xor $rA, $rS, $rB", IntGeneral, + "xor $rA, $rS, $rB", IntSimple, [(set GPRC:$rA, (xor GPRC:$rS, GPRC:$rB))]>; def SLW : XForm_6<31, 24, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB), "slw $rA, $rS, $rB", IntGeneral, @@ -967,10 +1052,10 @@ def CNTLZW : XForm_11<31, 26, (outs GPRC:$rA), (ins GPRC:$rS), "cntlzw $rA, $rS", IntGeneral, [(set GPRC:$rA, (ctlz GPRC:$rS))]>; def EXTSB : XForm_11<31, 954, (outs GPRC:$rA), (ins GPRC:$rS), - "extsb $rA, $rS", IntGeneral, + "extsb $rA, $rS", IntSimple, [(set GPRC:$rA, (sext_inreg GPRC:$rS, i8))]>; def EXTSH : XForm_11<31, 922, (outs GPRC:$rA), (ins GPRC:$rS), - "extsh $rA, $rS", IntGeneral, + "extsh $rA, $rS", IntSimple, [(set GPRC:$rA, (sext_inreg GPRC:$rS, i16))]>; def CMPW : XForm_16_ext<31, 0, (outs CRRC:$crD), (ins GPRC:$rA, GPRC:$rB), @@ -1115,7 +1200,7 @@ def MFCR : XFXForm_3<31, 19, (outs GPRC:$rT), (ins), PPC970_MicroCode, PPC970_Unit_CRU; def MFOCRF: XFXForm_5a<31, 19, (outs GPRC:$rT), (ins crbitm:$FXM), - "mfcr $rT, $FXM", SprMFCR>, + "mfocrf $rT, $FXM", SprMFCR>, PPC970_DGroup_First, PPC970_Unit_CRU; // Instructions to manipulate FPSCR. Only long double handling uses these. @@ -1159,7 +1244,7 @@ let PPC970_Unit = 1 in { // FXU Operations. // XO-Form instructions. 
Arithmetic instructions that can set overflow bit // def ADD4 : XOForm_1<31, 266, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB), - "add $rT, $rA, $rB", IntGeneral, + "add $rT, $rA, $rB", IntSimple, [(set GPRC:$rT, (add GPRC:$rA, GPRC:$rB))]>; let Defs = [CARRY] in { def ADDC : XOForm_1<31, 10, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB), @@ -1194,7 +1279,7 @@ def SUBFC : XOForm_1<31, 8, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB), PPC970_DGroup_Cracked; } def NEG : XOForm_3<31, 104, 0, (outs GPRC:$rT), (ins GPRC:$rA), - "neg $rT, $rA", IntGeneral, + "neg $rT, $rA", IntSimple, [(set GPRC:$rT, (ineg GPRC:$rA))]>; let Uses = [CARRY], Defs = [CARRY] in { def ADDE : XOForm_1<31, 138, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB), @@ -1226,51 +1311,43 @@ let Uses = [RM] in { def FMADD : AForm_1<63, 29, (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC, F8RC:$FRB), "fmadd $FRT, $FRA, $FRC, $FRB", FPFused, - [(set F8RC:$FRT, (fadd (fmul F8RC:$FRA, F8RC:$FRC), - F8RC:$FRB))]>, - Requires<[FPContractions]>; + [(set F8RC:$FRT, + (fma F8RC:$FRA, F8RC:$FRC, F8RC:$FRB))]>; def FMADDS : AForm_1<59, 29, (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRC, F4RC:$FRB), "fmadds $FRT, $FRA, $FRC, $FRB", FPGeneral, - [(set F4RC:$FRT, (fadd (fmul F4RC:$FRA, F4RC:$FRC), - F4RC:$FRB))]>, - Requires<[FPContractions]>; + [(set F4RC:$FRT, + (fma F4RC:$FRA, F4RC:$FRC, F4RC:$FRB))]>; def FMSUB : AForm_1<63, 28, (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC, F8RC:$FRB), "fmsub $FRT, $FRA, $FRC, $FRB", FPFused, - [(set F8RC:$FRT, (fsub (fmul F8RC:$FRA, F8RC:$FRC), - F8RC:$FRB))]>, - Requires<[FPContractions]>; + [(set F8RC:$FRT, + (fma F8RC:$FRA, F8RC:$FRC, (fneg F8RC:$FRB)))]>; def FMSUBS : AForm_1<59, 28, (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRC, F4RC:$FRB), "fmsubs $FRT, $FRA, $FRC, $FRB", FPGeneral, - [(set F4RC:$FRT, (fsub (fmul F4RC:$FRA, F4RC:$FRC), - F4RC:$FRB))]>, - Requires<[FPContractions]>; + [(set F4RC:$FRT, + (fma F4RC:$FRA, F4RC:$FRC, (fneg F4RC:$FRB)))]>; def FNMADD : AForm_1<63, 31, (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC, F8RC:$FRB), "fnmadd $FRT, $FRA, $FRC, $FRB", FPFused, - [(set F8RC:$FRT, (fneg (fadd (fmul F8RC:$FRA, F8RC:$FRC), - F8RC:$FRB)))]>, - Requires<[FPContractions]>; + [(set F8RC:$FRT, + (fneg (fma F8RC:$FRA, F8RC:$FRC, F8RC:$FRB)))]>; def FNMADDS : AForm_1<59, 31, (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRC, F4RC:$FRB), "fnmadds $FRT, $FRA, $FRC, $FRB", FPGeneral, - [(set F4RC:$FRT, (fneg (fadd (fmul F4RC:$FRA, F4RC:$FRC), - F4RC:$FRB)))]>, - Requires<[FPContractions]>; + [(set F4RC:$FRT, + (fneg (fma F4RC:$FRA, F4RC:$FRC, F4RC:$FRB)))]>; def FNMSUB : AForm_1<63, 30, (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC, F8RC:$FRB), "fnmsub $FRT, $FRA, $FRC, $FRB", FPFused, - [(set F8RC:$FRT, (fneg (fsub (fmul F8RC:$FRA, F8RC:$FRC), - F8RC:$FRB)))]>, - Requires<[FPContractions]>; + [(set F8RC:$FRT, (fneg (fma F8RC:$FRA, F8RC:$FRC, + (fneg F8RC:$FRB))))]>; def FNMSUBS : AForm_1<59, 30, (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRC, F4RC:$FRB), "fnmsubs $FRT, $FRA, $FRC, $FRB", FPGeneral, - [(set F4RC:$FRT, (fneg (fsub (fmul F4RC:$FRA, F4RC:$FRC), - F4RC:$FRB)))]>, - Requires<[FPContractions]>; + [(set F4RC:$FRT, (fneg (fma F4RC:$FRA, F4RC:$FRC, + (fneg F4RC:$FRB))))]>; } // FSEL is artificially split into 4 and 8-byte forms for the result. To avoid // having 4 of these, force the comparison to always be an 8-byte double (code @@ -1321,6 +1398,13 @@ let Uses = [RM] in { } let PPC970_Unit = 1 in { // FXU Operations. 
+ def ISEL : AForm_1<31, 15, + (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB, pred:$cond), + "isel $rT, $rA, $rB, $cond", IntGeneral, + []>; +} + +let PPC970_Unit = 1 in { // FXU Operations. // M-Form instructions. rotate and mask instructions. // let isCommutable = 1 in { @@ -1418,6 +1502,10 @@ def : Pat<(PPChi tjumptable:$in, 0), (LIS tjumptable:$in)>; def : Pat<(PPClo tjumptable:$in, 0), (LI tjumptable:$in)>; def : Pat<(PPChi tblockaddress:$in, 0), (LIS tblockaddress:$in)>; def : Pat<(PPClo tblockaddress:$in, 0), (LI tblockaddress:$in)>; +def : Pat<(PPChi tglobaltlsaddr:$g, GPRC:$in), + (ADDIS GPRC:$in, tglobaltlsaddr:$g)>; +def : Pat<(PPClo tglobaltlsaddr:$g, GPRC:$in), + (ADDIL GPRC:$in, tglobaltlsaddr:$g)>; def : Pat<(add GPRC:$in, (PPChi tglobaladdr:$g, 0)), (ADDIS GPRC:$in, tglobaladdr:$g)>; def : Pat<(add GPRC:$in, (PPChi tconstpool:$g, 0)), @@ -1427,14 +1515,6 @@ def : Pat<(add GPRC:$in, (PPChi tjumptable:$g, 0)), def : Pat<(add GPRC:$in, (PPChi tblockaddress:$g, 0)), (ADDIS GPRC:$in, tblockaddress:$g)>; -// Fused negative multiply subtract, alternate pattern -def : Pat<(fsub F8RC:$B, (fmul F8RC:$A, F8RC:$C)), - (FNMSUB F8RC:$A, F8RC:$C, F8RC:$B)>, - Requires<[FPContractions]>; -def : Pat<(fsub F4RC:$B, (fmul F4RC:$A, F4RC:$C)), - (FNMSUBS F4RC:$A, F4RC:$C, F4RC:$B)>, - Requires<[FPContractions]>; - // Standard shifts. These are represented separately from the real shifts above // so that we can distinguish between shifts that allow 5-bit and 6-bit shift // amounts. diff --git a/lib/Target/PowerPC/PPCJITInfo.cpp b/lib/Target/PowerPC/PPCJITInfo.cpp index a6528c0..aba2739 100644 --- a/lib/Target/PowerPC/PPCJITInfo.cpp +++ b/lib/Target/PowerPC/PPCJITInfo.cpp @@ -210,7 +210,7 @@ asm( ".text\n" ".align 2\n" ".globl PPC64CompilationCallback\n" - ".section \".opd\",\"aw\"\n" + ".section \".opd\",\"aw\",@progbits\n" ".align 3\n" "PPC64CompilationCallback:\n" ".quad .L.PPC64CompilationCallback,.TOC.@tocbase,0\n" diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp index 276edcb..19ec993 100644 --- a/lib/Target/PowerPC/PPCMCInstLower.cpp +++ b/lib/Target/PowerPC/PPCMCInstLower.cpp @@ -99,10 +99,22 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, MCContext &Ctx = Printer.OutContext; MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None; - if (MO.getTargetFlags() & PPCII::MO_LO16) - RefKind = isDarwin ? MCSymbolRefExpr::VK_PPC_DARWIN_LO16 : MCSymbolRefExpr::VK_PPC_GAS_LO16; - else if (MO.getTargetFlags() & PPCII::MO_HA16) - RefKind = isDarwin ? MCSymbolRefExpr::VK_PPC_DARWIN_HA16 : MCSymbolRefExpr::VK_PPC_GAS_HA16; + unsigned access = MO.getTargetFlags() & PPCII::MO_ACCESS_MASK; + + switch (access) { + case PPCII::MO_HA16: RefKind = isDarwin ? + MCSymbolRefExpr::VK_PPC_DARWIN_HA16 : + MCSymbolRefExpr::VK_PPC_GAS_HA16; + break; + case PPCII::MO_LO16: RefKind = isDarwin ? + MCSymbolRefExpr::VK_PPC_DARWIN_LO16 : + MCSymbolRefExpr::VK_PPC_GAS_LO16; + break; + case PPCII::MO_TPREL16_HA: RefKind = MCSymbolRefExpr::VK_PPC_TPREL16_HA; + break; + case PPCII::MO_TPREL16_LO: RefKind = MCSymbolRefExpr::VK_PPC_TPREL16_LO; + break; + } // FIXME: This isn't right, but we don't have a good way to express this in // the MC Level, see below. 
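The GetSymbolRef change just above stops testing MO_LO16/MO_HA16 as independent bits and instead masks out an access-kind field before dispatching, which matters now that four kinds (the HA16/LO16 pair and the two TPREL16 forms) share the same flag bits. A self-contained sketch of the masking idiom; the flag values here are invented for illustration, the real ones live in PPCII:

#include <cstdio>

// Hypothetical encoding: the access kind is a small integer field inside
// the target flags, so it must be extracted with a mask, not bit-tested.
enum : unsigned {
  MO_LO16        = 1,
  MO_HA16        = 2,
  MO_TPREL16_LO  = 3,
  MO_TPREL16_HA  = 4,
  MO_ACCESS_MASK = 7,
};

static const char *refKindFor(unsigned TargetFlags) {
  switch (TargetFlags & MO_ACCESS_MASK) {  // pull out the whole field
  case MO_LO16:       return "lo16";
  case MO_HA16:       return "ha16";
  case MO_TPREL16_LO: return "tprel@l";
  case MO_TPREL16_HA: return "tprel@ha";
  default:            return "none";
  }
}

int main() {
  std::printf("%s\n", refKindFor(MO_TPREL16_HA));
  return 0;
}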
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index ef13571..ab8bf1f 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -89,10 +89,17 @@ PPCRegisterInfo::PPCRegisterInfo(const PPCSubtarget &ST, ImmToIdxMap[PPC::ADDI8] = PPC::ADD8; ImmToIdxMap[PPC::STD_32] = PPC::STDX_32; } +bool +PPCRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const { + return requiresRegisterScavenging(MF); +} + + /// getPointerRegClass - Return the register class to use to hold pointers. /// This is used for addressing modes. const TargetRegisterClass * -PPCRegisterInfo::getPointerRegClass(unsigned Kind) const { +PPCRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) + const { if (Subtarget.isPPC64()) return &PPC::G8RCRegClass; return &PPC::GPRCRegClass; @@ -192,6 +199,20 @@ PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, } } +bool +PPCRegisterInfo::avoidWriteAfterWrite(const TargetRegisterClass *RC) const { + switch (RC->getID()) { + case PPC::G8RCRegClassID: + case PPC::GPRCRegClassID: + case PPC::F8RCRegClassID: + case PPC::F4RCRegClassID: + case PPC::VRRCRegClassID: + return true; + default: + return false; + } +} + //===----------------------------------------------------------------------===// // Stack Frame Processing methods //===----------------------------------------------------------------------===// @@ -321,14 +342,14 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II, // address of new allocated space. if (LP64) { if (requiresRegisterScavenging(MF)) // FIXME (64-bit): Use "true" part. - BuildMI(MBB, II, dl, TII.get(PPC::STDUX)) + BuildMI(MBB, II, dl, TII.get(PPC::STDUX), PPC::X1) .addReg(Reg, RegState::Kill) - .addReg(PPC::X1, RegState::Define) + .addReg(PPC::X1) .addReg(MI.getOperand(1).getReg()); else - BuildMI(MBB, II, dl, TII.get(PPC::STDUX)) + BuildMI(MBB, II, dl, TII.get(PPC::STDUX), PPC::X1) .addReg(PPC::X0, RegState::Kill) - .addReg(PPC::X1, RegState::Define) + .addReg(PPC::X1) .addReg(MI.getOperand(1).getReg()); if (!MI.getOperand(1).isKill()) @@ -342,9 +363,9 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II, .addImm(maxCallFrameSize) .addReg(MI.getOperand(1).getReg(), RegState::ImplicitKill); } else { - BuildMI(MBB, II, dl, TII.get(PPC::STWUX)) + BuildMI(MBB, II, dl, TII.get(PPC::STWUX), PPC::R1) .addReg(Reg, RegState::Kill) - .addReg(PPC::R1, RegState::Define) + .addReg(PPC::R1) .addReg(MI.getOperand(1).getReg()); if (!MI.getOperand(1).isKill()) diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h index b1e6a72..152c36d 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/lib/Target/PowerPC/PPCRegisterInfo.h @@ -35,7 +35,8 @@ public: /// getPointerRegClass - Return the register class to use to hold pointers. /// This is used for addressing modes. - virtual const TargetRegisterClass *getPointerRegClass(unsigned Kind=0) const; + virtual const TargetRegisterClass * + getPointerRegClass(const MachineFunction &MF, unsigned Kind=0) const; unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const; @@ -46,10 +47,14 @@ public: BitVector getReservedRegs(const MachineFunction &MF) const; + virtual bool avoidWriteAfterWrite(const TargetRegisterClass *RC) const; + /// requiresRegisterScavenging - We require a register scavenger. /// FIXME (64-bit): Should be inlined. 
bool requiresRegisterScavenging(const MachineFunction &MF) const; + bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const; + void eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; diff --git a/lib/Target/PowerPC/PPCRegisterInfo.td b/lib/Target/PowerPC/PPCRegisterInfo.td index 0e55313..5ca3876 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.td +++ b/lib/Target/PowerPC/PPCRegisterInfo.td @@ -314,12 +314,18 @@ def CRBITRC : RegisterClass<"PPC", [i32], 32, } def CRRC : RegisterClass<"PPC", [i32], 32, (add CR0, CR1, CR5, CR6, - CR7, CR2, CR3, CR4)> { - let SubRegClasses = [(CRBITRC sub_lt, sub_gt, sub_eq, sub_un)]; + CR7, CR2, CR3, CR4)>; + +// The CTR registers are not allocatable because they're used by the +// decrement-and-branch instructions, and thus need to stay live across +// multiple basic blocks. +def CTRRC : RegisterClass<"PPC", [i32], 32, (add CTR)> { + let isAllocatable = 0; +} +def CTRRC8 : RegisterClass<"PPC", [i64], 64, (add CTR8)> { + let isAllocatable = 0; } -def CTRRC : RegisterClass<"PPC", [i32], 32, (add CTR)>; -def CTRRC8 : RegisterClass<"PPC", [i64], 64, (add CTR8)>; def VRSAVERC : RegisterClass<"PPC", [i32], 32, (add VRSAVE)>; def CARRYRC : RegisterClass<"PPC", [i32], 32, (add CARRY)> { let CopyCost = -1; diff --git a/lib/Target/PowerPC/PPCSchedule.td b/lib/Target/PowerPC/PPCSchedule.td index 8c0a858..6a6ccb9 100644 --- a/lib/Target/PowerPC/PPCSchedule.td +++ b/lib/Target/PowerPC/PPCSchedule.td @@ -25,6 +25,7 @@ def VFPU : FuncUnit; // vector floating point unit //===----------------------------------------------------------------------===// // Instruction Itinerary classes used for PowerPC // +def IntSimple : InstrItinClass; def IntGeneral : InstrItinClass; def IntCompare : InstrItinClass; def IntDivD : InstrItinClass; @@ -117,17 +118,17 @@ include "PPCScheduleA2.td" // // opcode itinerary class // ====== =============== -// add IntGeneral +// add IntSimple // addc IntGeneral // adde IntGeneral -// addi IntGeneral +// addi IntSimple // addic IntGeneral // addic. IntGeneral -// addis IntGeneral +// addis IntSimple // addme IntGeneral // addze IntGeneral -// and IntGeneral -// andc IntGeneral +// and IntSimple +// andc IntSimple // andi. IntGeneral // andis. 
IntGeneral // b BrB @@ -165,10 +166,10 @@ include "PPCScheduleA2.td" // eciwx LdStLoad // ecowx LdStLoad // eieio LdStLoad -// eqv IntGeneral -// extsb IntGeneral -// extsh IntGeneral -// extsw IntRotateD +// eqv IntSimple +// extsb IntSimple +// extsh IntSimple +// extsw IntSimple // fabs FPGeneral // fadd FPGeneral // fadds FPGeneral @@ -280,13 +281,13 @@ include "PPCScheduleA2.td" // mulld IntMulHD // mulli IntMulLI // mullw IntMulHW -// nand IntGeneral -// neg IntGeneral -// nor IntGeneral -// or IntGeneral -// orc IntGeneral -// ori IntGeneral -// oris IntGeneral +// nand IntSimple +// neg IntSimple +// nor IntSimple +// or IntSimple +// orc IntSimple +// ori IntSimple +// oris IntSimple // rfi SprRFI // rfid IntRFID // rldcl IntRotateD @@ -502,7 +503,7 @@ include "PPCScheduleA2.td" // vupklsb VecPerm // vupklsh VecPerm // vxor VecGeneral -// xor IntGeneral -// xori IntGeneral -// xoris IntGeneral +// xor IntSimple +// xori IntSimple +// xoris IntSimple // diff --git a/lib/Target/PowerPC/PPCSchedule440.td b/lib/Target/PowerPC/PPCSchedule440.td index 419faea..cd0fb70 100644 --- a/lib/Target/PowerPC/PPCSchedule440.td +++ b/lib/Target/PowerPC/PPCSchedule440.td @@ -108,6 +108,15 @@ def PPC440Itineraries : ProcessorItineraries< IRACC, IEXE1, IEXE2, IWB, LRACC, JEXE1, JEXE2, JWB, AGEN, CRD, LWB, FEXE1, FEXE2, FEXE3, FEXE4, FEXE5, FEXE6, FWB, LWARX_Hold], [GPR_Bypass, FPR_Bypass], [ + InstrItinData<IntSimple , [InstrStage<1, [IFTH1, IFTH2]>, + InstrStage<1, [PDCD1, PDCD2]>, + InstrStage<1, [DISS1, DISS2]>, + InstrStage<1, [IRACC, LRACC]>, + InstrStage<1, [IEXE1, JEXE1]>, + InstrStage<1, [IEXE2, JEXE2]>, + InstrStage<1, [IWB, JWB]>], + [6, 4, 4], + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, InstrItinData<IntGeneral , [InstrStage<1, [IFTH1, IFTH2]>, InstrStage<1, [PDCD1, PDCD2]>, InstrStage<1, [DISS1, DISS2]>, @@ -373,26 +382,6 @@ def PPC440Itineraries : ProcessorItineraries< InstrStage<1, [LWB]>], [8, 5], [NoBypass, GPR_Bypass]>, - InstrItinData<LdStSTD , [InstrStage<1, [IFTH1, IFTH2]>, - InstrStage<1, [PDCD1, PDCD2]>, - InstrStage<1, [DISS1, DISS2]>, - InstrStage<1, [LRACC]>, - InstrStage<1, [AGEN]>, - InstrStage<1, [CRD]>, - InstrStage<2, [LWB]>], - [8, 5], - [NoBypass, GPR_Bypass]>, - InstrItinData<LdStSTDCX , [InstrStage<1, [IFTH1, IFTH2]>, - InstrStage<1, [PDCD1, PDCD2]>, - InstrStage<1, [DISS1]>, - InstrStage<1, [IRACC], 0>, - InstrStage<4, [LWARX_Hold], 0>, - InstrStage<1, [LRACC]>, - InstrStage<1, [AGEN]>, - InstrStage<1, [CRD]>, - InstrStage<1, [LWB]>], - [8, 5], - [NoBypass, GPR_Bypass]>, InstrItinData<LdStSTWCX , [InstrStage<1, [IFTH1, IFTH2]>, InstrStage<1, [PDCD1, PDCD2]>, InstrStage<1, [DISS1]>, diff --git a/lib/Target/PowerPC/PPCScheduleA2.td b/lib/Target/PowerPC/PPCScheduleA2.td index 857ba40..4d4a5d0 100644 --- a/lib/Target/PowerPC/PPCScheduleA2.td +++ b/lib/Target/PowerPC/PPCScheduleA2.td @@ -60,6 +60,17 @@ def PPCA2Itineraries : ProcessorItineraries< IU5, IU6, RF0, XRF1, XEX1, XEX2, XEX3, XEX4, XEX5, XEX6, FRF1, FEX1, FEX2, FEX3, FEX4, FEX5, FEX6], [CR_Bypass, GPR_Bypass, FPR_Bypass], [ + InstrItinData<IntSimple , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [10, 7, 7], + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, InstrItinData<IntGeneral , [InstrStage<4, 
[IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, @@ -159,6 +170,17 @@ def PPCA2Itineraries : ProcessorItineraries< InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], [10, 7, 7], [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntRotateD , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [10, 7, 7], + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, InstrItinData<IntShift , [InstrStage<4, [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, @@ -181,6 +203,17 @@ def PPCA2Itineraries : ProcessorItineraries< InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], [10, 7, 7], [GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntTrapD , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [10, 7, 7], + [GPR_Bypass, GPR_Bypass]>, InstrItinData<BrB , [InstrStage<4, [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, @@ -269,6 +302,17 @@ def PPCA2Itineraries : ProcessorItineraries< InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], [14, 7], [GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStLD , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [14, 7], + [GPR_Bypass, GPR_Bypass]>, InstrItinData<LdStStore , [InstrStage<4, [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, @@ -379,28 +423,6 @@ def PPCA2Itineraries : ProcessorItineraries< InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], [26, 7], [NoBypass, GPR_Bypass]>, - InstrItinData<LdStSTD , [InstrStage<4, - [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, - InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, - IU4_4, IU4_5, IU4_6, IU4_7]>, - InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, - InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, - InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, - InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, - InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], - [13, 7], - [GPR_Bypass, GPR_Bypass]>, - InstrItinData<LdStSTDCX , [InstrStage<4, - [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, - InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, - IU4_4, IU4_5, IU4_6, IU4_7]>, - InstrStage<1, [IU5]>, InstrStage<13, [IU6]>, - InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, - InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, - InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, - InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], - [26, 7], - [NoBypass, GPR_Bypass]>, InstrItinData<LdStSTWCX , [InstrStage<4, [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, diff --git a/lib/Target/PowerPC/PPCScheduleG3.td b/lib/Target/PowerPC/PPCScheduleG3.td index bc926f7..61e89ed 100644 --- 
a/lib/Target/PowerPC/PPCScheduleG3.td +++ b/lib/Target/PowerPC/PPCScheduleG3.td @@ -14,6 +14,7 @@ def G3Itineraries : ProcessorItineraries< [IU1, IU2, FPU1, BPU, SRU, SLU], [], [ + InstrItinData<IntSimple , [InstrStage<1, [IU1, IU2]>]>, InstrItinData<IntGeneral , [InstrStage<1, [IU1, IU2]>]>, InstrItinData<IntCompare , [InstrStage<1, [IU1, IU2]>]>, InstrItinData<IntDivW , [InstrStage<19, [IU1]>]>, diff --git a/lib/Target/PowerPC/PPCScheduleG4.td b/lib/Target/PowerPC/PPCScheduleG4.td index f7ec1e0..e19ddfa 100644 --- a/lib/Target/PowerPC/PPCScheduleG4.td +++ b/lib/Target/PowerPC/PPCScheduleG4.td @@ -13,6 +13,7 @@ def G4Itineraries : ProcessorItineraries< [IU1, IU2, SLU, SRU, BPU, FPU1, VIU1, VIU2, VPU, VFPU], [], [ + InstrItinData<IntSimple , [InstrStage<1, [IU1, IU2]>]>, InstrItinData<IntGeneral , [InstrStage<1, [IU1, IU2]>]>, InstrItinData<IntCompare , [InstrStage<1, [IU1, IU2]>]>, InstrItinData<IntDivW , [InstrStage<19, [IU1]>]>, diff --git a/lib/Target/PowerPC/PPCScheduleG4Plus.td b/lib/Target/PowerPC/PPCScheduleG4Plus.td index 37ebfc5..e7446cb 100644 --- a/lib/Target/PowerPC/PPCScheduleG4Plus.td +++ b/lib/Target/PowerPC/PPCScheduleG4Plus.td @@ -16,6 +16,7 @@ def IU4 : FuncUnit; // integer unit 4 (7450 simple) def G4PlusItineraries : ProcessorItineraries< [IU1, IU2, IU3, IU4, BPU, SLU, FPU1, VFPU, VIU1, VIU2, VPU], [], [ + InstrItinData<IntSimple , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>, InstrItinData<IntGeneral , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>, InstrItinData<IntCompare , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>, InstrItinData<IntDivW , [InstrStage<23, [IU2]>]>, diff --git a/lib/Target/PowerPC/PPCScheduleG5.td b/lib/Target/PowerPC/PPCScheduleG5.td index d1e40ce..1371499 100644 --- a/lib/Target/PowerPC/PPCScheduleG5.td +++ b/lib/Target/PowerPC/PPCScheduleG5.td @@ -13,6 +13,7 @@ def G5Itineraries : ProcessorItineraries< [IU1, IU2, SLU, BPU, FPU1, FPU2, VFPU, VIU1, VIU2, VPU], [], [ + InstrItinData<IntSimple , [InstrStage<2, [IU1, IU2]>]>, InstrItinData<IntGeneral , [InstrStage<2, [IU1, IU2]>]>, InstrItinData<IntCompare , [InstrStage<3, [IU1, IU2]>]>, InstrItinData<IntDivD , [InstrStage<68, [IU1]>]>, diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp index f405b47..bb193ac 100644 --- a/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/lib/Target/PowerPC/PPCSubtarget.cpp @@ -16,6 +16,7 @@ #include "PPC.h" #include "llvm/GlobalValue.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Support/Host.h" #include "llvm/Support/TargetRegistry.h" #include <cstdlib> @@ -25,56 +26,19 @@ using namespace llvm; -#if defined(__APPLE__) -#include <mach/mach.h> -#include <mach/mach_host.h> -#include <mach/host_info.h> -#include <mach/machine.h> - -/// GetCurrentPowerPCFeatures - Returns the current CPUs features. 
-static const char *GetCurrentPowerPCCPU() { - host_basic_info_data_t hostInfo; - mach_msg_type_number_t infoCount; - - infoCount = HOST_BASIC_INFO_COUNT; - host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo, - &infoCount); - - if (hostInfo.cpu_type != CPU_TYPE_POWERPC) return "generic"; - - switch(hostInfo.cpu_subtype) { - case CPU_SUBTYPE_POWERPC_601: return "601"; - case CPU_SUBTYPE_POWERPC_602: return "602"; - case CPU_SUBTYPE_POWERPC_603: return "603"; - case CPU_SUBTYPE_POWERPC_603e: return "603e"; - case CPU_SUBTYPE_POWERPC_603ev: return "603ev"; - case CPU_SUBTYPE_POWERPC_604: return "604"; - case CPU_SUBTYPE_POWERPC_604e: return "604e"; - case CPU_SUBTYPE_POWERPC_620: return "620"; - case CPU_SUBTYPE_POWERPC_750: return "750"; - case CPU_SUBTYPE_POWERPC_7400: return "7400"; - case CPU_SUBTYPE_POWERPC_7450: return "7450"; - case CPU_SUBTYPE_POWERPC_970: return "970"; - default: ; - } - - return "generic"; -} -#endif - - PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU, const std::string &FS, bool is64Bit) : PPCGenSubtargetInfo(TT, CPU, FS) , StackAlignment(16) , DarwinDirective(PPC::DIR_NONE) - , IsGigaProcessor(false) + , HasMFOCRF(false) , Has64BitSupport(false) , Use64BitRegs(false) , IsPPC64(is64Bit) , HasAltivec(false) , HasFSQRT(false) , HasSTFIWX(false) + , HasISEL(false) , IsBookE(false) , HasLazyResolverStubs(false) , IsJITCodeModel(false) @@ -84,9 +48,10 @@ PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU, std::string CPUName = CPU; if (CPUName.empty()) CPUName = "generic"; -#if defined(__APPLE__) +#if (defined(__APPLE__) || defined(__linux__)) && \ + (defined(__ppc__) || defined(__powerpc__)) if (CPUName == "generic") - CPUName = GetCurrentPowerPCCPU(); + CPUName = sys::getHostCPUName(); #endif // Parse features string. @@ -146,10 +111,14 @@ bool PPCSubtarget::enablePostRAScheduler( CodeGenOpt::Level OptLevel, TargetSubtargetInfo::AntiDepBreakMode& Mode, RegClassVector& CriticalPathRCs) const { - if (DarwinDirective == PPC::DIR_440 || DarwinDirective == PPC::DIR_A2) - Mode = TargetSubtargetInfo::ANTIDEP_ALL; - else - Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL; + // FIXME: It would be best to use TargetSubtargetInfo::ANTIDEP_ALL here, + // but we can't because we can't reassign the cr registers. There is a + // dependence between the cr register and the RLWINM instruction used + // to extract its value which the anti-dependency breaker can't currently + // see. Maybe we should make a late-expanded pseudo to encode this dependency. 
+ // (the relevant code is in PPCDAGToDAGISel::SelectSETCC) + + Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL; CriticalPathRCs.clear(); @@ -157,6 +126,9 @@ bool PPCSubtarget::enablePostRAScheduler( CriticalPathRCs.push_back(&PPC::G8RCRegClass); else CriticalPathRCs.push_back(&PPC::GPRCRegClass); + + CriticalPathRCs.push_back(&PPC::F8RCRegClass); + CriticalPathRCs.push_back(&PPC::VRRCRegClass); return OptLevel >= CodeGenOpt::Default; } diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h index a275029..0207c83 100644 --- a/lib/Target/PowerPC/PPCSubtarget.h +++ b/lib/Target/PowerPC/PPCSubtarget.h @@ -41,6 +41,8 @@ namespace PPC { DIR_750, DIR_970, DIR_A2, + DIR_PWR6, + DIR_PWR7, DIR_64 }; } @@ -61,13 +63,14 @@ protected: unsigned DarwinDirective; /// Used by the ISel to turn on optimizations for POWER4-derived architectures - bool IsGigaProcessor; + bool HasMFOCRF; bool Has64BitSupport; bool Use64BitRegs; bool IsPPC64; bool HasAltivec; bool HasFSQRT; bool HasSTFIWX; + bool HasISEL; bool IsBookE; bool HasLazyResolverStubs; bool IsJITCodeModel; @@ -138,7 +141,8 @@ public: bool hasFSQRT() const { return HasFSQRT; } bool hasSTFIWX() const { return HasSTFIWX; } bool hasAltivec() const { return HasAltivec; } - bool isGigaProcessor() const { return IsGigaProcessor; } + bool hasMFOCRF() const { return HasMFOCRF; } + bool hasISEL() const { return HasISEL; } bool isBookE() const { return IsBookE; } const Triple &getTargetTriple() const { return TargetTriple; } diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index d113976..9805112 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -17,10 +17,15 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/CodeGen/Passes.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; +static cl:: +opt<bool> DisableCTRLoops("disable-ppc-ctrloops", cl::Hidden, + cl::desc("Disable CTR loops for PPC")); + extern "C" void LLVMInitializePowerPCTarget() { // Register the targets RegisterTargetMachine<PPC32TargetMachine> A(ThePPC32Target); @@ -81,41 +86,37 @@ public: return getTM<PPCTargetMachine>(); } + virtual bool addPreRegAlloc(); virtual bool addInstSelector(); virtual bool addPreEmitPass(); }; } // namespace TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) { - TargetPassConfig *PassConfig = new PPCPassConfig(this, PM); + return new PPCPassConfig(this, PM); +} - // Override this for PowerPC. Tail merging happily breaks up instruction issue - // groups, which typically degrades performance. - PassConfig->setEnableTailMerge(false); +bool PPCPassConfig::addPreRegAlloc() { + if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None) + addPass(createPPCCTRLoops()); - return PassConfig; + return false; } bool PPCPassConfig::addInstSelector() { // Install an instruction selector. - PM.add(createPPCISelDag(getPPCTargetMachine())); + addPass(createPPCISelDag(getPPCTargetMachine())); return false; } bool PPCPassConfig::addPreEmitPass() { // Must run branch selection immediately preceding the asm printer. - PM.add(createPPCBranchSelectionPass()); + addPass(createPPCBranchSelectionPass()); return false; } bool PPCTargetMachine::addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE) { - // FIXME: This should be moved to TargetJITInfo!!
- if (Subtarget.isPPC64()) - // Temporary workaround for the inability of PPC64 JIT to handle jump - // tables. - Options.DisableJumpTables = true; - // Inform the subtarget that we are in JIT mode. FIXME: does this break macho // writing? Subtarget.SetJITMode(); diff --git a/lib/Target/PowerPC/README.txt b/lib/Target/PowerPC/README.txt index 349cd89..b6763aa 100644 --- a/lib/Target/PowerPC/README.txt +++ b/lib/Target/PowerPC/README.txt @@ -2,7 +2,6 @@ TODO: * gpr0 allocation -* implement do-loop -> bdnz transform * lmw/stmw pass a la arm load store optimizer for prolog/epilog ===-------------------------------------------------------------------------=== diff --git a/lib/Target/README.txt b/lib/Target/README.txt index 093255e..cbfa4cf 100644 --- a/lib/Target/README.txt +++ b/lib/Target/README.txt @@ -964,6 +964,12 @@ optimized with "clang -emit-llvm-bc | opt -std-compile-opts". //===---------------------------------------------------------------------===// +unsigned f(unsigned x) { return ((x & 7) + 1) & 15; } +The & 15 part should be optimized away, it doesn't change the result. Currently +not optimized with "clang -emit-llvm-bc | opt -std-compile-opts". + +//===---------------------------------------------------------------------===// + This was noticed in the entryblock for grokdeclarator in 403.gcc: %tmp = icmp eq i32 %decl_context, 4 diff --git a/lib/Target/Sparc/CMakeLists.txt b/lib/Target/Sparc/CMakeLists.txt index ae4af0f..efb10db 100644 --- a/lib/Target/Sparc/CMakeLists.txt +++ b/lib/Target/Sparc/CMakeLists.txt @@ -23,5 +23,7 @@ add_llvm_target(SparcCodeGen SparcSelectionDAGInfo.cpp ) +add_dependencies(LLVMSparcCodeGen intrinsics_gen) + add_subdirectory(TargetInfo) add_subdirectory(MCTargetDesc) diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp index 883aa3a..7bf8c3f 100644 --- a/lib/Target/Sparc/DelaySlotFiller.cpp +++ b/lib/Target/Sparc/DelaySlotFiller.cpp @@ -279,14 +279,11 @@ void Filler::insertDefsUses(MachineBasicBlock::iterator MI, //returns true if the Reg or its alias is in the RegSet. bool Filler::IsRegInSet(SmallSet<unsigned, 32>& RegSet, unsigned Reg) { - if (RegSet.count(Reg)) - return true; - // check Aliased Registers - for (const uint16_t *Alias = TM.getRegisterInfo()->getAliasSet(Reg); - *Alias; ++ Alias) - if (RegSet.count(*Alias)) + // Check Reg and all aliased Registers. + for (MCRegAliasIterator AI(Reg, TM.getRegisterInfo(), true); + AI.isValid(); ++AI) + if (RegSet.count(*AI)) return true; - return false; } diff --git a/lib/Target/Sparc/SparcAsmPrinter.cpp b/lib/Target/Sparc/SparcAsmPrinter.cpp index c14b3d4..2554862 100644 --- a/lib/Target/Sparc/SparcAsmPrinter.cpp +++ b/lib/Target/Sparc/SparcAsmPrinter.cpp @@ -187,7 +187,9 @@ bool SparcAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, if (ExtraCode[1] != 0) return true; // Unknown modifier. switch (ExtraCode[0]) { - default: return true; // Unknown modifier. 
+ default: + // See if this is a generic print operand + return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); case 'r': break; } diff --git a/lib/Target/Sparc/SparcFrameLowering.h b/lib/Target/Sparc/SparcFrameLowering.h index 210705e..6b593c9 100644 --- a/lib/Target/Sparc/SparcFrameLowering.h +++ b/lib/Target/Sparc/SparcFrameLowering.h @@ -22,10 +22,9 @@ namespace llvm { class SparcSubtarget; class SparcFrameLowering : public TargetFrameLowering { - const SparcSubtarget &STI; public: - explicit SparcFrameLowering(const SparcSubtarget &sti) - : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8, 0), STI(sti) { + explicit SparcFrameLowering(const SparcSubtarget &/*sti*/) + : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8, 0) { } /// emitProlog/emitEpilog - These methods insert prolog and epilog code into diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index c3e6f16..79f7ebd 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -90,7 +90,7 @@ SparcTargetLowering::LowerReturn(SDValue Chain, // CCState - Info about the registers and stack slot. CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - DAG.getTarget(), RVLocs, *DAG.getContext()); + DAG.getTarget(), RVLocs, *DAG.getContext()); // Analyze return values. CCInfo.AnalyzeReturn(Outs, RetCC_Sparc32); @@ -160,7 +160,7 @@ SparcTargetLowering::LowerFormalArguments(SDValue Chain, // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeFormalArguments(Ins, CC_Sparc32); const unsigned StackOffset = 92; @@ -345,21 +345,26 @@ SparcTargetLowering::LowerFormalArguments(SDValue Chain, } SDValue -SparcTargetLowering::LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, - bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, +SparcTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + DebugLoc &dl = CLI.DL; + SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; + SmallVector<SDValue, 32> &OutVals = CLI.OutVals; + SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &isTailCall = CLI.IsTailCall; + CallingConv::ID CallConv = CLI.CallConv; + bool isVarArg = CLI.IsVarArg; + // Sparc target does not yet support tail call optimization. isTailCall = false; // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - DAG.getTarget(), ArgLocs, *DAG.getContext()); + DAG.getTarget(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CC_Sparc32); // Get the size of the outgoing arguments stack space requirement. @@ -590,7 +595,7 @@ SparcTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs; CCState RVInfo(CallConv, isVarArg, DAG.getMachineFunction(), - DAG.getTarget(), RVLocs, *DAG.getContext()); + DAG.getTarget(), RVLocs, *DAG.getContext()); RVInfo.AnalyzeCallResult(Ins, RetCC_Sparc32); @@ -689,9 +694,9 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM) : TargetLowering(TM, new TargetLoweringObjectFileELF()) { // Set up the register classes. - addRegisterClass(MVT::i32, SP::IntRegsRegisterClass); - addRegisterClass(MVT::f32, SP::FPRegsRegisterClass); - addRegisterClass(MVT::f64, SP::DFPRegsRegisterClass); + addRegisterClass(MVT::i32, &SP::IntRegsRegClass); + addRegisterClass(MVT::f32, &SP::FPRegsRegClass); + addRegisterClass(MVT::f64, &SP::DFPRegsRegClass); // Turn FP extload into load/fextend setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); @@ -1259,7 +1264,7 @@ SparcTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': - return std::make_pair(0U, SP::IntRegsRegisterClass); + return std::make_pair(0U, &SP::IntRegsRegClass); } } diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h index cf43048..09148ea 100644 --- a/lib/Target/Sparc/SparcISelLowering.h +++ b/lib/Target/Sparc/SparcISelLowering.h @@ -76,12 +76,7 @@ namespace llvm { SmallVectorImpl<SDValue> &InVals) const; virtual SDValue - LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, - bool isVarArg, bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const; virtual SDValue diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp index faff468..f8674d0 100644 --- a/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/lib/Target/Sparc/SparcInstrInfo.cpp @@ -303,13 +303,13 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, if (I != MBB.end()) DL = I->getDebugLoc(); // On the order of operands here: think "[FrameIdx + 0] = SrcReg". 
- if (RC == SP::IntRegsRegisterClass) + if (RC == &SP::IntRegsRegClass) BuildMI(MBB, I, DL, get(SP::STri)).addFrameIndex(FI).addImm(0) .addReg(SrcReg, getKillRegState(isKill)); - else if (RC == SP::FPRegsRegisterClass) + else if (RC == &SP::FPRegsRegClass) BuildMI(MBB, I, DL, get(SP::STFri)).addFrameIndex(FI).addImm(0) .addReg(SrcReg, getKillRegState(isKill)); - else if (RC == SP::DFPRegsRegisterClass) + else if (RC == &SP::DFPRegsRegClass) BuildMI(MBB, I, DL, get(SP::STDFri)).addFrameIndex(FI).addImm(0) .addReg(SrcReg, getKillRegState(isKill)); else @@ -324,11 +324,11 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); - if (RC == SP::IntRegsRegisterClass) + if (RC == &SP::IntRegsRegClass) BuildMI(MBB, I, DL, get(SP::LDri), DestReg).addFrameIndex(FI).addImm(0); - else if (RC == SP::FPRegsRegisterClass) + else if (RC == &SP::FPRegsRegClass) BuildMI(MBB, I, DL, get(SP::LDFri), DestReg).addFrameIndex(FI).addImm(0); - else if (RC == SP::DFPRegsRegisterClass) + else if (RC == &SP::DFPRegsRegClass) BuildMI(MBB, I, DL, get(SP::LDDFri), DestReg).addFrameIndex(FI).addImm(0); else llvm_unreachable("Can't load this register from stack slot"); diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp index 6f31356..9ee12ed 100644 --- a/lib/Target/Sparc/SparcTargetMachine.cpp +++ b/lib/Target/Sparc/SparcTargetMachine.cpp @@ -34,7 +34,8 @@ SparcTargetMachine::SparcTargetMachine(const Target &T, StringRef TT, : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), Subtarget(TT, CPU, FS, is64bit), DataLayout(Subtarget.getDataLayout()), - TLInfo(*this), TSInfo(*this), InstrInfo(Subtarget), + InstrInfo(Subtarget), + TLInfo(*this), TSInfo(*this), FrameLowering(Subtarget) { } @@ -59,7 +60,7 @@ TargetPassConfig *SparcTargetMachine::createPassConfig(PassManagerBase &PM) { } bool SparcPassConfig::addInstSelector() { - PM.add(createSparcISelDag(getSparcTargetMachine())); + addPass(createSparcISelDag(getSparcTargetMachine())); return false; } @@ -67,8 +68,8 @@ bool SparcPassConfig::addInstSelector() { /// passes immediately before machine code is emitted. This should return /// true if -print-machineinstrs should print out the code after the passes. 
bool SparcPassConfig::addPreEmitPass(){ - PM.add(createSparcFPMoverPass(getSparcTargetMachine())); - PM.add(createSparcDelaySlotFillerPass(getSparcTargetMachine())); + addPass(createSparcFPMoverPass(getSparcTargetMachine())); + addPass(createSparcDelaySlotFillerPass(getSparcTargetMachine())); return true; } diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h index b203dfa..b2cc624 100644 --- a/lib/Target/Sparc/SparcTargetMachine.h +++ b/lib/Target/Sparc/SparcTargetMachine.h @@ -28,9 +28,9 @@ namespace llvm { class SparcTargetMachine : public LLVMTargetMachine { SparcSubtarget Subtarget; const TargetData DataLayout; // Calculates type size & alignment + SparcInstrInfo InstrInfo; SparcTargetLowering TLInfo; SparcSelectionDAGInfo TSInfo; - SparcInstrInfo InstrInfo; SparcFrameLowering FrameLowering; public: SparcTargetMachine(const Target &T, StringRef TT, diff --git a/lib/Target/TargetData.cpp b/lib/Target/TargetData.cpp index acb7476..cc6dc1e 100644 --- a/lib/Target/TargetData.cpp +++ b/lib/Target/TargetData.cpp @@ -117,8 +117,8 @@ TargetAlignElem::operator==(const TargetAlignElem &rhs) const { && TypeBitWidth == rhs.TypeBitWidth); } -const TargetAlignElem TargetData::InvalidAlignmentElem = - TargetAlignElem::get((AlignTypeEnum) -1, 0, 0, 0); +const TargetAlignElem +TargetData::InvalidAlignmentElem = { (AlignTypeEnum)0xFF, 0, 0, 0 }; //===----------------------------------------------------------------------===// // TargetData Class Implementation diff --git a/lib/Target/TargetInstrInfo.cpp b/lib/Target/TargetInstrInfo.cpp index 440f9ad..f1d1d07 100644 --- a/lib/Target/TargetInstrInfo.cpp +++ b/lib/Target/TargetInstrInfo.cpp @@ -21,20 +21,25 @@ using namespace llvm; //===----------------------------------------------------------------------===// // TargetInstrInfo -//===----------------------------------------------------------------------===// +// +// Methods that depend on CodeGen are implemented in +// TargetInstrInfoImpl.cpp. Invoking them without linking libCodeGen raises a +// link error. +// ===----------------------------------------------------------------------===// TargetInstrInfo::~TargetInstrInfo() { } const TargetRegisterClass* TargetInstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum, - const TargetRegisterInfo *TRI) const { + const TargetRegisterInfo *TRI, + const MachineFunction &MF) const { if (OpNum >= MCID.getNumOperands()) return 0; short RegClass = MCID.OpInfo[OpNum].RegClass; if (MCID.OpInfo[OpNum].isLookupPtrRegClass()) - return TRI->getPointerRegClass(RegClass); + return TRI->getPointerRegClass(MF, RegClass); // Instructions like INSERT_SUBREG do not have fixed register classes. if (RegClass < 0) @@ -44,54 +49,6 @@ TargetInstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum, return TRI->getRegClass(RegClass); } -unsigned -TargetInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData, - const MachineInstr *MI) const { - if (!ItinData || ItinData->isEmpty()) - return 1; - - unsigned Class = MI->getDesc().getSchedClass(); - unsigned UOps = ItinData->Itineraries[Class].NumMicroOps; - if (UOps) - return UOps; - - // The # of u-ops is dynamically determined. The specific target should - // override this function to return the right number. 
- return 1; -} - -int -TargetInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, - const MachineInstr *DefMI, unsigned DefIdx, - const MachineInstr *UseMI, unsigned UseIdx) const { - if (!ItinData || ItinData->isEmpty()) - return -1; - - unsigned DefClass = DefMI->getDesc().getSchedClass(); - unsigned UseClass = UseMI->getDesc().getSchedClass(); - return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx); -} - -int TargetInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, - const MachineInstr *MI, - unsigned *PredCost) const { - if (!ItinData || ItinData->isEmpty()) - return 1; - - return ItinData->getStageLatency(MI->getDesc().getSchedClass()); -} - -bool TargetInstrInfo::hasLowDefLatency(const InstrItineraryData *ItinData, - const MachineInstr *DefMI, - unsigned DefIdx) const { - if (!ItinData || ItinData->isEmpty()) - return false; - - unsigned DefClass = DefMI->getDesc().getSchedClass(); - int DefCycle = ItinData->getOperandCycle(DefClass, DefIdx); - return (DefCycle != -1 && DefCycle <= 1); -} - /// insertNoop - Insert a noop into the instruction stream at the specified /// point. void TargetInstrInfo::insertNoop(MachineBasicBlock &MBB, @@ -99,7 +56,6 @@ void TargetInstrInfo::insertNoop(MachineBasicBlock &MBB, llvm_unreachable("Target didn't implement insertNoop!"); } - /// Measure the specified inline asm to determine an approximation of its /// length. /// Comments (which run till the next SeparatorString or newline) do not diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp index 2570e0d..b74a0bd 100644 --- a/lib/Target/TargetLoweringObjectFile.cpp +++ b/lib/Target/TargetLoweringObjectFile.cpp @@ -152,7 +152,7 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV, // a mergable string section, or general .data if it contains relocations. if (GVar->isConstant()) { // If the initializer for the global contains something that requires a - // relocation, then we may have to drop this into a wriable data section + // relocation, then we may have to drop this into a writable data section // even though it is marked const. switch (C->getRelocationInfo()) { case Constant::NoRelocation: diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp index b9b2526..3825719 100644 --- a/lib/Target/TargetMachine.cpp +++ b/lib/Target/TargetMachine.cpp @@ -11,7 +11,9 @@ // //===----------------------------------------------------------------------===// +#include "llvm/GlobalAlias.h" #include "llvm/GlobalValue.h" +#include "llvm/GlobalVariable.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCCodeGenInfo.h" #include "llvm/Target/TargetMachine.h" @@ -75,25 +77,58 @@ CodeModel::Model TargetMachine::getCodeModel() const { return CodeGenInfo->getCodeModel(); } +/// Get the IR-specified TLS model for Var. 
+static TLSModel::Model getSelectedTLSModel(const GlobalVariable *Var) { + switch (Var->getThreadLocalMode()) { + case GlobalVariable::NotThreadLocal: + llvm_unreachable("getSelectedTLSModel for non-TLS variable"); + break; + case GlobalVariable::GeneralDynamicTLSModel: + return TLSModel::GeneralDynamic; + case GlobalVariable::LocalDynamicTLSModel: + return TLSModel::LocalDynamic; + case GlobalVariable::InitialExecTLSModel: + return TLSModel::InitialExec; + case GlobalVariable::LocalExecTLSModel: + return TLSModel::LocalExec; + } + llvm_unreachable("invalid TLS model"); +} + TLSModel::Model TargetMachine::getTLSModel(const GlobalValue *GV) const { - bool isLocal = GV->hasLocalLinkage(); - bool isDeclaration = GV->isDeclaration(); + // If GV is an alias then use the aliasee for determining + // thread-localness. + if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) + GV = GA->resolveAliasedGlobal(false); + const GlobalVariable *Var = cast<GlobalVariable>(GV); + + bool isLocal = Var->hasLocalLinkage(); + bool isDeclaration = Var->isDeclaration(); + bool isPIC = getRelocationModel() == Reloc::PIC_; + bool isPIE = Options.PositionIndependentExecutable; // FIXME: what should we do for protected and internal visibility? // For variables, is internal different from hidden? - bool isHidden = GV->hasHiddenVisibility(); + bool isHidden = Var->hasHiddenVisibility(); - if (getRelocationModel() == Reloc::PIC_ && - !Options.PositionIndependentExecutable) { + TLSModel::Model Model; + if (isPIC && !isPIE) { if (isLocal || isHidden) - return TLSModel::LocalDynamic; + Model = TLSModel::LocalDynamic; else - return TLSModel::GeneralDynamic; + Model = TLSModel::GeneralDynamic; } else { if (!isDeclaration || isHidden) - return TLSModel::LocalExec; + Model = TLSModel::LocalExec; else - return TLSModel::InitialExec; + Model = TLSModel::InitialExec; } + + // If the user specified a more specific model, use that. + TLSModel::Model SelectedModel = getSelectedTLSModel(Var); + if (SelectedModel > Model) + return SelectedModel; + + return Model; } /// getOptLevel - Returns the optimization level: None, Less, @@ -127,4 +162,3 @@ void TargetMachine::setFunctionSections(bool V) { void TargetMachine::setDataSections(bool V) { DataSections = V; } - diff --git a/lib/Target/TargetRegisterInfo.cpp b/lib/Target/TargetRegisterInfo.cpp index 1716423..2395f2b 100644 --- a/lib/Target/TargetRegisterInfo.cpp +++ b/lib/Target/TargetRegisterInfo.cpp @@ -46,6 +46,50 @@ void PrintReg::print(raw_ostream &OS) const { } } +void PrintRegUnit::print(raw_ostream &OS) const { + // Generic printout when TRI is missing. + if (!TRI) { + OS << "Unit~" << Unit; + return; + } + + // Check for invalid register units. + if (Unit >= TRI->getNumRegUnits()) { + OS << "BadUnit~" << Unit; + return; + } + + // Normal units have at least one root. + MCRegUnitRootIterator Roots(Unit, TRI); + assert(Roots.isValid() && "Unit has no roots."); + OS << TRI->getName(*Roots); + for (++Roots; Roots.isValid(); ++Roots) + OS << '~' << TRI->getName(*Roots); +} + +/// getAllocatableClass - Return the maximal subclass of the given register +/// class that is allocatable, or NULL.
+const TargetRegisterClass * +TargetRegisterInfo::getAllocatableClass(const TargetRegisterClass *RC) const { + if (!RC || RC->isAllocatable()) + return RC; + + const unsigned *SubClass = RC->getSubClassMask(); + for (unsigned Base = 0, BaseE = getNumRegClasses(); + Base < BaseE; Base += 32) { + unsigned Idx = Base; + for (unsigned Mask = *SubClass++; Mask; Mask >>= 1) { + unsigned Offset = CountTrailingZeros_32(Mask); + const TargetRegisterClass *SubRC = getRegClass(Idx + Offset); + if (SubRC->isAllocatable()) + return SubRC; + Mask >>= Offset; + Idx += Offset + 1; + } + } + return NULL; +} + /// getMinimalPhysRegClass - Returns the Register Class of a physical /// register of the given type, picking the most specific sub register class /// of the right type that contains this physreg. @@ -71,6 +115,7 @@ TargetRegisterInfo::getMinimalPhysRegClass(unsigned reg, EVT VT) const { /// registers for the specific register class. static void getAllocatableSetForRC(const MachineFunction &MF, const TargetRegisterClass *RC, BitVector &R){ + assert(RC->isAllocatable() && "invalid for nonallocatable sets"); ArrayRef<uint16_t> Order = RC->getRawAllocationOrder(MF); for (unsigned i = 0; i != Order.size(); ++i) R.set(Order[i]); @@ -80,7 +125,10 @@ BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF, const TargetRegisterClass *RC) const { BitVector Allocatable(getNumRegs()); if (RC) { - getAllocatableSetForRC(MF, RC, Allocatable); + // A register class with no allocatable subclass returns an empty set. + const TargetRegisterClass *SubClass = getAllocatableClass(RC); + if (SubClass) + getAllocatableSetForRC(MF, SubClass, Allocatable); } else { for (TargetRegisterInfo::regclass_iterator I = regclass_begin(), E = regclass_end(); I != E; ++I) @@ -95,6 +143,16 @@ BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF, return Allocatable; } +static inline +const TargetRegisterClass *firstCommonClass(const uint32_t *A, + const uint32_t *B, + const TargetRegisterInfo *TRI) { + for (unsigned I = 0, E = TRI->getNumRegClasses(); I < E; I += 32) + if (unsigned Common = *A++ & *B++) + return TRI->getRegClass(I + CountTrailingZeros_32(Common)); + return 0; +} + const TargetRegisterClass * TargetRegisterInfo::getCommonSubClass(const TargetRegisterClass *A, const TargetRegisterClass *B) const { @@ -106,15 +164,83 @@ TargetRegisterInfo::getCommonSubClass(const TargetRegisterClass *A, // Register classes are ordered topologically, so the largest common // sub-class is the common sub-class with the smallest ID. - const unsigned *SubA = A->getSubClassMask(); - const unsigned *SubB = B->getSubClassMask(); + return firstCommonClass(A->getSubClassMask(), B->getSubClassMask(), this); - // We could start the search from max(A.ID, B.ID), but we are only going to - // execute 2-3 iterations anyway. - for (unsigned Base = 0, BaseE = getNumRegClasses(); Base < BaseE; Base += 32) - if (unsigned Common = *SubA++ & *SubB++) - return getRegClass(Base + CountTrailingZeros_32(Common)); +const TargetRegisterClass * +TargetRegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, + const TargetRegisterClass *B, + unsigned Idx) const { + assert(A && B && "Missing register class"); + assert(Idx && "Bad sub-register index"); + + // Find Idx in the list of super-register indices. + for (SuperRegClassIterator RCI(B, this); RCI.isValid(); ++RCI) + if (RCI.getSubReg() == Idx) + // The bit mask contains all register classes that are projected into B + // by Idx. Find a class that is also a sub-class of A.
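+      // Worked example with hypothetical mask words: if the first 32-bit
+      // word of RCI.getMask() is 0x30 and the first word of
+      // A->getSubClassMask() is 0x22, the common bits are 0x20, so
+      // firstCommonClass returns getRegClass(5), because
+      // CountTrailingZeros_32(0x20) == 5.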
+ return firstCommonClass(RCI.getMask(), A->getSubClassMask(), this); + return 0; +} - // No common sub-class exists. - return NULL; +const TargetRegisterClass *TargetRegisterInfo:: +getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA, + const TargetRegisterClass *RCB, unsigned SubB, + unsigned &PreA, unsigned &PreB) const { + assert(RCA && SubA && RCB && SubB && "Invalid arguments"); + + // Search all pairs of sub-register indices that project into RCA and RCB + // respectively. This is quadratic, but usually the sets are very small. On + // most targets like X86, there will only be a single sub-register index + // (e.g., sub_16bit projecting into GR16). + // + // The worst case is a register class like DPR on ARM. + // We have indices dsub_0..dsub_7 projecting into that class. + // + // It is very common that one register class is a sub-register of the other. + // Arrange for RCA to be the larger register so the answer will be found in + // the first iteration. This makes the search linear for the most common + // case. + const TargetRegisterClass *BestRC = 0; + unsigned *BestPreA = &PreA; + unsigned *BestPreB = &PreB; + if (RCA->getSize() < RCB->getSize()) { + std::swap(RCA, RCB); + std::swap(SubA, SubB); + std::swap(BestPreA, BestPreB); + } + + // Also terminate the search once we have found a register class as small as + // RCA. + unsigned MinSize = RCA->getSize(); + + for (SuperRegClassIterator IA(RCA, this, true); IA.isValid(); ++IA) { + unsigned FinalA = composeSubRegIndices(IA.getSubReg(), SubA); + for (SuperRegClassIterator IB(RCB, this, true); IB.isValid(); ++IB) { + // Check if a common super-register class exists for this index pair. + const TargetRegisterClass *RC = + firstCommonClass(IA.getMask(), IB.getMask(), this); + if (!RC || RC->getSize() < MinSize) + continue; + + // The indexes must compose identically: PreA+SubA == PreB+SubB. + unsigned FinalB = composeSubRegIndices(IB.getSubReg(), SubB); + if (FinalA != FinalB) + continue; + + // Is RC a better candidate than BestRC? + if (BestRC && RC->getSize() >= BestRC->getSize()) + continue; + + // Yes, RC is the smallest super-register seen so far. + BestRC = RC; + *BestPreA = IA.getSubReg(); + *BestPreB = IB.getSubReg(); + + // Bail early if we reached MinSize. We won't find a better candidate.
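+        // For instance, in the containment case described above - RCB
+        // reachable from RCA by one sub-register index - the first index
+        // pair can already yield a class of size MinSize, so the quadratic
+        // scan ends after a single outer iteration.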
+ if (BestRC->getSize() == MinSize) + return BestRC; + } + } + return BestRC; } diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 08c732c..95e83ec 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -117,7 +117,7 @@ static unsigned MatchRegisterName(StringRef Name); /// } -static bool isImmSExti16i8Value(uint64_t Value) { +static bool isImmSExti16i8Value(uint64_t Value) { return (( Value <= 0x000000000000007FULL)|| (0x000000000000FF80ULL <= Value && Value <= 0x000000000000FFFFULL)|| (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); @@ -135,12 +135,12 @@ static bool isImmZExtu32u8Value(uint64_t Value) { static bool isImmSExti64i8Value(uint64_t Value) { return (( Value <= 0x000000000000007FULL)|| - (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); } static bool isImmSExti64i32Value(uint64_t Value) { return (( Value <= 0x000000007FFFFFFFULL)|| - (0xFFFFFFFF80000000ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + (0xFFFFFFFF80000000ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); } namespace { @@ -187,7 +187,7 @@ struct X86Operand : public MCParsedAsmOperand { SMLoc getStartLoc() const { return StartLoc; } /// getEndLoc - Get the location of the last token of this operand. SMLoc getEndLoc() const { return EndLoc; } - + SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); } virtual void print(raw_ostream &OS) const {} @@ -309,28 +309,45 @@ struct X86Operand : public MCParsedAsmOperand { } bool isMem() const { return Kind == Memory; } - bool isMem8() const { + bool isMem8() const { return Kind == Memory && (!Mem.Size || Mem.Size == 8); } - bool isMem16() const { + bool isMem16() const { return Kind == Memory && (!Mem.Size || Mem.Size == 16); } - bool isMem32() const { + bool isMem32() const { return Kind == Memory && (!Mem.Size || Mem.Size == 32); } - bool isMem64() const { + bool isMem64() const { return Kind == Memory && (!Mem.Size || Mem.Size == 64); } - bool isMem80() const { + bool isMem80() const { return Kind == Memory && (!Mem.Size || Mem.Size == 80); } - bool isMem128() const { + bool isMem128() const { return Kind == Memory && (!Mem.Size || Mem.Size == 128); } - bool isMem256() const { + bool isMem256() const { return Kind == Memory && (!Mem.Size || Mem.Size == 256); } + bool isMemVX32() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 32) && + getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15; + } + bool isMemVY32() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 32) && + getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15; + } + bool isMemVX64() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 64) && + getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15; + } + bool isMemVY64() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 64) && + getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15; + } + bool isAbsMem() const { return Kind == Memory && !getMemSegReg() && !getMemBaseReg() && !getMemIndexReg() && getMemScale() == 1; @@ -356,26 +373,38 @@ struct X86Operand : public MCParsedAsmOperand { addExpr(Inst, getImm()); } - void addMem8Operands(MCInst &Inst, unsigned N) const { - addMemOperands(Inst, N); + void addMem8Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } + void addMem16Operands(MCInst &Inst, unsigned N) const { + 
addMemOperands(Inst, N); + } + void addMem32Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); } - void addMem16Operands(MCInst &Inst, unsigned N) const { - addMemOperands(Inst, N); + void addMem64Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); } - void addMem32Operands(MCInst &Inst, unsigned N) const { - addMemOperands(Inst, N); + void addMem80Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); } - void addMem64Operands(MCInst &Inst, unsigned N) const { - addMemOperands(Inst, N); + void addMem128Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); } - void addMem80Operands(MCInst &Inst, unsigned N) const { - addMemOperands(Inst, N); + void addMem256Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); } - void addMem128Operands(MCInst &Inst, unsigned N) const { - addMemOperands(Inst, N); + void addMemVX32Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); } - void addMem256Operands(MCInst &Inst, unsigned N) const { - addMemOperands(Inst, N); + void addMemVY32Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } + void addMemVX64Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } + void addMemVY64Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); } void addMemOperands(MCInst &Inst, unsigned N) const { @@ -467,7 +496,7 @@ bool X86AsmParser::isSrcOp(X86Operand &Op) { bool X86AsmParser::isDstOp(X86Operand &Op) { unsigned basereg = is64BitMode() ? X86::RDI : X86::EDI; - return Op.isMem() && + return Op.isMem() && (Op.Mem.SegReg == 0 || Op.Mem.SegReg == X86::ES) && isa<MCConstantExpr>(Op.Mem.Disp) && cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 && @@ -611,7 +640,7 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, if (getLexer().isNot(AsmToken::LBrac)) return ErrorOperand(Start, "Expected '[' token!"); Parser.Lex(); - + if (getLexer().is(AsmToken::Identifier)) { // Parse BaseReg if (ParseRegister(BaseReg, Start, End)) { @@ -638,11 +667,11 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, // Handle '[' Scale*IndexReg ']' Parser.Lex(); SMLoc IdxRegLoc = Parser.getTok().getLoc(); - if (ParseRegister(IndexReg, IdxRegLoc, End)) - return ErrorOperand(IdxRegLoc, "Expected register"); + if (ParseRegister(IndexReg, IdxRegLoc, End)) + return ErrorOperand(IdxRegLoc, "Expected register"); Scale = Val; } else - return ErrorOperand(Loc, "Unepxeted token"); + return ErrorOperand(Loc, "Unexpected token"); } if (getLexer().is(AsmToken::Plus) || getLexer().is(AsmToken::Minus)) { @@ -655,8 +684,8 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, if (getLexer().is(AsmToken::Star)) { Parser.Lex(); SMLoc IdxRegLoc = Parser.getTok().getLoc(); - if (ParseRegister(IndexReg, IdxRegLoc, End)) - return ErrorOperand(IdxRegLoc, "Expected register"); + if (ParseRegister(IndexReg, IdxRegLoc, End)) + return ErrorOperand(IdxRegLoc, "Expected register"); Scale = Val; } else if (getLexer().is(AsmToken::RBrac)) { const MCExpr *ValExpr = MCConstantExpr::Create(Val, getContext()); @@ -668,7 +697,7 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, End = Parser.getTok().getLoc(); if (!IndexReg) ParseRegister(IndexReg, Start, End); - else if (getParser().ParseExpression(Disp, End)) return 0; + else if (getParser().ParseExpression(Disp, End)) return 0; } } @@ -881,7 +910,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { if 
(getParser().ParseAbsoluteExpression(ScaleVal)){ Error(Loc, "expected scale expression"); return 0; - } + } // Validate the scale amount. if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && ScaleVal != 8){ @@ -916,15 +945,18 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { // If we have both a base register and an index register make sure they are // both 64-bit or 32-bit registers. + // To support VSIB, IndexReg can be 128-bit or 256-bit registers. if (BaseReg != 0 && IndexReg != 0) { if (X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) && - !X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg) && + (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) || + X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg)) && IndexReg != X86::RIZ) { Error(IndexLoc, "index register is 32-bit, but base register is 64-bit"); return 0; } if (X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg) && - !X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg) && + (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) || + X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg)) && IndexReg != X86::EIZ){ Error(IndexLoc, "index register is 64-bit, but base register is 32-bit"); return 0; @@ -944,7 +976,7 @@ ParseInstruction(StringRef Name, SMLoc NameLoc, if (PatchedName.startswith("set") && PatchedName.endswith("b") && PatchedName != "setb" && PatchedName != "setnb") PatchedName = PatchedName.substr(0, Name.size()-1); - + // FIXME: Hack to recognize cmp<comparison code>{ss,sd,ps,pd}. const MCExpr *ExtraImmOp = 0; if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) && @@ -1204,20 +1236,20 @@ ParseInstruction(StringRef Name, SMLoc NameLoc, // Intel syntax X86Operand *Op1 = static_cast<X86Operand*>(Operands[2]); if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) && - cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) { - delete Operands[2]; - Operands.pop_back(); + cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) { + delete Operands[2]; + Operands.pop_back(); } } else { X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]); if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) && - cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) { - delete Operands[1]; - Operands.erase(Operands.begin() + 1); + cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) { + delete Operands[1]; + Operands.erase(Operands.begin() + 1); } } } - + // Transforms "int $3" into "int3" as a size optimization. We can't write an // instalias with an immediate operand yet. if (Name == "int" && Operands.size() == 2) { @@ -1520,7 +1552,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, case Match_Success: // Some instructions need post-processing to, for example, tweak which // encoding is selected. Loop on it while changes happen so the - // individual transformations can chain off each other. + // individual transformations can chain off each other. while (processInstruction(Inst, Operands)) ; @@ -1558,12 +1590,12 @@ MatchAndEmitInstruction(SMLoc IDLoc, // Otherwise, we assume that this may be an integer instruction, which comes // in 8/16/32/64-bit forms using the b,w,l,q suffixes respectively. const char *Suffixes = Base[0] != 'f' ? "bwlq" : "slt\0"; - + // Check for the various suffix matches. 
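  // Illustrative walk-through (hypothetical input): a suffix-less
  // "add $1, (%rax)" is retried as addb, addw, addl and addq; the four match
  // results below are then combined so that a single successful match selects
  // the encoding and anything else produces a diagnostic.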
Tmp[Base.size()] = Suffixes[0]; unsigned ErrorInfoIgnore; unsigned Match1, Match2, Match3, Match4; - + Match1 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore); Tmp[Base.size()] = Suffixes[1]; Match2 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore); @@ -1673,10 +1705,10 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { getParser().setAssemblerDialect(1); if (getLexer().isNot(AsmToken::EndOfStatement)) { if(Parser.getTok().getString() == "noprefix") { - // FIXME : Handle noprefix - Parser.Lex(); + // FIXME : Handle noprefix + Parser.Lex(); } else - return true; + return true; } return false; } @@ -1691,19 +1723,19 @@ bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { const MCExpr *Value; if (getParser().ParseExpression(Value)) return true; - + getParser().getStreamer().EmitValue(Value, Size, 0 /*addrspace*/); - + if (getLexer().is(AsmToken::EndOfStatement)) break; - + // FIXME: Improve diagnostic. if (getLexer().isNot(AsmToken::Comma)) return Error(L, "unexpected token in directive"); Parser.Lex(); } } - + Parser.Lex(); return false; } diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index f612e23..b886d46 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -52,6 +52,8 @@ endif() add_llvm_target(X86CodeGen ${sources}) +add_dependencies(LLVMX86CodeGen intrinsics_gen) + add_subdirectory(AsmParser) add_subdirectory(Disassembler) add_subdirectory(InstPrinter) diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp index b13a006..4bbfe95 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -356,15 +356,15 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, // Special case those X86 instructions that use the imm8 as a set of // bits, bit count, etc. and are not sign-extend. if (Opcode != X86::BLENDPSrri && Opcode != X86::BLENDPDrri && - Opcode != X86::PBLENDWrri && Opcode != X86::MPSADBWrri && - Opcode != X86::DPPSrri && Opcode != X86::DPPDrri && - Opcode != X86::INSERTPSrr && Opcode != X86::VBLENDPSYrri && - Opcode != X86::VBLENDPSYrmi && Opcode != X86::VBLENDPDYrri && - Opcode != X86::VBLENDPDYrmi && Opcode != X86::VPBLENDWrri && - Opcode != X86::VMPSADBWrri && Opcode != X86::VDPPSYrri && - Opcode != X86::VDPPSYrmi && Opcode != X86::VDPPDrri && - Opcode != X86::VINSERTPSrr) - type = TYPE_MOFFS8; + Opcode != X86::PBLENDWrri && Opcode != X86::MPSADBWrri && + Opcode != X86::DPPSrri && Opcode != X86::DPPDrri && + Opcode != X86::INSERTPSrr && Opcode != X86::VBLENDPSYrri && + Opcode != X86::VBLENDPSYrmi && Opcode != X86::VBLENDPDYrri && + Opcode != X86::VBLENDPDYrmi && Opcode != X86::VPBLENDWrri && + Opcode != X86::VMPSADBWrri && Opcode != X86::VDPPSYrri && + Opcode != X86::VDPPSYrmi && Opcode != X86::VDPPDrri && + Opcode != X86::VINSERTPSrr) + type = TYPE_MOFFS8; break; case ENCODING_IW: type = TYPE_MOFFS16; @@ -498,7 +498,38 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, } else { baseReg = MCOperand::CreateReg(0); } - + + // Check whether we are handling VSIB addressing mode for GATHER. + // If sibIndex was set to SIB_INDEX_NONE, index offset is 4 and + // we should use SIB_INDEX_XMM4|YMM4 for VSIB. + // I don't see a way to get the correct IndexReg in readSIB: + // We can tell whether it is VSIB or SIB after instruction ID is decoded, + // but instruction ID may not be decoded yet when calling readSIB. 
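+  // Illustrative contrast between the two opcode sets built below:
+  // VGATHERDPSrm takes a 128-bit XMM index and lands in IndexIs128, while
+  // VGATHERDPSYrm takes a 256-bit YMM index and lands in IndexIs256, so the
+  // same SIB index field is rebased onto SIB_INDEX_XMM0 or SIB_INDEX_YMM0
+  // depending on the opcode.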
+ uint32_t Opcode = mcInst.getOpcode(); + bool IndexIs128 = (Opcode == X86::VGATHERDPDrm || + Opcode == X86::VGATHERDPDYrm || + Opcode == X86::VGATHERQPDrm || + Opcode == X86::VGATHERDPSrm || + Opcode == X86::VGATHERQPSrm || + Opcode == X86::VPGATHERDQrm || + Opcode == X86::VPGATHERDQYrm || + Opcode == X86::VPGATHERQQrm || + Opcode == X86::VPGATHERDDrm || + Opcode == X86::VPGATHERQDrm); + bool IndexIs256 = (Opcode == X86::VGATHERQPDYrm || + Opcode == X86::VGATHERDPSYrm || + Opcode == X86::VGATHERQPSYrm || + Opcode == X86::VPGATHERQQYrm || + Opcode == X86::VPGATHERDDYrm || + Opcode == X86::VPGATHERQDYrm); + if (IndexIs128 || IndexIs256) { + unsigned IndexOffset = insn.sibIndex - + (insn.addressSize == 8 ? SIB_INDEX_RAX:SIB_INDEX_EAX); + SIBIndex IndexBase = IndexIs256 ? SIB_INDEX_YMM0 : SIB_INDEX_XMM0; + insn.sibIndex = (SIBIndex)(IndexBase + + (insn.sibIndex == SIB_INDEX_NONE ? 4 : IndexOffset)); + } + if (insn.sibIndex != SIB_INDEX_NONE) { switch (insn.sibIndex) { default: @@ -509,6 +540,8 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, indexReg = MCOperand::CreateReg(X86::x); break; EA_BASES_32BIT EA_BASES_64BIT + REGS_XMM + REGS_YMM #undef ENTRY } } else { diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index fae309b..e2caf6a 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -310,11 +310,14 @@ typedef enum { * SIBIndex - All possible values of the SIB index field. * Borrows entries from ALL_EA_BASES with the special case that * sib is synonymous with NONE. + * Vector SIB: index can be XMM or YMM. */ typedef enum { SIB_INDEX_NONE, #define ENTRY(x) SIB_INDEX_##x, ALL_EA_BASES + REGS_XMM + REGS_YMM #undef ENTRY SIB_INDEX_max } SIBIndex; diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp index f532019..64ac5e6 100644 --- a/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -96,7 +96,17 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::PSHUFHWmi: case X86::VPSHUFHWmi: DestName = getRegName(MI->getOperand(0).getReg()); - DecodePSHUFHWMask(MI->getOperand(MI->getNumOperands()-1).getImm(), + DecodePSHUFHWMask(MVT::v8i16, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + case X86::VPSHUFHWYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::VPSHUFHWYmi: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodePSHUFHWMask(MVT::v16i16, + MI->getOperand(MI->getNumOperands()-1).getImm(), ShuffleMask); break; case X86::PSHUFLWri: @@ -106,7 +116,17 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::PSHUFLWmi: case X86::VPSHUFLWmi: DestName = getRegName(MI->getOperand(0).getReg()); - DecodePSHUFLWMask(MI->getOperand(MI->getNumOperands()-1).getImm(), + DecodePSHUFLWMask(MVT::v8i16, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + case X86::VPSHUFLWYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. 
+ case X86::VPSHUFLWYmi: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodePSHUFLWMask(MVT::v16i16, + MI->getOperand(MI->getNumOperands()-1).getImm(), ShuffleMask); break; @@ -487,6 +507,16 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; + case X86::VPERMQYri: + case X86::VPERMPDYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::VPERMQYmi: + case X86::VPERMPDYmi: + DecodeVPERMMask(MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; } diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index a0bb6dc..db597fb 100644 --- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -94,40 +94,83 @@ namespace X86II { MO_PLT, /// MO_TLSGD - On a symbol operand this indicates that the immediate is - /// some TLS offset. + /// the offset of the GOT entry with the TLS index structure that contains + /// the module number and variable offset for the symbol. Used in the + /// general dynamic TLS access model. /// /// See 'ELF Handling for Thread-Local Storage' for more details. /// SYMBOL_LABEL @TLSGD MO_TLSGD, + /// MO_TLSLD - On a symbol operand this indicates that the immediate is + /// the offset of the GOT entry with the TLS index for the module that + /// contains the symbol. When this index is passed to a call to + /// __tls_get_addr, the function will return the base address of the TLS + /// block for the symbol. Used in the x86-64 local dynamic TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @TLSLD + MO_TLSLD, + + /// MO_TLSLDM - On a symbol operand this indicates that the immediate is + /// the offset of the GOT entry with the TLS index for the module that + /// contains the symbol. When this index is passed to a call to + /// ___tls_get_addr, the function will return the base address of the TLS + /// block for the symbol. Used in the IA32 local dynamic TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @TLSLDM + MO_TLSLDM, + /// MO_GOTTPOFF - On a symbol operand this indicates that the immediate is - /// some TLS offset. + /// the offset of the GOT entry with the thread-pointer offset for the + /// symbol. Used in the x86-64 initial exec TLS access model. /// /// See 'ELF Handling for Thread-Local Storage' for more details. /// SYMBOL_LABEL @GOTTPOFF MO_GOTTPOFF, /// MO_INDNTPOFF - On a symbol operand this indicates that the immediate is - /// some TLS offset. + /// the absolute address of the GOT entry with the negative thread-pointer + /// offset for the symbol. Used in the non-PIC IA32 initial exec TLS access + /// model. /// /// See 'ELF Handling for Thread-Local Storage' for more details. /// SYMBOL_LABEL @INDNTPOFF MO_INDNTPOFF, /// MO_TPOFF - On a symbol operand this indicates that the immediate is - /// some TLS offset. + /// the thread-pointer offset for the symbol. Used in the x86-64 local + /// exec TLS access model. /// /// See 'ELF Handling for Thread-Local Storage' for more details. /// SYMBOL_LABEL @TPOFF MO_TPOFF, + /// MO_DTPOFF - On a symbol operand this indicates that the immediate is + /// the offset of the GOT entry with the TLS offset of the symbol. Used + /// in the local dynamic TLS access model.
+ /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @DTPOFF + MO_DTPOFF, + /// MO_NTPOFF - On a symbol operand this indicates that the immediate is - /// some TLS offset. + /// the negative thread-pointer offset for the symbol. Used in the IA32 + /// local exec TLS access model. /// /// See 'ELF Handling for Thread-Local Storage' for more details. /// SYMBOL_LABEL @NTPOFF MO_NTPOFF, + /// MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is + /// the offset of the GOT entry with the negative thread-pointer offset for + /// the symbol. Used in the PIC IA32 initial exec TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @GOTNTPOFF + MO_GOTNTPOFF, + /// MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the /// reference is actually to the "__imp_FOO" symbol. This is used for /// dllimport linkage on windows. @@ -438,17 +481,17 @@ namespace X86II { // getBaseOpcodeFor - This function returns the "base" X86 opcode for the // specified machine instruction. // - static inline unsigned char getBaseOpcodeFor(uint64_t TSFlags) { + inline unsigned char getBaseOpcodeFor(uint64_t TSFlags) { return TSFlags >> X86II::OpcodeShift; } - static inline bool hasImm(uint64_t TSFlags) { + inline bool hasImm(uint64_t TSFlags) { return (TSFlags & X86II::ImmMask) != 0; } /// getSizeOfImm - Decode the "size of immediate" field from the TSFlags field /// of the specified instruction. - static inline unsigned getSizeOfImm(uint64_t TSFlags) { + inline unsigned getSizeOfImm(uint64_t TSFlags) { switch (TSFlags & X86II::ImmMask) { default: llvm_unreachable("Unknown immediate size"); case X86II::Imm8: @@ -463,7 +506,7 @@ namespace X86II { /// isImmPCRel - Return true if the immediate of the specified instruction's /// TSFlags indicates that it is pc relative. - static inline unsigned isImmPCRel(uint64_t TSFlags) { + inline unsigned isImmPCRel(uint64_t TSFlags) { switch (TSFlags & X86II::ImmMask) { default: llvm_unreachable("Unknown immediate size"); case X86II::Imm8PCRel: @@ -486,9 +529,11 @@ namespace X86II { /// is duplicated in the MCInst (e.g. "EAX = addl EAX, [mem]") it is only /// counted as one operand. /// - static inline int getMemoryOperandNo(uint64_t TSFlags, unsigned Opcode) { + inline int getMemoryOperandNo(uint64_t TSFlags, unsigned Opcode) { switch (TSFlags & X86II::FormMask) { - case X86II::MRMInitReg: llvm_unreachable("FIXME: Remove this form"); + case X86II::MRMInitReg: + // FIXME: Remove this form. + return -1; default: llvm_unreachable("Unknown FormMask value in getMemoryOperandNo!"); case X86II::Pseudo: case X86II::RawFrm: @@ -546,7 +591,7 @@ namespace X86II { /// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or /// higher) register? e.g. r8, xmm8, xmm13, etc. 
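/// For example, %r8d and %xmm12 are extended registers that need the REX or
/// VEX extension bit, while %eax and %xmm7 are not.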
- static inline bool isX86_64ExtendedReg(unsigned RegNo) { + inline bool isX86_64ExtendedReg(unsigned RegNo) { switch (RegNo) { default: break; case X86::R8: case X86::R9: case X86::R10: case X86::R11: @@ -568,7 +613,7 @@ namespace X86II { return false; } - static inline bool isX86_64NonExtLowByteReg(unsigned reg) { + inline bool isX86_64NonExtLowByteReg(unsigned reg) { return (reg == X86::SPL || reg == X86::BPL || reg == X86::SIL || reg == X86::DIL); } diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index afa545c..49c07f3 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -35,19 +35,6 @@ AsmWriterFlavor("x86-asm-syntax", cl::init(ATT), clEnumValEnd)); -static const char *const x86_asm_table[] = { - "{si}", "S", - "{di}", "D", - "{ax}", "a", - "{cx}", "c", - "{memory}", "memory", - "{flags}", "", - "{dirflag}", "", - "{fpsr}", "", - "{fpcr}", "", - "{cc}", "cc", - 0,0}; - void X86MCAsmInfoDarwin::anchor() { } X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) { @@ -55,7 +42,6 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) { if (is64Bit) PointerSize = 8; - AsmTransCBE = x86_asm_table; AssemblerDialect = AsmWriterFlavor; TextAlignFillValue = 0x90; @@ -88,7 +74,6 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) { if (T.getArch() == Triple::x86_64) PointerSize = 8; - AsmTransCBE = x86_asm_table; AssemblerDialect = AsmWriterFlavor; TextAlignFillValue = 0x90; @@ -137,7 +122,6 @@ X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) { PrivateGlobalPrefix = ".L"; } - AsmTransCBE = x86_asm_table; AssemblerDialect = AsmWriterFlavor; TextAlignFillValue = 0x90; @@ -151,7 +135,6 @@ X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) { PrivateGlobalPrefix = ".L"; } - AsmTransCBE = x86_asm_table; AssemblerDialect = AsmWriterFlavor; TextAlignFillValue = 0x90; diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 80990e5..4a38324 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -139,6 +139,7 @@ public: MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI, MCContext &Ctx) { return new X86MCCodeEmitter(MCII, STI, Ctx); @@ -569,7 +570,17 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, } // Classify VEX_B, VEX_4V, VEX_R, VEX_X + unsigned NumOps = Desc.getNumOperands(); unsigned CurOp = 0; + if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0) + ++CurOp; + else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0) { + assert(Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1); + // Special case for GATHER with 2 TIED_TO operands + // Skip the first 2 operands: dst, mask_wb + CurOp += 2; + } + switch (TSFlags & X86II::FormMask) { case X86II::MRMInitReg: llvm_unreachable("FIXME: Remove this!"); case X86II::MRMDestMem: { @@ -602,11 +613,11 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // FMA4: // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM) // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M), - if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg())) + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp++).getReg())) VEX_R = 0x0; if (HasVEX_4V) - VEX_4V = getVEXRegisterEncoding(MI, 1); + VEX_4V = 
getVEXRegisterEncoding(MI, CurOp); if (X86II::isX86_64ExtendedReg( MI.getOperand(MemOperand+X86::AddrBaseReg).getReg())) @@ -616,7 +627,12 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, VEX_X = 0x0; if (HasVEX_4VOp3) - VEX_4V = getVEXRegisterEncoding(MI, X86::AddrNumOperands+1); + // Instruction format for 4VOp3: + // src1(ModR/M), MemAddr, src3(VEX_4V) + // CurOp points to start of the MemoryOperand, + // it skips TIED_TO operands if they exist, then increments past src1. + // CurOp + X86::AddrNumOperands will point to src3. + VEX_4V = getVEXRegisterEncoding(MI, CurOp+X86::AddrNumOperands); break; case X86II::MRM0m: case X86II::MRM1m: case X86II::MRM2m: case X86II::MRM3m: @@ -961,11 +977,14 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, // FIXME: This should be handled during MCInst lowering. unsigned NumOps = Desc.getNumOperands(); unsigned CurOp = 0; - if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1) + if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0) ++CurOp; - else if (NumOps > 2 && Desc.getOperandConstraint(NumOps-1, MCOI::TIED_TO)== 0) - // Skip the last source operand that is tied_to the dest reg. e.g. LXADD32 - --NumOps; + else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0) { + assert(Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1); + // Special case for GATHER with 2 TIED_TO operands + // Skip the first 2 operands: dst, mask_wb + CurOp += 2; + } // Keep track of the current byte being emitted. unsigned CurByte = 0; @@ -1037,7 +1056,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, SrcRegNum = CurOp + X86::AddrNumOperands; if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) - SrcRegNum++; + ++SrcRegNum; EmitMemModRMByte(MI, CurOp, GetX86RegNum(MI.getOperand(SrcRegNum)), @@ -1050,15 +1069,15 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, SrcRegNum = CurOp + 1; if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) - SrcRegNum++; + ++SrcRegNum; - if(HasMemOp4) // Skip 2nd src (which is encoded in I8IMM) - SrcRegNum++; + if (HasMemOp4) // Skip 2nd src (which is encoded in I8IMM) + ++SrcRegNum; EmitRegModRMByte(MI.getOperand(SrcRegNum), GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS); - // 2 operands skipped with HasMemOp4, comensate accordingly + // 2 operands skipped with HasMemOp4, compensate accordingly CurOp = HasMemOp4 ? SrcRegNum : SrcRegNum + 1; if (HasVEX_4VOp3) ++CurOp; @@ -1071,7 +1090,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, ++AddrOperands; ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV). } - if(HasMemOp4) // Skip second register source (encoded in I8IMM) + if (HasMemOp4) // Skip second register source (encoded in I8IMM) ++FirstMemOp; EmitByte(BaseOpcode, CurByte, OS); @@ -1089,7 +1108,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::MRM4r: case X86II::MRM5r: case X86II::MRM6r: case X86II::MRM7r: if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV). - CurOp++; + ++CurOp; EmitByte(BaseOpcode, CurByte, OS); EmitRegModRMByte(MI.getOperand(CurOp++), (TSFlags & X86II::FormMask)-X86II::MRM0r, @@ -1100,7 +1119,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::MRM4m: case X86II::MRM5m: case X86II::MRM6m: case X86II::MRM7m: if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
- CurOp++; + ++CurOp; EmitByte(BaseOpcode, CurByte, OS); EmitMemModRMByte(MI, CurOp, (TSFlags & X86II::FormMask)-X86II::MRM0m, TSFlags, CurByte, OS, Fixups); @@ -1149,22 +1168,23 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, } // If there is a remaining operand, it must be a trailing immediate. Emit it - // according to the right size for the instruction. - if (CurOp != NumOps) { + // according to the right size for the instruction. Some instructions + // (SSE4a extrq and insertq) have two trailing immediates. + while (CurOp != NumOps && NumOps - CurOp <= 2) { // The last source register of a 4 operand instruction in AVX is encoded // in bits[7:4] of an immediate byte. if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) { const MCOperand &MO = MI.getOperand(HasMemOp4 ? MemOp4_I8IMMOperand - : CurOp); - CurOp++; - bool IsExtReg = X86II::isX86_64ExtendedReg(MO.getReg()); - unsigned RegNum = (IsExtReg ? (1 << 7) : 0); - RegNum |= GetX86RegNum(MO) << 4; + : CurOp); + ++CurOp; + unsigned RegNum = GetX86RegNum(MO) << 4; + if (X86II::isX86_64ExtendedReg(MO.getReg())) + RegNum |= 1 << 7; // If there is an additional 5th operand it must be an immediate, which // is encoded in bits[3:0] - if(CurOp != NumOps) { + if (CurOp != NumOps) { const MCOperand &MIMM = MI.getOperand(CurOp++); - if(MIMM.isImm()) { + if (MIMM.isImm()) { unsigned Val = MIMM.getImm(); assert(Val < 16 && "Immediate operand value out of range"); RegNum |= Val; diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index 9896cbe..4650069 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -76,6 +76,7 @@ namespace X86_MC { } MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI, MCContext &Ctx); diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp index a802333..8b87c1f 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -64,13 +64,13 @@ void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) { /// DecodePSHUFMask - This decodes the shuffle masks for pshufd, and vpermilp*. /// VT indicates the type of the vector allowing it to handle different /// datatypes and vector widths.
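/// As a worked example, Imm = 0x1B on v4i32 decodes to the reversal mask
/// <3,2,1,0>: each two-bit field of the immediate selects one element within
/// a 128-bit lane, and the pattern repeats per lane for wider vectors.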
-void DecodePSHUFMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { +void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = VT.getSizeInBits() / 128; unsigned NumLaneElts = NumElts / NumLanes; - int NewImm = Imm; + unsigned NewImm = Imm; for (unsigned l = 0; l != NumElts; l += NumLaneElts) { for (unsigned i = 0; i != NumLaneElts; ++i) { ShuffleMask.push_back(NewImm % NumLaneElts + l); @@ -80,48 +80,55 @@ void DecodePSHUFMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { } } -void DecodePSHUFHWMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { - ShuffleMask.push_back(0); - ShuffleMask.push_back(1); - ShuffleMask.push_back(2); - ShuffleMask.push_back(3); - for (unsigned i = 0; i != 4; ++i) { - ShuffleMask.push_back(4+(Imm & 3)); - Imm >>= 2; +void DecodePSHUFHWMask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + for (unsigned l = 0; l != NumElts; l += 8) { + unsigned NewImm = Imm; + for (unsigned i = 0, e = 4; i != e; ++i) { + ShuffleMask.push_back(l + i); + } + for (unsigned i = 4, e = 8; i != e; ++i) { + ShuffleMask.push_back(l + 4 + (NewImm & 3)); + NewImm >>= 2; + } } } -void DecodePSHUFLWMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { - for (unsigned i = 0; i != 4; ++i) { - ShuffleMask.push_back((Imm & 3)); - Imm >>= 2; +void DecodePSHUFLWMask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + for (unsigned l = 0; l != NumElts; l += 8) { + unsigned NewImm = Imm; + for (unsigned i = 0, e = 4; i != e; ++i) { + ShuffleMask.push_back(l + (NewImm & 3)); + NewImm >>= 2; + } + for (unsigned i = 4, e = 8; i != e; ++i) { + ShuffleMask.push_back(l + i); + } } - ShuffleMask.push_back(4); - ShuffleMask.push_back(5); - ShuffleMask.push_back(6); - ShuffleMask.push_back(7); } /// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates /// the type of the vector allowing it to handle different datatypes and vector /// widths. -void DecodeSHUFPMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { +void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = VT.getSizeInBits() / 128; unsigned NumLaneElts = NumElts / NumLanes; - int NewImm = Imm; + unsigned NewImm = Imm; for (unsigned l = 0; l != NumElts; l += NumLaneElts) { - // Part that reads from dest. - for (unsigned i = 0; i != NumLaneElts/2; ++i) { - ShuffleMask.push_back(NewImm % NumLaneElts + l); - NewImm /= NumLaneElts; - } - // Part that reads from src. - for (unsigned i = 0; i != NumLaneElts/2; ++i) { - ShuffleMask.push_back(NewImm % NumLaneElts + NumElts + l); - NewImm /= NumLaneElts; + // each half of a lane comes from a different source + for (unsigned s = 0; s != NumElts*2; s += NumElts) { + for (unsigned i = 0; i != NumLaneElts/2; ++i) { + ShuffleMask.push_back(NewImm % NumLaneElts + s + l); + NewImm /= NumLaneElts; + } } if (NumLaneElts == 4) NewImm = Imm; // reload imm } @@ -130,7 +137,7 @@ void DecodeSHUFPMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { /// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd /// and punpckh*. VT indicates the type of the vector allowing it to handle /// different datatypes and vector widths.
-void DecodeUNPCKHMask(EVT VT, SmallVectorImpl<int> &ShuffleMask) { +void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { unsigned NumElts = VT.getVectorNumElements(); // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate @@ -150,7 +157,7 @@ void DecodeUNPCKHMask(EVT VT, SmallVectorImpl<int> &ShuffleMask) { /// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd /// and punpckl*. VT indicates the type of the vector allowing it to handle /// different datatypes and vector widths. -void DecodeUNPCKLMask(EVT VT, SmallVectorImpl<int> &ShuffleMask) { +void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { unsigned NumElts = VT.getVectorNumElements(); // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate @@ -167,19 +174,26 @@ void DecodeUNPCKLMask(EVT VT, SmallVectorImpl<int> &ShuffleMask) { } } -void DecodeVPERM2X128Mask(EVT VT, unsigned Imm, +void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { if (Imm & 0x88) return; // Not a shuffle unsigned HalfSize = VT.getVectorNumElements()/2; - unsigned FstHalfBegin = (Imm & 0x3) * HalfSize; - unsigned SndHalfBegin = ((Imm >> 4) & 0x3) * HalfSize; - for (int i = FstHalfBegin, e = FstHalfBegin+HalfSize; i != e; ++i) - ShuffleMask.push_back(i); - for (int i = SndHalfBegin, e = SndHalfBegin+HalfSize; i != e; ++i) - ShuffleMask.push_back(i); + for (unsigned l = 0; l != 2; ++l) { + unsigned HalfBegin = ((Imm >> (l*4)) & 0x3) * HalfSize; + for (unsigned i = HalfBegin, e = HalfBegin+HalfSize; i != e; ++i) + ShuffleMask.push_back(i); + } +} + +/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD. +/// No VT provided since it only works on 256-bit, 4 element vectors. +void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + for (unsigned i = 0; i != 4; ++i) { + ShuffleMask.push_back((Imm >> (2*i)) & 3); + } } } // llvm namespace diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h index 5b8c6ef..70d8171 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -35,31 +35,35 @@ void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask); // <0,2> or <0,1,4,5> void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask); -void DecodePSHUFMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); +void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); -void DecodePSHUFHWMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); +void DecodePSHUFHWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); -void DecodePSHUFLWMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); +void DecodePSHUFLWMask(MVT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); /// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates /// the type of the vector allowing it to handle different datatypes and vector /// widths. -void DecodeSHUFPMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); +void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); /// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd /// and punpckh*. VT indicates the type of the vector allowing it to handle /// different datatypes and vector widths. 
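/// As a worked example, on v4i32 the UNPCKL mask is <0,4,1,5> and the UNPCKH
/// mask is <2,6,3,7>; with AVX the same interleave repeats in each 128-bit
/// lane.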
-void DecodeUNPCKHMask(EVT VT, SmallVectorImpl<int> &ShuffleMask); +void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); /// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd /// and punpckl*. VT indicates the type of the vector allowing it to handle /// different datatypes and vector widths. -void DecodeUNPCKLMask(EVT VT, SmallVectorImpl<int> &ShuffleMask); +void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); -void DecodeVPERM2X128Mask(EVT VT, unsigned Imm, +void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); +/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD. +/// No VT provided since it only works on 256-bit, 4 element vectors. +void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + } // llvm namespace #endif diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index ecc7b59..bf05ccf 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -36,6 +36,11 @@ FunctionPass *createX86ISelDag(X86TargetMachine &TM, /// register for PIC on x86-32. FunctionPass* createGlobalBaseRegPass(); +/// createCleanupLocalDynamicTLSPass() - This pass combines multiple accesses +/// to local-dynamic TLS variables so that the TLS base address for the module +/// is only fetched once per execution path through the function. +FunctionPass *createCleanupLocalDynamicTLSPass(); + /// createX86FloatingPointStackifierPass - This function returns a pass which /// converts floating point register references and pseudo instructions into /// floating point stack references and physical instructions. diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index b6591d4..6c1a816 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -86,21 +86,24 @@ def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX", def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2", "Enable AVX2 instructions", [FeatureAVX]>; -def FeatureCLMUL : SubtargetFeature<"clmul", "HasCLMUL", "true", - "Enable carry-less multiplication instructions">; -def FeatureFMA3 : SubtargetFeature<"fma3", "HasFMA3", "true", +def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true", + "Enable packed carry-less multiplication instructions", + [FeatureSSE2]>; +def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true", "Enable three-operand fused multiply-add", [FeatureAVX]>; def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true", "Enable four-operand fused multiply-add", - [FeatureAVX]>; + [FeatureAVX, FeatureSSE4A]>; def FeatureXOP : SubtargetFeature<"xop", "HasXOP", "true", - "Enable XOP instructions">; + "Enable XOP instructions", + [FeatureAVX, FeatureSSE4A]>; def FeatureVectorUAMem : SubtargetFeature<"vector-unaligned-mem", "HasVectorUAMem", "true", "Allow unaligned memory operands on vector/SIMD instructions">; def FeatureAES : SubtargetFeature<"aes", "HasAES", "true", - "Enable AES instructions">; + "Enable AES instructions", + [FeatureSSE2]>; def FeatureMOVBE : SubtargetFeature<"movbe", "HasMOVBE", "true", "Support MOVBE instruction">; def FeatureRDRAND : SubtargetFeature<"rdrand", "HasRDRAND", "true", @@ -128,10 +131,10 @@ def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom", "Intel Atom processors">; class Proc<string Name, list<SubtargetFeature> Features> - : Processor<Name, GenericItineraries, Features>; + : ProcessorModel<Name, GenericModel, Features>; class AtomProc<string Name, list<SubtargetFeature> Features> - : Processor<Name, AtomItineraries, Features>; +
: ProcessorModel<Name, AtomModel, Features>; def : Proc<"generic", []>; def : Proc<"i386", []>; @@ -169,25 +172,23 @@ def : Proc<"nehalem", [FeatureSSE42, FeatureCMPXCHG16B, // Westmere is the corei3/i5/i7 path from nehalem to sandybridge def : Proc<"westmere", [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem, FeatureFastUAMem, - FeaturePOPCNT, FeatureAES, FeatureCLMUL]>; + FeaturePOPCNT, FeatureAES, FeaturePCLMUL]>; // Sandy Bridge // SSE is not listed here since llvm treats AVX as a reimplementation of SSE, // rather than a superset. -// FIXME: Disabling AVX for now since it's not ready. -def : Proc<"corei7-avx", [FeatureSSE42, FeatureCMPXCHG16B, FeaturePOPCNT, - FeatureAES, FeatureCLMUL]>; +def : Proc<"corei7-avx", [FeatureAVX, FeatureCMPXCHG16B, FeaturePOPCNT, + FeatureAES, FeaturePCLMUL]>; // Ivy Bridge -def : Proc<"core-avx-i", [FeatureSSE42, FeatureCMPXCHG16B, FeaturePOPCNT, - FeatureAES, FeatureCLMUL, +def : Proc<"core-avx-i", [FeatureAVX, FeatureCMPXCHG16B, FeaturePOPCNT, + FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C, FeatureFSGSBase]>; // Haswell -// FIXME: Disabling AVX/AVX2/FMA3 for now since it's not ready. -def : Proc<"core-avx2", [FeatureSSE42, FeatureCMPXCHG16B, FeaturePOPCNT, - FeatureAES, FeatureCLMUL, FeatureRDRAND, +def : Proc<"core-avx2", [FeatureAVX2, FeatureCMPXCHG16B, FeaturePOPCNT, + FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C, FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI, - FeatureBMI2]>; + FeatureBMI2, FeatureFMA]>; def : Proc<"k6", [FeatureMMX]>; def : Proc<"k6-2", [Feature3DNow]>; @@ -211,21 +212,20 @@ def : Proc<"opteron-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, FeatureSlowBTMem]>; def : Proc<"athlon64-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, FeatureSlowBTMem]>; -def : Proc<"amdfam10", [FeatureSSE3, FeatureSSE4A, +def : Proc<"amdfam10", [FeatureSSE4A, Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT, FeatureSlowBTMem]>; // Bobcat def : Proc<"btver1", [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT]>; -// FIXME: Disabling AVX/FMA4 for now since it's not ready. 
// Bulldozer -def : Proc<"bdver1", [FeatureSSE42, FeatureSSE4A, FeatureCMPXCHG16B, - FeatureAES, FeatureCLMUL, - FeatureXOP, FeatureLZCNT, FeaturePOPCNT]>; +def : Proc<"bdver1", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, + FeatureAES, FeaturePCLMUL, + FeatureLZCNT, FeaturePOPCNT]>; // Enhanced Bulldozer -def : Proc<"bdver2", [FeatureSSE42, FeatureSSE4A, FeatureCMPXCHG16B, - FeatureAES, FeatureCLMUL, - FeatureXOP, FeatureF16C, FeatureLZCNT, +def : Proc<"bdver2", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, + FeatureAES, FeaturePCLMUL, + FeatureF16C, FeatureLZCNT, FeaturePOPCNT, FeatureBMI]>; def : Proc<"winchip-c6", [FeatureMMX]>; diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index 7db7ccb..db71e27 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -20,10 +20,10 @@ #include "X86TargetMachine.h" #include "InstPrinter/X86ATTInstPrinter.h" #include "llvm/CallingConv.h" +#include "llvm/DebugInfo.h" #include "llvm/DerivedTypes.h" #include "llvm/Module.h" #include "llvm/Type.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/Assembly/Writer.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" @@ -186,10 +186,14 @@ void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO, O << '-' << *MF->getPICBaseSymbol(); break; case X86II::MO_TLSGD: O << "@TLSGD"; break; + case X86II::MO_TLSLD: O << "@TLSLD"; break; + case X86II::MO_TLSLDM: O << "@TLSLDM"; break; case X86II::MO_GOTTPOFF: O << "@GOTTPOFF"; break; case X86II::MO_INDNTPOFF: O << "@INDNTPOFF"; break; case X86II::MO_TPOFF: O << "@TPOFF"; break; + case X86II::MO_DTPOFF: O << "@DTPOFF"; break; case X86II::MO_NTPOFF: O << "@NTPOFF"; break; + case X86II::MO_GOTNTPOFF: O << "@GOTNTPOFF"; break; case X86II::MO_GOTPCREL: O << "@GOTPCREL"; break; case X86II::MO_GOT: O << "@GOT"; break; case X86II::MO_GOTOFF: O << "@GOTOFF"; break; @@ -403,7 +407,9 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const MachineOperand &MO = MI->getOperand(OpNo); switch (ExtraCode[0]) { - default: return true; // Unknown modifier. + default: + // See if this is a generic print operand + return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); case 'a': // This is an address. Currently only 'i' and 'r' are expected. if (MO.isImm()) { O << MO.getImm(); diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index d148989..a6d2709 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -29,10 +29,13 @@ def RetCC_X86Common : CallingConv<[ // up in AX and AH, which overlap. Front-ends wishing to conform to the ABI // for functions that return two i8 values are currently expected to pack the // values into an i16 (which uses AX, and thus AL:AH). - CCIfType<[i8] , CCAssignToReg<[AL, DL]>>, - CCIfType<[i16], CCAssignToReg<[AX, DX]>>, - CCIfType<[i32], CCAssignToReg<[EAX, EDX]>>, - CCIfType<[i64], CCAssignToReg<[RAX, RDX]>>, + // + // For code that doesn't care about the ABI, we allow returning more than two + // integer values in registers. + CCIfType<[i8] , CCAssignToReg<[AL, DL, CL]>>, + CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>, + CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>, + CCIfType<[i64], CCAssignToReg<[RAX, RDX, RCX]>>, // Vector types are returned in XMM0 and XMM1, when they fit. XMM2 and XMM3 // can only be used by ABI non-compliant code. If the target doesn't have XMM @@ -413,7 +416,7 @@ def CC_X86 : CallingConv<[ // Callee-saved Registers. 
//===----------------------------------------------------------------------===// -def CSR_Ghc : CalleeSavedRegs<(add)>; +def CSR_NoRegs : CalleeSavedRegs<(add)>; def CSR_32 : CalleeSavedRegs<(add ESI, EDI, EBX, EBP)>; def CSR_64 : CalleeSavedRegs<(add RBX, R12, R13, R14, R15, RBP)>; diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp index ee3de9a..d705049 100644 --- a/lib/Target/X86/X86CodeEmitter.cpp +++ b/lib/Target/X86/X86CodeEmitter.cpp @@ -53,12 +53,12 @@ namespace { public: static char ID; explicit Emitter(X86TargetMachine &tm, CodeEmitter &mce) - : MachineFunctionPass(ID), II(0), TD(0), TM(tm), + : MachineFunctionPass(ID), II(0), TD(0), TM(tm), MCE(mce), PICBaseOffset(0), Is64BitMode(false), IsPIC(TM.getRelocationModel() == Reloc::PIC_) {} Emitter(X86TargetMachine &tm, CodeEmitter &mce, const X86InstrInfo &ii, const TargetData &td, bool is64) - : MachineFunctionPass(ID), II(&ii), TD(&td), TM(tm), + : MachineFunctionPass(ID), II(&ii), TD(&td), TM(tm), MCE(mce), PICBaseOffset(0), Is64BitMode(is64), IsPIC(TM.getRelocationModel() == Reloc::PIC_) {} @@ -68,8 +68,20 @@ namespace { return "X86 Machine Code Emitter"; } + void emitOpcodePrefix(uint64_t TSFlags, int MemOperand, + const MachineInstr &MI, + const MCInstrDesc *Desc) const; + + void emitVEXOpcodePrefix(uint64_t TSFlags, int MemOperand, + const MachineInstr &MI, + const MCInstrDesc *Desc) const; + + void emitSegmentOverridePrefix(uint64_t TSFlags, + int MemOperand, + const MachineInstr &MI) const; + void emitInstruction(MachineInstr &MI, const MCInstrDesc *Desc); - + void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); AU.addRequired<MachineModuleInfo>(); @@ -115,17 +127,17 @@ template<class CodeEmitter> bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) { MMI = &getAnalysis<MachineModuleInfo>(); MCE.setModuleInfo(MMI); - + II = TM.getInstrInfo(); TD = TM.getTargetData(); Is64BitMode = TM.getSubtarget<X86Subtarget>().is64Bit(); IsPIC = TM.getRelocationModel() == Reloc::PIC_; - + do { - DEBUG(dbgs() << "JITTing function '" + DEBUG(dbgs() << "JITTing function '" << MF.getFunction()->getName() << "'\n"); MCE.startFunction(MF); - for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); + for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); MBB != E; ++MBB) { MCE.StartMachineBasicBlock(MBB); for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); @@ -149,18 +161,18 @@ bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) { static unsigned determineREX(const MachineInstr &MI) { unsigned REX = 0; const MCInstrDesc &Desc = MI.getDesc(); - + // Pseudo instructions do not need REX prefix byte. if ((Desc.TSFlags & X86II::FormMask) == X86II::Pseudo) return 0; if (Desc.TSFlags & X86II::REX_W) REX |= 1 << 3; - + unsigned NumOps = Desc.getNumOperands(); if (NumOps) { bool isTwoAddr = NumOps > 1 && - Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1; - + Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1; + // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix. unsigned i = isTwoAddr ? 
1 : 0; for (unsigned e = NumOps; i != e; ++i) { @@ -171,7 +183,7 @@ static unsigned determineREX(const MachineInstr &MI) { REX |= 0x40; } } - + switch (Desc.TSFlags & X86II::FormMask) { case X86II::MRMInitReg: if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0))) @@ -362,7 +374,7 @@ void Emitter<CodeEmitter>::emitRegModRMByte(unsigned RegOpcodeFld) { } template<class CodeEmitter> -void Emitter<CodeEmitter>::emitSIBByte(unsigned SS, +void Emitter<CodeEmitter>::emitSIBByte(unsigned SS, unsigned Index, unsigned Base) { // SIB byte is in the same format as the ModRMByte... @@ -378,8 +390,8 @@ void Emitter<CodeEmitter>::emitConstant(uint64_t Val, unsigned Size) { } } -/// isDisp8 - Return true if this signed displacement fits in a 8-bit -/// sign-extended field. +/// isDisp8 - Return true if this signed displacement fits in a 8-bit +/// sign-extended field. static bool isDisp8(int Value) { return Value == (signed char)Value; } @@ -388,10 +400,10 @@ static bool gvNeedsNonLazyPtr(const MachineOperand &GVOp, const TargetMachine &TM) { // For Darwin-64, simulate the linktime GOT by using the same non-lazy-pointer // mechanism as 32-bit mode. - if (TM.getSubtarget<X86Subtarget>().is64Bit() && + if (TM.getSubtarget<X86Subtarget>().is64Bit() && !TM.getSubtarget<X86Subtarget>().isTargetDarwin()) return false; - + // Return true if this is a reference to a stub containing the address of the // global, not the global itself. return isGlobalStubReference(GVOp.getTargetFlags()); @@ -417,7 +429,7 @@ void Emitter<CodeEmitter>::emitDisplacementField(const MachineOperand *RelocOp, if (RelocOp->isGlobal()) { // In 64-bit static small code model, we could potentially emit absolute. // But it's probably not beneficial. If the MCE supports using RIP directly - // do it, otherwise fallback to absolute (this is determined by IsPCRel). + // do it, otherwise fallback to absolute (this is determined by IsPCRel). // 89 05 00 00 00 00 mov %eax,0(%rip) # PC-relative // 89 04 25 00 00 00 00 mov %eax,0x0 # Absolute bool Indirect = gvNeedsNonLazyPtr(*RelocOp, TM); @@ -441,7 +453,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI, const MachineOperand &Op3 = MI.getOperand(Op+3); int DispVal = 0; const MachineOperand *DispForReloc = 0; - + // Figure out what sort of displacement we have to handle here. if (Op3.isGlobal()) { DispForReloc = &Op3; @@ -469,7 +481,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI, const MachineOperand &IndexReg = MI.getOperand(Op+2); unsigned BaseReg = Base.getReg(); - + // Handle %rip relative addressing. if (BaseReg == X86::RIP || (Is64BitMode && DispForReloc)) { // [disp32+RIP] in X86-64 mode @@ -486,7 +498,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI, bool IsPCRel = MCE.earlyResolveAddresses() ? true : false; // Is a SIB byte needed? - // If no BaseReg, issue a RIP relative instruction only if the MCE can + // If no BaseReg, issue a RIP relative instruction only if the MCE can // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table // 2-7) and absolute references. unsigned BaseRegNo = -1U; @@ -494,7 +506,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI, BaseRegNo = X86_MC::getX86RegNum(BaseReg); if (// The SIB byte must be used if there is an index register. - IndexReg.getReg() == 0 && + IndexReg.getReg() == 0 && // The SIB byte must be used if the base is ESP/RSP/R12, all of which // encode to an R/M value of 4, which indicates that a SIB byte is // present. 
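For reference while reading the encodings above, a self-contained illustration of the byte layout that ModRMByte() and emitSIBByte() produce (the helper name here is illustrative only, it is not part of the patch):

  #include <cstdint>
  // ModRM and SIB share a 2-3-3 bit split: mod/scale, reg/index, rm/base.
  static inline uint8_t pack233(unsigned Hi2, unsigned Mid3, unsigned Lo3) {
    return uint8_t((Hi2 << 6) | ((Mid3 & 7) << 3) | (Lo3 & 7));
  }
  // "mov %eax,(%rbx)" is 89 /r with mod=0, reg=EAX(0), rm=RBX(3):
  // pack233(0, 0, 3) == 0x03, so the bytes are 89 03. With a disp8 it
  // becomes mod=1, e.g. "mov %eax,8(%rbx)" -> 89 43 08.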
@@ -508,7 +520,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI, emitDisplacementField(DispForReloc, DispVal, PCAdj, true); return; } - + // If the base is not EBP/ESP and there is no displacement, use simple // indirect register encoding, this handles addresses like [EAX]. The // encoding for [EBP] with no displacement means [disp32] so we handle it @@ -517,20 +529,20 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI, MCE.emitByte(ModRMByte(0, RegOpcodeField, BaseRegNo)); return; } - + // Otherwise, if the displacement fits in a byte, encode as [REG+disp8]. if (!DispForReloc && isDisp8(DispVal)) { MCE.emitByte(ModRMByte(1, RegOpcodeField, BaseRegNo)); emitConstant(DispVal, 1); return; } - + // Otherwise, emit the most general non-SIB encoding: [REG+disp32] MCE.emitByte(ModRMByte(2, RegOpcodeField, BaseRegNo)); emitDisplacementField(DispForReloc, DispVal, PCAdj, IsPCRel); return; } - + // Otherwise we need a SIB byte, so start by outputting the ModR/M byte first. assert(IndexReg.getReg() != X86::ESP && IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!"); @@ -563,7 +575,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI, unsigned SS = SSTable[Scale.getImm()]; if (BaseReg == 0) { - // Handle the SIB byte for the case where there is no base, see Intel + // Handle the SIB byte for the case where there is no base, see Intel // Manual 2A, table 2-7. The displacement has already been output. unsigned IndexRegNo; if (IndexReg.getReg()) @@ -596,94 +608,116 @@ static const MCInstrDesc *UpdateOp(MachineInstr &MI, const X86InstrInfo *II, return Desc; } -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, - const MCInstrDesc *Desc) { - DEBUG(dbgs() << MI); - - // If this is a pseudo instruction, lower it. - switch (Desc->getOpcode()) { - case X86::ADD16rr_DB: Desc = UpdateOp(MI, II, X86::OR16rr); break; - case X86::ADD32rr_DB: Desc = UpdateOp(MI, II, X86::OR32rr); break; - case X86::ADD64rr_DB: Desc = UpdateOp(MI, II, X86::OR64rr); break; - case X86::ADD16ri_DB: Desc = UpdateOp(MI, II, X86::OR16ri); break; - case X86::ADD32ri_DB: Desc = UpdateOp(MI, II, X86::OR32ri); break; - case X86::ADD64ri32_DB: Desc = UpdateOp(MI, II, X86::OR64ri32); break; - case X86::ADD16ri8_DB: Desc = UpdateOp(MI, II, X86::OR16ri8); break; - case X86::ADD32ri8_DB: Desc = UpdateOp(MI, II, X86::OR32ri8); break; - case X86::ADD64ri8_DB: Desc = UpdateOp(MI, II, X86::OR64ri8); break; - case X86::ACQUIRE_MOV8rm: Desc = UpdateOp(MI, II, X86::MOV8rm); break; - case X86::ACQUIRE_MOV16rm: Desc = UpdateOp(MI, II, X86::MOV16rm); break; - case X86::ACQUIRE_MOV32rm: Desc = UpdateOp(MI, II, X86::MOV32rm); break; - case X86::ACQUIRE_MOV64rm: Desc = UpdateOp(MI, II, X86::MOV64rm); break; - case X86::RELEASE_MOV8mr: Desc = UpdateOp(MI, II, X86::MOV8mr); break; - case X86::RELEASE_MOV16mr: Desc = UpdateOp(MI, II, X86::MOV16mr); break; - case X86::RELEASE_MOV32mr: Desc = UpdateOp(MI, II, X86::MOV32mr); break; - case X86::RELEASE_MOV64mr: Desc = UpdateOp(MI, II, X86::MOV64mr); break; - } - +/// Is16BitMemOperand - Return true if the specified instruction has +/// a 16-bit memory operand. Op specifies the operand # of the memoperand. 
+static bool Is16BitMemOperand(const MachineInstr &MI, unsigned Op) { + const MachineOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); + const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); + + if ((BaseReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) || + (IndexReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg()))) + return true; + return false; +} - MCE.processDebugLoc(MI.getDebugLoc(), true); +/// Is32BitMemOperand - Return true if the specified instruction has +/// a 32-bit memory operand. Op specifies the operand # of the memoperand. +static bool Is32BitMemOperand(const MachineInstr &MI, unsigned Op) { + const MachineOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); + const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); + + if ((BaseReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg.getReg())) || + (IndexReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg.getReg()))) + return true; + return false; +} - unsigned Opcode = Desc->Opcode; +/// Is64BitMemOperand - Return true if the specified instruction has +/// a 64-bit memory operand. Op specifies the operand # of the memoperand. +#ifndef NDEBUG +static bool Is64BitMemOperand(const MachineInstr &MI, unsigned Op) { + const MachineOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); + const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); + + if ((BaseReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg.getReg())) || + (IndexReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg.getReg()))) + return true; + return false; +} +#endif +template<class CodeEmitter> +void Emitter<CodeEmitter>::emitOpcodePrefix(uint64_t TSFlags, + int MemOperand, + const MachineInstr &MI, + const MCInstrDesc *Desc) const { // Emit the lock opcode prefix as needed. if (Desc->TSFlags & X86II::LOCK) MCE.emitByte(0xF0); // Emit segment override opcode prefix as needed. - switch (Desc->TSFlags & X86II::SegOvrMask) { - case X86II::FS: - MCE.emitByte(0x64); - break; - case X86II::GS: - MCE.emitByte(0x65); - break; - default: llvm_unreachable("Invalid segment!"); - case 0: break; // No segment override! - } + emitSegmentOverridePrefix(TSFlags, MemOperand, MI); // Emit the repeat opcode prefix as needed. if ((Desc->TSFlags & X86II::Op0Mask) == X86II::REP) MCE.emitByte(0xF3); - // Emit the operand size opcode prefix as needed. - if (Desc->TSFlags & X86II::OpSize) - MCE.emitByte(0x66); - // Emit the address size opcode prefix as needed. - if (Desc->TSFlags & X86II::AdSize) + bool need_address_override; + if (TSFlags & X86II::AdSize) { + need_address_override = true; + } else if (MemOperand == -1) { + need_address_override = false; + } else if (Is64BitMode) { + assert(!Is16BitMemOperand(MI, MemOperand)); + need_address_override = Is32BitMemOperand(MI, MemOperand); + } else { + assert(!Is64BitMemOperand(MI, MemOperand)); + need_address_override = Is16BitMemOperand(MI, MemOperand); + } + + if (need_address_override) MCE.emitByte(0x67); + // Emit the operand size opcode prefix as needed. 
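// (Illustration: in 64-bit mode the 0x67 byte emitted above shrinks the
// address size, so "mov (%eax),%ecx" encodes as 67 8B 08 where
// "mov (%rax),%ecx" is plain 8B 08; the 0x66 byte emitted just below
// selects 16-bit operand size, e.g. "mov %cx,%ax" is 66 89 C8.)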
+ if (TSFlags & X86II::OpSize) + MCE.emitByte(0x66); + bool Need0FPrefix = false; switch (Desc->TSFlags & X86II::Op0Mask) { - case X86II::TB: // Two-byte opcode prefix - case X86II::T8: // 0F 38 - case X86II::TA: // 0F 3A - case X86II::A6: // 0F A6 - case X86II::A7: // 0F A7 - Need0FPrefix = true; - break; - case X86II::REP: break; // already handled. - case X86II::T8XS: // F3 0F 38 - case X86II::XS: // F3 0F - MCE.emitByte(0xF3); - Need0FPrefix = true; - break; - case X86II::T8XD: // F2 0F 38 - case X86II::TAXD: // F2 0F 3A - case X86II::XD: // F2 0F - MCE.emitByte(0xF2); - Need0FPrefix = true; - break; - case X86II::D8: case X86II::D9: case X86II::DA: case X86II::DB: - case X86II::DC: case X86II::DD: case X86II::DE: case X86II::DF: - MCE.emitByte(0xD8+ - (((Desc->TSFlags & X86II::Op0Mask)-X86II::D8) - >> X86II::Op0Shift)); - break; // Two-byte opcode prefix - default: llvm_unreachable("Invalid prefix!"); - case 0: break; // No prefix! + case X86II::TB: // Two-byte opcode prefix + case X86II::T8: // 0F 38 + case X86II::TA: // 0F 3A + case X86II::A6: // 0F A6 + case X86II::A7: // 0F A7 + Need0FPrefix = true; + break; + case X86II::REP: break; // already handled. + case X86II::T8XS: // F3 0F 38 + case X86II::XS: // F3 0F + MCE.emitByte(0xF3); + Need0FPrefix = true; + break; + case X86II::T8XD: // F2 0F 38 + case X86II::TAXD: // F2 0F 3A + case X86II::XD: // F2 0F + MCE.emitByte(0xF2); + Need0FPrefix = true; + break; + case X86II::D8: case X86II::D9: case X86II::DA: case X86II::DB: + case X86II::DC: case X86II::DD: case X86II::DE: case X86II::DF: + MCE.emitByte(0xD8+ + (((Desc->TSFlags & X86II::Op0Mask)-X86II::D8) + >> X86II::Op0Shift)); + break; // Two-byte opcode prefix + default: llvm_unreachable("Invalid prefix!"); + case 0: break; // No prefix! } // Handle REX prefix. @@ -697,50 +731,446 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, MCE.emitByte(0x0F); switch (Desc->TSFlags & X86II::Op0Mask) { - case X86II::T8XD: // F2 0F 38 - case X86II::T8XS: // F3 0F 38 - case X86II::T8: // 0F 38 - MCE.emitByte(0x38); - break; - case X86II::TAXD: // F2 0F 38 - case X86II::TA: // 0F 3A - MCE.emitByte(0x3A); - break; - case X86II::A6: // 0F A6 - MCE.emitByte(0xA6); - break; - case X86II::A7: // 0F A7 - MCE.emitByte(0xA7); - break; + case X86II::T8XD: // F2 0F 38 + case X86II::T8XS: // F3 0F 38 + case X86II::T8: // 0F 38 + MCE.emitByte(0x38); + break; + case X86II::TAXD: // F2 0F 38 + case X86II::TA: // 0F 3A + MCE.emitByte(0x3A); + break; + case X86II::A6: // 0F A6 + MCE.emitByte(0xA6); + break; + case X86II::A7: // 0F A7 + MCE.emitByte(0xA7); + break; + } +} + +// On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range +// 0-7 and the difference between the 2 groups is given by the REX prefix. +// In the VEX prefix, registers are seen sequentially from 0-15 and encoded +// in 1's complement form, example: +// +// ModRM field => XMM9 => 1 +// VEX.VVVV => XMM9 => ~9 +// +// See table 4-35 of Intel AVX Programming Reference for details. +static unsigned char getVEXRegisterEncoding(const MachineInstr &MI, + unsigned OpNum) { + unsigned SrcReg = MI.getOperand(OpNum).getReg(); + unsigned SrcRegNum = X86_MC::getX86RegNum(MI.getOperand(OpNum).getReg()); + if (X86II::isX86_64ExtendedReg(SrcReg)) + SrcRegNum |= 8; + + // The registers represented through VEX_VVVV should + // be encoded in 1's complement form.
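// (Illustration of the return below: for XMM9, SrcRegNum == 9 and
// (~9) & 0xf == 0b0110 == 6, which is what lands in VEX.VVVV; a decoder
// inverts the four bits again to recover register 9.)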
+ return (~SrcRegNum) & 0xf; +} + +/// emitSegmentOverridePrefix - Emit segment override opcode prefix as needed +template<class CodeEmitter> +void Emitter<CodeEmitter>::emitSegmentOverridePrefix(uint64_t TSFlags, + int MemOperand, + const MachineInstr &MI) const { + switch (TSFlags & X86II::SegOvrMask) { + default: llvm_unreachable("Invalid segment!"); + case 0: + // No segment override, check for explicit one on memory operand. + if (MemOperand != -1) { // If the instruction has a memory operand. + switch (MI.getOperand(MemOperand+X86::AddrSegmentReg).getReg()) { + default: llvm_unreachable("Unknown segment register!"); + case 0: break; + case X86::CS: MCE.emitByte(0x2E); break; + case X86::SS: MCE.emitByte(0x36); break; + case X86::DS: MCE.emitByte(0x3E); break; + case X86::ES: MCE.emitByte(0x26); break; + case X86::FS: MCE.emitByte(0x64); break; + case X86::GS: MCE.emitByte(0x65); break; + } + } + break; + case X86II::FS: + MCE.emitByte(0x64); + break; + case X86II::GS: + MCE.emitByte(0x65); + break; + } +} + +template<class CodeEmitter> +void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags, + int MemOperand, + const MachineInstr &MI, + const MCInstrDesc *Desc) const { + bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; + bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3; + + // VEX_R: opcode extension equivalent to REX.R in + // 1's complement (inverted) form + // + // 1: Same as REX_R=0 (must be 1 in 32-bit mode) + // 0: Same as REX_R=1 (64 bit mode only) + // + unsigned char VEX_R = 0x1; + + // VEX_X: equivalent to REX.X, only used when a + // register is used for index in SIB Byte. + // + // 1: Same as REX.X=0 (must be 1 in 32-bit mode) + // 0: Same as REX.X=1 (64-bit mode only) + unsigned char VEX_X = 0x1; + + // VEX_B: + // + // 1: Same as REX_B=0 (ignored in 32-bit mode) + // 0: Same as REX_B=1 (64 bit mode only) + // + unsigned char VEX_B = 0x1; + + // VEX_W: opcode specific (use like REX.W, or used for + // opcode extension, or ignored, depending on the opcode byte) + unsigned char VEX_W = 0; + + // XOP: Use XOP prefix byte 0x8f instead of VEX. + unsigned char XOP = 0; + + // VEX_5M (VEX m-mmmmm field): + // + // 0b00000: Reserved for future use + // 0b00001: implied 0F leading opcode + // 0b00010: implied 0F 38 leading opcode bytes + // 0b00011: implied 0F 3A leading opcode bytes + // 0b00100-0b11111: Reserved for future use + // 0b01000: XOP map select - 08h instructions with imm byte + // 0b01001: XOP map select - 09h instructions with no imm byte + unsigned char VEX_5M = 0x1; + + // VEX_4V (VEX vvvv field): a register specifier + // (in 1's complement form) or 1111 if unused. + unsigned char VEX_4V = 0xf; + + // VEX_L (Vector Length): + // + // 0: scalar or 128-bit vector + // 1: 256-bit vector + // + unsigned char VEX_L = 0; + + // VEX_PP: opcode extension providing equivalent + // functionality of a SIMD prefix + // + // 0b00: None + // 0b01: 66 + // 0b10: F3 + // 0b11: F2 + // + unsigned char VEX_PP = 0; + + // Encode the operand size opcode prefix as needed.
+ if (TSFlags & X86II::OpSize) + VEX_PP = 0x01; + + if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W) + VEX_W = 1; + + if ((TSFlags >> X86II::VEXShift) & X86II::XOP) + XOP = 1; + + if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L) + VEX_L = 1; + + switch (TSFlags & X86II::Op0Mask) { + default: llvm_unreachable("Invalid prefix!"); + case X86II::T8: // 0F 38 + VEX_5M = 0x2; + break; + case X86II::TA: // 0F 3A + VEX_5M = 0x3; + break; + case X86II::T8XS: // F3 0F 38 + VEX_PP = 0x2; + VEX_5M = 0x2; + break; + case X86II::T8XD: // F2 0F 38 + VEX_PP = 0x3; + VEX_5M = 0x2; + break; + case X86II::TAXD: // F2 0F 3A + VEX_PP = 0x3; + VEX_5M = 0x3; + break; + case X86II::XS: // F3 0F + VEX_PP = 0x2; + break; + case X86II::XD: // F2 0F + VEX_PP = 0x3; + break; + case X86II::XOP8: + VEX_5M = 0x8; + break; + case X86II::XOP9: + VEX_5M = 0x9; + break; + case X86II::A6: // Bypass: Not used by VEX + case X86II::A7: // Bypass: Not used by VEX + case X86II::TB: // Bypass: Not used by VEX + case 0: + break; // No prefix! + } + + + // Set the vector length to 256-bit if YMM0-YMM15 is used + for (unsigned i = 0; i != MI.getNumOperands(); ++i) { + if (!MI.getOperand(i).isReg()) + continue; + if (MI.getOperand(i).isImplicit()) + continue; + unsigned SrcReg = MI.getOperand(i).getReg(); + if (SrcReg >= X86::YMM0 && SrcReg <= X86::YMM15) + VEX_L = 1; + } + + // Classify VEX_B, VEX_4V, VEX_R, VEX_X + unsigned NumOps = Desc->getNumOperands(); + unsigned CurOp = 0; + if (NumOps > 1 && Desc->getOperandConstraint(1, MCOI::TIED_TO) == 0) + ++CurOp; + else if (NumOps > 3 && Desc->getOperandConstraint(2, MCOI::TIED_TO) == 0) { + assert(Desc->getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1); + // Special case for GATHER with 2 TIED_TO operands + // Skip the first 2 operands: dst, mask_wb + CurOp += 2; + } + + switch (TSFlags & X86II::FormMask) { + case X86II::MRMInitReg: + // Duplicate register. 
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_R = 0x0; + + if (HasVEX_4V) + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_B = 0x0; + if (HasVEX_4VOp3) + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + break; + case X86II::MRMDestMem: { + // MRMDestMem instruction forms: + // MemAddr, src1(ModR/M) + // MemAddr, src1(VEX_4V), src2(ModR/M) + // MemAddr, src1(ModR/M), imm8 + // + if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrBaseReg).getReg())) + VEX_B = 0x0; + if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrIndexReg).getReg())) + VEX_X = 0x0; + + CurOp = X86::AddrNumOperands; + if (HasVEX_4V) + VEX_4V = getVEXRegisterEncoding(MI, CurOp++); + + const MachineOperand &MO = MI.getOperand(CurOp); + if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg())) + VEX_R = 0x0; + break; + } + case X86II::MRMSrcMem: + // MRMSrcMem instruction forms: + // src1(ModR/M), MemAddr + // src1(ModR/M), src2(VEX_4V), MemAddr + // src1(ModR/M), MemAddr, imm8 + // src1(ModR/M), MemAddr, src2(VEX_I8IMM) + // + // FMA4: + // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM) + // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M), + if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg())) + VEX_R = 0x0; + + if (HasVEX_4V) + VEX_4V = getVEXRegisterEncoding(MI, 1); + + if (X86II::isX86_64ExtendedReg( + MI.getOperand(MemOperand+X86::AddrBaseReg).getReg())) + VEX_B = 0x0; + if (X86II::isX86_64ExtendedReg( + MI.getOperand(MemOperand+X86::AddrIndexReg).getReg())) + VEX_X = 0x0; + + if (HasVEX_4VOp3) + VEX_4V = getVEXRegisterEncoding(MI, X86::AddrNumOperands+1); + break; + case X86II::MRM0m: case X86II::MRM1m: + case X86II::MRM2m: case X86II::MRM3m: + case X86II::MRM4m: case X86II::MRM5m: + case X86II::MRM6m: case X86II::MRM7m: { + // MRM[0-9]m instruction forms: + // MemAddr + // src1(VEX_4V), MemAddr + if (HasVEX_4V) + VEX_4V = getVEXRegisterEncoding(MI, 0); + + if (X86II::isX86_64ExtendedReg( + MI.getOperand(MemOperand+X86::AddrBaseReg).getReg())) + VEX_B = 0x0; + if (X86II::isX86_64ExtendedReg( + MI.getOperand(MemOperand+X86::AddrIndexReg).getReg())) + VEX_X = 0x0; + break; + } + case X86II::MRMSrcReg: + // MRMSrcReg instruction forms: + // dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM) + // dst(ModR/M), src1(ModR/M) + // dst(ModR/M), src1(ModR/M), imm8 + // + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_R = 0x0; + CurOp++; + + if (HasVEX_4V) + VEX_4V = getVEXRegisterEncoding(MI, CurOp++); + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_B = 0x0; + CurOp++; + if (HasVEX_4VOp3) + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + break; + case X86II::MRMDestReg: + // MRMDestReg instruction forms: + // dst(ModR/M), src(ModR/M) + // dst(ModR/M), src(ModR/M), imm8 + if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg())) + VEX_B = 0x0; + if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg())) + VEX_R = 0x0; + break; + case X86II::MRM0r: case X86II::MRM1r: + case X86II::MRM2r: case X86II::MRM3r: + case X86II::MRM4r: case X86II::MRM5r: + case X86II::MRM6r: case X86II::MRM7r: + // MRM0r-MRM7r instruction forms: + // dst(VEX_4V), src(ModR/M), imm8 + VEX_4V = getVEXRegisterEncoding(MI, 0); + if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg())) + VEX_B = 0x0; + break; + default: // RawFrm + break; + } + + // Emit segment override opcode prefix as needed.
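// (Worked example of how the fields classified above combine, the bytes
// themselves being emitted a few lines below after the segment override:
// for vaddps %ymm2,%ymm1,%ymm0 we get VEX_R=1, VEX_4V=(~1)&0xf=0b1110,
// VEX_L=1, VEX_PP=0, VEX_5M=1, so the two-byte form is C5 F4, followed by
// opcode 58 and ModRM C2.)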
+ emitSegmentOverridePrefix(TSFlags, MemOperand, MI); + + // VEX opcode prefix can have 2 or 3 bytes + // + // 3 bytes: + // +-----+ +--------------+ +-------------------+ + // | C4h | | RXB | m-mmmm | | W | vvvv | L | pp | + // +-----+ +--------------+ +-------------------+ + // 2 bytes: + // +-----+ +-------------------+ + // | C5h | | R | vvvv | L | pp | + // +-----+ +-------------------+ + // + unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3); + + if (VEX_B && VEX_X && !VEX_W && !XOP && (VEX_5M == 1)) { // 2 byte VEX prefix + MCE.emitByte(0xC5); + MCE.emitByte(LastByte | (VEX_R << 7)); + return; + } + + // 3 byte VEX prefix + MCE.emitByte(XOP ? 0x8F : 0xC4); + MCE.emitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M); + MCE.emitByte(LastByte | (VEX_W << 7)); +} + +template<class CodeEmitter> +void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, + const MCInstrDesc *Desc) { + DEBUG(dbgs() << MI); + + // If this is a pseudo instruction, lower it. + switch (Desc->getOpcode()) { + case X86::ADD16rr_DB: Desc = UpdateOp(MI, II, X86::OR16rr); break; + case X86::ADD32rr_DB: Desc = UpdateOp(MI, II, X86::OR32rr); break; + case X86::ADD64rr_DB: Desc = UpdateOp(MI, II, X86::OR64rr); break; + case X86::ADD16ri_DB: Desc = UpdateOp(MI, II, X86::OR16ri); break; + case X86::ADD32ri_DB: Desc = UpdateOp(MI, II, X86::OR32ri); break; + case X86::ADD64ri32_DB: Desc = UpdateOp(MI, II, X86::OR64ri32); break; + case X86::ADD16ri8_DB: Desc = UpdateOp(MI, II, X86::OR16ri8); break; + case X86::ADD32ri8_DB: Desc = UpdateOp(MI, II, X86::OR32ri8); break; + case X86::ADD64ri8_DB: Desc = UpdateOp(MI, II, X86::OR64ri8); break; + case X86::ACQUIRE_MOV8rm: Desc = UpdateOp(MI, II, X86::MOV8rm); break; + case X86::ACQUIRE_MOV16rm: Desc = UpdateOp(MI, II, X86::MOV16rm); break; + case X86::ACQUIRE_MOV32rm: Desc = UpdateOp(MI, II, X86::MOV32rm); break; + case X86::ACQUIRE_MOV64rm: Desc = UpdateOp(MI, II, X86::MOV64rm); break; + case X86::RELEASE_MOV8mr: Desc = UpdateOp(MI, II, X86::MOV8mr); break; + case X86::RELEASE_MOV16mr: Desc = UpdateOp(MI, II, X86::MOV16mr); break; + case X86::RELEASE_MOV32mr: Desc = UpdateOp(MI, II, X86::MOV32mr); break; + case X86::RELEASE_MOV64mr: Desc = UpdateOp(MI, II, X86::MOV64mr); break; } + + MCE.processDebugLoc(MI.getDebugLoc(), true); + + unsigned Opcode = Desc->Opcode; + // If this is a two-address instruction, skip one of the register operands. unsigned NumOps = Desc->getNumOperands(); unsigned CurOp = 0; - if (NumOps > 1 && Desc->getOperandConstraint(1, MCOI::TIED_TO) != -1) + if (NumOps > 1 && Desc->getOperandConstraint(1, MCOI::TIED_TO) == 0) ++CurOp; - else if (NumOps > 2 && Desc->getOperandConstraint(NumOps-1,MCOI::TIED_TO)== 0) - // Skip the last source operand that is tied_to the dest reg. e.g. LXADD32 - --NumOps; + else if (NumOps > 3 && Desc->getOperandConstraint(2, MCOI::TIED_TO) == 0) { + assert(Desc->getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1); + // Special case for GATHER with 2 TIED_TO operands + // Skip the first 2 operands: dst, mask_wb + CurOp += 2; + } + + uint64_t TSFlags = Desc->TSFlags; + + // Is this instruction encoded using the AVX VEX prefix? + bool HasVEXPrefix = (TSFlags >> X86II::VEXShift) & X86II::VEX; + // It uses the VEX.VVVV field? 
+ bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; + bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3; + bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4; + const unsigned MemOp4_I8IMMOperand = 2; + + // Determine where the memory operand starts, if present. + int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode); + if (MemoryOperand != -1) MemoryOperand += CurOp; + + if (!HasVEXPrefix) + emitOpcodePrefix(TSFlags, MemoryOperand, MI, Desc); + else + emitVEXOpcodePrefix(TSFlags, MemoryOperand, MI, Desc); unsigned char BaseOpcode = X86II::getBaseOpcodeFor(Desc->TSFlags); - switch (Desc->TSFlags & X86II::FormMask) { + switch (TSFlags & X86II::FormMask) { default: llvm_unreachable("Unknown FormMask value in X86 MachineCodeEmitter!"); case X86II::Pseudo: // Remember the current PC offset, this is the PIC relocation // base address. switch (Opcode) { - default: + default: llvm_unreachable("pseudo instructions should be removed before code" " emission"); - break; // Do nothing for Int_MemBarrier - it's just a comment. Add a debug // to make it slightly easier to see. case X86::Int_MemBarrier: DEBUG(dbgs() << "#MEMBARRIER\n"); break; - + case TargetOpcode::INLINEASM: // We allow inline assembler nodes with empty bodies - they can // implicitly define registers, which is ok for JIT. @@ -752,7 +1182,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, case TargetOpcode::EH_LABEL: MCE.emitLabel(MI.getOperand(0).getMCSymbol()); break; - + case TargetOpcode::IMPLICIT_DEF: case TargetOpcode::KILL: break; @@ -774,7 +1204,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, if (CurOp == NumOps) break; - + const MachineOperand &MO = MI.getOperand(CurOp++); DEBUG(dbgs() << "RawFrm CurOp " << CurOp << "\n"); @@ -787,13 +1217,13 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, emitPCRelativeBlockAddress(MO.getMBB()); break; } - + if (MO.isGlobal()) { emitGlobalAddress(MO.getGlobal(), X86::reloc_pcrel_word, MO.getOffset(), 0); break; } - + if (MO.isSymbol()) { emitExternalSymbolAddress(MO.getSymbolName(), X86::reloc_pcrel_word); break; @@ -804,7 +1234,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, emitJumpTableAddress(MO.getIndex(), X86::reloc_pcrel_word); break; } - + assert(MO.isImm() && "Unknown RawFrm operand!"); if (Opcode == X86::CALLpcrel32 || Opcode == X86::CALL64pcrel32) { // Fix up immediate operand for pc relative calls. @@ -815,21 +1245,21 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, emitConstant(MO.getImm(), X86II::getSizeOfImm(Desc->TSFlags)); break; } - + case X86II::AddRegFrm: { MCE.emitByte(BaseOpcode + X86_MC::getX86RegNum(MI.getOperand(CurOp++).getReg())); - + if (CurOp == NumOps) break; - + const MachineOperand &MO1 = MI.getOperand(CurOp++); unsigned Size = X86II::getSizeOfImm(Desc->TSFlags); if (MO1.isImm()) { emitConstant(MO1.getImm(), Size); break; } - + unsigned rt = Is64BitMode ? X86::reloc_pcrel_word : (IsPIC ? 
X86::reloc_picrel_word : X86::reloc_absolute_word); if (Opcode == X86::MOV64ri64i32) @@ -855,46 +1285,57 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, emitRegModRMByte(MI.getOperand(CurOp).getReg(), X86_MC::getX86RegNum(MI.getOperand(CurOp+1).getReg())); CurOp += 2; - if (CurOp != NumOps) - emitConstant(MI.getOperand(CurOp++).getImm(), - X86II::getSizeOfImm(Desc->TSFlags)); break; } case X86II::MRMDestMem: { MCE.emitByte(BaseOpcode); + + unsigned SrcRegNum = CurOp + X86::AddrNumOperands; + if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) + SrcRegNum++; emitMemModRMByte(MI, CurOp, - X86_MC::getX86RegNum(MI.getOperand(CurOp + X86::AddrNumOperands) - .getReg())); - CurOp += X86::AddrNumOperands + 1; - if (CurOp != NumOps) - emitConstant(MI.getOperand(CurOp++).getImm(), - X86II::getSizeOfImm(Desc->TSFlags)); + X86_MC::getX86RegNum(MI.getOperand(SrcRegNum).getReg())); + CurOp = SrcRegNum + 1; break; } - case X86II::MRMSrcReg: + case X86II::MRMSrcReg: { MCE.emitByte(BaseOpcode); - emitRegModRMByte(MI.getOperand(CurOp+1).getReg(), + + unsigned SrcRegNum = CurOp+1; + if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) + ++SrcRegNum; + + if (HasMemOp4) // Skip 2nd src (which is encoded in I8IMM) + ++SrcRegNum; + + emitRegModRMByte(MI.getOperand(SrcRegNum).getReg(), X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg())); - CurOp += 2; - if (CurOp != NumOps) - emitConstant(MI.getOperand(CurOp++).getImm(), - X86II::getSizeOfImm(Desc->TSFlags)); + // 2 operands skipped with HasMemOp4, compensate accordingly + CurOp = HasMemOp4 ? SrcRegNum : SrcRegNum + 1; + if (HasVEX_4VOp3) + ++CurOp; break; - + } case X86II::MRMSrcMem: { int AddrOperands = X86::AddrNumOperands; + unsigned FirstMemOp = CurOp+1; + if (HasVEX_4V) { + ++AddrOperands; + ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV). + } + if (HasMemOp4) // Skip second register source (encoded in I8IMM) + ++FirstMemOp; + + MCE.emitByte(BaseOpcode); intptr_t PCAdj = (CurOp + AddrOperands + 1 != NumOps) ? X86II::getSizeOfImm(Desc->TSFlags) : 0; - - MCE.emitByte(BaseOpcode); - emitMemModRMByte(MI, CurOp+1, + emitMemModRMByte(MI, FirstMemOp, X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg()),PCAdj); CurOp += AddrOperands + 1; - if (CurOp != NumOps) - emitConstant(MI.getOperand(CurOp++).getImm(), - X86II::getSizeOfImm(Desc->TSFlags)); + if (HasVEX_4VOp3) + ++CurOp; break; } @@ -902,20 +1343,22 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, case X86II::MRM2r: case X86II::MRM3r: case X86II::MRM4r: case X86II::MRM5r: case X86II::MRM6r: case X86II::MRM7r: { + if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV). + ++CurOp; MCE.emitByte(BaseOpcode); emitRegModRMByte(MI.getOperand(CurOp++).getReg(), (Desc->TSFlags & X86II::FormMask)-X86II::MRM0r); if (CurOp == NumOps) break; - + const MachineOperand &MO1 = MI.getOperand(CurOp++); unsigned Size = X86II::getSizeOfImm(Desc->TSFlags); if (MO1.isImm()) { emitConstant(MO1.getImm(), Size); break; } - + unsigned rt = Is64BitMode ? X86::reloc_pcrel_word : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word); if (Opcode == X86::MOV64ri32) @@ -937,8 +1380,10 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, case X86II::MRM2m: case X86II::MRM3m: case X86II::MRM4m: case X86II::MRM5m: case X86II::MRM6m: case X86II::MRM7m: { + if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV). + ++CurOp; intptr_t PCAdj = (CurOp + X86::AddrNumOperands != NumOps) ? 
- (MI.getOperand(CurOp+X86::AddrNumOperands).isImm() ? + (MI.getOperand(CurOp+X86::AddrNumOperands).isImm() ? X86II::getSizeOfImm(Desc->TSFlags) : 4) : 0; MCE.emitByte(BaseOpcode); @@ -948,14 +1393,14 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, if (CurOp == NumOps) break; - + const MachineOperand &MO = MI.getOperand(CurOp++); unsigned Size = X86II::getSizeOfImm(Desc->TSFlags); if (MO.isImm()) { emitConstant(MO.getImm(), Size); break; } - + unsigned rt = Is64BitMode ? X86::reloc_pcrel_word : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word); if (Opcode == X86::MOV64mi32) @@ -980,7 +1425,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg())); ++CurOp; break; - + case X86II::MRM_C1: MCE.emitByte(BaseOpcode); MCE.emitByte(0xC1); @@ -1003,6 +1448,33 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, break; } + while (CurOp != NumOps && NumOps - CurOp <= 2) { + // The last source register of a 4 operand instruction in AVX is encoded + // in bits[7:4] of a immediate byte. + if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) { + const MachineOperand &MO = MI.getOperand(HasMemOp4 ? MemOp4_I8IMMOperand + : CurOp); + ++CurOp; + unsigned RegNum = X86_MC::getX86RegNum(MO.getReg()) << 4; + if (X86II::isX86_64ExtendedReg(MO.getReg())) + RegNum |= 1 << 7; + // If there is an additional 5th operand it must be an immediate, which + // is encoded in bits[3:0] + if (CurOp != NumOps) { + const MachineOperand &MIMM = MI.getOperand(CurOp++); + if (MIMM.isImm()) { + unsigned Val = MIMM.getImm(); + assert(Val < 16 && "Immediate operand value out of range"); + RegNum |= Val; + } + } + emitConstant(RegNum, 1); + } else { + emitConstant(MI.getOperand(CurOp++).getImm(), + X86II::getSizeOfImm(Desc->TSFlags)); + } + } + if (!MI.isVariadic() && CurOp != NumOps) { #ifndef NDEBUG dbgs() << "Cannot encode all operands of: " << MI << "\n"; diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 69752c5..585b7a5 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -183,37 +183,37 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, case MVT::i1: case MVT::i8: Opc = X86::MOV8rm; - RC = X86::GR8RegisterClass; + RC = &X86::GR8RegClass; break; case MVT::i16: Opc = X86::MOV16rm; - RC = X86::GR16RegisterClass; + RC = &X86::GR16RegClass; break; case MVT::i32: Opc = X86::MOV32rm; - RC = X86::GR32RegisterClass; + RC = &X86::GR32RegClass; break; case MVT::i64: // Must be in x86-64 mode. Opc = X86::MOV64rm; - RC = X86::GR64RegisterClass; + RC = &X86::GR64RegClass; break; case MVT::f32: if (X86ScalarSSEf32) { Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm; - RC = X86::FR32RegisterClass; + RC = &X86::FR32RegClass; } else { Opc = X86::LD_Fp32m; - RC = X86::RFP32RegisterClass; + RC = &X86::RFP32RegClass; } break; case MVT::f64: if (X86ScalarSSEf64) { Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm; - RC = X86::FR64RegisterClass; + RC = &X86::FR64RegClass; } else { Opc = X86::LD_Fp64m; - RC = X86::RFP64RegisterClass; + RC = &X86::RFP64RegClass; } break; case MVT::f80: @@ -240,7 +240,7 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) { default: return false; case MVT::i1: { // Mask out all but lowest bit. 
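// (Illustration: the AND8ri emitted below keeps only bit 0, so an i1 that
// arrives with stray upper bits, say 0xFE, is stored as 0x00, while 0x03
// is stored as 0x01.)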
- unsigned AndResult = createResultReg(X86::GR8RegisterClass); + unsigned AndResult = createResultReg(&X86::GR8RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::AND8ri), AndResult).addReg(Val).addImm(1); Val = AndResult; @@ -547,13 +547,13 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { if (TLI.getPointerTy() == MVT::i64) { Opc = X86::MOV64rm; - RC = X86::GR64RegisterClass; + RC = &X86::GR64RegClass; if (Subtarget->isPICStyleRIPRel()) StubAM.Base.Reg = X86::RIP; } else { Opc = X86::MOV32rm; - RC = X86::GR32RegisterClass; + RC = &X86::GR32RegClass; } LoadReg = createResultReg(RC); @@ -743,7 +743,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ValLocs; CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs, - I->getContext()); + I->getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); const Value *RV = Ret->getOperand(0); @@ -1258,7 +1258,7 @@ bool X86FastISel::X86SelectFPExt(const Instruction *I) { if (V->getType()->isFloatTy()) { unsigned OpReg = getRegForValue(V); if (OpReg == 0) return false; - unsigned ResultReg = createResultReg(X86::FR64RegisterClass); + unsigned ResultReg = createResultReg(&X86::FR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::CVTSS2SDrr), ResultReg) .addReg(OpReg); @@ -1277,7 +1277,7 @@ bool X86FastISel::X86SelectFPTrunc(const Instruction *I) { if (V->getType()->isDoubleTy()) { unsigned OpReg = getRegForValue(V); if (OpReg == 0) return false; - unsigned ResultReg = createResultReg(X86::FR32RegisterClass); + unsigned ResultReg = createResultReg(&X86::FR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::CVTSD2SSrr), ResultReg) .addReg(OpReg); @@ -1314,8 +1314,9 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) { if (!Subtarget->is64Bit()) { // If we're on x86-32; we can't extract an i8 from a general register. // First issue a copy to GR16_ABCD or GR32_ABCD. - const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16) - ? X86::GR16_ABCDRegisterClass : X86::GR32_ABCDRegisterClass; + const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16) ? + (const TargetRegisterClass*)&X86::GR16_ABCDRegClass : + (const TargetRegisterClass*)&X86::GR32_ABCDRegClass; unsigned CopyReg = createResultReg(CopyRC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), CopyReg).addReg(InputReg); @@ -1423,7 +1424,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { return DoSelectCall(&I, "memset"); } case Intrinsic::stackprotector: { - // Emit code inline code to store the stack guard onto the stack. + // Emit code to store the stack guard onto the stack. EVT PtrTy = TLI.getPointerTy(); const Value *Op1 = I.getArgOperand(0); // The guard's value. @@ -1484,7 +1485,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { return false; // The call to CreateRegs builds two sequential registers, to store the - // both the the returned values. + // both the returned values. unsigned ResultReg = FuncInfo.CreateRegs(I.getType()); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpC), ResultReg) .addReg(Reg1).addReg(Reg2); @@ -1548,12 +1549,11 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { // Check whether the function can return without sret-demotion. 
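// (If it cannot, i.e. the value does not fit in the return registers and
// would have to be demoted to a hidden sret pointer argument, fast-isel
// returns false below and leaves the call to the SelectionDAG path.)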
SmallVector<ISD::OutputArg, 4> Outs; - SmallVector<uint64_t, 4> Offsets; GetReturnInfo(I->getType(), CS.getAttributes().getRetAttributes(), - Outs, TLI, &Offsets); + Outs, TLI); bool CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(), - *FuncInfo.MF, FTy->isVarArg(), - Outs, FTy->getContext()); + *FuncInfo.MF, FTy->isVarArg(), + Outs, FTy->getContext()); if (!CanLowerReturn) return false; @@ -1667,7 +1667,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, ArgLocs, - I->getParent()->getContext()); + I->getParent()->getContext()); // Allocate shadow area for Win64 if (Subtarget->isTargetWin64()) @@ -1693,7 +1693,6 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { // Promote the value if needed. switch (VA.getLocInfo()) { - default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::SExt: { assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && @@ -1737,6 +1736,14 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { ArgVT = VA.getLocVT(); break; } + case CCValAssign::VExt: + // VExt has not been implemented, so this should be impossible to reach + // for now. However, fallback to Selection DAG isel once implemented. + return false; + case CCValAssign::Indirect: + // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully + // support this. + return false; } if (VA.isRegLoc()) { @@ -1838,25 +1845,27 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { MIB.addGlobalAddress(GV, 0, OpFlags); } + // Add a register mask with the call-preserved registers. + // Proper defs for return values will be added by setPhysRegsDeadExcept(). + MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv())); + // Add an implicit use GOT pointer in EBX. if (Subtarget->isPICStyleGOT()) - MIB.addReg(X86::EBX); + MIB.addReg(X86::EBX, RegState::Implicit); if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64()) - MIB.addReg(X86::AL); + MIB.addReg(X86::AL, RegState::Implicit); // Add implicit physical register uses to the call. for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) - MIB.addReg(RegArgs[i]); - - // Add a register mask with the call-preserved registers. - // Proper defs for return values will be added by setPhysRegsDeadExcept(). 
- MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv())); + MIB.addReg(RegArgs[i], RegState::Implicit); // Issue CALLSEQ_END unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); unsigned NumBytesCallee = 0; if (!Subtarget->is64Bit() && !Subtarget->isTargetWindows() && + !(CS.getCallingConv() == CallingConv::Fast || + CS.getCallingConv() == CallingConv::GHC) && CS.paramHasAttr(1, Attribute::StructRet)) NumBytesCallee = 4; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(AdjStackUp)) @@ -1889,7 +1898,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { SmallVector<unsigned, 4> UsedRegs; SmallVector<CCValAssign, 16> RVLocs; CCState CCRetInfo(CC, false, *FuncInfo.MF, TM, RVLocs, - I->getParent()->getContext()); + I->getParent()->getContext()); unsigned ResultReg = FuncInfo.CreateRegs(I->getType()); CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86); for (unsigned i = 0; i != RVLocs.size(); ++i) { @@ -1903,7 +1912,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { RVLocs[i].getLocReg() == X86::ST1)) { if (isScalarFPTypeInSSEReg(RVLocs[i].getValVT())) { CopyVT = MVT::f80; - CopyReg = createResultReg(X86::RFP80RegisterClass); + CopyReg = createResultReg(&X86::RFP80RegClass); } BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::FpPOP_RETVAL), CopyReg); @@ -2001,37 +2010,37 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { default: return false; case MVT::i8: Opc = X86::MOV8rm; - RC = X86::GR8RegisterClass; + RC = &X86::GR8RegClass; break; case MVT::i16: Opc = X86::MOV16rm; - RC = X86::GR16RegisterClass; + RC = &X86::GR16RegClass; break; case MVT::i32: Opc = X86::MOV32rm; - RC = X86::GR32RegisterClass; + RC = &X86::GR32RegClass; break; case MVT::i64: // Must be in x86-64 mode. Opc = X86::MOV64rm; - RC = X86::GR64RegisterClass; + RC = &X86::GR64RegClass; break; case MVT::f32: if (X86ScalarSSEf32) { Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm; - RC = X86::FR32RegisterClass; + RC = &X86::FR32RegClass; } else { Opc = X86::LD_Fp32m; - RC = X86::RFP32RegisterClass; + RC = &X86::RFP32RegClass; } break; case MVT::f64: if (X86ScalarSSEf64) { Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm; - RC = X86::FR64RegisterClass; + RC = &X86::FR64RegClass; } else { Opc = X86::LD_Fp64m; - RC = X86::RFP64RegisterClass; + RC = &X86::RFP64RegClass; } break; case MVT::f80: @@ -2124,19 +2133,19 @@ unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) { case MVT::f32: if (X86ScalarSSEf32) { Opc = X86::FsFLD0SS; - RC = X86::FR32RegisterClass; + RC = &X86::FR32RegClass; } else { Opc = X86::LD_Fp032; - RC = X86::RFP32RegisterClass; + RC = &X86::RFP32RegClass; } break; case MVT::f64: if (X86ScalarSSEf64) { Opc = X86::FsFLD0SD; - RC = X86::FR64RegisterClass; + RC = &X86::FR64RegClass; } else { Opc = X86::LD_Fp064; - RC = X86::RFP64RegisterClass; + RC = &X86::RFP64RegClass; } break; case MVT::f80: diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index ed1707d..711ee41 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -130,7 +130,7 @@ namespace { // The hardware keeps track of how many FP registers are live, so we have // to model that exactly. Usually, each live register corresponds to an // FP<n> register, but when dealing with calls, returns, and inline - // assembly, it is sometimes neccesary to have live scratch registers. + // assembly, it is sometimes necessary to have live scratch registers. 
unsigned Stack[8]; // FP<n> Registers in each stack slot... unsigned StackTop; // The current top of the FP stack. diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 000e375..2238688 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -45,14 +45,14 @@ bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { bool X86FrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineModuleInfo &MMI = MF.getMMI(); - const TargetRegisterInfo *RI = TM.getRegisterInfo(); + const TargetRegisterInfo *RegInfo = TM.getRegisterInfo(); return (MF.getTarget().Options.DisableFramePointerElim(MF) || - RI->needsStackRealignment(MF) || + RegInfo->needsStackRealignment(MF) || MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() || MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() || - MMI.callsUnwindInit()); + MMI.callsUnwindInit() || MMI.callsEHReturn()); } static unsigned getSUBriOpcode(unsigned is64Bit, int64_t Imm) { @@ -125,8 +125,8 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, unsigned Reg = MO.getReg(); if (!Reg) continue; - for (const uint16_t *AsI = TRI.getOverlaps(Reg); *AsI; ++AsI) - Uses.insert(*AsI); + for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI) + Uses.insert(*AI); } const uint16_t *CS = Is64Bit ? CallerSavedRegs64Bit : CallerSavedRegs32Bit; @@ -369,7 +369,7 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF, /// getCompactUnwindRegNum - Get the compact unwind number for a given /// register. The number corresponds to the enum lists in /// compact_unwind_encoding.h. -static int getCompactUnwindRegNum(const unsigned *CURegs, unsigned Reg) { +static int getCompactUnwindRegNum(const uint16_t *CURegs, unsigned Reg) { for (int Idx = 1; *CURegs; ++CURegs, ++Idx) if (*CURegs == Reg) return Idx; @@ -398,13 +398,13 @@ encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS], // 4 3 // 5 3 // - static const unsigned CU32BitRegs[] = { + static const uint16_t CU32BitRegs[] = { X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 }; - static const unsigned CU64BitRegs[] = { + static const uint16_t CU64BitRegs[] = { X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 }; - const unsigned *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs); + const uint16_t *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs); for (unsigned i = 0; i != CU_NUM_SAVED_REGS; ++i) { int CUReg = getCompactUnwindRegNum(CURegs, SavedRegs[i]); @@ -466,13 +466,13 @@ encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS], static uint32_t encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS], bool Is64Bit) { - static const unsigned CU32BitRegs[] = { + static const uint16_t CU32BitRegs[] = { X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 }; - static const unsigned CU64BitRegs[] = { + static const uint16_t CU64BitRegs[] = { X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 }; - const unsigned *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs); + const uint16_t *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs); // Encode the registers in the order they were saved, 3-bits per register. The // registers are numbered from 1 to CU_NUM_SAVED_REGS. 
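A standalone sketch of the packing that comment describes (CU_NUM_SAVED_REGS is 6 here; the bit order is illustrative of the scheme, not a byte-for-byte copy of the function):

  #include <cstdint>
  // Pack six 3-bit compact-unwind register indices (1..6, 0 = none).
  static uint32_t packSavedRegs(const int Idx[6]) {
    uint32_t Encoding = 0;
    for (int i = 0; i != 6; ++i)
      Encoding = (Encoding << 3) | uint32_t(Idx[i] & 7);
    return Encoding;
  }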
@@ -650,6 +650,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { unsigned SlotSize = RegInfo->getSlotSize(); unsigned FramePtr = RegInfo->getFrameRegister(MF); unsigned StackPtr = RegInfo->getStackRegister(); + unsigned BasePtr = RegInfo->getBaseRegister(); DebugLoc DL; // If we're forcing a stack realignment we can't rely on just the frame @@ -721,10 +722,14 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { if (HasFP) { // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; - if (RegInfo->needsStackRealignment(MF)) - FrameSize = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign; - - NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize(); + if (RegInfo->needsStackRealignment(MF)) { + // Callee-saved registers are pushed on stack before the stack + // is realigned. + FrameSize -= X86FI->getCalleeSavedFrameSize(); + NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign; + } else { + NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize(); + } // Get the offset of the stack slot for the EBP register, which is // guaranteed to be the last slot by processFunctionBeforeFrameFinalized. @@ -781,19 +786,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end(); I != E; ++I) I->addLiveIn(FramePtr); - - // Realign stack - if (RegInfo->needsStackRealignment(MF)) { - MachineInstr *MI = - BuildMI(MBB, MBBI, DL, - TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri), StackPtr) - .addReg(StackPtr) - .addImm(-MaxAlign) - .setMIFlag(MachineInstr::FrameSetup); - - // The EFLAGS implicit def is dead. - MI->getOperand(3).setIsDead(); - } } else { NumBytes = StackSize - X86FI->getCalleeSavedFrameSize(); } @@ -823,6 +815,27 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { } } + // Realign stack after we pushed callee-saved registers (so that we'll be + // able to calculate their offsets from the frame pointer). + + // NOTE: We push the registers before realigning the stack, so + // vector callee-saved (xmm) registers may be saved w/o proper + // alignment in this way. However, currently these regs are saved in + // stack slots (see X86FrameLowering::spillCalleeSavedRegisters()), so + // this shouldn't be a problem. + if (RegInfo->needsStackRealignment(MF)) { + assert(HasFP && "There should be a frame pointer if stack is realigned."); + MachineInstr *MI = + BuildMI(MBB, MBBI, DL, + TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri), StackPtr) + .addReg(StackPtr) + .addImm(-MaxAlign) + .setMIFlag(MachineInstr::FrameSetup); + + // The EFLAGS implicit def is dead. + MI->getOperand(3).setIsDead(); + } + DL = MBB.findDebugLoc(MBBI); // If there is an SUB32ri of ESP immediately before this instruction, merge @@ -913,6 +926,18 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, UseLEA, TII, *RegInfo); + // If we need a base pointer, set it up here. It's whatever the value + // of the stack pointer is at this point. Any variable size objects + // will be allocated after this, so we can still use the base pointer + // to reference locals. + if (RegInfo->hasBasePointer(MF)) { + // Update the frame pointer with the current stack pointer. + unsigned Opc = Is64Bit ? 
X86::MOV64rr : X86::MOV32rr; + BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr) + .addReg(StackPtr) + .setMIFlag(MachineInstr::FrameSetup); + } + if (( (!HasFP && NumBytes) || PushedRegs) && needsFrameMoves) { // Mark end of stack pointer adjustment. MCSymbol *Label = MMI.getContext().CreateTempSymbol(); @@ -997,10 +1022,14 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, if (hasFP(MF)) { // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; - if (RegInfo->needsStackRealignment(MF)) - FrameSize = (FrameSize + MaxAlign - 1)/MaxAlign*MaxAlign; - - NumBytes = FrameSize - CSSize; + if (RegInfo->needsStackRealignment(MF)) { + // Callee-saved registers were pushed on stack before the stack + // was realigned. + FrameSize -= CSSize; + NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign; + } else { + NumBytes = FrameSize - CSSize; + } // Pop EBP. BuildMI(MBB, MBBI, DL, @@ -1010,7 +1039,6 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } // Skip the callee-saved pop instructions. - MachineBasicBlock::iterator LastCSPop = MBBI; while (MBBI != MBB.begin()) { MachineBasicBlock::iterator PI = prior(MBBI); unsigned Opc = PI->getOpcode(); @@ -1021,6 +1049,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, --MBBI; } + MachineBasicBlock::iterator FirstCSPop = MBBI; DL = MBBI->getDebugLoc(); @@ -1032,28 +1061,16 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // If dynamic alloca is used, then reset esp to point to the last callee-saved // slot before popping them off! Same applies for the case, when stack was // realigned. - if (RegInfo->needsStackRealignment(MF)) { - // We cannot use LEA here, because stack pointer was realigned. We need to - // deallocate local frame back. - if (CSSize) { - emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, UseLEA, TII, - *RegInfo); - MBBI = prior(LastCSPop); - } - - BuildMI(MBB, MBBI, DL, - TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), - StackPtr).addReg(FramePtr); - } else if (MFI->hasVarSizedObjects()) { - if (CSSize) { - unsigned Opc = Is64Bit ? X86::LEA64r : X86::LEA32r; - MachineInstr *MI = - addRegOffset(BuildMI(MF, DL, TII.get(Opc), StackPtr), - FramePtr, false, -CSSize); - MBB.insert(MBBI, MI); + if (RegInfo->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) { + if (RegInfo->needsStackRealignment(MF)) + MBBI = FirstCSPop; + if (CSSize != 0) { + unsigned Opc = getLEArOpcode(Is64Bit); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), + FramePtr, false, -CSSize); } else { - BuildMI(MBB, MBBI, DL, - TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), StackPtr) + unsigned Opc = (Is64Bit ? X86::MOV64rr : X86::MOV32rr); + BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) .addReg(FramePtr); } } else if (NumBytes) { @@ -1124,8 +1141,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } MachineInstr *NewMI = prior(MBBI); - for (unsigned i = 2, e = MBBI->getNumOperands(); i != e; ++i) - NewMI->addOperand(MBBI->getOperand(i)); + NewMI->copyImplicitOps(MBBI); // Delete the pseudo instruction TCRETURN. 
MBB.erase(MBBI); @@ -1142,16 +1158,25 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) const { - const X86RegisterInfo *RI = + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo()); const MachineFrameInfo *MFI = MF.getFrameInfo(); int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea(); uint64_t StackSize = MFI->getStackSize(); - if (RI->needsStackRealignment(MF)) { + if (RegInfo->hasBasePointer(MF)) { + assert (hasFP(MF) && "VLAs and dynamic stack realign, but no FP?!"); if (FI < 0) { // Skip the saved EBP. - Offset += RI->getSlotSize(); + return Offset + RegInfo->getSlotSize(); + } else { + assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0); + return Offset + StackSize; + } + } else if (RegInfo->needsStackRealignment(MF)) { + if (FI < 0) { + // Skip the saved EBP. + return Offset + RegInfo->getSlotSize(); } else { assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0); return Offset + StackSize; @@ -1162,7 +1187,7 @@ int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) con return Offset + StackSize; // Skip the saved EBP. - Offset += RI->getSlotSize(); + Offset += RegInfo->getSlotSize(); // Skip the RETADDR move area const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); @@ -1174,6 +1199,22 @@ int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) con return Offset; } +int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const { + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo()); + // We can't calculate offset from frame pointer if the stack is realigned, + // so enforce usage of stack/base pointer. The base pointer is used when we + // have dynamic allocas in addition to dynamic realignment. + if (RegInfo->hasBasePointer(MF)) + FrameReg = RegInfo->getBaseRegister(); + else if (RegInfo->needsStackRealignment(MF)) + FrameReg = RegInfo->getStackRegister(); + else + FrameReg = RegInfo->getFrameRegister(MF); + return getFrameIndexOffset(MF, FI); +} + bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, @@ -1307,6 +1348,10 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, "Slot for EBP register must be last in order to be found!"); (void)FrameIdx; } + + // Spill the BasePtr if it's used. 
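Aside (illustration, not part of the patch): getFrameIndexReference, added above, decides which register frame-index offsets are measured against. The three-way choice in miniature, with hypothetical names:

    #include <iostream>

    enum class Reg { BasePtr, StackPtr, FramePtr };

    // Sketch of the selection in X86FrameLowering::getFrameIndexReference:
    // the base pointer wins when dynamic allocas coexist with realignment
    // (both SP and FP move), the stack pointer when realignment alone makes
    // FP-relative offsets unknown, and the frame pointer otherwise.
    Reg pickFrameReg(bool HasBasePtr, bool NeedsRealign) {
      if (HasBasePtr)   return Reg::BasePtr;
      if (NeedsRealign) return Reg::StackPtr;
      return Reg::FramePtr;
    }

    int main() {
      std::cout << (pickFrameReg(true, true)  == Reg::BasePtr)  << '\n'; // 1
      std::cout << (pickFrameReg(false, true) == Reg::StackPtr) << '\n'; // 1
    }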
+ if (RegInfo->hasBasePointer(MF)) + MF.getRegInfo().setPhysRegUsed(RegInfo->getBaseRegister()); } static bool diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h index d55a497..dc515dc 100644 --- a/lib/Target/X86/X86FrameLowering.h +++ b/lib/Target/X86/X86FrameLowering.h @@ -60,6 +60,8 @@ public: bool hasReservedCallFrame(const MachineFunction &MF) const; int getFrameIndexOffset(const MachineFunction &MF, int FI) const; + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const; uint32_t getCompactUnwindEncoding(MachineFunction &MF) const; }; diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 8e2b1d6..5186482 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -187,6 +187,7 @@ namespace { private: SDNode *Select(SDNode *N); + SDNode *SelectGather(SDNode *N, unsigned Opc); SDNode *SelectAtomic64(SDNode *Node, unsigned Opc); SDNode *SelectAtomicLoadAdd(SDNode *Node, EVT NVT); SDNode *SelectAtomicLoadArith(SDNode *Node, EVT NVT); @@ -1905,6 +1906,20 @@ static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc, ChainCheck = true; continue; } + + // Make sure using Op as part of the chain would not cause a cycle here. + // In theory, we could check whether the chain node is a predecessor of + // the load. But that can be very expensive. Instead visit the uses and + // make sure they all have smaller node id than the load. + int LoadId = LoadNode->getNodeId(); + for (SDNode::use_iterator UI = Op.getNode()->use_begin(), + UE = UI->use_end(); UI != UE; ++UI) { + if (UI.getUse().getResNo() != 0) + continue; + if (UI->getNodeId() > LoadId) + return false; + } + ChainOps.push_back(Op); } @@ -1938,6 +1953,38 @@ static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) { llvm_unreachable("unrecognized size for LdVT"); } +/// SelectGather - Customized ISel for GATHER operations. +/// +SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) { + // Operands of Gather: VSrc, Base, VIdx, VMask, Scale + SDValue Chain = Node->getOperand(0); + SDValue VSrc = Node->getOperand(2); + SDValue Base = Node->getOperand(3); + SDValue VIdx = Node->getOperand(4); + SDValue VMask = Node->getOperand(5); + ConstantSDNode *Scale = dyn_cast<ConstantSDNode>(Node->getOperand(6)); + if (!Scale) + return 0; + + SDVTList VTs = CurDAG->getVTList(VSrc.getValueType(), VSrc.getValueType(), + MVT::Other); + + // Memory Operands: Base, Scale, Index, Disp, Segment + SDValue Disp = CurDAG->getTargetConstant(0, MVT::i32); + SDValue Segment = CurDAG->getRegister(0, MVT::i32); + const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue()), VIdx, + Disp, Segment, VMask, Chain}; + SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(), + VTs, Ops, array_lengthof(Ops)); + // Node has 2 outputs: VDst and MVT::Other. + // ResNode has 3 outputs: VDst, VMask_wb, and MVT::Other. + // We replace VDst of Node with VDst of ResNode, and Other of Node with Other + // of ResNode. 
+ ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0)); + ReplaceUses(SDValue(Node, 1), SDValue(ResNode, 2)); + return ResNode; +} + SDNode *X86DAGToDAGISel::Select(SDNode *Node) { EVT NVT = Node->getValueType(0); unsigned Opc, MOpc; @@ -1953,23 +2000,82 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { switch (Opcode) { default: break; + case ISD::INTRINSIC_W_CHAIN: { + unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue(); + switch (IntNo) { + default: break; + case Intrinsic::x86_avx2_gather_d_pd: + case Intrinsic::x86_avx2_gather_d_pd_256: + case Intrinsic::x86_avx2_gather_q_pd: + case Intrinsic::x86_avx2_gather_q_pd_256: + case Intrinsic::x86_avx2_gather_d_ps: + case Intrinsic::x86_avx2_gather_d_ps_256: + case Intrinsic::x86_avx2_gather_q_ps: + case Intrinsic::x86_avx2_gather_q_ps_256: + case Intrinsic::x86_avx2_gather_d_q: + case Intrinsic::x86_avx2_gather_d_q_256: + case Intrinsic::x86_avx2_gather_q_q: + case Intrinsic::x86_avx2_gather_q_q_256: + case Intrinsic::x86_avx2_gather_d_d: + case Intrinsic::x86_avx2_gather_d_d_256: + case Intrinsic::x86_avx2_gather_q_d: + case Intrinsic::x86_avx2_gather_q_d_256: { + unsigned Opc; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); + case Intrinsic::x86_avx2_gather_d_pd: Opc = X86::VGATHERDPDrm; break; + case Intrinsic::x86_avx2_gather_d_pd_256: Opc = X86::VGATHERDPDYrm; break; + case Intrinsic::x86_avx2_gather_q_pd: Opc = X86::VGATHERQPDrm; break; + case Intrinsic::x86_avx2_gather_q_pd_256: Opc = X86::VGATHERQPDYrm; break; + case Intrinsic::x86_avx2_gather_d_ps: Opc = X86::VGATHERDPSrm; break; + case Intrinsic::x86_avx2_gather_d_ps_256: Opc = X86::VGATHERDPSYrm; break; + case Intrinsic::x86_avx2_gather_q_ps: Opc = X86::VGATHERQPSrm; break; + case Intrinsic::x86_avx2_gather_q_ps_256: Opc = X86::VGATHERQPSYrm; break; + case Intrinsic::x86_avx2_gather_d_q: Opc = X86::VPGATHERDQrm; break; + case Intrinsic::x86_avx2_gather_d_q_256: Opc = X86::VPGATHERDQYrm; break; + case Intrinsic::x86_avx2_gather_q_q: Opc = X86::VPGATHERQQrm; break; + case Intrinsic::x86_avx2_gather_q_q_256: Opc = X86::VPGATHERQQYrm; break; + case Intrinsic::x86_avx2_gather_d_d: Opc = X86::VPGATHERDDrm; break; + case Intrinsic::x86_avx2_gather_d_d_256: Opc = X86::VPGATHERDDYrm; break; + case Intrinsic::x86_avx2_gather_q_d: Opc = X86::VPGATHERQDrm; break; + case Intrinsic::x86_avx2_gather_q_d_256: Opc = X86::VPGATHERQDYrm; break; + } + SDNode *RetVal = SelectGather(Node, Opc); + if (RetVal) + // We already called ReplaceUses inside SelectGather. 
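Aside (illustration, not part of the patch): the isLoadIncOrDecStore hunk earlier in this file sidesteps an expensive predecessor walk with a node-id comparison: ids are assigned in topological order, so a user of the chain operand numbered after the load could transitively depend on it, and folding is rejected. A toy version of the test (types invented; the real code also skips uses of non-chain results):

    #include <iostream>
    #include <vector>

    struct ToyNode {
      int Id;                         // topological id, as assigned before ISel
      std::vector<ToyNode*> Users;
    };

    // Conservative cycle check: refuse to fold if any user of the candidate
    // chain operand was numbered after the load.
    bool safeToFold(const ToyNode &ChainOp, const ToyNode &Load) {
      for (const ToyNode *U : ChainOp.Users)
        if (U->Id > Load.Id)
          return false;
      return true;
    }

    int main() {
      ToyNode Load{10, {}}, Early{5, {}}, Late{12, {}};
      ToyNode Op{3, {&Early}};
      std::cout << safeToFold(Op, Load) << '\n'; // 1: every user precedes the load
      Op.Users.push_back(&Late);
      std::cout << safeToFold(Op, Load) << '\n'; // 0: a later user may form a cycle
    }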
+ return NULL; + break; + } + } + break; + } case X86ISD::GlobalBaseReg: return getGlobalBaseReg(); + case X86ISD::ATOMOR64_DAG: - return SelectAtomic64(Node, X86::ATOMOR6432); case X86ISD::ATOMXOR64_DAG: - return SelectAtomic64(Node, X86::ATOMXOR6432); case X86ISD::ATOMADD64_DAG: - return SelectAtomic64(Node, X86::ATOMADD6432); case X86ISD::ATOMSUB64_DAG: - return SelectAtomic64(Node, X86::ATOMSUB6432); case X86ISD::ATOMNAND64_DAG: - return SelectAtomic64(Node, X86::ATOMNAND6432); case X86ISD::ATOMAND64_DAG: - return SelectAtomic64(Node, X86::ATOMAND6432); - case X86ISD::ATOMSWAP64_DAG: - return SelectAtomic64(Node, X86::ATOMSWAP6432); + case X86ISD::ATOMSWAP64_DAG: { + unsigned Opc; + switch (Opcode) { + default: llvm_unreachable("Impossible intrinsic"); + case X86ISD::ATOMOR64_DAG: Opc = X86::ATOMOR6432; break; + case X86ISD::ATOMXOR64_DAG: Opc = X86::ATOMXOR6432; break; + case X86ISD::ATOMADD64_DAG: Opc = X86::ATOMADD6432; break; + case X86ISD::ATOMSUB64_DAG: Opc = X86::ATOMSUB6432; break; + case X86ISD::ATOMNAND64_DAG: Opc = X86::ATOMNAND6432; break; + case X86ISD::ATOMAND64_DAG: Opc = X86::ATOMAND6432; break; + case X86ISD::ATOMSWAP64_DAG: Opc = X86::ATOMSWAP6432; break; + } + SDNode *RetVal = SelectAtomic64(Node, Opc); + if (RetVal) + return RetVal; + break; + } case ISD::ATOMIC_LOAD_ADD: { SDNode *RetVal = SelectAtomicLoadAdd(Node, NVT); @@ -2128,7 +2234,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg, - N0, SDValue()).getValue(1); + N0, SDValue()).getValue(1); if (foldedLoad) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), @@ -2168,7 +2274,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // Copy the low half of the result, if it is needed. if (!SDValue(Node, 0).use_empty()) { SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, - LoReg, NVT, InFlag); + LoReg, NVT, InFlag); InFlag = Result.getValue(2); ReplaceUses(SDValue(Node, 0), Result); DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 3042386..b88f2fa 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -63,41 +63,33 @@ static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, /// simple subregister reference. Idx is an index in the 128 bits we /// want. It need not be aligned to a 128-bit bounday. That makes /// lowering EXTRACT_VECTOR_ELT operations easier. -static SDValue Extract128BitVector(SDValue Vec, - SDValue Idx, - SelectionDAG &DAG, - DebugLoc dl) { +static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, DebugLoc dl) { EVT VT = Vec.getValueType(); assert(VT.getSizeInBits() == 256 && "Unexpected vector size!"); EVT ElVT = VT.getVectorElementType(); - int Factor = VT.getSizeInBits()/128; + unsigned Factor = VT.getSizeInBits()/128; EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, VT.getVectorNumElements()/Factor); // Extract from UNDEF is UNDEF. if (Vec.getOpcode() == ISD::UNDEF) - return DAG.getNode(ISD::UNDEF, dl, ResultVT); - - if (isa<ConstantSDNode>(Idx)) { - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + return DAG.getUNDEF(ResultVT); - // Extract the relevant 128 bits. Generate an EXTRACT_SUBVECTOR - // we can match to VEXTRACTF128. - unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits(); + // Extract the relevant 128 bits. 
Generate an EXTRACT_SUBVECTOR + // we can match to VEXTRACTF128. + unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits(); - // This is the index of the first element of the 128-bit chunk - // we want. - unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128) - * ElemsPerChunk); + // This is the index of the first element of the 128-bit chunk + // we want. + unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128) + * ElemsPerChunk); - SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32); - SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, - VecIdx); + SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32); + SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, + VecIdx); - return Result; - } - - return SDValue(); + return Result; } /// Generate a DAG to put 128-bits into a vector > 128 bits. This @@ -105,34 +97,41 @@ static SDValue Extract128BitVector(SDValue Vec, /// simple superregister reference. Idx is an index in the 128 bits /// we want. It need not be aligned to a 128-bit bounday. That makes /// lowering INSERT_VECTOR_ELT operations easier. -static SDValue Insert128BitVector(SDValue Result, - SDValue Vec, - SDValue Idx, - SelectionDAG &DAG, +static SDValue Insert128BitVector(SDValue Result, SDValue Vec, + unsigned IdxVal, SelectionDAG &DAG, DebugLoc dl) { - if (isa<ConstantSDNode>(Idx)) { - EVT VT = Vec.getValueType(); - assert(VT.getSizeInBits() == 128 && "Unexpected vector size!"); + // Inserting UNDEF is Result + if (Vec.getOpcode() == ISD::UNDEF) + return Result; - EVT ElVT = VT.getVectorElementType(); - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); - EVT ResultVT = Result.getValueType(); + EVT VT = Vec.getValueType(); + assert(VT.getSizeInBits() == 128 && "Unexpected vector size!"); - // Insert the relevant 128 bits. - unsigned ElemsPerChunk = 128/ElVT.getSizeInBits(); + EVT ElVT = VT.getVectorElementType(); + EVT ResultVT = Result.getValueType(); - // This is the index of the first element of the 128-bit chunk - // we want. - unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128) - * ElemsPerChunk); + // Insert the relevant 128 bits. + unsigned ElemsPerChunk = 128/ElVT.getSizeInBits(); - SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32); - Result = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, - VecIdx); - return Result; - } + // This is the index of the first element of the 128-bit chunk + // we want. + unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128) + * ElemsPerChunk); - return SDValue(); + SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, + VecIdx); +} + +/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128 +/// instructions. This is used because creating CONCAT_VECTOR nodes of +/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower +/// large BUILD_VECTORS. 
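Aside (illustration, not part of the patch): both refactored helpers above reduce an arbitrary element index to the first element of the 128-bit chunk containing it, the only granularity VEXTRACTF128/VINSERTF128 can address. The arithmetic, standalone:

    #include <iostream>

    // First element of the 128-bit chunk holding element IdxVal of a vector
    // whose elements are EltBits wide (the NormalizedIdxVal computation).
    unsigned normalizedChunkIndex(unsigned IdxVal, unsigned EltBits) {
      unsigned ElemsPerChunk = 128 / EltBits;
      return (IdxVal * EltBits / 128) * ElemsPerChunk;
    }

    int main() {
      std::cout << normalizedChunkIndex(5, 32) << '\n'; // v8f32: 5 -> 4
      std::cout << normalizedChunkIndex(3, 64) << '\n'; // v4i64: 3 -> 2
    }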
+static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT, + unsigned NumElems, SelectionDAG &DAG, + DebugLoc dl) { + SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); + return Insert128BitVector(V, V2, NumElems/2, DAG, dl); } static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { @@ -141,10 +140,12 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { if (Subtarget->isTargetEnvMacho()) { if (is64Bit) - return new X8664_MachoTargetObjectFile(); + return new X86_64MachoTargetObjectFile(); return new TargetLoweringObjectFileMachO(); } + if (Subtarget->isTargetLinux()) + return new X86LinuxTargetObjectFile(); if (Subtarget->isTargetELF()) return new TargetLoweringObjectFileELF(); if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) @@ -163,7 +164,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) TD = getTargetData(); // Set up the TargetLowering object. - static MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; + static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; // X86 is weird, it always uses i8 for shift amounts and setcc results. setBooleanContents(ZeroOrOneBooleanContent); @@ -172,11 +173,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // For 64-bit since we have so many registers use the ILP scheduler, for // 32-bit code use the register pressure specific scheduling. - // For 32 bit Atom, use Hybrid (register pressure + latency) scheduling. - if (Subtarget->is64Bit()) + // For Atom, always use ILP scheduling. + if (Subtarget->isAtom()) + setSchedulingPreference(Sched::ILP); + else if (Subtarget->is64Bit()) setSchedulingPreference(Sched::ILP); - else if (Subtarget->isAtom()) - setSchedulingPreference(Sched::Hybrid); else setSchedulingPreference(Sched::RegPressure); setStackPointerRegisterToSaveRestore(X86StackPtr); @@ -216,11 +217,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } // Set up the register classes. - addRegisterClass(MVT::i8, X86::GR8RegisterClass); - addRegisterClass(MVT::i16, X86::GR16RegisterClass); - addRegisterClass(MVT::i32, X86::GR32RegisterClass); + addRegisterClass(MVT::i8, &X86::GR8RegClass); + addRegisterClass(MVT::i16, &X86::GR16RegClass); + addRegisterClass(MVT::i32, &X86::GR32RegClass); if (Subtarget->is64Bit()) - addRegisterClass(MVT::i64, X86::GR64RegisterClass); + addRegisterClass(MVT::i64, &X86::GR64RegClass); setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); @@ -346,7 +347,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // (low) operations are left as Legal, as there are single-result // instructions for this in x86. Using the two-result multiply instructions // when both high and low results are needed must be arranged by dagcombine. - for (unsigned i = 0, e = 4; i != e; ++i) { + for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { MVT VT = IntVTs[i]; setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); @@ -493,7 +494,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setShouldFoldAtomicFences(true); // Expand certain atomics - for (unsigned i = 0, e = 4; i != e; ++i) { + for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { MVT VT = IntVTs[i]; setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); @@ -568,8 +569,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) { // f32 and f64 use SSE. // Set up the FP register classes. 
- addRegisterClass(MVT::f32, X86::FR32RegisterClass); - addRegisterClass(MVT::f64, X86::FR64RegisterClass); + addRegisterClass(MVT::f32, &X86::FR32RegClass); + addRegisterClass(MVT::f64, &X86::FR64RegClass); // Use ANDPD to simulate FABS. setOperationAction(ISD::FABS , MVT::f64, Custom); @@ -600,8 +601,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) { // Use SSE for f32, x87 for f64. // Set up the FP register classes. - addRegisterClass(MVT::f32, X86::FR32RegisterClass); - addRegisterClass(MVT::f64, X86::RFP64RegisterClass); + addRegisterClass(MVT::f32, &X86::FR32RegClass); + addRegisterClass(MVT::f64, &X86::RFP64RegClass); // Use ANDPS to simulate FABS. setOperationAction(ISD::FABS , MVT::f32, Custom); @@ -633,8 +634,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } else if (!TM.Options.UseSoftFloat) { // f32 and f64 in x87. // Set up the FP register classes. - addRegisterClass(MVT::f64, X86::RFP64RegisterClass); - addRegisterClass(MVT::f32, X86::RFP32RegisterClass); + addRegisterClass(MVT::f64, &X86::RFP64RegClass); + addRegisterClass(MVT::f32, &X86::RFP32RegClass); setOperationAction(ISD::UNDEF, MVT::f64, Expand); setOperationAction(ISD::UNDEF, MVT::f32, Expand); @@ -661,7 +662,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // Long double always uses X87. if (!TM.Options.UseSoftFloat) { - addRegisterClass(MVT::f80, X86::RFP80RegisterClass); + addRegisterClass(MVT::f80, &X86::RFP80RegClass); setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); { @@ -706,8 +707,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // First set operation action for all vector types to either promote // (for widening) or expand (for scalarization). Then we will selectively // turn on ones that can be effectively codegen'd. - for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { + for (int VT = MVT::FIRST_VECTOR_VALUETYPE; + VT <= MVT::LAST_VECTOR_VALUETYPE; ++VT) { setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand); @@ -765,8 +766,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::VSELECT, (MVT::SimpleValueType)VT, Expand); - for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) + for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE; + InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) setTruncStoreAction((MVT::SimpleValueType)VT, (MVT::SimpleValueType)InnerVT, Expand); setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); @@ -777,7 +778,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // FIXME: In order to prevent SSE instructions being expanded to MMX ones // with -msoft-float, disable use of MMX as well. if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) { - addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass); + addRegisterClass(MVT::x86mmx, &X86::VR64RegClass); // No operations on x86mmx supported, everything uses intrinsics. 
} @@ -814,7 +815,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::BITCAST, MVT::v1i64, Expand); if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) { - addRegisterClass(MVT::v4f32, X86::VR128RegisterClass); + addRegisterClass(MVT::v4f32, &X86::VR128RegClass); setOperationAction(ISD::FADD, MVT::v4f32, Legal); setOperationAction(ISD::FSUB, MVT::v4f32, Legal); @@ -831,14 +832,14 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) { - addRegisterClass(MVT::v2f64, X86::VR128RegisterClass); + addRegisterClass(MVT::v2f64, &X86::VR128RegClass); // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM // registers cannot be used even for integer operations. - addRegisterClass(MVT::v16i8, X86::VR128RegisterClass); - addRegisterClass(MVT::v8i16, X86::VR128RegisterClass); - addRegisterClass(MVT::v4i32, X86::VR128RegisterClass); - addRegisterClass(MVT::v2i64, X86::VR128RegisterClass); + addRegisterClass(MVT::v16i8, &X86::VR128RegClass); + addRegisterClass(MVT::v8i16, &X86::VR128RegClass); + addRegisterClass(MVT::v4i32, &X86::VR128RegClass); + addRegisterClass(MVT::v2i64, &X86::VR128RegClass); setOperationAction(ISD::ADD, MVT::v16i8, Legal); setOperationAction(ISD::ADD, MVT::v8i16, Legal); @@ -875,7 +876,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); // Custom lower build_vector, vector_shuffle, and extract_vector_elt. - for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) { + for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { EVT VT = (MVT::SimpleValueType)i; // Do not attempt to custom lower non-power-of-2 vectors if (!isPowerOf2_32(VT.getVectorNumElements())) @@ -904,7 +905,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. - for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) { + for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; EVT VT = SVT; @@ -1012,12 +1013,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SETCC, MVT::v2i64, Custom); if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) { - addRegisterClass(MVT::v32i8, X86::VR256RegisterClass); - addRegisterClass(MVT::v16i16, X86::VR256RegisterClass); - addRegisterClass(MVT::v8i32, X86::VR256RegisterClass); - addRegisterClass(MVT::v8f32, X86::VR256RegisterClass); - addRegisterClass(MVT::v4i64, X86::VR256RegisterClass); - addRegisterClass(MVT::v4f64, X86::VR256RegisterClass); + addRegisterClass(MVT::v32i8, &X86::VR256RegClass); + addRegisterClass(MVT::v16i16, &X86::VR256RegClass); + addRegisterClass(MVT::v8i32, &X86::VR256RegClass); + addRegisterClass(MVT::v8f32, &X86::VR256RegClass); + addRegisterClass(MVT::v4i64, &X86::VR256RegClass); + addRegisterClass(MVT::v4f64, &X86::VR256RegClass); setOperationAction(ISD::LOAD, MVT::v8f32, Legal); setOperationAction(ISD::LOAD, MVT::v4f64, Legal); @@ -1122,8 +1123,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } // Custom lower several nodes for 256-bit types. 
- for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) { + for (int i = MVT::FIRST_VECTOR_VALUETYPE; + i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; EVT VT = SVT; @@ -1145,7 +1146,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. - for (unsigned i = (unsigned)MVT::v32i8; i != (unsigned)MVT::v4i64; ++i) { + for (int i = MVT::v32i8; i != MVT::v4i64; ++i) { MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; EVT VT = SVT; @@ -1168,14 +1169,15 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion // of this type with custom code. - for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE; VT++) { + for (int VT = MVT::FIRST_VECTOR_VALUETYPE; + VT != MVT::LAST_VECTOR_VALUETYPE; VT++) { setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, Custom); } // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't @@ -1223,13 +1225,16 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::TRUNCATE); + setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::SINT_TO_FP); + setTargetDAGCombine(ISD::SETCC); + setTargetDAGCombine(ISD::FP_TO_SINT); if (Subtarget->is64Bit()) setTargetDAGCombine(ISD::MUL); - if (Subtarget->hasBMI()) - setTargetDAGCombine(ISD::XOR); + setTargetDAGCombine(ISD::XOR); computeRegisterProperties(); @@ -1244,6 +1249,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setPrefLoopAlignment(4); // 2^4 bytes. benefitFromCodePlacementOpt = true; + // Predictable cmov don't hurt on atom because it's in-order. + predictableSelectIsExpensive = !Subtarget->isAtom(); + setPrefFunctionAlignment(4); // 2^4 bytes. } @@ -1277,7 +1285,6 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { break; } } - return; } /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate @@ -1412,18 +1419,19 @@ X86TargetLowering::findRepresentativeClass(EVT VT) const{ default: return TargetLowering::findRepresentativeClass(VT); case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: - RRC = (Subtarget->is64Bit() - ? X86::GR64RegisterClass : X86::GR32RegisterClass); + RRC = Subtarget->is64Bit() ? 
+ (const TargetRegisterClass*)&X86::GR64RegClass : + (const TargetRegisterClass*)&X86::GR32RegClass; break; case MVT::x86mmx: - RRC = X86::VR64RegisterClass; + RRC = &X86::VR64RegClass; break; case MVT::f32: case MVT::f64: case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: case MVT::v4f64: - RRC = X86::VR128RegisterClass; + RRC = &X86::VR128RegClass; break; } return std::make_pair(RRC, Cost); @@ -1458,7 +1466,7 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, bool X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, - MachineFunction &MF, bool isVarArg, + MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { SmallVector<CCValAssign, 16> RVLocs; @@ -1502,6 +1510,16 @@ X86TargetLowering::LowerReturn(SDValue Chain, SDValue ValToCopy = OutVals[i]; EVT ValVT = ValToCopy.getValueType(); + // Promote values to the appropriate types + if (VA.getLocInfo() == CCValAssign::SExt) + ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); + else if (VA.getLocInfo() == CCValAssign::ZExt) + ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); + else if (VA.getLocInfo() == CCValAssign::AExt) + ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); + else if (VA.getLocInfo() == CCValAssign::BCvt) + ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy); + // If this is x86-64, and we disabled SSE, we can't return FP values, // or SSE or MMX vectors. if ((ValVT == MVT::f32 || ValVT == MVT::f64 || @@ -1639,7 +1657,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, SmallVector<CCValAssign, 16> RVLocs; bool Is64Bit = Subtarget->is64Bit(); CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); // Copy all of the result registers out of their specified physreg. @@ -1656,7 +1674,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, SDValue Val; // If this is a call to a function that returns an fp value on the floating - // point stack, we must guarantee the the value is popped from the stack, so + // point stack, we must guarantee the value is popped from the stack, so // a CopyFromReg is not good enough - the copy instruction may be eliminated // if the return value is not used. We use the FpPOP_RETVAL instruction // instead. 
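Aside (illustration, not part of the patch): the LowerReturn hunk above promotes each outgoing value to its assigned location type before copying it into the return register. The dispatch on CCValAssign::LocInfo, modeled with a placeholder enum:

    #include <iostream>

    enum class LocInfo { Full, SExt, ZExt, AExt, BCvt };

    // Which node LowerReturn now wraps the value in for each location kind.
    const char *promotionFor(LocInfo LI) {
      switch (LI) {
      case LocInfo::SExt: return "ISD::SIGN_EXTEND";
      case LocInfo::ZExt: return "ISD::ZERO_EXTEND";
      case LocInfo::AExt: return "ISD::ANY_EXTEND";
      case LocInfo::BCvt: return "ISD::BITCAST";
      case LocInfo::Full: break;        // already the right type
      }
      return "<no promotion>";
    }

    int main() {
      std::cout << promotionFor(LocInfo::ZExt) << '\n'; // ISD::ZERO_EXTEND
    }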
@@ -1851,19 +1869,19 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, EVT RegVT = VA.getLocVT(); const TargetRegisterClass *RC; if (RegVT == MVT::i32) - RC = X86::GR32RegisterClass; + RC = &X86::GR32RegClass; else if (Is64Bit && RegVT == MVT::i64) - RC = X86::GR64RegisterClass; + RC = &X86::GR64RegClass; else if (RegVT == MVT::f32) - RC = X86::FR32RegisterClass; + RC = &X86::FR32RegClass; else if (RegVT == MVT::f64) - RC = X86::FR64RegisterClass; + RC = &X86::FR64RegClass; else if (RegVT.isVector() && RegVT.getSizeInBits() == 256) - RC = X86::VR256RegisterClass; + RC = &X86::VR256RegClass; else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) - RC = X86::VR128RegisterClass; + RC = &X86::VR128RegClass; else if (RegVT == MVT::x86mmx) - RC = X86::VR64RegisterClass; + RC = &X86::VR64RegClass; else llvm_unreachable("Unknown argument type!"); @@ -2005,7 +2023,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, DAG.getIntPtrConstant(Offset)); unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], - X86::GR64RegisterClass); + &X86::GR64RegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, @@ -2021,7 +2039,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, SmallVector<SDValue, 11> SaveXMMOps; SaveXMMOps.push_back(Chain); - unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); + unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); SaveXMMOps.push_back(ALVal); @@ -2032,7 +2050,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs], - X86::VR128RegisterClass); + &X86::VR128RegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); SaveXMMOps.push_back(Val); } @@ -2128,14 +2146,19 @@ EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, } SDValue -X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, - bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, +X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + DebugLoc &dl = CLI.DL; + SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; + SmallVector<SDValue, 32> &OutVals = CLI.OutVals; + SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + CallingConv::ID CallConv = CLI.CallConv; + bool &isTailCall = CLI.IsTailCall; + bool isVarArg = CLI.IsVarArg; + MachineFunction &MF = DAG.getMachineFunction(); bool Is64Bit = Subtarget->is64Bit(); bool IsWin64 = Subtarget->isTargetWin64(); @@ -2283,27 +2306,12 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &MemOpChains[0], MemOpChains.size()); - // Build a sequence of copy-to-reg nodes chained together with token chain - // and flag operands which copy the outgoing args into registers. - SDValue InFlag; - // Tail call byval lowering might overwrite argument registers so in case of - // tail call optimization the copies to registers are lowered later. 
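Aside (illustration, not part of the patch): the hunk above collapses LowerCall's long parameter list into a single TargetLowering::CallLoweringInfo argument, unpacked by reference at the top of the function. The shape of that refactor in miniature (field set abbreviated, names invented):

    #include <iostream>
    #include <string>
    #include <vector>

    // Miniature CallLoweringInfo: one struct carries what used to be nine
    // parameters; mutable state (IsTailCall) is written back through it.
    struct MiniCLI {
      std::string Callee;
      std::vector<int> Outs;
      bool IsVarArg;
      bool IsTailCall;
    };

    void lowerCall(MiniCLI &CLI) {
      bool &isTailCall = CLI.IsTailCall;  // same aliasing style as the hunk
      if (CLI.IsVarArg)
        isTailCall = false;               // a stand-in rule for illustration
    }

    int main() {
      MiniCLI CLI{"f", {1, 2, 3}, true, true};
      lowerCall(CLI);
      std::cout << CLI.IsTailCall << '\n'; // 0
    }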
- if (!isTailCall) - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, - RegsToPass[i].second, InFlag); - InFlag = Chain.getValue(1); - } - if (Subtarget->isPICStyleGOT()) { // ELF / PIC requires GOT in the EBX register before function calls via PLT // GOT pointer. if (!isTailCall) { - Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, - DAG.getNode(X86ISD::GlobalBaseReg, - DebugLoc(), getPointerTy()), - InFlag); - InFlag = Chain.getValue(1); + RegsToPass.push_back(std::make_pair(unsigned(X86::EBX), + DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy()))); } else { // If we are tail calling and generating PIC/GOT style code load the // address of the callee into ECX. The value in ecx is used as target of @@ -2341,12 +2349,10 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, assert((Subtarget->hasSSE1() || !NumXMMRegs) && "SSE registers cannot be used when SSE is disabled"); - Chain = DAG.getCopyToReg(Chain, dl, X86::AL, - DAG.getConstant(NumXMMRegs, MVT::i8), InFlag); - InFlag = Chain.getValue(1); + RegsToPass.push_back(std::make_pair(unsigned(X86::AL), + DAG.getConstant(NumXMMRegs, MVT::i8))); } - // For tail calls lower the arguments to the 'real' stack slot. if (isTailCall) { // Force all the incoming stack arguments to be loaded from the stack @@ -2360,8 +2366,6 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, SmallVector<SDValue, 8> MemOpChains2; SDValue FIN; int FI = 0; - // Do not flag preceding copytoreg stuff together with the following stuff. - InFlag = SDValue(); if (getTargetMachine().Options.GuaranteedTailCallOpt) { for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; @@ -2401,19 +2405,20 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &MemOpChains2[0], MemOpChains2.size()); - // Copy arguments to their registers. - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, - RegsToPass[i].second, InFlag); - InFlag = Chain.getValue(1); - } - InFlag =SDValue(); - // Store the return address to the appropriate stack slot. Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit, FPDiff, dl); } + // Build a sequence of copy-to-reg nodes chained together with token chain + // and flag operands which copy the outgoing args into registers. + SDValue InFlag; + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } + if (getTargetMachine().getCodeModel() == CodeModel::Large) { assert(Is64Bit && "Large code model is only legal in 64-bit mode."); // In the 64-bit large code model, we have to make all calls @@ -2515,14 +2520,6 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, Ops.push_back(DAG.getRegister(RegsToPass[i].first, RegsToPass[i].second.getValueType())); - // Add an implicit use GOT pointer in EBX. - if (!isTailCall && Subtarget->isPICStyleGOT()) - Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy())); - - // Add an implicit use of AL for non-Windows x86 64-bit vararg functions. - if (Is64Bit && isVarArg && !IsWin64) - Ops.push_back(DAG.getRegister(X86::AL, MVT::i8)); - // Add a register mask operand representing the call-preserved registers. 
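Aside (illustration, not part of the patch): the surrounding hunks stop emitting ad-hoc glued CopyToReg nodes for EBX (the PIC GOT pointer) and AL (the x86-64 vararg XMM count); both now ride in RegsToPass so that a single loop, placed after the tail-call stack stores, emits every glued copy. A toy model of that one-pass glue chain:

    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    // Each (reg, value) pair is copied in order, every copy glued to the
    // previous one, mirroring the unified RegsToPass loop.
    void emitCopies(const std::vector<std::pair<std::string, int>> &RegsToPass) {
      int Glue = 0;                      // stands in for the threaded InFlag
      for (const auto &RV : RegsToPass)
        std::cout << "copy " << RV.second << " -> " << RV.first
                  << " (glue " << Glue++ << ")\n";
    }

    int main() {
      emitCopies({{"EDI", 1}, {"EBX", 2 /* GOT */}, {"AL", 3 /* #XMM */}});
    }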
const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); @@ -2744,7 +2741,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CC_X86); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) @@ -2765,7 +2762,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (Unused) { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; @@ -2779,12 +2776,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (!CCMatch) { SmallVector<CCValAssign, 16> RVLocs1; CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), - getTargetMachine(), RVLocs1, *DAG.getContext()); + getTargetMachine(), RVLocs1, *DAG.getContext()); CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); SmallVector<CCValAssign, 16> RVLocs2; CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), - getTargetMachine(), RVLocs2, *DAG.getContext()); + getTargetMachine(), RVLocs2, *DAG.getContext()); CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); if (RVLocs1.size() != RVLocs2.size()) @@ -2811,7 +2808,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // argument is passed on the stack. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 if (Subtarget->isTargetWin64()) { @@ -2912,6 +2909,7 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::UNPCKH: case X86ISD::VPERMILP: case X86ISD::VPERM2X128: + case X86ISD::VPERMI: return true; } } @@ -3052,10 +3050,12 @@ static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, // X > -1 -> X == 0, jump !sign. RHS = DAG.getConstant(0, RHS.getValueType()); return X86::COND_NS; - } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { + } + if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { // X < 0 -> X == 0, jump on sign. return X86::COND_S; - } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { + } + if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { // X < 1 -> X <= 0 RHS = DAG.getConstant(0, RHS.getValueType()); return X86::COND_LE; @@ -3171,12 +3171,12 @@ static bool isUndefOrEqual(int Val, int CmpVal) { return false; } -/// isSequentialOrUndefInRange - Return true if every element in Mask, begining +/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size, falls within the specified /// sequential range (L, L+Pos]. or is undef. 
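Aside (illustration, not part of the patch): the predicate whose doc comment is corrected above is small enough to state standalone; -1 plays the role of an undef mask element, exactly as in the shuffle-mask checks that follow:

    #include <iostream>
    #include <vector>

    // True if Mask[Pos..Pos+Size) is Low, Low+1, ... in order, allowing
    // undef (-1) anywhere — the property the PSHUF* matchers rely on.
    bool isSequentialOrUndefInRange(const std::vector<int> &Mask,
                                    unsigned Pos, unsigned Size, int Low) {
      for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
        if (Mask[i] != -1 && Mask[i] != Low)
          return false;
      return true;
    }

    int main() {
      std::vector<int> M = {0, -1, 2, 3, 5, 4, -1, -1};
      std::cout << isSequentialOrUndefInRange(M, 0, 4, 0) << '\n'; // 1
      std::cout << isSequentialOrUndefInRange(M, 4, 4, 4) << '\n'; // 0 (5 != 4)
    }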
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, - int Pos, int Size, int Low) { - for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low) + unsigned Pos, unsigned Size, int Low) { + for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low) if (!isUndefOrEqual(Mask[i], Low)) return false; return true; @@ -3195,8 +3195,8 @@ static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) { /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PSHUFHW. -static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT) { - if (VT != MVT::v8i16) +static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) { + if (VT != MVT::v8i16 && (!HasAVX2 || VT != MVT::v16i16)) return false; // Lower quadword copied in order or undef. @@ -3205,16 +3205,27 @@ static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT) { // Upper quadword shuffled. for (unsigned i = 4; i != 8; ++i) - if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) + if (!isUndefOrInRange(Mask[i], 4, 8)) return false; + if (VT == MVT::v16i16) { + // Lower quadword copied in order or undef. + if (!isSequentialOrUndefInRange(Mask, 8, 4, 8)) + return false; + + // Upper quadword shuffled. + for (unsigned i = 12; i != 16; ++i) + if (!isUndefOrInRange(Mask[i], 12, 16)) + return false; + } + return true; } /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PSHUFLW. -static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT) { - if (VT != MVT::v8i16) +static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) { + if (VT != MVT::v8i16 && (!HasAVX2 || VT != MVT::v16i16)) return false; // Upper quadword copied in order. @@ -3223,9 +3234,20 @@ static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT) { // Lower quadword shuffled. for (unsigned i = 0; i != 4; ++i) - if (Mask[i] >= 4) + if (!isUndefOrInRange(Mask[i], 0, 4)) + return false; + + if (VT == MVT::v16i16) { + // Upper quadword copied in order. + if (!isSequentialOrUndefInRange(Mask, 12, 4, 12)) return false; + // Lower quadword shuffled. + for (unsigned i = 8; i != 12; ++i) + if (!isUndefOrInRange(Mask[i], 8, 12)) + return false; + } + return true; } @@ -3419,11 +3441,11 @@ static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) { if (NumElems != 2 && NumElems != 4) return false; - for (unsigned i = 0; i != NumElems/2; ++i) + for (unsigned i = 0, e = NumElems/2; i != e; ++i) if (!isUndefOrEqual(Mask[i], i + NumElems)) return false; - for (unsigned i = NumElems/2; i != NumElems; ++i) + for (unsigned i = NumElems/2, e = NumElems; i != e; ++i) if (!isUndefOrEqual(Mask[i], i)) return false; @@ -3439,17 +3461,63 @@ static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) { || VT.getSizeInBits() > 128) return false; - for (unsigned i = 0; i != NumElems/2; ++i) + for (unsigned i = 0, e = NumElems/2; i != e; ++i) if (!isUndefOrEqual(Mask[i], i)) return false; - for (unsigned i = 0; i != NumElems/2; ++i) - if (!isUndefOrEqual(Mask[i + NumElems/2], i + NumElems)) + for (unsigned i = 0, e = NumElems/2; i != e; ++i) + if (!isUndefOrEqual(Mask[i + e], i + NumElems)) return false; return true; } +// +// Some special combinations that can be optimized. +// +static +SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp, + SelectionDAG &DAG) { + EVT VT = SVOp->getValueType(0); + DebugLoc dl = SVOp->getDebugLoc(); + + if (VT != MVT::v8i32 && VT != MVT::v8f32) + return SDValue(); + + ArrayRef<int> Mask = SVOp->getMask(); + + // These are the special masks that may be optimized. 
+ static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14}; + static const int MaskToOptimizeOdd[] = {1, 9, 3, 11, 5, 13, 7, 15}; + bool MatchEvenMask = true; + bool MatchOddMask = true; + for (int i=0; i<8; ++i) { + if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i])) + MatchEvenMask = false; + if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i])) + MatchOddMask = false; + } + static const int CompactionMaskEven[] = {0, 2, -1, -1, 4, 6, -1, -1}; + static const int CompactionMaskOdd [] = {1, 3, -1, -1, 5, 7, -1, -1}; + + const int *CompactionMask; + if (MatchEvenMask) + CompactionMask = CompactionMaskEven; + else if (MatchOddMask) + CompactionMask = CompactionMaskOdd; + else + return SDValue(); + + SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT); + + SDValue Op0 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(0), + UndefNode, CompactionMask); + SDValue Op1 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(1), + UndefNode, CompactionMask); + static const int UnpackMask[] = {0, 8, 1, 9, 4, 12, 5, 13}; + return DAG.getVectorShuffle(VT, dl, Op0, Op1, UnpackMask); +} + /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to UNPCKL. static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT, @@ -3881,9 +3949,8 @@ static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { for (unsigned i = 0; i != NumElts; ++i) { int Elt = N->getMaskElt(i); if (Elt < 0) continue; - Elt %= NumLaneElts; - unsigned ShAmt = i << Shift; - if (ShAmt >= 8) ShAmt -= 8; + Elt &= NumLaneElts - 1; + unsigned ShAmt = (i << Shift) % 8; Mask |= Elt << ShAmt; } @@ -3893,30 +3960,48 @@ static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) { + EVT VT = N->getValueType(0); + + assert((VT == MVT::v8i16 || VT == MVT::v16i16) && + "Unsupported vector type for PSHUFHW"); + + unsigned NumElts = VT.getVectorNumElements(); + unsigned Mask = 0; - // 8 nodes, but we only care about the last 4. - for (unsigned i = 7; i >= 4; --i) { - int Val = N->getMaskElt(i); - if (Val >= 0) - Mask |= (Val - 4); - if (i != 4) - Mask <<= 2; + for (unsigned l = 0; l != NumElts; l += 8) { + // 8 nodes per lane, but we only care about the last 4. + for (unsigned i = 0; i < 4; ++i) { + int Elt = N->getMaskElt(l+i+4); + if (Elt < 0) continue; + Elt &= 0x3; // only 2-bits. + Mask |= Elt << (i * 2); + } } + return Mask; } /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) { + EVT VT = N->getValueType(0); + + assert((VT == MVT::v8i16 || VT == MVT::v16i16) && + "Unsupported vector type for PSHUFHW"); + + unsigned NumElts = VT.getVectorNumElements(); + unsigned Mask = 0; - // 8 nodes, but we only care about the first 4. - for (int i = 3; i >= 0; --i) { - int Val = N->getMaskElt(i); - if (Val >= 0) - Mask |= Val; - if (i != 0) - Mask <<= 2; + for (unsigned l = 0; l != NumElts; l += 8) { + // 8 nodes per lane, but we only care about the first 4. 
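Aside (illustration, not part of the patch): the immediate builders above now loop per 8-element lane, so one 8-bit immediate describes both halves of a v16i16 (PSHUFHW/PSHUFLW under AVX2 apply it to each 128-bit lane independently). The high-half encoding, standalone:

    #include <iostream>
    #include <vector>

    // PSHUFHW immediate: in each 8-element lane, elements 4..7 contribute two
    // bits apiece; undef (-1) contributes nothing. The is*Mask checks ensure
    // all lanes agree, so ORing across lanes is sound.
    unsigned pshufhwImmediate(const std::vector<int> &Mask) {
      unsigned Imm = 0;
      for (unsigned l = 0, e = Mask.size(); l != e; l += 8)
        for (unsigned i = 0; i != 4; ++i) {
          int Elt = Mask[l + i + 4];
          if (Elt < 0) continue;
          Imm |= (Elt & 0x3) << (i * 2);
        }
      return Imm;
    }

    int main() {
      // Identity high half {4,5,6,7} encodes as 0b11100100 = 0xE4.
      std::cout << std::hex << pshufhwImmediate({0,1,2,3,4,5,6,7}) << '\n'; // e4
    }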
+ for (unsigned i = 0; i < 4; ++i) { + int Elt = N->getMaskElt(l+i); + if (Elt < 0) continue; + Elt &= 0x3; // only 2-bits + Mask |= Elt << (i * 2); + } } + return Mask; } @@ -4017,13 +4102,14 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, SmallVector<int, 8> MaskVec; for (unsigned i = 0; i != NumElems; ++i) { - int idx = SVOp->getMaskElt(i); - if (idx < 0) - MaskVec.push_back(idx); - else if (idx < (int)NumElems) - MaskVec.push_back(idx + NumElems); - else - MaskVec.push_back(idx - NumElems); + int Idx = SVOp->getMaskElt(i); + if (Idx >= 0) { + if (Idx < (int)NumElems) + Idx += NumElems; + else + Idx -= NumElems; + } + MaskVec.push_back(Idx); } return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), SVOp->getOperand(0), &MaskVec[0]); @@ -4108,7 +4194,7 @@ static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, for (unsigned i = 0, e = NumElems/2; i != e; ++i) if (!isUndefOrEqual(Mask[i], i)) return false; - for (unsigned i = NumElems/2; i != NumElems; ++i) + for (unsigned i = NumElems/2, e = NumElems; i != e; ++i) if (!isUndefOrEqual(Mask[i], i+NumElems)) return false; return true; @@ -4160,11 +4246,12 @@ static bool isZeroShuffle(ShuffleVectorSDNode *N) { static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, SelectionDAG &DAG, DebugLoc dl) { assert(VT.isVector() && "Expected a vector type"); + unsigned Size = VT.getSizeInBits(); // Always build SSE zero vectors as <4 x i32> bitcasted // to their dest type. This ensures they get CSE'd. SDValue Vec; - if (VT.getSizeInBits() == 128) { // SSE + if (Size == 128) { // SSE if (Subtarget->hasSSE2()) { // SSE2 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); @@ -4172,7 +4259,7 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); } - } else if (VT.getSizeInBits() == 256) { // AVX + } else if (Size == 256) { // AVX if (Subtarget->hasAVX2()) { // AVX2 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; @@ -4184,7 +4271,9 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); } - } + } else + llvm_unreachable("Unexpected vector type"); + return DAG.getNode(ISD::BITCAST, dl, VT, Vec); } @@ -4195,25 +4284,22 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG, DebugLoc dl) { assert(VT.isVector() && "Expected a vector type"); - assert((VT.is128BitVector() || VT.is256BitVector()) - && "Expected a 128-bit or 256-bit vector type"); + unsigned Size = VT.getSizeInBits(); SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); SDValue Vec; - if (VT.getSizeInBits() == 256) { + if (Size == 256) { if (HasAVX2) { // AVX2 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8); } else { // AVX Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); - SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32), - Vec, DAG.getConstant(0, MVT::i32), DAG, dl); - Vec = Insert128BitVector(InsV, Vec, - DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl); + Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl); } - } else { + 
} else if (Size == 128) { Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); - } + } else + llvm_unreachable("Unexpected vector type"); return DAG.getNode(ISD::BITCAST, dl, VT, Vec); } @@ -4256,9 +4342,8 @@ static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); - unsigned Half = NumElems/2; SmallVector<int, 8> Mask; - for (unsigned i = 0; i != Half; ++i) { + for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) { Mask.push_back(i + Half); Mask.push_back(i + NumElems + Half); } @@ -4290,15 +4375,14 @@ static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) { static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) { EVT VT = V.getValueType(); DebugLoc dl = V.getDebugLoc(); - assert((VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256) - && "Vector size not supported"); + unsigned Size = VT.getSizeInBits(); - if (VT.getSizeInBits() == 128) { + if (Size == 128) { V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V); int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32), &SplatMask[0]); - } else { + } else if (Size == 256) { // To use VPERMILPS to splat scalars, the second half of indicies must // refer to the higher part, which is a duplication of the lower one, // because VPERMILPS can only handle in-lane permutations. @@ -4308,7 +4392,8 @@ static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) { V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V); V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32), &SplatMask[0]); - } + } else + llvm_unreachable("Vector size not supported"); return DAG.getNode(ISD::BITCAST, dl, VT, V); } @@ -4329,9 +4414,8 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { // Extract the 128-bit part containing the splat element and update // the splat element index when it refers to the higher register. if (Size == 256) { - unsigned Idx = (EltNo >= NumElems/2) ? NumElems/2 : 0; - V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl); - if (Idx > 0) + V1 = Extract128BitVector(V1, EltNo, DAG, dl); + if (EltNo >= NumElems/2) EltNo -= NumElems/2; } @@ -4347,10 +4431,7 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { // into the low and high part. This is necessary because we want // to use VPERM* to shuffle the vectors if (Size == 256) { - SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), V1, - DAG.getConstant(0, MVT::i32), DAG, dl); - V1 = Insert128BitVector(InsV, V1, - DAG.getConstant(NumElems/2, MVT::i32), DAG, dl); + V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1); } return getLegalSplat(DAG, V1, EltNo); @@ -4378,7 +4459,7 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the /// target specific opcode. Returns true if the Mask could be calculated. /// Sets IsUnary to true if only uses one source. 
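Aside (illustration, not part of the patch): PromoteSplat above now hands the raw element index to Extract128BitVector (which normalizes it to a chunk boundary, per the earlier sketch) and then rebases the index into the extracted half. The rebasing by itself:

    #include <iostream>
    #include <utility>

    // For a 256-bit splat source, pick the 128-bit half holding EltNo and
    // rebase EltNo into it: {half (0 or 1), index within that half}.
    std::pair<unsigned, unsigned> splitSplatIndex(unsigned EltNo,
                                                  unsigned NumElems) {
      unsigned Half = EltNo >= NumElems / 2;
      if (Half)
        EltNo -= NumElems / 2;
      return {Half, EltNo};
    }

    int main() {
      auto R = splitSplatIndex(6, 8);  // v8f32 element 6 -> high half, index 2
      std::cout << R.first << ' ' << R.second << '\n'; // 1 2
    }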
-static bool getTargetShuffleMask(SDNode *N, EVT VT, +static bool getTargetShuffleMask(SDNode *N, MVT VT, SmallVectorImpl<int> &Mask, bool &IsUnary) { unsigned NumElems = VT.getVectorNumElements(); SDValue ImmN; @@ -4409,12 +4490,17 @@ static bool getTargetShuffleMask(SDNode *N, EVT VT, break; case X86ISD::PSHUFHW: ImmN = N->getOperand(N->getNumOperands()-1); - DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::PSHUFLW: ImmN = N->getOperand(N->getNumOperands()-1); - DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = true; + break; + case X86ISD::VPERMI: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::MOVSS: @@ -4474,20 +4560,21 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, // Recurse into target specific vector shuffles to find scalars. if (isTargetShuffle(Opcode)) { - unsigned NumElems = VT.getVectorNumElements(); + MVT ShufVT = V.getValueType().getSimpleVT(); + unsigned NumElems = ShufVT.getVectorNumElements(); SmallVector<int, 16> ShuffleMask; SDValue ImmN; bool IsUnary; - if (!getTargetShuffleMask(N, VT, ShuffleMask, IsUnary)) + if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary)) return SDValue(); int Elt = ShuffleMask[Index]; if (Elt < 0) - return DAG.getUNDEF(VT.getVectorElementType()); + return DAG.getUNDEF(ShufVT.getVectorElementType()); SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0) - : N->getOperand(1); + : N->getOperand(1); return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } @@ -4795,7 +4882,7 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); int EltNo = (Offset - StartOffset) >> 2; - int NumElems = VT.getVectorNumElements(); + unsigned NumElems = VT.getVectorNumElements(); EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, @@ -4803,7 +4890,7 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, false, false, false, 0); SmallVector<int, 8> Mask; - for (int i = 0; i < NumElems; ++i) + for (unsigned i = 0; i != NumElems; ++i) Mask.push_back(EltNo); return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]); @@ -4867,8 +4954,9 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, LDBase->getPointerInfo(), LDBase->isVolatile(), LDBase->isNonTemporal(), LDBase->isInvariant(), LDBase->getAlignment()); - } else if (NumElems == 4 && LastLoadedElt == 1 && - DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { + } + if (NumElems == 4 && LastLoadedElt == 1 && + DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; SDValue ResNode = @@ -4897,6 +4985,9 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for broadcast."); + SDValue Ld; bool ConstSplatVal; @@ -4931,8 +5022,17 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { return SDValue(); SDValue Sc = 
Op.getOperand(0); - if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR) - return SDValue(); + if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR && + Sc.getOpcode() != ISD::BUILD_VECTOR) { + + if (!Subtarget->hasAVX2()) + return SDValue(); + + // Use the register form of the broadcast instruction available on AVX2. + if (VT.is256BitVector()) + Sc = Extract128BitVector(Sc, 0, DAG, dl); + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc); + } Ld = Sc.getOperand(0); ConstSplatVal = (Ld.getOpcode() == ISD::Constant || @@ -4948,7 +5048,6 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { } bool Is256 = VT.getSizeInBits() == 256; - bool Is128 = VT.getSizeInBits() == 128; // Handle broadcasting a single constant scalar from the constant pool // into a vector. On Sandybridge it is still better to load a constant vector // from the constant pool and not to broadcast it from a scalar. @@ -4958,9 +5057,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { assert(!CVT.isVector() && "Must not broadcast a vector type"); unsigned ScalarSize = CVT.getSizeInBits(); - if ((Is256 && (ScalarSize == 32 || ScalarSize == 64)) || - (Is128 && (ScalarSize == 32))) { - + if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) { const Constant *C = 0; if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) C = CI->getConstantIntValue(); @@ -4972,40 +5069,32 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { SDValue CP = DAG.getConstantPool(C, getPointerTy()); unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP, - MachinePointerInfo::getConstantPool(), - false, false, false, Alignment); + MachinePointerInfo::getConstantPool(), + false, false, false, Alignment); return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } } - // The scalar source must be a normal load. - if (!ISD::isNormalLoad(Ld.getNode())) - return SDValue(); - - // Reject loads that have uses of the chain result - if (Ld->hasAnyUseOfValue(1)) - return SDValue(); - + bool IsLoad = ISD::isNormalLoad(Ld.getNode()); unsigned ScalarSize = Ld.getValueType().getSizeInBits(); - // VBroadcast to YMM - if (Is256 && (ScalarSize == 32 || ScalarSize == 64)) + // Handle AVX2 in-register broadcasts. + if (!IsLoad && Subtarget->hasAVX2() && + (ScalarSize == 32 || (Is256 && ScalarSize == 64))) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); - // VBroadcast to XMM - if (Is128 && (ScalarSize == 32)) + // The scalar source must be a normal load.
+ if (!IsLoad) + return SDValue(); + + if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The integer check is needed for the 64-bit into 128-bit so it doesn't match - // double since there is vbroadcastsd xmm + // double since there is no vbroadcastsd xmm if (Subtarget->hasAVX2() && Ld.getValueType().isInteger()) { - // VBroadcast to YMM - if (Is256 && (ScalarSize == 8 || ScalarSize == 16)) - return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); - - // VBroadcast to XMM - if (Is128 && (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) + if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } @@ -5103,8 +5192,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { Mask.push_back(Idx); for (unsigned i = 1; i != VecElts; ++i) Mask.push_back(i); - Item = DAG.getVectorShuffle(VecVT, dl, Item, - DAG.getUNDEF(Item.getValueType()), + Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT), &Mask[0]); } return DAG.getNode(ISD::BITCAST, dl, VT, Item); @@ -5137,8 +5225,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); if (VT.getSizeInBits() == 256) { SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl); - Item = Insert128BitVector(ZeroVec, Item, DAG.getConstant(0, MVT::i32), - DAG, dl); + Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl); } else { assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); @@ -5172,7 +5259,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // Turn it into a shuffle of zero and zero-extended scalar to vector. Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG); SmallVector<int, 8> MaskVec; - for (unsigned i = 0; i < NumElems; i++) + for (unsigned i = 0; i != NumElems; ++i) MaskVec.push_back(i == Idx ? 0 : 1); return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); } @@ -5213,10 +5300,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { NumElems/2); // Recreate the wider vector with the lower and upper part. - SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Lower, - DAG.getConstant(0, MVT::i32), DAG, dl); - return Insert128BitVector(Vec, Upper, DAG.getConstant(NumElems/2, MVT::i32), - DAG, dl); + return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl); } // Let legalizer expand 2-wide build_vectors. @@ -5383,10 +5467,7 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { SDValue V2 = Op.getOperand(1); unsigned NumElems = ResVT.getVectorNumElements(); - SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, ResVT), V1, - DAG.getConstant(0, MVT::i32), DAG, dl); - return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32), - DAG, dl); + return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); } SDValue @@ -5408,75 +5489,64 @@ X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { } // Try to lower a shuffle node into a simple blend instruction. 
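The rewritten LowerVECTOR_SHUFFLEtoBlend below reduces to one per-element test on the shuffle mask. A standalone sketch of that immediate computation, under the same mask convention (entries 0..N-1 select from the first input, N..2N-1 from the second, -1 is undef):

// Returns the blend immediate, or -1 when some element would change
// position, in which case the shuffle is not expressible as a blend.
int blendImmediate(const int *Mask, unsigned NumElems) {
  int Imm = 0;
  for (unsigned i = 0; i != NumElems; ++i) {
    if (Mask[i] == (int)i || Mask[i] < 0)
      Imm |= 1 << i;                      // element i of the first input
    else if (Mask[i] != (int)(i + NumElems))
      return -1;                          // not a per-position select
  }
  return Imm;
}

For example, the v4i32 mask {0, 5, 2, 7} yields the immediate 0b0101: elements 0 and 2 keep the first input, elements 1 and 3 take the second.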
-static SDValue LowerVECTOR_SHUFFLEtoBlend(SDValue Op, +static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); DebugLoc dl = SVOp->getDebugLoc(); - EVT VT = Op.getValueType(); - EVT InVT = V1.getValueType(); - int MaskSize = VT.getVectorNumElements(); - int InSize = InVT.getVectorNumElements(); + MVT VT = SVOp->getValueType(0).getSimpleVT(); + unsigned NumElems = VT.getVectorNumElements(); if (!Subtarget->hasSSE41()) return SDValue(); - if (MaskSize != InSize) - return SDValue(); - - int ISDNo = 0; + unsigned ISDNo = 0; MVT OpTy; - switch (VT.getSimpleVT().SimpleTy) { + switch (VT.SimpleTy) { default: return SDValue(); case MVT::v8i16: - ISDNo = X86ISD::BLENDPW; - OpTy = MVT::v8i16; - break; + ISDNo = X86ISD::BLENDPW; + OpTy = MVT::v8i16; + break; case MVT::v4i32: case MVT::v4f32: - ISDNo = X86ISD::BLENDPS; - OpTy = MVT::v4f32; - break; + ISDNo = X86ISD::BLENDPS; + OpTy = MVT::v4f32; + break; case MVT::v2i64: case MVT::v2f64: - ISDNo = X86ISD::BLENDPD; - OpTy = MVT::v2f64; - break; + ISDNo = X86ISD::BLENDPD; + OpTy = MVT::v2f64; + break; case MVT::v8i32: case MVT::v8f32: - if (!Subtarget->hasAVX()) - return SDValue(); - ISDNo = X86ISD::BLENDPS; - OpTy = MVT::v8f32; - break; + if (!Subtarget->hasAVX()) + return SDValue(); + ISDNo = X86ISD::BLENDPS; + OpTy = MVT::v8f32; + break; case MVT::v4i64: case MVT::v4f64: - if (!Subtarget->hasAVX()) - return SDValue(); - ISDNo = X86ISD::BLENDPD; - OpTy = MVT::v4f64; - break; - case MVT::v16i16: - if (!Subtarget->hasAVX2()) - return SDValue(); - ISDNo = X86ISD::BLENDPW; - OpTy = MVT::v16i16; - break; + if (!Subtarget->hasAVX()) + return SDValue(); + ISDNo = X86ISD::BLENDPD; + OpTy = MVT::v4f64; + break; } assert(ISDNo && "Invalid Op Number"); unsigned MaskVals = 0; - for (int i = 0; i < MaskSize; ++i) { + for (unsigned i = 0; i != NumElems; ++i) { int EltIdx = SVOp->getMaskElt(i); - if (EltIdx == i || EltIdx == -1) + if (EltIdx == (int)i || EltIdx < 0) MaskVals |= (1<<i); - else if (EltIdx == (i + MaskSize)) + else if (EltIdx == (int)(i + NumElems)) continue; // Bit is set to zero; - else return SDValue(); + else + return SDValue(); } V1 = DAG.getNode(ISD::BITCAST, dl, OpTy, V1); @@ -5630,13 +5700,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, bool TwoInputs = V1Used && V2Used; for (unsigned i = 0; i != 8; ++i) { int EltIdx = MaskVals[i] * 2; - if (TwoInputs && (EltIdx >= 16)) { - pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); - continue; - } - pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); + int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx; + int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 
0x80 : EltIdx+1; + pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8)); } V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1); V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, @@ -5650,13 +5717,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, pshufbMask.clear(); for (unsigned i = 0; i != 8; ++i) { int EltIdx = MaskVals[i] * 2; - if (EltIdx < 16) { - pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); - continue; - } - pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); + int Idx0 = (EltIdx < 16) ? 0x80 : EltIdx - 16; + int Idx1 = (EltIdx < 16) ? 0x80 : EltIdx - 15; + pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8)); } V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2); V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, @@ -5732,10 +5796,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, int EltIdx = MaskVals[i]; if (EltIdx < 0) continue; - SDValue ExtOp = (EltIdx < 8) - ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, - DAG.getIntPtrConstant(EltIdx)) - : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, + SDValue ExtOp = (EltIdx < 8) ? + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, + DAG.getIntPtrConstant(EltIdx)) : + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, DAG.getIntPtrConstant(EltIdx - 8)); NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, DAG.getIntPtrConstant(i)); @@ -5756,21 +5820,11 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, DebugLoc dl = SVOp->getDebugLoc(); ArrayRef<int> MaskVals = SVOp->getMask(); + bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; + // If we have SSSE3, case 1 is generated when all result bytes come from // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is // present, fall back to case 3. - // FIXME: kill V2Only once shuffles are canonizalized by getNode. - bool V1Only = true; - bool V2Only = true; - for (unsigned i = 0; i < 16; ++i) { - int EltIdx = MaskVals[i]; - if (EltIdx < 0) - continue; - if (EltIdx < 16) - V2Only = false; - else - V1Only = false; - } // If SSSE3, use 1 pshufb instruction per vector with elements in the result. if (TLI.getSubtarget()->hasSSSE3()) { @@ -5782,23 +5836,16 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, // Otherwise, we have elements from both input vectors, and must zero out // elements that come from V2 in the first mask, and V1 in the second mask // so that we can OR them together. - bool TwoInputs = !(V1Only || V2Only); for (unsigned i = 0; i != 16; ++i) { int EltIdx = MaskVals[i]; - if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { - pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); - continue; - } + if (EltIdx < 0 || EltIdx >= 16) + EltIdx = 0x80; pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); } - // If all the elements are from V2, assign it to V1 and return after - // building the first pshufb. 
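The 0x80 entries used in these masks exploit PSHUFB's zeroing rule: a selector byte with its high bit set produces 0, so each input's shuffle blanks out the bytes owned by the other input and the two results can simply be OR'd together. A byte-level sketch of the two-input scheme (plain C++, not the DAG code; mask entries 0..15 pick bytes of V1, 16..31 bytes of V2, -1 is undef):

#include <cstdint>

void pshufb16(const uint8_t *Src, const uint8_t *Sel, uint8_t *Dst) {
  for (int i = 0; i != 16; ++i)
    Dst[i] = (Sel[i] & 0x80) ? 0 : Src[Sel[i] & 0x0f]; // high bit zeroes
}

void twoInputShuffle(const int *Mask, const uint8_t *V1, const uint8_t *V2,
                     uint8_t *Out) {
  uint8_t SelA[16], SelB[16], A[16], B[16];
  for (int i = 0; i != 16; ++i) {
    int E = Mask[i];
    SelA[i] = (E < 0 || E >= 16) ? 0x80 : E;      // zero V2/undef bytes
    SelB[i] = (E < 16) ? 0x80 : E - 16;           // zero V1/undef bytes
  }
  pshufb16(V1, SelA, A);
  pshufb16(V2, SelB, B);
  for (int i = 0; i != 16; ++i)
    Out[i] = A[i] | B[i];                         // merge the halves
}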
- if (V2Only) - V1 = V2; V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8, &pshufbMask[0], 16)); - if (!TwoInputs) + if (V2IsUndef) return V1; // Calculate the shuffle mask for the second input, shuffle it, and @@ -5806,11 +5853,8 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, pshufbMask.clear(); for (unsigned i = 0; i != 16; ++i) { int EltIdx = MaskVals[i]; - if (EltIdx < 16) { - pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); - continue; - } - pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); + EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16; + pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); } V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, DAG.getNode(ISD::BUILD_VECTOR, dl, @@ -5823,7 +5867,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, // the 16 different words that comprise the two doublequadword input vectors. V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); - SDValue NewV = V2Only ? V2 : V1; + SDValue NewV = V1; for (int i = 0; i != 8; ++i) { int Elt0 = MaskVals[i*2]; int Elt1 = MaskVals[i*2+1]; @@ -5833,9 +5877,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, continue; // This word of the result is already in the correct place, skip it. - if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) - continue; - if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) + if ((Elt0 == i*2) && (Elt1 == i*2+1)) continue; SDValue Elt0Src = Elt0 < 16 ? V1 : V2; @@ -5897,41 +5939,37 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, static SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, DebugLoc dl) { - EVT VT = SVOp->getValueType(0); - SDValue V1 = SVOp->getOperand(0); - SDValue V2 = SVOp->getOperand(1); + MVT VT = SVOp->getValueType(0).getSimpleVT(); unsigned NumElems = VT.getVectorNumElements(); - unsigned NewWidth = (NumElems == 4) ? 
2 : 4; - EVT NewVT; - switch (VT.getSimpleVT().SimpleTy) { + MVT NewVT; + unsigned Scale; + switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected!"); - case MVT::v4f32: NewVT = MVT::v2f64; break; - case MVT::v4i32: NewVT = MVT::v2i64; break; - case MVT::v8i16: NewVT = MVT::v4i32; break; - case MVT::v16i8: NewVT = MVT::v4i32; break; + case MVT::v4f32: NewVT = MVT::v2f64; Scale = 2; break; + case MVT::v4i32: NewVT = MVT::v2i64; Scale = 2; break; + case MVT::v8i16: NewVT = MVT::v4i32; Scale = 2; break; + case MVT::v16i8: NewVT = MVT::v4i32; Scale = 4; break; + case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break; + case MVT::v32i8: NewVT = MVT::v8i32; Scale = 4; break; } - int Scale = NumElems / NewWidth; SmallVector<int, 8> MaskVec; - for (unsigned i = 0; i < NumElems; i += Scale) { + for (unsigned i = 0; i != NumElems; i += Scale) { int StartIdx = -1; - for (int j = 0; j < Scale; ++j) { + for (unsigned j = 0; j != Scale; ++j) { int EltIdx = SVOp->getMaskElt(i+j); if (EltIdx < 0) continue; - if (StartIdx == -1) - StartIdx = EltIdx - (EltIdx % Scale); - if (EltIdx != StartIdx + j) + if (StartIdx < 0) + StartIdx = (EltIdx / Scale); + if (EltIdx != (int)(StartIdx*Scale + j)) return SDValue(); } - if (StartIdx == -1) - MaskVec.push_back(-1); - else - MaskVec.push_back(StartIdx / Scale); + MaskVec.push_back(StartIdx); } - V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); - V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); + SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0)); + SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1)); return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); } @@ -5974,6 +6012,11 @@ static SDValue getVZextMovL(EVT VT, EVT OpVT, /// which could not be matched by any known target specific shuffle static SDValue LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { + + SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG); + if (NewOp.getNode()) + return NewOp; + EVT VT = SVOp->getValueType(0); unsigned NumElems = VT.getVectorNumElements(); @@ -5982,14 +6025,15 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { DebugLoc dl = SVOp->getDebugLoc(); MVT EltVT = VT.getVectorElementType().getSimpleVT(); EVT NVT = MVT::getVectorVT(EltVT, NumLaneElems); - SDValue Shufs[2]; + SDValue Output[2]; SmallVector<int, 16> Mask; for (unsigned l = 0; l < 2; ++l) { // Build a shuffle mask for the output, discovering on the fly which // input vectors to use as shuffle operands (recorded in InputUsed). // If building a suitable shuffle vector proves too hard, then bail - // out with useBuildVector set. + // out with UseBuildVector set. + bool UseBuildVector = false; int InputUsed[2] = { -1, -1 }; // Not yet discovered. unsigned LaneStart = l * NumLaneElems; for (unsigned i = 0; i != NumLaneElems; ++i) { @@ -6021,38 +6065,61 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { } if (OpNo >= array_lengthof(InputUsed)) { - // More than two input vectors used! Give up. - return SDValue(); + // More than two input vectors used! Give up on trying to create a + // shuffle vector. Insert all elements into a BUILD_VECTOR instead. + UseBuildVector = true; + break; } // Add the mask index for the new shuffle vector. Mask.push_back(Idx + OpNo * NumLaneElems); } - if (InputUsed[0] < 0) { + if (UseBuildVector) { + SmallVector<SDValue, 16> SVOps; + for (unsigned i = 0; i != NumLaneElems; ++i) { + // The mask element. This indexes into the input.
+ int Idx = SVOp->getMaskElt(i+LaneStart); + if (Idx < 0) { + SVOps.push_back(DAG.getUNDEF(EltVT)); + continue; + } + + // The input vector this mask element indexes into. + int Input = Idx / NumElems; + + // Turn the index into an offset from the start of the input vector. + Idx -= Input * NumElems; + + // Extract the vector element by hand. + SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, + SVOp->getOperand(Input), + DAG.getIntPtrConstant(Idx))); + } + + // Construct the output using a BUILD_VECTOR. + Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0], + SVOps.size()); + } else if (InputUsed[0] < 0) { // No input vectors were used! The result is undefined. - Shufs[l] = DAG.getUNDEF(NVT); + Output[l] = DAG.getUNDEF(NVT); } else { SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2), - DAG.getConstant((InputUsed[0] % 2) * NumLaneElems, MVT::i32), - DAG, dl); + (InputUsed[0] % 2) * NumLaneElems, + DAG, dl); // If only one input was used, use an undefined vector for the other. SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) : Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2), - DAG.getConstant((InputUsed[1] % 2) * NumLaneElems, MVT::i32), - DAG, dl); + (InputUsed[1] % 2) * NumLaneElems, DAG, dl); // At least one input vector was used. Create a new shuffle vector. - Shufs[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]); + Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]); } Mask.clear(); } // Concatenate the result back - SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Shufs[0], - DAG.getConstant(0, MVT::i32), DAG, dl); - return Insert128BitVector(V, Shufs[1],DAG.getConstant(NumLaneElems, MVT::i32), - DAG, dl); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]); } /// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with @@ -6108,7 +6175,9 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { } return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); - } else if (NumLo == 3 || NumHi == 3) { + } + + if (NumLo == 3 || NumHi == 3) { // Otherwise, we must have three elements from one vector, call it X, and // one element from the other, call it Y. First, use a shufps to build an // intermediate vector with the one element from Y and the element from X @@ -6144,17 +6213,17 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { Mask1[2] = HiIndex & 1 ? 6 : 4; Mask1[3] = HiIndex & 1 ? 4 : 6; return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); - } else { - Mask1[0] = HiIndex & 1 ? 2 : 0; - Mask1[1] = HiIndex & 1 ? 0 : 2; - Mask1[2] = PermMask[2]; - Mask1[3] = PermMask[3]; - if (Mask1[2] >= 0) - Mask1[2] += 4; - if (Mask1[3] >= 0) - Mask1[3] += 4; - return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); } + + Mask1[0] = HiIndex & 1 ? 2 : 0; + Mask1[1] = HiIndex & 1 ? 0 : 2; + Mask1[2] = PermMask[2]; + Mask1[3] = PermMask[3]; + if (Mask1[2] >= 0) + Mask1[2] += 4; + if (Mask1[3] >= 0) + Mask1[3] += 4; + return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); } // Break it into (shuffle shuffle_hi, shuffle_lo). @@ -6303,7 +6372,7 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); if (NumElems == 4) - // If we don't care about the second element, procede to use movss. + // If we don't care about the second element, proceed to use movss. 
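Returning to LowerVECTOR_SHUFFLE_256 above: its per-lane loop works by discovering which 128-bit input halves each output lane reads, bailing out to the new BUILD_VECTOR path once a lane needs more than two of them. A standalone sketch of that discovery (mask entries index the concatenation of the two source vectors; halves 0..3 are V1.lo, V1.hi, V2.lo, V2.hi):

#include <vector>

// Fills Used[2] with the halves the lane reads (-1 = unused); returns
// false when a third distinct half appears and a shuffle cannot be built.
bool laneInputs(const std::vector<int> &Mask, unsigned Lane,
                unsigned LaneElems, int Used[2]) {
  Used[0] = Used[1] = -1;
  for (unsigned i = 0; i != LaneElems; ++i) {
    int Idx = Mask[Lane * LaneElems + i];
    if (Idx < 0)
      continue;                        // undef constrains nothing
    int Half = Idx / (int)LaneElems;   // which 128-bit half is read
    if (Used[0] < 0 || Used[0] == Half)
      Used[0] = Half;
    else if (Used[1] < 0 || Used[1] == Half)
      Used[1] = Half;
    else
      return false;                    // more than two inputs used
  }
  return true;
}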
if (SVOp->getMaskElt(1) != -1) return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); } @@ -6361,7 +6430,8 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { // If the shuffle can be profitably rewritten as a narrower shuffle, then // do it! - if (VT == MVT::v8i16 || VT == MVT::v16i8) { + if (VT == MVT::v8i16 || VT == MVT::v16i8 || + VT == MVT::v16i16 || VT == MVT::v32i8) { SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); if (NewOp.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); @@ -6565,11 +6635,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // new vector_shuffle with the corrected mask. SmallVector<int, 8> NewMask(M.begin(), M.end()); NormalizeMask(NewMask, NumElems); - if (isUNPCKLMask(NewMask, VT, HasAVX2, true)) { + if (isUNPCKLMask(NewMask, VT, HasAVX2, true)) return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); - } else if (isUNPCKHMask(NewMask, VT, HasAVX2, true)) { + if (isUNPCKHMask(NewMask, VT, HasAVX2, true)) return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); - } } if (Commuted) { @@ -6606,12 +6675,12 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); } - if (isPSHUFHWMask(M, VT)) + if (isPSHUFHWMask(M, VT, HasAVX2)) return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, getShufflePSHUFHWImmediate(SVOp), DAG); - if (isPSHUFLWMask(M, VT)) + if (isPSHUFLWMask(M, VT, HasAVX2)) return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, getShufflePSHUFLWImmediate(SVOp), DAG); @@ -6648,7 +6717,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, V2, getShuffleVPERM2X128Immediate(SVOp), DAG); - SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(Op, Subtarget, DAG); + SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG); if (BlendOp.getNode()) return BlendOp; @@ -6715,7 +6784,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, DAG.getValueType(VT)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); - } else if (VT.getSizeInBits() == 16) { + } + + if (VT.getSizeInBits() == 16) { unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); // If Idx is 0, it's cheaper to do a move instead of a pextrw. if (Idx == 0) @@ -6730,7 +6801,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, DAG.getValueType(VT)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); - } else if (VT == MVT::f32) { + } + + if (VT == MVT::f32) { // EXTRACTPS outputs to a GPR32 register which will require a movd to copy // the result back to FR32 register. It's only worth matching if the // result has a single use which is a store or a bitcast to i32. And in @@ -6750,7 +6823,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, Op.getOperand(0)), Op.getOperand(1)); return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); - } else if (VT == MVT::i32 || VT == MVT::i64) { + } + + if (VT == MVT::i32 || VT == MVT::i64) { // ExtractPS/pextrq works with constant index. if (isa<ConstantSDNode>(Op.getOperand(1))) return Op; @@ -6777,12 +6852,12 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); // Get the 128-bit vector.
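The 256-bit element access paths below now hand Extract128BitVector the plain element index and fix the index up afterwards; the arithmetic amounts to this (a sketch, with NumElems the 256-bit vector's element count):

struct HalfIndex { unsigned Half, Idx; };

// The element lives in the high half iff IdxVal >= NumElems/2; its index
// within that half is the remainder.
HalfIndex splitIndex(unsigned IdxVal, unsigned NumElems) {
  unsigned Half = IdxVal >= NumElems / 2 ? 1u : 0u;
  return { Half, IdxVal - Half * (NumElems / 2) };
}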
- bool Upper = IdxVal >= NumElems/2; - Vec = Extract128BitVector(Vec, - DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32), DAG, dl); + Vec = Extract128BitVector(Vec, IdxVal, DAG, dl); + if (IdxVal >= NumElems/2) + IdxVal -= NumElems/2; return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, - Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : Idx); + DAG.getConstant(IdxVal, MVT::i32)); } assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length"); @@ -6812,7 +6887,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, DAG.getValueType(VT)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); - } else if (VT.getSizeInBits() == 32) { + } + + if (VT.getSizeInBits() == 32) { unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); if (Idx == 0) return Op; @@ -6824,7 +6901,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, DAG.getUNDEF(VVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, DAG.getIntPtrConstant(0)); - } else if (VT.getSizeInBits() == 64) { + } + + if (VT.getSizeInBits() == 64) { // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught // to match extract_elt for f64. @@ -6877,7 +6956,9 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, if (N2.getValueType() != MVT::i32) N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); return DAG.getNode(Opc, dl, VT, N0, N1, N2); - } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { + } + + if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { // Bits [7:6] of the constant are the source select. This will always be // zero here. The DAG Combiner may combine an extract_elt index into these // bits. For example (insert (extract, 3), 2) could be matched by putting @@ -6890,8 +6971,9 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, // Create this as a scalar to vector.. N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); - } else if ((EltVT == MVT::i32 || EltVT == MVT::i64) && - isa<ConstantSDNode>(N2)) { + } + + if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) { // PINSR* works with constant index. return Op; } @@ -6917,16 +6999,15 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { // Get the desired 128-bit vector half. unsigned NumElems = VT.getVectorNumElements(); unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue(); - bool Upper = IdxVal >= NumElems/2; - SDValue Ins128Idx = DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32); - SDValue V = Extract128BitVector(N0, Ins128Idx, DAG, dl); + SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl); // Insert the element into the desired half. - V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, - N1, Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : N2); + bool Upper = IdxVal >= NumElems/2; + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, + DAG.getConstant(Upper ? 
IdxVal-NumElems/2 : IdxVal, MVT::i32)); // Insert the changed part back to the 256-bit vector - return Insert128BitVector(N0, V, Ins128Idx, DAG, dl); + return Insert128BitVector(N0, V, IdxVal, DAG, dl); } if (Subtarget->hasSSE41()) @@ -6964,19 +7045,16 @@ X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); // Insert the 128-bit vector. - return Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, OpVT), Op, - DAG.getConstant(0, MVT::i32), - DAG, dl); + return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); } - if (Op.getValueType() == MVT::v1i64 && + if (OpVT == MVT::v1i64 && Op.getOperand(0).getValueType() == MVT::i64) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); - assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 && - "Expected an SSE type!"); - return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), + assert(OpVT.getSizeInBits() == 128 && "Expected an SSE type!"); + return DAG.getNode(ISD::BITCAST, dl, OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt)); } @@ -6990,9 +7068,11 @@ X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { SDValue Vec = Op.getNode()->getOperand(0); SDValue Idx = Op.getNode()->getOperand(1); - if (Op.getNode()->getValueType(0).getSizeInBits() == 128 - && Vec.getNode()->getValueType(0).getSizeInBits() == 256) { - return Extract128BitVector(Vec, Idx, DAG, dl); + if (Op.getNode()->getValueType(0).getSizeInBits() == 128 && + Vec.getNode()->getValueType(0).getSizeInBits() == 256 && + isa<ConstantSDNode>(Idx)) { + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + return Extract128BitVector(Vec, IdxVal, DAG, dl); } } return SDValue(); @@ -7009,9 +7089,11 @@ X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { SDValue SubVec = Op.getNode()->getOperand(1); SDValue Idx = Op.getNode()->getOperand(2); - if (Op.getNode()->getValueType(0).getSizeInBits() == 256 - && SubVec.getNode()->getValueType(0).getSizeInBits() == 128) { - return Insert128BitVector(Vec, SubVec, Idx, DAG, dl); + if (Op.getNode()->getValueType(0).getSizeInBits() == 256 && + SubVec.getNode()->getValueType(0).getSizeInBits() == 128 && + isa<ConstantSDNode>(Idx)) { + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl); } } return SDValue(); @@ -7220,7 +7302,7 @@ X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { static SDValue GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, - unsigned char OperandFlags) { + unsigned char OperandFlags, bool LocalDynamic = false) { MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); DebugLoc dl = GA->getDebugLoc(); @@ -7228,12 +7310,16 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, GA->getValueType(0), GA->getOffset(), OperandFlags); + + X86ISD::NodeType CallType = LocalDynamic ? 
X86ISD::TLSBASEADDR + : X86ISD::TLSADDR; + if (InFlag) { SDValue Ops[] = { Chain, TGA, *InFlag }; - Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); + Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 3); } else { SDValue Ops[] = { Chain, TGA }; - Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); + Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 2); } // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. @@ -7265,11 +7351,49 @@ LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, X86::RAX, X86II::MO_TLSGD); } -// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or -// "local exec" model. +static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, + SelectionDAG &DAG, + const EVT PtrVT, + bool is64Bit) { + DebugLoc dl = GA->getDebugLoc(); + + // Get the start address of the TLS block for this module. + X86MachineFunctionInfo* MFI = DAG.getMachineFunction() + .getInfo<X86MachineFunctionInfo>(); + MFI->incNumLocalDynamicTLSAccesses(); + + SDValue Base; + if (is64Bit) { + Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX, + X86II::MO_TLSLD, /*LocalDynamic=*/true); + } else { + SDValue InFlag; + SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, + DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), InFlag); + InFlag = Chain.getValue(1); + Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, + X86II::MO_TLSLDM, /*LocalDynamic=*/true); + } + + // Note: the CleanupLocalDynamicTLSPass will remove redundant computations + // of Base. + + // Build x@dtpoff. + unsigned char OperandFlags = X86II::MO_DTPOFF; + unsigned WrapperKind = X86ISD::Wrapper; + SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, + GA->getValueType(0), + GA->getOffset(), OperandFlags); + SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); + + // Add x@dtpoff with the base. + return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base); +} + +// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model. static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, - bool is64Bit) { + bool is64Bit, bool isPIC) { DebugLoc dl = GA->getDebugLoc(); // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). @@ -7287,25 +7411,36 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, unsigned WrapperKind = X86ISD::Wrapper; if (model == TLSModel::LocalExec) { OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF; - } else if (is64Bit) { - assert(model == TLSModel::InitialExec); - OperandFlags = X86II::MO_GOTTPOFF; - WrapperKind = X86ISD::WrapperRIP; + } else if (model == TLSModel::InitialExec) { + if (is64Bit) { + OperandFlags = X86II::MO_GOTTPOFF; + WrapperKind = X86ISD::WrapperRIP; + } else { + OperandFlags = isPIC ? 
X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF; + } } else { - assert(model == TLSModel::InitialExec); - OperandFlags = X86II::MO_INDNTPOFF; + llvm_unreachable("Unexpected model"); } - // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial - // exec) + // emit "addl x@ntpoff,%eax" (local exec) + // or "addl x@indntpoff,%eax" (initial exec) + // or "addl x@gotntpoff(%ebx),%eax" (initial exec, 32-bit pic) SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); - if (model == TLSModel::InitialExec) + if (model == TLSModel::InitialExec) { + if (isPIC && !is64Bit) { + Offset = DAG.getNode(ISD::ADD, dl, PtrVT, + DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), + Offset); + } + Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(), false, false, false, + 0); + } // The address of the thread local variable is the add of the thread // pointer with the offset of the variable. @@ -7319,29 +7454,26 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { const GlobalValue *GV = GA->getGlobal(); if (Subtarget->isTargetELF()) { - // TODO: implement the "local dynamic" model - // TODO: implement the "initial exec"model for pic executables - - // If GV is an alias then use the aliasee for determining - // thread-localness. - if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) - GV = GA->resolveAliasedGlobal(false); - TLSModel::Model model = getTargetMachine().getTLSModel(GV); switch (model) { case TLSModel::GeneralDynamic: - case TLSModel::LocalDynamic: // not implemented if (Subtarget->is64Bit()) return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); - + case TLSModel::LocalDynamic: + return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(), + Subtarget->is64Bit()); case TLSModel::InitialExec: case TLSModel::LocalExec: return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, - Subtarget->is64Bit()); + Subtarget->is64Bit(), + getTargetMachine().getRelocationModel() == Reloc::PIC_); } - } else if (Subtarget->isTargetDarwin()) { + llvm_unreachable("Unknown TLS model."); + } + + if (Subtarget->isTargetDarwin()) { // Darwin only has one model of TLS. Lower to that. unsigned char OpFlag = 0; unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? @@ -7384,7 +7516,9 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(), Chain.getValue(1)); - } else if (Subtarget->isTargetWindows()) { + } + + if (Subtarget->isTargetWindows()) { // Just use the implicit TLS architecture // Need to generate something similar to: // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage @@ -7430,7 +7564,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { false, false, false, 0); SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()), - getPointerTy()); + getPointerTy()); IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale); SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX); @@ -7694,12 +7828,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, // Handle final rounding.
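Summarizing the ELF TLS lowering above: the relocation flags now follow a small decision table over (model, 64-bit, PIC). A standalone sketch of that table (flag names as in the diff; the general and local dynamic models additionally issue a TLSADDR or TLSBASEADDR call, and local dynamic then adds an MO_DTPOFF offset to the returned base):

enum Model { GeneralDynamic, LocalDynamic, InitialExec, LocalExec };

const char *tlsOperandFlag(Model M, bool Is64Bit, bool IsPIC) {
  switch (M) {
  case GeneralDynamic: return "MO_TLSGD";
  case LocalDynamic:   return Is64Bit ? "MO_TLSLD" : "MO_TLSLDM";
  case InitialExec:    return Is64Bit ? "MO_GOTTPOFF"
                                      : IsPIC ? "MO_GOTNTPOFF"
                                              : "MO_INDNTPOFF";
  case LocalExec:      return Is64Bit ? "MO_TPOFF" : "MO_NTPOFF";
  }
  return "unknown";
}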
EVT DestVT = Op.getValueType(); - if (DestVT.bitsLT(MVT::f64)) { + if (DestVT.bitsLT(MVT::f64)) return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, DAG.getIntPtrConstant(0)); - } else if (DestVT.bitsGT(MVT::f64)) { + if (DestVT.bitsGT(MVT::f64)) return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); - } // Handle final rounding. return Sub; @@ -7720,10 +7853,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, EVT DstVT = Op.getValueType(); if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) return LowerUINT_TO_FP_i64(Op, DAG); - else if (SrcVT == MVT::i32 && X86ScalarSSEf64) + if (SrcVT == MVT::i32 && X86ScalarSSEf64) return LowerUINT_TO_FP_i32(Op, DAG); - else if (Subtarget->is64Bit() && - SrcVT == MVT::i64 && DstVT == MVT::f32) + if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32) return SDValue(); // Make a 64-bit buffer, and use it to build an FILD. @@ -7900,9 +8032,9 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), FIST, StackSlot, MachinePointerInfo(), false, false, false, 0); - else - // The node is the result. - return FIST; + + // The node is the result. + return FIST; } SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, @@ -7917,9 +8049,9 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), FIST, StackSlot, MachinePointerInfo(), false, false, false, 0); - else - // The node is the result. - return FIST; + + // The node is the result. + return FIST; } SDValue X86TargetLowering::LowerFABS(SDValue Op, @@ -7969,12 +8101,12 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { MVT XORVT = VT.getSizeInBits() == 128 ? MVT::v2i64 : MVT::v4i64; return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(ISD::XOR, dl, XORVT, - DAG.getNode(ISD::BITCAST, dl, XORVT, - Op.getOperand(0)), - DAG.getNode(ISD::BITCAST, dl, XORVT, Mask))); - } else { - return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); + DAG.getNode(ISD::BITCAST, dl, XORVT, + Op.getOperand(0)), + DAG.getNode(ISD::BITCAST, dl, XORVT, Mask))); } + + return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); } SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { @@ -8173,7 +8305,13 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, // Otherwise use a regular EFLAGS-setting instruction. switch (Op.getNode()->getOpcode()) { default: llvm_unreachable("unexpected operator!"); - case ISD::SUB: Opcode = X86ISD::SUB; break; + case ISD::SUB: + // If the only use of SUB is EFLAGS, use CMP instead. + if (Op.hasOneUse()) + Opcode = X86ISD::CMP; + else + Opcode = X86ISD::SUB; + break; case ISD::OR: Opcode = X86ISD::OR; break; case ISD::XOR: Opcode = X86ISD::XOR; break; case ISD::AND: Opcode = X86ISD::AND; break; @@ -8199,6 +8337,14 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, Op.getValueType())); + if (Opcode == X86ISD::CMP) { + SDValue New = DAG.getNode(Opcode, dl, MVT::i32, Op.getOperand(0), + Op.getOperand(1)); + // We can't replace usage of SUB with CMP. + // The SUB node will be removed later because there is no use of it. 
+ return SDValue(New.getNode(), 0); + } + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); SmallVector<SDValue, 4> Ops; for (unsigned i = 0; i != NumOperands; ++i) @@ -8221,6 +8367,30 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); } +/// Convert a comparison if required by the subtarget. +SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, + SelectionDAG &DAG) const { + // If the subtarget does not support the FUCOMI instruction, floating-point + // comparisons have to be converted. + if (Subtarget->hasCMov() || + Cmp.getOpcode() != X86ISD::CMP || + !Cmp.getOperand(0).getValueType().isFloatingPoint() || + !Cmp.getOperand(1).getValueType().isFloatingPoint()) + return Cmp; + + // The instruction selector will select an FUCOM instruction instead of + // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence + // build an SDNode sequence that transfers the result from FPSW into EFLAGS: + // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8)))) + DebugLoc dl = Cmp.getDebugLoc(); + SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp); + SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW); + SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, + DAG.getConstant(8, MVT::i8)); + SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); + return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); +} + /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node /// if it's possible. SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, @@ -8342,6 +8512,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return SDValue(); SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); + EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(X86CC, MVT::i8), EFLAGS); } @@ -8354,21 +8525,19 @@ static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { assert(VT.getSizeInBits() == 256 && Op.getOpcode() == ISD::SETCC && "Unsupported value type for operation"); - int NumElems = VT.getVectorNumElements(); + unsigned NumElems = VT.getVectorNumElements(); DebugLoc dl = Op.getDebugLoc(); SDValue CC = Op.getOperand(2); - SDValue Idx0 = DAG.getConstant(0, MVT::i32); - SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32); // Extract the LHS vectors SDValue LHS = Op.getOperand(0); - SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl); - SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl); + SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); + SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); // Extract the RHS vectors SDValue RHS = Op.getOperand(1); - SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl); - SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl); + SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); + SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); // Issue the operation on the smaller types and concatenate the result back MVT EltVT = VT.getVectorElementType().getSimpleVT(); @@ -8438,7 +8607,8 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { EQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); - } else if (SetCCOpcode == ISD::SETONE) { + } + if (SetCCOpcode == ISD::SETONE) { SDValue ORD, NEQ; ORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 
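Why the FNSTSW/SAHF sequence built by ConvertCmpIfNecessary above is equivalent: FUCOM leaves its result in the x87 status-word condition bits C0, C2 and C3 (bits 8, 10 and 14), shifting the status word right by 8 moves them into AH, and SAHF copies AH into the low EFLAGS byte, so C0 lands in CF, C2 in PF and C3 in ZF, the same flags FUCOMI would have set directly. A bit-level sketch:

#include <cassert>
#include <cstdint>

// Models "fnstsw %ax; shr $8; sahf": the returned byte is what SAHF loads
// into the low EFLAGS byte.
uint8_t eflagsFromFPSW(uint16_t FPSW) {
  return (uint8_t)(FPSW >> 8);
}

int main() {
  assert(eflagsFromFPSW(1u << 8)  & 0x01); // C0 (less-than) -> CF, bit 0
  assert(eflagsFromFPSW(1u << 10) & 0x04); // C2 (unordered) -> PF, bit 2
  assert(eflagsFromFPSW(1u << 14) & 0x40); // C3 (equal)     -> ZF, bit 6
  return 0;
}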
@@ -8511,7 +8681,8 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { // isX86LogicalCmp - Return true if opcode is a X86 logical comparison. static bool isX86LogicalCmp(SDValue Op) { unsigned Opc = Op.getNode()->getOpcode(); - if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) + if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI || + Opc == X86ISD::SAHF) return true; if (Op.getResNo() == 1 && (Opc == X86ISD::ADD || @@ -8557,6 +8728,46 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { Cond = NewCond; } + // Handle the following cases related to max and min: + // (a > b) ? (a-b) : 0 + // (a >= b) ? (a-b) : 0 + // (b < a) ? (a-b) : 0 + // (b <= a) ? (a-b) : 0 + // Comparison is removed to use EFLAGS from SUB. + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op2)) + if (Cond.getOpcode() == X86ISD::SETCC && + Cond.getOperand(1).getOpcode() == X86ISD::CMP && + (Op1.getOpcode() == ISD::SUB || Op1.getOpcode() == X86ISD::SUB) && + C->getAPIntValue() == 0) { + SDValue Cmp = Cond.getOperand(1); + unsigned CC = cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); + if ((DAG.isEqualTo(Op1.getOperand(0), Cmp.getOperand(0)) && + DAG.isEqualTo(Op1.getOperand(1), Cmp.getOperand(1)) && + (CC == X86::COND_G || CC == X86::COND_GE || + CC == X86::COND_A || CC == X86::COND_AE)) || + (DAG.isEqualTo(Op1.getOperand(0), Cmp.getOperand(1)) && + DAG.isEqualTo(Op1.getOperand(1), Cmp.getOperand(0)) && + (CC == X86::COND_L || CC == X86::COND_LE || + CC == X86::COND_B || CC == X86::COND_BE))) { + + if (Op1.getOpcode() == ISD::SUB) { + SDVTList VTs = DAG.getVTList(Op1.getValueType(), MVT::i32); + SDValue New = DAG.getNode(X86ISD::SUB, DL, VTs, + Op1.getOperand(0), Op1.getOperand(1)); + DAG.ReplaceAllUsesWith(Op1, New); + Op1 = New; + } + + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); + unsigned NewCC = (CC == X86::COND_G || CC == X86::COND_GE || + CC == X86::COND_L || + CC == X86::COND_LE) ? X86::COND_GE : X86::COND_AE; + SDValue Ops[] = { Op2, Op1, DAG.getConstant(NewCC, MVT::i8), + SDValue(Op1.getNode(), 1) }; + return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops)); + } + } + // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y @@ -8573,8 +8784,25 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Y = isAllOnes(Op2) ? Op1 : Op2; SDValue CmpOp0 = Cmp.getOperand(0); + // Apply further optimizations for special cases + // (select (x != 0), -1, 0) -> neg & sbb + // (select (x == 0), 0, -1) -> neg & sbb + if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y)) + if (YC->isNullValue() && + (isAllOnes(Op1) == (CondCode == X86::COND_NE))) { + SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); + SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, + DAG.getConstant(0, CmpOp0.getValueType()), + CmpOp0); + SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), + DAG.getConstant(X86::COND_B, MVT::i8), + SDValue(Neg.getNode(), 1)); + return Res; + } + Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); + Cmp = ConvertCmpIfNecessary(Cmp, DAG); SDValue Res = // Res = 0 or -1. DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), @@ -8681,6 +8909,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // a >= b ? -1 : 0 -> RES = setcc_carry // a >= b ? 
0 : -1 -> RES = ~setcc_carry if (Cond.getOpcode() == X86ISD::CMP) { + Cond = ConvertCmpIfNecessary(Cond, DAG); unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && @@ -8919,6 +9148,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond.getOperand(0), Cond.getOperand(1)); + Cmp = ConvertCmpIfNecessary(Cmp, DAG); CC = DAG.getConstant(X86::COND_NE, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); @@ -8948,6 +9178,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond.getOperand(0), Cond.getOperand(1)); + Cmp = ConvertCmpIfNecessary(Cmp, DAG); CC = DAG.getConstant(X86::COND_NE, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); @@ -8981,6 +9212,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { CC = DAG.getConstant(X86::COND_NE, MVT::i8); Cond = EmitTest(Cond, X86::COND_NE, DAG); } + Cond = ConvertCmpIfNecessary(Cond, DAG); return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cond); } @@ -9019,7 +9251,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, const Function *F = MF.getFunction(); for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); - I != E; I++) + I != E; ++I) if (I->hasNestAttr()) report_fatal_error("Cannot use segmented stacks with functions that " "have nested arguments."); @@ -9202,12 +9434,15 @@ static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT, assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32"); if (isa<ConstantSDNode>(ShAmt)) { + // Constant may be a TargetConstant. Use a regular constant. + uint32_t ShiftAmt = cast<ConstantSDNode>(ShAmt)->getZExtValue(); switch (Opc) { default: llvm_unreachable("Unknown target vector shift node"); case X86ISD::VSHLI: case X86ISD::VSRLI: case X86ISD::VSRAI: - return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); + return DAG.getNode(Opc, dl, VT, SrcOp, + DAG.getConstant(ShiftAmt, MVT::i32)); } } @@ -9227,7 +9462,13 @@ static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT, ShOps[2] = DAG.getUNDEF(MVT::i32); ShOps[3] = DAG.getUNDEF(MVT::i32); ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4); - ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt); + + // The return type has to be a 128-bit type with the same element + // type as the input type. 
+ MVT EltVT = VT.getVectorElementType().getSimpleVT(); + EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits()); + + ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt); return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); } @@ -9337,196 +9578,6 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const DAG.getConstant(X86CC, MVT::i8), Cond); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } - // XOP comparison intrinsics - case Intrinsic::x86_xop_vpcomltb: - case Intrinsic::x86_xop_vpcomltw: - case Intrinsic::x86_xop_vpcomltd: - case Intrinsic::x86_xop_vpcomltq: - case Intrinsic::x86_xop_vpcomltub: - case Intrinsic::x86_xop_vpcomltuw: - case Intrinsic::x86_xop_vpcomltud: - case Intrinsic::x86_xop_vpcomltuq: - case Intrinsic::x86_xop_vpcomleb: - case Intrinsic::x86_xop_vpcomlew: - case Intrinsic::x86_xop_vpcomled: - case Intrinsic::x86_xop_vpcomleq: - case Intrinsic::x86_xop_vpcomleub: - case Intrinsic::x86_xop_vpcomleuw: - case Intrinsic::x86_xop_vpcomleud: - case Intrinsic::x86_xop_vpcomleuq: - case Intrinsic::x86_xop_vpcomgtb: - case Intrinsic::x86_xop_vpcomgtw: - case Intrinsic::x86_xop_vpcomgtd: - case Intrinsic::x86_xop_vpcomgtq: - case Intrinsic::x86_xop_vpcomgtub: - case Intrinsic::x86_xop_vpcomgtuw: - case Intrinsic::x86_xop_vpcomgtud: - case Intrinsic::x86_xop_vpcomgtuq: - case Intrinsic::x86_xop_vpcomgeb: - case Intrinsic::x86_xop_vpcomgew: - case Intrinsic::x86_xop_vpcomged: - case Intrinsic::x86_xop_vpcomgeq: - case Intrinsic::x86_xop_vpcomgeub: - case Intrinsic::x86_xop_vpcomgeuw: - case Intrinsic::x86_xop_vpcomgeud: - case Intrinsic::x86_xop_vpcomgeuq: - case Intrinsic::x86_xop_vpcomeqb: - case Intrinsic::x86_xop_vpcomeqw: - case Intrinsic::x86_xop_vpcomeqd: - case Intrinsic::x86_xop_vpcomeqq: - case Intrinsic::x86_xop_vpcomequb: - case Intrinsic::x86_xop_vpcomequw: - case Intrinsic::x86_xop_vpcomequd: - case Intrinsic::x86_xop_vpcomequq: - case Intrinsic::x86_xop_vpcomneb: - case Intrinsic::x86_xop_vpcomnew: - case Intrinsic::x86_xop_vpcomned: - case Intrinsic::x86_xop_vpcomneq: - case Intrinsic::x86_xop_vpcomneub: - case Intrinsic::x86_xop_vpcomneuw: - case Intrinsic::x86_xop_vpcomneud: - case Intrinsic::x86_xop_vpcomneuq: - case Intrinsic::x86_xop_vpcomfalseb: - case Intrinsic::x86_xop_vpcomfalsew: - case Intrinsic::x86_xop_vpcomfalsed: - case Intrinsic::x86_xop_vpcomfalseq: - case Intrinsic::x86_xop_vpcomfalseub: - case Intrinsic::x86_xop_vpcomfalseuw: - case Intrinsic::x86_xop_vpcomfalseud: - case Intrinsic::x86_xop_vpcomfalseuq: - case Intrinsic::x86_xop_vpcomtrueb: - case Intrinsic::x86_xop_vpcomtruew: - case Intrinsic::x86_xop_vpcomtrued: - case Intrinsic::x86_xop_vpcomtrueq: - case Intrinsic::x86_xop_vpcomtrueub: - case Intrinsic::x86_xop_vpcomtrueuw: - case Intrinsic::x86_xop_vpcomtrueud: - case Intrinsic::x86_xop_vpcomtrueuq: { - unsigned CC = 0; - unsigned Opc = 0; - - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
- case Intrinsic::x86_xop_vpcomltb: - case Intrinsic::x86_xop_vpcomltw: - case Intrinsic::x86_xop_vpcomltd: - case Intrinsic::x86_xop_vpcomltq: - CC = 0; - Opc = X86ISD::VPCOM; - break; - case Intrinsic::x86_xop_vpcomltub: - case Intrinsic::x86_xop_vpcomltuw: - case Intrinsic::x86_xop_vpcomltud: - case Intrinsic::x86_xop_vpcomltuq: - CC = 0; - Opc = X86ISD::VPCOMU; - break; - case Intrinsic::x86_xop_vpcomleb: - case Intrinsic::x86_xop_vpcomlew: - case Intrinsic::x86_xop_vpcomled: - case Intrinsic::x86_xop_vpcomleq: - CC = 1; - Opc = X86ISD::VPCOM; - break; - case Intrinsic::x86_xop_vpcomleub: - case Intrinsic::x86_xop_vpcomleuw: - case Intrinsic::x86_xop_vpcomleud: - case Intrinsic::x86_xop_vpcomleuq: - CC = 1; - Opc = X86ISD::VPCOMU; - break; - case Intrinsic::x86_xop_vpcomgtb: - case Intrinsic::x86_xop_vpcomgtw: - case Intrinsic::x86_xop_vpcomgtd: - case Intrinsic::x86_xop_vpcomgtq: - CC = 2; - Opc = X86ISD::VPCOM; - break; - case Intrinsic::x86_xop_vpcomgtub: - case Intrinsic::x86_xop_vpcomgtuw: - case Intrinsic::x86_xop_vpcomgtud: - case Intrinsic::x86_xop_vpcomgtuq: - CC = 2; - Opc = X86ISD::VPCOMU; - break; - case Intrinsic::x86_xop_vpcomgeb: - case Intrinsic::x86_xop_vpcomgew: - case Intrinsic::x86_xop_vpcomged: - case Intrinsic::x86_xop_vpcomgeq: - CC = 3; - Opc = X86ISD::VPCOM; - break; - case Intrinsic::x86_xop_vpcomgeub: - case Intrinsic::x86_xop_vpcomgeuw: - case Intrinsic::x86_xop_vpcomgeud: - case Intrinsic::x86_xop_vpcomgeuq: - CC = 3; - Opc = X86ISD::VPCOMU; - break; - case Intrinsic::x86_xop_vpcomeqb: - case Intrinsic::x86_xop_vpcomeqw: - case Intrinsic::x86_xop_vpcomeqd: - case Intrinsic::x86_xop_vpcomeqq: - CC = 4; - Opc = X86ISD::VPCOM; - break; - case Intrinsic::x86_xop_vpcomequb: - case Intrinsic::x86_xop_vpcomequw: - case Intrinsic::x86_xop_vpcomequd: - case Intrinsic::x86_xop_vpcomequq: - CC = 4; - Opc = X86ISD::VPCOMU; - break; - case Intrinsic::x86_xop_vpcomneb: - case Intrinsic::x86_xop_vpcomnew: - case Intrinsic::x86_xop_vpcomned: - case Intrinsic::x86_xop_vpcomneq: - CC = 5; - Opc = X86ISD::VPCOM; - break; - case Intrinsic::x86_xop_vpcomneub: - case Intrinsic::x86_xop_vpcomneuw: - case Intrinsic::x86_xop_vpcomneud: - case Intrinsic::x86_xop_vpcomneuq: - CC = 5; - Opc = X86ISD::VPCOMU; - break; - case Intrinsic::x86_xop_vpcomfalseb: - case Intrinsic::x86_xop_vpcomfalsew: - case Intrinsic::x86_xop_vpcomfalsed: - case Intrinsic::x86_xop_vpcomfalseq: - CC = 6; - Opc = X86ISD::VPCOM; - break; - case Intrinsic::x86_xop_vpcomfalseub: - case Intrinsic::x86_xop_vpcomfalseuw: - case Intrinsic::x86_xop_vpcomfalseud: - case Intrinsic::x86_xop_vpcomfalseuq: - CC = 6; - Opc = X86ISD::VPCOMU; - break; - case Intrinsic::x86_xop_vpcomtrueb: - case Intrinsic::x86_xop_vpcomtruew: - case Intrinsic::x86_xop_vpcomtrued: - case Intrinsic::x86_xop_vpcomtrueq: - CC = 7; - Opc = X86ISD::VPCOM; - break; - case Intrinsic::x86_xop_vpcomtrueub: - case Intrinsic::x86_xop_vpcomtrueuw: - case Intrinsic::x86_xop_vpcomtrueud: - case Intrinsic::x86_xop_vpcomtrueuq: - CC = 7; - Opc = X86ISD::VPCOMU; - break; - } - - SDValue LHS = Op.getOperand(1); - SDValue RHS = Op.getOperand(2); - return DAG.getNode(Opc, dl, Op.getValueType(), LHS, RHS, - DAG.getConstant(CC, MVT::i8)); - } - // Arithmetic intrinsics. 
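The block deleted above spelled out one mapping sixty-four times: every XOP vpcom intrinsic lowers to VPCOM (signed) or VPCOMU (the ...u families) with a 3-bit condition immediate. A sketch of the encoding it enumerated (names illustrative, not LLVM's):

enum VPComCC {        // 3-bit XOP comparison immediate
  VPCOM_LT    = 0,
  VPCOM_LE    = 1,
  VPCOM_GT    = 2,
  VPCOM_GE    = 3,
  VPCOM_EQ    = 4,
  VPCOM_NE    = 5,
  VPCOM_FALSE = 6,    // always-false comparison
  VPCOM_TRUE  = 7     // always-true comparison
};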
case Intrinsic::x86_sse2_pmulu_dq: case Intrinsic::x86_avx2_pmulu_dq: @@ -9770,6 +9821,38 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const } } +SDValue +X86TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { + DebugLoc dl = Op.getDebugLoc(); + unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + switch (IntNo) { + default: return SDValue(); // Don't custom lower most intrinsics. + + // RDRAND intrinsics. + case Intrinsic::x86_rdrand_16: + case Intrinsic::x86_rdrand_32: + case Intrinsic::x86_rdrand_64: { + // Emit the node with the right value type. + SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other); + SDValue Result = DAG.getNode(X86ISD::RDRAND, dl, VTs, Op.getOperand(0)); + + // If the value returned by RDRAND was valid (CF=1), return 1. Otherwise + // return the value from Rand, which is always 0, casted to i32. + SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), + DAG.getConstant(1, Op->getValueType(1)), + DAG.getConstant(X86::COND_B, MVT::i32), + SDValue(Result.getNode(), 1) }; + SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, + DAG.getVTList(Op->getValueType(1), MVT::Glue), + Ops, 4); + + // Return { result, isValid, chain }. + return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, + SDValue(Result.getNode(), 2)); + } + } +} + SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); @@ -9817,7 +9900,6 @@ SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, } SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); SDValue Chain = Op.getOperand(0); SDValue Offset = Op.getOperand(1); SDValue Handler = Op.getOperand(2); @@ -9834,7 +9916,6 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), false, false, 0); Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); - MF.getRegInfo().addLiveOut(StoreAddrReg); return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, @@ -10153,20 +10234,18 @@ static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { assert(VT.getSizeInBits() == 256 && VT.isInteger() && "Unsupported value type for operation"); - int NumElems = VT.getVectorNumElements(); + unsigned NumElems = VT.getVectorNumElements(); DebugLoc dl = Op.getDebugLoc(); - SDValue Idx0 = DAG.getConstant(0, MVT::i32); - SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32); // Extract the LHS vectors SDValue LHS = Op.getOperand(0); - SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl); - SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl); + SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); + SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); // Extract the RHS vectors SDValue RHS = Op.getOperand(1); - SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl); - SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl); + SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); + SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); MVT EltVT = VT.getVectorElementType().getSimpleVT(); EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); @@ -10311,6 +10390,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); return Res; } + llvm_unreachable("Unknown shift opcode."); } if 
(Subtarget->hasAVX2() && VT == MVT::v32i8) { @@ -10354,6 +10434,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); return Res; } + llvm_unreachable("Unknown shift opcode."); } } } @@ -10428,9 +10509,8 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); // Extract the two vectors - SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl); - SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32), - DAG, dl); + SDValue V1 = Extract128BitVector(R, 0, DAG, dl); + SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl); // Recreate the shift amount vectors SDValue Amt1, Amt2; @@ -10449,9 +10529,8 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { &Amt2Csts[0], NumElems/2); } else { // Variable shift amount - Amt1 = Extract128BitVector(Amt, DAG.getConstant(0, MVT::i32), DAG, dl); - Amt2 = Extract128BitVector(Amt, DAG.getConstant(NumElems/2, MVT::i32), - DAG, dl); + Amt1 = Extract128BitVector(Amt, 0, DAG, dl); + Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl); } // Issue new vector shifts for the smaller types @@ -10561,20 +10640,18 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, return SDValue(); if (!Subtarget->hasAVX2()) { // needs to be split - int NumElems = VT.getVectorNumElements(); - SDValue Idx0 = DAG.getConstant(0, MVT::i32); - SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32); + unsigned NumElems = VT.getVectorNumElements(); // Extract the LHS vectors SDValue LHS = Op.getOperand(0); - SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl); - SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl); + SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); + SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); MVT EltVT = VT.getVectorElementType().getSimpleVT(); EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); EVT ExtraEltVT = ExtraVT.getVectorElementType(); - int ExtraNumElems = ExtraVT.getVectorNumElements(); + unsigned ExtraNumElems = ExtraVT.getVectorNumElements(); ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT, ExtraNumElems/2); SDValue Extra = DAG.getValueType(ExtraVT); @@ -10860,6 +10937,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::VAARG: return LowerVAARG(Op, DAG); case ISD::VACOPY: return LowerVACOPY(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::FRAME_TO_ARGS_OFFSET: @@ -11119,10 +11197,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; case X86ISD::FRCP: return "X86ISD::FRCP"; case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; + case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR"; case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; + case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r"; case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; @@ -11191,6 +11271,8 @@ const char 
*X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL"; + case X86ISD::SAHF: return "X86ISD::SAHF"; + case X86ISD::RDRAND: return "X86ISD::RDRAND"; } } @@ -11259,6 +11341,15 @@ bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { return true; } +bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const { + return Imm == (int32_t)Imm; +} + +bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const { + // Can also use sub to handle negated immediates. + return Imm == (int32_t)Imm; +} + bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { if (!VT1.isInteger() || !VT2.isInteger()) return false; @@ -11301,8 +11392,8 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, isMOVLMask(M, VT) || isSHUFPMask(M, VT, Subtarget->hasAVX()) || isPSHUFDMask(M, VT) || - isPSHUFHWMask(M, VT) || - isPSHUFLWMask(M, VT) || + isPSHUFHWMask(M, VT, Subtarget->hasAVX2()) || + isPSHUFLWMask(M, VT, Subtarget->hasAVX2()) || isPALIGNRMask(M, VT, Subtarget) || isUNPCKLMask(M, VT, Subtarget->hasAVX2()) || isUNPCKHMask(M, VT, Subtarget->hasAVX2()) || @@ -11461,7 +11552,7 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, // result in out1, out2 // fallthrough -->nextMBB - const TargetRegisterClass *RC = X86::GR32RegisterClass; + const TargetRegisterClass *RC = &X86::GR32RegClass; const unsigned LoadOpc = X86::MOV32rm; const unsigned NotOpc = X86::NOT32r; const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); @@ -11663,7 +11754,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] int valArgIndx = lastAddrIndx + 1; - unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); + unsigned t1 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass); MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1); for (int i=0; i <= lastAddrIndx; ++i) (*MIB).addOperand(*argOpers[i]); @@ -11673,7 +11764,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, argOpers[valArgIndx]->isImm()) && "invalid operand"); - unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); + unsigned t2 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass); if (argOpers[valArgIndx]->isReg()) MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2); else @@ -11688,7 +11779,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, MIB.addReg(t2); // Generate movc - unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); + unsigned t3 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass); MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); MIB.addReg(t2); MIB.addReg(t1); @@ -12307,8 +12398,9 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) .addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) - .addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI) + .addExternalSymbol("__morestack_allocate_stack_space") .addRegMask(RegMask) + .addReg(X86::RDI, RegState::Implicit) .addReg(X86::RAX, RegState::ImplicitDefine); } else { BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) @@ -12518,7 +12610,7 @@ 
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // Load the old value of the high byte of the control word... unsigned OldCW = - F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); + F->getRegInfo().createVirtualRegister(&X86::GR16RegClass); addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), CWFrameIdx); @@ -12606,25 +12698,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, X86::AND32ri, X86::MOV32rm, X86::LCMPXCHG32, X86::NOT32r, X86::EAX, - X86::GR32RegisterClass); + &X86::GR32RegClass); case X86::ATOMOR32: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, X86::OR32ri, X86::MOV32rm, X86::LCMPXCHG32, X86::NOT32r, X86::EAX, - X86::GR32RegisterClass); + &X86::GR32RegClass); case X86::ATOMXOR32: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, X86::XOR32ri, X86::MOV32rm, X86::LCMPXCHG32, X86::NOT32r, X86::EAX, - X86::GR32RegisterClass); + &X86::GR32RegClass); case X86::ATOMNAND32: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, X86::AND32ri, X86::MOV32rm, X86::LCMPXCHG32, X86::NOT32r, X86::EAX, - X86::GR32RegisterClass, true); + &X86::GR32RegClass, true); case X86::ATOMMIN32: return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); case X86::ATOMMAX32: @@ -12639,25 +12731,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, X86::AND16ri, X86::MOV16rm, X86::LCMPXCHG16, X86::NOT16r, X86::AX, - X86::GR16RegisterClass); + &X86::GR16RegClass); case X86::ATOMOR16: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, X86::OR16ri, X86::MOV16rm, X86::LCMPXCHG16, X86::NOT16r, X86::AX, - X86::GR16RegisterClass); + &X86::GR16RegClass); case X86::ATOMXOR16: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, X86::XOR16ri, X86::MOV16rm, X86::LCMPXCHG16, X86::NOT16r, X86::AX, - X86::GR16RegisterClass); + &X86::GR16RegClass); case X86::ATOMNAND16: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, X86::AND16ri, X86::MOV16rm, X86::LCMPXCHG16, X86::NOT16r, X86::AX, - X86::GR16RegisterClass, true); + &X86::GR16RegClass, true); case X86::ATOMMIN16: return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); case X86::ATOMMAX16: @@ -12672,25 +12764,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, X86::AND8ri, X86::MOV8rm, X86::LCMPXCHG8, X86::NOT8r, X86::AL, - X86::GR8RegisterClass); + &X86::GR8RegClass); case X86::ATOMOR8: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, X86::OR8ri, X86::MOV8rm, X86::LCMPXCHG8, X86::NOT8r, X86::AL, - X86::GR8RegisterClass); + &X86::GR8RegClass); case X86::ATOMXOR8: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, X86::XOR8ri, X86::MOV8rm, X86::LCMPXCHG8, X86::NOT8r, X86::AL, - X86::GR8RegisterClass); + &X86::GR8RegClass); case X86::ATOMNAND8: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, X86::AND8ri, X86::MOV8rm, X86::LCMPXCHG8, X86::NOT8r, X86::AL, - X86::GR8RegisterClass, true); + &X86::GR8RegClass, true); // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. // This group is for 64-bit host. 
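The ATOM* pseudo-instruction expansions being retargeted to &X86::GR*RegClass here all share one shape: the custom inserter builds a load, applies the bitwise operation (plus a NOT for the NAND forms), and retries through LCMPXCHG until the compare-exchange succeeds. A hedged C++ analogy of that loop, with std::atomic standing in for the MachineIR and atomicNand as our own name:

#include <atomic>
#include <cstdint>

// Analogy only, not the MachineIR itself: atomic NAND via a
// compare-exchange retry loop, mirroring load -> AND+NOT -> LCMPXCHG.
static uint32_t atomicNand(std::atomic<uint32_t> &mem, uint32_t val) {
  uint32_t old = mem.load();
  while (!mem.compare_exchange_weak(old, ~(old & val)))
    ; // on failure, 'old' is refreshed, like reloading EAX before retry
  return old;
}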
case X86::ATOMAND64: @@ -12698,25 +12790,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, X86::AND64ri32, X86::MOV64rm, X86::LCMPXCHG64, X86::NOT64r, X86::RAX, - X86::GR64RegisterClass); + &X86::GR64RegClass); case X86::ATOMOR64: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, X86::OR64ri32, X86::MOV64rm, X86::LCMPXCHG64, X86::NOT64r, X86::RAX, - X86::GR64RegisterClass); + &X86::GR64RegClass); case X86::ATOMXOR64: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, X86::XOR64ri32, X86::MOV64rm, X86::LCMPXCHG64, X86::NOT64r, X86::RAX, - X86::GR64RegisterClass); + &X86::GR64RegClass); case X86::ATOMNAND64: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, X86::AND64ri32, X86::MOV64rm, X86::LCMPXCHG64, X86::NOT64r, X86::RAX, - X86::GR64RegisterClass, true); + &X86::GR64RegClass, true); case X86::ATOMMIN64: return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); case X86::ATOMMAX64: @@ -12871,10 +12963,10 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N, /// inserting the result into the low part of a new 256-bit vector static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) { EVT VT = SVOp->getValueType(0); - int NumElems = VT.getVectorNumElements(); + unsigned NumElems = VT.getVectorNumElements(); // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> - for (int i = 0, j = NumElems/2; i < NumElems/2; ++i, ++j) + for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j) if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || SVOp->getMaskElt(j) >= 0) return false; @@ -12887,10 +12979,10 @@ static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) { /// inserting the result into the high part of a new 256-bit vector static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) { EVT VT = SVOp->getValueType(0); - int NumElems = VT.getVectorNumElements(); + unsigned NumElems = VT.getVectorNumElements(); // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> - for (int i = NumElems/2, j = 0; i < NumElems; ++i, ++j) + for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j) if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || SVOp->getMaskElt(j) >= 0) return false; @@ -12907,7 +12999,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); EVT VT = SVOp->getValueType(0); - int NumElems = VT.getVectorNumElements(); + unsigned NumElems = VT.getVectorNumElements(); if (V1.getOpcode() == ISD::CONCAT_VECTORS && V2.getOpcode() == ISD::CONCAT_VECTORS) { @@ -12932,30 +13024,31 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, // To match the shuffle mask, the first half of the mask should // be exactly the first vector, and all the rest a splat with the // first element of the second one. - for (int i = 0; i < NumElems/2; ++i) + for (unsigned i = 0; i != NumElems/2; ++i) if (!isUndefOrEqual(SVOp->getMaskElt(i), i) || !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems)) return SDValue(); // If V1 is coming from a vector load then just fold to a VZEXT_LOAD. 
    if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
-      SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
-      SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
-      SDValue ResNode =
-        DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2,
-                                Ld->getMemoryVT(),
-                                Ld->getPointerInfo(),
-                                Ld->getAlignment(),
-                                false/*isVolatile*/, true/*ReadMem*/,
-                                false/*WriteMem*/);
-      return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
+      if (Ld->hasNUsesOfValue(1, 0)) {
+        SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
+        SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
+        SDValue ResNode =
+          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2,
+                                  Ld->getMemoryVT(),
+                                  Ld->getPointerInfo(),
+                                  Ld->getAlignment(),
+                                  false/*isVolatile*/, true/*ReadMem*/,
+                                  false/*WriteMem*/);
+        return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
+      }
    }

    // Emit a zeroed vector and insert the desired subvector on its
    // first half.
    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
-    SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0),
-                         DAG.getConstant(0, MVT::i32), DAG, dl);
+    SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
    return DCI.CombineTo(N, InsV);
  }

@@ -12965,18 +13058,15 @@

  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
  if (isShuffleHigh128VectorInsertLow(SVOp)) {
-    SDValue V = Extract128BitVector(V1, DAG.getConstant(NumElems/2, MVT::i32),
-                                    DAG, dl);
-    SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT),
-                                      V, DAG.getConstant(0, MVT::i32), DAG, dl);
+    SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
+    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
    return DCI.CombineTo(N, InsV);
  }

  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
  if (isShuffleLow128VectorInsertHigh(SVOp)) {
-    SDValue V = Extract128BitVector(V1, DAG.getConstant(0, MVT::i32), DAG, dl);
-    SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT),
-                                      V, DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
+    SDValue V = Extract128BitVector(V1, 0, DAG, dl);
+    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
    return DCI.CombineTo(N, InsV);
  }

@@ -13015,7 +13105,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
 }

-/// PerformTruncateCombine - Converts truncate operation to
+/// DCI, PerformTruncateCombine - Converts truncate operation to
/// It is possible when we truncate 256-bit vector to 128-bit vector @@ -13024,7 +13114,8 @@ SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, if (!DCI.isBeforeLegalizeOps()) return SDValue(); - if (!Subtarget->hasAVX()) return SDValue(); + if (!Subtarget->hasAVX()) + return SDValue(); EVT VT = N->getValueType(0); SDValue Op = N->getOperand(0); @@ -13033,55 +13124,102 @@ SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) { + if (Subtarget->hasAVX2()) { + // AVX2: v4i64 -> v4i32 + + // VPERMD + static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; + + Op = DAG.getNode(ISD::BITCAST, dl, MVT::v8i32, Op); + Op = DAG.getVectorShuffle(MVT::v8i32, dl, Op, DAG.getUNDEF(MVT::v8i32), + ShufMask); + + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Op, + DAG.getIntPtrConstant(0)); + } + + // AVX: v4i64 -> v4i32 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op, - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0)); SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op, - DAG.getIntPtrConstant(2)); + DAG.getIntPtrConstant(2)); OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo); OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi); // PSHUFD - int ShufMask1[] = {0, 2, 0, 0}; + static const int ShufMask1[] = {0, 2, 0, 0}; - OpLo = DAG.getVectorShuffle(VT, dl, OpLo, DAG.getUNDEF(VT), - ShufMask1); - OpHi = DAG.getVectorShuffle(VT, dl, OpHi, DAG.getUNDEF(VT), - ShufMask1); + OpLo = DAG.getVectorShuffle(VT, dl, OpLo, DAG.getUNDEF(VT), ShufMask1); + OpHi = DAG.getVectorShuffle(VT, dl, OpHi, DAG.getUNDEF(VT), ShufMask1); // MOVLHPS - int ShufMask2[] = {0, 1, 4, 5}; + static const int ShufMask2[] = {0, 1, 4, 5}; return DAG.getVectorShuffle(VT, dl, OpLo, OpHi, ShufMask2); } + if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) { + if (Subtarget->hasAVX2()) { + // AVX2: v8i32 -> v8i16 + + Op = DAG.getNode(ISD::BITCAST, dl, MVT::v32i8, Op); + + // PSHUFB + SmallVector<SDValue,32> pshufbMask; + for (unsigned i = 0; i < 2; ++i) { + pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8)); + for (unsigned j = 0; j < 8; ++j) + pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); + } + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v32i8, + &pshufbMask[0], 32); + Op = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, Op, BV); + + Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i64, Op); + + static const int ShufMask[] = {0, 2, -1, -1}; + Op = DAG.getVectorShuffle(MVT::v4i64, dl, Op, DAG.getUNDEF(MVT::v4i64), + &ShufMask[0]); + + Op = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op, + DAG.getIntPtrConstant(0)); + + return DAG.getNode(ISD::BITCAST, dl, VT, Op); + } + SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op, - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0)); SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op, - DAG.getIntPtrConstant(4)); + DAG.getIntPtrConstant(4)); OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLo); OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpHi); // PSHUFB - int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, - 
-1, -1, -1, -1, -1, -1, -1, -1}; + static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, + -1, -1, -1, -1, -1, -1, -1, -1}; - OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo, - DAG.getUNDEF(MVT::v16i8), + OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo, DAG.getUNDEF(MVT::v16i8), ShufMask1); - OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi, - DAG.getUNDEF(MVT::v16i8), + OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi, DAG.getUNDEF(MVT::v16i8), ShufMask1); OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo); OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi); // MOVLHPS - int ShufMask2[] = {0, 1, 4, 5}; + static const int ShufMask2[] = {0, 1, 4, 5}; SDValue res = DAG.getVectorShuffle(MVT::v4i32, dl, OpLo, OpHi, ShufMask2); return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, res); @@ -13128,7 +13266,8 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, SmallVector<int, 16> ShuffleMask; bool UnaryShuffle; - if (!getTargetShuffleMask(InVec.getNode(), VT, ShuffleMask, UnaryShuffle)) + if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask, + UnaryShuffle)) return SDValue(); // Select the input vector, guarding against out of range extract vector. @@ -13277,8 +13416,6 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { - - DebugLoc DL = N->getDebugLoc(); SDValue Cond = N->getOperand(0); // Get the LHS/RHS of the select. @@ -13560,9 +13697,13 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // to simplify previous instructions. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && - !DCI.isBeforeLegalize() && - TLI.isOperationLegal(ISD::VSELECT, VT)) { + !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) { unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits(); + + // Don't optimize vector selects that map to mask-registers. + if (BitWidth == 1) + return SDValue(); + assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); @@ -14261,6 +14402,41 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Generate NEG and CMOV for integer abs. +static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + + // Since X86 does not have CMOV for 8-bit integer, we don't convert + // 8-bit integer abs to NEG and CMOV. + if (VT.isInteger() && VT.getSizeInBits() == 8) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + DebugLoc DL = N->getDebugLoc(); + + // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) + // and change it to SUB and CMOV. + if (VT.isInteger() && N->getOpcode() == ISD::XOR && + N0.getOpcode() == ISD::ADD && + N0.getOperand(1) == N1 && + N1.getOpcode() == ISD::SRA && + N1.getOperand(0) == N0.getOperand(0)) + if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1))) + if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) { + // Generate SUB & CMOV. 
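Behind the PerformTruncateCombine changes a few hunks up: with AVX2, a v4i64-to-v4i32 truncate becomes a single VPERMD that gathers the even 32-bit lanes {0, 2, 4, 6}, followed by taking the low 128 bits; without AVX2 it falls back to the PSHUFD/PSHUFB plus MOVLHPS sequences shown in the diff. A sketch with AVX2 intrinsics (assumes -mavx2; the function name is ours, not LLVM's):

#include <immintrin.h>

// What the AVX2 v4i64 -> v4i32 path amounts to: one VPERMD collecting
// the low dword of each qword (dwords 0, 2, 4, 6), then the low half.
static __m128i truncV4i64ToV4i32(__m256i v) {
  const __m256i idx = _mm256_setr_epi32(0, 2, 4, 6, 0, 0, 0, 0);
  __m256i perm = _mm256_permutevar8x32_epi32(v, idx);
  return _mm256_castsi256_si128(perm);
}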
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32), + DAG.getConstant(0, VT), N0.getOperand(0)); + + SDValue Ops[] = { N0.getOperand(0), Neg, + DAG.getConstant(X86::COND_GE, MVT::i8), + SDValue(Neg.getNode(), 1) }; + return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), + Ops, array_lengthof(Ops)); + } + return SDValue(); +} + // PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -14268,6 +14444,16 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalizeOps()) return SDValue(); + if (Subtarget->hasCMov()) { + SDValue RV = performIntegerAbsCombine(N, DAG); + if (RV.getNode()) + return RV; + } + + // Try forming BMI if it is available. + if (!Subtarget->hasBMI()) + return SDValue(); + EVT VT = N->getValueType(0); if (VT != MVT::i32 && VT != MVT::i64) @@ -14293,7 +14479,8 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes. static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { LoadSDNode *Ld = cast<LoadSDNode>(N); EVT RegVT = Ld->getValueType(0); EVT MemVT = Ld->getMemoryVT(); @@ -14315,63 +14502,94 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, unsigned RegSz = RegVT.getSizeInBits(); unsigned MemSz = MemVT.getSizeInBits(); assert(RegSz > MemSz && "Register size must be greater than the mem size"); - // All sizes must be a power of two - if (!isPowerOf2_32(RegSz * MemSz * NumElems)) return SDValue(); - // Attempt to load the original value using a single load op. - // Find a scalar type which is equal to the loaded word size. + // All sizes must be a power of two. + if (!isPowerOf2_32(RegSz * MemSz * NumElems)) + return SDValue(); + + // Attempt to load the original value using scalar loads. + // Find the largest scalar type that divides the total loaded size. MVT SclrLoadTy = MVT::i8; for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { MVT Tp = (MVT::SimpleValueType)tp; - if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() == MemSz) { + if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { SclrLoadTy = Tp; - break; } } - // Proceed if a load word is found. - if (SclrLoadTy.getSizeInBits() != MemSz) return SDValue(); + // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. + if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && + (64 <= MemSz)) + SclrLoadTy = MVT::f64; + // Calculate the number of scalar loads that we need to perform + // in order to load our vector from memory. + unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); + + // Represent our vector as a sequence of elements which are the + // largest scalar that we can load. EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy, RegSz/SclrLoadTy.getSizeInBits()); + // Represent the data using the same element type that is stored in + // memory. In practice, we ''widen'' MemVT. EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), RegSz/MemVT.getScalarType().getSizeInBits()); - // Can't shuffle using an illegal type. - if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); - // Perform a single load. 
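The performIntegerAbsCombine completed above matches the branch-free absolute-value idiom, y = x >> 31 (arithmetic shift), abs = (x + y) ^ y, and rewrites it as a subtract from zero plus a conditional move. A scalar sketch of both forms (hypothetical helper names; INT_MIN wraps to itself in practice, though signed overflow is formally UB in C++):

#include <cstdint>

// The pattern matched by the combine: XOR(ADD(X, Y), Y) with
// Y = SRA(X, 31), i.e. the sign mask.
static int32_t absViaXorAdd(int32_t x) {
  int32_t y = x >> 31;   // 0 for non-negative x, -1 for negative x
  return (x + y) ^ y;    // x >= 0 ? x : -x
}

// What the combine emits instead: SUB(0, x) setting flags, then CMOVGE.
static int32_t absViaNegCmov(int32_t x) {
  int32_t neg = 0 - x;       // the NEG; sets flags on real hardware
  return x >= 0 ? x : neg;   // the CMOV select
}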
- SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), - Ld->getBasePtr(), - Ld->getPointerInfo(), Ld->isVolatile(), - Ld->isNonTemporal(), Ld->isInvariant(), - Ld->getAlignment()); + assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && + "Invalid vector type"); - // Insert the word loaded into a vector. - SDValue ScalarInVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, - LoadUnitVecVT, ScalarLoad); + // We can't shuffle using an illegal type. + if (!TLI.isTypeLegal(WideVecVT)) + return SDValue(); + + SmallVector<SDValue, 8> Chains; + SDValue Ptr = Ld->getBasePtr(); + SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8, + TLI.getPointerTy()); + SDValue Res = DAG.getUNDEF(LoadUnitVecVT); + + for (unsigned i = 0; i < NumLoads; ++i) { + // Perform a single load. + SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), + Ptr, Ld->getPointerInfo(), + Ld->isVolatile(), Ld->isNonTemporal(), + Ld->isInvariant(), Ld->getAlignment()); + Chains.push_back(ScalarLoad.getValue(1)); + // Create the first element type using SCALAR_TO_VECTOR in order to avoid + // another round of DAGCombining. + if (i == 0) + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); + else + Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, + ScalarLoad, DAG.getIntPtrConstant(i)); + + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); + } + + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], + Chains.size()); // Bitcast the loaded value to a vector of the original element type, in // the size of the target vector type. - SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, - ScalarInVector); + SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res); unsigned SizeRatio = RegSz/MemSz; // Redistribute the loaded elements into the different locations. SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i < NumElems; i++) ShuffleVec[i*SizeRatio] = i; + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i*SizeRatio] = i; SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, - DAG.getUNDEF(SlicedVec.getValueType()), - ShuffleVec.data()); + DAG.getUNDEF(WideVecVT), + &ShuffleVec[0]); // Bitcast to the requested type. Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); // Replace the original load with the new sequence // and return the new chain. - DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Shuff); - return SDValue(ScalarLoad.getNode(), 1); + return DCI.CombineTo(N, Shuff, TF, true); } return SDValue(); @@ -14388,13 +14606,12 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // If we are saving a concatenation of two XMM registers, perform two stores. - // This is better in Sandy Bridge cause one 256-bit mem op is done via two - // 128-bit ones. If in the future the cost becomes only one memory access the - // first version would be better. - if (VT.getSizeInBits() == 256 && - StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS && - StoredVal.getNumOperands() == 2) { - + // On Sandy Bridge, 256-bit memory operations are executed by two + // 128-bit ports. However, on Haswell it is better to issue a single 256-bit + // memory operation. 
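On the load-combine rewrite above: instead of requiring one legal scalar of exactly MemSz bits, it now issues NumLoads = MemSz / SclrLoadTy-bits chained scalar loads, using the widest legal type that divides the memory footprint (falling back to f64 where i64 is not legal on 32-bit targets). The chunking arithmetic, as a hedged C++ model (assumes memBytes is a multiple of eight):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Model of the chunking: cover memBytes with the widest "legal" scalar
// (uint64_t stands in for SclrLoadTy), one load per chunk, chained in
// order like the TokenFactor of the real combine.
static void loadInChunks(void *dst, const void *src, size_t memBytes) {
  const size_t chunk = sizeof(uint64_t);
  const size_t numLoads = memBytes / chunk;   // NumLoads in the combine
  for (size_t i = 0; i != numLoads; ++i)
    std::memcpy(static_cast<char *>(dst) + i * chunk,
                static_cast<const char *>(src) + i * chunk, chunk);
}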
+ if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2() && + StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS && + StoredVal.getNumOperands() == 2) { SDValue Value0 = StoredVal.getOperand(0); SDValue Value1 = StoredVal.getOperand(1); @@ -14439,14 +14656,16 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue()); SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i < NumElems; i++ ) ShuffleVec[i] = i * SizeRatio; + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; - // Can't shuffle using an illegal type - if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); + // Can't shuffle using an illegal type. + if (!TLI.isTypeLegal(WideVecVT)) + return SDValue(); SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, - DAG.getUNDEF(WideVec.getValueType()), - ShuffleVec.data()); + DAG.getUNDEF(WideVecVT), + &ShuffleVec[0]); // At this point all of the data is stored at the bottom of the // register. We now need to save it to mem. @@ -14455,13 +14674,18 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { MVT Tp = (MVT::SimpleValueType)tp; - if (TLI.isTypeLegal(Tp) && StoreType.getSizeInBits() < NumElems * ToSz) + if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz) StoreType = Tp; } + // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. + if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 && + (64 <= NumElems * ToSz)) + StoreType = MVT::f64; + // Bitcast the original vector into a vector of store-size units EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), - StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits()); + StoreType, VT.getSizeInBits()/StoreType.getSizeInBits()); assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff); SmallVector<SDValue, 8> Chains; @@ -14470,7 +14694,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue Ptr = St->getBasePtr(); // Perform one or more big stores into memory. 
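The store combine above now keeps a single 256-bit store on AVX2 parts and splits only for plain AVX, where Sandy Bridge-class hardware executes a 256-bit store as two 128-bit operations anyway. Both shapes, sketched with intrinsics (assumes AVX/AVX2 headers and at least 32 writable bytes at p):

#include <immintrin.h>

// Pre-AVX2 shape: the CONCAT_VECTORS store is split into two halves.
static void store256Split(int *p, __m128i lo, __m128i hi) {
  _mm_storeu_si128(reinterpret_cast<__m128i *>(p), lo);
  _mm_storeu_si128(reinterpret_cast<__m128i *>(p + 4), hi);
}

// AVX2/Haswell shape: the combine bails out and one 256-bit store stays.
static void store256Whole(int *p, __m256i v) {
  _mm256_storeu_si256(reinterpret_cast<__m256i *>(p), v);
}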
- for (unsigned i = 0; i < (ToSz*NumElems)/StoreType.getSizeInBits() ; i++) { + for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) { SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StoreType, ShuffWide, DAG.getIntPtrConstant(i)); @@ -14819,18 +15043,9 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, if (!DCI.isBeforeLegalizeOps()) return SDValue(); - if (!Subtarget->hasAVX()) + if (!Subtarget->hasAVX()) return SDValue(); - // Optimize vectors in AVX mode - // Sign extend v8i16 to v8i32 and - // v4i32 to v4i64 - // - // Divide input vector into two parts - // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1} - // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 - // concat the vectors to original VT - EVT VT = N->getValueType(0); SDValue Op = N->getOperand(0); EVT OpVT = Op.getValueType(); @@ -14839,23 +15054,37 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) || (VT == MVT::v8i32 && OpVT == MVT::v8i16)) { + if (Subtarget->hasAVX2()) + return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, Op); + + // Optimize vectors in AVX mode + // Sign extend v8i16 to v8i32 and + // v4i32 to v4i64 + // + // Divide input vector into two parts + // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1} + // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 + // concat the vectors to original VT + unsigned NumElems = OpVT.getVectorNumElements(); SmallVector<int,8> ShufMask1(NumElems, -1); - for (unsigned i = 0; i < NumElems/2; i++) ShufMask1[i] = i; + for (unsigned i = 0; i != NumElems/2; ++i) + ShufMask1[i] = i; SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT), - ShufMask1.data()); + &ShufMask1[0]); SmallVector<int,8> ShufMask2(NumElems, -1); - for (unsigned i = 0; i < NumElems/2; i++) ShufMask2[i] = i + NumElems/2; + for (unsigned i = 0; i != NumElems/2; ++i) + ShufMask2[i] = i + NumElems/2; SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT), - ShufMask2.data()); + &ShufMask2[0]); - EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), + EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), VT.getVectorNumElements()/2); - OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo); + OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo); OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); @@ -14864,6 +15093,7 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, } static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> // (and (i32 x86isd::setcc_carry), 1) @@ -14888,6 +15118,7 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, N00.getOperand(0), N00.getOperand(1)), DAG.getConstant(1, VT)); } + // Optimize vectors in AVX mode: // // v8i16 -> v8i32 @@ -14900,26 +15131,57 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64. // Concat upper and lower parts. 
// - if (Subtarget->hasAVX()) { + if (!DCI.isBeforeLegalizeOps()) + return SDValue(); + + if (!Subtarget->hasAVX()) + return SDValue(); - if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) || + if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) || ((VT == MVT::v4i64) && (OpVT == MVT::v4i32))) { - SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl); - SDValue OpLo = getTargetShuffleNode(X86ISD::UNPCKL, dl, OpVT, N0, ZeroVec, DAG); - SDValue OpHi = getTargetShuffleNode(X86ISD::UNPCKH, dl, OpVT, N0, ZeroVec, DAG); + if (Subtarget->hasAVX2()) + return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, N0); - EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), - VT.getVectorNumElements()/2); + SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl); + SDValue OpLo = getUnpackl(DAG, dl, OpVT, N0, ZeroVec); + SDValue OpHi = getUnpackh(DAG, dl, OpVT, N0, ZeroVec); - OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo); - OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi); + EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), + VT.getVectorNumElements()/2); - return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); - } + OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo); + OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } + return SDValue(); +} +// Optimize x == -y --> x+y == 0 +// x != -y --> x+y != 0 +static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) { + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0))) + if (C->getAPIntValue() == 0 && LHS.hasOneUse()) { + SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(), + LHS.getValueType(), RHS, LHS.getOperand(1)); + return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0), + addV, DAG.getConstant(0, addV.getValueType()), CC); + } + if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0))) + if (C->getAPIntValue() == 0 && RHS.hasOneUse()) { + SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(), + RHS.getValueType(), LHS, RHS.getOperand(1)); + return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0), + addV, DAG.getConstant(0, addV.getValueType()), CC); + } return SDValue(); } @@ -14941,9 +15203,36 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG) { + SDValue Op0 = N->getOperand(0); + EVT InVT = Op0->getValueType(0); + + // UINT_TO_FP(v4i8) -> SINT_TO_FP(ZEXT(v4i8 to v4i32)) + if (InVT == MVT::v8i8 || InVT == MVT::v4i8) { + DebugLoc dl = N->getDebugLoc(); + MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32; + SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0); + // Notice that we use SINT_TO_FP because we know that the high bits + // are zero and SINT_TO_FP is better supported by the hardware. 
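The new PerformISDSETCCCombine above rewrites x == -y (and x != -y, where the negation appears as sub 0, y) into x + y == 0, so no register is needed to materialize the negation. In two's-complement arithmetic the two forms agree for every input; a scalar sketch (unsigned types keep the wraparound well defined):

#include <cstdint>

// Before the combine: a NEG to form -y, then a compare.
static bool eqNeg(uint32_t x, uint32_t y) { return x == (0u - y); }

// After the combine: one ADD and a test against zero; modular
// arithmetic makes this equivalent to the form above.
static bool eqNegCombined(uint32_t x, uint32_t y) { return x + y == 0u; }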
+ return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P); + } + + return SDValue(); +} + static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, const X86TargetLowering *XTLI) { SDValue Op0 = N->getOperand(0); + EVT InVT = Op0->getValueType(0); + + // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32)) + if (InVT == MVT::v8i8 || InVT == MVT::v4i8) { + DebugLoc dl = N->getDebugLoc(); + MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32; + SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); + return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P); + } + // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have // a 32-bit target where SSE doesn't support i64->FP operations. if (Op0.getOpcode() == ISD::LOAD) { @@ -14962,6 +15251,20 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue PerformFP_TO_SINTCombine(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + + // v4i8 = FP_TO_SINT() -> v4i8 = TRUNCATE (V4i32 = FP_TO_SINT() + if (VT == MVT::v8i8 || VT == MVT::v4i8) { + DebugLoc dl = N->getDebugLoc(); + MVT DstVT = VT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32; + SDValue I = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, N->getOperand(0)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, I); + } + + return SDValue(); +} + // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, X86TargetLowering::DAGCombinerInfo &DCI) { @@ -15096,9 +15399,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget); case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget); - case ISD::LOAD: return PerformLOADCombine(N, DAG, Subtarget); + case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget); case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); + case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG); case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this); + case ISD::FP_TO_SINT: return PerformFP_TO_SINTCombine(N, DAG); case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget); case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget); case X86ISD::FXOR: @@ -15106,9 +15411,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::FAND: return PerformFANDCombine(N, DAG); case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); - case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, Subtarget); + case ISD::ANY_EXTEND: + case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget); case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG, DCI); + case ISD::SETCC: return PerformISDSETCCCombine(N, DAG); case X86ISD::SETCC: return PerformSETCCCombine(N, DAG); case X86ISD::SHUFP: // Handle all target specific shuffles case X86ISD::PALIGN: @@ -15653,55 +15960,55 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, // in the normal allocation? case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. 
if (Subtarget->is64Bit()) { - if (VT == MVT::i32 || VT == MVT::f32) - return std::make_pair(0U, X86::GR32RegisterClass); - else if (VT == MVT::i16) - return std::make_pair(0U, X86::GR16RegisterClass); - else if (VT == MVT::i8 || VT == MVT::i1) - return std::make_pair(0U, X86::GR8RegisterClass); - else if (VT == MVT::i64 || VT == MVT::f64) - return std::make_pair(0U, X86::GR64RegisterClass); - break; + if (VT == MVT::i32 || VT == MVT::f32) + return std::make_pair(0U, &X86::GR32RegClass); + if (VT == MVT::i16) + return std::make_pair(0U, &X86::GR16RegClass); + if (VT == MVT::i8 || VT == MVT::i1) + return std::make_pair(0U, &X86::GR8RegClass); + if (VT == MVT::i64 || VT == MVT::f64) + return std::make_pair(0U, &X86::GR64RegClass); + break; } // 32-bit fallthrough case 'Q': // Q_REGS if (VT == MVT::i32 || VT == MVT::f32) - return std::make_pair(0U, X86::GR32_ABCDRegisterClass); - else if (VT == MVT::i16) - return std::make_pair(0U, X86::GR16_ABCDRegisterClass); - else if (VT == MVT::i8 || VT == MVT::i1) - return std::make_pair(0U, X86::GR8_ABCD_LRegisterClass); - else if (VT == MVT::i64) - return std::make_pair(0U, X86::GR64_ABCDRegisterClass); + return std::make_pair(0U, &X86::GR32_ABCDRegClass); + if (VT == MVT::i16) + return std::make_pair(0U, &X86::GR16_ABCDRegClass); + if (VT == MVT::i8 || VT == MVT::i1) + return std::make_pair(0U, &X86::GR8_ABCD_LRegClass); + if (VT == MVT::i64) + return std::make_pair(0U, &X86::GR64_ABCDRegClass); break; case 'r': // GENERAL_REGS case 'l': // INDEX_REGS if (VT == MVT::i8 || VT == MVT::i1) - return std::make_pair(0U, X86::GR8RegisterClass); + return std::make_pair(0U, &X86::GR8RegClass); if (VT == MVT::i16) - return std::make_pair(0U, X86::GR16RegisterClass); + return std::make_pair(0U, &X86::GR16RegClass); if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit()) - return std::make_pair(0U, X86::GR32RegisterClass); - return std::make_pair(0U, X86::GR64RegisterClass); + return std::make_pair(0U, &X86::GR32RegClass); + return std::make_pair(0U, &X86::GR64RegClass); case 'R': // LEGACY_REGS if (VT == MVT::i8 || VT == MVT::i1) - return std::make_pair(0U, X86::GR8_NOREXRegisterClass); + return std::make_pair(0U, &X86::GR8_NOREXRegClass); if (VT == MVT::i16) - return std::make_pair(0U, X86::GR16_NOREXRegisterClass); + return std::make_pair(0U, &X86::GR16_NOREXRegClass); if (VT == MVT::i32 || !Subtarget->is64Bit()) - return std::make_pair(0U, X86::GR32_NOREXRegisterClass); - return std::make_pair(0U, X86::GR64_NOREXRegisterClass); + return std::make_pair(0U, &X86::GR32_NOREXRegClass); + return std::make_pair(0U, &X86::GR64_NOREXRegClass); case 'f': // FP Stack registers. // If SSE is enabled for this VT, use f80 to ensure the isel moves the // value to the correct fpstack register class. if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) - return std::make_pair(0U, X86::RFP32RegisterClass); + return std::make_pair(0U, &X86::RFP32RegClass); if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) - return std::make_pair(0U, X86::RFP64RegisterClass); - return std::make_pair(0U, X86::RFP80RegisterClass); + return std::make_pair(0U, &X86::RFP64RegClass); + return std::make_pair(0U, &X86::RFP80RegClass); case 'y': // MMX_REGS if MMX allowed. if (!Subtarget->hasMMX()) break; - return std::make_pair(0U, X86::VR64RegisterClass); + return std::make_pair(0U, &X86::VR64RegClass); case 'Y': // SSE_REGS if SSE2 allowed if (!Subtarget->hasSSE2()) break; // FALL THROUGH. 
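These constraint cases back GCC-style inline assembly: 'q' must resolve to a byte-addressable GPR even in 32-bit mode (hence the GR8/Q_REGS classes above), which matters for setcc-style instructions. A hedged usage example (GCC/Clang inline-asm syntax; the function name is ours):

#include <cstdint>

// "=q" forces a register with a byte subregister (a/b/c/d in 32-bit
// mode), so the setc below is always encodable.
static bool addOverflows(uint32_t a, uint32_t b) {
  bool cf;
  asm("addl %2, %1\n\t"
      "setc %0"
      : "=q"(cf), "+r"(a)
      : "r"(b)
      : "cc");
  return cf;
}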
@@ -15713,10 +16020,10 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, // Scalar SSE types. case MVT::f32: case MVT::i32: - return std::make_pair(0U, X86::FR32RegisterClass); + return std::make_pair(0U, &X86::FR32RegClass); case MVT::f64: case MVT::i64: - return std::make_pair(0U, X86::FR64RegisterClass); + return std::make_pair(0U, &X86::FR64RegClass); // Vector types. case MVT::v16i8: case MVT::v8i16: @@ -15724,7 +16031,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: - return std::make_pair(0U, X86::VR128RegisterClass); + return std::make_pair(0U, &X86::VR128RegClass); // AVX types. case MVT::v32i8: case MVT::v16i16: @@ -15732,8 +16039,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, case MVT::v4i64: case MVT::v8f32: case MVT::v4f64: - return std::make_pair(0U, X86::VR256RegisterClass); - + return std::make_pair(0U, &X86::VR256RegClass); } break; } @@ -15756,28 +16062,28 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, Constraint[6] == '}') { Res.first = X86::ST0+Constraint[4]-'0'; - Res.second = X86::RFP80RegisterClass; + Res.second = &X86::RFP80RegClass; return Res; } // GCC allows "st(0)" to be called just plain "st". if (StringRef("{st}").equals_lower(Constraint)) { Res.first = X86::ST0; - Res.second = X86::RFP80RegisterClass; + Res.second = &X86::RFP80RegClass; return Res; } // flags -> EFLAGS if (StringRef("{flags}").equals_lower(Constraint)) { Res.first = X86::EFLAGS; - Res.second = X86::CCRRegisterClass; + Res.second = &X86::CCRRegClass; return Res; } // 'A' means EAX + EDX. if (Constraint == "A") { Res.first = X86::EAX; - Res.second = X86::GR32_ADRegisterClass; + Res.second = &X86::GR32_ADRegClass; return Res; } return Res; @@ -15793,7 +16099,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we // really want an 8-bit or 32-bit register, map to the appropriate register // class and return the appropriate register. - if (Res.second == X86::GR16RegisterClass) { + if (Res.second == &X86::GR16RegClass) { if (VT == MVT::i8) { unsigned DestReg = 0; switch (Res.first) { @@ -15805,7 +16111,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, } if (DestReg) { Res.first = DestReg; - Res.second = X86::GR8RegisterClass; + Res.second = &X86::GR8RegClass; } } else if (VT == MVT::i32) { unsigned DestReg = 0; @@ -15822,7 +16128,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, } if (DestReg) { Res.first = DestReg; - Res.second = X86::GR32RegisterClass; + Res.second = &X86::GR32RegClass; } } else if (VT == MVT::i64) { unsigned DestReg = 0; @@ -15839,22 +16145,25 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, } if (DestReg) { Res.first = DestReg; - Res.second = X86::GR64RegisterClass; + Res.second = &X86::GR64RegClass; } } - } else if (Res.second == X86::FR32RegisterClass || - Res.second == X86::FR64RegisterClass || - Res.second == X86::VR128RegisterClass) { + } else if (Res.second == &X86::FR32RegClass || + Res.second == &X86::FR64RegClass || + Res.second == &X86::VR128RegClass) { // Handle references to XMM physical registers that got mapped into the // wrong class. 
This can happen with constraints like {xmm0} where the // target independent register mapper will just pick the first match it can // find, ignoring the required type. - if (VT == MVT::f32) - Res.second = X86::FR32RegisterClass; - else if (VT == MVT::f64) - Res.second = X86::FR64RegisterClass; - else if (X86::VR128RegisterClass->hasType(VT)) - Res.second = X86::VR128RegisterClass; + + if (VT == MVT::f32 || VT == MVT::i32) + Res.second = &X86::FR32RegClass; + else if (VT == MVT::f64 || VT == MVT::i64) + Res.second = &X86::FR64RegClass; + else if (X86::VR128RegClass.hasType(VT)) + Res.second = &X86::VR128RegClass; + else if (X86::VR256RegClass.hasType(VT)) + Res.second = &X86::VR256RegClass; } return Res; diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 09116e8..78e4d75 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -207,6 +207,10 @@ namespace llvm { // TLSADDR - Thread Local Storage. TLSADDR, + // TLSBASEADDR - Thread Local Storage. A call to get the start address + // of the TLS block for the current module. + TLSBASEADDR, + // TLSCALL - Thread Local Storage. When calling to an OS provided // thunk at the address from an earlier relocation. TLSCALL, @@ -242,9 +246,6 @@ namespace llvm { // PCMP* - Vector integer comparisons. PCMPEQ, PCMPGT, - // VPCOM, VPCOMU - XOP Vector integer comparisons. - VPCOM, VPCOMU, - // ADD, SUB, SMUL, etc. - Arithmetic operations with FLAGS results. ADD, SUB, ADC, SBB, SMUL, INC, DEC, OR, XOR, AND, @@ -315,6 +316,15 @@ namespace llvm { SFENCE, LFENCE, + // FNSTSW16r - Store FP status word into i16 register. + FNSTSW16r, + + // SAHF - Store contents of %ah into %eflags. + SAHF, + + // RDRAND - Get a random integer and indicate whether it is valid in CF. + RDRAND, + // ATOMADD64_DAG, ATOMSUB64_DAG, ATOMOR64_DAG, ATOMAND64_DAG, // ATOMXOR64_DAG, ATOMNAND64_DAG, ATOMSWAP64_DAG - // Atomic 64-bit binary operations. @@ -558,6 +568,18 @@ namespace llvm { /// by AM is legal for this target, for a load/store of the specified type. virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty)const; + /// isLegalICmpImmediate - Return true if the specified immediate is legal + /// icmp immediate, that is the target has icmp instructions which can + /// compare a register against the immediate without having to materialize + /// the immediate into a register. + virtual bool isLegalICmpImmediate(int64_t Imm) const; + + /// isLegalAddImmediate - Return true if the specified immediate is legal + /// add immediate, that is the target has add instructions which can + /// add a register and the immediate without having to materialize + /// the immediate into a register. + virtual bool isLegalAddImmediate(int64_t Imm) const; + /// isTruncateFree - Return true if it's free to truncate a value of /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in /// register EAX to i16 by referencing its sub-register AX. 
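With the re-typing above, an operand constrained to an SSE register is now classed by its value type, including the new VR256 case for 256-bit values, instead of being left in a mismatched 128-bit class. For example (assumes AVX and GCC/Clang inline asm; a sketch, not LLVM's own test):

#include <immintrin.h>

// A 256-bit operand in an "x" constraint should map to VR256 (a ymm
// register) rather than VR128 once the fix above is in place.
static __m256 sqrtYmm(__m256 v) {
  __m256 r;
  asm("vsqrtps %1, %0" : "=x"(r) : "x"(v));
  return r;
}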
@@ -761,6 +783,7 @@ namespace llvm { SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const; @@ -797,12 +820,7 @@ namespace llvm { DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; virtual SDValue - LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, - bool isVarArg, bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const; virtual SDValue @@ -822,9 +840,9 @@ namespace llvm { virtual bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, - bool isVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const; + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext &Context) const; void ReplaceATOMIC_BINARY_64(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG, unsigned NewOp) const; @@ -909,6 +927,9 @@ namespace llvm { /// equivalent, for use with the given x86 condition code. SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, SelectionDAG &DAG) const; + + /// Convert a comparison if required by the subtarget. + SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const; }; namespace X86 { diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index 0eee083..b6ba68f 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -1143,7 +1143,9 @@ let Uses = [EFLAGS] in { 0, 0>; } +let isCompare = 1 in { defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>; +} //===----------------------------------------------------------------------===// @@ -1154,7 +1156,7 @@ defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>; def X86testpat : PatFrag<(ops node:$lhs, node:$rhs), (X86cmp (and_su node:$lhs, node:$rhs), 0)>; -let Defs = [EFLAGS] in { +let isCompare = 1, Defs = [EFLAGS] in { let isCommutable = 1 in { def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , X86testpat, MRMSrcReg>; def TEST16rr : BinOpRR_F<0x84, "test", Xi16, X86testpat, MRMSrcReg>; diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h index fa1d676..aaef4a4 100644 --- a/lib/Target/X86/X86InstrBuilder.h +++ b/lib/Target/X86/X86InstrBuilder.h @@ -55,11 +55,11 @@ struct X86AddressMode { : BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(0), GVOpFlags(0) { Base.Reg = 0; } - - + + void getFullAddress(SmallVectorImpl<MachineOperand> &MO) { assert(Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8); - + if (BaseType == X86AddressMode::RegBase) MO.push_back(MachineOperand::CreateReg(Base.Reg, false, false, false, false, false, 0, false)); @@ -67,16 +67,16 @@ struct X86AddressMode { assert(BaseType == X86AddressMode::FrameIndexBase); MO.push_back(MachineOperand::CreateFI(Base.FrameIndex)); } - + MO.push_back(MachineOperand::CreateImm(Scale)); MO.push_back(MachineOperand::CreateReg(IndexReg, false, false, false, false, false, 0, false)); - + if (GV) 
MO.push_back(MachineOperand::CreateGA(GV, Disp, GVOpFlags)); else MO.push_back(MachineOperand::CreateImm(Disp)); - + MO.push_back(MachineOperand::CreateReg(0, false, false, false, false, false, 0, false)); } @@ -122,7 +122,7 @@ static inline const MachineInstrBuilder & addFullAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM) { assert(AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8); - + if (AM.BaseType == X86AddressMode::RegBase) MIB.addReg(AM.Base.Reg); else { @@ -135,7 +135,7 @@ addFullAddress(const MachineInstrBuilder &MIB, MIB.addGlobalAddress(AM.GV, AM.Disp, AM.GVOpFlags); else MIB.addImm(AM.Disp); - + return MIB.addReg(0); } diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 6f9e849..99c2b8f 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -375,11 +375,16 @@ let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], - Uses = [ESP] in + Uses = [ESP] in { def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), "# TLS_addr32", [(X86tlsaddr tls32addr:$sym)]>, Requires<[In32BitMode]>; +def TLS_base_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), + "# TLS_base_addr32", + [(X86tlsbaseaddr tls32baseaddr:$sym)]>, + Requires<[In32BitMode]>; +} // All calls clobber the non-callee saved registers. RSP is marked as // a use to prevent stack-pointer assignments that appear immediately @@ -389,11 +394,16 @@ let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], - Uses = [RSP] in + Uses = [RSP] in { def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), "# TLS_addr64", [(X86tlsaddr tls64addr:$sym)]>, Requires<[In64BitMode]>; +def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), + "# TLS_base_addr64", + [(X86tlsbaseaddr tls64baseaddr:$sym)]>, + Requires<[In64BitMode]>; +} // Darwin TLS Support // For i386, the address of the thunk is passed on the stack, on return the @@ -1008,8 +1018,8 @@ def : Pat<(X86call (i64 texternalsym:$dst)), (CALL64pcrel32 texternalsym:$dst)>; // tailcall stuff -def : Pat<(X86tcret GR32_TC:$dst, imm:$off), - (TCRETURNri GR32_TC:$dst, imm:$off)>, +def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), + (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>, Requires<[In32BitMode]>; // FIXME: This is disabled for 32-bit PIC mode because the global base @@ -1623,6 +1633,12 @@ def : Pat<(sub GR16:$src1, i16immSExt8:$src2), def : Pat<(sub GR32:$src1, i32immSExt8:$src2), (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>; +// sub 0, reg +def : Pat<(X86sub_flag 0, GR8 :$src), (NEG8r GR8 :$src)>; +def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>; +def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>; +def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>; + // mul reg, reg def : Pat<(mul GR16:$src1, GR16:$src2), (IMUL16rr GR16:$src1, GR16:$src2)>; diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td index bf11fde..b0c27c8 100644 --- a/lib/Target/X86/X86InstrControl.td +++ b/lib/Target/X86/X86InstrControl.td @@ -18,16 +18,16 @@ // Return instructions. 
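The X86InstrBuilder.h cleanup above touches X86AddressMode::getFullAddress(), which always pushes the full five-operand x86 memory reference: base, scale, index, displacement, and segment, denoting base + scale * index + disp. The arithmetic, as a small sketch (hypothetical function, segment omitted):

#include <cstdint>

// The effective address the five pushed MachineOperands describe.
static uint64_t effectiveAddress(uint64_t base, unsigned scale,
                                 uint64_t index, int32_t disp) {
  // Scale is restricted to 1, 2, 4, or 8, as the asserts above enforce.
  return base + uint64_t(scale) * index + uint64_t(int64_t(disp));
}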
let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1, FPForm = SpecialFP in { - def RET : I <0xC3, RawFrm, (outs), (ins variable_ops), + def RET : I <0xC3, RawFrm, (outs), (ins), "ret", [(X86retflag 0)], IIC_RET>; - def RETW : I <0xC3, RawFrm, (outs), (ins variable_ops), + def RETW : I <0xC3, RawFrm, (outs), (ins), "ret{w}", [], IIC_RET>, OpSize; - def RETI : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops), + def RETI : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt), "ret\t$amt", [(X86retflag timm:$amt)], IIC_RET_IMM>; - def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops), + def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt), "ret{w}\t$amt", [], IIC_RET_IMM>, OpSize; def LRETL : I <0xCB, RawFrm, (outs), (ins), @@ -148,12 +148,12 @@ let isCall = 1 in // registers are added manually. let Uses = [ESP] in { def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm, - (outs), (ins i32imm_pcrel:$dst,variable_ops), + (outs), (ins i32imm_pcrel:$dst), "call{l}\t$dst", [], IIC_CALL_RI>, Requires<[In32BitMode]>; - def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst, variable_ops), + def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst), "call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>, Requires<[In32BitMode]>; - def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst, variable_ops), + def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst), "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))], IIC_CALL_MEM>, Requires<[In32BitMode]>; @@ -174,7 +174,7 @@ let isCall = 1 in // callw for 16 bit code for the assembler. let isAsmParserOnly = 1 in def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm, - (outs), (ins i16imm_pcrel:$dst, variable_ops), + (outs), (ins i16imm_pcrel:$dst), "callw\t$dst", []>, OpSize; } @@ -185,23 +185,23 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, isCodeGenOnly = 1 in let Uses = [ESP] in { def TCRETURNdi : PseudoI<(outs), - (ins i32imm_pcrel:$dst, i32imm:$offset, variable_ops), []>; + (ins i32imm_pcrel:$dst, i32imm:$offset), []>; def TCRETURNri : PseudoI<(outs), - (ins GR32_TC:$dst, i32imm:$offset, variable_ops), []>; + (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>; let mayLoad = 1 in def TCRETURNmi : PseudoI<(outs), - (ins i32mem_TC:$dst, i32imm:$offset, variable_ops), []>; + (ins i32mem_TC:$dst, i32imm:$offset), []>; // FIXME: The should be pseudo instructions that are lowered when going to // mcinst. def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs), - (ins i32imm_pcrel:$dst, variable_ops), + (ins i32imm_pcrel:$dst), "jmp\t$dst # TAILCALL", [], IIC_JMP_REL>; - def TAILJMPr : I<0xFF, MRM4r, (outs), (ins GR32_TC:$dst, variable_ops), + def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), "", [], IIC_JMP_REG>; // FIXME: Remove encoding when JIT is dead. let mayLoad = 1 in - def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst, variable_ops), + def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst), "jmp{l}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>; } @@ -218,14 +218,14 @@ let isCall = 1, Uses = [RSP] in { // that the offset between an arbitrary immediate and the call will fit in // the 32-bit pcrel field that we have. 
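// In other words, the direct form below encodes a signed 32-bit
// displacement ("call rel32", opcode E8), which reaches +/-2GiB from the
// end of the instruction; that assumption holds under the small and kernel
// code models. Code models that cannot guarantee it emit an indirect call
// instead, for example:
//   movabsq $callee, %rax
//   callq   *%rax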
def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm, - (outs), (ins i64i32imm_pcrel:$dst, variable_ops), + (outs), (ins i64i32imm_pcrel:$dst), "call{q}\t$dst", [], IIC_CALL_RI>, Requires<[In64BitMode]>; - def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst, variable_ops), + def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst), "call{q}\t{*}$dst", [(X86call GR64:$dst)], IIC_CALL_RI>, Requires<[In64BitMode]>; - def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst, variable_ops), + def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst), "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))], IIC_CALL_MEM>, Requires<[In64BitMode]>; @@ -240,7 +240,7 @@ let isCall = 1, isCodeGenOnly = 1 in let Defs = [RAX, R10, R11, RSP, EFLAGS], Uses = [RSP] in { def W64ALLOCA : Ii32PCRel<0xE8, RawFrm, - (outs), (ins i64i32imm_pcrel:$dst, variable_ops), + (outs), (ins i64i32imm_pcrel:$dst), "call{q}\t$dst", [], IIC_CALL_RI>, Requires<[IsWin64]>; } @@ -250,21 +250,21 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, let Uses = [RSP], usesCustomInserter = 1 in { def TCRETURNdi64 : PseudoI<(outs), - (ins i64i32imm_pcrel:$dst, i32imm:$offset, variable_ops), + (ins i64i32imm_pcrel:$dst, i32imm:$offset), []>; def TCRETURNri64 : PseudoI<(outs), - (ins ptr_rc_tailcall:$dst, i32imm:$offset, variable_ops), []>; + (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>; let mayLoad = 1 in def TCRETURNmi64 : PseudoI<(outs), - (ins i64mem_TC:$dst, i32imm:$offset, variable_ops), []>; + (ins i64mem_TC:$dst, i32imm:$offset), []>; def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), - (ins i64i32imm_pcrel:$dst, variable_ops), + (ins i64i32imm_pcrel:$dst), "jmp\t$dst # TAILCALL", [], IIC_JMP_REL>; - def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst, variable_ops), + def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), "jmp{q}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>; let mayLoad = 1 in - def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst, variable_ops), + def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst), "jmp{q}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>; } diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td index d57937b..8802a2e 100644 --- a/lib/Target/X86/X86InstrFMA.td +++ b/lib/Target/X86/X86InstrFMA.td @@ -15,83 +15,161 @@ // FMA3 - Intel 3 operand Fused Multiply-Add instructions //===----------------------------------------------------------------------===// +let Constraints = "$src1 = $dst" in { multiclass fma3p_rm<bits<8> opc, string OpcodeStr> { +let neverHasSideEffects = 1 in { def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>; + let mayLoad = 1 in def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f128mem:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + (ins VR128:$src1, VR128:$src2, f128mem:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>; def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + (ins VR256:$src1, VR256:$src2, VR256:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>; + let mayLoad = 1 in def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, f256mem:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, 
$dst|$dst, $src1, $src2}"), + (ins VR256:$src1, VR256:$src2, f256mem:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>; +} // neverHasSideEffects = 1 +} + +// Intrinsic for 132 pattern +multiclass fma3p_rm_int<bits<8> opc, string OpcodeStr, + PatFrag MemFrag128, PatFrag MemFrag256, + Intrinsic Int128, Intrinsic Int256> { + def r_Int : FMA3<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src3, VR128:$src2))]>; + def m_Int : FMA3<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, f128mem:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, + (Int128 VR128:$src1, (MemFrag128 addr:$src3), VR128:$src2))]>; + def rY_Int : FMA3<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src3, VR256:$src2))]>; + def mY_Int : FMA3<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, f256mem:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR256:$dst, + (Int256 VR256:$src1, (MemFrag256 addr:$src3), VR256:$src2))]>; } +} // Constraints = "$src1 = $dst" multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, - string OpcodeStr, string PackTy> { - defm r132 : fma3p_rm<opc132, !strconcat(OpcodeStr, !strconcat("132", PackTy))>; - defm r213 : fma3p_rm<opc213, !strconcat(OpcodeStr, !strconcat("213", PackTy))>; - defm r231 : fma3p_rm<opc231, !strconcat(OpcodeStr, !strconcat("231", PackTy))>; + string OpcodeStr, string PackTy, + PatFrag MemFrag128, PatFrag MemFrag256, + Intrinsic Int128, Intrinsic Int256> { + defm r132 : fma3p_rm_int <opc132, !strconcat(OpcodeStr, + !strconcat("132", PackTy)), MemFrag128, MemFrag256, + Int128, Int256>; + defm r132 : fma3p_rm <opc132, !strconcat(OpcodeStr, !strconcat("132", PackTy))>; + defm r213 : fma3p_rm <opc213, !strconcat(OpcodeStr, !strconcat("213", PackTy))>; + defm r231 : fma3p_rm <opc231, !strconcat(OpcodeStr, !strconcat("231", PackTy))>; } // Fused Multiply-Add let ExeDomain = SSEPackedSingle in { - defm VFMADDPS : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps">; - defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps">; - defm VFMADDSUBPS : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps">; - defm VFMSUBADDPS : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps">; + defm VFMADDPS : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", memopv4f32, + memopv8f32, int_x86_fma_vfmadd_ps, int_x86_fma_vfmadd_ps_256>; + defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", memopv4f32, + memopv8f32, int_x86_fma_vfmsub_ps, int_x86_fma_vfmsub_ps_256>; + defm VFMADDSUBPS : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps", + memopv4f32, memopv8f32, int_x86_fma_vfmaddsub_ps, + int_x86_fma_vfmaddsub_ps_256>; + defm VFMSUBADDPS : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps", + memopv4f32, memopv8f32, int_x86_fma_vfmsubadd_ps, + int_x86_fma_vfmaddsub_ps_256>; } let ExeDomain = SSEPackedDouble in { - defm VFMADDPD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd">, VEX_W; - defm VFMSUBPD : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd">, VEX_W; - defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd">, VEX_W; - defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd">, VEX_W; + defm VFMADDPD : fma3p_forms<0x98, 0xA8, 
0xB8, "vfmadd", "pd", memopv2f64, + memopv4f64, int_x86_fma_vfmadd_pd, int_x86_fma_vfmadd_pd_256>, VEX_W; + defm VFMSUBPD : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", memopv2f64, + memopv4f64, int_x86_fma_vfmsub_pd, int_x86_fma_vfmsub_pd_256>, VEX_W; + defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd", memopv2f64, + memopv4f64, int_x86_fma_vfmaddsub_pd, int_x86_fma_vfmaddsub_pd_256>, VEX_W; + defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd", memopv2f64, + memopv4f64, int_x86_fma_vfmsubadd_pd, int_x86_fma_vfmsubadd_pd_256>, VEX_W; } // Fused Negative Multiply-Add let ExeDomain = SSEPackedSingle in { - defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps">; - defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps">; + defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", memopv4f32, + memopv8f32, int_x86_fma_vfnmadd_ps, int_x86_fma_vfnmadd_ps_256>; + defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", memopv4f32, + memopv8f32, int_x86_fma_vfnmsub_ps, int_x86_fma_vfnmsub_ps_256>; } let ExeDomain = SSEPackedDouble in { - defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd">, VEX_W; - defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd">, VEX_W; + defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", memopv2f64, + memopv4f64, int_x86_fma_vfnmadd_pd, int_x86_fma_vfnmadd_pd_256>, VEX_W; + defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd", memopv2f64, + memopv4f64, int_x86_fma_vfnmsub_pd, int_x86_fma_vfnmsub_pd_256>, VEX_W; } -multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop> { - def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + +let Constraints = "$src1 = $dst" in { +multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop, + RegisterClass RC> { +let neverHasSideEffects = 1 in { + def r : FMA3<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>; - def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, x86memop:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + let mayLoad = 1 in + def m : FMA3<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, x86memop:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>; +} // neverHasSideEffects = 1 } +multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr, Operand memop, + ComplexPattern mem_cpat, Intrinsic IntId> { + def r_Int : FMA3<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, (IntId VR128:$src1, VR128:$src3, VR128:$src2))]>; + def m_Int : FMA3<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, memop:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, + (IntId VR128:$src1, mem_cpat:$src3, VR128:$src2))]>; +} +} // Constraints = "$src1 = $dst" + multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, - string OpcodeStr> { - defm SSr132 : fma3s_rm<opc132, !strconcat(OpcodeStr, "132ss"), f32mem>; - defm SSr213 : fma3s_rm<opc213, !strconcat(OpcodeStr, "213ss"), f32mem>; - defm SSr231 : fma3s_rm<opc231, !strconcat(OpcodeStr, "231ss"), f32mem>; - defm SDr132 : fma3s_rm<opc132, !strconcat(OpcodeStr, "132sd"), f64mem>, VEX_W; - defm 
SDr213 : fma3s_rm<opc213, !strconcat(OpcodeStr, "213sd"), f64mem>, VEX_W; - defm SDr231 : fma3s_rm<opc231, !strconcat(OpcodeStr, "231sd"), f64mem>, VEX_W; + string OpStr, Intrinsic IntF32, Intrinsic IntF64> { + defm SSr132 : fma3s_rm<opc132, !strconcat(OpStr, "132ss"), f32mem, FR32>; + defm SSr213 : fma3s_rm<opc213, !strconcat(OpStr, "213ss"), f32mem, FR32>; + defm SSr231 : fma3s_rm<opc231, !strconcat(OpStr, "231ss"), f32mem, FR32>; + defm SDr132 : fma3s_rm<opc132, !strconcat(OpStr, "132sd"), f64mem, FR64>, VEX_W; + defm SDr213 : fma3s_rm<opc213, !strconcat(OpStr, "213sd"), f64mem, FR64>, VEX_W; + defm SDr231 : fma3s_rm<opc231, !strconcat(OpStr, "231sd"), f64mem, FR64>, VEX_W; + defm SSr132 : fma3s_rm_int <opc132, !strconcat(OpStr, "132ss"), ssmem, + sse_load_f32, IntF32>; + defm SDr132 : fma3s_rm_int <opc132, !strconcat(OpStr, "132sd"), sdmem, + sse_load_f64, IntF64>; } -defm VFMADD : fma3s_forms<0x99, 0xA9, 0xB9, "vfmadd">, VEX_LIG; -defm VFMSUB : fma3s_forms<0x9B, 0xAB, 0xBB, "vfmsub">, VEX_LIG; +defm VFMADD : fma3s_forms<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss, + int_x86_fma_vfmadd_sd>, VEX_LIG; +defm VFMSUB : fma3s_forms<0x9B, 0xAB, 0xBB, "vfmsub", int_x86_fma_vfmsub_ss, + int_x86_fma_vfmsub_sd>, VEX_LIG; + +defm VFNMADD : fma3s_forms<0x9D, 0xAD, 0xBD, "vfnmadd", int_x86_fma_vfnmadd_ss, + int_x86_fma_vfnmadd_sd>, VEX_LIG; +defm VFNMSUB : fma3s_forms<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss, + int_x86_fma_vfnmsub_sd>, VEX_LIG; -defm VFNMADD : fma3s_forms<0x9D, 0xAD, 0xBD, "vfnmadd">, VEX_LIG; -defm VFNMSUB : fma3s_forms<0x9F, 0xAF, 0xBF, "vfnmsub">, VEX_LIG; //===----------------------------------------------------------------------===// // FMA4 - AMD 4 operand Fused Multiply-Add instructions @@ -178,43 +256,47 @@ let isCodeGenOnly = 1 in { } // isCodeGenOnly = 1 } +let Predicates = [HasFMA4] in { + defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", ssmem, sse_load_f32, - int_x86_fma4_vfmadd_ss>; + int_x86_fma_vfmadd_ss>; defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", sdmem, sse_load_f64, - int_x86_fma4_vfmadd_sd>; -defm VFMADDPS4 : fma4p<0x68, "vfmaddps", int_x86_fma4_vfmadd_ps, - int_x86_fma4_vfmadd_ps_256, memopv4f32, memopv8f32>; -defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", int_x86_fma4_vfmadd_pd, - int_x86_fma4_vfmadd_pd_256, memopv2f64, memopv4f64>; + int_x86_fma_vfmadd_sd>; +defm VFMADDPS4 : fma4p<0x68, "vfmaddps", int_x86_fma_vfmadd_ps, + int_x86_fma_vfmadd_ps_256, memopv4f32, memopv8f32>; +defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", int_x86_fma_vfmadd_pd, + int_x86_fma_vfmadd_pd_256, memopv2f64, memopv4f64>; defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", ssmem, sse_load_f32, - int_x86_fma4_vfmsub_ss>; + int_x86_fma_vfmsub_ss>; defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", sdmem, sse_load_f64, - int_x86_fma4_vfmsub_sd>; -defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", int_x86_fma4_vfmsub_ps, - int_x86_fma4_vfmsub_ps_256, memopv4f32, memopv8f32>; -defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", int_x86_fma4_vfmsub_pd, - int_x86_fma4_vfmsub_pd_256, memopv2f64, memopv4f64>; + int_x86_fma_vfmsub_sd>; +defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", int_x86_fma_vfmsub_ps, + int_x86_fma_vfmsub_ps_256, memopv4f32, memopv8f32>; +defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", int_x86_fma_vfmsub_pd, + int_x86_fma_vfmsub_pd_256, memopv2f64, memopv4f64>; defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", ssmem, sse_load_f32, - int_x86_fma4_vfnmadd_ss>; + int_x86_fma_vfnmadd_ss>; defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", sdmem, sse_load_f64, - int_x86_fma4_vfnmadd_sd>; -defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", 
int_x86_fma4_vfnmadd_ps, - int_x86_fma4_vfnmadd_ps_256, memopv4f32, memopv8f32>; -defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", int_x86_fma4_vfnmadd_pd, - int_x86_fma4_vfnmadd_pd_256, memopv2f64, memopv4f64>; + int_x86_fma_vfnmadd_sd>; +defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", int_x86_fma_vfnmadd_ps, + int_x86_fma_vfnmadd_ps_256, memopv4f32, memopv8f32>; +defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", int_x86_fma_vfnmadd_pd, + int_x86_fma_vfnmadd_pd_256, memopv2f64, memopv4f64>; defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", ssmem, sse_load_f32, - int_x86_fma4_vfnmsub_ss>; + int_x86_fma_vfnmsub_ss>; defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", sdmem, sse_load_f64, - int_x86_fma4_vfnmsub_sd>; -defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", int_x86_fma4_vfnmsub_ps, - int_x86_fma4_vfnmsub_ps_256, memopv4f32, memopv8f32>; -defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", int_x86_fma4_vfnmsub_pd, - int_x86_fma4_vfnmsub_pd_256, memopv2f64, memopv4f64>; -defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", int_x86_fma4_vfmaddsub_ps, - int_x86_fma4_vfmaddsub_ps_256, memopv4f32, memopv8f32>; -defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", int_x86_fma4_vfmaddsub_pd, - int_x86_fma4_vfmaddsub_pd_256, memopv2f64, memopv4f64>; -defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", int_x86_fma4_vfmsubadd_ps, - int_x86_fma4_vfmsubadd_ps_256, memopv4f32, memopv8f32>; -defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", int_x86_fma4_vfmsubadd_pd, - int_x86_fma4_vfmsubadd_pd_256, memopv2f64, memopv4f64>; + int_x86_fma_vfnmsub_sd>; +defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", int_x86_fma_vfnmsub_ps, + int_x86_fma_vfnmsub_ps_256, memopv4f32, memopv8f32>; +defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", int_x86_fma_vfnmsub_pd, + int_x86_fma_vfnmsub_pd_256, memopv2f64, memopv4f64>; +defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", int_x86_fma_vfmaddsub_ps, + int_x86_fma_vfmaddsub_ps_256, memopv4f32, memopv8f32>; +defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", int_x86_fma_vfmaddsub_pd, + int_x86_fma_vfmaddsub_pd_256, memopv2f64, memopv4f64>; +defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", int_x86_fma_vfmsubadd_ps, + int_x86_fma_vfmsubadd_ps_256, memopv4f32, memopv8f32>; +defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", int_x86_fma_vfmsubadd_pd, + int_x86_fma_vfmsubadd_pd_256, memopv2f64, memopv4f64>; +} // HasFMA4 + diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td index a13887e..568726e 100644 --- a/lib/Target/X86/X86InstrFPStack.td +++ b/lib/Target/X86/X86InstrFPStack.td @@ -27,6 +27,7 @@ def SDTX86Fst : SDTypeProfile<0, 3, [SDTCisFP<0>, SDTCisVT<2, OtherVT>]>; def SDTX86Fild : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>]>; +def SDTX86Fnstsw : SDTypeProfile<1, 1, [SDTCisVT<0, i16>, SDTCisVT<1, i16>]>; def SDTX86FpToIMem : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>; def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; @@ -41,6 +42,7 @@ def X86fild : SDNode<"X86ISD::FILD", SDTX86Fild, def X86fildflag : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild, [SDNPHasChain, SDNPOutGlue, SDNPMayLoad, SDNPMemOperand]>; +def X86fp_stsw : SDNode<"X86ISD::FNSTSW16r", SDTX86Fnstsw>; def X86fp_to_i16mem : SDNode<"X86ISD::FP_TO_INT16_IN_MEM", SDTX86FpToIMem, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; def X86fp_to_i32mem : SDNode<"X86ISD::FP_TO_INT32_IN_MEM", SDTX86FpToIMem, @@ -203,6 +205,7 @@ def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src), } } +let Defs = [FPSW] in { defm ADD : FPBinary_rr<fadd>; defm SUB : FPBinary_rr<fsub>; defm MUL : FPBinary_rr<fmul>; @@ -213,6 +216,7 @@ defm SUBR: FPBinary<fsub 
,MRM5m, "subr">; defm MUL : FPBinary<fmul, MRM1m, "mul">; defm DIV : FPBinary<fdiv, MRM6m, "div">; defm DIVR: FPBinary<fdiv, MRM7m, "divr">; +} class FPST0rInst<bits<8> o, string asm> : FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, D8; @@ -257,6 +261,7 @@ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW, def _F : FPI<opcode, RawFrm, (outs), (ins), asmstring>, D9; } +let Defs = [FPSW] in { defm CHS : FPUnary<fneg, 0xE0, "fchs">; defm ABS : FPUnary<fabs, 0xE1, "fabs">; defm SQRT: FPUnary<fsqrt,0xFA, "fsqrt">; @@ -269,6 +274,7 @@ def TST_Fp64 : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>; def TST_Fp80 : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>; } def TST_F : FPI<0xE4, RawFrm, (outs), (ins), "ftst">, D9; +} // Defs = [FPSW] // Versions of FP instructions that take a single memory operand. Added for the // disassembler; remove as they are included with patterns elsewhere. @@ -316,6 +322,7 @@ multiclass FPCMov<PatLeaf cc> { Requires<[HasCMov]>; } +let Defs = [FPSW] in { let Uses = [EFLAGS], Constraints = "$src1 = $dst" in { defm CMOVB : FPCMov<X86_COND_B>; defm CMOVBE : FPCMov<X86_COND_BE>; @@ -416,24 +423,40 @@ def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>; } let mayLoad = 1 in { -def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src">; -def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src">; -def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src">; -def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src">; -def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src">; -def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src">; +def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src", + IIC_FLD>; +def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src", + IIC_FLD>; +def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src", + IIC_FLD80>; +def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src", + IIC_FILD>; +def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src", + IIC_FILD>; +def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src", + IIC_FILD>; } let mayStore = 1 in { -def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst">; -def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst">; -def ST_FP32m : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst">; -def ST_FP64m : FPI<0xDD, MRM3m, (outs), (ins f64mem:$dst), "fstp{l}\t$dst">; -def ST_FP80m : FPI<0xDB, MRM7m, (outs), (ins f80mem:$dst), "fstp{t}\t$dst">; -def IST_F16m : FPI<0xDF, MRM2m, (outs), (ins i16mem:$dst), "fist{s}\t$dst">; -def IST_F32m : FPI<0xDB, MRM2m, (outs), (ins i32mem:$dst), "fist{l}\t$dst">; -def IST_FP16m : FPI<0xDF, MRM3m, (outs), (ins i16mem:$dst), "fistp{s}\t$dst">; -def IST_FP32m : FPI<0xDB, MRM3m, (outs), (ins i32mem:$dst), "fistp{l}\t$dst">; -def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst">; +def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst", + IIC_FST>; +def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst", + IIC_FST>; +def ST_FP32m : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst", + IIC_FST>; +def ST_FP64m : FPI<0xDD, MRM3m, (outs), (ins f64mem:$dst), "fstp{l}\t$dst", + IIC_FST>; +def ST_FP80m : FPI<0xDB, MRM7m, (outs), (ins f80mem:$dst), "fstp{t}\t$dst", + IIC_FST80>; +def IST_F16m : FPI<0xDF, MRM2m, (outs), (ins i16mem:$dst), 
"fist{s}\t$dst", + IIC_FIST>; +def IST_F32m : FPI<0xDB, MRM2m, (outs), (ins i32mem:$dst), "fist{l}\t$dst", + IIC_FIST>; +def IST_FP16m : FPI<0xDF, MRM3m, (outs), (ins i16mem:$dst), "fistp{s}\t$dst", + IIC_FIST>; +def IST_FP32m : FPI<0xDB, MRM3m, (outs), (ins i32mem:$dst), "fistp{l}\t$dst", + IIC_FIST>; +def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst", + IIC_FIST>; } // FISTTP requires SSE3 even though it's a FPStack op. @@ -459,17 +482,23 @@ def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, } // Predicates = [HasSSE3] let mayStore = 1 in { -def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst">; -def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst">; +def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst", + IIC_FST>; +def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst", + IIC_FST>; def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst), - "fisttp{ll}\t$dst">; + "fisttp{ll}\t$dst", IIC_FST>; } // FP Stack manipulation instructions. -def LD_Frr : FPI<0xC0, AddRegFrm, (outs), (ins RST:$op), "fld\t$op">, D9; -def ST_Frr : FPI<0xD0, AddRegFrm, (outs), (ins RST:$op), "fst\t$op">, DD; -def ST_FPrr : FPI<0xD8, AddRegFrm, (outs), (ins RST:$op), "fstp\t$op">, DD; -def XCH_F : FPI<0xC8, AddRegFrm, (outs), (ins RST:$op), "fxch\t$op">, D9; +def LD_Frr : FPI<0xC0, AddRegFrm, (outs), (ins RST:$op), "fld\t$op", + IIC_FLD>, D9; +def ST_Frr : FPI<0xD0, AddRegFrm, (outs), (ins RST:$op), "fst\t$op", + IIC_FST>, DD; +def ST_FPrr : FPI<0xD8, AddRegFrm, (outs), (ins RST:$op), "fstp\t$op", + IIC_FST>, DD; +def XCH_F : FPI<0xC8, AddRegFrm, (outs), (ins RST:$op), "fxch\t$op", + IIC_FXCH>, D9; // Floating point constant loads. let isReMaterializable = 1 in { @@ -487,20 +516,21 @@ def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP, [(set RFP80:$dst, fpimm1)]>; } -def LD_F0 : FPI<0xEE, RawFrm, (outs), (ins), "fldz">, D9; -def LD_F1 : FPI<0xE8, RawFrm, (outs), (ins), "fld1">, D9; +def LD_F0 : FPI<0xEE, RawFrm, (outs), (ins), "fldz", IIC_FLDZ>, D9; +def LD_F1 : FPI<0xE8, RawFrm, (outs), (ins), "fld1", IIC_FIST>, D9; // Floating point compares. 
-let Defs = [EFLAGS] in { def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, - []>; // FPSW = cmp ST(0) with ST(i) + [(set FPSW, (trunc (X86cmp RFP32:$lhs, RFP32:$rhs)))]>; def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, - []>; // FPSW = cmp ST(0) with ST(i) + [(set FPSW, (trunc (X86cmp RFP64:$lhs, RFP64:$rhs)))]>; def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, - []>; // FPSW = cmp ST(0) with ST(i) - + [(set FPSW, (trunc (X86cmp RFP80:$lhs, RFP80:$rhs)))]>; +} // Defs = [FPSW] + // CC = ST(0) cmp ST(i) +let Defs = [EFLAGS, FPSW] in { def UCOM_FpIr32: FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, [(set EFLAGS, (X86cmp RFP32:$lhs, RFP32:$rhs))]>; def UCOM_FpIr64: FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, @@ -509,85 +539,94 @@ def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, [(set EFLAGS, (X86cmp RFP80:$lhs, RFP80:$rhs))]>; } -let Defs = [EFLAGS], Uses = [ST0] in { +let Defs = [FPSW], Uses = [ST0] in { def UCOM_Fr : FPI<0xE0, AddRegFrm, // FPSW = cmp ST(0) with ST(i) (outs), (ins RST:$reg), - "fucom\t$reg">, DD; + "fucom\t$reg", IIC_FUCOM>, DD; def UCOM_FPr : FPI<0xE8, AddRegFrm, // FPSW = cmp ST(0) with ST(i), pop (outs), (ins RST:$reg), - "fucomp\t$reg">, DD; + "fucomp\t$reg", IIC_FUCOM>, DD; def UCOM_FPPr : FPI<0xE9, RawFrm, // cmp ST(0) with ST(1), pop, pop (outs), (ins), - "fucompp">, DA; + "fucompp", IIC_FUCOM>, DA; +} +let Defs = [EFLAGS, FPSW], Uses = [ST0] in { def UCOM_FIr : FPI<0xE8, AddRegFrm, // CC = cmp ST(0) with ST(i) (outs), (ins RST:$reg), - "fucomi\t$reg">, DB; + "fucomi\t$reg", IIC_FUCOMI>, DB; def UCOM_FIPr : FPI<0xE8, AddRegFrm, // CC = cmp ST(0) with ST(i), pop (outs), (ins RST:$reg), - "fucompi\t$reg">, DF; + "fucompi\t$reg", IIC_FUCOMI>, DF; } +let Defs = [EFLAGS, FPSW] in { def COM_FIr : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg), - "fcomi\t$reg">, DB; + "fcomi\t$reg", IIC_FCOMI>, DB; def COM_FIPr : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg), - "fcompi\t$reg">, DF; + "fcompi\t$reg", IIC_FCOMI>, DF; +} // Floating point flag ops. -let Defs = [AX] in -def FNSTSW8r : I<0xE0, RawFrm, // AX = fp flags - (outs), (ins), "fnstsw %ax", []>, DF; +let Defs = [AX], Uses = [FPSW] in +def FNSTSW16r : I<0xE0, RawFrm, // AX = fp flags + (outs), (ins), "fnstsw %ax", + [(set AX, (X86fp_stsw FPSW))], IIC_FNSTSW>, DF; def FNSTCW16m : I<0xD9, MRM7m, // [mem16] = X87 control world (outs), (ins i16mem:$dst), "fnstcw\t$dst", - [(X86fp_cwd_get16 addr:$dst)]>; + [(X86fp_cwd_get16 addr:$dst)], IIC_FNSTCW>; let mayLoad = 1 in def FLDCW16m : I<0xD9, MRM5m, // X87 control world = [mem16] - (outs), (ins i16mem:$dst), "fldcw\t$dst", []>; + (outs), (ins i16mem:$dst), "fldcw\t$dst", [], IIC_FLDCW>; // FPU control instructions -def FNINIT : I<0xE3, RawFrm, (outs), (ins), "fninit", []>, DB; +let Defs = [FPSW] in +def FNINIT : I<0xE3, RawFrm, (outs), (ins), "fninit", [], IIC_FNINIT>, DB; def FFREE : FPI<0xC0, AddRegFrm, (outs), (ins RST:$reg), - "ffree\t$reg">, DD; + "ffree\t$reg", IIC_FFREE>, DD; // Clear exceptions -def FNCLEX : I<0xE2, RawFrm, (outs), (ins), "fnclex", []>, DB; +let Defs = [FPSW] in +def FNCLEX : I<0xE2, RawFrm, (outs), (ins), "fnclex", [], IIC_FNCLEX>, DB; // Operandless floating-point instructions for the disassembler. 
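// (Two threads run through the x87 changes above and below. First, the
// compares now model their status-word write explicitly: the UCOM_Fp*
// pseudos set FPSW through an X86cmp pattern, and the X86fp_stsw node with
// its FNSTSW16r instruction copies FPSW into AX. That is presumably what
// lets subtargets lacking FCOMI select the classic sequence
//   fucom  %st(1)
//   fnstsw %ax
//   sahf            ; AH -> EFLAGS
// through ordinary patterns rather than custom lowering; compare
// ConvertCmpIfNecessary in X86ISelLowering.h earlier in this patch.
// Second, the sweep below attaches concrete InstrItinClass values such as
// IIC_WAIT and IIC_FNOP where IIC_DEFAULT was previously implied; the
// semantics are unchanged, the classes merely give subtarget itineraries
// (an in-order scheduling model, for instance) a hook for per-instruction
// latencies.)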
-def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", []>; - -def FNOP : I<0xD0, RawFrm, (outs), (ins), "fnop", []>, D9; -def FXAM : I<0xE5, RawFrm, (outs), (ins), "fxam", []>, D9; -def FLDL2T : I<0xE9, RawFrm, (outs), (ins), "fldl2t", []>, D9; -def FLDL2E : I<0xEA, RawFrm, (outs), (ins), "fldl2e", []>, D9; -def FLDPI : I<0xEB, RawFrm, (outs), (ins), "fldpi", []>, D9; -def FLDLG2 : I<0xEC, RawFrm, (outs), (ins), "fldlg2", []>, D9; -def FLDLN2 : I<0xED, RawFrm, (outs), (ins), "fldln2", []>, D9; -def F2XM1 : I<0xF0, RawFrm, (outs), (ins), "f2xm1", []>, D9; -def FYL2X : I<0xF1, RawFrm, (outs), (ins), "fyl2x", []>, D9; -def FPTAN : I<0xF2, RawFrm, (outs), (ins), "fptan", []>, D9; -def FPATAN : I<0xF3, RawFrm, (outs), (ins), "fpatan", []>, D9; -def FXTRACT : I<0xF4, RawFrm, (outs), (ins), "fxtract", []>, D9; -def FPREM1 : I<0xF5, RawFrm, (outs), (ins), "fprem1", []>, D9; -def FDECSTP : I<0xF6, RawFrm, (outs), (ins), "fdecstp", []>, D9; -def FINCSTP : I<0xF7, RawFrm, (outs), (ins), "fincstp", []>, D9; -def FPREM : I<0xF8, RawFrm, (outs), (ins), "fprem", []>, D9; -def FYL2XP1 : I<0xF9, RawFrm, (outs), (ins), "fyl2xp1", []>, D9; -def FSINCOS : I<0xFB, RawFrm, (outs), (ins), "fsincos", []>, D9; -def FRNDINT : I<0xFC, RawFrm, (outs), (ins), "frndint", []>, D9; -def FSCALE : I<0xFD, RawFrm, (outs), (ins), "fscale", []>, D9; -def FCOMPP : I<0xD9, RawFrm, (outs), (ins), "fcompp", []>, DE; +def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", [], IIC_WAIT>; + +def FNOP : I<0xD0, RawFrm, (outs), (ins), "fnop", [], IIC_FNOP>, D9; +def FXAM : I<0xE5, RawFrm, (outs), (ins), "fxam", [], IIC_FXAM>, D9; +def FLDL2T : I<0xE9, RawFrm, (outs), (ins), "fldl2t", [], IIC_FLDL>, D9; +def FLDL2E : I<0xEA, RawFrm, (outs), (ins), "fldl2e", [], IIC_FLDL>, D9; +def FLDPI : I<0xEB, RawFrm, (outs), (ins), "fldpi", [], IIC_FLDL>, D9; +def FLDLG2 : I<0xEC, RawFrm, (outs), (ins), "fldlg2", [], IIC_FLDL>, D9; +def FLDLN2 : I<0xED, RawFrm, (outs), (ins), "fldln2", [], IIC_FLDL>, D9; +def F2XM1 : I<0xF0, RawFrm, (outs), (ins), "f2xm1", [], IIC_F2XM1>, D9; +def FYL2X : I<0xF1, RawFrm, (outs), (ins), "fyl2x", [], IIC_FYL2X>, D9; +def FPTAN : I<0xF2, RawFrm, (outs), (ins), "fptan", [], IIC_FPTAN>, D9; +def FPATAN : I<0xF3, RawFrm, (outs), (ins), "fpatan", [], IIC_FPATAN>, D9; +def FXTRACT : I<0xF4, RawFrm, (outs), (ins), "fxtract", [], IIC_FXTRACT>, D9; +def FPREM1 : I<0xF5, RawFrm, (outs), (ins), "fprem1", [], IIC_FPREM1>, D9; +def FDECSTP : I<0xF6, RawFrm, (outs), (ins), "fdecstp", [], IIC_FPSTP>, D9; +def FINCSTP : I<0xF7, RawFrm, (outs), (ins), "fincstp", [], IIC_FPSTP>, D9; +def FPREM : I<0xF8, RawFrm, (outs), (ins), "fprem", [], IIC_FPREM>, D9; +def FYL2XP1 : I<0xF9, RawFrm, (outs), (ins), "fyl2xp1", [], IIC_FYL2XP1>, D9; +def FSINCOS : I<0xFB, RawFrm, (outs), (ins), "fsincos", [], IIC_FSINCOS>, D9; +def FRNDINT : I<0xFC, RawFrm, (outs), (ins), "frndint", [], IIC_FRNDINT>, D9; +def FSCALE : I<0xFD, RawFrm, (outs), (ins), "fscale", [], IIC_FSCALE>, D9; +def FCOMPP : I<0xD9, RawFrm, (outs), (ins), "fcompp", [], IIC_FCOMPP>, DE; def FXSAVE : I<0xAE, MRM0m, (outs opaque512mem:$dst), (ins), - "fxsave\t$dst", []>, TB; + "fxsave\t$dst", [], IIC_FXSAVE>, TB; def FXSAVE64 : I<0xAE, MRM0m, (outs opaque512mem:$dst), (ins), - "fxsaveq\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>; + "fxsaveq\t$dst", [], IIC_FXSAVE>, TB, REX_W, + Requires<[In64BitMode]>; def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src), - "fxrstor\t$src", []>, TB; + "fxrstor\t$src", [], IIC_FXRSTOR>, TB; def FXRSTOR64 : I<0xAE, MRM1m, (outs), (ins 
opaque512mem:$src), - "fxrstorq\t$src", []>, TB, REX_W, Requires<[In64BitMode]>; + "fxrstorq\t$src", [], IIC_FXRSTOR>, TB, REX_W, + Requires<[In64BitMode]>; //===----------------------------------------------------------------------===// // Non-Instruction Patterns diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index b387090..a115ab4 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -255,8 +255,9 @@ class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, // FPStack Instruction Templates: // FPI - Floating Point Instruction template. -class FPI<bits<8> o, Format F, dag outs, dag ins, string asm> - : I<o, F, outs, ins, asm, []> {} +class FPI<bits<8> o, Format F, dag outs, dag ins, string asm, + InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, asm, [], itin> {} // FpI_ - Floating Point Pseudo Instruction template. Not Predicated. class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern, @@ -365,6 +366,7 @@ class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm, // // SDI - SSE2 instructions with XD prefix. // SDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix. +// SSDI - SSE2 instructions with XS prefix. // SSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix. // PDI - SSE2 instructions with TB and OpSize prefixes. // PDIi8 - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes. @@ -377,8 +379,11 @@ class SDI<bits<8> o, Format F, dag outs, dag ins, string asm, class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>; +class SSDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE2]>; class SSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>; class PDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> @@ -503,29 +508,29 @@ class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern, InstrItinClass itin = IIC_DEFAULT> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, - Requires<[HasSSE2, HasAES]>; + Requires<[HasAES]>; class AESAI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, - Requires<[HasSSE2, HasAES]>; + Requires<[HasAES]>; -// CLMUL Instruction Templates -class CLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm, +// PCLMUL Instruction Templates +class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern, InstrItinClass itin = IIC_DEFAULT> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, - OpSize, Requires<[HasSSE2, HasCLMUL]>; + OpSize, Requires<[HasPCLMUL]>; -class AVXCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm, +class AVXPCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern, InstrItinClass itin = IIC_DEFAULT> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, - OpSize, VEX_4V, Requires<[HasAVX, HasCLMUL]>; + OpSize, VEX_4V, Requires<[HasAVX, HasPCLMUL]>; // FMA3 Instruction Templates class FMA3<bits<8> 
o, Format F, dag outs, dag ins, string asm, list<dag>pattern, InstrItinClass itin = IIC_DEFAULT> : I<o, F, outs, ins, asm, pattern, itin>, T8, - OpSize, VEX_4V, Requires<[HasFMA3]>; + OpSize, VEX_4V, Requires<[HasFMA]>; // FMA4 Instruction Templates class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm, diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 35801e4..ec030dd 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -71,9 +71,14 @@ def X86insrtps : SDNode<"X86ISD::INSERTPS", SDTCisVT<2, v4f32>, SDTCisPtrTy<3>]>>; def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL", SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>; + +def X86vzmovly : SDNode<"X86ISD::VZEXT_MOVL", + SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisOpSmallerThanOp<1, 0> ]>>; + def X86vsmovl : SDNode<"X86ISD::VSEXT_MOVL", SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisInt<1>, SDTCisInt<0>]>>; - + def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def X86vshldq : SDNode<"X86ISD::VSHLDQ", SDTIntShiftOp>; @@ -102,13 +107,6 @@ def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; -def X86vpcom : SDNode<"X86ISD::VPCOM", - SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>>; -def X86vpcomu : SDNode<"X86ISD::VPCOMU", - SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>>; - def X86pmuludq : SDNode<"X86ISD::PMULUDQ", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<1,2>]>>; @@ -304,7 +302,7 @@ def nontemporalstore : PatFrag<(ops node:$val, node:$ptr), }]>; def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), - (st node:$val, node:$ptr), [{ + (st node:$val, node:$ptr), [{ if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) return ST->isNonTemporal() && !ST->isTruncatingStore() && ST->getAddressingMode() == ISD::UNINDEXED && @@ -313,7 +311,7 @@ def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), }]>; def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), - (st node:$val, node:$ptr), [{ + (st node:$val, node:$ptr), [{ if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) return ST->isNonTemporal() && ST->getAlignment() < 16; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index b12c1db..69493bc 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -21,6 +21,7 @@ #include "llvm/LLVMContext.h" #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -54,38 +55,39 @@ ReMatPICStubLoad("remat-pic-stub-load", enum { // Select which memory operand is being unfolded. - // (stored in bits 0 - 7) + // (stored in bits 0 - 3) TB_INDEX_0 = 0, TB_INDEX_1 = 1, TB_INDEX_2 = 2, - TB_INDEX_MASK = 0xff, - - // Minimum alignment required for load/store. - // Used for RegOp->MemOp conversion. - // (stored in bits 8 - 15) - TB_ALIGN_SHIFT = 8, - TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT, - TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT, - TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT, - TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT, + TB_INDEX_3 = 3, + TB_INDEX_MASK = 0xf, // Do not insert the reverse map (MemOp -> RegOp) into the table. 
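// With the boolean flags repacked next to the index nibble and the
// alignment byte kept in bits 8-15, the whole descriptor now fits the
// uint16_t Flags field declared below. A minimal sketch of the decode,
// using only the masks defined in this enum:
//   unsigned Idx   = Flags & TB_INDEX_MASK;                     // bits 0-3
//   bool     NoRev = (Flags & TB_NO_REVERSE)  != 0;             // bit 4
//   bool     NoFwd = (Flags & TB_NO_FORWARD)  != 0;             // bit 5
//   bool     Load  = (Flags & TB_FOLDED_LOAD) != 0;             // bit 6
//   bool     Store = (Flags & TB_FOLDED_STORE) != 0;            // bit 7
//   unsigned Align = (Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT; // 0, 16, 32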
// This may be needed because there is a many -> one mapping. - TB_NO_REVERSE = 1 << 16, + TB_NO_REVERSE = 1 << 4, // Do not insert the forward map (RegOp -> MemOp) into the table. // This is needed for Native Client, which prohibits branch // instructions from using a memory operand. - TB_NO_FORWARD = 1 << 17, + TB_NO_FORWARD = 1 << 5, + + TB_FOLDED_LOAD = 1 << 6, + TB_FOLDED_STORE = 1 << 7, - TB_FOLDED_LOAD = 1 << 18, - TB_FOLDED_STORE = 1 << 19 + // Minimum alignment required for load/store. + // Used for RegOp->MemOp conversion. + // (stored in bits 8 - 15) + TB_ALIGN_SHIFT = 8, + TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT, + TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT, + TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT, + TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT }; struct X86OpTblEntry { uint16_t RegOp; uint16_t MemOp; - uint32_t Flags; + uint16_t Flags; }; X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) @@ -408,14 +410,10 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::IMUL64rri8, X86::IMUL64rmi8, 0 }, { X86::Int_COMISDrr, X86::Int_COMISDrm, 0 }, { X86::Int_COMISSrr, X86::Int_COMISSrm, 0 }, - { X86::Int_CVTDQ2PDrr, X86::Int_CVTDQ2PDrm, TB_ALIGN_16 }, - { X86::Int_CVTDQ2PSrr, X86::Int_CVTDQ2PSrm, TB_ALIGN_16 }, - { X86::Int_CVTPD2DQrr, X86::Int_CVTPD2DQrm, TB_ALIGN_16 }, - { X86::Int_CVTPD2PSrr, X86::Int_CVTPD2PSrm, TB_ALIGN_16 }, - { X86::Int_CVTPS2DQrr, X86::Int_CVTPS2DQrm, TB_ALIGN_16 }, - { X86::Int_CVTPS2PDrr, X86::Int_CVTPS2PDrm, 0 }, { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, 0 }, { X86::CVTSD2SIrr, X86::CVTSD2SIrm, 0 }, + { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, 0 }, + { X86::CVTSS2SIrr, X86::CVTSS2SIrm, 0 }, { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, 0 }, { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 }, { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 }, @@ -492,14 +490,20 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) // AVX 128-bit versions of foldable instructions { X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, 0 }, { X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, 0 }, - { X86::Int_VCVTDQ2PDrr, X86::Int_VCVTDQ2PDrm, TB_ALIGN_16 }, - { X86::Int_VCVTDQ2PSrr, X86::Int_VCVTDQ2PSrm, TB_ALIGN_16 }, - { X86::Int_VCVTPD2DQrr, X86::Int_VCVTPD2DQrm, TB_ALIGN_16 }, - { X86::Int_VCVTPD2PSrr, X86::Int_VCVTPD2PSrm, TB_ALIGN_16 }, - { X86::Int_VCVTPS2DQrr, X86::Int_VCVTPS2DQrm, TB_ALIGN_16 }, - { X86::Int_VCVTPS2PDrr, X86::Int_VCVTPS2PDrm, 0 }, { X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, 0 }, { X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, 0 }, + { X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 }, + { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,0 }, + { X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 }, + { X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm, 0 }, + { X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 }, + { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,0 }, + { X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 }, + { X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm, 0 }, + { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, 0 }, + { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 }, + { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, 0 }, + { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, 0 }, { X86::FsVMOVAPDrr, X86::VMOVSDrm, TB_NO_REVERSE }, { X86::FsVMOVAPSrr, X86::VMOVSSrm, TB_NO_REVERSE }, { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 }, @@ -535,6 +539,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VSQRTPSr_Int, X86::VSQRTPSm_Int, TB_ALIGN_16 }, { X86::VUCOMISDrr, X86::VUCOMISDrm, 0 }, { X86::VUCOMISSrr, X86::VUCOMISSrm, 0 }, + { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE }, + // AVX 256-bit foldable instructions { X86::VMOVAPDYrr, 
X86::VMOVAPDYrm, TB_ALIGN_32 }, { X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 }, @@ -543,6 +549,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 }, { X86::VPERMILPDYri, X86::VPERMILPDYmi, TB_ALIGN_32 }, { X86::VPERMILPSYri, X86::VPERMILPSYmi, TB_ALIGN_32 }, + // AVX2 foldable instructions { X86::VPABSBrr256, X86::VPABSBrm256, TB_ALIGN_32 }, { X86::VPABSDrr256, X86::VPABSDrm256, TB_ALIGN_32 }, @@ -558,6 +565,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VSQRTPDYr_Int, X86::VSQRTPDYm_Int, TB_ALIGN_32 }, { X86::VSQRTPSYr, X86::VSQRTPSYm, TB_ALIGN_32 }, { X86::VSQRTPSYr_Int, X86::VSQRTPSYm_Int, TB_ALIGN_32 }, + { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE }, + { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE }, }; for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) { @@ -808,17 +817,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 }, { X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 }, { X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, 0 }, - { X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 }, - { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm, 0 }, - { X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 }, - { X86::Int_VCVTTSD2SIrr, X86::Int_VCVTTSD2SIrm, 0 }, - { X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 }, - { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm, 0 }, - { X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 }, - { X86::Int_VCVTTSS2SIrr, X86::Int_VCVTTSS2SIrm, 0 }, - { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, 0 }, - { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 }, - { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQrm, TB_ALIGN_16 }, + { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQXrm, TB_ALIGN_16 }, { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, TB_ALIGN_16 }, { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 }, { X86::VSQRTSDr, X86::VSQRTSDm, 0 }, @@ -1122,6 +1121,158 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) // Index 2, folded load Flags | TB_INDEX_2 | TB_FOLDED_LOAD); } + + static const X86OpTblEntry OpTbl3[] = { + // FMA foldable instructions + { X86::VFMADDSSr231r, X86::VFMADDSSr231m, 0 }, + { X86::VFMADDSDr231r, X86::VFMADDSDr231m, 0 }, + { X86::VFMADDSSr132r, X86::VFMADDSSr132m, 0 }, + { X86::VFMADDSDr132r, X86::VFMADDSDr132m, 0 }, + { X86::VFMADDSSr213r, X86::VFMADDSSr213m, 0 }, + { X86::VFMADDSDr213r, X86::VFMADDSDr213m, 0 }, + { X86::VFMADDSSr132r_Int, X86::VFMADDSSr132m_Int, 0 }, + { X86::VFMADDSDr132r_Int, X86::VFMADDSDr132m_Int, 0 }, + + { X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_16 }, + { X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_16 }, + { X86::VFMADDPSr132r, X86::VFMADDPSr132m, TB_ALIGN_16 }, + { X86::VFMADDPDr132r, X86::VFMADDPDr132m, TB_ALIGN_16 }, + { X86::VFMADDPSr213r, X86::VFMADDPSr213m, TB_ALIGN_16 }, + { X86::VFMADDPDr213r, X86::VFMADDPDr213m, TB_ALIGN_16 }, + { X86::VFMADDPSr231rY, X86::VFMADDPSr231mY, TB_ALIGN_32 }, + { X86::VFMADDPDr231rY, X86::VFMADDPDr231mY, TB_ALIGN_32 }, + { X86::VFMADDPSr132rY, X86::VFMADDPSr132mY, TB_ALIGN_32 }, + { X86::VFMADDPDr132rY, X86::VFMADDPDr132mY, TB_ALIGN_32 }, + { X86::VFMADDPSr213rY, X86::VFMADDPSr213mY, TB_ALIGN_32 }, + { X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_32 }, + { X86::VFMADDPSr132r_Int, X86::VFMADDPSr132m_Int, TB_ALIGN_16 }, + { X86::VFMADDPDr132r_Int, X86::VFMADDPDr132m_Int, TB_ALIGN_16 }, + { X86::VFMADDPSr132rY_Int, X86::VFMADDPSr132mY_Int, TB_ALIGN_32 }, + { X86::VFMADDPDr132rY_Int, X86::VFMADDPDr132mY_Int, TB_ALIGN_32 }, + + { X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, 0 }, + { X86::VFNMADDSDr231r, 
X86::VFNMADDSDr231m, 0 }, + { X86::VFNMADDSSr132r, X86::VFNMADDSSr132m, 0 }, + { X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, 0 }, + { X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, 0 }, + { X86::VFNMADDSDr213r, X86::VFNMADDSDr213m, 0 }, + { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr132m_Int, 0 }, + { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr132m_Int, 0 }, + + { X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_16 }, + { X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_16 }, + { X86::VFNMADDPSr132r, X86::VFNMADDPSr132m, TB_ALIGN_16 }, + { X86::VFNMADDPDr132r, X86::VFNMADDPDr132m, TB_ALIGN_16 }, + { X86::VFNMADDPSr213r, X86::VFNMADDPSr213m, TB_ALIGN_16 }, + { X86::VFNMADDPDr213r, X86::VFNMADDPDr213m, TB_ALIGN_16 }, + { X86::VFNMADDPSr231rY, X86::VFNMADDPSr231mY, TB_ALIGN_32 }, + { X86::VFNMADDPDr231rY, X86::VFNMADDPDr231mY, TB_ALIGN_32 }, + { X86::VFNMADDPSr132rY, X86::VFNMADDPSr132mY, TB_ALIGN_32 }, + { X86::VFNMADDPDr132rY, X86::VFNMADDPDr132mY, TB_ALIGN_32 }, + { X86::VFNMADDPSr213rY, X86::VFNMADDPSr213mY, TB_ALIGN_32 }, + { X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_32 }, + { X86::VFNMADDPSr132r_Int, X86::VFNMADDPSr132m_Int, TB_ALIGN_16 }, + { X86::VFNMADDPDr132r_Int, X86::VFNMADDPDr132m_Int, TB_ALIGN_16 }, + { X86::VFNMADDPSr132rY_Int, X86::VFNMADDPSr132mY_Int, TB_ALIGN_32 }, + { X86::VFNMADDPDr132rY_Int, X86::VFNMADDPDr132mY_Int, TB_ALIGN_32 }, + + { X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, 0 }, + { X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, 0 }, + { X86::VFMSUBSSr132r, X86::VFMSUBSSr132m, 0 }, + { X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, 0 }, + { X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, 0 }, + { X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, 0 }, + { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr132m_Int, 0 }, + { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr132m_Int, 0 }, + + { X86::VFMSUBPSr231r, X86::VFMSUBPSr231m, TB_ALIGN_16 }, + { X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_16 }, + { X86::VFMSUBPSr132r, X86::VFMSUBPSr132m, TB_ALIGN_16 }, + { X86::VFMSUBPDr132r, X86::VFMSUBPDr132m, TB_ALIGN_16 }, + { X86::VFMSUBPSr213r, X86::VFMSUBPSr213m, TB_ALIGN_16 }, + { X86::VFMSUBPDr213r, X86::VFMSUBPDr213m, TB_ALIGN_16 }, + { X86::VFMSUBPSr231rY, X86::VFMSUBPSr231mY, TB_ALIGN_32 }, + { X86::VFMSUBPDr231rY, X86::VFMSUBPDr231mY, TB_ALIGN_32 }, + { X86::VFMSUBPSr132rY, X86::VFMSUBPSr132mY, TB_ALIGN_32 }, + { X86::VFMSUBPDr132rY, X86::VFMSUBPDr132mY, TB_ALIGN_32 }, + { X86::VFMSUBPSr213rY, X86::VFMSUBPSr213mY, TB_ALIGN_32 }, + { X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_32 }, + { X86::VFMSUBPSr132r_Int, X86::VFMSUBPSr132m_Int, TB_ALIGN_16 }, + { X86::VFMSUBPDr132r_Int, X86::VFMSUBPDr132m_Int, TB_ALIGN_16 }, + { X86::VFMSUBPSr132rY_Int, X86::VFMSUBPSr132mY_Int, TB_ALIGN_32 }, + { X86::VFMSUBPDr132rY_Int, X86::VFMSUBPDr132mY_Int, TB_ALIGN_32 }, + + { X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, 0 }, + { X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, 0 }, + { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr132m, 0 }, + { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, 0 }, + { X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, 0 }, + { X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, 0 }, + { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr132m_Int, 0 }, + { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr132m_Int, 0 }, + + { X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_16 }, + { X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_16 }, + { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr132m, TB_ALIGN_16 }, + { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr132m, TB_ALIGN_16 }, + { X86::VFNMSUBPSr213r, X86::VFNMSUBPSr213m, TB_ALIGN_16 }, + { X86::VFNMSUBPDr213r, 
X86::VFNMSUBPDr213m, TB_ALIGN_16 }, + { X86::VFNMSUBPSr231rY, X86::VFNMSUBPSr231mY, TB_ALIGN_32 }, + { X86::VFNMSUBPDr231rY, X86::VFNMSUBPDr231mY, TB_ALIGN_32 }, + { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr132mY, TB_ALIGN_32 }, + { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr132mY, TB_ALIGN_32 }, + { X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr213mY, TB_ALIGN_32 }, + { X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr213mY, TB_ALIGN_32 }, + { X86::VFNMSUBPSr132r_Int, X86::VFNMSUBPSr132m_Int, TB_ALIGN_16 }, + { X86::VFNMSUBPDr132r_Int, X86::VFNMSUBPDr132m_Int, TB_ALIGN_16 }, + { X86::VFNMSUBPSr132rY_Int, X86::VFNMSUBPSr132mY_Int, TB_ALIGN_32 }, + { X86::VFNMSUBPDr132rY_Int, X86::VFNMSUBPDr132mY_Int, TB_ALIGN_32 }, + + { X86::VFMADDSUBPSr231r, X86::VFMADDSUBPSr231m, TB_ALIGN_16 }, + { X86::VFMADDSUBPDr231r, X86::VFMADDSUBPDr231m, TB_ALIGN_16 }, + { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr132m, TB_ALIGN_16 }, + { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr132m, TB_ALIGN_16 }, + { X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr213m, TB_ALIGN_16 }, + { X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr213m, TB_ALIGN_16 }, + { X86::VFMADDSUBPSr231rY, X86::VFMADDSUBPSr231mY, TB_ALIGN_32 }, + { X86::VFMADDSUBPDr231rY, X86::VFMADDSUBPDr231mY, TB_ALIGN_32 }, + { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr132mY, TB_ALIGN_32 }, + { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr132mY, TB_ALIGN_32 }, + { X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr213mY, TB_ALIGN_32 }, + { X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr213mY, TB_ALIGN_32 }, + { X86::VFMADDSUBPSr132r_Int, X86::VFMADDSUBPSr132m_Int, TB_ALIGN_16 }, + { X86::VFMADDSUBPDr132r_Int, X86::VFMADDSUBPDr132m_Int, TB_ALIGN_16 }, + { X86::VFMADDSUBPSr132rY_Int, X86::VFMADDSUBPSr132mY_Int, TB_ALIGN_32 }, + { X86::VFMADDSUBPDr132rY_Int, X86::VFMADDSUBPDr132mY_Int, TB_ALIGN_32 }, + + { X86::VFMSUBADDPSr231r, X86::VFMSUBADDPSr231m, TB_ALIGN_16 }, + { X86::VFMSUBADDPDr231r, X86::VFMSUBADDPDr231m, TB_ALIGN_16 }, + { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr132m, TB_ALIGN_16 }, + { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr132m, TB_ALIGN_16 }, + { X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr213m, TB_ALIGN_16 }, + { X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr213m, TB_ALIGN_16 }, + { X86::VFMSUBADDPSr231rY, X86::VFMSUBADDPSr231mY, TB_ALIGN_32 }, + { X86::VFMSUBADDPDr231rY, X86::VFMSUBADDPDr231mY, TB_ALIGN_32 }, + { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr132mY, TB_ALIGN_32 }, + { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr132mY, TB_ALIGN_32 }, + { X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr213mY, TB_ALIGN_32 }, + { X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_32 }, + { X86::VFMSUBADDPSr132r_Int, X86::VFMSUBADDPSr132m_Int, TB_ALIGN_16 }, + { X86::VFMSUBADDPDr132r_Int, X86::VFMSUBADDPDr132m_Int, TB_ALIGN_16 }, + { X86::VFMSUBADDPSr132rY_Int, X86::VFMSUBADDPSr132mY_Int, TB_ALIGN_32 }, + { X86::VFMSUBADDPDr132rY_Int, X86::VFMSUBADDPDr132mY_Int, TB_ALIGN_32 }, + }; + + for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) { + unsigned RegOp = OpTbl3[i].RegOp; + unsigned MemOp = OpTbl3[i].MemOp; + unsigned Flags = OpTbl3[i].Flags; + AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable, + RegOp, MemOp, + // Index 3, folded load + Flags | TB_INDEX_3 | TB_FOLDED_LOAD); + } + } void @@ -1782,12 +1933,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!"); unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r : (is64Bit ? X86::LEA64_32r : X86::LEA32r); + const TargetRegisterClass *RC = MIOpc == X86::INC64r ? 
+ (const TargetRegisterClass*)&X86::GR64_NOSPRegClass : + (const TargetRegisterClass*)&X86::GR32_NOSPRegClass; // LEA can't handle RSP. if (TargetRegisterInfo::isVirtualRegister(Src) && - !MF.getRegInfo().constrainRegClass(Src, - MIOpc == X86::INC64r ? X86::GR64_NOSPRegisterClass : - X86::GR32_NOSPRegisterClass)) + !MF.getRegInfo().constrainRegClass(Src, RC)) return 0; NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) @@ -1812,11 +1964,12 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!"); unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r : (is64Bit ? X86::LEA64_32r : X86::LEA32r); + const TargetRegisterClass *RC = MIOpc == X86::DEC64r ? + (const TargetRegisterClass*)&X86::GR64_NOSPRegClass : + (const TargetRegisterClass*)&X86::GR32_NOSPRegClass; // LEA can't handle RSP. if (TargetRegisterInfo::isVirtualRegister(Src) && - !MF.getRegInfo().constrainRegClass(Src, - MIOpc == X86::DEC64r ? X86::GR64_NOSPRegisterClass : - X86::GR32_NOSPRegisterClass)) + !MF.getRegInfo().constrainRegClass(Src, RC)) return 0; NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) @@ -1844,10 +1997,10 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, const TargetRegisterClass *RC; if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB) { Opc = X86::LEA64r; - RC = X86::GR64_NOSPRegisterClass; + RC = &X86::GR64_NOSPRegClass; } else { Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; - RC = X86::GR32_NOSPRegisterClass; + RC = &X86::GR32_NOSPRegClass; } @@ -1863,6 +2016,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, .addReg(Dest, RegState::Define | getDeadRegState(isDead)), Src, isKill, Src2, isKill2); + + // Preserve undefness of the operands. + bool isUndef = MI->getOperand(1).isUndef(); + bool isUndef2 = MI->getOperand(2).isUndef(); + NewMI->getOperand(1).setIsUndef(isUndef); + NewMI->getOperand(3).setIsUndef(isUndef2); + if (LV && isKill2) LV->replaceKillInstruction(Src2, MI, NewMI); break; @@ -2079,7 +2239,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { } } -static X86::CondCode GetCondFromBranchOpc(unsigned BrOpc) { +static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) { switch (BrOpc) { default: return X86::COND_INVALID; case X86::JE_4: return X86::COND_E; @@ -2101,6 +2261,84 @@ static X86::CondCode GetCondFromBranchOpc(unsigned BrOpc) { } } +/// getCondFromSETOpc - return condition code of a SET opcode. 
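/// A sketch of the intended use (SetMI, naming a SETcc user, is a
/// hypothetical variable here; X86::GetOppositeBranchCondition is the
/// existing helper in the X86 namespace):
///   X86::CondCode CC = getCondFromSETOpc(SetMI->getOpcode());
///   if (CC != X86::COND_INVALID)
///     CC = X86::GetOppositeBranchCondition(CC); // re-target the SETcc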
+static X86::CondCode getCondFromSETOpc(unsigned Opc) {
+  switch (Opc) {
+  default: return X86::COND_INVALID;
+  case X86::SETAr:  case X86::SETAm:  return X86::COND_A;
+  case X86::SETAEr: case X86::SETAEm: return X86::COND_AE;
+  case X86::SETBr:  case X86::SETBm:  return X86::COND_B;
+  case X86::SETBEr: case X86::SETBEm: return X86::COND_BE;
+  case X86::SETEr:  case X86::SETEm:  return X86::COND_E;
+  case X86::SETGr:  case X86::SETGm:  return X86::COND_G;
+  case X86::SETGEr: case X86::SETGEm: return X86::COND_GE;
+  case X86::SETLr:  case X86::SETLm:  return X86::COND_L;
+  case X86::SETLEr: case X86::SETLEm: return X86::COND_LE;
+  case X86::SETNEr: case X86::SETNEm: return X86::COND_NE;
+  case X86::SETNOr: case X86::SETNOm: return X86::COND_NO;
+  case X86::SETNPr: case X86::SETNPm: return X86::COND_NP;
+  case X86::SETNSr: case X86::SETNSm: return X86::COND_NS;
+  case X86::SETOr:  case X86::SETOm:  return X86::COND_O;
+  case X86::SETPr:  case X86::SETPm:  return X86::COND_P;
+  case X86::SETSr:  case X86::SETSm:  return X86::COND_S;
+  }
+}
+
+/// getCondFromCMovOpc - return condition code of a CMov opcode.
+static X86::CondCode getCondFromCMovOpc(unsigned Opc) {
+  switch (Opc) {
+  default: return X86::COND_INVALID;
+  case X86::CMOVA16rm:  case X86::CMOVA16rr:  case X86::CMOVA32rm:
+  case X86::CMOVA32rr:  case X86::CMOVA64rm:  case X86::CMOVA64rr:
+    return X86::COND_A;
+  case X86::CMOVAE16rm: case X86::CMOVAE16rr: case X86::CMOVAE32rm:
+  case X86::CMOVAE32rr: case X86::CMOVAE64rm: case X86::CMOVAE64rr:
+    return X86::COND_AE;
+  case X86::CMOVB16rm:  case X86::CMOVB16rr:  case X86::CMOVB32rm:
+  case X86::CMOVB32rr:  case X86::CMOVB64rm:  case X86::CMOVB64rr:
+    return X86::COND_B;
+  case X86::CMOVBE16rm: case X86::CMOVBE16rr: case X86::CMOVBE32rm:
+  case X86::CMOVBE32rr: case X86::CMOVBE64rm: case X86::CMOVBE64rr:
+    return X86::COND_BE;
+  case X86::CMOVE16rm:  case X86::CMOVE16rr:  case X86::CMOVE32rm:
+  case X86::CMOVE32rr:  case X86::CMOVE64rm:  case X86::CMOVE64rr:
+    return X86::COND_E;
+  case X86::CMOVG16rm:  case X86::CMOVG16rr:  case X86::CMOVG32rm:
+  case X86::CMOVG32rr:  case X86::CMOVG64rm:  case X86::CMOVG64rr:
+    return X86::COND_G;
+  case X86::CMOVGE16rm: case X86::CMOVGE16rr: case X86::CMOVGE32rm:
+  case X86::CMOVGE32rr: case X86::CMOVGE64rm: case X86::CMOVGE64rr:
+    return X86::COND_GE;
+  case X86::CMOVL16rm:  case X86::CMOVL16rr:  case X86::CMOVL32rm:
+  case X86::CMOVL32rr:  case X86::CMOVL64rm:  case X86::CMOVL64rr:
+    return X86::COND_L;
+  case X86::CMOVLE16rm: case X86::CMOVLE16rr: case X86::CMOVLE32rm:
+  case X86::CMOVLE32rr: case X86::CMOVLE64rm: case X86::CMOVLE64rr:
+    return X86::COND_LE;
+  case X86::CMOVNE16rm: case X86::CMOVNE16rr: case X86::CMOVNE32rm:
+  case X86::CMOVNE32rr: case X86::CMOVNE64rm: case X86::CMOVNE64rr:
+    return X86::COND_NE;
+  case X86::CMOVNO16rm: case X86::CMOVNO16rr: case X86::CMOVNO32rm:
+  case X86::CMOVNO32rr: case X86::CMOVNO64rm: case X86::CMOVNO64rr:
+    return X86::COND_NO;
+  case X86::CMOVNP16rm: case X86::CMOVNP16rr: case X86::CMOVNP32rm:
+  case X86::CMOVNP32rr: case X86::CMOVNP64rm: case X86::CMOVNP64rr:
+    return X86::COND_NP;
+  case X86::CMOVNS16rm: case X86::CMOVNS16rr: case X86::CMOVNS32rm:
+  case X86::CMOVNS32rr: case X86::CMOVNS64rm: case X86::CMOVNS64rr:
+    return X86::COND_NS;
+  case X86::CMOVO16rm:  case X86::CMOVO16rr:  case X86::CMOVO32rm:
+  case X86::CMOVO32rr:  case X86::CMOVO64rm:  case X86::CMOVO64rr:
+    return X86::COND_O;
+  case X86::CMOVP16rm:  case X86::CMOVP16rr:  case X86::CMOVP32rm:
+  case X86::CMOVP32rr:  case X86::CMOVP64rm:  case X86::CMOVP64rr:
+    return X86::COND_P;
+  case X86::CMOVS16rm:  case X86::CMOVS16rr:  case X86::CMOVS32rm:
+  case X86::CMOVS32rr:  case X86::CMOVS64rm:  case X86::CMOVS64rr:
+    return X86::COND_S;
+  }
+}
+
 unsigned X86::GetCondBranchFromCond(X86::CondCode CC) {
   switch (CC) {
   default: llvm_unreachable("Illegal condition code!");
@@ -2147,6 +2385,101 @@ X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
   }
 }
 
+/// getSwappedCondition - assume the flags are set by MI(a,b); return the
+/// condition code to use if the instruction is changed so that the flags
+/// are set by MI(b,a).
+static X86::CondCode getSwappedCondition(X86::CondCode CC) {
+  switch (CC) {
+  default: return X86::COND_INVALID;
+  case X86::COND_E:  return X86::COND_E;
+  case X86::COND_NE: return X86::COND_NE;
+  case X86::COND_L:  return X86::COND_G;
+  case X86::COND_LE: return X86::COND_GE;
+  case X86::COND_G:  return X86::COND_L;
+  case X86::COND_GE: return X86::COND_LE;
+  case X86::COND_B:  return X86::COND_A;
+  case X86::COND_BE: return X86::COND_AE;
+  case X86::COND_A:  return X86::COND_B;
+  case X86::COND_AE: return X86::COND_BE;
+  }
+}
+
+/// getSETFromCond - Return a set opcode for the given condition and
+/// whether it has a memory operand.
+static unsigned getSETFromCond(X86::CondCode CC,
+                               bool HasMemoryOperand) {
+  static const unsigned Opc[16][2] = {
+    { X86::SETAr,  X86::SETAm  },
+    { X86::SETAEr, X86::SETAEm },
+    { X86::SETBr,  X86::SETBm  },
+    { X86::SETBEr, X86::SETBEm },
+    { X86::SETEr,  X86::SETEm  },
+    { X86::SETGr,  X86::SETGm  },
+    { X86::SETGEr, X86::SETGEm },
+    { X86::SETLr,  X86::SETLm  },
+    { X86::SETLEr, X86::SETLEm },
+    { X86::SETNEr, X86::SETNEm },
+    { X86::SETNOr, X86::SETNOm },
+    { X86::SETNPr, X86::SETNPm },
+    { X86::SETNSr, X86::SETNSm },
+    { X86::SETOr,  X86::SETOm  },
+    { X86::SETPr,  X86::SETPm  },
+    { X86::SETSr,  X86::SETSm  }
+  };
+
+  assert(CC < 16 && "Can only handle standard cond codes");
+  return Opc[CC][HasMemoryOperand ? 1 : 0];
+}
+
+/// getCMovFromCond - Return a cmov opcode for the given condition,
+/// register size in bytes, and operand type.
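+/// For example, (COND_E, 4, false) selects CMOVE32rr, while (COND_E, 4, true)
+/// selects the load-folded form CMOVE32rm.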
+static unsigned getCMovFromCond(X86::CondCode CC, unsigned RegBytes, + bool HasMemoryOperand) { + static const unsigned Opc[32][3] = { + { X86::CMOVA16rr, X86::CMOVA32rr, X86::CMOVA64rr }, + { X86::CMOVAE16rr, X86::CMOVAE32rr, X86::CMOVAE64rr }, + { X86::CMOVB16rr, X86::CMOVB32rr, X86::CMOVB64rr }, + { X86::CMOVBE16rr, X86::CMOVBE32rr, X86::CMOVBE64rr }, + { X86::CMOVE16rr, X86::CMOVE32rr, X86::CMOVE64rr }, + { X86::CMOVG16rr, X86::CMOVG32rr, X86::CMOVG64rr }, + { X86::CMOVGE16rr, X86::CMOVGE32rr, X86::CMOVGE64rr }, + { X86::CMOVL16rr, X86::CMOVL32rr, X86::CMOVL64rr }, + { X86::CMOVLE16rr, X86::CMOVLE32rr, X86::CMOVLE64rr }, + { X86::CMOVNE16rr, X86::CMOVNE32rr, X86::CMOVNE64rr }, + { X86::CMOVNO16rr, X86::CMOVNO32rr, X86::CMOVNO64rr }, + { X86::CMOVNP16rr, X86::CMOVNP32rr, X86::CMOVNP64rr }, + { X86::CMOVNS16rr, X86::CMOVNS32rr, X86::CMOVNS64rr }, + { X86::CMOVO16rr, X86::CMOVO32rr, X86::CMOVO64rr }, + { X86::CMOVP16rr, X86::CMOVP32rr, X86::CMOVP64rr }, + { X86::CMOVS16rr, X86::CMOVS32rr, X86::CMOVS64rr }, + { X86::CMOVA16rm, X86::CMOVA32rm, X86::CMOVA64rm }, + { X86::CMOVAE16rm, X86::CMOVAE32rm, X86::CMOVAE64rm }, + { X86::CMOVB16rm, X86::CMOVB32rm, X86::CMOVB64rm }, + { X86::CMOVBE16rm, X86::CMOVBE32rm, X86::CMOVBE64rm }, + { X86::CMOVE16rm, X86::CMOVE32rm, X86::CMOVE64rm }, + { X86::CMOVG16rm, X86::CMOVG32rm, X86::CMOVG64rm }, + { X86::CMOVGE16rm, X86::CMOVGE32rm, X86::CMOVGE64rm }, + { X86::CMOVL16rm, X86::CMOVL32rm, X86::CMOVL64rm }, + { X86::CMOVLE16rm, X86::CMOVLE32rm, X86::CMOVLE64rm }, + { X86::CMOVNE16rm, X86::CMOVNE32rm, X86::CMOVNE64rm }, + { X86::CMOVNO16rm, X86::CMOVNO32rm, X86::CMOVNO64rm }, + { X86::CMOVNP16rm, X86::CMOVNP32rm, X86::CMOVNP64rm }, + { X86::CMOVNS16rm, X86::CMOVNS32rm, X86::CMOVNS64rm }, + { X86::CMOVO16rm, X86::CMOVO32rm, X86::CMOVO64rm }, + { X86::CMOVP16rm, X86::CMOVP32rm, X86::CMOVP64rm }, + { X86::CMOVS16rm, X86::CMOVS32rm, X86::CMOVS64rm } + }; + + assert(CC < 16 && "Can only handle standard cond codes"); + unsigned Idx = HasMemoryOperand ? 16+CC : CC; + switch(RegBytes) { + default: llvm_unreachable("Illegal register size!"); + case 2: return Opc[Idx][0]; + case 4: return Opc[Idx][1]; + case 8: return Opc[Idx][2]; + } +} + bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { if (!MI->isTerminator()) return false; @@ -2213,7 +2546,7 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, } // Handle conditional branches. - X86::CondCode BranchCode = GetCondFromBranchOpc(I->getOpcode()); + X86::CondCode BranchCode = getCondFromBranchOpc(I->getOpcode()); if (BranchCode == X86::COND_INVALID) return true; // Can't handle indirect branch. @@ -2311,7 +2644,7 @@ unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { if (I->isDebugValue()) continue; if (I->getOpcode() != X86::JMP_4 && - GetCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID) + getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID) break; // Remove the branch. I->eraseFromParent(); @@ -2371,6 +2704,56 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, return Count; } +bool X86InstrInfo:: +canInsertSelect(const MachineBasicBlock &MBB, + const SmallVectorImpl<MachineOperand> &Cond, + unsigned TrueReg, unsigned FalseReg, + int &CondCycles, int &TrueCycles, int &FalseCycles) const { + // Not all subtargets have cmov instructions. + if (!TM.getSubtarget<X86Subtarget>().hasCMov()) + return false; + if (Cond.size() != 1) + return false; + // We cannot do the composite conditions, at least not in SSA form. 
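+  // (Condition codes past COND_S, e.g. COND_NE_OR_P from floating point
+  // compares, would need two cmov instructions to materialize.)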
+  if ((X86::CondCode)Cond[0].getImm() > X86::COND_S)
+    return false;
+
+  // Check register classes.
+  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *RC =
+    RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
+  if (!RC)
+    return false;
+
+  // We have cmov instructions for 16-, 32-, and 64-bit general purpose
+  // registers.
+  if (X86::GR16RegClass.hasSubClassEq(RC) ||
+      X86::GR32RegClass.hasSubClassEq(RC) ||
+      X86::GR64RegClass.hasSubClassEq(RC)) {
+    // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
+    // Bridge. Probably Ivy Bridge as well.
+    CondCycles = 2;
+    TrueCycles = 2;
+    FalseCycles = 2;
+    return true;
+  }
+
+  // Can't do vectors.
+  return false;
+}
+
+void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I, DebugLoc DL,
+                                unsigned DstReg,
+                                const SmallVectorImpl<MachineOperand> &Cond,
+                                unsigned TrueReg, unsigned FalseReg) const {
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  assert(Cond.size() == 1 && "Invalid Cond array");
+  unsigned Opc = getCMovFromCond((X86::CondCode)Cond[0].getImm(),
+                                 MRI.getRegClass(DstReg)->getSize(),
+                                 false/*HasMemoryOperand*/);
+  BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg);
+}
+
 /// isHReg - Test if the given register is a physical h register.
 static bool isHReg(unsigned Reg) {
   return X86::GR8_ABCD_HRegClass.contains(Reg);
@@ -2637,6 +3020,305 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
   NewMIs.push_back(MIB);
 }
 
+bool X86InstrInfo::
+analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2,
+               int &CmpMask, int &CmpValue) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case X86::CMP64ri32:
+  case X86::CMP64ri8:
+  case X86::CMP32ri:
+  case X86::CMP32ri8:
+  case X86::CMP16ri:
+  case X86::CMP16ri8:
+  case X86::CMP8ri:
+    SrcReg = MI->getOperand(0).getReg();
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    CmpValue = MI->getOperand(1).getImm();
+    return true;
+  case X86::CMP64rr:
+  case X86::CMP32rr:
+  case X86::CMP16rr:
+  case X86::CMP8rr:
+    SrcReg = MI->getOperand(0).getReg();
+    SrcReg2 = MI->getOperand(1).getReg();
+    CmpMask = ~0;
+    CmpValue = 0;
+    return true;
+  case X86::TEST8rr:
+  case X86::TEST16rr:
+  case X86::TEST32rr:
+  case X86::TEST64rr:
+    SrcReg = MI->getOperand(0).getReg();
+    if (MI->getOperand(1).getReg() != SrcReg) return false;
+    // Compare against zero.
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    CmpValue = 0;
+    return true;
+  }
+  return false;
+}
+
+/// isRedundantFlagInstr - check whether the first instruction, whose only
+/// purpose is to update flags, can be made redundant.
+/// CMPrr can be made redundant by SUBrr if the operands are the same.
+/// This function can be extended later on.
+/// SrcReg, SrcReg2: register operands for FlagI.
+/// ImmValue: immediate for FlagI if it takes an immediate.
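+/// For example, a SUB that computes a-b sets EFLAGS exactly as CMP a, b
+/// would, so a CMP over the same operands is redundant; with the operands
+/// swapped it is still redundant provided the users' condition codes are
+/// swapped as well (see getSwappedCondition).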
+inline static bool isRedundantFlagInstr(MachineInstr *FlagI, unsigned SrcReg, + unsigned SrcReg2, int ImmValue, + MachineInstr *OI) { + if (((FlagI->getOpcode() == X86::CMP64rr && + OI->getOpcode() == X86::SUB64rr) || + (FlagI->getOpcode() == X86::CMP32rr && + OI->getOpcode() == X86::SUB32rr)|| + (FlagI->getOpcode() == X86::CMP16rr && + OI->getOpcode() == X86::SUB16rr)|| + (FlagI->getOpcode() == X86::CMP8rr && + OI->getOpcode() == X86::SUB8rr)) && + ((OI->getOperand(1).getReg() == SrcReg && + OI->getOperand(2).getReg() == SrcReg2) || + (OI->getOperand(1).getReg() == SrcReg2 && + OI->getOperand(2).getReg() == SrcReg))) + return true; + + if (((FlagI->getOpcode() == X86::CMP64ri32 && + OI->getOpcode() == X86::SUB64ri32) || + (FlagI->getOpcode() == X86::CMP64ri8 && + OI->getOpcode() == X86::SUB64ri8) || + (FlagI->getOpcode() == X86::CMP32ri && + OI->getOpcode() == X86::SUB32ri) || + (FlagI->getOpcode() == X86::CMP32ri8 && + OI->getOpcode() == X86::SUB32ri8) || + (FlagI->getOpcode() == X86::CMP16ri && + OI->getOpcode() == X86::SUB16ri) || + (FlagI->getOpcode() == X86::CMP16ri8 && + OI->getOpcode() == X86::SUB16ri8) || + (FlagI->getOpcode() == X86::CMP8ri && + OI->getOpcode() == X86::SUB8ri)) && + OI->getOperand(1).getReg() == SrcReg && + OI->getOperand(2).getImm() == ImmValue) + return true; + return false; +} + +/// isDefConvertible - check whether the definition can be converted +/// to remove a comparison against zero. +inline static bool isDefConvertible(MachineInstr *MI) { + switch (MI->getOpcode()) { + default: return false; + case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri: + case X86::SUB32ri8: case X86::SUB16ri: case X86::SUB16ri8: + case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr: + case X86::SUB16rr: case X86::SUB8rr: case X86::SUB64rm: + case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm: + case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri: + case X86::ADD32ri8: case X86::ADD16ri: case X86::ADD16ri8: + case X86::ADD8ri: case X86::ADD64rr: case X86::ADD32rr: + case X86::ADD16rr: case X86::ADD8rr: case X86::ADD64rm: + case X86::ADD32rm: case X86::ADD16rm: case X86::ADD8rm: + case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri: + case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8: + case X86::AND8ri: case X86::AND64rr: case X86::AND32rr: + case X86::AND16rr: case X86::AND8rr: case X86::AND64rm: + case X86::AND32rm: case X86::AND16rm: case X86::AND8rm: + case X86::XOR64ri32: case X86::XOR64ri8: case X86::XOR32ri: + case X86::XOR32ri8: case X86::XOR16ri: case X86::XOR16ri8: + case X86::XOR8ri: case X86::XOR64rr: case X86::XOR32rr: + case X86::XOR16rr: case X86::XOR8rr: case X86::XOR64rm: + case X86::XOR32rm: case X86::XOR16rm: case X86::XOR8rm: + case X86::OR64ri32: case X86::OR64ri8: case X86::OR32ri: + case X86::OR32ri8: case X86::OR16ri: case X86::OR16ri8: + case X86::OR8ri: case X86::OR64rr: case X86::OR32rr: + case X86::OR16rr: case X86::OR8rr: case X86::OR64rm: + case X86::OR32rm: case X86::OR16rm: case X86::OR8rm: + return true; + } +} + +/// optimizeCompareInstr - Check if there exists an earlier instruction that +/// operates on the same source operands and sets flags in the same way as +/// Compare; remove Compare if possible. +bool X86InstrInfo:: +optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, + int CmpMask, int CmpValue, + const MachineRegisterInfo *MRI) const { + // Get the unique definition of SrcReg. 
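+  // (getUniqueVRegDef returns null when SrcReg has no or multiple
+  // definitions, in which case the flag-setting instruction cannot be
+  // identified and we give up.)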
+ MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); + if (!MI) return false; + + // CmpInstr is the first instruction of the BB. + MachineBasicBlock::iterator I = CmpInstr, Def = MI; + + // If we are comparing against zero, check whether we can use MI to update + // EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize. + bool IsCmpZero = (SrcReg2 == 0 && CmpValue == 0); + if (IsCmpZero && (MI->getParent() != CmpInstr->getParent() || + !isDefConvertible(MI))) + return false; + + // We are searching for an earlier instruction that can make CmpInstr + // redundant and that instruction will be saved in Sub. + MachineInstr *Sub = NULL; + const TargetRegisterInfo *TRI = &getRegisterInfo(); + + // We iterate backward, starting from the instruction before CmpInstr and + // stop when reaching the definition of a source register or done with the BB. + // RI points to the instruction before CmpInstr. + // If the definition is in this basic block, RE points to the definition; + // otherwise, RE is the rend of the basic block. + MachineBasicBlock::reverse_iterator + RI = MachineBasicBlock::reverse_iterator(I), + RE = CmpInstr->getParent() == MI->getParent() ? + MachineBasicBlock::reverse_iterator(++Def) /* points to MI */ : + CmpInstr->getParent()->rend(); + MachineInstr *Movr0Inst = 0; + for (; RI != RE; ++RI) { + MachineInstr *Instr = &*RI; + // Check whether CmpInstr can be made redundant by the current instruction. + if (!IsCmpZero && + isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, Instr)) { + Sub = Instr; + break; + } + + if (Instr->modifiesRegister(X86::EFLAGS, TRI) || + Instr->readsRegister(X86::EFLAGS, TRI)) { + // This instruction modifies or uses EFLAGS. + + // MOV32r0 etc. are implemented with xor which clobbers condition code. + // They are safe to move up, if the definition to EFLAGS is dead and + // earlier instructions do not read or write EFLAGS. + if (!Movr0Inst && (Instr->getOpcode() == X86::MOV8r0 || + Instr->getOpcode() == X86::MOV16r0 || + Instr->getOpcode() == X86::MOV32r0 || + Instr->getOpcode() == X86::MOV64r0) && + Instr->registerDefIsDead(X86::EFLAGS, TRI)) { + Movr0Inst = Instr; + continue; + } + + // We can't remove CmpInstr. + return false; + } + } + + // Return false if no candidates exist. + if (!IsCmpZero && !Sub) + return false; + + bool IsSwapped = (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 && + Sub->getOperand(2).getReg() == SrcReg); + + // Scan forward from the instruction after CmpInstr for uses of EFLAGS. + // It is safe to remove CmpInstr if EFLAGS is redefined or killed. + // If we are done with the basic block, we need to check whether EFLAGS is + // live-out. + bool IsSafe = false; + SmallVector<std::pair<MachineInstr*, unsigned /*NewOpc*/>, 4> OpsToUpdate; + MachineBasicBlock::iterator E = CmpInstr->getParent()->end(); + for (++I; I != E; ++I) { + const MachineInstr &Instr = *I; + if (Instr.modifiesRegister(X86::EFLAGS, TRI)) { + // It is safe to remove CmpInstr if EFLAGS is updated again. + IsSafe = true; + break; + } + if (!Instr.readsRegister(X86::EFLAGS, TRI)) + continue; + + // EFLAGS is used by this instruction. + X86::CondCode OldCC; + bool OpcIsSET = false; + if (IsCmpZero || IsSwapped) { + // We decode the condition code from opcode. 
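+      // (Jcc, SETcc and CMOVcc all encode their condition in the opcode, so
+      // try each decoder in turn.)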
+ if (Instr.isBranch()) + OldCC = getCondFromBranchOpc(Instr.getOpcode()); + else { + OldCC = getCondFromSETOpc(Instr.getOpcode()); + if (OldCC != X86::COND_INVALID) + OpcIsSET = true; + else + OldCC = getCondFromCMovOpc(Instr.getOpcode()); + } + if (OldCC == X86::COND_INVALID) return false; + } + if (IsCmpZero) { + switch (OldCC) { + default: break; + case X86::COND_A: case X86::COND_AE: + case X86::COND_B: case X86::COND_BE: + case X86::COND_G: case X86::COND_GE: + case X86::COND_L: case X86::COND_LE: + case X86::COND_O: case X86::COND_NO: + // CF and OF are used, we can't perform this optimization. + return false; + } + } else if (IsSwapped) { + // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs + // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc. + // We swap the condition code and synthesize the new opcode. + X86::CondCode NewCC = getSwappedCondition(OldCC); + if (NewCC == X86::COND_INVALID) return false; + + // Synthesize the new opcode. + bool HasMemoryOperand = Instr.hasOneMemOperand(); + unsigned NewOpc; + if (Instr.isBranch()) + NewOpc = GetCondBranchFromCond(NewCC); + else if(OpcIsSET) + NewOpc = getSETFromCond(NewCC, HasMemoryOperand); + else { + unsigned DstReg = Instr.getOperand(0).getReg(); + NewOpc = getCMovFromCond(NewCC, MRI->getRegClass(DstReg)->getSize(), + HasMemoryOperand); + } + + // Push the MachineInstr to OpsToUpdate. + // If it is safe to remove CmpInstr, the condition code of these + // instructions will be modified. + OpsToUpdate.push_back(std::make_pair(&*I, NewOpc)); + } + if (Instr.killsRegister(X86::EFLAGS, TRI)) { + IsSafe = true; + break; + } + } + + // If EFLAGS is not killed nor re-defined, we should check whether it is + // live-out. If it is live-out, do not optimize. + if ((IsCmpZero || IsSwapped) && !IsSafe) { + MachineBasicBlock *MBB = CmpInstr->getParent(); + for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(), + SE = MBB->succ_end(); SI != SE; ++SI) + if ((*SI)->isLiveIn(X86::EFLAGS)) + return false; + } + + // The instruction to be updated is either Sub or MI. + Sub = IsCmpZero ? MI : Sub; + // Move Movr0Inst to the place right before Sub. + if (Movr0Inst) { + Sub->getParent()->remove(Movr0Inst); + Sub->getParent()->insert(MachineBasicBlock::iterator(Sub), Movr0Inst); + } + + // Make sure Sub instruction defines EFLAGS. + assert(Sub->getNumOperands() >= 2 && + Sub->getOperand(Sub->getNumOperands()-1).isReg() && + Sub->getOperand(Sub->getNumOperands()-1).getReg() == X86::EFLAGS && + "EFLAGS should be the last operand of SUB, ADD, OR, XOR, AND"); + Sub->getOperand(Sub->getNumOperands()-1).setIsDef(true); + CmpInstr->eraseFromParent(); + + // Modify the condition code of instructions in OpsToUpdate. + for (unsigned i = 0, e = OpsToUpdate.size(); i < e; i++) + OpsToUpdate[i].first->setDesc(get(OpsToUpdate[i].second)); + return true; +} + /// Expand2AddrUndef - Expand a single-def pseudo instruction to a two-addr /// instruction with two undef reads of the register being defined. This is /// used for mapping: @@ -2809,7 +3491,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, return NULL; bool NarrowToMOV32rm = false; if (Size) { - unsigned RCSize = getRegClass(MI->getDesc(), i, &RI)->getSize(); + unsigned RCSize = getRegClass(MI->getDesc(), i, &RI, MF)->getSize(); if (Size < RCSize) { // Check if it's safe to fold the load. If the size of the object is // narrower than the load width, then it's not. 
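      // (For instance, a 4-byte stack object cannot be folded into an 8-byte
      // load outright; the NarrowToMOV32rm path instead folds it as a 32-bit
      // load, which on x86-64 implicitly zeroes the upper half.)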
@@ -3202,7 +3884,7 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
   UnfoldStore &= FoldedStore;
 
   const MCInstrDesc &MCID = get(Opc);
-  const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI);
+  const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
   if (!MI->hasOneMemOperand() &&
       RC == &X86::VR128RegClass &&
       !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
@@ -3297,7 +3979,7 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
 
   // Emit the store instruction.
   if (UnfoldStore) {
-    const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI);
+    const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF);
     std::pair<MachineInstr::mmo_iterator,
               MachineInstr::mmo_iterator> MMOs =
       MF.extractStoreMemRefs(MI->memoperands_begin(),
@@ -3323,7 +4005,8 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
   bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
   bool FoldedStore = I->second.second & TB_FOLDED_STORE;
   const MCInstrDesc &MCID = get(Opc);
-  const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI);
+  MachineFunction &MF = DAG.getMachineFunction();
+  const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
   unsigned NumDefs = MCID.NumDefs;
   std::vector<SDValue> AddrOps;
   std::vector<SDValue> BeforeOps;
@@ -3344,7 +4027,6 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
 
   // Emit the load instruction.
   SDNode *Load = 0;
-  MachineFunction &MF = DAG.getMachineFunction();
   if (FoldedLoad) {
     EVT VT = *RC->vt_begin();
     std::pair<MachineInstr::mmo_iterator,
@@ -3371,7 +4053,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
   std::vector<EVT> VTs;
   const TargetRegisterClass *DstRC = 0;
   if (MCID.getNumDefs() > 0) {
-    DstRC = getRegClass(MCID, 0, &RI);
+    DstRC = getRegClass(MCID, 0, &RI, MF);
     VTs.push_back(*DstRC->vt_begin());
   }
   for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
@@ -3625,7 +4307,7 @@ unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
   // Create the register. The code to initialize it is inserted
   // later, by the CGBR pass (below).
   MachineRegisterInfo &RegInfo = MF->getRegInfo();
-  GlobalBaseReg = RegInfo.createVirtualRegister(X86::GR32RegisterClass);
+  GlobalBaseReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
   X86FI->setGlobalBaseReg(GlobalBaseReg);
   return GlobalBaseReg;
 }
@@ -3835,7 +4517,7 @@ namespace {
       unsigned PC;
       if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT())
-        PC = RegInfo.createVirtualRegister(X86::GR32RegisterClass);
+        PC = RegInfo.createVirtualRegister(&X86::GR32RegClass);
       else
         PC = GlobalBaseReg;
 
@@ -3869,3 +4551,117 @@ namespace {
 char CGBR::ID = 0;
 FunctionPass*
 llvm::createGlobalBaseRegPass() { return new CGBR(); }
+
+namespace {
+  struct LDTLSCleanup : public MachineFunctionPass {
+    static char ID;
+    LDTLSCleanup() : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF) {
+      X86MachineFunctionInfo* MFI = MF.getInfo<X86MachineFunctionInfo>();
+      if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
+        // No point folding accesses if there aren't at least two.
+        return false;
+      }
+
+      MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+      return VisitNode(DT->getRootNode(), 0);
+    }
+
+    // Visit the dominator subtree rooted at Node in pre-order.
+    // If TLSBaseAddrReg is non-null, then use that to replace any
+    // TLS_base_addr instructions. Otherwise, create the register
+    // when the first such instruction is seen, and then use it
+    // as we encounter more instructions.
+    bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
+      MachineBasicBlock *BB = Node->getBlock();
+      bool Changed = false;
+
+      // Traverse the current block.
+      for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
+           ++I) {
+        switch (I->getOpcode()) {
+          case X86::TLS_base_addr32:
+          case X86::TLS_base_addr64:
+            if (TLSBaseAddrReg)
+              I = ReplaceTLSBaseAddrCall(I, TLSBaseAddrReg);
+            else
+              I = SetRegister(I, &TLSBaseAddrReg);
+            Changed = true;
+            break;
+          default:
+            break;
+        }
+      }
+
+      // Visit the children of this block in the dominator tree.
+      for (MachineDomTreeNode::iterator I = Node->begin(), E = Node->end();
+           I != E; ++I) {
+        Changed |= VisitNode(*I, TLSBaseAddrReg);
+      }
+
+      return Changed;
+    }
+
+    // Replace the TLS_base_addr instruction I with a copy from
+    // TLSBaseAddrReg, returning the new instruction.
+    MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr *I,
+                                         unsigned TLSBaseAddrReg) {
+      MachineFunction *MF = I->getParent()->getParent();
+      const X86TargetMachine *TM =
+          static_cast<const X86TargetMachine *>(&MF->getTarget());
+      const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit();
+      const X86InstrInfo *TII = TM->getInstrInfo();
+
+      // Insert a Copy from TLSBaseAddrReg to RAX/EAX.
+      MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
+                                   TII->get(TargetOpcode::COPY),
+                                   is64Bit ? X86::RAX : X86::EAX)
+                                   .addReg(TLSBaseAddrReg);
+
+      // Erase the TLS_base_addr instruction.
+      I->eraseFromParent();
+
+      return Copy;
+    }
+
+    // Create a virtual register in *TLSBaseAddrReg, and populate it by
+    // inserting a copy instruction after I. Returns the new instruction.
+    MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) {
+      MachineFunction *MF = I->getParent()->getParent();
+      const X86TargetMachine *TM =
+          static_cast<const X86TargetMachine *>(&MF->getTarget());
+      const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit();
+      const X86InstrInfo *TII = TM->getInstrInfo();
+
+      // Create a virtual register for the TLS base address.
+      MachineRegisterInfo &RegInfo = MF->getRegInfo();
+      *TLSBaseAddrReg = RegInfo.createVirtualRegister(is64Bit
+                                                      ? &X86::GR64RegClass
+                                                      : &X86::GR32RegClass);
+
+      // Insert a copy from RAX/EAX to TLSBaseAddrReg.
+      MachineInstr *Next = I->getNextNode();
+      MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(),
+                                   TII->get(TargetOpcode::COPY),
+                                   *TLSBaseAddrReg)
+                                   .addReg(is64Bit ? X86::RAX : X86::EAX);
+
+      return Copy;
+    }
+
+    virtual const char *getPassName() const {
+      return "Local Dynamic TLS Access Clean-up";
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<MachineDominatorTree>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+char LDTLSCleanup::ID = 0;
+FunctionPass*
+llvm::createCleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); }
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index b23d756..ec9b2e6 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -128,8 +128,8 @@ class X86InstrInfo : public X86GenInstrInfo {
   X86TargetMachine &TM;
   const X86RegisterInfo RI;
 
-  /// RegOp2MemOpTable2Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
-  /// RegOp2MemOpTable2 - Load / store folding opcode maps.
+  /// RegOp2MemOpTable2Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
+  /// RegOp2MemOpTable2, RegOp2MemOpTable3 - Load / store folding opcode maps.
   ///
   typedef DenseMap<unsigned,
                    std::pair<unsigned, unsigned> > RegOp2MemOpTableType;
@@ -137,6 +138,7 @@ class X86InstrInfo : public X86GenInstrInfo {
   RegOp2MemOpTableType RegOp2MemOpTable0;
   RegOp2MemOpTableType RegOp2MemOpTable1;
   RegOp2MemOpTableType RegOp2MemOpTable2;
+  RegOp2MemOpTableType RegOp2MemOpTable3;
 
   /// MemOp2RegOpTable - Load / store unfolding opcode map.
   ///
@@ -144,9 +145,9 @@ class X86InstrInfo : public X86GenInstrInfo {
                    std::pair<unsigned, unsigned> > MemOp2RegOpTableType;
   MemOp2RegOpTableType MemOp2RegOpTable;
 
-  void AddTableEntry(RegOp2MemOpTableType &R2MTable,
-                     MemOp2RegOpTableType &M2RTable,
-                     unsigned RegOp, unsigned MemOp, unsigned Flags);
+  static void AddTableEntry(RegOp2MemOpTableType &R2MTable,
+                            MemOp2RegOpTableType &M2RTable,
+                            unsigned RegOp, unsigned MemOp, unsigned Flags);
 
 public:
   explicit X86InstrInfo(X86TargetMachine &tm);
@@ -218,6 +219,14 @@ public:
                                 MachineBasicBlock *FBB,
                                 const SmallVectorImpl<MachineOperand> &Cond,
                                 DebugLoc DL) const;
+  virtual bool canInsertSelect(const MachineBasicBlock&,
+                               const SmallVectorImpl<MachineOperand> &Cond,
+                               unsigned, unsigned, int&, int&, int&) const;
+  virtual void insertSelect(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI, DebugLoc DL,
+                            unsigned DstReg,
+                            const SmallVectorImpl<MachineOperand> &Cond,
+                            unsigned TrueReg, unsigned FalseReg) const;
   virtual void copyPhysReg(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MI, DebugLoc DL,
                            unsigned DestReg, unsigned SrcReg,
@@ -363,6 +372,21 @@ public:
                             const MachineInstr *DefMI, unsigned DefIdx,
                             const MachineInstr *UseMI, unsigned UseIdx) const;
 
+  /// analyzeCompare - For a comparison instruction, return the source
+  /// registers in SrcReg and SrcReg2 if it has two register operands, and
+  /// the value it compares against in CmpValue. Return true if the
+  /// comparison instruction can be analyzed.
+  virtual bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+                              unsigned &SrcReg2,
+                              int &CmpMask, int &CmpValue) const;
+
+  /// optimizeCompareInstr - Check if there exists an earlier instruction that
+  /// operates on the same source operands and sets flags in the same way as
+  /// Compare; remove Compare if possible.
+ virtual bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, + unsigned SrcReg2, int CmpMask, int CmpValue, + const MachineRegisterInfo *MRI) const; + private: MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc, MachineFunction::iterator &MFI, diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 6a25312..d293156 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -63,6 +63,10 @@ def SDTX86SetCC_C : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisVT<1, i8>, SDTCisVT<2, i32>]>; +def SDTX86sahf : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i8>]>; + +def SDTX86rdrand : SDTypeProfile<2, 0, [SDTCisInt<0>, SDTCisVT<1, i32>]>; + def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>, SDTCisVT<2, i8>]>; def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; @@ -95,6 +99,8 @@ def SDTX86Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; def SDT_X86TLSADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def SDT_X86TLSBASEADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; @@ -131,6 +137,11 @@ def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond, def X86setcc : SDNode<"X86ISD::SETCC", SDTX86SetCC>; def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>; +def X86sahf : SDNode<"X86ISD::SAHF", SDTX86sahf>; + +def X86rdrand : SDNode<"X86ISD::RDRAND", SDTX86rdrand, + [SDNPHasChain, SDNPSideEffect]>; + def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; @@ -199,6 +210,9 @@ def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>; def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +def X86tlsbaseaddr : SDNode<"X86ISD::TLSBASEADDR", SDT_X86TLSBASEADDR, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET, [SDNPHasChain]>; @@ -278,6 +292,20 @@ def X86Mem256AsmOperand : AsmOperandClass { let Name = "Mem256"; let PredicateMethod = "isMem256"; } +// Gather mem operands +def X86MemVX32Operand : AsmOperandClass { + let Name = "MemVX32"; let PredicateMethod = "isMemVX32"; +} +def X86MemVY32Operand : AsmOperandClass { + let Name = "MemVY32"; let PredicateMethod = "isMemVY32"; +} +def X86MemVX64Operand : AsmOperandClass { + let Name = "MemVX64"; let PredicateMethod = "isMemVX64"; +} +def X86MemVY64Operand : AsmOperandClass { + let Name = "MemVY64"; let PredicateMethod = "isMemVY64"; +} + def X86AbsMemAsmOperand : AsmOperandClass { let Name = "AbsMem"; let SuperClasses = [X86MemAsmOperand]; @@ -316,6 +344,20 @@ def f128mem : X86MemOperand<"printf128mem"> { let ParserMatchClass = X86Mem128AsmOperand; } def f256mem : X86MemOperand<"printf256mem">{ let ParserMatchClass = X86Mem256AsmOperand; } + +// Gather mem operands +def vx32mem : X86MemOperand<"printi32mem">{ + let MIOperandInfo = (ops ptr_rc, i8imm, VR128, i32imm, i8imm); + let ParserMatchClass = X86MemVX32Operand; } +def vy32mem : X86MemOperand<"printi32mem">{ + let MIOperandInfo = (ops ptr_rc, i8imm, VR256, i32imm, i8imm); + let ParserMatchClass = X86MemVY32Operand; } +def vx64mem : X86MemOperand<"printi64mem">{ + let MIOperandInfo = (ops ptr_rc, i8imm, VR128, i32imm, i8imm); + let ParserMatchClass = X86MemVX64Operand; } +def vy64mem : X86MemOperand<"printi64mem">{ + let MIOperandInfo = (ops ptr_rc, i8imm, VR256, 
i32imm, i8imm); + let ParserMatchClass = X86MemVY64Operand; } } // A version of i8mem for use on x86-64 that uses GR64_NOREX instead of @@ -328,7 +370,7 @@ def i8mem_NOREX : Operand<i64> { } // GPRs available for tailcall. -// It represents GR64_TC or GR64_TCW64. +// It represents GR32_TC, GR64_TC or GR64_TCW64. def ptr_rc_tailcall : PointerLikeRegClass<2>; // Special i32mem for addresses of load folding tail calls. These are not @@ -336,7 +378,8 @@ def ptr_rc_tailcall : PointerLikeRegClass<2>; // after callee-saved register are popped. def i32mem_TC : Operand<i32> { let PrintMethod = "printi32mem"; - let MIOperandInfo = (ops GR32_TC, i8imm, GR32_TC, i32imm, i8imm); + let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, ptr_rc_tailcall, + i32imm, i8imm); let ParserMatchClass = X86Mem32AsmOperand; let OperandType = "OPERAND_MEMORY"; } @@ -487,6 +530,9 @@ def lea32addr : ComplexPattern<i32, 5, "SelectLEAAddr", def tls32addr : ComplexPattern<i32, 5, "SelectTLSADDRAddr", [tglobaltlsaddr], []>; +def tls32baseaddr : ComplexPattern<i32, 5, "SelectTLSADDRAddr", + [tglobaltlsaddr], []>; + def lea64addr : ComplexPattern<i64, 5, "SelectLEAAddr", [add, sub, mul, X86mul_imm, shl, or, frameindex, X86WrapperRIP], []>; @@ -494,6 +540,9 @@ def lea64addr : ComplexPattern<i64, 5, "SelectLEAAddr", def tls64addr : ComplexPattern<i64, 5, "SelectTLSADDRAddr", [tglobaltlsaddr], []>; +def tls64baseaddr : ComplexPattern<i64, 5, "SelectTLSADDRAddr", + [tglobaltlsaddr], []>; + //===----------------------------------------------------------------------===// // X86 Instruction Predicate Definitions. def HasCMov : Predicate<"Subtarget->hasCMov()">; @@ -514,8 +563,8 @@ def HasAVX2 : Predicate<"Subtarget->hasAVX2()">; def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; def HasAES : Predicate<"Subtarget->hasAES()">; -def HasCLMUL : Predicate<"Subtarget->hasCLMUL()">; -def HasFMA3 : Predicate<"Subtarget->hasFMA3()">; +def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">; +def HasFMA : Predicate<"Subtarget->hasFMA()">; def HasFMA4 : Predicate<"Subtarget->hasFMA4()">; def HasXOP : Predicate<"Subtarget->hasXOP()">; def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">; @@ -680,25 +729,27 @@ def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{ // Nop let neverHasSideEffects = 1 in { - def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", []>; + def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", [], IIC_NOP>; def NOOPW : I<0x1f, MRM0m, (outs), (ins i16mem:$zero), - "nop{w}\t$zero", []>, TB, OpSize; + "nop{w}\t$zero", [], IIC_NOP>, TB, OpSize; def NOOPL : I<0x1f, MRM0m, (outs), (ins i32mem:$zero), - "nop{l}\t$zero", []>, TB; + "nop{l}\t$zero", [], IIC_NOP>, TB; } // Constructing a stack frame. def ENTER : Ii16<0xC8, RawFrmImm8, (outs), (ins i16imm:$len, i8imm:$lvl), - "enter\t$len, $lvl", []>; + "enter\t$len, $lvl", [], IIC_ENTER>; let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, neverHasSideEffects=1 in def LEAVE : I<0xC9, RawFrm, - (outs), (ins), "leave", []>, Requires<[In32BitMode]>; + (outs), (ins), "leave", [], IIC_LEAVE>, + Requires<[In32BitMode]>; let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, neverHasSideEffects = 1 in def LEAVE64 : I<0xC9, RawFrm, - (outs), (ins), "leave", []>, Requires<[In64BitMode]>; + (outs), (ins), "leave", [], IIC_LEAVE>, + Requires<[In64BitMode]>; //===----------------------------------------------------------------------===// // Miscellaneous Instructions. 
@@ -706,41 +757,49 @@ def LEAVE64 : I<0xC9, RawFrm, let Defs = [ESP], Uses = [ESP], neverHasSideEffects=1 in { let mayLoad = 1 in { -def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", []>, - OpSize; -def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>; -def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", []>, - OpSize; -def POP16rmm: I<0x8F, MRM0m, (outs i16mem:$dst), (ins), "pop{w}\t$dst", []>, - OpSize; -def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>; -def POP32rmm: I<0x8F, MRM0m, (outs i32mem:$dst), (ins), "pop{l}\t$dst", []>; - -def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", []>, OpSize; -def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", []>, +def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", [], + IIC_POP_REG16>, OpSize; +def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", [], + IIC_POP_REG>; +def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", [], + IIC_POP_REG>, OpSize; +def POP16rmm: I<0x8F, MRM0m, (outs i16mem:$dst), (ins), "pop{w}\t$dst", [], + IIC_POP_MEM>, OpSize; +def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [], + IIC_POP_REG>; +def POP32rmm: I<0x8F, MRM0m, (outs i32mem:$dst), (ins), "pop{l}\t$dst", [], + IIC_POP_MEM>; + +def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>, OpSize; +def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", [], IIC_POP_FD>, Requires<[In32BitMode]>; } let mayStore = 1 in { -def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[]>, - OpSize; -def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>; -def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[]>, - OpSize; -def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[]>, +def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[], + IIC_PUSH_REG>, OpSize; +def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[], + IIC_PUSH_REG>; +def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[], + IIC_PUSH_REG>, OpSize; +def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[], + IIC_PUSH_MEM>, OpSize; -def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>; -def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[]>; +def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[], + IIC_PUSH_REG>; +def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[], + IIC_PUSH_MEM>; def PUSHi8 : Ii8<0x6a, RawFrm, (outs), (ins i32i8imm:$imm), - "push{l}\t$imm", []>; + "push{l}\t$imm", [], IIC_PUSH_IMM>; def PUSHi16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm), - "push{w}\t$imm", []>, OpSize; + "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize; def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm), - "push{l}\t$imm", []>; + "push{l}\t$imm", [], IIC_PUSH_IMM>; -def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", []>, OpSize; -def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", []>, +def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", [], IIC_PUSH_F>, + OpSize; +def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", [], IIC_PUSH_F>, Requires<[In32BitMode]>; } @@ -749,44 +808,48 @@ def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", []>, let Defs = [RSP], Uses = [RSP], neverHasSideEffects=1 in { let mayLoad = 1 in { def POP64r : I<0x58, AddRegFrm, - (outs 
GR64:$reg), (ins), "pop{q}\t$reg", []>; -def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", []>; -def POP64rmm: I<0x8F, MRM0m, (outs i64mem:$dst), (ins), "pop{q}\t$dst", []>; + (outs GR64:$reg), (ins), "pop{q}\t$reg", [], IIC_POP_REG>; +def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", [], + IIC_POP_REG>; +def POP64rmm: I<0x8F, MRM0m, (outs i64mem:$dst), (ins), "pop{q}\t$dst", [], + IIC_POP_MEM>; } let mayStore = 1 in { def PUSH64r : I<0x50, AddRegFrm, - (outs), (ins GR64:$reg), "push{q}\t$reg", []>; -def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", []>; -def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", []>; + (outs), (ins GR64:$reg), "push{q}\t$reg", [], IIC_PUSH_REG>; +def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", [], + IIC_PUSH_REG>; +def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", [], + IIC_PUSH_MEM>; } } let Defs = [RSP], Uses = [RSP], neverHasSideEffects = 1, mayStore = 1 in { def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm), - "push{q}\t$imm", []>; + "push{q}\t$imm", [], IIC_PUSH_IMM>; def PUSH64i16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm), - "push{q}\t$imm", []>; + "push{q}\t$imm", [], IIC_PUSH_IMM>; def PUSH64i32 : Ii32<0x68, RawFrm, (outs), (ins i64i32imm:$imm), - "push{q}\t$imm", []>; + "push{q}\t$imm", [], IIC_PUSH_IMM>; } let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1, neverHasSideEffects=1 in -def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", []>, +def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", [], IIC_POP_FD>, Requires<[In64BitMode]>; let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, neverHasSideEffects=1 in -def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", []>, +def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", [], IIC_PUSH_F>, Requires<[In64BitMode]>; let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP], mayLoad=1, neverHasSideEffects=1 in { -def POPA32 : I<0x61, RawFrm, (outs), (ins), "popa{l}", []>, +def POPA32 : I<0x61, RawFrm, (outs), (ins), "popa{l}", [], IIC_POP_A>, Requires<[In32BitMode]>; } let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], mayStore=1, neverHasSideEffects=1 in { -def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pusha{l}", []>, +def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pusha{l}", [], IIC_PUSH_A>, Requires<[In32BitMode]>; } @@ -794,84 +857,92 @@ let Constraints = "$src = $dst" in { // GR32 = bswap GR32 def BSWAP32r : I<0xC8, AddRegFrm, (outs GR32:$dst), (ins GR32:$src), "bswap{l}\t$dst", - [(set GR32:$dst, (bswap GR32:$src))]>, TB; + [(set GR32:$dst, (bswap GR32:$src))], IIC_BSWAP>, TB; def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src), "bswap{q}\t$dst", - [(set GR64:$dst, (bswap GR64:$src))]>, TB; + [(set GR64:$dst, (bswap GR64:$src))], IIC_BSWAP>, TB; } // Constraints = "$src = $dst" // Bit scan instructions. 
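// BSF/BSR return the index of the least/most significant set bit; when the
// source is zero the destination is undefined and ZF is set, which is why
// the defs below are marked as writing EFLAGS.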
let Defs = [EFLAGS] in { def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "bsf{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))]>, TB, OpSize; + [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))], + IIC_BSF>, TB, OpSize; def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "bsf{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))]>, TB, - OpSize; + [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))], + IIC_BSF>, TB, OpSize; def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "bsf{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))]>, TB; + [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))], IIC_BSF>, TB; def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "bsf{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))]>, TB; + [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))], + IIC_BSF>, TB; def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "bsf{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))]>, TB; + [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))], + IIC_BSF>, TB; def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "bsf{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))]>, TB; + [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))], + IIC_BSF>, TB; def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "bsr{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))]>, TB, OpSize; + [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))], IIC_BSR>, + TB, OpSize; def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "bsr{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))]>, TB, + [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))], + IIC_BSR>, TB, OpSize; def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "bsr{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))]>, TB; + [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))], IIC_BSR>, TB; def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "bsr{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))]>, TB; + [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))], + IIC_BSR>, TB; def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "bsr{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))]>, TB; + [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))], IIC_BSR>, TB; def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "bsr{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))]>, TB; + [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))], + IIC_BSR>, TB; } // Defs = [EFLAGS] // These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI let Defs = [EDI,ESI], Uses = [EDI,ESI,EFLAGS] in { -def MOVSB : I<0xA4, RawFrm, (outs), (ins), "movsb", []>; -def MOVSW : I<0xA5, RawFrm, (outs), (ins), "movsw", []>, OpSize; -def MOVSD : I<0xA5, RawFrm, (outs), (ins), "movs{l|d}", []>; -def MOVSQ : RI<0xA5, RawFrm, (outs), (ins), "movsq", []>; +def MOVSB : I<0xA4, RawFrm, (outs), (ins), "movsb", [], IIC_MOVS>; +def MOVSW : I<0xA5, RawFrm, (outs), (ins), "movsw", [], IIC_MOVS>, OpSize; +def MOVSD : I<0xA5, RawFrm, (outs), (ins), "movs{l|d}", [], IIC_MOVS>; +def MOVSQ : RI<0xA5, RawFrm, (outs), (ins), "movsq", [], IIC_MOVS>; 
} // These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI let Defs = [EDI], Uses = [AL,EDI,EFLAGS] in -def STOSB : I<0xAA, RawFrm, (outs), (ins), "stosb", []>; +def STOSB : I<0xAA, RawFrm, (outs), (ins), "stosb", [], IIC_STOS>; let Defs = [EDI], Uses = [AX,EDI,EFLAGS] in -def STOSW : I<0xAB, RawFrm, (outs), (ins), "stosw", []>, OpSize; +def STOSW : I<0xAB, RawFrm, (outs), (ins), "stosw", [], IIC_STOS>, OpSize; let Defs = [EDI], Uses = [EAX,EDI,EFLAGS] in -def STOSD : I<0xAB, RawFrm, (outs), (ins), "stos{l|d}", []>; +def STOSD : I<0xAB, RawFrm, (outs), (ins), "stos{l|d}", [], IIC_STOS>; let Defs = [RCX,RDI], Uses = [RAX,RCX,RDI,EFLAGS] in -def STOSQ : RI<0xAB, RawFrm, (outs), (ins), "stosq", []>; +def STOSQ : RI<0xAB, RawFrm, (outs), (ins), "stosq", [], IIC_STOS>; -def SCAS8 : I<0xAE, RawFrm, (outs), (ins), "scasb", []>; -def SCAS16 : I<0xAF, RawFrm, (outs), (ins), "scasw", []>, OpSize; -def SCAS32 : I<0xAF, RawFrm, (outs), (ins), "scas{l|d}", []>; -def SCAS64 : RI<0xAF, RawFrm, (outs), (ins), "scasq", []>; +def SCAS8 : I<0xAE, RawFrm, (outs), (ins), "scasb", [], IIC_SCAS>; +def SCAS16 : I<0xAF, RawFrm, (outs), (ins), "scasw", [], IIC_SCAS>, OpSize; +def SCAS32 : I<0xAF, RawFrm, (outs), (ins), "scas{l|d}", [], IIC_SCAS>; +def SCAS64 : RI<0xAF, RawFrm, (outs), (ins), "scasq", [], IIC_SCAS>; -def CMPS8 : I<0xA6, RawFrm, (outs), (ins), "cmpsb", []>; -def CMPS16 : I<0xA7, RawFrm, (outs), (ins), "cmpsw", []>, OpSize; -def CMPS32 : I<0xA7, RawFrm, (outs), (ins), "cmps{l|d}", []>; -def CMPS64 : RI<0xA7, RawFrm, (outs), (ins), "cmpsq", []>; +def CMPS8 : I<0xA6, RawFrm, (outs), (ins), "cmpsb", [], IIC_CMPS>; +def CMPS16 : I<0xA7, RawFrm, (outs), (ins), "cmpsw", [], IIC_CMPS>, OpSize; +def CMPS32 : I<0xA7, RawFrm, (outs), (ins), "cmps{l|d}", [], IIC_CMPS>; +def CMPS64 : RI<0xA7, RawFrm, (outs), (ins), "cmpsq", [], IIC_CMPS>; //===----------------------------------------------------------------------===// @@ -880,64 +951,64 @@ def CMPS64 : RI<0xA7, RawFrm, (outs), (ins), "cmpsq", []>; let neverHasSideEffects = 1 in { def MOV8rr : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src), - "mov{b}\t{$src, $dst|$dst, $src}", []>; + "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), - "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize; + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize; def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), - "mov{l}\t{$src, $dst|$dst, $src}", []>; + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), - "mov{q}\t{$src, $dst|$dst, $src}", []>; + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; } let isReMaterializable = 1, isAsCheapAsAMove = 1 in { def MOV8ri : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src), "mov{b}\t{$src, $dst|$dst, $src}", - [(set GR8:$dst, imm:$src)]>; + [(set GR8:$dst, imm:$src)], IIC_MOV>; def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src), "mov{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, imm:$src)]>, OpSize; + [(set GR16:$dst, imm:$src)], IIC_MOV>, OpSize; def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src), "mov{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, imm:$src)]>; + [(set GR32:$dst, imm:$src)], IIC_MOV>; def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src), "movabs{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, imm:$src)]>; + [(set GR64:$dst, imm:$src)], IIC_MOV>; def MOV64ri32 : RIi32<0xC7, 
MRM0r, (outs GR64:$dst), (ins i64i32imm:$src), "mov{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, i64immSExt32:$src)]>; + [(set GR64:$dst, i64immSExt32:$src)], IIC_MOV>; } def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src), "mov{b}\t{$src, $dst|$dst, $src}", - [(store (i8 imm:$src), addr:$dst)]>; + [(store (i8 imm:$src), addr:$dst)], IIC_MOV_MEM>; def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src), "mov{w}\t{$src, $dst|$dst, $src}", - [(store (i16 imm:$src), addr:$dst)]>, OpSize; + [(store (i16 imm:$src), addr:$dst)], IIC_MOV_MEM>, OpSize; def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src), "mov{l}\t{$src, $dst|$dst, $src}", - [(store (i32 imm:$src), addr:$dst)]>; + [(store (i32 imm:$src), addr:$dst)], IIC_MOV_MEM>; def MOV64mi32 : RIi32<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src), "mov{q}\t{$src, $dst|$dst, $src}", - [(store i64immSExt32:$src, addr:$dst)]>; + [(store i64immSExt32:$src, addr:$dst)], IIC_MOV_MEM>; /// moffs8, moffs16 and moffs32 versions of moves. The immediate is a /// 32-bit offset from the PC. These are only valid in x86-32 mode. def MOV8o8a : Ii32 <0xA0, RawFrm, (outs), (ins offset8:$src), - "mov{b}\t{$src, %al|AL, $src}", []>, + "mov{b}\t{$src, %al|AL, $src}", [], IIC_MOV_MEM>, Requires<[In32BitMode]>; def MOV16o16a : Ii32 <0xA1, RawFrm, (outs), (ins offset16:$src), - "mov{w}\t{$src, %ax|AL, $src}", []>, OpSize, + "mov{w}\t{$src, %ax|AL, $src}", [], IIC_MOV_MEM>, OpSize, Requires<[In32BitMode]>; def MOV32o32a : Ii32 <0xA1, RawFrm, (outs), (ins offset32:$src), - "mov{l}\t{$src, %eax|EAX, $src}", []>, + "mov{l}\t{$src, %eax|EAX, $src}", [], IIC_MOV_MEM>, Requires<[In32BitMode]>; def MOV8ao8 : Ii32 <0xA2, RawFrm, (outs offset8:$dst), (ins), - "mov{b}\t{%al, $dst|$dst, AL}", []>, + "mov{b}\t{%al, $dst|$dst, AL}", [], IIC_MOV_MEM>, Requires<[In32BitMode]>; def MOV16ao16 : Ii32 <0xA3, RawFrm, (outs offset16:$dst), (ins), - "mov{w}\t{%ax, $dst|$dst, AL}", []>, OpSize, + "mov{w}\t{%ax, $dst|$dst, AL}", [], IIC_MOV_MEM>, OpSize, Requires<[In32BitMode]>; def MOV32ao32 : Ii32 <0xA3, RawFrm, (outs offset32:$dst), (ins), - "mov{l}\t{%eax, $dst|$dst, EAX}", []>, + "mov{l}\t{%eax, $dst|$dst, EAX}", [], IIC_MOV_MEM>, Requires<[In32BitMode]>; // FIXME: These definitions are utterly broken @@ -958,42 +1029,42 @@ def MOV64ao64 : RIi32<0xA3, RawFrm, (outs offset64:$dst), (ins), let isCodeGenOnly = 1 in { def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src), - "mov{b}\t{$src, $dst|$dst, $src}", []>; + "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), - "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize; + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize; def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), - "mov{l}\t{$src, $dst|$dst, $src}", []>; + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), - "mov{q}\t{$src, $dst|$dst, $src}", []>; + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; } let canFoldAsLoad = 1, isReMaterializable = 1 in { def MOV8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src), "mov{b}\t{$src, $dst|$dst, $src}", - [(set GR8:$dst, (loadi8 addr:$src))]>; + [(set GR8:$dst, (loadi8 addr:$src))], IIC_MOV_MEM>; def MOV16rm : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "mov{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, (loadi16 addr:$src))]>, OpSize; + [(set GR16:$dst, (loadi16 addr:$src))], 
IIC_MOV_MEM>, OpSize; def MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "mov{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (loadi32 addr:$src))]>; + [(set GR32:$dst, (loadi32 addr:$src))], IIC_MOV_MEM>; def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "mov{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (load addr:$src))]>; + [(set GR64:$dst, (load addr:$src))], IIC_MOV_MEM>; } def MOV8mr : I<0x88, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src), "mov{b}\t{$src, $dst|$dst, $src}", - [(store GR8:$src, addr:$dst)]>; + [(store GR8:$src, addr:$dst)], IIC_MOV_MEM>; def MOV16mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), "mov{w}\t{$src, $dst|$dst, $src}", - [(store GR16:$src, addr:$dst)]>, OpSize; + [(store GR16:$src, addr:$dst)], IIC_MOV_MEM>, OpSize; def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "mov{l}\t{$src, $dst|$dst, $src}", - [(store GR32:$src, addr:$dst)]>; + [(store GR32:$src, addr:$dst)], IIC_MOV_MEM>; def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "mov{q}\t{$src, $dst|$dst, $src}", - [(store GR64:$src, addr:$dst)]>; + [(store GR64:$src, addr:$dst)], IIC_MOV_MEM>; // Versions of MOV8rr, MOV8mr, and MOV8rm that use i8mem_NOREX and GR8_NOREX so // that they can be used for copying and storing h registers, which can't be @@ -1002,24 +1073,28 @@ let isCodeGenOnly = 1 in { let neverHasSideEffects = 1 in def MOV8rr_NOREX : I<0x88, MRMDestReg, (outs GR8_NOREX:$dst), (ins GR8_NOREX:$src), - "mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>; + "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOV>; let mayStore = 1 in def MOV8mr_NOREX : I<0x88, MRMDestMem, (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src), - "mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>; + "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], + IIC_MOV_MEM>; let mayLoad = 1, neverHasSideEffects = 1, canFoldAsLoad = 1, isReMaterializable = 1 in def MOV8rm_NOREX : I<0x8A, MRMSrcMem, (outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src), - "mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>; + "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], + IIC_MOV_MEM>; } // Condition code ops, incl. set if equal/not equal/... 
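// SAHF stores AH into the low byte of EFLAGS (SF, ZF, AF, PF, CF); LAHF is
// the inverse. The new X86sahf node lets the pattern below expose that
// EFLAGS write to instruction selection.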
-let Defs = [EFLAGS], Uses = [AH], neverHasSideEffects = 1 in -def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", []>; // flags = AH +let Defs = [EFLAGS], Uses = [AH] in +def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", + [(set EFLAGS, (X86sahf AH))], IIC_AHF>; let Defs = [AH], Uses = [EFLAGS], neverHasSideEffects = 1 in -def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", []>; // AH = flags +def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", [], + IIC_AHF>; // AH = flags //===----------------------------------------------------------------------===// @@ -1028,13 +1103,14 @@ def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", []>; // AH = flags let Defs = [EFLAGS] in { def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), "bt{w}\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))]>, OpSize, TB; + [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))], IIC_BT_RR>, + OpSize, TB; def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), "bt{l}\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))]>, TB; + [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))], IIC_BT_RR>, TB; def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), "bt{q}\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))]>, TB; + [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))], IIC_BT_RR>, TB; // Unlike with the register+register form, the memory+register form of the // bt instruction does not ignore the high bits of the index. From ISel's @@ -1045,31 +1121,33 @@ def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), "bt{w}\t{$src2, $src1|$src1, $src2}", // [(X86bt (loadi16 addr:$src1), GR16:$src2), // (implicit EFLAGS)] - [] + [], IIC_BT_MR >, OpSize, TB, Requires<[FastBTMem]>; def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), "bt{l}\t{$src2, $src1|$src1, $src2}", // [(X86bt (loadi32 addr:$src1), GR32:$src2), // (implicit EFLAGS)] - [] + [], IIC_BT_MR >, TB, Requires<[FastBTMem]>; def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), "bt{q}\t{$src2, $src1|$src1, $src2}", // [(X86bt (loadi64 addr:$src1), GR64:$src2), // (implicit EFLAGS)] - [] + [], IIC_BT_MR >, TB; def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2), "bt{w}\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))]>, - OpSize, TB; + [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))], + IIC_BT_RI>, OpSize, TB; def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32i8imm:$src2), "bt{l}\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))]>, TB; + [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))], + IIC_BT_RI>, TB; def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2), "bt{q}\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))]>, TB; + [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))], + IIC_BT_RI>, TB; // Note that these instructions don't need FastBTMem because that // only applies when the other operand is in a register. 
When it's @@ -1077,91 +1155,103 @@ def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2), def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2), "bt{w}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt (loadi16 addr:$src1), i16immSExt8:$src2)) - ]>, OpSize, TB; + ], IIC_BT_MI>, OpSize, TB; def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2), "bt{l}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt (loadi32 addr:$src1), i32immSExt8:$src2)) - ]>, TB; + ], IIC_BT_MI>, TB; def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2), "bt{q}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt (loadi64 addr:$src1), - i64immSExt8:$src2))]>, TB; + i64immSExt8:$src2))], IIC_BT_MI>, TB; def BTC16rr : I<0xBB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), - "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; + "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, + OpSize, TB; def BTC32rr : I<0xBB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), - "btc{l}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; def BTC64rr : RI<0xBB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), - "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), - "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; + "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, + OpSize, TB; def BTC32mr : I<0xBB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), - "btc{l}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), - "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; def BTC16ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR16:$src1, i16i8imm:$src2), - "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; + "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, + OpSize, TB; def BTC32ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR32:$src1, i32i8imm:$src2), - "btc{l}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; def BTC64ri8 : RIi8<0xBA, MRM7r, (outs), (ins GR64:$src1, i64i8imm:$src2), - "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2), - "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; + "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, + OpSize, TB; def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2), - "btc{l}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2), - "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; def BTR16rr : I<0xB3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), - "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; + "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, + OpSize, TB; def BTR32rr : I<0xB3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), - "btr{l}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; def BTR64rr : RI<0xB3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), "btr{q}\t{$src2, 
$src1|$src1, $src2}", []>, TB; def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), - "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; + "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, + OpSize, TB; def BTR32mr : I<0xB3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), - "btr{l}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), - "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; def BTR16ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR16:$src1, i16i8imm:$src2), - "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; + "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, + OpSize, TB; def BTR32ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR32:$src1, i32i8imm:$src2), - "btr{l}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; def BTR64ri8 : RIi8<0xBA, MRM6r, (outs), (ins GR64:$src1, i64i8imm:$src2), - "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2), - "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; + "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, + OpSize, TB; def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2), - "btr{l}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2), - "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB; + "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; def BTS16rr : I<0xAB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), - "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; + "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, + OpSize, TB; def BTS32rr : I<0xAB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), - "bts{l}\t{$src2, $src1|$src1, $src2}", []>, TB; + "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; def BTS64rr : RI<0xAB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), - "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB; + "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), - "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; + "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, + OpSize, TB; def BTS32mr : I<0xAB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), - "bts{l}\t{$src2, $src1|$src1, $src2}", []>, TB; + "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), - "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB; + "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; def BTS16ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR16:$src1, i16i8imm:$src2), - "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; + "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, + OpSize, TB; def BTS32ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR32:$src1, i32i8imm:$src2), - "bts{l}\t{$src2, $src1|$src1, $src2}", []>, TB; + "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; def BTS64ri8 : RIi8<0xBA, MRM5r, (outs), (ins GR64:$src1, i64i8imm:$src2), - "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB; + "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, 
i16i8imm:$src2), - "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; + "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, + OpSize, TB; def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2), - "bts{l}\t{$src2, $src1|$src1, $src2}", []>, TB; + "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2), - "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB; + "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; } // Defs = [EFLAGS] @@ -1175,89 +1265,106 @@ def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2), let Constraints = "$val = $dst" in { def XCHG8rm : I<0x86, MRMSrcMem, (outs GR8:$dst), (ins GR8:$val, i8mem:$ptr), "xchg{b}\t{$val, $ptr|$ptr, $val}", - [(set GR8:$dst, (atomic_swap_8 addr:$ptr, GR8:$val))]>; + [(set GR8:$dst, (atomic_swap_8 addr:$ptr, GR8:$val))], + IIC_XCHG_MEM>; def XCHG16rm : I<0x87, MRMSrcMem, (outs GR16:$dst),(ins GR16:$val, i16mem:$ptr), "xchg{w}\t{$val, $ptr|$ptr, $val}", - [(set GR16:$dst, (atomic_swap_16 addr:$ptr, GR16:$val))]>, + [(set GR16:$dst, (atomic_swap_16 addr:$ptr, GR16:$val))], + IIC_XCHG_MEM>, OpSize; def XCHG32rm : I<0x87, MRMSrcMem, (outs GR32:$dst),(ins GR32:$val, i32mem:$ptr), "xchg{l}\t{$val, $ptr|$ptr, $val}", - [(set GR32:$dst, (atomic_swap_32 addr:$ptr, GR32:$val))]>; + [(set GR32:$dst, (atomic_swap_32 addr:$ptr, GR32:$val))], + IIC_XCHG_MEM>; def XCHG64rm : RI<0x87, MRMSrcMem, (outs GR64:$dst),(ins GR64:$val,i64mem:$ptr), "xchg{q}\t{$val, $ptr|$ptr, $val}", - [(set GR64:$dst, (atomic_swap_64 addr:$ptr, GR64:$val))]>; + [(set GR64:$dst, (atomic_swap_64 addr:$ptr, GR64:$val))], + IIC_XCHG_MEM>; def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst), (ins GR8:$val, GR8:$src), - "xchg{b}\t{$val, $src|$src, $val}", []>; + "xchg{b}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>; def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst), (ins GR16:$val, GR16:$src), - "xchg{w}\t{$val, $src|$src, $val}", []>, OpSize; + "xchg{w}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>, OpSize; def XCHG32rr : I<0x87, MRMSrcReg, (outs GR32:$dst), (ins GR32:$val, GR32:$src), - "xchg{l}\t{$val, $src|$src, $val}", []>; + "xchg{l}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>; def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src), - "xchg{q}\t{$val, $src|$src, $val}", []>; + "xchg{q}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>; } def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src), - "xchg{w}\t{$src, %ax|AX, $src}", []>, OpSize; + "xchg{w}\t{$src, %ax|AX, $src}", [], IIC_XCHG_REG>, OpSize; def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src), - "xchg{l}\t{$src, %eax|EAX, $src}", []>, Requires<[In32BitMode]>; + "xchg{l}\t{$src, %eax|EAX, $src}", [], IIC_XCHG_REG>, + Requires<[In32BitMode]>; // Uses GR32_NOAX in 64-bit mode to prevent encoding using the 0x90 NOP encoding. // xchg %eax, %eax needs to clear upper 32-bits of RAX so is not a NOP. 
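
// GR32_NOAX, mentioned in the comment above, is presumably carved out of
// GR32 in X86RegisterInfo.td with the register-class set-difference
// operator, along the lines of (hypothetical spelling, not copied from the
// real file):
//   def GR32_NOAX : RegisterClass<"X86", [i32], 32, (sub GR32, EAX)>;
// A self-contained miniature of the (sub ...) idea, with stub records in
// place of the real Target.td machinery:
def set_add;                 // stand-ins for the set-theory dag operators
def set_sub;
class SketchReg;
def SAX : SketchReg;         // toy registers, not the real X86 ones
def SCX : SketchReg;
def SDX : SketchReg;
class SketchRegClass<dag members> { dag MemberList = members; }
def SKETCH_GR32      : SketchRegClass<(set_add SAX, SCX, SDX)>;
def SKETCH_GR32_NOAX : SketchRegClass<(set_sub SKETCH_GR32, SAX)>;
// Constraining an operand to the NOAX class means the allocator can never
// hand it (E)AX, which is how the XCHG32ar64 def below dodges the 0x90 NOP
// encoding in 64-bit mode.
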
def XCHG32ar64 : I<0x90, AddRegFrm, (outs), (ins GR32_NOAX:$src), - "xchg{l}\t{$src, %eax|EAX, $src}", []>, Requires<[In64BitMode]>; + "xchg{l}\t{$src, %eax|EAX, $src}", [], IIC_XCHG_REG>, + Requires<[In64BitMode]>; def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src), - "xchg{q}\t{$src, %rax|RAX, $src}", []>; + "xchg{q}\t{$src, %rax|RAX, $src}", [], IIC_XCHG_REG>; def XADD8rr : I<0xC0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src), - "xadd{b}\t{$src, $dst|$dst, $src}", []>, TB; + "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB; def XADD16rr : I<0xC1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), - "xadd{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; + "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB, + OpSize; def XADD32rr : I<0xC1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), - "xadd{l}\t{$src, $dst|$dst, $src}", []>, TB; + "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB; def XADD64rr : RI<0xC1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), - "xadd{q}\t{$src, $dst|$dst, $src}", []>, TB; + "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB; let mayLoad = 1, mayStore = 1 in { def XADD8rm : I<0xC0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src), - "xadd{b}\t{$src, $dst|$dst, $src}", []>, TB; + "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB; def XADD16rm : I<0xC1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), - "xadd{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; + "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB, + OpSize; def XADD32rm : I<0xC1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), - "xadd{l}\t{$src, $dst|$dst, $src}", []>, TB; + "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB; def XADD64rm : RI<0xC1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), - "xadd{q}\t{$src, $dst|$dst, $src}", []>, TB; + "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB; } def CMPXCHG8rr : I<0xB0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src), - "cmpxchg{b}\t{$src, $dst|$dst, $src}", []>, TB; + "cmpxchg{b}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_REG8>, TB; def CMPXCHG16rr : I<0xB1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), - "cmpxchg{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; + "cmpxchg{w}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_REG>, TB, OpSize; def CMPXCHG32rr : I<0xB1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), - "cmpxchg{l}\t{$src, $dst|$dst, $src}", []>, TB; + "cmpxchg{l}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_REG>, TB; def CMPXCHG64rr : RI<0xB1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), - "cmpxchg{q}\t{$src, $dst|$dst, $src}", []>, TB; + "cmpxchg{q}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_REG>, TB; let mayLoad = 1, mayStore = 1 in { def CMPXCHG8rm : I<0xB0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src), - "cmpxchg{b}\t{$src, $dst|$dst, $src}", []>, TB; + "cmpxchg{b}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_MEM8>, TB; def CMPXCHG16rm : I<0xB1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), - "cmpxchg{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; + "cmpxchg{w}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_MEM>, TB, OpSize; def CMPXCHG32rm : I<0xB1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), - "cmpxchg{l}\t{$src, $dst|$dst, $src}", []>, TB; + "cmpxchg{l}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_MEM>, TB; def CMPXCHG64rm : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), - "cmpxchg{q}\t{$src, $dst|$dst, $src}", []>, TB; + "cmpxchg{q}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_MEM>, TB; } let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] 
in
def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst),
-                  "cmpxchg8b\t$dst", []>, TB;
+                  "cmpxchg8b\t$dst", [], IIC_CMPXCHG_8B>, TB;

let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in
def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst),
-                    "cmpxchg16b\t$dst", []>, TB, Requires<[HasCmpxchg16b]>;
+                    "cmpxchg16b\t$dst", [], IIC_CMPXCHG_16B>,
+                    TB, Requires<[HasCmpxchg16b]>;

@@ -1281,69 +1388,75 @@ def REPNE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "repne", []>;

// String manipulation instructions

-def LODSB : I<0xAC, RawFrm, (outs), (ins), "lodsb", []>;
-def LODSW : I<0xAD, RawFrm, (outs), (ins), "lodsw", []>, OpSize;
-def LODSD : I<0xAD, RawFrm, (outs), (ins), "lods{l|d}", []>;
-def LODSQ : RI<0xAD, RawFrm, (outs), (ins), "lodsq", []>;
+def LODSB : I<0xAC, RawFrm, (outs), (ins), "lodsb", [], IIC_LODS>;
+def LODSW : I<0xAD, RawFrm, (outs), (ins), "lodsw", [], IIC_LODS>, OpSize;
+def LODSD : I<0xAD, RawFrm, (outs), (ins), "lods{l|d}", [], IIC_LODS>;
+def LODSQ : RI<0xAD, RawFrm, (outs), (ins), "lodsq", [], IIC_LODS>;

-def OUTSB : I<0x6E, RawFrm, (outs), (ins), "outsb", []>;
-def OUTSW : I<0x6F, RawFrm, (outs), (ins), "outsw", []>, OpSize;
-def OUTSD : I<0x6F, RawFrm, (outs), (ins), "outs{l|d}", []>;
+def OUTSB : I<0x6E, RawFrm, (outs), (ins), "outsb", [], IIC_OUTS>;
+def OUTSW : I<0x6F, RawFrm, (outs), (ins), "outsw", [], IIC_OUTS>, OpSize;
+def OUTSD : I<0x6F, RawFrm, (outs), (ins), "outs{l|d}", [], IIC_OUTS>;

// Flag instructions
-def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", []>;
-def STC : I<0xF9, RawFrm, (outs), (ins), "stc", []>;
-def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", []>;
-def STI : I<0xFB, RawFrm, (outs), (ins), "sti", []>;
-def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", []>;
-def STD : I<0xFD, RawFrm, (outs), (ins), "std", []>;
-def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", []>;
+def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", [], IIC_CLC>;
+def STC : I<0xF9, RawFrm, (outs), (ins), "stc", [], IIC_STC>;
+def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", [], IIC_CLI>;
+def STI : I<0xFB, RawFrm, (outs), (ins), "sti", [], IIC_STI>;
+def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", [], IIC_CLD>;
+def STD : I<0xFD, RawFrm, (outs), (ins), "std", [], IIC_STD>;
+def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", [], IIC_CMC>;

-def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", []>, TB;
+def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", [], IIC_CLTS>, TB;

// Table lookup instructions
-def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", []>;
+def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", [], IIC_XLAT>;

// ASCII Adjust After Addition
// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS
-def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", []>, Requires<[In32BitMode]>;
+def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", [], IIC_AAA>,
+          Requires<[In32BitMode]>;

// ASCII Adjust AX Before Division
// sets AL, AH and EFLAGS and uses AL and AH
def AAD8i8 : Ii8<0xD5, RawFrm, (outs), (ins i8imm:$src),
-                 "aad\t$src", []>, Requires<[In32BitMode]>;
+                 "aad\t$src", [], IIC_AAD>, Requires<[In32BitMode]>;

// ASCII Adjust AX After Multiply
// sets AL, AH and EFLAGS and uses AL
def AAM8i8 : Ii8<0xD4, RawFrm, (outs), (ins i8imm:$src),
-                 "aam\t$src", []>, Requires<[In32BitMode]>;
+                 "aam\t$src", [], IIC_AAM>, Requires<[In32BitMode]>;

// ASCII Adjust AL After Subtraction
// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS
-def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", []>, Requires<[In32BitMode]>;
+def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", [],
IIC_AAS>, + Requires<[In32BitMode]>; // Decimal Adjust AL after Addition // sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS -def DAA : I<0x27, RawFrm, (outs), (ins), "daa", []>, Requires<[In32BitMode]>; +def DAA : I<0x27, RawFrm, (outs), (ins), "daa", [], IIC_DAA>, + Requires<[In32BitMode]>; // Decimal Adjust AL after Subtraction // sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS -def DAS : I<0x2F, RawFrm, (outs), (ins), "das", []>, Requires<[In32BitMode]>; +def DAS : I<0x2F, RawFrm, (outs), (ins), "das", [], IIC_DAS>, + Requires<[In32BitMode]>; // Check Array Index Against Bounds def BOUNDS16rm : I<0x62, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), - "bound\t{$src, $dst|$dst, $src}", []>, OpSize, + "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize, Requires<[In32BitMode]>; def BOUNDS32rm : I<0x62, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), - "bound\t{$src, $dst|$dst, $src}", []>, + "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, Requires<[In32BitMode]>; // Adjust RPL Field of Segment Selector def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$src), (ins GR16:$dst), - "arpl\t{$src, $dst|$dst, $src}", []>, Requires<[In32BitMode]>; + "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_REG>, + Requires<[In32BitMode]>; def ARPL16mr : I<0x63, MRMSrcMem, (outs GR16:$src), (ins i16mem:$dst), - "arpl\t{$src, $dst|$dst, $src}", []>, Requires<[In32BitMode]>; + "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_MEM>, + Requires<[In32BitMode]>; //===----------------------------------------------------------------------===// // MOVBE Instructions @@ -1351,22 +1464,28 @@ def ARPL16mr : I<0x63, MRMSrcMem, (outs GR16:$src), (ins i16mem:$dst), let Predicates = [HasMOVBE] in { def MOVBE16rm : I<0xF0, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "movbe{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, (bswap (loadi16 addr:$src)))]>, OpSize, T8; + [(set GR16:$dst, (bswap (loadi16 addr:$src)))], IIC_MOVBE>, + OpSize, T8; def MOVBE32rm : I<0xF0, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "movbe{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (bswap (loadi32 addr:$src)))]>, T8; + [(set GR32:$dst, (bswap (loadi32 addr:$src)))], IIC_MOVBE>, + T8; def MOVBE64rm : RI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "movbe{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (bswap (loadi64 addr:$src)))]>, T8; + [(set GR64:$dst, (bswap (loadi64 addr:$src)))], IIC_MOVBE>, + T8; def MOVBE16mr : I<0xF1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), "movbe{w}\t{$src, $dst|$dst, $src}", - [(store (bswap GR16:$src), addr:$dst)]>, OpSize, T8; + [(store (bswap GR16:$src), addr:$dst)], IIC_MOVBE>, + OpSize, T8; def MOVBE32mr : I<0xF1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "movbe{l}\t{$src, $dst|$dst, $src}", - [(store (bswap GR32:$src), addr:$dst)]>, T8; + [(store (bswap GR32:$src), addr:$dst)], IIC_MOVBE>, + T8; def MOVBE64mr : RI<0xF1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "movbe{q}\t{$src, $dst|$dst, $src}", - [(store (bswap GR64:$src), addr:$dst)]>, T8; + [(store (bswap GR64:$src), addr:$dst)], IIC_MOVBE>, + T8; } //===----------------------------------------------------------------------===// @@ -1374,11 +1493,14 @@ let Predicates = [HasMOVBE] in { // let Predicates = [HasRDRAND], Defs = [EFLAGS] in { def RDRAND16r : I<0xC7, MRM6r, (outs GR16:$dst), (ins), - "rdrand{w}\t$dst", []>, OpSize, TB; + "rdrand{w}\t$dst", + [(set GR16:$dst, EFLAGS, (X86rdrand))]>, OpSize, TB; def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins), - "rdrand{l}\t$dst", []>, TB; + 
"rdrand{l}\t$dst", + [(set GR32:$dst, EFLAGS, (X86rdrand))]>, TB; def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins), - "rdrand{q}\t$dst", []>, TB; + "rdrand{q}\t$dst", + [(set GR64:$dst, EFLAGS, (X86rdrand))]>, TB; } //===----------------------------------------------------------------------===// @@ -1774,9 +1896,9 @@ def : InstAlias<"fdivp %st(0), $op", (DIVR_FPrST0 RST:$op)>; def : InstAlias<"fdivrp %st(0), $op", (DIV_FPrST0 RST:$op)>; // We accept "fnstsw %eax" even though it only writes %ax. -def : InstAlias<"fnstsw %eax", (FNSTSW8r)>; -def : InstAlias<"fnstsw %al" , (FNSTSW8r)>; -def : InstAlias<"fnstsw" , (FNSTSW8r)>; +def : InstAlias<"fnstsw %eax", (FNSTSW16r)>; +def : InstAlias<"fnstsw %al" , (FNSTSW16r)>; +def : InstAlias<"fnstsw" , (FNSTSW16r)>; // lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but // this is compatible with what GAS does. diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index 63f96b6..e4edd36 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -20,71 +20,130 @@ // MMX Multiclasses //===----------------------------------------------------------------------===// +def MMX_INTALU_ITINS : OpndItins< + IIC_MMX_ALU_RR, IIC_MMX_ALU_RM +>; + +def MMX_INTALUQ_ITINS : OpndItins< + IIC_MMX_ALUQ_RR, IIC_MMX_ALUQ_RM +>; + +def MMX_PHADDSUBW : OpndItins< + IIC_MMX_PHADDSUBW_RR, IIC_MMX_PHADDSUBW_RM +>; + +def MMX_PHADDSUBD : OpndItins< + IIC_MMX_PHADDSUBD_RR, IIC_MMX_PHADDSUBD_RM +>; + +def MMX_PMUL_ITINS : OpndItins< + IIC_MMX_PMUL, IIC_MMX_PMUL +>; + +def MMX_PSADBW_ITINS : OpndItins< + IIC_MMX_PSADBW, IIC_MMX_PSADBW +>; + +def MMX_MISC_FUNC_ITINS : OpndItins< + IIC_MMX_MISC_FUNC_MEM, IIC_MMX_MISC_FUNC_REG +>; + +def MMX_SHIFT_ITINS : ShiftOpndItins< + IIC_MMX_SHIFT_RR, IIC_MMX_SHIFT_RM, IIC_MMX_SHIFT_RI +>; + +def MMX_UNPCK_H_ITINS : OpndItins< + IIC_MMX_UNPCK_H_RR, IIC_MMX_UNPCK_H_RM +>; + +def MMX_UNPCK_L_ITINS : OpndItins< + IIC_MMX_UNPCK_L, IIC_MMX_UNPCK_L +>; + +def MMX_PCK_ITINS : OpndItins< + IIC_MMX_PCK_RR, IIC_MMX_PCK_RM +>; + +def MMX_PSHUF_ITINS : OpndItins< + IIC_MMX_PSHUF, IIC_MMX_PSHUF +>; + +def MMX_CVT_PD_ITINS : OpndItins< + IIC_MMX_CVT_PD_RR, IIC_MMX_CVT_PD_RM +>; + +def MMX_CVT_PS_ITINS : OpndItins< + IIC_MMX_CVT_PS_RR, IIC_MMX_CVT_PS_RM +>; + let Constraints = "$src1 = $dst" in { // MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic. // When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp. 
multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId, - bit Commutable = 0> { + OpndItins itins, bit Commutable = 0> { def irr : MMXI<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, VR64:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]> { + [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr> { let isCommutable = Commutable; } def irm : MMXI<opc, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), [(set VR64:$dst, (IntId VR64:$src1, - (bitconvert (load_mmx addr:$src2))))]>; + (bitconvert (load_mmx addr:$src2))))], + itins.rm>; } multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm, string OpcodeStr, Intrinsic IntId, - Intrinsic IntId2> { + Intrinsic IntId2, ShiftOpndItins itins> { def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, VR64:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]>; + [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>; def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), [(set VR64:$dst, (IntId VR64:$src1, - (bitconvert (load_mmx addr:$src2))))]>; + (bitconvert (load_mmx addr:$src2))))], + itins.rm>; def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst), (ins VR64:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR64:$dst, (IntId2 VR64:$src1, (i32 imm:$src2)))]>; + [(set VR64:$dst, (IntId2 VR64:$src1, (i32 imm:$src2)))], itins.ri>; } } /// Unary MMX instructions requiring SSSE3. multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr, - Intrinsic IntId64> { + Intrinsic IntId64, OpndItins itins> { def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR64:$dst, (IntId64 VR64:$src))]>; + [(set VR64:$dst, (IntId64 VR64:$src))], itins.rr>; def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR64:$dst, - (IntId64 (bitconvert (memopmmx addr:$src))))]>; + (IntId64 (bitconvert (memopmmx addr:$src))))], + itins.rm>; } /// Binary MMX instructions requiring SSSE3. 
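
// A defm of one of the itinerary-parameterized multiclasses above expands
// into one def per operand form, each grabbing the matching field from the
// bundle. A runnable miniature of that expansion (stub classes; the real
// multiclasses thread the itinerary into the MMXI/SS38I instruction bases,
// and the SSSE3 binary multiclass below follows the same pattern):
class InstrItinClass;
def IIC_MINI_RR : InstrItinClass;
def IIC_MINI_RM : InstrItinClass;
class MiniItins<InstrItinClass a, InstrItinClass b> {
  InstrItinClass rr = a;
  InstrItinClass rm = b;
}
def MINI_ALU_ITINS : MiniItins<IIC_MINI_RR, IIC_MINI_RM>;

class MiniInst<string asm, InstrItinClass itin> {
  string AsmString = asm;
  InstrItinClass Itinerary = itin;
}
multiclass mini_binop<string OpcodeStr, MiniItins itins> {
  def irr : MiniInst<!strconcat(OpcodeStr, " $dst, $src"),   itins.rr>;
  def irm : MiniInst<!strconcat(OpcodeStr, " $dst, [$src]"), itins.rm>;
}
// Expands to MINI_PADDBirr (Itinerary = IIC_MINI_RR) and MINI_PADDBirm
// (Itinerary = IIC_MINI_RM), mirroring how MMX_PADDB picks up its timings.
defm MINI_PADDB : mini_binop<"paddb", MINI_ALU_ITINS>;
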
let ImmT = NoImm, Constraints = "$src1 = $dst" in { multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr, - Intrinsic IntId64> { + Intrinsic IntId64, OpndItins itins> { let isCommutable = 0 in def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, VR64:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]>; + [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))], itins.rr>; def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), [(set VR64:$dst, (IntId64 VR64:$src1, - (bitconvert (memopmmx addr:$src2))))]>; + (bitconvert (memopmmx addr:$src2))))], itins.rm>; } } @@ -103,13 +162,13 @@ multiclass ssse3_palign_mm<string asm, Intrinsic IntId> { multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag, - string asm, Domain d> { + string asm, OpndItins itins, Domain d> { def irr : PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, [(set DstRC:$dst, (Int SrcRC:$src))], - IIC_DEFAULT, d>; + itins.rr, d>; def irm : PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, [(set DstRC:$dst, (Int (ld_frag addr:$src)))], - IIC_DEFAULT, d>; + itins.rm, d>; } multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC, @@ -139,22 +198,24 @@ def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms", def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR64:$dst, - (x86mmx (scalar_to_vector GR32:$src)))]>; + (x86mmx (scalar_to_vector GR32:$src)))], + IIC_MMX_MOV_MM_RM>; let canFoldAsLoad = 1 in def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", - [(set VR64:$dst, - (x86mmx (scalar_to_vector (loadi32 addr:$src))))]>; + [(set VR64:$dst, + (x86mmx (scalar_to_vector (loadi32 addr:$src))))], + IIC_MMX_MOV_MM_RM>; let mayStore = 1 in def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src), - "movd\t{$src, $dst|$dst, $src}", []>; + "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_MM_RM>; def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs), (ins GR32:$dst, VR64:$src), - "movd\t{$src, $dst|$dst, $src}", []>; + "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_REG_MM>; let neverHasSideEffects = 1 in def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src), "movd\t{$src, $dst|$dst, $src}", - []>; + [], IIC_MMX_MOV_MM_RM>; // These are 64 bit moves, but since the OS X assembler doesn't // recognize a register-register movq, we write them as @@ -163,197 +224,276 @@ def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR64:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR64:$dst, - (bitconvert VR64:$src))]>; + (bitconvert VR64:$src))], IIC_MMX_MOV_REG_MM>; def MMX_MOVD64rrv164 : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR64:$dst, - (bitconvert GR64:$src))]>; + (bitconvert GR64:$src))], IIC_MMX_MOV_MM_RM>; let neverHasSideEffects = 1 in def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), - "movq\t{$src, $dst|$dst, $src}", []>; + "movq\t{$src, $dst|$dst, $src}", [], + IIC_MMX_MOVQ_RR>; let canFoldAsLoad = 1 in def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src), "movq\t{$src, $dst|$dst, $src}", - [(set VR64:$dst, (load_mmx addr:$src))]>; + [(set VR64:$dst, (load_mmx addr:$src))], + 
IIC_MMX_MOVQ_RM>; def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), "movq\t{$src, $dst|$dst, $src}", - [(store (x86mmx VR64:$src), addr:$dst)]>; + [(store (x86mmx VR64:$src), addr:$dst)], + IIC_MMX_MOVQ_RM>; def MMX_MOVDQ2Qrr : SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}", [(set VR64:$dst, (x86mmx (bitconvert (i64 (vector_extract (v2i64 VR128:$src), - (iPTR 0))))))]>; + (iPTR 0))))))], + IIC_MMX_MOVQ_RR>; def MMX_MOVQ2DQrr : SSDIi8<0xD6, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (scalar_to_vector - (i64 (bitconvert (x86mmx VR64:$src))))))]>; + (i64 (bitconvert (x86mmx VR64:$src))))))], + IIC_MMX_MOVQ_RR>; let neverHasSideEffects = 1 in def MMX_MOVQ2FR64rr: SSDIi8<0xD6, MRMSrcReg, (outs FR64:$dst), - (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", []>; + (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", [], + IIC_MMX_MOVQ_RR>; def MMX_MOVFR642Qrr: SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), - (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}", []>; + (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}", [], + IIC_MMX_MOVQ_RR>; def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), "movntq\t{$src, $dst|$dst, $src}", - [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)]>; + [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)], + IIC_MMX_MOVQ_RM>; let AddedComplexity = 15 in // movd to MMX register zero-extends def MMX_MOVZDI2PDIrr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR64:$dst, - (x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))))]>; + (x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))))], + IIC_MMX_MOV_MM_RM>; let AddedComplexity = 20 in def MMX_MOVZDI2PDIrm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR64:$dst, (x86mmx (X86vzmovl (x86mmx - (scalar_to_vector (loadi32 addr:$src))))))]>; + (scalar_to_vector (loadi32 addr:$src))))))], + IIC_MMX_MOV_MM_RM>; // Arithmetic Instructions -defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b>; -defm MMX_PABSW : SS3I_unop_rm_int_mm<0x1D, "pabsw", int_x86_ssse3_pabs_w>; -defm MMX_PABSD : SS3I_unop_rm_int_mm<0x1E, "pabsd", int_x86_ssse3_pabs_d>; +defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b, + MMX_INTALU_ITINS>; +defm MMX_PABSW : SS3I_unop_rm_int_mm<0x1D, "pabsw", int_x86_ssse3_pabs_w, + MMX_INTALU_ITINS>; +defm MMX_PABSD : SS3I_unop_rm_int_mm<0x1E, "pabsd", int_x86_ssse3_pabs_d, + MMX_INTALU_ITINS>; // -- Addition -defm MMX_PADDB : MMXI_binop_rm_int<0xFC, "paddb", int_x86_mmx_padd_b, 1>; -defm MMX_PADDW : MMXI_binop_rm_int<0xFD, "paddw", int_x86_mmx_padd_w, 1>; -defm MMX_PADDD : MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d, 1>; -defm MMX_PADDQ : MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q, 1>; -defm MMX_PADDSB : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b, 1>; -defm MMX_PADDSW : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w, 1>; - -defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b, 1>; -defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w, 1>; - -defm MMX_PHADDW : SS3I_binop_rm_int_mm<0x01, "phaddw", int_x86_ssse3_phadd_w>; -defm MMX_PHADD : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d>; -defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw>; +defm MMX_PADDB : MMXI_binop_rm_int<0xFC, "paddb", 
int_x86_mmx_padd_b,
+                                   MMX_INTALU_ITINS, 1>;
+defm MMX_PADDW : MMXI_binop_rm_int<0xFD, "paddw", int_x86_mmx_padd_w,
+                                   MMX_INTALU_ITINS, 1>;
+defm MMX_PADDD : MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d,
+                                   MMX_INTALU_ITINS, 1>;
+defm MMX_PADDQ : MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q,
+                                   MMX_INTALUQ_ITINS, 1>;
+defm MMX_PADDSB  : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b,
+                                     MMX_INTALU_ITINS, 1>;
+defm MMX_PADDSW  : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w,
+                                     MMX_INTALU_ITINS, 1>;
+
+defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b,
+                                     MMX_INTALU_ITINS, 1>;
+defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w,
+                                     MMX_INTALU_ITINS, 1>;
+
+defm MMX_PHADDW  : SS3I_binop_rm_int_mm<0x01, "phaddw", int_x86_ssse3_phadd_w,
+                                        MMX_PHADDSUBW>;
+defm MMX_PHADD   : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d,
+                                        MMX_PHADDSUBD>;
+defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw,
+                                        MMX_PHADDSUBW>;

// -- Subtraction
-defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b>;
-defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w>;
-defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d>;
-defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q>;
-
-defm MMX_PSUBSB  : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b>;
-defm MMX_PSUBSW  : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w>;
-
-defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b>;
-defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w>;
-
-defm MMX_PHSUBW  : SS3I_binop_rm_int_mm<0x05, "phsubw", int_x86_ssse3_phsub_w>;
-defm MMX_PHSUBD  : SS3I_binop_rm_int_mm<0x06, "phsubd", int_x86_ssse3_phsub_d>;
-defm MMX_PHSUBSW : SS3I_binop_rm_int_mm<0x07, "phsubsw",int_x86_ssse3_phsub_sw>;
+defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b,
+                                   MMX_INTALU_ITINS>;
+defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w,
+                                   MMX_INTALU_ITINS>;
+defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d,
+                                   MMX_INTALU_ITINS>;
+defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q,
+                                   MMX_INTALUQ_ITINS>;
+
+defm MMX_PSUBSB  : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b,
+                                     MMX_INTALU_ITINS>;
+defm MMX_PSUBSW  : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w,
+                                     MMX_INTALU_ITINS>;
+
+defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b,
+                                     MMX_INTALU_ITINS>;
+defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w,
+                                     MMX_INTALU_ITINS>;
+
+defm MMX_PHSUBW  : SS3I_binop_rm_int_mm<0x05, "phsubw", int_x86_ssse3_phsub_w,
+                                        MMX_PHADDSUBW>;
+defm MMX_PHSUBD  : SS3I_binop_rm_int_mm<0x06, "phsubd", int_x86_ssse3_phsub_d,
+                                        MMX_PHADDSUBD>;
+defm MMX_PHSUBSW : SS3I_binop_rm_int_mm<0x07, "phsubsw",int_x86_ssse3_phsub_sw,
+                                        MMX_PHADDSUBW>;

// -- Multiplication
-defm MMX_PMULLW  : MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w, 1>;
-
-defm MMX_PMULHW  : MMXI_binop_rm_int<0xE5, "pmulhw",  int_x86_mmx_pmulh_w,  1>;
-defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w, 1>;
-defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq, 1>;
+defm MMX_PMULLW  : MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w,
+                                     MMX_PMUL_ITINS, 1>;
+
+defm MMX_PMULHW  : MMXI_binop_rm_int<0xE5, "pmulhw",  int_x86_mmx_pmulh_w,
+                                     MMX_PMUL_ITINS, 1>;
+defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw",
int_x86_mmx_pmulhu_w, + MMX_PMUL_ITINS, 1>; +defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq, + MMX_PMUL_ITINS, 1>; let isCommutable = 1 in defm MMX_PMULHRSW : SS3I_binop_rm_int_mm<0x0B, "pmulhrsw", - int_x86_ssse3_pmul_hr_sw>; + int_x86_ssse3_pmul_hr_sw, MMX_PMUL_ITINS>; // -- Miscellanea -defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd, 1>; +defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd, + MMX_PMUL_ITINS, 1>; defm MMX_PMADDUBSW : SS3I_binop_rm_int_mm<0x04, "pmaddubsw", - int_x86_ssse3_pmadd_ub_sw>; -defm MMX_PAVGB : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b, 1>; -defm MMX_PAVGW : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w, 1>; - -defm MMX_PMINUB : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b, 1>; -defm MMX_PMINSW : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w, 1>; - -defm MMX_PMAXUB : MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b, 1>; -defm MMX_PMAXSW : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w, 1>; - -defm MMX_PSADBW : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw, 1>; - -defm MMX_PSIGNB : SS3I_binop_rm_int_mm<0x08, "psignb", int_x86_ssse3_psign_b>; -defm MMX_PSIGNW : SS3I_binop_rm_int_mm<0x09, "psignw", int_x86_ssse3_psign_w>; -defm MMX_PSIGND : SS3I_binop_rm_int_mm<0x0A, "psignd", int_x86_ssse3_psign_d>; + int_x86_ssse3_pmadd_ub_sw, MMX_PMUL_ITINS>; +defm MMX_PAVGB : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b, + MMX_MISC_FUNC_ITINS, 1>; +defm MMX_PAVGW : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w, + MMX_MISC_FUNC_ITINS, 1>; + +defm MMX_PMINUB : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b, + MMX_MISC_FUNC_ITINS, 1>; +defm MMX_PMINSW : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w, + MMX_MISC_FUNC_ITINS, 1>; + +defm MMX_PMAXUB : MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b, + MMX_MISC_FUNC_ITINS, 1>; +defm MMX_PMAXSW : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w, + MMX_MISC_FUNC_ITINS, 1>; + +defm MMX_PSADBW : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw, + MMX_PSADBW_ITINS, 1>; + +defm MMX_PSIGNB : SS3I_binop_rm_int_mm<0x08, "psignb", int_x86_ssse3_psign_b, + MMX_MISC_FUNC_ITINS>; +defm MMX_PSIGNW : SS3I_binop_rm_int_mm<0x09, "psignw", int_x86_ssse3_psign_w, + MMX_MISC_FUNC_ITINS>; +defm MMX_PSIGND : SS3I_binop_rm_int_mm<0x0A, "psignd", int_x86_ssse3_psign_d, + MMX_MISC_FUNC_ITINS>; let Constraints = "$src1 = $dst" in defm MMX_PALIGN : ssse3_palign_mm<"palignr", int_x86_mmx_palignr_b>; // Logical Instructions -defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand, 1>; -defm MMX_POR : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por, 1>; -defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor, 1>; -defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn>; +defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand, + MMX_INTALU_ITINS, 1>; +defm MMX_POR : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por, + MMX_INTALU_ITINS, 1>; +defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor, + MMX_INTALU_ITINS, 1>; +defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn, + MMX_INTALU_ITINS>; // Shift Instructions defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw", - int_x86_mmx_psrl_w, int_x86_mmx_psrli_w>; + int_x86_mmx_psrl_w, int_x86_mmx_psrli_w, + MMX_SHIFT_ITINS>; defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld", - int_x86_mmx_psrl_d, int_x86_mmx_psrli_d>; + int_x86_mmx_psrl_d, int_x86_mmx_psrli_d, 
+ MMX_SHIFT_ITINS>; defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq", - int_x86_mmx_psrl_q, int_x86_mmx_psrli_q>; + int_x86_mmx_psrl_q, int_x86_mmx_psrli_q, + MMX_SHIFT_ITINS>; defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw", - int_x86_mmx_psll_w, int_x86_mmx_pslli_w>; + int_x86_mmx_psll_w, int_x86_mmx_pslli_w, + MMX_SHIFT_ITINS>; defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld", - int_x86_mmx_psll_d, int_x86_mmx_pslli_d>; + int_x86_mmx_psll_d, int_x86_mmx_pslli_d, + MMX_SHIFT_ITINS>; defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq", - int_x86_mmx_psll_q, int_x86_mmx_pslli_q>; + int_x86_mmx_psll_q, int_x86_mmx_pslli_q, + MMX_SHIFT_ITINS>; defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw", - int_x86_mmx_psra_w, int_x86_mmx_psrai_w>; + int_x86_mmx_psra_w, int_x86_mmx_psrai_w, + MMX_SHIFT_ITINS>; defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad", - int_x86_mmx_psra_d, int_x86_mmx_psrai_d>; + int_x86_mmx_psra_d, int_x86_mmx_psrai_d, + MMX_SHIFT_ITINS>; // Comparison Instructions -defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b>; -defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w>; -defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d>; - -defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b>; -defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w>; -defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d>; +defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b, + MMX_INTALU_ITINS>; +defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w, + MMX_INTALU_ITINS>; +defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d, + MMX_INTALU_ITINS>; + +defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b, + MMX_INTALU_ITINS>; +defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w, + MMX_INTALU_ITINS>; +defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d, + MMX_INTALU_ITINS>; // -- Unpack Instructions defm MMX_PUNPCKHBW : MMXI_binop_rm_int<0x68, "punpckhbw", - int_x86_mmx_punpckhbw>; + int_x86_mmx_punpckhbw, + MMX_UNPCK_H_ITINS>; defm MMX_PUNPCKHWD : MMXI_binop_rm_int<0x69, "punpckhwd", - int_x86_mmx_punpckhwd>; + int_x86_mmx_punpckhwd, + MMX_UNPCK_H_ITINS>; defm MMX_PUNPCKHDQ : MMXI_binop_rm_int<0x6A, "punpckhdq", - int_x86_mmx_punpckhdq>; + int_x86_mmx_punpckhdq, + MMX_UNPCK_H_ITINS>; defm MMX_PUNPCKLBW : MMXI_binop_rm_int<0x60, "punpcklbw", - int_x86_mmx_punpcklbw>; + int_x86_mmx_punpcklbw, + MMX_UNPCK_L_ITINS>; defm MMX_PUNPCKLWD : MMXI_binop_rm_int<0x61, "punpcklwd", - int_x86_mmx_punpcklwd>; + int_x86_mmx_punpcklwd, + MMX_UNPCK_L_ITINS>; defm MMX_PUNPCKLDQ : MMXI_binop_rm_int<0x62, "punpckldq", - int_x86_mmx_punpckldq>; + int_x86_mmx_punpckldq, + MMX_UNPCK_L_ITINS>; // -- Pack Instructions -defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb>; -defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw>; -defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb>; +defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb, + MMX_PCK_ITINS>; +defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw, + MMX_PCK_ITINS>; +defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb, + MMX_PCK_ITINS>; // -- Shuffle Instructions -defm MMX_PSHUFB : 
SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b>; +defm MMX_PSHUFB : SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b, + MMX_PSHUF_ITINS>; def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, i8imm:$src2), "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR64:$dst, - (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))]>; + (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))], + IIC_MMX_PSHUF>; def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src1, i8imm:$src2), "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR64:$dst, (int_x86_sse_pshuf_w (load_mmx addr:$src1), - imm:$src2))]>; - + imm:$src2))], + IIC_MMX_PSHUF>; @@ -361,24 +501,24 @@ def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem, // -- Conversion Instructions defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi, f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}", - SSEPackedSingle>, TB; + MMX_CVT_PS_ITINS, SSEPackedSingle>, TB; defm MMX_CVTPD2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtpd2pi, f128mem, memop, "cvtpd2pi\t{$src, $dst|$dst, $src}", - SSEPackedDouble>, TB, OpSize; + MMX_CVT_PD_ITINS, SSEPackedDouble>, TB, OpSize; defm MMX_CVTTPS2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttps2pi, f64mem, load, "cvttps2pi\t{$src, $dst|$dst, $src}", - SSEPackedSingle>, TB; + MMX_CVT_PS_ITINS, SSEPackedSingle>, TB; defm MMX_CVTTPD2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttpd2pi, f128mem, memop, "cvttpd2pi\t{$src, $dst|$dst, $src}", - SSEPackedDouble>, TB, OpSize; + MMX_CVT_PD_ITINS, SSEPackedDouble>, TB, OpSize; defm MMX_CVTPI2PD : sse12_cvt_pint<0x2A, VR64, VR128, int_x86_sse_cvtpi2pd, i64mem, load, "cvtpi2pd\t{$src, $dst|$dst, $src}", - SSEPackedDouble>, TB, OpSize; + MMX_CVT_PD_ITINS, SSEPackedDouble>, TB, OpSize; let Constraints = "$src1 = $dst" in { defm MMX_CVTPI2PS : sse12_cvt_pint_3addr<0x2A, VR64, VR128, int_x86_sse_cvtpi2ps, i64mem, load, "cvtpi2ps\t{$src2, $dst|$dst, $src2}", - SSEPackedSingle>, TB; + SSEPackedSingle>, TB; } // Extract / Insert @@ -386,14 +526,16 @@ def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg, (outs GR32:$dst), (ins VR64:$src1, i32i8imm:$src2), "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32:$dst, (int_x86_mmx_pextr_w VR64:$src1, - (iPTR imm:$src2)))]>; + (iPTR imm:$src2)))], + IIC_MMX_PEXTR>; let Constraints = "$src1 = $dst" in { def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, GR32:$src2, i32i8imm:$src3), "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, - GR32:$src2, (iPTR imm:$src3)))]>; + GR32:$src2, (iPTR imm:$src3)))], + IIC_MMX_PINSRW>; def MMX_PINSRWirmi : MMXIi8<0xC4, MRMSrcMem, (outs VR64:$dst), @@ -401,7 +543,8 @@ let Constraints = "$src1 = $dst" in { "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, (i32 (anyext (loadi16 addr:$src2))), - (iPTR imm:$src3)))]>; + (iPTR imm:$src3)))], + IIC_MMX_PINSRW>; } // Mask creation @@ -439,11 +582,13 @@ def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))), let Uses = [EDI] in def MMX_MASKMOVQ : MMXI<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), "maskmovq\t{$mask, $src|$src, $mask}", - [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)]>; + [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)], + IIC_MMX_MASKMOV>; let Uses = [RDI] in def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), "maskmovq\t{$mask, $src|$src, $mask}", - [(int_x86_mmx_maskmovq VR64:$src, 
VR64:$mask, RDI)]>; + [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)], + IIC_MMX_MASKMOV>; // 64-bit bit convert. def : Pat<(x86mmx (bitconvert (i64 GR64:$src))), diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 65e3c1e..c2d169a 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -1418,10 +1418,10 @@ multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, string asm, Domain d, OpndItins itins> { - def rr : PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, + def rr : I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, [(set DstRC:$dst, (OpNode SrcRC:$src))], itins.rr, d>; - def rm : PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, + def rm : I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))], itins.rm, d>; } @@ -1622,7 +1622,7 @@ defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, "cvttsd2si{q}", SSE_CVT_SD2SI>, XD, REX_W; -let Pattern = []<dag> in { +let Pattern = []<dag>, neverHasSideEffects = 1 in { defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load, "cvtss2si{l}\t{$src, $dst|$dst, $src}", SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG; @@ -1630,14 +1630,16 @@ defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, undef, f32mem, load, "cvtss2si\t{$src, $dst|$dst, $src}", SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG; defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, i128mem, load, - "cvtdq2ps\t{$src, $dst|$dst, $src}", - SSEPackedSingle, SSE_CVT_PS>, TB, VEX; + "vcvtdq2ps\t{$src, $dst|$dst, $src}", + SSEPackedSingle, SSE_CVT_PS>, TB, VEX, + Requires<[HasAVX]>; defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, undef, i256mem, load, - "cvtdq2ps\t{$src, $dst|$dst, $src}", - SSEPackedSingle, SSE_CVT_PS>, TB, VEX; + "vcvtdq2ps\t{$src, $dst|$dst, $src}", + SSEPackedSingle, SSE_CVT_PS>, TB, VEX, + Requires<[HasAVX]>; } -let Pattern = []<dag> in { +let Pattern = []<dag>, neverHasSideEffects = 1 in { defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load /*dummy*/, "cvtss2si{l}\t{$src, $dst|$dst, $src}", SSE_CVT_SS2SI_32>, XS; @@ -1646,8 +1648,8 @@ defm CVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, undef, f32mem, load /*dummy*/, SSE_CVT_SS2SI_64>, XS, REX_W; defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, i128mem, load /*dummy*/, "cvtdq2ps\t{$src, $dst|$dst, $src}", - SSEPackedSingle, SSE_CVT_PS>, - TB; /* PD SSE3 form is avaiable */ + SSEPackedSingle, SSE_CVT_PS>, TB, + Requires<[HasSSE2]>; } let Predicates = [HasAVX] in { @@ -1788,57 +1790,6 @@ def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem, Requires<[HasSSE2]>; } -// Convert doubleword to packed single/double fp -// SSE2 instructions without OpSize prefix -def Int_VCVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "vcvtdq2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))], - IIC_SSE_CVT_PS_RR>, - TB, VEX, Requires<[HasAVX]>; -def Int_VCVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "vcvtdq2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtdq2ps - (bitconvert (memopv2i64 addr:$src))))], - IIC_SSE_CVT_PS_RM>, - TB, VEX, Requires<[HasAVX]>; -def Int_CVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtdq2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))], - 
IIC_SSE_CVT_PS_RR>, - TB, Requires<[HasSSE2]>; -def Int_CVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "cvtdq2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtdq2ps - (bitconvert (memopv2i64 addr:$src))))], - IIC_SSE_CVT_PS_RM>, - TB, Requires<[HasSSE2]>; - -// FIXME: why the non-intrinsic version is described as SSE3? -// SSE2 instructions with XS prefix -def Int_VCVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "vcvtdq2pd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))], - IIC_SSE_CVT_PD_RR>, - XS, VEX, Requires<[HasAVX]>; -def Int_VCVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), - "vcvtdq2pd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtdq2pd - (bitconvert (memopv2i64 addr:$src))))], - IIC_SSE_CVT_PD_RM>, - XS, VEX, Requires<[HasAVX]>; -def Int_CVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtdq2pd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))], - IIC_SSE_CVT_PD_RR>, - XS, Requires<[HasSSE2]>; -def Int_CVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), - "cvtdq2pd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtdq2pd - (bitconvert (memopv2i64 addr:$src))))], - IIC_SSE_CVT_PD_RM>, - XS, Requires<[HasSSE2]>; - - // Convert packed single/double fp to doubleword def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [], @@ -1859,51 +1810,63 @@ def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [], IIC_SSE_CVT_PS_RM>; -def Int_VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtps2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))], - IIC_SSE_CVT_PS_RR>, - VEX; -def Int_VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), - (ins f128mem:$src), - "cvtps2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtps2dq - (memop addr:$src)))], - IIC_SSE_CVT_PS_RM>, VEX; -def Int_CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtps2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))], - IIC_SSE_CVT_PS_RR>; -def Int_CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "cvtps2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtps2dq - (memop addr:$src)))], - IIC_SSE_CVT_PS_RM>; - -// SSE2 packed instructions with XD prefix -def Int_VCVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "vcvtpd2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))], - IIC_SSE_CVT_PD_RR>, - XD, VEX, Requires<[HasAVX]>; -def Int_VCVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "vcvtpd2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtpd2dq - (memop addr:$src)))], - IIC_SSE_CVT_PD_RM>, - XD, VEX, Requires<[HasAVX]>; -def Int_CVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtpd2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))], - IIC_SSE_CVT_PD_RR>, - XD, Requires<[HasSSE2]>; -def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "cvtpd2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtpd2dq - (memop addr:$src)))], - IIC_SSE_CVT_PD_RM>, - XD, Requires<[HasSSE2]>; +let Predicates = [HasAVX] in { + def : 
Pat<(int_x86_sse2_cvtps2dq VR128:$src), + (VCVTPS2DQrr VR128:$src)>; + def : Pat<(int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)), + (VCVTPS2DQrm addr:$src)>; +} + +let Predicates = [HasSSE2] in { + def : Pat<(int_x86_sse2_cvtps2dq VR128:$src), + (CVTPS2DQrr VR128:$src)>; + def : Pat<(int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)), + (CVTPS2DQrm addr:$src)>; +} + +// Convert Packed Double FP to Packed DW Integers +let Predicates = [HasAVX] in { +// The assembler can recognize rr 256-bit instructions by seeing a ymm +// register, but the same isn't true when using memory operands instead. +// Provide other assembly rr and rm forms to address this explicitly. +def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvtpd2dq\t{$src, $dst|$dst, $src}", []>, VEX; +// XMM only +def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}", + (VCVTPD2DQrr VR128:$dst, VR128:$src)>; +def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "vcvtpd2dqx\t{$src, $dst|$dst, $src}", []>, VEX; + +// YMM only +def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), + "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), + "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L; +def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}", + (VCVTPD2DQYrr VR128:$dst, VR256:$src)>; +} + +def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtpd2dq\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RM>; +def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtpd2dq\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RR>; + +let Predicates = [HasAVX] in { + def : Pat<(int_x86_sse2_cvtpd2dq VR128:$src), + (VCVTPD2DQrr VR128:$src)>; + def : Pat<(int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)), + (VCVTPD2DQXrm addr:$src)>; +} + +let Predicates = [HasSSE2] in { + def : Pat<(int_x86_sse2_cvtpd2dq VR128:$src), + (CVTPD2DQrr VR128:$src)>; + def : Pat<(int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)), + (CVTPD2DQrm addr:$src)>; +} // Convert with truncation packed single/double fp to doubleword // SSE2 packed instructions with XS prefix @@ -1915,7 +1878,7 @@ def VCVTTPS2DQrr : VSSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), def VCVTTPS2DQrm : VSSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttps2dq - (memop addr:$src)))], + (memopv4f32 addr:$src)))], IIC_SSE_CVT_PS_RM>, VEX; def VCVTTPS2DQYrr : VSSI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", @@ -1936,14 +1899,19 @@ def CVTTPS2DQrr : SSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), def CVTTPS2DQrm : SSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (int_x86_sse2_cvttps2dq (memop addr:$src)))], + (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))], IIC_SSE_CVT_PS_RM>; let Predicates = [HasAVX] in { def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), - (Int_VCVTDQ2PSrr VR128:$src)>; + (VCVTDQ2PSrr VR128:$src)>; def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))), - (Int_VCVTDQ2PSrm addr:$src)>; + (VCVTDQ2PSrm addr:$src)>; + + def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src), + (VCVTDQ2PSrr VR128:$src)>; + def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))), + (VCVTDQ2PSrm addr:$src)>; def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), (VCVTTPS2DQrr VR128:$src)>; @@ 
-1963,9 +1931,14 @@ let Predicates = [HasAVX] in { let Predicates = [HasSSE2] in { def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), - (Int_CVTDQ2PSrr VR128:$src)>; + (CVTDQ2PSrr VR128:$src)>; def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))), - (Int_CVTDQ2PSrm addr:$src)>; + (CVTDQ2PSrm addr:$src)>; + + def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src), + (CVTDQ2PSrr VR128:$src)>; + def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))), + (CVTDQ2PSrm addr:$src)>; def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), (CVTTPS2DQrr VR128:$src)>; @@ -1978,12 +1951,7 @@ def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))], IIC_SSE_CVT_PD_RR>, VEX; -let isCodeGenOnly = 1 in -def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "cvttpd2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvttpd2dq - (memop addr:$src)))], - IIC_SSE_CVT_PD_RM>, VEX; + def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))], @@ -1991,31 +1959,38 @@ def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttpd2dq - (memop addr:$src)))], + (memopv2f64 addr:$src)))], IIC_SSE_CVT_PD_RM>; // The assembler can recognize rr 256-bit instructions by seeing a ymm // register, but the same isn't true when using memory operands instead. // Provide other assembly rr and rm forms to address this explicitly. -def VCVTTPD2DQXrYr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), - "cvttpd2dq\t{$src, $dst|$dst, $src}", [], - IIC_SSE_CVT_PD_RR>, VEX; // XMM only -def VCVTTPD2DQXrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvttpd2dqx\t{$src, $dst|$dst, $src}", [], - IIC_SSE_CVT_PD_RR>, VEX; +def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}", + (VCVTTPD2DQrr VR128:$dst, VR128:$src)>; def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "cvttpd2dqx\t{$src, $dst|$dst, $src}", [], + "cvttpd2dqx\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvttpd2dq + (memopv2f64 addr:$src)))], IIC_SSE_CVT_PD_RM>, VEX; // YMM only def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), - "cvttpd2dqy\t{$src, $dst|$dst, $src}", [], + "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", [], IIC_SSE_CVT_PD_RR>, VEX; def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), - "cvttpd2dqy\t{$src, $dst|$dst, $src}", [], + "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", [], IIC_SSE_CVT_PD_RM>, VEX, VEX_L; +def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}", + (VCVTTPD2DQYrr VR128:$dst, VR256:$src)>; + +let Predicates = [HasAVX] in { + def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))), + (VCVTTPD2DQYrr VR256:$src)>; + def : Pat<(v4i32 (fp_to_sint (memopv4f64 addr:$src))), + (VCVTTPD2DQYrm addr:$src)>; +} // Predicates = [HasAVX] // Convert packed single to packed double let Predicates = [HasAVX] in { @@ -2033,35 +2008,71 @@ def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [], IIC_SSE_CVT_PD_RM>, TB, VEX; } + +let Predicates = [HasSSE2] in { def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", [], IIC_SSE_CVT_PD_RR>, 
TB; def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", [], IIC_SSE_CVT_PD_RM>, TB; +} -def Int_VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "vcvtps2pd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))], - IIC_SSE_CVT_PD_RR>, - TB, VEX, Requires<[HasAVX]>; -def Int_VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), - "vcvtps2pd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtps2pd - (load addr:$src)))], - IIC_SSE_CVT_PD_RM>, - TB, VEX, Requires<[HasAVX]>; -def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtps2pd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))], - IIC_SSE_CVT_PD_RR>, - TB, Requires<[HasSSE2]>; -def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), - "cvtps2pd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtps2pd - (load addr:$src)))], - IIC_SSE_CVT_PD_RM>, - TB, Requires<[HasSSE2]>; +let Predicates = [HasAVX] in { + def : Pat<(int_x86_sse2_cvtps2pd VR128:$src), + (VCVTPS2PDrr VR128:$src)>; +} + +let Predicates = [HasSSE2] in { + def : Pat<(int_x86_sse2_cvtps2pd VR128:$src), + (CVTPS2PDrr VR128:$src)>; +} + +// Convert Packed DW Integers to Packed Double FP +let Predicates = [HasAVX] in { +def VCVTDQ2PDrm : SSDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTDQ2PDrr : SSDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTDQ2PDYrm : SSDI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), + "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTDQ2PDYrr : SSDI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), + "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; +} + +def CVTDQ2PDrm : SSDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "cvtdq2pd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RM>; +def CVTDQ2PDrr : SSDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtdq2pd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RR>; + +// 128 bit register conversion intrinsics +let Predicates = [HasAVX] in +def : Pat<(int_x86_sse2_cvtdq2pd VR128:$src), + (VCVTDQ2PDrr VR128:$src)>; + +let Predicates = [HasSSE2] in +def : Pat<(int_x86_sse2_cvtdq2pd VR128:$src), + (CVTDQ2PDrr VR128:$src)>; + +// AVX 256-bit register conversion intrinsics +let Predicates = [HasAVX] in { + def : Pat<(int_x86_avx_cvtdq2_pd_256 VR128:$src), + (VCVTDQ2PDYrr VR128:$src)>; + def : Pat<(int_x86_avx_cvtdq2_pd_256 (bitconvert (memopv2i64 addr:$src))), + (VCVTDQ2PDYrm addr:$src)>; + + def : Pat<(int_x86_avx_cvt_pd2dq_256 VR256:$src), + (VCVTPD2DQYrr VR256:$src)>; + def : Pat<(int_x86_avx_cvt_pd2dq_256 (memopv4f64 addr:$src)), + (VCVTPD2DQYrm addr:$src)>; + + def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))), + (VCVTDQ2PDYrr VR128:$src)>; + def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))), + (VCVTDQ2PDYrm addr:$src)>; +} // Predicates = [HasAVX] // Convert packed double to packed single // The assembler can recognize rr 256-bit instructions by seeing a ymm @@ -2070,25 +2081,24 @@ def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", [], IIC_SSE_CVT_PD_RR>, VEX; -def VCVTPD2PSXrYr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), -
"cvtpd2ps\t{$src, $dst|$dst, $src}", [], - IIC_SSE_CVT_PD_RR>, VEX; // XMM only -def VCVTPD2PSXrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtpd2psx\t{$src, $dst|$dst, $src}", [], - IIC_SSE_CVT_PD_RR>, VEX; +def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}", + (VCVTPD2PSrr VR128:$dst, VR128:$src)>; def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtpd2psx\t{$src, $dst|$dst, $src}", [], IIC_SSE_CVT_PD_RM>, VEX; // YMM only def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), - "cvtpd2psy\t{$src, $dst|$dst, $src}", [], + "cvtpd2ps{y}\t{$src, $dst|$dst, $src}", [], IIC_SSE_CVT_PD_RR>, VEX; def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), - "cvtpd2psy\t{$src, $dst|$dst, $src}", [], + "cvtpd2ps{y}\t{$src, $dst|$dst, $src}", [], IIC_SSE_CVT_PD_RM>, VEX, VEX_L; +def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}", + (VCVTPD2PSYrr VR128:$dst, VR256:$src)>; + def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", [], IIC_SSE_CVT_PD_RR>; @@ -2097,64 +2107,60 @@ def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), IIC_SSE_CVT_PD_RM>; -def Int_VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtpd2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))], - IIC_SSE_CVT_PD_RR>; -def Int_VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), - (ins f128mem:$src), - "cvtpd2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtpd2ps - (memop addr:$src)))], - IIC_SSE_CVT_PD_RM>; -def Int_CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtpd2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))], - IIC_SSE_CVT_PD_RR>; -def Int_CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "cvtpd2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtpd2ps - (memop addr:$src)))], - IIC_SSE_CVT_PD_RM>; +let Predicates = [HasAVX] in { + def : Pat<(int_x86_sse2_cvtpd2ps VR128:$src), + (VCVTPD2PSrr VR128:$src)>; + def : Pat<(int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)), + (VCVTPD2PSXrm addr:$src)>; +} + +let Predicates = [HasSSE2] in { + def : Pat<(int_x86_sse2_cvtpd2ps VR128:$src), + (CVTPD2PSrr VR128:$src)>; + def : Pat<(int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)), + (CVTPD2PSrm addr:$src)>; +} // AVX 256-bit register conversion intrinsics // FIXME: Migrate SSE conversion intrinsics matching to use patterns as below // whenever possible to avoid declaring two versions of each one. 
-def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src), - (VCVTDQ2PSYrr VR256:$src)>; -def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (memopv4i64 addr:$src))), - (VCVTDQ2PSYrm addr:$src)>; - -def : Pat<(int_x86_avx_cvt_pd2_ps_256 VR256:$src), - (VCVTPD2PSYrr VR256:$src)>; -def : Pat<(int_x86_avx_cvt_pd2_ps_256 (memopv4f64 addr:$src)), - (VCVTPD2PSYrm addr:$src)>; - -def : Pat<(int_x86_avx_cvt_ps2dq_256 VR256:$src), - (VCVTPS2DQYrr VR256:$src)>; -def : Pat<(int_x86_avx_cvt_ps2dq_256 (memopv8f32 addr:$src)), - (VCVTPS2DQYrm addr:$src)>; - -def : Pat<(int_x86_avx_cvt_ps2_pd_256 VR128:$src), - (VCVTPS2PDYrr VR128:$src)>; -def : Pat<(int_x86_avx_cvt_ps2_pd_256 (memopv4f32 addr:$src)), - (VCVTPS2PDYrm addr:$src)>; - -def : Pat<(int_x86_avx_cvtt_pd2dq_256 VR256:$src), - (VCVTTPD2DQYrr VR256:$src)>; -def : Pat<(int_x86_avx_cvtt_pd2dq_256 (memopv4f64 addr:$src)), - (VCVTTPD2DQYrm addr:$src)>; - -// Match fround and fextend for 128/256-bit conversions -def : Pat<(v4f32 (fround (v4f64 VR256:$src))), - (VCVTPD2PSYrr VR256:$src)>; -def : Pat<(v4f32 (fround (loadv4f64 addr:$src))), - (VCVTPD2PSYrm addr:$src)>; - -def : Pat<(v4f64 (fextend (v4f32 VR128:$src))), - (VCVTPS2PDYrr VR128:$src)>; -def : Pat<(v4f64 (fextend (loadv4f32 addr:$src))), - (VCVTPS2PDYrm addr:$src)>; +let Predicates = [HasAVX] in { + def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src), + (VCVTDQ2PSYrr VR256:$src)>; + def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (memopv4i64 addr:$src))), + (VCVTDQ2PSYrm addr:$src)>; + + def : Pat<(int_x86_avx_cvt_pd2_ps_256 VR256:$src), + (VCVTPD2PSYrr VR256:$src)>; + def : Pat<(int_x86_avx_cvt_pd2_ps_256 (memopv4f64 addr:$src)), + (VCVTPD2PSYrm addr:$src)>; + + def : Pat<(int_x86_avx_cvt_ps2dq_256 VR256:$src), + (VCVTPS2DQYrr VR256:$src)>; + def : Pat<(int_x86_avx_cvt_ps2dq_256 (memopv8f32 addr:$src)), + (VCVTPS2DQYrm addr:$src)>; + + def : Pat<(int_x86_avx_cvt_ps2_pd_256 VR128:$src), + (VCVTPS2PDYrr VR128:$src)>; + def : Pat<(int_x86_avx_cvt_ps2_pd_256 (memopv4f32 addr:$src)), + (VCVTPS2PDYrm addr:$src)>; + + def : Pat<(int_x86_avx_cvtt_pd2dq_256 VR256:$src), + (VCVTTPD2DQYrr VR256:$src)>; + def : Pat<(int_x86_avx_cvtt_pd2dq_256 (memopv4f64 addr:$src)), + (VCVTTPD2DQYrm addr:$src)>; + + // Match fround and fextend for 128/256-bit conversions + def : Pat<(v4f32 (fround (v4f64 VR256:$src))), + (VCVTPD2PSYrr VR256:$src)>; + def : Pat<(v4f32 (fround (loadv4f64 addr:$src))), + (VCVTPD2PSYrm addr:$src)>; + + def : Pat<(v4f64 (fextend (v4f32 VR128:$src))), + (VCVTPS2PDYrr VR128:$src)>; + def : Pat<(v4f64 (fextend (loadv4f32 addr:$src))), + (VCVTPS2PDYrm addr:$src)>; +} //===----------------------------------------------------------------------===// // SSE 1 & 2 - Compare Instructions @@ -3336,13 +3342,6 @@ let AddedComplexity = 400 in { // Prefer non-temporal versions IIC_SSE_MOVNT>, VEX; } -def : Pat<(int_x86_avx_movnt_dq_256 addr:$dst, VR256:$src), - (VMOVNTDQYmr addr:$dst, VR256:$src)>; -def : Pat<(int_x86_avx_movnt_pd_256 addr:$dst, VR256:$src), - (VMOVNTPDYmr addr:$dst, VR256:$src)>; -def : Pat<(int_x86_avx_movnt_ps_256 addr:$dst, VR256:$src), - (VMOVNTPSYmr addr:$dst, VR256:$src)>; - let AddedComplexity = 400 in { // Prefer non-temporal versions def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntps\t{$src, $dst|$dst, $src}", @@ -4610,7 +4609,7 @@ def MOVPQIto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), // Bitcast FR64 <-> GR64 // let Predicates = [HasAVX] in -def VMOV64toSDrm : S3SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), +def 
VMOV64toSDrm : SSDI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>, VEX; @@ -4623,7 +4622,7 @@ def VMOVSDto64mr : VRPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), [(store (i64 (bitconvert FR64:$src)), addr:$dst)], IIC_SSE_MOVDQ>, VEX; -def MOV64toSDrm : S3SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), +def MOV64toSDrm : SSDI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), "movq\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))], IIC_SSE_MOVDQ>; @@ -4897,80 +4896,6 @@ def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, XS; //===---------------------------------------------------------------------===// -// SSE3 - Conversion Instructions -//===---------------------------------------------------------------------===// - -// Convert Packed Double FP to Packed DW Integers -let Predicates = [HasAVX] in { -// The assembler can recognize rr 256-bit instructions by seeing a ymm -// register, but the same isn't true when using memory operands instead. -// Provide other assembly rr and rm forms to address this explicitly. -def VCVTPD2DQrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "vcvtpd2dq\t{$src, $dst|$dst, $src}", []>, VEX; -def VCVTPD2DQXrYr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), - "vcvtpd2dq\t{$src, $dst|$dst, $src}", []>, VEX; - -// XMM only -def VCVTPD2DQXrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "vcvtpd2dqx\t{$src, $dst|$dst, $src}", []>, VEX; -def VCVTPD2DQXrm : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "vcvtpd2dqx\t{$src, $dst|$dst, $src}", []>, VEX; - -// YMM only -def VCVTPD2DQYrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), - "vcvtpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX; -def VCVTPD2DQYrm : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), - "vcvtpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L; -} - -def CVTPD2DQrm : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "cvtpd2dq\t{$src, $dst|$dst, $src}", [], - IIC_SSE_CVT_PD_RM>; -def CVTPD2DQrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtpd2dq\t{$src, $dst|$dst, $src}", [], - IIC_SSE_CVT_PD_RR>; - -def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))), - (VCVTTPD2DQYrr VR256:$src)>; -def : Pat<(v4i32 (fp_to_sint (memopv4f64 addr:$src))), - (VCVTTPD2DQYrm addr:$src)>; - -// Convert Packed DW Integers to Packed Double FP -let Predicates = [HasAVX] in { -def VCVTDQ2PDrm : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; -def VCVTDQ2PDrr : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; -def VCVTDQ2PDYrm : S3SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), - "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; -def VCVTDQ2PDYrr : S3SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), - "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; -} - -def CVTDQ2PDrm : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "cvtdq2pd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_CVT_PD_RR>; -def CVTDQ2PDrr : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtdq2pd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_CVT_PD_RM>; - -// AVX 256-bit register conversion intrinsics -def : Pat<(int_x86_avx_cvtdq2_pd_256 VR128:$src), - (VCVTDQ2PDYrr 
VR128:$src)>; -def : Pat<(int_x86_avx_cvtdq2_pd_256 (bitconvert (memopv2i64 addr:$src))), - (VCVTDQ2PDYrm addr:$src)>; - -def : Pat<(int_x86_avx_cvt_pd2dq_256 VR256:$src), - (VCVTPD2DQYrr VR256:$src)>; -def : Pat<(int_x86_avx_cvt_pd2dq_256 (memopv4f64 addr:$src)), - (VCVTPD2DQYrm addr:$src)>; - -def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))), - (VCVTDQ2PDYrr VR128:$src)>; -def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))), - (VCVTDQ2PDYrm addr:$src)>; - -//===---------------------------------------------------------------------===// // SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP //===---------------------------------------------------------------------===// multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr, @@ -5730,14 +5655,26 @@ let Predicates = [HasSSE41] in { (PMOVZXDQrm addr:$src)>; } +let Predicates = [HasAVX2] in { + let AddedComplexity = 15 in { + def : Pat<(v4i64 (X86vzmovly (v4i32 VR128:$src))), + (VPMOVZXDQYrr VR128:$src)>; + def : Pat<(v8i32 (X86vzmovly (v8i16 VR128:$src))), + (VPMOVZXWDYrr VR128:$src)>; + } + + def : Pat<(v4i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>; + def : Pat<(v8i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>; +} + let Predicates = [HasAVX] in { -def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>; -def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>; + def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>; + def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>; } let Predicates = [HasSSE41] in { -def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>; -def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>; + def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>; + def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>; } @@ -6608,15 +6545,15 @@ let Predicates = [HasAVX] in { let isCommutable = 0 in { let ExeDomain = SSEPackedSingle in { defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps, - VR128, memopv4f32, i128mem, 0>, VEX_4V; + VR128, memopv4f32, f128mem, 0>, VEX_4V; defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps", - int_x86_avx_blend_ps_256, VR256, memopv8f32, i256mem, 0>, VEX_4V; + int_x86_avx_blend_ps_256, VR256, memopv8f32, f256mem, 0>, VEX_4V; } let ExeDomain = SSEPackedDouble in { defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd, - VR128, memopv2f64, i128mem, 0>, VEX_4V; + VR128, memopv2f64, f128mem, 0>, VEX_4V; defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd", - int_x86_avx_blend_pd_256, VR256, memopv4f64, i256mem, 0>, VEX_4V; + int_x86_avx_blend_pd_256, VR256, memopv4f64, f256mem, 0>, VEX_4V; } defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw, VR128, memopv2i64, i128mem, 0>, VEX_4V; @@ -6625,10 +6562,10 @@ let Predicates = [HasAVX] in { } let ExeDomain = SSEPackedSingle in defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, - VR128, memopv4f32, i128mem, 0>, VEX_4V; + VR128, memopv4f32, f128mem, 0>, VEX_4V; let ExeDomain = SSEPackedDouble in defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd, - VR128, memopv2f64, i128mem, 0>, VEX_4V; + VR128, memopv2f64, f128mem, 0>, VEX_4V; let ExeDomain = SSEPackedSingle in defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256, VR256, memopv8f32, i256mem, 0>, VEX_4V; @@ -6647,10 +6584,10 @@ let Constraints = "$src1 = 
$dst" in { let isCommutable = 0 in { let ExeDomain = SSEPackedSingle in defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps, - VR128, memopv4f32, i128mem>; + VR128, memopv4f32, f128mem>; let ExeDomain = SSEPackedDouble in defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd, - VR128, memopv2f64, i128mem>; + VR128, memopv2f64, f128mem>; defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw, VR128, memopv2i64, i128mem>; defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, @@ -6658,10 +6595,10 @@ let Constraints = "$src1 = $dst" in { } let ExeDomain = SSEPackedSingle in defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, - VR128, memopv4f32, i128mem>; + VR128, memopv4f32, f128mem>; let ExeDomain = SSEPackedDouble in defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd, - VR128, memopv2f64, i128mem>; + VR128, memopv2f64, f128mem>; } /// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators @@ -6687,15 +6624,15 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, let Predicates = [HasAVX] in { let ExeDomain = SSEPackedDouble in { -defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, i128mem, +defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem, memopv2f64, int_x86_sse41_blendvpd>; -defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, i256mem, +defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem, memopv4f64, int_x86_avx_blendv_pd_256>; } // ExeDomain = SSEPackedDouble let ExeDomain = SSEPackedSingle in { -defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, i128mem, +defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem, memopv4f32, int_x86_sse41_blendvps>; -defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, i256mem, +defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem, memopv8f32, int_x86_avx_blendv_ps_256>; } // ExeDomain = SSEPackedSingle defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem, @@ -6766,7 +6703,7 @@ let Predicates = [HasAVX2] in { /// SS41I_ternary_int - SSE 4.1 ternary operator let Uses = [XMM0], Constraints = "$src1 = $dst" in { multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag, - Intrinsic IntId> { + X86MemOperand x86memop, Intrinsic IntId> { def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, @@ -6775,7 +6712,7 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in { OpSize; def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2), + (ins VR128:$src1, x86memop:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), [(set VR128:$dst, @@ -6785,14 +6722,28 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in { } let ExeDomain = SSEPackedDouble in -defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, +defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem, int_x86_sse41_blendvpd>; let ExeDomain = SSEPackedSingle in -defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, +defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem, int_x86_sse41_blendvps>; -defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, +defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem, int_x86_sse41_pblendvb>; +// Aliases with the implicit xmm0 argument +def : InstAlias<"blendvpd\t{%xmm0, $src2, 
$dst|$dst, $src2, %xmm0}", + (BLENDVPDrr0 VR128:$dst, VR128:$src2)>; +def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", + (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>; +def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", + (BLENDVPSrr0 VR128:$dst, VR128:$src2)>; +def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", + (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>; +def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", + (PBLENDVBrr0 VR128:$dst, VR128:$src2)>; +def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", + (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>; + let Predicates = [HasSSE41] in { def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1), (v16i8 VR128:$src2))), @@ -7204,52 +7155,50 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), OpSize; //===----------------------------------------------------------------------===// -// CLMUL Instructions +// PCLMUL Instructions //===----------------------------------------------------------------------===// -// Carry-less Multiplication instructions -let neverHasSideEffects = 1 in { // AVX carry-less Multiplication instructions -def VPCLMULQDQrr : AVXCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), +def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i8imm:$src3), "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>; + [(set VR128:$dst, + (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>; -let mayLoad = 1 in -def VPCLMULQDQrm : AVXCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), +def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, i8imm:$src3), "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>; + [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1, + (memopv2i64 addr:$src2), imm:$src3))]>; +// Carry-less Multiplication instructions let Constraints = "$src1 = $dst" in { -def PCLMULQDQrr : CLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), +def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i8imm:$src3), "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", - []>; + [(set VR128:$dst, + (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>; -let mayLoad = 1 in -def PCLMULQDQrm : CLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), +def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, i8imm:$src3), "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", - []>; + [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1, + (memopv2i64 addr:$src2), imm:$src3))]>; } // Constraints = "$src1 = $dst" -} // neverHasSideEffects = 1 multiclass pclmul_alias<string asm, int immop> { - def : InstAlias<!strconcat("pclmul", asm, - "dq {$src, $dst|$dst, $src}"), + def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"), (PCLMULQDQrr VR128:$dst, VR128:$src, immop)>; - def : InstAlias<!strconcat("pclmul", asm, - "dq {$src, $dst|$dst, $src}"), + def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"), (PCLMULQDQrm VR128:$dst, i128mem:$src, immop)>; - def : InstAlias<!strconcat("vpclmul", asm, + def : InstAlias<!strconcat("vpclmul", asm, "dq {$src2, $src1, $dst|$dst, $src1, $src2}"), (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop)>; - def : InstAlias<!strconcat("vpclmul", asm, + def : InstAlias<!strconcat("vpclmul", asm, "dq {$src2, $src1, $dst|$dst, $src1, $src2}"), (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, 
immop)>; } @@ -7259,6 +7208,45 @@ defm : pclmul_alias<"lqhq", 0x10>; defm : pclmul_alias<"lqlq", 0x00>; //===----------------------------------------------------------------------===// +// SSE4A Instructions +//===----------------------------------------------------------------------===// + +let Predicates = [HasSSE4A] in { + +let Constraints = "$src = $dst" in { +def EXTRQI : Ii8<0x78, MRM0r, (outs VR128:$dst), + (ins VR128:$src, i8imm:$len, i8imm:$idx), + "extrq\t{$idx, $len, $src|$src, $len, $idx}", + [(set VR128:$dst, (int_x86_sse4a_extrqi VR128:$src, imm:$len, + imm:$idx))]>, TB, OpSize; +def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src, VR128:$mask), + "extrq\t{$mask, $src|$src, $mask}", + [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src, + VR128:$mask))]>, TB, OpSize; + +def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src, VR128:$src2, i8imm:$len, i8imm:$idx), + "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}", + [(set VR128:$dst, (int_x86_sse4a_insertqi VR128:$src, + VR128:$src2, imm:$len, imm:$idx))]>, XD; +def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src, VR128:$mask), + "insertq\t{$mask, $src|$src, $mask}", + [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src, + VR128:$mask))]>, XD; +} + +def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src), + "movntss\t{$src, $dst|$dst, $src}", + [(int_x86_sse4a_movnt_ss addr:$dst, VR128:$src)]>, XS; + +def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movntsd\t{$src, $dst|$dst, $src}", + [(int_x86_sse4a_movnt_sd addr:$dst, VR128:$src)]>, XD; +} + +//===----------------------------------------------------------------------===// // AVX Instructions //===----------------------------------------------------------------------===// @@ -7286,7 +7274,7 @@ let ExeDomain = SSEPackedSingle in { int_x86_avx_vbroadcast_ss_256>; } let ExeDomain = SSEPackedDouble in -def VBROADCASTSDrm : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem, +def VBROADCASTSDYrm : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem, int_x86_avx_vbroadcast_sd_256>; def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem, int_x86_avx_vbroadcastf128_pd_256>; @@ -7298,8 +7286,8 @@ let ExeDomain = SSEPackedSingle in { int_x86_avx2_vbroadcast_ss_ps_256>; } let ExeDomain = SSEPackedDouble in -def VBROADCASTSDrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256, - int_x86_avx2_vbroadcast_sd_pd_256>; +def VBROADCASTSDYrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256, + int_x86_avx2_vbroadcast_sd_pd_256>; let Predicates = [HasAVX2] in def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem, @@ -7595,7 +7583,6 @@ let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, // Half precision conversion instructions //===----------------------------------------------------------------------===// multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { -let Predicates = [HasAVX, HasF16C] in { def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), "vcvtph2ps\t{$src, $dst|$dst, $src}", [(set RC:$dst, (Int VR128:$src))]>, @@ -7604,27 +7591,26 @@ let Predicates = [HasAVX, HasF16C] in { def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX; } -} multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { -let Predicates = [HasAVX, HasF16C] in { def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst), (ins RC:$src1, i32i8imm:$src2), 
"vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>, TA, OpSize, VEX; - let neverHasSideEffects = 1, mayLoad = 1 in - def mr : Ii8<0x1D, MRMDestMem, (outs x86memop:$dst), - (ins RC:$src1, i32i8imm:$src2), + let neverHasSideEffects = 1, mayStore = 1 in + def mr : Ii8<0x1D, MRMDestMem, (outs), + (ins x86memop:$dst, RC:$src1, i32i8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, TA, OpSize, VEX; } -} -defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>; -defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>; -defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>; -defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>; +let Predicates = [HasAVX, HasF16C] in { + defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>; + defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>; + defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>; + defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>; +} //===----------------------------------------------------------------------===// // AVX2 Instructions @@ -7711,6 +7697,55 @@ let Predicates = [HasAVX2] in { (VPBROADCASTQrm addr:$src)>; def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), (VPBROADCASTQYrm addr:$src)>; + + def : Pat<(v16i8 (X86VBroadcast (v16i8 VR128:$src))), + (VPBROADCASTBrr VR128:$src)>; + def : Pat<(v32i8 (X86VBroadcast (v16i8 VR128:$src))), + (VPBROADCASTBYrr VR128:$src)>; + def : Pat<(v8i16 (X86VBroadcast (v8i16 VR128:$src))), + (VPBROADCASTWrr VR128:$src)>; + def : Pat<(v16i16 (X86VBroadcast (v8i16 VR128:$src))), + (VPBROADCASTWYrr VR128:$src)>; + def : Pat<(v4i32 (X86VBroadcast (v4i32 VR128:$src))), + (VPBROADCASTDrr VR128:$src)>; + def : Pat<(v8i32 (X86VBroadcast (v4i32 VR128:$src))), + (VPBROADCASTDYrr VR128:$src)>; + def : Pat<(v2i64 (X86VBroadcast (v2i64 VR128:$src))), + (VPBROADCASTQrr VR128:$src)>; + def : Pat<(v4i64 (X86VBroadcast (v2i64 VR128:$src))), + (VPBROADCASTQYrr VR128:$src)>; + def : Pat<(v4f32 (X86VBroadcast (v4f32 VR128:$src))), + (VBROADCASTSSrr VR128:$src)>; + def : Pat<(v8f32 (X86VBroadcast (v4f32 VR128:$src))), + (VBROADCASTSSYrr VR128:$src)>; + def : Pat<(v2f64 (X86VBroadcast (v2f64 VR128:$src))), + (VPBROADCASTQrr VR128:$src)>; + def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))), + (VBROADCASTSDYrr VR128:$src)>; + + // Provide fallback in case the load node that is used in the patterns above + // is used by additional users, which prevents the pattern selection. 
+ let AddedComplexity = 20 in { + def : Pat<(v4f32 (X86VBroadcast FR32:$src)), + (VBROADCASTSSrr + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss))>; + def : Pat<(v8f32 (X86VBroadcast FR32:$src)), + (VBROADCASTSSYrr + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss))>; + def : Pat<(v4f64 (X86VBroadcast FR64:$src)), + (VBROADCASTSDYrr + (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd))>; + + def : Pat<(v4i32 (X86VBroadcast GR32:$src)), + (VBROADCASTSSrr + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), GR32:$src, sub_ss))>; + def : Pat<(v8i32 (X86VBroadcast GR32:$src)), + (VBROADCASTSSYrr + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), GR32:$src, sub_ss))>; + def : Pat<(v4i64 (X86VBroadcast GR64:$src)), + (VBROADCASTSDYrr + (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), GR64:$src, sub_sd))>; + } } // AVX1 broadcast patterns @@ -7718,16 +7753,62 @@ let Predicates = [HasAVX] in { def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), (VBROADCASTSSYrm addr:$src)>; def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), - (VBROADCASTSDrm addr:$src)>; + (VBROADCASTSDYrm addr:$src)>; def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))), (VBROADCASTSSYrm addr:$src)>; def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))), - (VBROADCASTSDrm addr:$src)>; - + (VBROADCASTSDYrm addr:$src)>; def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))), (VBROADCASTSSrm addr:$src)>; def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), (VBROADCASTSSrm addr:$src)>; + + // Provide fallback in case the load node that is used in the patterns above + // is used by additional users, which prevents the pattern selection. + let AddedComplexity = 20 in { + // 128bit broadcasts: + def : Pat<(v4f32 (X86VBroadcast FR32:$src)), + (VPSHUFDri + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss), 0)>; + def : Pat<(v8f32 (X86VBroadcast FR32:$src)), + (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), + (VPSHUFDri + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss), 0), + sub_xmm), + (VPSHUFDri + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss), + 0), 1)>; + def : Pat<(v4f64 (X86VBroadcast FR64:$src)), + (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), + (VPSHUFDri + (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd), + 0x44), + sub_xmm), + (VPSHUFDri + (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd), + 0x44), 1)>; + + def : Pat<(v4i32 (X86VBroadcast GR32:$src)), + (VPSHUFDri + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), GR32:$src, sub_ss), 0)>; + def : Pat<(v8i32 (X86VBroadcast GR32:$src)), + (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + (VPSHUFDri + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), GR32:$src, sub_ss), 0), + sub_xmm), + (VPSHUFDri + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), GR32:$src, sub_ss), + 0), 1)>; + def : Pat<(v4i64 (X86VBroadcast GR64:$src)), + (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), + (VPSHUFDri + (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), GR64:$src, sub_sd), + 0x44), + sub_xmm), + (VPSHUFDri + (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), GR64:$src, sub_sd), + 0x44), 1)>; + } } //===----------------------------------------------------------------------===// @@ -7820,8 +7901,8 @@ let neverHasSideEffects = 1 in { def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR128:$src2, i8imm:$src3), "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, - VEX_4V; + []>, VEX_4V; +let mayLoad = 1 in def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, i128mem:$src2, i8imm:$src3), 
"vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", @@ -7954,3 +8035,30 @@ defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W; defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>; defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W; defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>; + +//===----------------------------------------------------------------------===// +// VGATHER - GATHER Operations +multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256, + X86MemOperand memop128, X86MemOperand memop256> { + def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst, VR128:$mask_wb), + (ins VR128:$src1, memop128:$src2, VR128:$mask), + !strconcat(OpcodeStr, + "\t{$mask, $src2, $dst|$dst, $src2, $mask}"), + []>, VEX_4VOp3; + def Yrm : AVX28I<opc, MRMSrcMem, (outs RC256:$dst, RC256:$mask_wb), + (ins RC256:$src1, memop256:$src2, RC256:$mask), + !strconcat(OpcodeStr, + "\t{$mask, $src2, $dst|$dst, $src2, $mask}"), + []>, VEX_4VOp3, VEX_L; +} + +let Constraints = "$src1 = $dst, $mask = $mask_wb" in { + defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W; + defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W; + defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>; + defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>; + defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx64mem, vx64mem>, VEX_W; + defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx64mem, vy64mem>, VEX_W; + defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx32mem, vy32mem>; + defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx32mem, vy32mem>; +} diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td index bddba6c..ea716bf 100644 --- a/lib/Target/X86/X86InstrSystem.td +++ b/lib/Target/X86/X86InstrSystem.td @@ -14,7 +14,8 @@ //===----------------------------------------------------------------------===// let Defs = [RAX, RDX] in - def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)]>, TB; + def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)], IIC_RDTSC>, + TB; let Defs = [RAX, RCX, RDX] in def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", []>, TB; @@ -26,14 +27,17 @@ let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in { def UD2B : I<0xB9, RawFrm, (outs), (ins), "ud2b", []>, TB; } -def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", []>; -def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", []>, TB; +def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", [], IIC_HLT>; +def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", [], IIC_RSM>, TB; // Interrupt and SysCall Instructions. let Uses = [EFLAGS] in def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>; def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", - [(int_x86_int (i8 3))]>; + [(int_x86_int (i8 3))], IIC_INT3>; + +def : Pat<(debugtrap), + (INT3)>; // The long form of "int $3" turns into int3 as a size optimization. // FIXME: This doesn't work because InstAlias can't match immediate constants. 
@@ -41,23 +45,25 @@ def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", def INT : Ii8<0xcd, RawFrm, (outs), (ins i8imm:$trap), "int\t$trap", - [(int_x86_int imm:$trap)]>; + [(int_x86_int imm:$trap)], IIC_INT>; -def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", []>, TB; -def SYSRET : I<0x07, RawFrm, (outs), (ins), "sysret{l}", []>, TB; -def SYSRET64 :RI<0x07, RawFrm, (outs), (ins), "sysret{q}", []>, TB, +def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", [], IIC_SYSCALL>, TB; +def SYSRET : I<0x07, RawFrm, (outs), (ins), "sysret{l}", [], IIC_SYSCALL>, TB; +def SYSRET64 :RI<0x07, RawFrm, (outs), (ins), "sysret{q}", [], IIC_SYSCALL>, TB, Requires<[In64BitMode]>; -def SYSENTER : I<0x34, RawFrm, (outs), (ins), "sysenter", []>, TB; - -def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", []>, TB; +def SYSENTER : I<0x34, RawFrm, (outs), (ins), "sysenter", [], + IIC_SYS_ENTER_EXIT>, TB; + +def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", [], + IIC_SYS_ENTER_EXIT>, TB; def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexit{q}", []>, TB, Requires<[In64BitMode]>; -def IRET16 : I<0xcf, RawFrm, (outs), (ins), "iret{w}", []>, OpSize; -def IRET32 : I<0xcf, RawFrm, (outs), (ins), "iret{l|d}", []>; -def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", []>, +def IRET16 : I<0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>, OpSize; +def IRET32 : I<0xcf, RawFrm, (outs), (ins), "iret{l|d}", [], IIC_IRET>; +def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", [], IIC_IRET>, Requires<[In64BitMode]>; @@ -66,73 +72,73 @@ def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", []>, // let Defs = [AL], Uses = [DX] in def IN8rr : I<0xEC, RawFrm, (outs), (ins), - "in{b}\t{%dx, %al|AL, DX}", []>; + "in{b}\t{%dx, %al|AL, DX}", [], IIC_IN_RR>; let Defs = [AX], Uses = [DX] in def IN16rr : I<0xED, RawFrm, (outs), (ins), - "in{w}\t{%dx, %ax|AX, DX}", []>, OpSize; + "in{w}\t{%dx, %ax|AX, DX}", [], IIC_IN_RR>, OpSize; let Defs = [EAX], Uses = [DX] in def IN32rr : I<0xED, RawFrm, (outs), (ins), - "in{l}\t{%dx, %eax|EAX, DX}", []>; + "in{l}\t{%dx, %eax|EAX, DX}", [], IIC_IN_RR>; let Defs = [AL] in def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins i8imm:$port), - "in{b}\t{$port, %al|AL, $port}", []>; + "in{b}\t{$port, %al|AL, $port}", [], IIC_IN_RI>; let Defs = [AX] in def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port), - "in{w}\t{$port, %ax|AX, $port}", []>, OpSize; + "in{w}\t{$port, %ax|AX, $port}", [], IIC_IN_RI>, OpSize; let Defs = [EAX] in def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port), - "in{l}\t{$port, %eax|EAX, $port}", []>; + "in{l}\t{$port, %eax|EAX, $port}", [], IIC_IN_RI>; let Uses = [DX, AL] in def OUT8rr : I<0xEE, RawFrm, (outs), (ins), - "out{b}\t{%al, %dx|DX, AL}", []>; + "out{b}\t{%al, %dx|DX, AL}", [], IIC_OUT_RR>; let Uses = [DX, AX] in def OUT16rr : I<0xEF, RawFrm, (outs), (ins), - "out{w}\t{%ax, %dx|DX, AX}", []>, OpSize; + "out{w}\t{%ax, %dx|DX, AX}", [], IIC_OUT_RR>, OpSize; let Uses = [DX, EAX] in def OUT32rr : I<0xEF, RawFrm, (outs), (ins), - "out{l}\t{%eax, %dx|DX, EAX}", []>; + "out{l}\t{%eax, %dx|DX, EAX}", [], IIC_OUT_RR>; let Uses = [AL] in def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins i8imm:$port), - "out{b}\t{%al, $port|$port, AL}", []>; + "out{b}\t{%al, $port|$port, AL}", [], IIC_OUT_IR>; let Uses = [AX] in def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port), - "out{w}\t{%ax, $port|$port, AX}", []>, OpSize; + "out{w}\t{%ax, $port|$port, AX}", [], IIC_OUT_IR>, OpSize; let Uses = [EAX] in def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port), - 
"out{l}\t{%eax, $port|$port, EAX}", []>; + "out{l}\t{%eax, $port|$port, EAX}", [], IIC_OUT_IR>; -def IN8 : I<0x6C, RawFrm, (outs), (ins), "ins{b}", []>; -def IN16 : I<0x6D, RawFrm, (outs), (ins), "ins{w}", []>, OpSize; -def IN32 : I<0x6D, RawFrm, (outs), (ins), "ins{l}", []>; +def IN8 : I<0x6C, RawFrm, (outs), (ins), "ins{b}", [], IIC_INS>; +def IN16 : I<0x6D, RawFrm, (outs), (ins), "ins{w}", [], IIC_INS>, OpSize; +def IN32 : I<0x6D, RawFrm, (outs), (ins), "ins{l}", [], IIC_INS>; //===----------------------------------------------------------------------===// // Moves to and from debug registers def MOV32rd : I<0x21, MRMDestReg, (outs GR32:$dst), (ins DEBUG_REG:$src), - "mov{l}\t{$src, $dst|$dst, $src}", []>, TB; + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB; def MOV64rd : I<0x21, MRMDestReg, (outs GR64:$dst), (ins DEBUG_REG:$src), - "mov{q}\t{$src, $dst|$dst, $src}", []>, TB; + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB; def MOV32dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR32:$src), - "mov{l}\t{$src, $dst|$dst, $src}", []>, TB; + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB; def MOV64dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR64:$src), - "mov{q}\t{$src, $dst|$dst, $src}", []>, TB; + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB; //===----------------------------------------------------------------------===// // Moves to and from control registers def MOV32rc : I<0x20, MRMDestReg, (outs GR32:$dst), (ins CONTROL_REG:$src), - "mov{l}\t{$src, $dst|$dst, $src}", []>, TB; + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB; def MOV64rc : I<0x20, MRMDestReg, (outs GR64:$dst), (ins CONTROL_REG:$src), - "mov{q}\t{$src, $dst|$dst, $src}", []>, TB; + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB; def MOV32cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR32:$src), - "mov{l}\t{$src, $dst|$dst, $src}", []>, TB; + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB; def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR64:$src), - "mov{q}\t{$src, $dst|$dst, $src}", []>, TB; + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB; //===----------------------------------------------------------------------===// // Segment override instruction prefixes @@ -150,254 +156,265 @@ def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", []>; // def MOV16rs : I<0x8C, MRMDestReg, (outs GR16:$dst), (ins SEGMENT_REG:$src), - "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize; + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize; def MOV32rs : I<0x8C, MRMDestReg, (outs GR32:$dst), (ins SEGMENT_REG:$src), - "mov{l}\t{$src, $dst|$dst, $src}", []>; + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>; def MOV64rs : RI<0x8C, MRMDestReg, (outs GR64:$dst), (ins SEGMENT_REG:$src), - "mov{q}\t{$src, $dst|$dst, $src}", []>; + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>; def MOV16ms : I<0x8C, MRMDestMem, (outs i16mem:$dst), (ins SEGMENT_REG:$src), - "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize; + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize; def MOV32ms : I<0x8C, MRMDestMem, (outs i32mem:$dst), (ins SEGMENT_REG:$src), - "mov{l}\t{$src, $dst|$dst, $src}", []>; + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>; def MOV64ms : RI<0x8C, MRMDestMem, (outs i64mem:$dst), (ins SEGMENT_REG:$src), - "mov{q}\t{$src, $dst|$dst, $src}", []>; + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>; def MOV16sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins 
GR16:$src), - "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize; + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize; def MOV32sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR32:$src), - "mov{l}\t{$src, $dst|$dst, $src}", []>; + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>; def MOV64sr : RI<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR64:$src), - "mov{q}\t{$src, $dst|$dst, $src}", []>; + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>; def MOV16sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i16mem:$src), - "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize; + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize; def MOV32sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i32mem:$src), - "mov{l}\t{$src, $dst|$dst, $src}", []>; + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>; def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src), - "mov{q}\t{$src, $dst|$dst, $src}", []>; + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>; //===----------------------------------------------------------------------===// // Segmentation support instructions. -def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", []>, TB; +def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", [], IIC_SWAPGS>, TB; def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), - "lar{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; + "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB, OpSize; def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), - "lar{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; + "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB, OpSize; // i16mem operand in LAR32rm and GR32 operand in LAR32rr is not a typo. def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), - "lar{l}\t{$src, $dst|$dst, $src}", []>, TB; + "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB; def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), - "lar{l}\t{$src, $dst|$dst, $src}", []>, TB; + "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB; // i16mem operand in LAR64rm and GR32 operand in LAR64rr is not a typo. 
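// Concretely (illustrative AT&T-syntax assembly, not part of this patch):
//
//   lar (%rdi), %eax    # memory source is always a 16-bit selector -> i16mem
//   lar %ecx, %rax      # 64-bit form still reads only a selector -> GR32 src
//
// lar consumes a 16-bit segment selector regardless of the destination
// width, which is what the operand comments above are pointing out.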
def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), - "lar{q}\t{$src, $dst|$dst, $src}", []>, TB; + "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB; def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src), - "lar{q}\t{$src, $dst|$dst, $src}", []>, TB; + "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB; def LSL16rm : I<0x03, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), - "lsl{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; + "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB, OpSize; def LSL16rr : I<0x03, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), - "lsl{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; + "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB, OpSize; def LSL32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), - "lsl{l}\t{$src, $dst|$dst, $src}", []>, TB; + "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB; def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), - "lsl{l}\t{$src, $dst|$dst, $src}", []>, TB; + "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB; def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), - "lsl{q}\t{$src, $dst|$dst, $src}", []>, TB; + "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB; def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), - "lsl{q}\t{$src, $dst|$dst, $src}", []>, TB; + "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB; -def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr", []>, TB; +def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr", + [], IIC_INVLPG>, TB; def STR16r : I<0x00, MRM1r, (outs GR16:$dst), (ins), - "str{w}\t$dst", []>, TB, OpSize; + "str{w}\t$dst", [], IIC_STR>, TB, OpSize; def STR32r : I<0x00, MRM1r, (outs GR32:$dst), (ins), - "str{l}\t$dst", []>, TB; + "str{l}\t$dst", [], IIC_STR>, TB; def STR64r : RI<0x00, MRM1r, (outs GR64:$dst), (ins), - "str{q}\t$dst", []>, TB; + "str{q}\t$dst", [], IIC_STR>, TB; def STRm : I<0x00, MRM1m, (outs i16mem:$dst), (ins), - "str{w}\t$dst", []>, TB; + "str{w}\t$dst", [], IIC_STR>, TB; def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src), - "ltr{w}\t$src", []>, TB; + "ltr{w}\t$src", [], IIC_LTR>, TB; def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src), - "ltr{w}\t$src", []>, TB; + "ltr{w}\t$src", [], IIC_LTR>, TB; def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins), - "push{w}\t{%cs|CS}", []>, Requires<[In32BitMode]>, OpSize; + "push{w}\t{%cs|CS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, + OpSize; def PUSHCS32 : I<0x0E, RawFrm, (outs), (ins), - "push{l}\t{%cs|CS}", []>, Requires<[In32BitMode]>; + "push{l}\t{%cs|CS}", [], IIC_PUSH_CS>, Requires<[In32BitMode]>; def PUSHSS16 : I<0x16, RawFrm, (outs), (ins), - "push{w}\t{%ss|SS}", []>, Requires<[In32BitMode]>, OpSize; + "push{w}\t{%ss|SS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, + OpSize; def PUSHSS32 : I<0x16, RawFrm, (outs), (ins), - "push{l}\t{%ss|SS}", []>, Requires<[In32BitMode]>; + "push{l}\t{%ss|SS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>; def PUSHDS16 : I<0x1E, RawFrm, (outs), (ins), - "push{w}\t{%ds|DS}", []>, Requires<[In32BitMode]>, OpSize; + "push{w}\t{%ds|DS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, + OpSize; def PUSHDS32 : I<0x1E, RawFrm, (outs), (ins), - "push{l}\t{%ds|DS}", []>, Requires<[In32BitMode]>; + "push{l}\t{%ds|DS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>; def PUSHES16 : I<0x06, RawFrm, (outs), (ins), - "push{w}\t{%es|ES}", []>, Requires<[In32BitMode]>, OpSize; + "push{w}\t{%es|ES}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, + OpSize; def PUSHES32 : 
I<0x06, RawFrm, (outs), (ins), - "push{l}\t{%es|ES}", []>, Requires<[In32BitMode]>; + "push{l}\t{%es|ES}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>; def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins), - "push{w}\t{%fs|FS}", []>, OpSize, TB; + "push{w}\t{%fs|FS}", [], IIC_PUSH_SR>, OpSize, TB; def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins), - "push{l}\t{%fs|FS}", []>, TB, Requires<[In32BitMode]>; + "push{l}\t{%fs|FS}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>; def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins), - "push{w}\t{%gs|GS}", []>, OpSize, TB; + "push{w}\t{%gs|GS}", [], IIC_PUSH_SR>, OpSize, TB; def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins), - "push{l}\t{%gs|GS}", []>, TB, Requires<[In32BitMode]>; + "push{l}\t{%gs|GS}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>; def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins), - "push{q}\t{%fs|FS}", []>, TB; + "push{q}\t{%fs|FS}", [], IIC_PUSH_SR>, TB; def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins), - "push{q}\t{%gs|GS}", []>, TB; + "push{q}\t{%gs|GS}", [], IIC_PUSH_SR>, TB; // No "pop cs" instruction. def POPSS16 : I<0x17, RawFrm, (outs), (ins), - "pop{w}\t{%ss|SS}", []>, OpSize, Requires<[In32BitMode]>; + "pop{w}\t{%ss|SS}", [], IIC_POP_SR_SS>, + OpSize, Requires<[In32BitMode]>; def POPSS32 : I<0x17, RawFrm, (outs), (ins), - "pop{l}\t{%ss|SS}", []> , Requires<[In32BitMode]>; + "pop{l}\t{%ss|SS}", [], IIC_POP_SR_SS>, + Requires<[In32BitMode]>; def POPDS16 : I<0x1F, RawFrm, (outs), (ins), - "pop{w}\t{%ds|DS}", []>, OpSize, Requires<[In32BitMode]>; + "pop{w}\t{%ds|DS}", [], IIC_POP_SR>, + OpSize, Requires<[In32BitMode]>; def POPDS32 : I<0x1F, RawFrm, (outs), (ins), - "pop{l}\t{%ds|DS}", []> , Requires<[In32BitMode]>; + "pop{l}\t{%ds|DS}", [], IIC_POP_SR>, + Requires<[In32BitMode]>; def POPES16 : I<0x07, RawFrm, (outs), (ins), - "pop{w}\t{%es|ES}", []>, OpSize, Requires<[In32BitMode]>; + "pop{w}\t{%es|ES}", [], IIC_POP_SR>, + OpSize, Requires<[In32BitMode]>; def POPES32 : I<0x07, RawFrm, (outs), (ins), - "pop{l}\t{%es|ES}", []> , Requires<[In32BitMode]>; + "pop{l}\t{%es|ES}", [], IIC_POP_SR>, + Requires<[In32BitMode]>; def POPFS16 : I<0xa1, RawFrm, (outs), (ins), - "pop{w}\t{%fs|FS}", []>, OpSize, TB; + "pop{w}\t{%fs|FS}", [], IIC_POP_SR>, OpSize, TB; def POPFS32 : I<0xa1, RawFrm, (outs), (ins), - "pop{l}\t{%fs|FS}", []>, TB , Requires<[In32BitMode]>; + "pop{l}\t{%fs|FS}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>; def POPFS64 : I<0xa1, RawFrm, (outs), (ins), - "pop{q}\t{%fs|FS}", []>, TB; + "pop{q}\t{%fs|FS}", [], IIC_POP_SR>, TB; def POPGS16 : I<0xa9, RawFrm, (outs), (ins), - "pop{w}\t{%gs|GS}", []>, OpSize, TB; + "pop{w}\t{%gs|GS}", [], IIC_POP_SR>, OpSize, TB; def POPGS32 : I<0xa9, RawFrm, (outs), (ins), - "pop{l}\t{%gs|GS}", []>, TB , Requires<[In32BitMode]>; + "pop{l}\t{%gs|GS}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>; def POPGS64 : I<0xa9, RawFrm, (outs), (ins), - "pop{q}\t{%gs|GS}", []>, TB; + "pop{q}\t{%gs|GS}", [], IIC_POP_SR>, TB; def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), - "lds{w}\t{$src, $dst|$dst, $src}", []>, OpSize; + "lds{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize; def LDS32rm : I<0xc5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), - "lds{l}\t{$src, $dst|$dst, $src}", []>; + "lds{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>; def LSS16rm : I<0xb2, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), - "lss{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; + "lss{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize; def LSS32rm : I<0xb2, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), - 
"lss{l}\t{$src, $dst|$dst, $src}", []>, TB; + "lss{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB; def LSS64rm : RI<0xb2, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src), - "lss{q}\t{$src, $dst|$dst, $src}", []>, TB; + "lss{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB; def LES16rm : I<0xc4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), - "les{w}\t{$src, $dst|$dst, $src}", []>, OpSize; + "les{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize; def LES32rm : I<0xc4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), - "les{l}\t{$src, $dst|$dst, $src}", []>; + "les{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>; def LFS16rm : I<0xb4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), - "lfs{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; + "lfs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize; def LFS32rm : I<0xb4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), - "lfs{l}\t{$src, $dst|$dst, $src}", []>, TB; + "lfs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB; def LFS64rm : RI<0xb4, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src), - "lfs{q}\t{$src, $dst|$dst, $src}", []>, TB; + "lfs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB; def LGS16rm : I<0xb5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), - "lgs{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; + "lgs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize; def LGS32rm : I<0xb5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), - "lgs{l}\t{$src, $dst|$dst, $src}", []>, TB; + "lgs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB; def LGS64rm : RI<0xb5, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src), - "lgs{q}\t{$src, $dst|$dst, $src}", []>, TB; + "lgs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB; def VERRr : I<0x00, MRM4r, (outs), (ins GR16:$seg), - "verr\t$seg", []>, TB; + "verr\t$seg", [], IIC_VERR>, TB; def VERRm : I<0x00, MRM4m, (outs), (ins i16mem:$seg), - "verr\t$seg", []>, TB; + "verr\t$seg", [], IIC_VERR>, TB; def VERWr : I<0x00, MRM5r, (outs), (ins GR16:$seg), - "verw\t$seg", []>, TB; + "verw\t$seg", [], IIC_VERW_MEM>, TB; def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg), - "verw\t$seg", []>, TB; + "verw\t$seg", [], IIC_VERW_REG>, TB; //===----------------------------------------------------------------------===// // Descriptor-table support instructions def SGDT16m : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins), - "sgdtw\t$dst", []>, TB, OpSize, Requires<[In32BitMode]>; + "sgdtw\t$dst", [], IIC_SGDT>, TB, OpSize, Requires<[In32BitMode]>; def SGDTm : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins), - "sgdt\t$dst", []>, TB; + "sgdt\t$dst", [], IIC_SGDT>, TB; def SIDT16m : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins), - "sidtw\t$dst", []>, TB, OpSize, Requires<[In32BitMode]>; + "sidtw\t$dst", [], IIC_SIDT>, TB, OpSize, Requires<[In32BitMode]>; def SIDTm : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins), "sidt\t$dst", []>, TB; def SLDT16r : I<0x00, MRM0r, (outs GR16:$dst), (ins), - "sldt{w}\t$dst", []>, TB, OpSize; + "sldt{w}\t$dst", [], IIC_SLDT>, TB, OpSize; def SLDT16m : I<0x00, MRM0m, (outs i16mem:$dst), (ins), - "sldt{w}\t$dst", []>, TB; + "sldt{w}\t$dst", [], IIC_SLDT>, TB; def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins), - "sldt{l}\t$dst", []>, TB; + "sldt{l}\t$dst", [], IIC_SLDT>, TB; // LLDT is not interpreted specially in 64-bit mode because there is no sign // extension. 
def SLDT64r : RI<0x00, MRM0r, (outs GR64:$dst), (ins), - "sldt{q}\t$dst", []>, TB; + "sldt{q}\t$dst", [], IIC_SLDT>, TB; def SLDT64m : RI<0x00, MRM0m, (outs i16mem:$dst), (ins), - "sldt{q}\t$dst", []>, TB; + "sldt{q}\t$dst", [], IIC_SLDT>, TB; def LGDT16m : I<0x01, MRM2m, (outs), (ins opaque48mem:$src), - "lgdtw\t$src", []>, TB, OpSize, Requires<[In32BitMode]>; + "lgdtw\t$src", [], IIC_LGDT>, TB, OpSize, Requires<[In32BitMode]>; def LGDTm : I<0x01, MRM2m, (outs), (ins opaque48mem:$src), - "lgdt\t$src", []>, TB; + "lgdt\t$src", [], IIC_LGDT>, TB; def LIDT16m : I<0x01, MRM3m, (outs), (ins opaque48mem:$src), - "lidtw\t$src", []>, TB, OpSize, Requires<[In32BitMode]>; + "lidtw\t$src", [], IIC_LIDT>, TB, OpSize, Requires<[In32BitMode]>; def LIDTm : I<0x01, MRM3m, (outs), (ins opaque48mem:$src), - "lidt\t$src", []>, TB; + "lidt\t$src", [], IIC_LIDT>, TB; def LLDT16r : I<0x00, MRM2r, (outs), (ins GR16:$src), - "lldt{w}\t$src", []>, TB; + "lldt{w}\t$src", [], IIC_LLDT_REG>, TB; def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src), - "lldt{w}\t$src", []>, TB; + "lldt{w}\t$src", [], IIC_LLDT_MEM>, TB; //===----------------------------------------------------------------------===// // Specialized register support -def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", []>, TB; -def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", []>, TB; -def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", []>, TB; +def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", [], IIC_WRMSR>, TB; +def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", [], IIC_RDMSR>, TB; +def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [], IIC_RDPMC>, TB; def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins), - "smsw{w}\t$dst", []>, OpSize, TB; + "smsw{w}\t$dst", [], IIC_SMSW>, OpSize, TB; def SMSW32r : I<0x01, MRM4r, (outs GR32:$dst), (ins), - "smsw{l}\t$dst", []>, TB; + "smsw{l}\t$dst", [], IIC_SMSW>, TB; // no m form encodable; use SMSW16m def SMSW64r : RI<0x01, MRM4r, (outs GR64:$dst), (ins), - "smsw{q}\t$dst", []>, TB; + "smsw{q}\t$dst", [], IIC_SMSW>, TB; // For memory operands, there is only a 16-bit form def SMSW16m : I<0x01, MRM4m, (outs i16mem:$dst), (ins), - "smsw{w}\t$dst", []>, TB; + "smsw{w}\t$dst", [], IIC_SMSW>, TB; def LMSW16r : I<0x01, MRM6r, (outs), (ins GR16:$src), - "lmsw{w}\t$src", []>, TB; + "lmsw{w}\t$src", [], IIC_LMSW_REG>, TB; def LMSW16m : I<0x01, MRM6m, (outs), (ins i16mem:$src), - "lmsw{w}\t$src", []>, TB; + "lmsw{w}\t$src", [], IIC_LMSW_MEM>, TB; -def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", []>, TB; +def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB; //===----------------------------------------------------------------------===// // Cache instructions -def INVD : I<0x08, RawFrm, (outs), (ins), "invd", []>, TB; -def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", []>, TB; +def INVD : I<0x08, RawFrm, (outs), (ins), "invd", [], IIC_INVD>, TB; +def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [], IIC_INVD>, TB; //===----------------------------------------------------------------------===// // XSAVE instructions diff --git a/lib/Target/X86/X86InstrVMX.td b/lib/Target/X86/X86InstrVMX.td index 6a8f0c8..6d3548f 100644 --- a/lib/Target/X86/X86InstrVMX.td +++ b/lib/Target/X86/X86InstrVMX.td @@ -17,17 +17,17 @@ // 66 0F 38 80 def INVEPT32 : I<0x80, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2), - "invept {$src2, $src1|$src1, $src2}", []>, OpSize, T8, + "invept\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8, Requires<[In32BitMode]>; def INVEPT64 : I<0x80, MRMSrcMem, (outs), (ins
GR64:$src1, i128mem:$src2), - "invept {$src2, $src1|$src1, $src2}", []>, OpSize, T8, + "invept\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8, Requires<[In64BitMode]>; // 66 0F 38 81 def INVVPID32 : I<0x81, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2), - "invvpid {$src2, $src1|$src1, $src2}", []>, OpSize, T8, + "invvpid\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8, Requires<[In32BitMode]>; def INVVPID64 : I<0x81, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), - "invvpid {$src2, $src1|$src1, $src2}", []>, OpSize, T8, + "invvpid\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8, Requires<[In64BitMode]>; // 0F 01 C1 def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB; diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td index 65bbcb5..8ec2c68 100644 --- a/lib/Target/X86/X86InstrXOP.td +++ b/lib/Target/X86/X86InstrXOP.td @@ -15,7 +15,7 @@ multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> { def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (Int VR128:$src))]>, VEX; - def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX; } @@ -36,27 +36,19 @@ let isAsmParserOnly = 1 in { defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, memopv2i64>; defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, memopv2i64>; defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, memopv2i64>; - defm VFRCZPS : xop2op<0x80, "vfrczps", int_x86_xop_vfrcz_ps, memopv4f32>; - defm VFRCZPD : xop2op<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, memopv2f64>; } // Scalar load 2 addr operand instructions -let Constraints = "$src1 = $dst" in { multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int, Operand memop, ComplexPattern mem_cpat> { - def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, - VR128:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (Int VR128:$src1, VR128:$src2))]>, VEX; - def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, - memop:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (Int VR128:$src1, - (bitconvert mem_cpat:$src2)))]>, VEX; + def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int VR128:$src))]>, VEX; + def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, VEX; } -} // Constraints = "$src1 = $dst" - let isAsmParserOnly = 1 in { defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss, ssmem, sse_load_f32>; @@ -64,12 +56,26 @@ let isAsmParserOnly = 1 in { sdmem, sse_load_f64>; } +multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int, + PatFrag memop> { + def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int VR128:$src))]>, VEX; + def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX; +} + +let isAsmParserOnly = 1 in { + defm VFRCZPS : xop2op128<0x80, "vfrczps", 
int_x86_xop_vfrcz_ps, memopv4f32>; + defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, memopv2f64>; +} multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> { def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (Int VR256:$src))]>, VEX, VEX_L; + [(set VR256:$dst, (Int VR256:$src))]>, VEX; def rmY : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX; @@ -88,13 +94,13 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> { !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (Int VR128:$src1, VR128:$src2))]>, VEX_4VOp3; def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f128mem:$src2), + (ins VR128:$src1, i128mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2))))]>, VEX_4V, VEX_W; def mr : IXOP<opc, MRMSrcMem, (outs VR128:$dst), - (ins f128mem:$src1, VR128:$src2), + (ins i128mem:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (Int (bitconvert (memopv2i64 addr:$src1)), VR128:$src2))]>, @@ -116,25 +122,23 @@ let isAsmParserOnly = 1 in { defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>; } -multiclass xop3opimm<bits<8> opc, string OpcodeStr> { - let neverHasSideEffects = 1 in { - def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, i8imm:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, VEX; - let mayLoad = 1 in - def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins f128mem:$src1, i8imm:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, VEX; - } +multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> { + def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, i8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, (Int VR128:$src1, imm:$src2))]>, VEX; + def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1, i8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (Int (bitconvert (memopv2i64 addr:$src1)), imm:$src2))]>, VEX; } let isAsmParserOnly = 1 in { - defm VPROTW : xop3opimm<0xC1, "vprotw">; - defm VPROTQ : xop3opimm<0xC3, "vprotq">; - defm VPROTD : xop3opimm<0xC2, "vprotd">; - defm VPROTB : xop3opimm<0xC0, "vprotb">; + defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>; + defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>; + defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>; + defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>; } // Instruction where second source can be memory, but third must be register @@ -146,7 +150,7 @@ multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> { [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_4V, VEX_I8IMM; def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f128mem:$src2, VR128:$src3), + (ins VR128:$src1, i128mem:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, @@ -170,32 +174,31 @@ let isAsmParserOnly = 1 in { } // Instruction where second source can be memory, third must be imm8 
-multiclass xop4opimm<bits<8> opc, string OpcodeStr, SDNode OpNode, - ValueType VT> { +multiclass xop4opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> { def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i8imm:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR128:$dst, - (VT (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>, VEX_4V; + [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, imm:$src3))]>, + VEX_4V; def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f128mem:$src2, i8imm:$src3), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, - (VT (OpNode VR128:$src1, (bitconvert (memopv2i64 addr:$src2)), - imm:$src3)))]>, VEX_4V; + (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)), + imm:$src3))]>, VEX_4V; } let isAsmParserOnly = 1 in { - defm VPCOMB : xop4opimm<0xCC, "vpcomb", X86vpcom, v16i8>; - defm VPCOMW : xop4opimm<0xCD, "vpcomw", X86vpcom, v8i16>; - defm VPCOMD : xop4opimm<0xCE, "vpcomd", X86vpcom, v4i32>; - defm VPCOMQ : xop4opimm<0xCF, "vpcomq", X86vpcom, v2i64>; - defm VPCOMUB : xop4opimm<0xEC, "vpcomub", X86vpcomu, v16i8>; - defm VPCOMUW : xop4opimm<0xED, "vpcomuw", X86vpcomu, v8i16>; - defm VPCOMUD : xop4opimm<0xEE, "vpcomud", X86vpcomu, v4i32>; - defm VPCOMUQ : xop4opimm<0xEF, "vpcomuq", X86vpcomu, v2i64>; + defm VPCOMB : xop4opimm<0xCC, "vpcomb", int_x86_xop_vpcomb>; + defm VPCOMW : xop4opimm<0xCD, "vpcomw", int_x86_xop_vpcomw>; + defm VPCOMD : xop4opimm<0xCE, "vpcomd", int_x86_xop_vpcomd>; + defm VPCOMQ : xop4opimm<0xCF, "vpcomq", int_x86_xop_vpcomq>; + defm VPCOMUB : xop4opimm<0xEC, "vpcomub", int_x86_xop_vpcomub>; + defm VPCOMUW : xop4opimm<0xED, "vpcomuw", int_x86_xop_vpcomuw>; + defm VPCOMUD : xop4opimm<0xEE, "vpcomud", int_x86_xop_vpcomud>; + defm VPCOMUQ : xop4opimm<0xEF, "vpcomuq", int_x86_xop_vpcomuq>; } // Instruction where either second or third source can be memory @@ -207,7 +210,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> { [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_4V, VEX_I8IMM; def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, f128mem:$src3), + (ins VR128:$src1, VR128:$src2, i128mem:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, @@ -215,7 +218,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> { (bitconvert (memopv2i64 addr:$src3))))]>, VEX_4V, VEX_I8IMM, VEX_W, MemOp4; def mr : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f128mem:$src2, VR128:$src3), + (ins VR128:$src1, i128mem:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, @@ -237,7 +240,7 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> { [(set VR256:$dst, (Int VR256:$src1, VR256:$src2, VR256:$src3))]>, VEX_4V, VEX_I8IMM; def rmY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2, f256mem:$src3), + (ins VR256:$src1, VR256:$src2, i256mem:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR256:$dst, diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index b578e8d..df7507c 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -156,10 +156,14 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const 
MachineOperand &MO, break; case X86II::MO_SECREL: RefKind = MCSymbolRefExpr::VK_SECREL; break; case X86II::MO_TLSGD: RefKind = MCSymbolRefExpr::VK_TLSGD; break; + case X86II::MO_TLSLD: RefKind = MCSymbolRefExpr::VK_TLSLD; break; + case X86II::MO_TLSLDM: RefKind = MCSymbolRefExpr::VK_TLSLDM; break; case X86II::MO_GOTTPOFF: RefKind = MCSymbolRefExpr::VK_GOTTPOFF; break; case X86II::MO_INDNTPOFF: RefKind = MCSymbolRefExpr::VK_INDNTPOFF; break; case X86II::MO_TPOFF: RefKind = MCSymbolRefExpr::VK_TPOFF; break; + case X86II::MO_DTPOFF: RefKind = MCSymbolRefExpr::VK_DTPOFF; break; case X86II::MO_NTPOFF: RefKind = MCSymbolRefExpr::VK_NTPOFF; break; + case X86II::MO_GOTNTPOFF: RefKind = MCSymbolRefExpr::VK_GOTNTPOFF; break; case X86II::MO_GOTPCREL: RefKind = MCSymbolRefExpr::VK_GOTPCREL; break; case X86II::MO_GOT: RefKind = MCSymbolRefExpr::VK_GOT; break; case X86II::MO_GOTOFF: RefKind = MCSymbolRefExpr::VK_GOTOFF; break; @@ -550,17 +554,38 @@ ReSimplify: static void LowerTlsAddr(MCStreamer &OutStreamer, X86MCInstLower &MCInstLowering, const MachineInstr &MI) { - bool is64Bits = MI.getOpcode() == X86::TLS_addr64; + + bool is64Bits = MI.getOpcode() == X86::TLS_addr64 || + MI.getOpcode() == X86::TLS_base_addr64; + + bool needsPadding = MI.getOpcode() == X86::TLS_addr64; + MCContext &context = OutStreamer.getContext(); - if (is64Bits) { + if (needsPadding) { MCInst prefix; prefix.setOpcode(X86::DATA16_PREFIX); OutStreamer.EmitInstruction(prefix); } + + MCSymbolRefExpr::VariantKind SRVK; + switch (MI.getOpcode()) { + case X86::TLS_addr32: + case X86::TLS_addr64: + SRVK = MCSymbolRefExpr::VK_TLSGD; + break; + case X86::TLS_base_addr32: + SRVK = MCSymbolRefExpr::VK_TLSLDM; + break; + case X86::TLS_base_addr64: + SRVK = MCSymbolRefExpr::VK_TLSLD; + break; + default: + llvm_unreachable("unexpected opcode"); + } + MCSymbol *sym = MCInstLowering.GetSymbolFromOperand(MI.getOperand(3)); - const MCSymbolRefExpr *symRef = - MCSymbolRefExpr::Create(sym, MCSymbolRefExpr::VK_TLSGD, context); + const MCSymbolRefExpr *symRef = MCSymbolRefExpr::Create(sym, SRVK, context); MCInst LEA; if (is64Bits) { @@ -571,6 +596,14 @@ static void LowerTlsAddr(MCStreamer &OutStreamer, LEA.addOperand(MCOperand::CreateReg(0)); // index LEA.addOperand(MCOperand::CreateExpr(symRef)); // disp LEA.addOperand(MCOperand::CreateReg(0)); // seg + } else if (SRVK == MCSymbolRefExpr::VK_TLSLDM) { + LEA.setOpcode(X86::LEA32r); + LEA.addOperand(MCOperand::CreateReg(X86::EAX)); // dest + LEA.addOperand(MCOperand::CreateReg(X86::EBX)); // base + LEA.addOperand(MCOperand::CreateImm(1)); // scale + LEA.addOperand(MCOperand::CreateReg(0)); // index + LEA.addOperand(MCOperand::CreateExpr(symRef)); // disp + LEA.addOperand(MCOperand::CreateReg(0)); // seg } else { LEA.setOpcode(X86::LEA32r); LEA.addOperand(MCOperand::CreateReg(X86::EAX)); // dest @@ -582,7 +615,7 @@ static void LowerTlsAddr(MCStreamer &OutStreamer, } OutStreamer.EmitInstruction(LEA); - if (is64Bits) { + if (needsPadding) { MCInst prefix; prefix.setOpcode(X86::DATA16_PREFIX); OutStreamer.EmitInstruction(prefix); @@ -609,8 +642,6 @@ static void LowerTlsAddr(MCStreamer &OutStreamer, } void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { - OutStreamer.EmitCodeRegion(); - X86MCInstLower MCInstLowering(Mang, *MF, *this); switch (MI->getOpcode()) { case TargetOpcode::DBG_VALUE: @@ -646,6 +677,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::TLS_addr32: case X86::TLS_addr64: + case X86::TLS_base_addr32: + case X86::TLS_base_addr64: return 
LowerTlsAddr(OutStreamer, MCInstLowering, *MI); case X86::MOVPC32r: { @@ -715,4 +748,3 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { MCInstLowering.Lower(MI, TmpInst); OutStreamer.EmitInstruction(TmpInst); } - diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h index c747109..f83a525 100644 --- a/lib/Target/X86/X86MachineFunctionInfo.h +++ b/lib/Target/X86/X86MachineFunctionInfo.h @@ -66,6 +66,8 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { /// ArgumentStackSize - The number of bytes on stack consumed by the arguments /// being passed on the stack. unsigned ArgumentStackSize; + /// NumLocalDynamics - Number of local-dynamic TLS accesses. + unsigned NumLocalDynamics; public: X86MachineFunctionInfo() : ForceFramePointer(false), @@ -79,7 +81,8 @@ public: RegSaveFrameIndex(0), VarArgsGPOffset(0), VarArgsFPOffset(0), - ArgumentStackSize(0) {} + ArgumentStackSize(0), + NumLocalDynamics(0) {} explicit X86MachineFunctionInfo(MachineFunction &MF) : ForceFramePointer(false), @@ -93,8 +96,9 @@ public: RegSaveFrameIndex(0), VarArgsGPOffset(0), VarArgsFPOffset(0), - ArgumentStackSize(0) {} - + ArgumentStackSize(0), + NumLocalDynamics(0) {} + bool getForceFramePointer() const { return ForceFramePointer;} void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; } @@ -130,6 +134,10 @@ public: unsigned getArgumentStackSize() const { return ArgumentStackSize; } void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; } + + unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; } + void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; } + }; } // End llvm namespace diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index b56025f..acf53f8 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -50,6 +50,10 @@ ForceStackAlign("force-align-stack", " needed for the function."), cl::init(false), cl::Hidden); +cl::opt<bool> +EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true), + cl::desc("Enable use of a base pointer for complex stack frames")); + X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii) : X86GenRegisterInfo(tm.getSubtarget<X86Subtarget>().is64Bit() @@ -68,10 +72,12 @@ X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm, SlotSize = 8; StackPtr = X86::RSP; FramePtr = X86::RBP; + BasePtr = X86::RBX; } else { SlotSize = 4; StackPtr = X86::ESP; FramePtr = X86::EBP; + BasePtr = X86::EBX; } } @@ -90,6 +96,12 @@ int X86RegisterInfo::getCompactUnwindRegNum(unsigned RegNum, bool isEH) const { return -1; } +bool +X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const { + // Only enable liveness tracking when post-RA scheduling is enabled, since + // that is the only pass that needs it. + return TM.getSubtargetImpl()->postRAScheduler(); +} + int X86RegisterInfo::getSEHRegNum(unsigned i) const { int reg = X86_MC::getX86RegNum(i);
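For context on the TLS_base_addr32/64 pseudo-instructions and the VK_TLSLD/VK_TLSLDM/VK_DTPOFF variant kinds wired up above: under the local-dynamic TLS model, several thread-local reads in one function can share a single __tls_get_addr call that fetches the module's TLS base, after which each variable is addressed at a fixed @DTPOFF offset. A hedged source-level sketch (variable and function names are invented; exact codegen depends on target and relocation model):

    // Compile with something like: clang++ -fPIC -ftls-model=local-dynamic -S
    __thread int counter;
    __thread int limit;

    int remaining() {
      // One TLS_base_addr-style call can resolve the module TLS base here;
      // 'counter' and 'limit' are then loaded at counter@DTPOFF and
      // limit@DTPOFF relative to that base.
      return limit - counter;
    }

The NumLocalDynamics counter added to X86MachineFunctionInfo below presumably gives later passes a cheap signal for whether materializing a shared TLS base is worth it in a given function.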
@@ -146,7 +158,7 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{ // The GR8_NOREX class is always used in a way that won't be constrained to a // sub-class, so sub-classes like GR8_ABCD_L are allowed to expand to the // full GR8 class. - if (RC == X86::GR8_NOREXRegisterClass) + if (RC == &X86::GR8_NOREXRegClass) return RC; const TargetRegisterClass *Super = RC; @@ -175,7 +187,8 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{ } const TargetRegisterClass * -X86RegisterInfo::getPointerRegClass(unsigned Kind) const { +X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) + const { switch (Kind) { default: llvm_unreachable("Unexpected Kind in getPointerRegClass!"); case 0: // Normal GPRs. @@ -238,7 +251,7 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { } if (ghcCall) - return CSR_Ghc_SaveList; + return CSR_NoRegs_SaveList; if (Is64Bit) { if (IsWin64) return CSR_Win64_SaveList; @@ -254,7 +267,7 @@ const uint32_t* X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { if (CC == CallingConv::GHC) - return CSR_Ghc_RegMask; + return CSR_NoRegs_RegMask; if (!Is64Bit) return CSR_32_RegMask; if (IsWin64) @@ -268,21 +281,33 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Set the stack-pointer register and its aliases as reserved. Reserved.set(X86::RSP); - Reserved.set(X86::ESP); - Reserved.set(X86::SP); - Reserved.set(X86::SPL); + for (MCSubRegIterator I(X86::RSP, this); I.isValid(); ++I) + Reserved.set(*I); // Set the instruction pointer register and its aliases as reserved. Reserved.set(X86::RIP); - Reserved.set(X86::EIP); - Reserved.set(X86::IP); + for (MCSubRegIterator I(X86::RIP, this); I.isValid(); ++I) + Reserved.set(*I); // Set the frame-pointer register and its aliases as reserved if needed. if (TFI->hasFP(MF)) { Reserved.set(X86::RBP); - Reserved.set(X86::EBP); - Reserved.set(X86::BP); - Reserved.set(X86::BPL); + for (MCSubRegIterator I(X86::RBP, this); I.isValid(); ++I) + Reserved.set(*I); + } + + // Set the base-pointer register and its aliases as reserved if needed. + if (hasBasePointer(MF)) { + CallingConv::ID CC = MF.getFunction()->getCallingConv(); + const uint32_t* RegMask = getCallPreservedMask(CC); + if (MachineOperand::clobbersPhysReg(RegMask, getBaseRegister())) + report_fatal_error( + "Stack realignment in presence of dynamic allocas is not supported with " + "this calling convention."); + + Reserved.set(getBaseRegister()); + for (MCSubRegIterator I(getBaseRegister(), this); I.isValid(); ++I) + Reserved.set(*I); } // Mark the segment registers as reserved. @@ -293,6 +318,16 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(X86::FS); Reserved.set(X86::GS); + // Mark the floating point stack registers as reserved. + Reserved.set(X86::ST0); + Reserved.set(X86::ST1); + Reserved.set(X86::ST2); + Reserved.set(X86::ST3); + Reserved.set(X86::ST4); + Reserved.set(X86::ST5); + Reserved.set(X86::ST6); + Reserved.set(X86::ST7); + // Reserve the registers that only exist in 64-bit mode. if (!Is64Bit) { // These 8-bit registers are part of the x86-64 extension even though their @@ -308,14 +343,13 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { X86::R8, X86::R9, X86::R10, X86::R11, X86::R12, X86::R13, X86::R14, X86::R15 }; - for (const uint16_t *AI = getOverlaps(GPR64[n]); unsigned Reg = *AI; ++AI) - Reserved.set(Reg); + for (MCRegAliasIterator AI(GPR64[n], this, true); AI.isValid(); ++AI) + Reserved.set(*AI);
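The getReservedRegs() rewrite above swaps the hand-maintained alias lists (ESP/SP/SPL and friends) for the new MC register iterators. A minimal sketch of the pattern outside any pass (the helper name is invented; MCRegAliasIterator with IncludeSelf=true visits the register itself plus everything overlapping it):

    #include "llvm/ADT/BitVector.h"
    #include "llvm/MC/MCRegisterInfo.h"

    // Reserve Reg and every register that aliases it, so reserving RSP
    // also reserves ESP, SP and SPL without spelling them out.
    static void reserveWithAliases(unsigned Reg, const llvm::MCRegisterInfo *MCRI,
                                   llvm::BitVector &Reserved) {
      for (llvm::MCRegAliasIterator AI(Reg, MCRI, /*IncludeSelf=*/true);
           AI.isValid(); ++AI)
        Reserved.set(*AI);
    }

MCSubRegIterator is the narrower variant used for RSP/RIP/RBP above: it walks only sub-registers, which is enough there because those registers are set explicitly first and have no super-registers.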
// XMM8, XMM9, ... assert(X86::XMM15 == X86::XMM8+7); - for (const uint16_t *AI = getOverlaps(X86::XMM8 + n); unsigned Reg = *AI; - ++AI) - Reserved.set(Reg); + for (MCRegAliasIterator AI(X86::XMM8 + n, this, true); AI.isValid(); ++AI) + Reserved.set(*AI); } } @@ -326,10 +360,36 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Stack Frame Processing methods //===----------------------------------------------------------------------===// +bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + + if (!EnableBasePointer) + return false; + + // When we need stack realignment and there are dynamic allocas, we can't + // reference off of the stack pointer, so we reserve a base pointer. + if (needsStackRealignment(MF) && MFI->hasVarSizedObjects()) + return true; + + return false; +} + bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); - return (MF.getTarget().Options.RealignStack && - !MFI->hasVarSizedObjects()); + const MachineRegisterInfo *MRI = &MF.getRegInfo(); + if (!MF.getTarget().Options.RealignStack) + return false; + + // Stack realignment requires a frame pointer. If we already started + // register allocation with frame pointer elimination, it is too late now. + if (!MRI->canReserveReg(FramePtr)) + return false; + + // If a base pointer is necessary, check that it isn't too late to reserve + // it. + if (MFI->hasVarSizedObjects()) + return MRI->canReserveReg(BasePtr); + return true; } bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { @@ -339,13 +399,6 @@ bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || F->hasFnAttr(Attribute::StackAlignment)); - // FIXME: Currently we don't support stack realignment for functions with - // variable-sized allocas. - // FIXME: It's more complicated than this... - if (0 && requiresRealignment && MFI->hasVarSizedObjects()) - report_fatal_error( - "Stack realignment in presence of dynamic allocas is not supported"); - // If we've requested that we force align the stack do so now. if (ForceStackAlign) return canRealignStack(MF); @@ -485,7 +538,9 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, unsigned Opc = MI.getOpcode(); bool AfterFPPop = Opc == X86::TAILJMPm64 || Opc == X86::TAILJMPm; - if (needsStackRealignment(MF)) + if (hasBasePointer(MF)) + BasePtr = (FrameIndex < 0 ? FramePtr : getBaseRegister()); + else if (needsStackRealignment(MF)) BasePtr = (FrameIndex < 0 ? FramePtr : StackPtr); else if (AfterFPPop) BasePtr = StackPtr; diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h index bee0393..1bc32cb 100644 --- a/lib/Target/X86/X86RegisterInfo.h +++ b/lib/Target/X86/X86RegisterInfo.h @@ -50,6 +50,11 @@ private: /// unsigned FramePtr; + /// BasePtr - X86 physical register used as a base pointer in complex stack + /// frames, i.e., when we need a third base register, not just SP and FP, + /// due to variable-sized stack objects. + unsigned BasePtr; + public: X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii); @@ -65,7 +70,8 @@ public: int getCompactUnwindRegNum(unsigned RegNum, bool isEH) const;
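To make hasBasePointer() concrete, here is a hedged sketch of the kind of user code that now takes the base-pointer path (names are invented; assumes a glibc-style alloca.h). The over-aligned local forces the prologue to realign RSP, and the alloca then moves RSP at run time; the realigned locals cannot be addressed off RBP (which keeps the unaligned incoming frame), so RBX/EBX is reserved as a third anchor:

    #include <alloca.h>
    #include <cstring>

    void consume(void *fixed, void *dynamic);

    void realigned_with_alloca(unsigned n) {
      alignas(32) char big[64];   // alignment above the default stack alignment
      void *dyn = alloca(n);      // dynamically sized object: RSP varies at run time
      std::memset(big, 0, sizeof big);
      consume(big, dyn);
    }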
/// Code Generation virtual methods... - /// + /// + virtual bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const; /// getMatchingSuperRegClass - Return a subclass of the specified register /// class A so that each register in it has a sub-register of the @@ -82,7 +88,8 @@ public: /// getPointerRegClass - Returns a TargetRegisterClass used for pointer /// values. - const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const; + const TargetRegisterClass * + getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const; /// getCrossCopyRegClass - Returns a legal register class to copy a register /// in the specified class to or from. Returns NULL if it is possible to copy @@ -104,6 +111,8 @@ public: /// register scavenger to determine what registers are free. BitVector getReservedRegs(const MachineFunction &MF) const; + bool hasBasePointer(const MachineFunction &MF) const; + bool canRealignStack(const MachineFunction &MF) const; bool needsStackRealignment(const MachineFunction &MF) const; @@ -121,6 +130,7 @@ public: // Debug information queries. unsigned getFrameRegister(const MachineFunction &MF) const; unsigned getStackRegister() const { return StackPtr; } + unsigned getBaseRegister() const { return BasePtr; } - // FIXME: Move to FrameInfok + // FIXME: Move to FrameInfo unsigned getSlotSize() const { return SlotSize; } diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index 5263a49..ae2d4d0 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -223,6 +223,9 @@ let Namespace = "X86" in { def ST6 : STRegister<"st(6)", [FP1]>, DwarfRegNum<[39, 18, 17]>; def ST7 : STRegister<"st(7)", [FP0]>, DwarfRegNum<[40, 19, 18]>; + // Floating-point status word + def FPSW : Register<"fpsw">; + // Status flags register def EFLAGS : Register<"flags">; @@ -296,26 +299,18 @@ def GR8 : RegisterClass<"X86", [i8], 8, def GR16 : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, SI, DI, BX, BP, SP, - R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W)> { - let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi)]; -} + R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W)>; def GR32 : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP, - R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D)> { - let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)]; -} + R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D)>; // GR64 - 64-bit GPRs. This oddly includes RIP, which isn't accurate, since // RIP isn't really a register and it can't be used anywhere except in an // address, but it doesn't cause trouble. def GR64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, - RBX, R14, R15, R12, R13, RBP, RSP, RIP)> { - let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), - (GR16 sub_16bit), - (GR32 sub_32bit)]; -} + RBX, R14, R15, R12, R13, RBP, RSP, RIP)>; // Segment registers for use by MOV instructions (and others) that have a // segment register as one operand. Always contain a 16-bit segment @@ -336,30 +331,12 @@ def CONTROL_REG : RegisterClass<"X86", [i64], 64, (sequence "CR%u", 0, 15)>; // operations.
def GR8_ABCD_L : RegisterClass<"X86", [i8], 8, (add AL, CL, DL, BL)>; def GR8_ABCD_H : RegisterClass<"X86", [i8], 8, (add AH, CH, DH, BH)>; -def GR16_ABCD : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, BX)> { - let SubRegClasses = [(GR8_ABCD_L sub_8bit), (GR8_ABCD_H sub_8bit_hi)]; -} -def GR32_ABCD : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, EBX)> { - let SubRegClasses = [(GR8_ABCD_L sub_8bit), - (GR8_ABCD_H sub_8bit_hi), - (GR16_ABCD sub_16bit)]; -} -def GR64_ABCD : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RBX)> { - let SubRegClasses = [(GR8_ABCD_L sub_8bit), - (GR8_ABCD_H sub_8bit_hi), - (GR16_ABCD sub_16bit), - (GR32_ABCD sub_32bit)]; -} -def GR32_TC : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)> { - let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)]; -} +def GR16_ABCD : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, BX)>; +def GR32_ABCD : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, EBX)>; +def GR64_ABCD : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RBX)>; +def GR32_TC : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)>; def GR64_TC : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI, - R8, R9, R11, RIP)> { - let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), - (GR16 sub_16bit), - (GR32_TC sub_32bit)]; -} - + R8, R9, R11, RIP)>; def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, R8, R9, R11)>; @@ -373,64 +350,36 @@ def GR8_NOREX : RegisterClass<"X86", [i8], 8, } // GR16_NOREX - GR16 registers which do not require a REX prefix. def GR16_NOREX : RegisterClass<"X86", [i16], 16, - (add AX, CX, DX, SI, DI, BX, BP, SP)> { - let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi)]; -} + (add AX, CX, DX, SI, DI, BX, BP, SP)>; // GR32_NOREX - GR32 registers which do not require a REX prefix. def GR32_NOREX : RegisterClass<"X86", [i32], 32, - (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP)> { - let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi), - (GR16_NOREX sub_16bit)]; -} + (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP)>; // GR64_NOREX - GR64 registers which do not require a REX prefix. def GR64_NOREX : RegisterClass<"X86", [i64], 64, - (add RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP)> { - let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi), - (GR16_NOREX sub_16bit), - (GR32_NOREX sub_32bit)]; -} + (add RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP)>; // GR32_NOAX - GR32 registers except EAX. Used by AddRegFrm of XCHG32 in 64-bit // mode to prevent encoding using the 0x90 NOP encoding. xchg %eax, %eax needs // to clear upper 32-bits of RAX so is not a NOP. -def GR32_NOAX : RegisterClass<"X86", [i32], 32, (sub GR32, EAX)> { - let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)]; -} +def GR32_NOAX : RegisterClass<"X86", [i32], 32, (sub GR32, EAX)>; // GR32_NOSP - GR32 registers except ESP. -def GR32_NOSP : RegisterClass<"X86", [i32], 32, (sub GR32, ESP)> { - let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)]; -} +def GR32_NOSP : RegisterClass<"X86", [i32], 32, (sub GR32, ESP)>; // GR64_NOSP - GR64 registers except RSP (and RIP). -def GR64_NOSP : RegisterClass<"X86", [i64], 64, (sub GR64, RSP, RIP)> { - let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), - (GR16 sub_16bit), - (GR32_NOSP sub_32bit)]; -} +def GR64_NOSP : RegisterClass<"X86", [i64], 64, (sub GR64, RSP, RIP)>; // GR32_NOREX_NOSP - GR32 registers which do not require a REX prefix except // ESP. 
def GR32_NOREX_NOSP : RegisterClass<"X86", [i32], 32, - (and GR32_NOREX, GR32_NOSP)> { - let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi), - (GR16_NOREX sub_16bit)]; -} + (and GR32_NOREX, GR32_NOSP)>; // GR64_NOREX_NOSP - GR64_NOREX registers except RSP. def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64, - (and GR64_NOREX, GR64_NOSP)> { - let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi), - (GR16_NOREX sub_16bit), - (GR32_NOREX_NOSP sub_32bit)]; -} + (and GR64_NOREX, GR64_NOSP)>; // A class to support the 'A' assembler constraint: EAX then EDX. -def GR32_AD : RegisterClass<"X86", [i32], 32, (add EAX, EDX)> { - let SubRegClasses = [(GR8_ABCD_L sub_8bit), - (GR8_ABCD_H sub_8bit_hi), - (GR16_ABCD sub_16bit)]; -} +def GR32_AD : RegisterClass<"X86", [i32], 32, (add EAX, EDX)>; // Scalar SSE2 floating point registers. def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>; @@ -458,17 +407,16 @@ def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> { // Generic vector registers: VR64 and VR128. def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>; def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - 128, (add FR32)> { - let SubRegClasses = [(FR32 sub_ss), (FR64 sub_sd)]; -} - + 128, (add FR32)>; def VR256 : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], - 256, (sequence "YMM%u", 0, 15)> { - let SubRegClasses = [(FR32 sub_ss), (FR64 sub_sd), (VR128 sub_xmm)]; -} + 256, (sequence "YMM%u", 0, 15)>; // Status flags registers. def CCR : RegisterClass<"X86", [i32], 32, (add EFLAGS)> { let CopyCost = -1; // Don't allow copying of status registers. let isAllocatable = 0; } +def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> { + let CopyCost = -1; // Don't allow copying of status registers. 
+ let isAllocatable = 0; +} diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td index 17f4efd..c14407f 100644 --- a/lib/Target/X86/X86Schedule.td +++ b/lib/Target/X86/X86Schedule.td @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// Instruction Itinerary classes used for X86 +// Instruction Itinerary classes used for X86 def IIC_DEFAULT : InstrItinClass; def IIC_ALU_MEM : InstrItinClass; def IIC_ALU_NONMEM : InstrItinClass; @@ -253,6 +253,42 @@ def IIC_SSE_CVT_SS2SI64_RR : InstrItinClass; def IIC_SSE_CVT_SD2SI_RM : InstrItinClass; def IIC_SSE_CVT_SD2SI_RR : InstrItinClass; +// MMX +def IIC_MMX_MOV_MM_RM : InstrItinClass; +def IIC_MMX_MOV_REG_MM : InstrItinClass; +def IIC_MMX_MOVQ_RM : InstrItinClass; +def IIC_MMX_MOVQ_RR : InstrItinClass; + +def IIC_MMX_ALU_RM : InstrItinClass; +def IIC_MMX_ALU_RR : InstrItinClass; +def IIC_MMX_ALUQ_RM : InstrItinClass; +def IIC_MMX_ALUQ_RR : InstrItinClass; +def IIC_MMX_PHADDSUBW_RM : InstrItinClass; +def IIC_MMX_PHADDSUBW_RR : InstrItinClass; +def IIC_MMX_PHADDSUBD_RM : InstrItinClass; +def IIC_MMX_PHADDSUBD_RR : InstrItinClass; +def IIC_MMX_PMUL : InstrItinClass; +def IIC_MMX_MISC_FUNC_MEM : InstrItinClass; +def IIC_MMX_MISC_FUNC_REG : InstrItinClass; +def IIC_MMX_PSADBW : InstrItinClass; +def IIC_MMX_SHIFT_RI : InstrItinClass; +def IIC_MMX_SHIFT_RM : InstrItinClass; +def IIC_MMX_SHIFT_RR : InstrItinClass; +def IIC_MMX_UNPCK_H_RM : InstrItinClass; +def IIC_MMX_UNPCK_H_RR : InstrItinClass; +def IIC_MMX_UNPCK_L : InstrItinClass; +def IIC_MMX_PCK_RM : InstrItinClass; +def IIC_MMX_PCK_RR : InstrItinClass; +def IIC_MMX_PSHUF : InstrItinClass; +def IIC_MMX_PEXTR : InstrItinClass; +def IIC_MMX_PINSRW : InstrItinClass; +def IIC_MMX_MASKMOV : InstrItinClass; + +def IIC_MMX_CVT_PD_RR : InstrItinClass; +def IIC_MMX_CVT_PD_RM : InstrItinClass; +def IIC_MMX_CVT_PS_RR : InstrItinClass; +def IIC_MMX_CVT_PS_RM : InstrItinClass; + def IIC_CMPX_LOCK : InstrItinClass; def IIC_CMPX_LOCK_8 : InstrItinClass; def IIC_CMPX_LOCK_8B : InstrItinClass; @@ -261,13 +297,185 @@ def IIC_CMPX_LOCK_16B : InstrItinClass; def IIC_XADD_LOCK_MEM : InstrItinClass; def IIC_XADD_LOCK_MEM8 : InstrItinClass; +def IIC_FILD : InstrItinClass; +def IIC_FLD : InstrItinClass; +def IIC_FLD80 : InstrItinClass; +def IIC_FST : InstrItinClass; +def IIC_FST80 : InstrItinClass; +def IIC_FIST : InstrItinClass; +def IIC_FLDZ : InstrItinClass; +def IIC_FUCOM : InstrItinClass; +def IIC_FUCOMI : InstrItinClass; +def IIC_FCOMI : InstrItinClass; +def IIC_FNSTSW : InstrItinClass; +def IIC_FNSTCW : InstrItinClass; +def IIC_FLDCW : InstrItinClass; +def IIC_FNINIT : InstrItinClass; +def IIC_FFREE : InstrItinClass; +def IIC_FNCLEX : InstrItinClass; +def IIC_WAIT : InstrItinClass; +def IIC_FXAM : InstrItinClass; +def IIC_FNOP : InstrItinClass; +def IIC_FLDL : InstrItinClass; +def IIC_F2XM1 : InstrItinClass; +def IIC_FYL2X : InstrItinClass; +def IIC_FPTAN : InstrItinClass; +def IIC_FPATAN : InstrItinClass; +def IIC_FXTRACT : InstrItinClass; +def IIC_FPREM1 : InstrItinClass; +def IIC_FPSTP : InstrItinClass; +def IIC_FPREM : InstrItinClass; +def IIC_FYL2XP1 : InstrItinClass; +def IIC_FSINCOS : InstrItinClass; +def IIC_FRNDINT : InstrItinClass; +def IIC_FSCALE : InstrItinClass; +def IIC_FCOMPP : InstrItinClass; +def IIC_FXSAVE : InstrItinClass; +def IIC_FXRSTOR : InstrItinClass; + +def IIC_FXCH : InstrItinClass; + +// System instructions +def IIC_CPUID : InstrItinClass; 
+def IIC_INT : InstrItinClass; +def IIC_INT3 : InstrItinClass; +def IIC_INVD : InstrItinClass; +def IIC_INVLPG : InstrItinClass; +def IIC_IRET : InstrItinClass; +def IIC_HLT : InstrItinClass; +def IIC_LXS : InstrItinClass; +def IIC_LTR : InstrItinClass; +def IIC_RDTSC : InstrItinClass; +def IIC_RSM : InstrItinClass; +def IIC_SIDT : InstrItinClass; +def IIC_SGDT : InstrItinClass; +def IIC_SLDT : InstrItinClass; +def IIC_STR : InstrItinClass; +def IIC_SWAPGS : InstrItinClass; +def IIC_SYSCALL : InstrItinClass; +def IIC_SYS_ENTER_EXIT : InstrItinClass; +def IIC_IN_RR : InstrItinClass; +def IIC_IN_RI : InstrItinClass; +def IIC_OUT_RR : InstrItinClass; +def IIC_OUT_IR : InstrItinClass; +def IIC_INS : InstrItinClass; +def IIC_MOV_REG_DR : InstrItinClass; +def IIC_MOV_DR_REG : InstrItinClass; +def IIC_MOV_REG_CR : InstrItinClass; +def IIC_MOV_CR_REG : InstrItinClass; +def IIC_MOV_REG_SR : InstrItinClass; +def IIC_MOV_MEM_SR : InstrItinClass; +def IIC_MOV_SR_REG : InstrItinClass; +def IIC_MOV_SR_MEM : InstrItinClass; +def IIC_LAR_RM : InstrItinClass; +def IIC_LAR_RR : InstrItinClass; +def IIC_LSL_RM : InstrItinClass; +def IIC_LSL_RR : InstrItinClass; +def IIC_LGDT : InstrItinClass; +def IIC_LIDT : InstrItinClass; +def IIC_LLDT_REG : InstrItinClass; +def IIC_LLDT_MEM : InstrItinClass; +def IIC_PUSH_CS : InstrItinClass; +def IIC_PUSH_SR : InstrItinClass; +def IIC_POP_SR : InstrItinClass; +def IIC_POP_SR_SS : InstrItinClass; +def IIC_VERR : InstrItinClass; +def IIC_VERW_REG : InstrItinClass; +def IIC_VERW_MEM : InstrItinClass; +def IIC_WRMSR : InstrItinClass; +def IIC_RDMSR : InstrItinClass; +def IIC_RDPMC : InstrItinClass; +def IIC_SMSW : InstrItinClass; +def IIC_LMSW_REG : InstrItinClass; +def IIC_LMSW_MEM : InstrItinClass; +def IIC_ENTER : InstrItinClass; +def IIC_LEAVE : InstrItinClass; +def IIC_POP_MEM : InstrItinClass; +def IIC_POP_REG16 : InstrItinClass; +def IIC_POP_REG : InstrItinClass; +def IIC_POP_F : InstrItinClass; +def IIC_POP_FD : InstrItinClass; +def IIC_POP_A : InstrItinClass; +def IIC_PUSH_IMM : InstrItinClass; +def IIC_PUSH_MEM : InstrItinClass; +def IIC_PUSH_REG : InstrItinClass; +def IIC_PUSH_F : InstrItinClass; +def IIC_PUSH_A : InstrItinClass; +def IIC_BSWAP : InstrItinClass; +def IIC_BSF : InstrItinClass; +def IIC_BSR : InstrItinClass; +def IIC_MOVS : InstrItinClass; +def IIC_STOS : InstrItinClass; +def IIC_SCAS : InstrItinClass; +def IIC_CMPS : InstrItinClass; +def IIC_MOV : InstrItinClass; +def IIC_MOV_MEM : InstrItinClass; +def IIC_AHF : InstrItinClass; +def IIC_BT_MI : InstrItinClass; +def IIC_BT_MR : InstrItinClass; +def IIC_BT_RI : InstrItinClass; +def IIC_BT_RR : InstrItinClass; +def IIC_BTX_MI : InstrItinClass; +def IIC_BTX_MR : InstrItinClass; +def IIC_BTX_RI : InstrItinClass; +def IIC_BTX_RR : InstrItinClass; +def IIC_XCHG_REG : InstrItinClass; +def IIC_XCHG_MEM : InstrItinClass; +def IIC_XADD_REG : InstrItinClass; +def IIC_XADD_MEM : InstrItinClass; +def IIC_CMPXCHG_MEM : InstrItinClass; +def IIC_CMPXCHG_REG : InstrItinClass; +def IIC_CMPXCHG_MEM8 : InstrItinClass; +def IIC_CMPXCHG_REG8 : InstrItinClass; +def IIC_CMPXCHG_8B : InstrItinClass; +def IIC_CMPXCHG_16B : InstrItinClass; +def IIC_LODS : InstrItinClass; +def IIC_OUTS : InstrItinClass; +def IIC_CLC : InstrItinClass; +def IIC_CLD : InstrItinClass; +def IIC_CLI : InstrItinClass; +def IIC_CMC : InstrItinClass; +def IIC_CLTS : InstrItinClass; +def IIC_STC : InstrItinClass; +def IIC_STI : InstrItinClass; +def IIC_STD : InstrItinClass; +def IIC_XLAT : InstrItinClass; +def IIC_AAA : InstrItinClass; +def IIC_AAD : 
InstrItinClass; +def IIC_AAM : InstrItinClass; +def IIC_AAS : InstrItinClass; +def IIC_DAA : InstrItinClass; +def IIC_DAS : InstrItinClass; +def IIC_BOUND : InstrItinClass; +def IIC_ARPL_REG : InstrItinClass; +def IIC_ARPL_MEM : InstrItinClass; +def IIC_MOVBE : InstrItinClass; + +def IIC_NOP : InstrItinClass; //===----------------------------------------------------------------------===// // Processor instruction itineraries. -def GenericItineraries : ProcessorItineraries<[], [], []>; +// IssueWidth is analogous to the number of decode units. Core and its +// descendants, including Nehalem and SandyBridge, have 4 decoders. +// Resources beyond the decoder operate on micro-ops and are buffered +// so adjacent micro-ops don't directly compete. +// +// MinLatency=0 indicates that RAW dependencies can be decoded in the +// same cycle. +// +// HighLatency=10 is optimistic. X86InstrInfo::isHighLatencyDef +// indicates high latency opcodes. Alternatively, InstrItinData +// entries may be included here to define specific operand +// latencies. Since these latencies are not used for pipeline hazards, +// they do not need to be exact. +// +// The GenericModel contains no instruction itineraries. +def GenericModel : SchedMachineModel { + let IssueWidth = 4; + let MinLatency = 0; + let LoadLatency = 4; + let HighLatency = 10; +} include "X86ScheduleAtom.td" - - -
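The comment block above is effectively the contract for the four numbers in GenericModel. As a hedged sketch of how a consumer might act on them (this assumes the MCSchedModel generated from SchedMachineModel exposes public fields with these names, as in this era of LLVM; the function itself is invented):

    #include "llvm/MC/MCSchedule.h"

    // Crude latency estimate in the spirit of the comment above: loads pay
    // LoadLatency, opcodes flagged by X86InstrInfo::isHighLatencyDef pay
    // HighLatency, and everything else pays MinLatency (0 means a dependent
    // instruction may be decoded in the same cycle).
    static unsigned guessLatency(const llvm::MCSchedModel &SM,
                                 bool IsLoad, bool IsHighLatencyDef) {
      if (IsHighLatencyDef)
        return SM.HighLatency;
      if (IsLoad)
        return SM.LoadLatency;
      return SM.MinLatency;
    }

Since these values feed heuristics rather than pipeline hazard checks, rough numbers are acceptable, which is exactly the point the comment makes.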
diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td index 77d4e56..8710261 100644 --- a/lib/Target/X86/X86ScheduleAtom.td +++ b/lib/Target/X86/X86ScheduleAtom.td @@ -106,7 +106,7 @@ def AtomItineraries : ProcessorItineraries< InstrItinData<IIC_CMOV64_RM, [InstrStage<1, [Port0]>] >, InstrItinData<IIC_CMOV64_RR, [InstrStage<1, [Port0, Port1]>] >, // set - InstrItinData<IIC_SET_M, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_SET_M, [InstrStage<2, [Port0, Port1]>] >, InstrItinData<IIC_SET_R, [InstrStage<1, [Port0, Port1]>] >, // jcc InstrItinData<IIC_Jcc, [InstrStage<1, [Port1]>] >, @@ -294,12 +294,237 @@ def AtomItineraries : ProcessorItineraries< InstrItinData<IIC_SSE_CVT_SD2SI_RR, [InstrStage<8, [Port0, Port1]>] >, InstrItinData<IIC_SSE_CVT_SD2SI_RM, [InstrStage<9, [Port0, Port1]>] >, + // MMX MOVs + InstrItinData<IIC_MMX_MOV_MM_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_MOV_REG_MM, [InstrStage<3, [Port0]>] >, + InstrItinData<IIC_MMX_MOVQ_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_MOVQ_RR, [InstrStage<1, [Port0, Port1]>] >, + // other MMX + InstrItinData<IIC_MMX_ALU_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_ALU_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_ALUQ_RM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_ALUQ_RR, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PHADDSUBW_RM, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PHADDSUBW_RR, [InstrStage<5, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PHADDSUBD_RM, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PHADDSUBD_RR, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PMUL, [InstrStage<4, [Port0]>] >, + InstrItinData<IIC_MMX_MISC_FUNC_MEM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_MISC_FUNC_REG, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PSADBW, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_SHIFT_RI, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_SHIFT_RM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_SHIFT_RR, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_UNPCK_H_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_UNPCK_H_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_UNPCK_L, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_PCK_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_PCK_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PSHUF, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_PEXTR, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PINSRW, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_MASKMOV, [InstrStage<1, [Port0]>] >, + // conversions + // from/to PD + InstrItinData<IIC_MMX_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_CVT_PD_RM, [InstrStage<8, [Port0, Port1]>] >, + // from/to PI + InstrItinData<IIC_MMX_CVT_PS_RR, [InstrStage<5, [Port1]>] >, + InstrItinData<IIC_MMX_CVT_PS_RM, [InstrStage<5, [Port0], 0>, + InstrStage<5, [Port1]>]>, + InstrItinData<IIC_CMPX_LOCK, [InstrStage<14, [Port0, Port1]>] >, InstrItinData<IIC_CMPX_LOCK_8, [InstrStage<6, [Port0, Port1]>] >, InstrItinData<IIC_CMPX_LOCK_8B, [InstrStage<18, [Port0, Port1]>] >, InstrItinData<IIC_CMPX_LOCK_16B, [InstrStage<22, [Port0, Port1]>] >, InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<2, [Port0, Port1]>] >, - InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<3, [Port0, Port1]>] > + InstrItinData<IIC_XADD_LOCK_MEM8, [InstrStage<3, [Port0, Port1]>] >, + + InstrItinData<IIC_FILD, [InstrStage<5, [Port0], 0>, InstrStage<5, [Port1]>] >, + InstrItinData<IIC_FLD, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_FLD80, [InstrStage<4, [Port0, Port1]>] >, + + InstrItinData<IIC_FST, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_FST80, [InstrStage<5, [Port0, Port1]>] >, + InstrItinData<IIC_FIST, [InstrStage<6, [Port0, Port1]>] >, + + InstrItinData<IIC_FLDZ, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_FUCOM, [InstrStage<1, [Port1]>] >, + InstrItinData<IIC_FUCOMI, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_FCOMI, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_FNSTSW, [InstrStage<10, [Port0, Port1]>] >, + InstrItinData<IIC_FNSTCW, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_FLDCW, [InstrStage<5, [Port0, Port1]>] >, + InstrItinData<IIC_FNINIT, [InstrStage<63, [Port0, Port1]>] >, + InstrItinData<IIC_FFREE, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_FNCLEX, [InstrStage<25, [Port0, Port1]>] >, + InstrItinData<IIC_WAIT, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_FXAM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_FNOP, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_FLDL, [InstrStage<10, [Port0, Port1]>] >, + InstrItinData<IIC_F2XM1, [InstrStage<99, [Port0, Port1]>] >, + InstrItinData<IIC_FYL2X, [InstrStage<146, [Port0, Port1]>] >, + InstrItinData<IIC_FPTAN, [InstrStage<168, [Port0, Port1]>] >, + InstrItinData<IIC_FPATAN, [InstrStage<183, [Port0, Port1]>] >, + InstrItinData<IIC_FXTRACT, [InstrStage<25, [Port0, Port1]>] >, + InstrItinData<IIC_FPREM1, [InstrStage<71, [Port0, Port1]>] >, + InstrItinData<IIC_FPSTP, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_FPREM, [InstrStage<55, [Port0, Port1]>] >, + InstrItinData<IIC_FYL2XP1, [InstrStage<147, [Port0, Port1]>] >, + InstrItinData<IIC_FSINCOS, [InstrStage<174, [Port0, Port1]>] >, + InstrItinData<IIC_FRNDINT, [InstrStage<46, [Port0, Port1]>] >, + InstrItinData<IIC_FSCALE, [InstrStage<77, [Port0, Port1]>] >, + InstrItinData<IIC_FCOMPP, [InstrStage<1, [Port1]>] >, + InstrItinData<IIC_FXSAVE, [InstrStage<140, [Port0, Port1]>] >, + InstrItinData<IIC_FXRSTOR,
[InstrStage<141, [Port0, Port1]>] >, + InstrItinData<IIC_FXCH, [InstrStage<1, [Port0], 0>, InstrStage<1, [Port1]>] >, + + // System instructions + InstrItinData<IIC_CPUID, [InstrStage<121, [Port0, Port1]>] >, + InstrItinData<IIC_INT, [InstrStage<127, [Port0, Port1]>] >, + InstrItinData<IIC_INT3, [InstrStage<130, [Port0, Port1]>] >, + InstrItinData<IIC_INVD, [InstrStage<1003, [Port0, Port1]>] >, + InstrItinData<IIC_INVLPG, [InstrStage<71, [Port0, Port1]>] >, + InstrItinData<IIC_IRET, [InstrStage<109, [Port0, Port1]>] >, + InstrItinData<IIC_HLT, [InstrStage<121, [Port0, Port1]>] >, + InstrItinData<IIC_LXS, [InstrStage<10, [Port0, Port1]>] >, + InstrItinData<IIC_LTR, [InstrStage<83, [Port0, Port1]>] >, + InstrItinData<IIC_RDTSC, [InstrStage<30, [Port0, Port1]>] >, + InstrItinData<IIC_RSM, [InstrStage<741, [Port0, Port1]>] >, + InstrItinData<IIC_SIDT, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_SGDT, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_SLDT, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_STR, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_SWAPGS, [InstrStage<22, [Port0, Port1]>] >, + InstrItinData<IIC_SYSCALL, [InstrStage<96, [Port0, Port1]>] >, + InstrItinData<IIC_SYS_ENTER_EXIT, [InstrStage<88, [Port0, Port1]>] >, + + InstrItinData<IIC_IN_RR, [InstrStage<94, [Port0, Port1]>] >, + InstrItinData<IIC_IN_RI, [InstrStage<92, [Port0, Port1]>] >, + InstrItinData<IIC_OUT_RR, [InstrStage<68, [Port0, Port1]>] >, + InstrItinData<IIC_OUT_IR, [InstrStage<72, [Port0, Port1]>] >, + InstrItinData<IIC_INS, [InstrStage<59, [Port0, Port1]>] >, + + InstrItinData<IIC_MOV_REG_DR, [InstrStage<88, [Port0, Port1]>] >, + InstrItinData<IIC_MOV_DR_REG, [InstrStage<123, [Port0, Port1]>] >, + // worst case for mov REG_CRx + InstrItinData<IIC_MOV_REG_CR, [InstrStage<12, [Port0, Port1]>] >, + InstrItinData<IIC_MOV_CR_REG, [InstrStage<136, [Port0, Port1]>] >, + + InstrItinData<IIC_MOV_REG_SR, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MOV_MEM_SR, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_MOV_SR_REG, [InstrStage<21, [Port0, Port1]>] >, + InstrItinData<IIC_MOV_SR_MEM, [InstrStage<26, [Port0, Port1]>] >, + // LAR + InstrItinData<IIC_LAR_RM, [InstrStage<50, [Port0, Port1]>] >, + InstrItinData<IIC_LAR_RR, [InstrStage<54, [Port0, Port1]>] >, + // LSL + InstrItinData<IIC_LSL_RM, [InstrStage<46, [Port0, Port1]>] >, + InstrItinData<IIC_LSL_RR, [InstrStage<49, [Port0, Port1]>] >, + + InstrItinData<IIC_LGDT, [InstrStage<44, [Port0, Port1]>] >, + InstrItinData<IIC_LIDT, [InstrStage<44, [Port0, Port1]>] >, + InstrItinData<IIC_LLDT_REG, [InstrStage<60, [Port0, Port1]>] >, + InstrItinData<IIC_LLDT_MEM, [InstrStage<64, [Port0, Port1]>] >, + // push control register, segment registers + InstrItinData<IIC_PUSH_CS, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_PUSH_SR, [InstrStage<2, [Port0, Port1]>] >, + // pop control register, segment registers + InstrItinData<IIC_POP_SR, [InstrStage<29, [Port0, Port1]>] >, + InstrItinData<IIC_POP_SR_SS, [InstrStage<48, [Port0, Port1]>] >, + // VERR, VERW + InstrItinData<IIC_VERR, [InstrStage<41, [Port0, Port1]>] >, + InstrItinData<IIC_VERW_REG, [InstrStage<51, [Port0, Port1]>] >, + InstrItinData<IIC_VERW_MEM, [InstrStage<50, [Port0, Port1]>] >, + // WRMSR, RDMSR + InstrItinData<IIC_WRMSR, [InstrStage<202, [Port0, Port1]>] >, + InstrItinData<IIC_RDMSR, [InstrStage<78, [Port0, Port1]>] >, + InstrItinData<IIC_RDPMC, [InstrStage<46, [Port0, Port1]>] >, + // SMSW, LMSW + InstrItinData<IIC_SMSW, [InstrStage<9, [Port0, Port1]>] >, + 
InstrItinData<IIC_LMSW_REG, [InstrStage<69, [Port0, Port1]>] >, + InstrItinData<IIC_LMSW_MEM, [InstrStage<67, [Port0, Port1]>] >, + + InstrItinData<IIC_ENTER, [InstrStage<32, [Port0, Port1]>] >, + InstrItinData<IIC_LEAVE, [InstrStage<2, [Port0, Port1]>] >, + + InstrItinData<IIC_POP_MEM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_POP_REG16, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_POP_REG, [InstrStage<1, [Port0], 0>, + InstrStage<1, [Port1]>] >, + InstrItinData<IIC_POP_F, [InstrStage<32, [Port0, Port1]>] >, + InstrItinData<IIC_POP_FD, [InstrStage<26, [Port0, Port1]>] >, + InstrItinData<IIC_POP_A, [InstrStage<9, [Port0, Port1]>] >, + + InstrItinData<IIC_PUSH_IMM, [InstrStage<1, [Port0], 0>, + InstrStage<1, [Port1]>] >, + InstrItinData<IIC_PUSH_MEM, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_PUSH_REG, [InstrStage<1, [Port0], 0>, + InstrStage<1, [Port1]>] >, + InstrItinData<IIC_PUSH_F, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_PUSH_A, [InstrStage<8, [Port0, Port1]>] >, + + InstrItinData<IIC_BSWAP, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_BSF, [InstrStage<16, [Port0, Port1]>] >, + InstrItinData<IIC_BSR, [InstrStage<16, [Port0, Port1]>] >, + InstrItinData<IIC_MOVS, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_STOS, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SCAS, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_CMPS, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_MOV, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MOV_MEM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_AHF, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_BT_MI, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_BT_MR, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_BT_RI, [InstrStage<1, [Port1]>] >, + InstrItinData<IIC_BT_RR, [InstrStage<1, [Port1]>] >, + InstrItinData<IIC_BTX_MI, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_BTX_MR, [InstrStage<11, [Port0, Port1]>] >, + InstrItinData<IIC_BTX_RI, [InstrStage<1, [Port1]>] >, + InstrItinData<IIC_BTX_RR, [InstrStage<1, [Port1]>] >, + InstrItinData<IIC_XCHG_REG, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_XCHG_MEM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_XADD_REG, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_XADD_MEM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_CMPXCHG_MEM, [InstrStage<14, [Port0, Port1]>] >, + InstrItinData<IIC_CMPXCHG_REG, [InstrStage<15, [Port0, Port1]>] >, + InstrItinData<IIC_CMPXCHG_MEM8, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_CMPXCHG_REG8, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_CMPXCHG_8B, [InstrStage<18, [Port0, Port1]>] >, + InstrItinData<IIC_CMPXCHG_16B, [InstrStage<22, [Port0, Port1]>] >, + InstrItinData<IIC_LODS, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_OUTS, [InstrStage<74, [Port0, Port1]>] >, + InstrItinData<IIC_CLC, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_CLD, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_CLI, [InstrStage<14, [Port0, Port1]>] >, + InstrItinData<IIC_CMC, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_CLTS, [InstrStage<33, [Port0, Port1]>] >, + InstrItinData<IIC_STC, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_STI, [InstrStage<17, [Port0, Port1]>] >, + InstrItinData<IIC_STD, [InstrStage<21, [Port0, Port1]>] >, + InstrItinData<IIC_XLAT, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_AAA, [InstrStage<13, [Port0, Port1]>] >, + InstrItinData<IIC_AAD, [InstrStage<7, [Port0, 
Port1]>] >, + InstrItinData<IIC_AAM, [InstrStage<21, [Port0, Port1]>] >, + InstrItinData<IIC_AAS, [InstrStage<13, [Port0, Port1]>] >, + InstrItinData<IIC_DAA, [InstrStage<18, [Port0, Port1]>] >, + InstrItinData<IIC_DAS, [InstrStage<20, [Port0, Port1]>] >, + InstrItinData<IIC_BOUND, [InstrStage<11, [Port0, Port1]>] >, + InstrItinData<IIC_ARPL_REG, [InstrStage<24, [Port0, Port1]>] >, + InstrItinData<IIC_ARPL_MEM, [InstrStage<23, [Port0, Port1]>] >, + InstrItinData<IIC_MOVBE, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_NOP, [InstrStage<1, [Port0, Port1]>] > ]>; +// Atom machine model. +def AtomModel : SchedMachineModel { + let IssueWidth = 2; // Allows 2 instructions per scheduling group. + let MinLatency = 1; // InstrStage cycles overrides MinLatency. + // OperandCycles may be used for expected latency. + let LoadLatency = 3; // Expected cycles, may be overridden by OperandCycles. + let HighLatency = 30; // Expected, may be overridden by OperandCycles. + + let Itineraries = AtomItineraries; +} diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index 9a04e35..7c6788f 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -62,13 +62,15 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, Args.push_back(Entry); Entry.Node = Size; Args.push_back(Entry); - std::pair<SDValue,SDValue> CallResult = - TLI.LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()), + TargetLowering:: + CallLoweringInfo CLI(Chain, Type::getVoidTy(*DAG.getContext()), false, false, false, false, 0, CallingConv::C, /*isTailCall=*/false, /*doesNotRet=*/false, /*isReturnValueUsed=*/false, DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl); + std::pair<SDValue,SDValue> CallResult = + TLI.LowerCallTo(CLI); return CallResult.second; } diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index ed1a409..e6e9c56 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -196,33 +196,32 @@ void X86Subtarget::AutoDetectSubtargetFeatures() { if ((ECX >> 9) & 1) { X86SSELevel = SSSE3; ToggleFeature(X86::FeatureSSSE3);} if ((ECX >> 19) & 1) { X86SSELevel = SSE41; ToggleFeature(X86::FeatureSSE41);} if ((ECX >> 20) & 1) { X86SSELevel = SSE42; ToggleFeature(X86::FeatureSSE42);} - // FIXME: AVX codegen support is not ready.
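The AutoDetectSubtargetFeatures hunk continuing below replaces several vendor-gated checks with plain CPUID bit tests, including the newly enabled AVX check on bit 28 of leaf 1's ECX. A standalone sketch of the same bit tests, assuming GCC or Clang on x86 (<cpuid.h> and __get_cpuid are compiler-specific):

// Reads CPUID leaf 1 and prints the ECX feature bits tested in this hunk.
#include <cpuid.h>
#include <cstdio>

int main() {
  unsigned EAX, EBX, ECX, EDX;
  if (!__get_cpuid(1, &EAX, &EBX, &ECX, &EDX))
    return 1;  // leaf 1 unsupported
  // CPUID.1:ECX feature bits, matching the shifts used in the hunk:
  std::printf("PCLMULQDQ: %u\n", (ECX >> 1) & 1);
  std::printf("FMA:       %u\n", (ECX >> 12) & 1);
  std::printf("POPCNT:    %u\n", (ECX >> 23) & 1);
  std::printf("AES:       %u\n", (ECX >> 25) & 1);
  std::printf("AVX:       %u\n", (ECX >> 28) & 1);
  std::printf("F16C:      %u\n", (ECX >> 29) & 1);
}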
- //if ((ECX >> 28) & 1) { X86SSELevel = AVX; ToggleFeature(X86::FeatureAVX); } + if ((ECX >> 28) & 1) { X86SSELevel = AVX; ToggleFeature(X86::FeatureAVX); } bool IsIntel = memcmp(text.c, "GenuineIntel", 12) == 0; bool IsAMD = !IsIntel && memcmp(text.c, "AuthenticAMD", 12) == 0; - if (IsIntel && ((ECX >> 1) & 0x1)) { - HasCLMUL = true; - ToggleFeature(X86::FeatureCLMUL); + if ((ECX >> 1) & 0x1) { + HasPCLMUL = true; + ToggleFeature(X86::FeaturePCLMUL); } - if (IsIntel && ((ECX >> 12) & 0x1)) { - HasFMA3 = true; - ToggleFeature(X86::FeatureFMA3); + if ((ECX >> 12) & 0x1) { + HasFMA = true; + ToggleFeature(X86::FeatureFMA); } if (IsIntel && ((ECX >> 22) & 0x1)) { HasMOVBE = true; ToggleFeature(X86::FeatureMOVBE); } - if (IsIntel && ((ECX >> 23) & 0x1)) { + if ((ECX >> 23) & 0x1) { HasPOPCNT = true; ToggleFeature(X86::FeaturePOPCNT); } - if (IsIntel && ((ECX >> 25) & 0x1)) { + if ((ECX >> 25) & 0x1) { HasAES = true; ToggleFeature(X86::FeatureAES); } - if (IsIntel && ((ECX >> 29) & 0x1)) { + if ((ECX >> 29) & 0x1) { HasF16C = true; ToggleFeature(X86::FeatureF16C); } @@ -254,8 +253,12 @@ void X86Subtarget::AutoDetectSubtargetFeatures() { } // Set processor type. Currently only Atom is detected. - if (Family == 6 && Model == 28) { + if (Family == 6 && + (Model == 28 || Model == 38 || Model == 39 + || Model == 53 || Model == 54)) { X86ProcFamily = IntelAtom; + + UseLeaForSP = true; ToggleFeature(X86::FeatureLeaForSP); } @@ -289,9 +292,9 @@ void X86Subtarget::AutoDetectSubtargetFeatures() { } } - if (IsIntel && MaxLevel >= 7) { + if (MaxLevel >= 7) { if (!X86_MC::GetCpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX)) { - if (EBX & 0x1) { + if (IsIntel && (EBX & 0x1)) { HasFSGSBase = true; ToggleFeature(X86::FeatureFSGSBase); } @@ -299,12 +302,11 @@ void X86Subtarget::AutoDetectSubtargetFeatures() { HasBMI = true; ToggleFeature(X86::FeatureBMI); } - // FIXME: AVX2 codegen support is not ready. - //if ((EBX >> 5) & 0x1) { - // X86SSELevel = AVX2; - // ToggleFeature(X86::FeatureAVX2); - //} - if ((EBX >> 8) & 0x1) { + if (IsIntel && ((EBX >> 5) & 0x1)) { + X86SSELevel = AVX2; + ToggleFeature(X86::FeatureAVX2); + } + if (IsIntel && ((EBX >> 8) & 0x1)) { HasBMI2 = true; ToggleFeature(X86::FeatureBMI2); } @@ -325,8 +327,8 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, , HasPOPCNT(false) , HasSSE4A(false) , HasAES(false) - , HasCLMUL(false) - , HasFMA3(false) + , HasPCLMUL(false) + , HasFMA(false) , HasFMA4(false) , HasXOP(false) , HasMOVBE(false) @@ -424,9 +426,7 @@ bool X86Subtarget::enablePostRAScheduler( CodeGenOpt::Level OptLevel, TargetSubtargetInfo::AntiDepBreakMode& Mode, RegClassVector& CriticalPathRCs) const { - //TODO: change back to ANTIDEP_CRITICAL when the - // X86 subtarget properly sets up post RA liveness. 
- Mode = TargetSubtargetInfo::ANTIDEP_NONE; + Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL; CriticalPathRCs.clear(); return PostRAScheduler && OptLevel >= CodeGenOpt::Default; } diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 7fd832b..1af585f 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -85,11 +85,11 @@ protected: /// HasAES - Target has AES instructions bool HasAES; - /// HasCLMUL - Target has carry-less multiplication - bool HasCLMUL; + /// HasPCLMUL - Target has carry-less multiplication + bool HasPCLMUL; - /// HasFMA3 - Target has 3-operand fused multiply-add - bool HasFMA3; + /// HasFMA - Target has 3-operand fused multiply-add + bool HasFMA; /// HasFMA4 - Target has 4-operand fused multiply-add bool HasFMA4; @@ -203,8 +203,8 @@ public: bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; } bool hasPOPCNT() const { return HasPOPCNT; } bool hasAES() const { return HasAES; } - bool hasCLMUL() const { return HasCLMUL; } - bool hasFMA3() const { return HasFMA3; } + bool hasPCLMUL() const { return HasPCLMUL; } + bool hasFMA() const { return HasFMA; } bool hasFMA4() const { return HasFMA4; } bool hasXOP() const { return HasXOP; } bool hasMOVBE() const { return HasMOVBE; } @@ -307,6 +307,8 @@ public: TargetSubtargetInfo::AntiDepBreakMode& Mode, RegClassVector& CriticalPathRCs) const; + bool postRAScheduler() const { return PostRAScheduler; } + /// getInstrItins = Return the instruction itineraries based on the /// subtarget selection. const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index f4b7a62..b7ba568 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -140,39 +140,48 @@ public: } // namespace TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) { - return new X86PassConfig(this, PM); + X86PassConfig *PC = new X86PassConfig(this, PM); + + if (Subtarget.hasCMov()) + PC->enablePass(&EarlyIfConverterID); + + return PC; } bool X86PassConfig::addInstSelector() { // Install an instruction selector. - PM.add(createX86ISelDag(getX86TargetMachine(), getOptLevel())); + addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel())); + + // For ELF, cleanup any local-dynamic TLS accesses. + if (getX86Subtarget().isTargetELF() && getOptLevel() != CodeGenOpt::None) + addPass(createCleanupLocalDynamicTLSPass()); // For 32-bit, prepend instructions to set the "global base reg" for PIC. if (!getX86Subtarget().is64Bit()) - PM.add(createGlobalBaseRegPass()); + addPass(createGlobalBaseRegPass()); return false; } bool X86PassConfig::addPreRegAlloc() { - PM.add(createX86MaxStackAlignmentHeuristicPass()); + addPass(createX86MaxStackAlignmentHeuristicPass()); return false; // -print-machineinstr shouldn't print after this. } bool X86PassConfig::addPostRegAlloc() { - PM.add(createX86FloatingPointStackifierPass()); + addPass(createX86FloatingPointStackifierPass()); return true; // -print-machineinstr should print after this. 
} bool X86PassConfig::addPreEmitPass() { bool ShouldPrint = false; if (getOptLevel() != CodeGenOpt::None && getX86Subtarget().hasSSE2()) { - PM.add(createExecutionDependencyFixPass(&X86::VR128RegClass)); + addPass(createExecutionDependencyFixPass(&X86::VR128RegClass)); ShouldPrint = true; } if (getX86Subtarget().hasAVX() && UseVZeroUpper) { - PM.add(createX86IssueVZeroUpperPass()); + addPass(createX86IssueVZeroUpperPass()); ShouldPrint = true; } diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp index 718f35e..92aee0d 100644 --- a/lib/Target/X86/X86TargetObjectFile.cpp +++ b/lib/Target/X86/X86TargetObjectFile.cpp @@ -9,16 +9,19 @@ #include "X86TargetObjectFile.h" #include "X86TargetMachine.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/Target/Mangler.h" #include "llvm/Support/Dwarf.h" +#include "llvm/Support/ELF.h" using namespace llvm; using namespace dwarf; -const MCExpr *X8664_MachoTargetObjectFile:: +const MCExpr *X86_64MachoTargetObjectFile:: getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, MachineModuleInfo *MMI, unsigned Encoding, MCStreamer &Streamer) const { @@ -37,8 +40,14 @@ getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, getExprForDwarfGlobalReference(GV, Mang, MMI, Encoding, Streamer); } -MCSymbol *X8664_MachoTargetObjectFile:: +MCSymbol *X86_64MachoTargetObjectFile:: getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang, MachineModuleInfo *MMI) const { return Mang->getSymbol(GV); } + +void +X86LinuxTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); +} diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h index a02a368..2d320c5 100644 --- a/lib/Target/X86/X86TargetObjectFile.h +++ b/lib/Target/X86/X86TargetObjectFile.h @@ -16,9 +16,9 @@ namespace llvm { - /// X8664_MachoTargetObjectFile - This TLOF implementation is used for Darwin + /// X86_64MachoTargetObjectFile - This TLOF implementation is used for Darwin /// x86-64. - class X8664_MachoTargetObjectFile : public TargetLoweringObjectFileMachO { + class X86_64MachoTargetObjectFile : public TargetLoweringObjectFileMachO { public: virtual const MCExpr * getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, @@ -32,6 +32,12 @@ namespace llvm { MachineModuleInfo *MMI) const; }; + /// X86LinuxTargetObjectFile - This implementation is used for linux x86 + /// and x86-64. + class X86LinuxTargetObjectFile : public TargetLoweringObjectFileELF { + virtual void Initialize(MCContext &Ctx, const TargetMachine &TM); + }; + } // end namespace llvm #endif diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp index 2fd78a7..e4f567f 100644 --- a/lib/Target/X86/X86VZeroUpper.cpp +++ b/lib/Target/X86/X86VZeroUpper.cpp @@ -145,7 +145,7 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { // to insert any VZEROUPPER instructions. This is constant-time, so it is // cheap in the common case of no ymm use. 
bool YMMUsed = false; - const TargetRegisterClass *RC = X86::VR256RegisterClass; + const TargetRegisterClass *RC = &X86::VR256RegClass; for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e; i++) { if (MRI.isPhysRegUsed(*i)) { @@ -205,7 +205,7 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF, } - // The entry MBB for the function may set the inital state to dirty if + // The entry MBB for the function may set the initial state to dirty if // the function receives any YMM incoming arguments if (MBB == MF.begin()) { EntryState = ST_CLEAN; diff --git a/lib/Target/XCore/CMakeLists.txt b/lib/Target/XCore/CMakeLists.txt index 0d59572..ca94f03 100644 --- a/lib/Target/XCore/CMakeLists.txt +++ b/lib/Target/XCore/CMakeLists.txt @@ -22,5 +22,7 @@ add_llvm_target(XCoreCodeGen XCoreSelectionDAGInfo.cpp ) +add_dependencies(LLVMXCoreCodeGen intrinsics_gen) + add_subdirectory(TargetInfo) add_subdirectory(MCTargetDesc) diff --git a/lib/Target/XCore/XCoreAsmPrinter.cpp b/lib/Target/XCore/XCoreAsmPrinter.cpp index 8906b24..c76866f 100644 --- a/lib/Target/XCore/XCoreAsmPrinter.cpp +++ b/lib/Target/XCore/XCoreAsmPrinter.cpp @@ -18,9 +18,9 @@ #include "XCoreSubtarget.h" #include "XCoreTargetMachine.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" #include "llvm/DerivedTypes.h" #include "llvm/Module.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -260,7 +260,17 @@ void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum, bool XCoreAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant,const char *ExtraCode, raw_ostream &O) { - printOperand(MI, OpNo, O); + // Does this asm operand have a single letter operand modifier? + if (ExtraCode && ExtraCode[0]) + if (ExtraCode[1] != 0) return true; // Unknown modifier. 
+ + switch (ExtraCode[0]) { + default: + // See if this is a generic print operand + return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); + } + +printOperand(MI, OpNo, O); return false; } diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp index 50fda58..3dbc3b9 100644 --- a/lib/Target/XCore/XCoreFrameLowering.cpp +++ b/lib/Target/XCore/XCoreFrameLowering.cpp @@ -78,8 +78,7 @@ static void storeToStack(MachineBasicBlock &MBB, //===----------------------------------------------------------------------===// XCoreFrameLowering::XCoreFrameLowering(const XCoreSubtarget &sti) - : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 4, 0), - STI(sti) { + : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 4, 0) { // Do nothing } @@ -341,7 +340,7 @@ XCoreFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, MachineFrameInfo *MFI = MF.getFrameInfo(); const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo(); bool LRUsed = MF.getRegInfo().isPhysRegUsed(XCore::LR); - const TargetRegisterClass *RC = XCore::GRRegsRegisterClass; + const TargetRegisterClass *RC = &XCore::GRRegsRegClass; XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>(); if (LRUsed) { MF.getRegInfo().setPhysRegUnused(XCore::LR); diff --git a/lib/Target/XCore/XCoreFrameLowering.h b/lib/Target/XCore/XCoreFrameLowering.h index 4c51aa5..afa2773 100644 --- a/lib/Target/XCore/XCoreFrameLowering.h +++ b/lib/Target/XCore/XCoreFrameLowering.h @@ -22,7 +22,6 @@ namespace llvm { class XCoreSubtarget; class XCoreFrameLowering: public TargetFrameLowering { - const XCoreSubtarget &STI; public: XCoreFrameLowering(const XCoreSubtarget &STI); diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp index fdf2b78..8643ffc 100644 --- a/lib/Target/XCore/XCoreISelLowering.cpp +++ b/lib/Target/XCore/XCoreISelLowering.cpp @@ -66,7 +66,7 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM) Subtarget(*XTM.getSubtargetImpl()) { // Set up the register classes. 
- addRegisterClass(MVT::i32, XCore::GRRegsRegisterClass); + addRegisterClass(MVT::i32, &XCore::GRRegsRegClass); // Compute derived properties from the register classes computeRegisterProperties(); @@ -485,12 +485,12 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const { Entry.Node = BasePtr; Args.push_back(Entry); - std::pair<SDValue, SDValue> CallResult = - LowerCallTo(Chain, IntPtrTy, false, false, + TargetLowering::CallLoweringInfo CLI(Chain, IntPtrTy, false, false, false, false, 0, CallingConv::C, /*isTailCall=*/false, /*doesNotRet=*/false, /*isReturnValueUsed=*/true, DAG.getExternalSymbol("__misaligned_load", getPointerTy()), Args, DAG, DL); + std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); SDValue Ops[] = { CallResult.first, CallResult.second }; @@ -547,12 +547,13 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const Entry.Node = Value; Args.push_back(Entry); - std::pair<SDValue, SDValue> CallResult = - LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()), false, false, + TargetLowering::CallLoweringInfo CLI(Chain, + Type::getVoidTy(*DAG.getContext()), false, false, false, false, 0, CallingConv::C, /*isTailCall=*/false, /*doesNotRet=*/false, /*isReturnValueUsed=*/true, DAG.getExternalSymbol("__misaligned_store", getPointerTy()), Args, DAG, dl); + std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); return CallResult.second; } @@ -873,14 +874,19 @@ LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const { /// XCore call implementation SDValue -XCoreTargetLowering::LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, - bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, +XCoreTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + DebugLoc &dl = CLI.DL; + SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; + SmallVector<SDValue, 32> &OutVals = CLI.OutVals; + SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &isTailCall = CLI.IsTailCall; + CallingConv::ID CallConv = CLI.CallConv; + bool isVarArg = CLI.IsVarArg; + // XCore target does not yet support tail call optimization. isTailCall = false; @@ -913,7 +919,7 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); // The ABI dictates there should be one stack slot available to the callee // on function entry (for saving lr). @@ -1036,7 +1042,7 @@ XCoreTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, // Assign locations to each value returned by this call. SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_XCore); @@ -1096,7 +1102,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain, // Assign locations to all of the incoming arguments. 
SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeFormalArguments(Ins, CC_XCore); @@ -1121,8 +1127,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain, llvm_unreachable(0); } case MVT::i32: - unsigned VReg = RegInfo.createVirtualRegister( - XCore::GRRegsRegisterClass); + unsigned VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass); RegInfo.addLiveIn(VA.getLocReg(), VReg); InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT)); } @@ -1172,8 +1177,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain, offset -= StackSlotSize; SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); // Move argument from phys reg -> virt reg - unsigned VReg = RegInfo.createVirtualRegister( - XCore::GRRegsRegisterClass); + unsigned VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass); RegInfo.addLiveIn(ArgRegs[i], VReg); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); // Move argument from virt reg -> stack @@ -1201,7 +1205,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain, bool XCoreTargetLowering:: CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, - bool isVarArg, + bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { SmallVector<CCValAssign, 16> RVLocs; @@ -1222,7 +1226,7 @@ XCoreTargetLowering::LowerReturn(SDValue Chain, // CCState - Info about the registers and stack slot. CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); // Analyze return values. CCInfo.AnalyzeReturn(Outs, RetCC_XCore); @@ -1606,12 +1610,12 @@ XCoreTargetLowering::isLegalAddressingMode(const AddrMode &AM, std::pair<unsigned, const TargetRegisterClass*> XCoreTargetLowering:: getRegForInlineAsmConstraint(const std::string &Constraint, - EVT VT) const { + EVT VT) const { if (Constraint.size() == 1) { switch (Constraint[0]) { default : break; case 'r': - return std::make_pair(0U, XCore::GRRegsRegisterClass); + return std::make_pair(0U, &XCore::GRRegsRegClass); } } // Use the default implementation in TargetLowering to convert the register diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h index 0b63ecd..2874f00 100644 --- a/lib/Target/XCore/XCoreISelLowering.h +++ b/lib/Target/XCore/XCoreISelLowering.h @@ -151,7 +151,7 @@ namespace llvm { // Inline asm support std::pair<unsigned, const TargetRegisterClass*> getRegForInlineAsmConstraint(const std::string &Constraint, - EVT VT) const; + EVT VT) const; // Expand specifics SDValue TryExpandADDWithMul(SDNode *Op, SelectionDAG &DAG) const; @@ -174,12 +174,7 @@ namespace llvm { SmallVectorImpl<SDValue> &InVals) const; virtual SDValue - LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, - bool isVarArg, bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const; virtual SDValue @@ -191,7 +186,7 @@ namespace llvm { virtual bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, - bool isVarArg, + bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &ArgsFlags, LLVMContext &Context) const; }; diff --git a/lib/Target/XCore/XCoreInstrInfo.td 
b/lib/Target/XCore/XCoreInstrInfo.td index b25a08d..ae646a2 100644 --- a/lib/Target/XCore/XCoreInstrInfo.td +++ b/lib/Target/XCore/XCoreInstrInfo.td @@ -741,14 +741,12 @@ let isCall=1, // All calls clobber the link register and the non-callee-saved registers: Defs = [R0, R1, R2, R3, R11, LR], Uses = [SP] in { def BL_u10 : _FU10< - (outs), - (ins calltarget:$target, variable_ops), + (outs), (ins calltarget:$target), "bl $target", [(XCoreBranchLink immU10:$target)]>; def BL_lu10 : _FLU10< - (outs), - (ins calltarget:$target, variable_ops), + (outs), (ins calltarget:$target), "bl $target", [(XCoreBranchLink immU20:$target)]>; } @@ -796,7 +794,7 @@ def MKMSK_rus : _FRUS<(outs GRRegs:$dst), (ins i32imm:$size), def MKMSK_2r : _FRUS<(outs GRRegs:$dst), (ins GRRegs:$size), "mkmsk $dst, $size", - [(set GRRegs:$dst, (add (shl 1, GRRegs:$size), 0xffffffff))]>; + [(set GRRegs:$dst, (add (shl 1, GRRegs:$size), -1))]>; def GETR_rus : _FRUS<(outs GRRegs:$dst), (ins i32imm:$type), "getr $dst, $type", @@ -950,10 +948,10 @@ def ENDIN_l2r : _FL2R<(outs GRRegs:$dst), (ins GRRegs:$src), // dgetreg def MSYNC_1r : _F1R<(outs), (ins GRRegs:$i), "msync res[$i]", - [(int_xcore_msync GRRegs:$i)]>; + [(int_xcore_msync GRRegs:$i)]>; def MJOIN_1r : _F1R<(outs), (ins GRRegs:$i), "mjoin res[$i]", - [(int_xcore_mjoin GRRegs:$i)]>; + [(int_xcore_mjoin GRRegs:$i)]>; let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in def BAU_1r : _F1R<(outs), (ins GRRegs:$addr), @@ -988,7 +986,7 @@ def ECALLF_1r : _F1R<(outs), (ins GRRegs:$src), let isCall=1, // All calls clobber the link register and the non-callee-saved registers: Defs = [R0, R1, R2, R3, R11, LR], Uses = [SP] in { -def BLA_1r : _F1R<(outs), (ins GRRegs:$addr, variable_ops), +def BLA_1r : _F1R<(outs), (ins GRRegs:$addr), "bla $addr", [(XCoreBranchLink GRRegs:$addr)]>; } @@ -1038,7 +1036,7 @@ def GETET_0R : _F0R<(outs), (ins), def SSYNC_0r : _F0R<(outs), (ins), "ssync", - [(int_xcore_ssync)]>; + [(int_xcore_ssync)]>; let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1, hasSideEffects = 1 in diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp index f3b4b4c..cdd0a08 100644 --- a/lib/Target/XCore/XCoreRegisterInfo.cpp +++ b/lib/Target/XCore/XCoreRegisterInfo.cpp @@ -92,6 +92,11 @@ XCoreRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const { } bool +XCoreRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const { + return requiresRegisterScavenging(MF); +} + +bool XCoreRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const { return false; } @@ -205,8 +210,7 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, unsigned Reg = MI.getOperand(0).getReg(); bool isKill = MI.getOpcode() == XCore::STWFI && MI.getOperand(0).isKill(); - assert(XCore::GRRegsRegisterClass->contains(Reg) && - "Unexpected register operand"); + assert(XCore::GRRegsRegClass.contains(Reg) && "Unexpected register operand"); MachineBasicBlock &MBB = *MI.getParent(); @@ -217,7 +221,7 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, if (!RS) report_fatal_error("eliminateFrameIndex Frame size too big: " + Twine(Offset)); - unsigned ScratchReg = RS->scavengeRegister(XCore::GRRegsRegisterClass, II, + unsigned ScratchReg = RS->scavengeRegister(&XCore::GRRegsRegClass, II, SPAdj); loadConstant(MBB, II, ScratchReg, Offset, dl); switch (MI.getOpcode()) { diff --git a/lib/Target/XCore/XCoreRegisterInfo.h b/lib/Target/XCore/XCoreRegisterInfo.h index 
7391cfd..c4dcb6b 100644 --- a/lib/Target/XCore/XCoreRegisterInfo.h +++ b/lib/Target/XCore/XCoreRegisterInfo.h @@ -50,6 +50,8 @@ public: bool requiresRegisterScavenging(const MachineFunction &MF) const; + bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const; + bool useFPForScavengingIndex(const MachineFunction &MF) const; void eliminateCallFramePseudoInstr(MachineFunction &MF, diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp index f65297e..11ec86b 100644 --- a/lib/Target/XCore/XCoreTargetMachine.cpp +++ b/lib/Target/XCore/XCoreTargetMachine.cpp @@ -55,7 +55,7 @@ TargetPassConfig *XCoreTargetMachine::createPassConfig(PassManagerBase &PM) { } bool XCorePassConfig::addInstSelector() { - PM.add(createXCoreISelDag(getXCoreTargetMachine(), getOptLevel())); + addPass(createXCoreISelDag(getXCoreTargetMachine(), getOptLevel())); return false; } diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp index e160f63..b94dd69 100644 --- a/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -245,10 +245,7 @@ static bool IsPrefix(const ArgPromotion::IndicesVector &Prefix, const ArgPromotion::IndicesVector &Longer) { if (Prefix.size() > Longer.size()) return false; - for (unsigned i = 0, e = Prefix.size(); i != e; ++i) - if (Prefix[i] != Longer[i]) - return false; - return true; + return std::equal(Prefix.begin(), Prefix.end(), Longer.begin()); } @@ -616,8 +613,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, // Recompute the parameter attributes list based on the new arguments for // the function. - NF->setAttributes(AttrListPtr::get(AttributesVec.begin(), - AttributesVec.end())); + NF->setAttributes(AttrListPtr::get(AttributesVec)); AttributesVec.clear(); F->getParent()->getFunctionList().insert(F, NF); @@ -734,13 +730,11 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(), Args, "", Call); cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv()); - cast<InvokeInst>(New)->setAttributes(AttrListPtr::get(AttributesVec.begin(), - AttributesVec.end())); + cast<InvokeInst>(New)->setAttributes(AttrListPtr::get(AttributesVec)); } else { New = CallInst::Create(NF, Args, "", Call); cast<CallInst>(New)->setCallingConv(CS.getCallingConv()); - cast<CallInst>(New)->setAttributes(AttrListPtr::get(AttributesVec.begin(), - AttributesVec.end())); + cast<CallInst>(New)->setAttributes(AttrListPtr::get(AttributesVec)); if (cast<CallInst>(Call)->isTailCall()) cast<CallInst>(New)->setTailCall(); } diff --git a/lib/Transforms/IPO/CMakeLists.txt b/lib/Transforms/IPO/CMakeLists.txt index 58b3551..3f6b1de 100644 --- a/lib/Transforms/IPO/CMakeLists.txt +++ b/lib/Transforms/IPO/CMakeLists.txt @@ -20,3 +20,5 @@ add_llvm_library(LLVMipo StripDeadPrototypes.cpp StripSymbols.cpp ) + +add_dependencies(LLVMipo intrinsics_gen) diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp index 95aef27..fd23a93 100644 --- a/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -238,7 +238,7 @@ bool DAE::DeleteDeadVarargs(Function &Fn) { AttributesVec.push_back(PAL.getSlot(i)); if (Attributes FnAttrs = PAL.getFnAttributes()) AttributesVec.push_back(AttributeWithIndex::get(~0, FnAttrs)); - PAL = AttrListPtr::get(AttributesVec.begin(), AttributesVec.end()); + PAL = AttrListPtr::get(AttributesVec); } Instruction *New; @@ -753,8 
+753,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { AttributesVec.push_back(AttributeWithIndex::get(~0, FnAttrs)); // Reconstruct the AttributesList based on the vector we constructed. - AttrListPtr NewPAL = AttrListPtr::get(AttributesVec.begin(), - AttributesVec.end()); + AttrListPtr NewPAL = AttrListPtr::get(AttributesVec); // Create the new function type based on the recomputed parameters. FunctionType *NFTy = FunctionType::get(NRetTy, Params, FTy->isVarArg()); @@ -816,8 +815,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { AttributesVec.push_back(AttributeWithIndex::get(~0, FnAttrs)); // Reconstruct the AttributesList based on the vector we constructed. - AttrListPtr NewCallPAL = AttrListPtr::get(AttributesVec.begin(), - AttributesVec.end()); + AttrListPtr NewCallPAL = AttrListPtr::get(AttributesVec); Instruction *New; if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { diff --git a/lib/Transforms/IPO/ExtractGV.cpp b/lib/Transforms/IPO/ExtractGV.cpp index d9911bf..4c7f0ed 100644 --- a/lib/Transforms/IPO/ExtractGV.cpp +++ b/lib/Transforms/IPO/ExtractGV.cpp @@ -53,12 +53,12 @@ namespace { I != E; ++I) { if (deleteStuff == (bool)Named.count(I) && !I->isDeclaration()) { I->setInitializer(0); - } else { - if (I->hasAvailableExternallyLinkage()) - continue; - if (I->getName() == "llvm.global_ctors") - continue; - } + } else { + if (I->hasAvailableExternallyLinkage()) + continue; + if (I->getName() == "llvm.global_ctors") + continue; + } if (I->hasLocalLinkage()) I->setVisibility(GlobalValue::HiddenVisibility); @@ -69,10 +69,10 @@ namespace { for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { if (deleteStuff == (bool)Named.count(I) && !I->isDeclaration()) { I->deleteBody(); - } else { - if (I->hasAvailableExternallyLinkage()) - continue; - } + } else { + if (I->hasAvailableExternallyLinkage()) + continue; + } if (I->hasLocalLinkage()) I->setVisibility(GlobalValue::HiddenVisibility); diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp index 2b427aa..18c1c7b 100644 --- a/lib/Transforms/IPO/GlobalDCE.cpp +++ b/lib/Transforms/IPO/GlobalDCE.cpp @@ -65,7 +65,7 @@ bool GlobalDCE::runOnModule(Module &M) { for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { Changed |= RemoveUnusedGlobalValue(*I); // Functions with external linkage are needed if they have a body - if (!I->hasLocalLinkage() && !I->hasLinkOnceLinkage() && + if (!I->isDiscardableIfUnused() && !I->isDeclaration() && !I->hasAvailableExternallyLinkage()) GlobalIsNeeded(I); } @@ -75,7 +75,7 @@ bool GlobalDCE::runOnModule(Module &M) { Changed |= RemoveUnusedGlobalValue(*I); // Externally visible & appending globals are needed, if they have an // initializer. - if (!I->hasLocalLinkage() && !I->hasLinkOnceLinkage() && + if (!I->isDiscardableIfUnused() && !I->isDeclaration() && !I->hasAvailableExternallyLinkage()) GlobalIsNeeded(I); } @@ -84,7 +84,7 @@ bool GlobalDCE::runOnModule(Module &M) { I != E; ++I) { Changed |= RemoveUnusedGlobalValue(*I); // Externally visible aliases are needed. 
- if (!I->hasLocalLinkage() && !I->hasLinkOnceLinkage()) + if (!I->isDiscardableIfUnused()) GlobalIsNeeded(I); } diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index 1522aa4..60ce958 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -254,6 +254,8 @@ static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS, GS.StoredType = GlobalStatus::isStored; } } + } else if (isa<BitCastInst>(I)) { + if (AnalyzeGlobal(I, GS, PHIUsers)) return true; } else if (isa<GetElementPtrInst>(I)) { if (AnalyzeGlobal(I, GS, PHIUsers)) return true; } else if (isa<SelectInst>(I)) { @@ -294,6 +296,165 @@ static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS, return false; } +/// isLeakCheckerRoot - Is this global variable possibly used by a leak checker +/// as a root? If so, we might not really want to eliminate the stores to it. +static bool isLeakCheckerRoot(GlobalVariable *GV) { + // A global variable is a root if it is a pointer, or could plausibly contain + // a pointer. There are two challenges; one is that we could have a struct + // that has an inner member which is a pointer. We recurse through the type to + // detect these (up to a point). The other is that we may actually be a union + // of a pointer and another type, and so our LLVM type is an integer which + // gets converted into a pointer, or our type is an [i8 x #] with a pointer + // potentially contained here. + + if (GV->hasPrivateLinkage()) + return false; + + SmallVector<Type *, 4> Types; + Types.push_back(cast<PointerType>(GV->getType())->getElementType()); + + unsigned Limit = 20; + do { + Type *Ty = Types.pop_back_val(); + switch (Ty->getTypeID()) { + default: break; + case Type::PointerTyID: return true; + case Type::ArrayTyID: + case Type::VectorTyID: { + SequentialType *STy = cast<SequentialType>(Ty); + Types.push_back(STy->getElementType()); + break; + } + case Type::StructTyID: { + StructType *STy = cast<StructType>(Ty); + if (STy->isOpaque()) return true; + for (StructType::element_iterator I = STy->element_begin(), + E = STy->element_end(); I != E; ++I) { + Type *InnerTy = *I; + if (isa<PointerType>(InnerTy)) return true; + if (isa<CompositeType>(InnerTy)) + Types.push_back(InnerTy); + } + break; + } + } + if (--Limit == 0) return true; + } while (!Types.empty()); + return false; +} + +/// Given a value that is stored to a global but never read, determine whether +/// it's safe to remove the store and the chain of computation that feeds the +/// store. +static bool IsSafeComputationToRemove(Value *V) { + do { + if (isa<Constant>(V)) + return true; + if (!V->hasOneUse()) + return false; + if (isa<LoadInst>(V) || isa<Argument>(V) || isa<GlobalValue>(V)) + return false; + if (isAllocationFn(V)) + return true; + + Instruction *I = cast<Instruction>(V); + if (I->mayHaveSideEffects()) + return false; + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) { + if (!GEP->hasAllConstantIndices()) + return false; + } else if (I->getNumOperands() != 1) { + return false; + } + + V = I->getOperand(0); + } while (1); +} + +/// CleanupPointerRootUsers - This GV is a pointer root. Loop over all users +/// of the global and clean up any that obviously don't assign the global a +/// value that isn't dynamically allocated. +/// +static bool CleanupPointerRootUsers(GlobalVariable *GV) { + // A brief explanation of leak checkers. The goal is to find bugs where + // pointers are forgotten, causing an accumulating growth in memory + // usage over time.
The common strategy for leak checkers is to whitelist the + // memory pointed to by globals at exit. This is popular because it also + // solves another problem where the main thread of a C++ program may shut down + // before other threads that are still expecting to use those globals. To + // handle that case, we expect the program may create a singleton and never + // destroy it. + + bool Changed = false; + + // If Dead[n].first is the only use of a malloc result, we can delete its + // chain of computation and the store to the global in Dead[n].second. + SmallVector<std::pair<Instruction *, Instruction *>, 32> Dead; + + // Constants can't be pointers to dynamically allocated memory. + for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end(); + UI != E;) { + User *U = *UI++; + if (StoreInst *SI = dyn_cast<StoreInst>(U)) { + Value *V = SI->getValueOperand(); + if (isa<Constant>(V)) { + Changed = true; + SI->eraseFromParent(); + } else if (Instruction *I = dyn_cast<Instruction>(V)) { + if (I->hasOneUse()) + Dead.push_back(std::make_pair(I, SI)); + } + } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(U)) { + if (isa<Constant>(MSI->getValue())) { + Changed = true; + MSI->eraseFromParent(); + } else if (Instruction *I = dyn_cast<Instruction>(MSI->getValue())) { + if (I->hasOneUse()) + Dead.push_back(std::make_pair(I, MSI)); + } + } else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(U)) { + GlobalVariable *MemSrc = dyn_cast<GlobalVariable>(MTI->getSource()); + if (MemSrc && MemSrc->isConstant()) { + Changed = true; + MTI->eraseFromParent(); + } else if (Instruction *I = dyn_cast<Instruction>(MemSrc)) { + if (I->hasOneUse()) + Dead.push_back(std::make_pair(I, MTI)); + } + } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) { + if (CE->use_empty()) { + CE->destroyConstant(); + Changed = true; + } + } else if (Constant *C = dyn_cast<Constant>(U)) { + if (SafeToDestroyConstant(C)) { + C->destroyConstant(); + // This could have invalidated UI, start over from scratch. + Dead.clear(); + CleanupPointerRootUsers(GV); + return true; + } + } + } + + for (int i = 0, e = Dead.size(); i != e; ++i) { + if (IsSafeComputationToRemove(Dead[i].first)) { + Dead[i].second->eraseFromParent(); + Instruction *I = Dead[i].first; + do { + Instruction *J = dyn_cast<Instruction>(I->getOperand(0)); + if (!J) + break; + I->eraseFromParent(); + I = J; + } while (!isAllocationFn(I)); + I->eraseFromParent(); + } + } + + return Changed; +} + /// CleanupConstantGlobalUsers - We just marked GV constant. Loop over all /// users of the global, cleaning up the obvious ones. This is largely just a /// quick scan over the use list to clean up the easy and obvious cruft. 
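A toy restatement of the single-use chain walk that IsSafeComputationToRemove performs above, in plain C++ rather than LLVM's Value and Instruction classes; the Node fields are illustrative stand-ins:

// Walk backwards from a stored value through single-use, side-effect-free
// nodes; the chain is removable if it bottoms out at a constant or an
// allocation (the role isAllocationFn plays above).
#include <cstdio>

struct Node {
  Node *Operand;        // the single operand, as in V = I->getOperand(0)
  unsigned NumUses;
  bool IsConstant;
  bool IsAllocation;
  bool HasSideEffects;
};

static bool isSafeComputationToRemove(const Node *V) {
  for (;;) {
    if (V->IsConstant) return true;
    if (V->NumUses != 1) return false;
    if (V->IsAllocation) return true;
    if (V->HasSideEffects || !V->Operand) return false;
    V = V->Operand;     // step to the single node feeding this one
  }
}

int main() {
  // malloc -> cast -> (stored to a never-read global): safe to remove.
  Node Malloc = {nullptr, 1, false, true, false};
  Node Cast = {&Malloc, 1, false, false, false};
  std::printf("removable: %d\n", isSafeComputationToRemove(&Cast));
}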
This @@ -517,7 +678,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD) { GlobalVariable *NGV = new GlobalVariable(STy->getElementType(i), false, GlobalVariable::InternalLinkage, In, GV->getName()+"."+Twine(i), - GV->isThreadLocal(), + GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); Globals.insert(GV, NGV); NewGlobals.push_back(NGV); @@ -550,7 +711,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD) { GlobalVariable *NGV = new GlobalVariable(STy->getElementType(), false, GlobalVariable::InternalLinkage, In, GV->getName()+"."+Twine(i), - GV->isThreadLocal(), + GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); Globals.insert(GV, NGV); NewGlobals.push_back(NGV); @@ -810,13 +971,18 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV, // If we nuked all of the loads, then none of the stores are needed either, // nor is the global. if (AllNonStoreUsesGone) { - DEBUG(dbgs() << " *** GLOBAL NOW DEAD!\n"); - CleanupConstantGlobalUsers(GV, 0, TD, TLI); + if (isLeakCheckerRoot(GV)) { + Changed |= CleanupPointerRootUsers(GV); + } else { + Changed = true; + CleanupConstantGlobalUsers(GV, 0, TD, TLI); + } if (GV->use_empty()) { + DEBUG(dbgs() << " *** GLOBAL NOW DEAD!\n"); + Changed = true; GV->eraseFromParent(); ++NumDeleted; } - Changed = true; } return Changed; } @@ -866,7 +1032,7 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, UndefValue::get(GlobalType), GV->getName()+".body", GV, - GV->isThreadLocal()); + GV->getThreadLocalMode()); // If there are bitcast users of the malloc (which is typical, usually we have // a malloc + bitcast) then replace them with uses of the new global. Update @@ -899,7 +1065,7 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, new GlobalVariable(Type::getInt1Ty(GV->getContext()), false, GlobalValue::InternalLinkage, ConstantInt::getFalse(GV->getContext()), - GV->getName()+".init", GV->isThreadLocal()); + GV->getName()+".init", GV->getThreadLocalMode()); bool InitBoolUsed = false; // Loop over all uses of GV, processing them in turn. @@ -1321,7 +1487,7 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, PFieldTy, false, GlobalValue::InternalLinkage, Constant::getNullValue(PFieldTy), GV->getName() + ".f" + Twine(FieldNo), GV, - GV->isThreadLocal()); + GV->getThreadLocalMode()); FieldGlobals.push_back(NGV); unsigned TypeSize = TD->getTypeAllocSize(FieldTy); @@ -1567,8 +1733,10 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, Instruction *Cast = new BitCastInst(Malloc, CI->getType(), "tmp", CI); CI->replaceAllUsesWith(Cast); CI->eraseFromParent(); - CI = dyn_cast<BitCastInst>(Malloc) ? - extractMallocCallFromBitCast(Malloc) : cast<CallInst>(Malloc); + if (BitCastInst *BCI = dyn_cast<BitCastInst>(Malloc)) + CI = cast<CallInst>(BCI->getOperand(0)); + else + CI = cast<CallInst>(Malloc); } GVI = PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, TD, true), TD); @@ -1645,7 +1813,7 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { GlobalValue::InternalLinkage, ConstantInt::getFalse(GV->getContext()), GV->getName()+".b", - GV->isThreadLocal()); + GV->getThreadLocalMode()); GV->getParent()->getGlobalList().insert(GV, NewGV); Constant *InitVal = GV->getInitializer(); @@ -1716,7 +1884,7 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { /// possible. If we make a change, return true. 
bool GlobalOpt::ProcessGlobal(GlobalVariable *GV, Module::global_iterator &GVI) { - if (!GV->hasLocalLinkage()) + if (!GV->isDiscardableIfUnused()) return false; // Do more involved optimizations if the global is internal. @@ -1729,6 +1897,9 @@ bool GlobalOpt::ProcessGlobal(GlobalVariable *GV, return true; } + if (!GV->hasLocalLinkage()) + return false; + SmallPtrSet<const PHINode*, 16> PHIUsers; GlobalStatus GS; @@ -1787,10 +1958,15 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, if (!GS.isLoaded) { DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV); - // Delete any stores we can find to the global. We may not be able to - // make it completely dead though. - bool Changed = CleanupConstantGlobalUsers(GV, GV->getInitializer(), - TD, TLI); + bool Changed; + if (isLeakCheckerRoot(GV)) { + // Delete any constant stores to the global. + Changed = CleanupPointerRootUsers(GV); + } else { + // Delete any stores we can find to the global. We may not be able to + // make it completely dead though. + Changed = CleanupConstantGlobalUsers(GV, GV->getInitializer(), TD, TLI); + } // If the global is dead now, delete it. if (GV->use_empty()) { @@ -1838,7 +2014,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, if (GV->use_empty()) { DEBUG(dbgs() << " *** Substituting initializer allowed us to " - << "simplify all users and delete global!\n"); + << "simplify all users and delete global!\n"); GV->eraseFromParent(); ++NumDeleted; } else { @@ -1870,6 +2046,8 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, /// function, changing them to FastCC. static void ChangeCalleesToFastCall(Function *F) { for (Value::use_iterator UI = F->use_begin(), E = F->use_end(); UI != E;++UI){ + if (isa<BlockAddress>(*UI)) + continue; CallSite User(cast<Instruction>(*UI)); User.setCallingConv(CallingConv::Fast); } @@ -1890,6 +2068,8 @@ static AttrListPtr StripNest(const AttrListPtr &Attrs) { static void RemoveNestAttribute(Function *F) { F->setAttributes(StripNest(F->getAttributes())); for (Value::use_iterator UI = F->use_begin(), E = F->use_end(); UI != E;++UI){ + if (isa<BlockAddress>(*UI)) + continue; CallSite User(cast<Instruction>(*UI)); User.setAttributes(StripNest(User.getAttributes())); } @@ -2045,7 +2225,7 @@ static GlobalVariable *InstallGlobalCtors(GlobalVariable *GCL, // Create the new global and insert it next to the existing list. 
GlobalVariable *NGV = new GlobalVariable(CA->getType(), GCL->isConstant(), GCL->getLinkage(), CA, "", - GCL->isThreadLocal()); + GCL->getThreadLocalMode()); GCL->getParent()->getGlobalList().insert(GCL, NGV); NGV->takeName(GCL); @@ -2701,7 +2881,7 @@ static bool EvaluateStaticConstructor(Function *F, const TargetData *TD, << " stores.\n"); for (DenseMap<Constant*, Constant*>::const_iterator I = Eval.getMutatedMemory().begin(), E = Eval.getMutatedMemory().end(); - I != E; ++I) + I != E; ++I) CommitValueTo(I->second, I->first); for (SmallPtrSet<GlobalVariable*, 8>::const_iterator I = Eval.getInvariants().begin(), E = Eval.getInvariants().end(); diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp index dc9cbfb..712888a 100644 --- a/lib/Transforms/IPO/Inliner.cpp +++ b/lib/Transforms/IPO/Inliner.cpp @@ -36,7 +36,7 @@ STATISTIC(NumCallsDeleted, "Number of call sites deleted, not inlined"); STATISTIC(NumDeleted, "Number of functions deleted because all callers found"); STATISTIC(NumMergedAllocas, "Number of allocas merged together"); -// This weirdly named statistic tracks the number of times that, when attemting +// This weirdly named statistic tracks the number of times that, when attempting // to inline a function A into B, we analyze the callers of B in order to see // if those would be more profitable and blocked inline steps. STATISTIC(NumCallerCallersAnalyzed, "Number of caller-callers analyzed"); @@ -201,19 +201,22 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI, } unsigned Inliner::getInlineThreshold(CallSite CS) const { - int thres = InlineThreshold; + int thres = InlineThreshold; // -inline-threshold or else selected by + // overall opt level - // Listen to optsize when -inline-limit is not given. + // If -inline-threshold is not given, listen to the optsize attribute when it + // would decrease the threshold. Function *Caller = CS.getCaller(); - if (Caller && !Caller->isDeclaration() && - Caller->hasFnAttr(Attribute::OptimizeForSize) && - InlineLimit.getNumOccurrences() == 0) + bool OptSize = Caller && !Caller->isDeclaration() && + Caller->hasFnAttr(Attribute::OptimizeForSize); + if (!(InlineLimit.getNumOccurrences() > 0) && OptSize && OptSizeThreshold < thres) thres = OptSizeThreshold; - // Listen to inlinehint when it would increase the threshold. + // Listen to the inlinehint attribute when it would increase the threshold. 
Function *Callee = CS.getCalledFunction(); - if (HintThreshold > thres && Callee && !Callee->isDeclaration() && - Callee->hasFnAttr(Attribute::InlineHint)) + bool InlineHint = Callee && !Callee->isDeclaration() && + Callee->hasFnAttr(Attribute::InlineHint); + if (InlineHint && HintThreshold > thres) thres = HintThreshold; return thres; diff --git a/lib/Transforms/IPO/LoopExtractor.cpp b/lib/Transforms/IPO/LoopExtractor.cpp index 4f96afe..97d7cdc 100644 --- a/lib/Transforms/IPO/LoopExtractor.cpp +++ b/lib/Transforms/IPO/LoopExtractor.cpp @@ -24,7 +24,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/FunctionUtils.h" +#include "llvm/Transforms/Utils/CodeExtractor.h" #include "llvm/ADT/Statistic.h" #include <fstream> #include <set> @@ -132,7 +132,8 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) { if (ShouldExtractLoop) { if (NumLoops == 0) return Changed; --NumLoops; - if (ExtractLoop(DT, L) != 0) { + CodeExtractor Extractor(DT, *L); + if (Extractor.extractCodeRegion() != 0) { Changed = true; // After extraction, the loop is replaced by a function call, so // we shouldn't try to run any more loop passes on it. @@ -296,7 +297,7 @@ bool BlockExtractorPass::runOnModule(Module &M) { if (const InvokeInst *II = dyn_cast<InvokeInst>(BlocksToExtract[i]->getTerminator())) BlocksToExtractVec.push_back(II->getUnwindDest()); - ExtractBasicBlock(BlocksToExtractVec); + CodeExtractor(BlocksToExtractVec).extractCodeRegion(); } return !BlocksToExtract.empty(); diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp index 0b01c38..9f70f66 100644 --- a/lib/Transforms/IPO/MergeFunctions.cpp +++ b/lib/Transforms/IPO/MergeFunctions.cpp @@ -45,22 +45,22 @@ #define DEBUG_TYPE "mergefunc" #include "llvm/Transforms/IPO.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/FoldingSet.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/Constants.h" +#include "llvm/IRBuilder.h" #include "llvm/InlineAsm.h" #include "llvm/Instructions.h" #include "llvm/LLVMContext.h" #include "llvm/Module.h" #include "llvm/Operator.h" #include "llvm/Pass.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/FoldingSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetData.h" @@ -389,7 +389,7 @@ bool FunctionComparator::enumerate(const Value *V1, const Value *V2) { if (!C2) return false; // TODO: constant expressions with GEP or references to F1 or F2. if (C1->isNullValue() && C2->isNullValue() && - isEquivalentType(C1->getType(), C2->getType())) + isEquivalentType(C1->getType(), C2->getType())) return true; // Try bitcasting C2 to C1's type. If the bitcast is legal and returns C1 // then they must have equal bit patterns. 
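The getInlineThreshold hunk above encodes an asymmetric policy: the caller's optsize attribute may only lower the threshold, and only when -inline-threshold was not given explicitly, while the callee's inlinehint may only raise it. A condensed standalone restatement (the function name and the 225/75/325 values are illustrative, not the Inliner's actual API):

#include <cstdio>

// optsize can only decrease the threshold, and only without an explicit
// -inline-threshold; inlinehint can only increase it.
static int selectThreshold(int CmdLineThres, bool CmdLineGiven,
                           bool CallerOptSize, bool CalleeInlineHint,
                           int OptSizeThres, int HintThres) {
  int Thres = CmdLineThres;
  if (!CmdLineGiven && CallerOptSize && OptSizeThres < Thres)
    Thres = OptSizeThres;
  if (CalleeInlineHint && HintThres > Thres)
    Thres = HintThres;
  return Thres;
}

int main() {
  // Optsize caller, no explicit flag: the threshold drops from 225 to 75.
  std::printf("%d\n", selectThreshold(225, false, true, false, 75, 325));
}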
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp index d9d1d10..9c9910b 100644 --- a/lib/Transforms/IPO/PartialInlining.cpp +++ b/lib/Transforms/IPO/PartialInlining.cpp @@ -19,7 +19,7 @@ #include "llvm/Pass.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/FunctionUtils.h" +#include "llvm/Transforms/Utils/CodeExtractor.h" #include "llvm/ADT/Statistic.h" #include "llvm/Support/CFG.h" using namespace llvm; @@ -122,7 +122,8 @@ Function* PartialInliner::unswitchFunction(Function* F) { DT.runOnFunction(*duplicateFunction); // Extract the body of the if. - Function* extractedFunction = ExtractCodeRegion(DT, toExtract); + Function* extractedFunction + = CodeExtractor(toExtract, &DT).extractCodeRegion(); InlineFunctionInfo IFI; diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp index b5caa9a..d8e8cf7 100644 --- a/lib/Transforms/IPO/StripSymbols.cpp +++ b/lib/Transforms/IPO/StripSymbols.cpp @@ -22,11 +22,11 @@ #include "llvm/Transforms/IPO.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" #include "llvm/DerivedTypes.h" #include "llvm/Instructions.h" #include "llvm/Module.h" #include "llvm/Pass.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/ValueSymbolTable.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/ADT/DenseMap.h" diff --git a/lib/Transforms/InstCombine/CMakeLists.txt b/lib/Transforms/InstCombine/CMakeLists.txt index d070ccc..72cfe2c 100644 --- a/lib/Transforms/InstCombine/CMakeLists.txt +++ b/lib/Transforms/InstCombine/CMakeLists.txt @@ -13,3 +13,5 @@ add_llvm_library(LLVMInstCombine InstCombineSimplifyDemanded.cpp InstCombineVectorOps.cpp ) + +add_dependencies(LLVMInstCombine intrinsics_gen) diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h index 199df51..0d5ef90 100644 --- a/lib/Transforms/InstCombine/InstCombine.h +++ b/lib/Transforms/InstCombine/InstCombine.h @@ -11,11 +11,11 @@ #define INSTCOMBINE_INSTCOMBINE_H #include "InstCombineWorklist.h" +#include "llvm/IRBuilder.h" #include "llvm/IntrinsicInst.h" #include "llvm/Operator.h" #include "llvm/Pass.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/InstVisitor.h" #include "llvm/Support/TargetFolder.h" @@ -187,7 +187,7 @@ public: Instruction *visitPHINode(PHINode &PN); Instruction *visitGetElementPtrInst(GetElementPtrInst &GEP); Instruction *visitAllocaInst(AllocaInst &AI); - Instruction *visitMalloc(Instruction &FI); + Instruction *visitAllocSite(Instruction &FI); Instruction *visitFree(CallInst &FI); Instruction *visitLoadInst(LoadInst &LI); Instruction *visitStoreInst(StoreInst &SI); diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 05e702f..99b62f8 100644 --- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -170,10 +170,11 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { // -A + B --> B - A // -A + -B --> -(A + B) if (Value *LHSV = dyn_castNegVal(LHS)) { - if (Value *RHSV = dyn_castNegVal(RHS)) { - Value *NewAdd = Builder->CreateAdd(LHSV, RHSV, "sum"); - return BinaryOperator::CreateNeg(NewAdd); - } + if (!isa<Constant>(RHS)) + if (Value *RHSV = dyn_castNegVal(RHS)) { + Value *NewAdd = Builder->CreateAdd(LHSV, RHSV, "sum"); + return BinaryOperator::CreateNeg(NewAdd); + } return BinaryOperator::CreateSub(RHS, LHSV); } @@ 
-329,6 +330,20 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { } } + // Check for (x & y) + (x ^ y) + { + Value *A = 0, *B = 0; + if (match(RHS, m_Xor(m_Value(A), m_Value(B))) && + (match(LHS, m_And(m_Specific(A), m_Specific(B))) || + match(LHS, m_And(m_Specific(B), m_Specific(A))))) + return BinaryOperator::CreateOr(A, B); + + if (match(LHS, m_Xor(m_Value(A), m_Value(B))) && + (match(RHS, m_And(m_Specific(A), m_Specific(B))) || + match(RHS, m_And(m_Specific(B), m_Specific(A))))) + return BinaryOperator::CreateOr(A, B); + } + return Changed ? &I : 0; } @@ -406,66 +421,6 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { } -/// EmitGEPOffset - Given a getelementptr instruction/constantexpr, emit the -/// code necessary to compute the offset from the base pointer (without adding -/// in the base pointer). Return the result as a signed integer of intptr size. -Value *InstCombiner::EmitGEPOffset(User *GEP) { - TargetData &TD = *getTargetData(); - gep_type_iterator GTI = gep_type_begin(GEP); - Type *IntPtrTy = TD.getIntPtrType(GEP->getContext()); - Value *Result = Constant::getNullValue(IntPtrTy); - - // If the GEP is inbounds, we know that none of the addressing operations will - // overflow in an unsigned sense. - bool isInBounds = cast<GEPOperator>(GEP)->isInBounds(); - - // Build a mask for high order bits. - unsigned IntPtrWidth = TD.getPointerSizeInBits(); - uint64_t PtrSizeMask = ~0ULL >> (64-IntPtrWidth); - - for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end(); i != e; - ++i, ++GTI) { - Value *Op = *i; - uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType()) & PtrSizeMask; - if (ConstantInt *OpC = dyn_cast<ConstantInt>(Op)) { - if (OpC->isZero()) continue; - - // Handle a struct index, which adds its field offset to the pointer. - if (StructType *STy = dyn_cast<StructType>(*GTI)) { - Size = TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue()); - - if (Size) - Result = Builder->CreateAdd(Result, ConstantInt::get(IntPtrTy, Size), - GEP->getName()+".offs"); - continue; - } - - Constant *Scale = ConstantInt::get(IntPtrTy, Size); - Constant *OC = - ConstantExpr::getIntegerCast(OpC, IntPtrTy, true /*SExt*/); - Scale = ConstantExpr::getMul(OC, Scale, isInBounds/*NUW*/); - // Emit an add instruction. - Result = Builder->CreateAdd(Result, Scale, GEP->getName()+".offs"); - continue; - } - // Convert to correct type. - if (Op->getType() != IntPtrTy) - Op = Builder->CreateIntCast(Op, IntPtrTy, true, Op->getName()+".c"); - if (Size != 1) { - // We'll let instcombine(mul) convert this to a shl if possible. - Op = Builder->CreateMul(Op, ConstantInt::get(IntPtrTy, Size), - GEP->getName()+".idx", isInBounds /*NUW*/); - } - - // Emit an add instruction. - Result = Builder->CreateAdd(Op, Result, GEP->getName()+".offs"); - } - return Result; -} - - - - /// Optimize pointer differences into the same array into a size. Consider: /// &A[10] - &A[0]: we should compile this to "10". LHS/RHS are the pointer /// operands to the ptrtoint instructions for the LHS/RHS of the subtract. @@ -589,11 +544,6 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { if (Instruction *R = FoldOpIntoSelect(I, SI)) return R; - // C - zext(bool) -> bool ? 
C - 1 : C - if (ZExtInst *ZI = dyn_cast<ZExtInst>(Op1)) - if (ZI->getSrcTy()->isIntegerTy(1)) - return SelectInst::Create(ZI->getOperand(0), SubOne(C), C); - // C-(X+C2) --> (C-C2)-X ConstantInt *C2; if (match(Op1, m_Add(m_Value(X), m_ConstantInt(C2)))) diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 0dbe11d..7d0af0d 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -986,19 +986,23 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { bool Op1Ordered; unsigned Op0Pred = getFCmpCode(Op0CC, Op0Ordered); unsigned Op1Pred = getFCmpCode(Op1CC, Op1Ordered); + // uno && ord -> false + if (Op0Pred == 0 && Op1Pred == 0 && Op0Ordered != Op1Ordered) + return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0); if (Op1Pred == 0) { std::swap(LHS, RHS); std::swap(Op0Pred, Op1Pred); std::swap(Op0Ordered, Op1Ordered); } if (Op0Pred == 0) { - // uno && ueq -> uno && (uno || eq) -> ueq + // uno && ueq -> uno && (uno || eq) -> uno // ord && olt -> ord && (ord && lt) -> olt - if (Op0Ordered == Op1Ordered) + if (!Op0Ordered && (Op0Ordered == Op1Ordered)) + return LHS; + if (Op0Ordered && (Op0Ordered == Op1Ordered)) return RHS; // uno && oeq -> uno && (ord && eq) -> false - // uno && ord -> false if (!Op0Ordered) return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0); // ord && ueq -> ord && (uno || eq) -> oeq @@ -1932,10 +1936,15 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { // A | ( A ^ B) -> A | B // A | (~A ^ B) -> A | ~B + // (A & B) | (A ^ B) if (match(Op1, m_Xor(m_Value(A), m_Value(B)))) { if (Op0 == A || Op0 == B) return BinaryOperator::CreateOr(A, B); + if (match(Op0, m_And(m_Specific(A), m_Specific(B))) || + match(Op0, m_And(m_Specific(B), m_Specific(A)))) + return BinaryOperator::CreateOr(A, B); + if (Op1->hasOneUse() && match(A, m_Not(m_Specific(Op0)))) { Value *Not = Builder->CreateNot(B, B->getName()+".not"); return BinaryOperator::CreateOr(Not, Op0); @@ -2212,7 +2221,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { if (Op0I && Op1I && Op0I->isShift() && Op0I->getOpcode() == Op1I->getOpcode() && Op0I->getOperand(1) == Op1I->getOperand(1) && - (Op1I->hasOneUse() || Op1I->hasOneUse())) { + (Op0I->hasOneUse() || Op1I->hasOneUse())) { Value *NewOp = Builder->CreateXor(Op0I->getOperand(0), Op1I->getOperand(0), Op0I->getName()); diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 77e4727..c1d9d01 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -172,8 +172,6 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) { Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (isFreeCall(&CI)) return visitFree(CI); - if (isMalloc(&CI)) - return visitMalloc(CI); // If the caller function is nounwind, mark the call as nounwind, even if the // callee isn't. @@ -246,78 +244,10 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { switch (II->getIntrinsicID()) { default: break; case Intrinsic::objectsize: { - // We need target data for just about everything so depend on it. - if (!TD) break; - - Type *ReturnTy = CI.getType(); - uint64_t DontKnow = II->getArgOperand(1) == Builder->getTrue() ? 0 : -1ULL; - - // Get to the real allocated thing and offset as fast as possible. 
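Stepping back to the (x & y) + (x ^ y) fold added to visitAdd above, and its (A & B) | (A ^ B) twin in visitOr: x & y keeps the bits common to both operands and x ^ y keeps the bits where they differ, so the two terms are bit-disjoint, the addition cannot carry, and both expressions equal x | y. A standalone check (illustration, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t x = 0xDEADBEEFu, y = 0x12345678u;
  assert(((x & y) & (x ^ y)) == 0);         // the two terms share no bits
  assert(((x & y) + (x ^ y)) == (x | y));   // the visitAdd fold
  assert(((x & y) | (x ^ y)) == (x | y));   // the visitOr fold
  return 0;
}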
- Value *Op1 = II->getArgOperand(0)->stripPointerCasts(); - - uint64_t Offset = 0; - uint64_t Size = -1ULL; - - // Try to look through constant GEPs. - if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op1)) { - if (!GEP->hasAllConstantIndices()) break; - - // Get the current byte offset into the thing. Use the original - // operand in case we're looking through a bitcast. - SmallVector<Value*, 8> Ops(GEP->idx_begin(), GEP->idx_end()); - if (!GEP->getPointerOperandType()->isPointerTy()) - return 0; - Offset = TD->getIndexedOffset(GEP->getPointerOperandType(), Ops); - - Op1 = GEP->getPointerOperand()->stripPointerCasts(); - - // Make sure we're not a constant offset from an external - // global. - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Op1)) - if (!GV->hasDefinitiveInitializer()) break; - } - - // If we've stripped down to a single global variable that we - // can know the size of then just return that. - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Op1)) { - if (GV->hasDefinitiveInitializer()) { - Constant *C = GV->getInitializer(); - Size = TD->getTypeAllocSize(C->getType()); - } else { - // Can't determine size of the GV. - Constant *RetVal = ConstantInt::get(ReturnTy, DontKnow); - return ReplaceInstUsesWith(CI, RetVal); - } - } else if (AllocaInst *AI = dyn_cast<AllocaInst>(Op1)) { - // Get alloca size. - if (AI->getAllocatedType()->isSized()) { - Size = TD->getTypeAllocSize(AI->getAllocatedType()); - if (AI->isArrayAllocation()) { - const ConstantInt *C = dyn_cast<ConstantInt>(AI->getArraySize()); - if (!C) break; - Size *= C->getZExtValue(); - } - } - } else if (CallInst *MI = extractMallocCall(Op1)) { - // Get allocation size. - Type* MallocType = getMallocAllocatedType(MI); - if (MallocType && MallocType->isSized()) - if (Value *NElems = getMallocArraySize(MI, TD, true)) - if (ConstantInt *NElements = dyn_cast<ConstantInt>(NElems)) - Size = NElements->getZExtValue() * TD->getTypeAllocSize(MallocType); - } - - // Do not return "I don't know" here. Later optimization passes could - // make it possible to evaluate objectsize to a constant. - if (Size == -1ULL) - break; - - if (Size < Offset) { - // Out of bound reference? Negative index normalized to large - // index? Just return "I don't know". - return ReplaceInstUsesWith(CI, ConstantInt::get(ReturnTy, DontKnow)); - } - return ReplaceInstUsesWith(CI, ConstantInt::get(ReturnTy, Size-Offset)); + uint64_t Size; + if (getObjectSize(II->getArgOperand(0), Size, TD)) + return ReplaceInstUsesWith(CI, ConstantInt::get(CI.getType(), Size)); + return 0; } case Intrinsic::bswap: // bswap(bswap(x)) -> x @@ -694,6 +624,57 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + case Intrinsic::arm_neon_vmulls: + case Intrinsic::arm_neon_vmullu: { + Value *Arg0 = II->getArgOperand(0); + Value *Arg1 = II->getArgOperand(1); + + // Handle mul by zero first: + if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) { + return ReplaceInstUsesWith(CI, ConstantAggregateZero::get(II->getType())); + } + + // Check for constant LHS & RHS - in this case we just simplify. 
+ bool Zext = (II->getIntrinsicID() == Intrinsic::arm_neon_vmullu); + VectorType *NewVT = cast<VectorType>(II->getType()); + unsigned NewWidth = NewVT->getElementType()->getIntegerBitWidth(); + if (ConstantDataVector *CV0 = dyn_cast<ConstantDataVector>(Arg0)) { + if (ConstantDataVector *CV1 = dyn_cast<ConstantDataVector>(Arg1)) { + VectorType* VT = cast<VectorType>(CV0->getType()); + SmallVector<Constant*, 4> NewElems; + for (unsigned i = 0; i < VT->getNumElements(); ++i) { + APInt CV0E = + (cast<ConstantInt>(CV0->getAggregateElement(i)))->getValue(); + CV0E = Zext ? CV0E.zext(NewWidth) : CV0E.sext(NewWidth); + APInt CV1E = + (cast<ConstantInt>(CV1->getAggregateElement(i)))->getValue(); + CV1E = Zext ? CV1E.zext(NewWidth) : CV1E.sext(NewWidth); + NewElems.push_back( + ConstantInt::get(NewVT->getElementType(), CV0E * CV1E)); + } + return ReplaceInstUsesWith(CI, ConstantVector::get(NewElems)); + } + + // Couldn't simplify - canonicalize constant to the RHS. + std::swap(Arg0, Arg1); + } + + // Handle mul by one: + if (ConstantDataVector *CV1 = dyn_cast<ConstantDataVector>(Arg1)) { + if (ConstantInt *Splat = + dyn_cast_or_null<ConstantInt>(CV1->getSplatValue())) { + if (Splat->isOne()) { + if (Zext) + return CastInst::CreateZExtOrBitCast(Arg0, II->getType()); + // else + return CastInst::CreateSExtOrBitCast(Arg0, II->getType()); + } + } + } + + break; + } + case Intrinsic::stackrestore: { // If the save is right next to the restore, remove the restore. This can // happen when variable allocas are DCE'd. @@ -711,7 +692,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { TerminatorInst *TI = II->getParent()->getTerminator(); bool CannotRemove = false; for (++BI; &*BI != TI; ++BI) { - if (isa<AllocaInst>(BI) || isMalloc(BI)) { + if (isa<AllocaInst>(BI)) { CannotRemove = true; break; } @@ -898,6 +879,9 @@ static IntrinsicInst *FindInitTrampoline(Value *Callee) { // visitCallSite - Improvements for call and invoke instructions. // Instruction *InstCombiner::visitCallSite(CallSite CS) { + if (isAllocLikeFn(CS.getInstruction())) + return visitAllocSite(*CS.getInstruction()); + bool Changed = false; // If the callee is a pointer to a function, attempt to move any casts to the @@ -933,24 +917,24 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { } if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) { - // This instruction is not reachable, just remove it. We insert a store to - // undef so that we know that this code is not reachable, despite the fact - // that we can't modify the CFG here. - new StoreInst(ConstantInt::getTrue(Callee->getContext()), - UndefValue::get(Type::getInt1PtrTy(Callee->getContext())), - CS.getInstruction()); - // If CS does not return void then replaceAllUsesWith undef. // This allows ValueHandlers and custom metadata to adjust itself. if (!CS.getInstruction()->getType()->isVoidTy()) ReplaceInstUsesWith(*CS.getInstruction(), UndefValue::get(CS.getInstruction()->getType())); - if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) { - // Don't break the CFG, insert a dummy cond branch. - BranchInst::Create(II->getNormalDest(), II->getUnwindDest(), - ConstantInt::getTrue(Callee->getContext()), II); + if (isa<InvokeInst>(CS.getInstruction())) { + // Can't remove an invoke because we cannot change the CFG. + return 0; } + + // This instruction is not reachable, just remove it. We insert a store to + // undef so that we know that this code is not reachable, despite the fact + // that we can't modify the CFG here.
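Back on the arm_neon_vmulls/vmullu folding added above: vmull is a widening multiply, so each lane is sign- or zero-extended to twice the width before multiplying and the product cannot wrap in the narrow type; that is what the per-lane zext/sext of the APInt values models, and why a multiply by a splat of one reduces to a plain widening cast. The per-lane arithmetic in scalar form (illustration, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  // vmull.u16 per lane: widen both operands to 32 bits, then multiply.
  uint16_t a = 0xFFFFu, b = 2u;
  uint32_t lane = uint32_t(a) * uint32_t(b);
  assert(lane == 0x1FFFEu);                          // no 16-bit wraparound
  assert(uint32_t(a) * uint32_t(1u) == uint32_t(a)); // mul by 1 is just zext
  return 0;
}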
+ new StoreInst(ConstantInt::getTrue(Callee->getContext()), + UndefValue::get(Type::getInt1PtrTy(Callee->getContext())), + CS.getInstruction()); + return EraseInstFromFunction(*CS.getInstruction()); } @@ -1194,8 +1178,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { if (NewRetTy->isVoidTy()) Caller->setName(""); // Void type should not have a name. - const AttrListPtr &NewCallerPAL = AttrListPtr::get(attrVec.begin(), - attrVec.end()); + const AttrListPtr &NewCallerPAL = AttrListPtr::get(attrVec); Instruction *NC; if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { @@ -1367,8 +1350,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS, NestF->getType() == PointerType::getUnqual(NewFTy) ? NestF : ConstantExpr::getBitCast(NestF, PointerType::getUnqual(NewFTy)); - const AttrListPtr &NewPAL = AttrListPtr::get(NewAttrs.begin(), - NewAttrs.end()); + const AttrListPtr &NewPAL = AttrListPtr::get(NewAttrs); Instruction *NewCaller; if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index 39279f4..555b442 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -34,7 +34,7 @@ static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale, if (BinaryOperator *I = dyn_cast<BinaryOperator>(Val)) { // Cannot look past anything that might overflow. OverflowingBinaryOperator *OBI = dyn_cast<OverflowingBinaryOperator>(Val); - if (OBI && !OBI->hasNoUnsignedWrap()) { + if (OBI && !OBI->hasNoUnsignedWrap() && !OBI->hasNoSignedWrap()) { Scale = 1; Offset = 0; return Val; @@ -648,10 +648,8 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) { if (!I) return false; // If the input is a truncate from the destination type, we can trivially - // eliminate it, even if it has multiple uses. - // FIXME: This is currently disabled until codegen can handle this without - // pessimizing code, PR5997. - if (0 && isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty) + // eliminate it. + if (isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty) return true; // We can't extend or shrink something that has multiple uses: doing so would @@ -992,11 +990,8 @@ static bool CanEvaluateSExtd(Value *V, Type *Ty) { Instruction *I = dyn_cast<Instruction>(V); if (!I) return false; - // If this is a truncate from the dest type, we can trivially eliminate it, - // even if it has multiple uses. - // FIXME: This is currently disabled until codegen can handle this without - // pessimizing code, PR5997. - if (0 && isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty) + // If this is a truncate from the dest type, we can trivially eliminate it. + if (isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty) return true; // We can't extend or shrink something that has multiple uses: doing so would @@ -1341,10 +1336,9 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) { // non-type-safe code. if (TD && GEP->hasOneUse() && isa<BitCastInst>(GEP->getOperand(0)) && GEP->hasAllConstantIndices()) { - // We are guaranteed to get a constant from EmitGEPOffset. - ConstantInt *OffsetV = cast<ConstantInt>(EmitGEPOffset(GEP)); - int64_t Offset = OffsetV->getSExtValue(); - + SmallVector<Value*, 8> Ops(GEP->idx_begin(), GEP->idx_end()); + int64_t Offset = TD->getIndexedOffset(GEP->getPointerOperandType(), Ops); + // Get the base pointer input of the bitcast, and the type it points to. 
Value *OrigBase = cast<BitCastInst>(GEP->getOperand(0))->getOperand(0); Type *GEPIdxTy = diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index ab2987f..7076d88 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1035,7 +1035,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, if ((KnownZero|KnownOne).countLeadingOnes() >= SrcBits-DstBits) { // Pull in the high bits from known-ones set. APInt NewRHS = RHS->getValue().zext(SrcBits); - NewRHS |= KnownOne; + NewRHS |= KnownOne & APInt::getHighBitsSet(SrcBits, SrcBits-DstBits); return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0), ConstantInt::get(ICI.getContext(), NewRHS)); } @@ -2580,10 +2580,25 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { } } + // Transform (zext A) == (B & (1<<X)-1) --> A == (trunc B) + // and (B & (1<<X)-1) == (zext A) --> A == (trunc B) + ConstantInt *Cst1; + if ((Op0->hasOneUse() && + match(Op0, m_ZExt(m_Value(A))) && + match(Op1, m_And(m_Value(B), m_ConstantInt(Cst1)))) || + (Op1->hasOneUse() && + match(Op0, m_And(m_Value(B), m_ConstantInt(Cst1))) && + match(Op1, m_ZExt(m_Value(A))))) { + APInt Pow2 = Cst1->getValue() + 1; + if (Pow2.isPowerOf2() && isa<IntegerType>(A->getType()) && + Pow2.logBase2() == cast<IntegerType>(A->getType())->getBitWidth()) + return new ICmpInst(I.getPredicate(), A, + Builder->CreateTrunc(B, A->getType())); + } + // Transform "icmp eq (trunc (lshr(X, cst1)), cst" to // "icmp (and X, mask), cst" uint64_t ShAmt = 0; - ConstantInt *Cst1; if (Op0->hasOneUse() && match(Op0, m_Trunc(m_OneUse(m_LShr(m_Value(A), m_ConstantInt(ShAmt))))) && diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index b2f2e24..c485844 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -22,72 +22,6 @@ using namespace llvm; STATISTIC(NumDeadStore, "Number of dead stores eliminated"); -// Try to kill dead allocas by walking through its uses until we see some use -// that could escape. This is a conservative analysis which tries to handle -// GEPs, bitcasts, stores, and no-op intrinsics. These tend to be the things -// left after inlining and SROA finish chewing on an alloca. -static Instruction *removeDeadAlloca(InstCombiner &IC, AllocaInst &AI) { - SmallVector<Instruction *, 4> Worklist, DeadStores; - Worklist.push_back(&AI); - do { - Instruction *PI = Worklist.pop_back_val(); - for (Value::use_iterator UI = PI->use_begin(), UE = PI->use_end(); - UI != UE; ++UI) { - Instruction *I = cast<Instruction>(*UI); - switch (I->getOpcode()) { - default: - // Give up the moment we see something we can't handle. - return 0; - - case Instruction::GetElementPtr: - case Instruction::BitCast: - Worklist.push_back(I); - continue; - - case Instruction::Call: - // We can handle a limited subset of calls to no-op intrinsics. - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { - switch (II->getIntrinsicID()) { - case Intrinsic::dbg_declare: - case Intrinsic::dbg_value: - case Intrinsic::invariant_start: - case Intrinsic::invariant_end: - case Intrinsic::lifetime_start: - case Intrinsic::lifetime_end: - continue; - default: - return 0; - } - } - // Reject everything else. - return 0; - - case Instruction::Store: { - // Stores into the alloca are only live if the alloca is live. 
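On the (zext A) == (B & (1<<X)-1) --> A == (trunc B) transform added to InstCombineCompares above: the rewrite is sound precisely when the mask covers exactly A's width, since the AND then reduces B to that width and the wide compare collapses to the narrow one. A standalone check with i8/i32 stand-ins (illustration, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  // (zext a) == (b & 0xFF) is equivalent to a == (trunc b) because the
  // mask width matches a's width; this holds for every a and b.
  uint8_t a = 0x5Au;
  const uint32_t bs[] = {0x1234565Au, 0xFFFFFF00u, 0x0000005Au};
  for (uint32_t b : bs)
    assert((uint32_t(a) == (b & 0xFFu)) == (a == uint8_t(b)));
  return 0;
}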
- StoreInst *SI = cast<StoreInst>(I); - // We can eliminate atomic stores, but not volatile. - if (SI->isVolatile()) - return 0; - // The store is only trivially safe if the poniter is the destination - // as opposed to the value. We're conservative here and don't check for - // the case where we store the address of a dead alloca into a dead - // alloca. - if (SI->getPointerOperand() != PI) - return 0; - DeadStores.push_back(I); - continue; - } - } - } - } while (!Worklist.empty()); - - // The alloca is dead. Kill off all the stores to it, and then replace it - // with undef. - while (!DeadStores.empty()) - IC.EraseInstFromFunction(*DeadStores.pop_back_val()); - return IC.ReplaceInstUsesWith(AI, UndefValue::get(AI.getType())); -} - Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { // Ensure that the alloca array size argument has type intptr_t, so that // any casting is exposed early. @@ -106,7 +40,6 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) { Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue()); - assert(isa<AllocaInst>(AI) && "Unknown type of allocation inst!"); AllocaInst *New = Builder->CreateAlloca(NewTy, 0, AI.getName()); New->setAlignment(AI.getAlignment()); @@ -135,22 +68,54 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { } } - if (TD && isa<AllocaInst>(AI) && AI.getAllocatedType()->isSized()) { - // If alloca'ing a zero byte object, replace the alloca with a null pointer. - // Note that we only do this for alloca's, because malloc should allocate - // and return a unique pointer, even for a zero byte allocation. - if (TD->getTypeAllocSize(AI.getAllocatedType()) == 0) - return ReplaceInstUsesWith(AI, Constant::getNullValue(AI.getType())); - + if (TD && AI.getAllocatedType()->isSized()) { // If the alignment is 0 (unspecified), assign it the preferred alignment. if (AI.getAlignment() == 0) AI.setAlignment(TD->getPrefTypeAlignment(AI.getAllocatedType())); + + // Move all alloca's of zero byte objects to the entry block and merge them + // together. Note that we only do this for alloca's, because malloc should + // allocate and return a unique pointer, even for a zero byte allocation. + if (TD->getTypeAllocSize(AI.getAllocatedType()) == 0) { + // For a zero sized alloca there is no point in doing an array allocation. + // This is helpful if the array size is a complicated expression not used + // elsewhere. + if (AI.isArrayAllocation()) { + AI.setOperand(0, ConstantInt::get(AI.getArraySize()->getType(), 1)); + return &AI; + } + + // Get the first instruction in the entry block. + BasicBlock &EntryBlock = AI.getParent()->getParent()->getEntryBlock(); + Instruction *FirstInst = EntryBlock.getFirstNonPHIOrDbg(); + if (FirstInst != &AI) { + // If the entry block doesn't start with a zero-size alloca then move + // this one to the start of the entry block. There is no problem with + // dominance as the array size was forced to a constant earlier already. + AllocaInst *EntryAI = dyn_cast<AllocaInst>(FirstInst); + if (!EntryAI || !EntryAI->getAllocatedType()->isSized() || + TD->getTypeAllocSize(EntryAI->getAllocatedType()) != 0) { + AI.moveBefore(FirstInst); + return &AI; + } + + // Replace this zero-sized alloca with the one at the start of the entry + // block after ensuring that the address will be aligned enough for both + // types. 
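On the alignment computation that follows: preferred alignments are powers of two, so the larger of the two is a multiple of both, and raising the surviving entry-block alloca to the std::max of the two preferred alignments satisfies both users of the merged allocation. Checked in isolation (illustration, not part of the patch):

#include <algorithm>
#include <cassert>

int main() {
  // For power-of-two alignments (the only kind in play here),
  // max(a, b) is a multiple of both, so one alloca can serve both types.
  unsigned a = 8, b = 16;
  unsigned m = std::max(a, b);
  assert(m % a == 0 && m % b == 0);
  return 0;
}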
+ unsigned MaxAlign = + std::max(TD->getPrefTypeAlignment(EntryAI->getAllocatedType()), + TD->getPrefTypeAlignment(AI.getAllocatedType())); + EntryAI->setAlignment(MaxAlign); + if (AI.getType() != EntryAI->getType()) + return new BitCastInst(EntryAI, AI.getType()); + return ReplaceInstUsesWith(AI, EntryAI); + } + } } - // Try to aggressively remove allocas which are only used for GEPs, lifetime - // markers, and stores. This happens when SROA iteratively promotes stores - // out of the alloca, and we need to cleanup after it. - return removeDeadAlloca(*this, AI); + // At last, use the generic allocation site handler to aggressively remove + // unused allocas. + return visitAllocSite(AI); } diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 5168e2a..35a0bbb 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -464,9 +464,12 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) { // X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2) { const APInt *CI; Value *N; - if (match(Op1, m_Shl(m_Power2(CI), m_Value(N)))) { + if (match(Op1, m_Shl(m_Power2(CI), m_Value(N))) || + match(Op1, m_ZExt(m_Shl(m_Power2(CI), m_Value(N))))) { if (*CI != 1) N = Builder->CreateAdd(N, ConstantInt::get(I.getType(),CI->logBase2())); + if (ZExtInst *Z = dyn_cast<ZExtInst>(Op1)) + N = Builder->CreateZExt(N, Z->getDestTy()); if (I.isExact()) return BinaryOperator::CreateExactLShr(Op0, N); return BinaryOperator::CreateLShr(Op0, N); diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index e727b2c..eb9945b 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -129,6 +129,12 @@ Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI, if (TI->isCast()) { if (TI->getOperand(0)->getType() != FI->getOperand(0)->getType()) return 0; + // The select condition may be a vector. We may only change the operand + // type if the vector width remains the same (and matches the condition). + Type *CondTy = SI.getCondition()->getType(); + if (CondTy->isVectorTy() && CondTy->getVectorNumElements() != + FI->getOperand(0)->getType()->getVectorNumElements()) + return 0; } else { return 0; // unknown unary op. } @@ -498,7 +504,7 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, // NOTE: if we wanted to, this is where to detect integer MIN/MAX - if (isa<Constant>(CmpRHS)) { + if (CmpRHS != CmpLHS && isa<Constant>(CmpRHS)) { if (CmpLHS == TrueVal && Pred == ICmpInst::ICMP_EQ) { // Transform (X == C) ? X : Y -> (X == C) ? C : Y SI.setOperand(1, CmpRHS); diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp index b31049e..4bb2403 100644 --- a/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -151,7 +151,7 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift, // We can always turn lshr(c1)+shl(c2) -> lshr(c3)+and(c4), but it isn't // profitable unless we know the and'd out bits are already zero. 
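On the visitUDiv change above: X udiv (C1 << N) with C1 == 1 << C2 is X >> (N + C2), and the new m_ZExt pattern lets the same rewrite fire when the shifted power of two was zero-extended, with the adjusted shift amount zero-extended to match. The scalar identity (illustration, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t x = 0xFFFFFFFFu, n = 3;
  assert(x / (1u << n) == (x >> n));        // C1 == 1, plain case
  assert(x / (4u << n) == (x >> (n + 2)));  // C1 == 4, so add log2(C1) == 2
  return 0;
}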
- if (CI->getZExtValue() > NumBits) { + if (CI->getValue().ult(TypeWidth) && CI->getZExtValue() > NumBits) { unsigned LowBits = CI->getZExtValue() - NumBits; if (MaskedValueIsZero(I->getOperand(0), APInt::getLowBitsSet(TypeWidth, NumBits) << LowBits)) @@ -529,6 +529,19 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, ShiftOp = 0; if (ShiftOp && isa<ConstantInt>(ShiftOp->getOperand(1))) { + + // This is a constant shift of a constant shift. Be careful about hiding + // shl instructions behind bit masks. They are used to represent multiplies + // by a constant, and it is important that simple arithmetic expressions + // are still recognizable by scalar evolution. + // + // The transforms applied to shl are very similar to the transforms applied + // to mul by constant. We can be more aggressive about optimizing right + // shifts. + // + // Combinations of right and left shifts will still be optimized in + // DAGCombine where scalar evolution no longer applies. + ConstantInt *ShiftAmt1C = cast<ConstantInt>(ShiftOp->getOperand(1)); uint32_t ShiftAmt1 = ShiftAmt1C->getLimitedValue(TypeBits); uint32_t ShiftAmt2 = Op1->getLimitedValue(TypeBits); @@ -554,13 +567,6 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, } if (ShiftAmt1 == ShiftAmt2) { - // If we have ((X >>? C) << C), turn this into X & (-1 << C). - if (I.getOpcode() == Instruction::Shl && - ShiftOp->getOpcode() != Instruction::Shl) { - APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt1)); - return BinaryOperator::CreateAnd(X, - ConstantInt::get(I.getContext(),Mask)); - } // If we have ((X << C) >>u C), turn this into X & (-1 >>u C). if (I.getOpcode() == Instruction::LShr && ShiftOp->getOpcode() == Instruction::Shl) { @@ -570,28 +576,23 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, } } else if (ShiftAmt1 < ShiftAmt2) { uint32_t ShiftDiff = ShiftAmt2-ShiftAmt1; - - // (X >>? C1) << C2 --> X << (C2-C1) & (-1 << C2) + + // (X >>?,exact C1) << C2 --> X << (C2-C1) + // The inexact version is deferred to DAGCombine so we don't hide shl + // behind a bit mask. if (I.getOpcode() == Instruction::Shl && - ShiftOp->getOpcode() != Instruction::Shl) { + ShiftOp->getOpcode() != Instruction::Shl && + ShiftOp->isExact()) { assert(ShiftOp->getOpcode() == Instruction::LShr || ShiftOp->getOpcode() == Instruction::AShr); ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff); - if (ShiftOp->isExact()) { - // (X >>?,exact C1) << C2 --> X << (C2-C1) - BinaryOperator *NewShl = BinaryOperator::Create(Instruction::Shl, - X, ShiftDiffCst); - NewShl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap()); - NewShl->setHasNoSignedWrap(I.hasNoSignedWrap()); - return NewShl; - } - Value *Shift = Builder->CreateShl(X, ShiftDiffCst); - - APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt2)); - return BinaryOperator::CreateAnd(Shift, - ConstantInt::get(I.getContext(),Mask)); + BinaryOperator *NewShl = BinaryOperator::Create(Instruction::Shl, + X, ShiftDiffCst); + NewShl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap()); + NewShl->setHasNoSignedWrap(I.hasNoSignedWrap()); + return NewShl; } - + // (X << C1) >>u C2 --> X >>u (C2-C1) & (-1 >> C2) if (I.getOpcode() == Instruction::LShr && ShiftOp->getOpcode() == Instruction::Shl) { @@ -627,24 +628,19 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, assert(ShiftAmt2 < ShiftAmt1); uint32_t ShiftDiff = ShiftAmt1-ShiftAmt2; - // (X >>? C1) << C2 --> X >>? 
(C1-C2) & (-1 << C2) + // (X >>?exact C1) << C2 --> X >>?exact (C1-C2) + // The inexact version is deferred to DAGCombine so we don't hide shl + // behind a bit mask. if (I.getOpcode() == Instruction::Shl && - ShiftOp->getOpcode() != Instruction::Shl) { + ShiftOp->getOpcode() != Instruction::Shl && + ShiftOp->isExact()) { ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff); - if (ShiftOp->isExact()) { - // (X >>?exact C1) << C2 --> X >>?exact (C1-C2) - BinaryOperator *NewShr = BinaryOperator::Create(ShiftOp->getOpcode(), - X, ShiftDiffCst); - NewShr->setIsExact(true); - return NewShr; - } - Value *Shift = Builder->CreateBinOp(ShiftOp->getOpcode(), - X, ShiftDiffCst); - APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt2)); - return BinaryOperator::CreateAnd(Shift, - ConstantInt::get(I.getContext(),Mask)); + BinaryOperator *NewShr = BinaryOperator::Create(ShiftOp->getOpcode(), + X, ShiftDiffCst); + NewShr->setIsExact(true); + return NewShr; } - + // (X << C1) >>u C2 --> X << (C1-C2) & (-1 >> C2) if (I.getOpcode() == Instruction::LShr && ShiftOp->getOpcode() == Instruction::Shl) { diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 066b2ec..68ecd51 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -87,30 +87,34 @@ void InstCombiner::getAnalysisUsage(AnalysisUsage &AU) const { } +Value *InstCombiner::EmitGEPOffset(User *GEP) { + return llvm::EmitGEPOffset(Builder, *getTargetData(), GEP); +} + /// ShouldChangeType - Return true if it is desirable to convert a computation /// from 'From' to 'To'. We don't want to convert from a legal to an illegal /// type for example, or from a smaller to a larger illegal type. bool InstCombiner::ShouldChangeType(Type *From, Type *To) const { assert(From->isIntegerTy() && To->isIntegerTy()); - + // If we don't have TD, we don't know if the source/dest are legal. if (!TD) return false; - + unsigned FromWidth = From->getPrimitiveSizeInBits(); unsigned ToWidth = To->getPrimitiveSizeInBits(); bool FromLegal = TD->isLegalInteger(FromWidth); bool ToLegal = TD->isLegalInteger(ToWidth); - + // If this is a legal integer from type, and the result would be an illegal // type, don't do the transformation. if (FromLegal && !ToLegal) return false; - + // Otherwise, if both are illegal, do not increase the size of the result. We // do allow things like i160 -> i64, but not i64 -> i160. if (!FromLegal && !ToLegal && ToWidth > FromWidth) return false; - + return true; } @@ -127,7 +131,7 @@ static bool MaintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) { // We reason about Add and Sub Only. Instruction::BinaryOps Opcode = I.getOpcode(); - if (Opcode != Instruction::Add && + if (Opcode != Instruction::Add && Opcode != Instruction::Sub) { return false; } @@ -203,7 +207,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { // Conservatively clear the optional flags, since they may not be // preserved by the reassociation. if (MaintainNoSignedWrap(I, B, C) && - (!Op0 || (isa<BinaryOperator>(Op0) && Op0->hasNoSignedWrap()))) { + (!Op0 || (isa<BinaryOperator>(Op0) && Op0->hasNoSignedWrap()))) { // Note: this is only valid because SimplifyBinOp doesn't look at // the operands to Op0. 
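Returning to the FoldShiftByConstant hunks above: the dropped ((X >>u C) << C) --> X & (-1 << C) style rewrites are still arithmetically correct; they are deferred to DAGCombine only so the shl stays visible to ScalarEvolution, which recognizes shifts as multiplies but not the masked form. The underlying identity (illustration, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  // Clearing the low C bits with a shift pair equals masking by -1 << C.
  uint32_t x = 0xDEADBEEFu;
  const unsigned C = 4;
  assert(((x >> C) << C) == (x & (0xFFFFFFFFu << C)));
  return 0;
}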
I.clearSubclassOptionalData(); @@ -211,7 +215,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { } else { I.clearSubclassOptionalData(); } - + Changed = true; ++NumReassoc; continue; @@ -540,7 +544,7 @@ static Value *FoldOperationIntoSelectOperand(Instruction &I, Value *SO, Value *Op0 = SO, *Op1 = ConstOperand; if (!ConstIsRHS) std::swap(Op0, Op1); - + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&I)) return IC->Builder->CreateBinOp(BO->getOpcode(), Op0, Op1, SO->getName()+".op"); @@ -579,7 +583,7 @@ Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) { if (SrcTy && SrcTy->getNumElements() != DestTy->getNumElements()) return 0; } - + Value *SelectTrueVal = FoldOperationIntoSelectOperand(Op, TV, this); Value *SelectFalseVal = FoldOperationIntoSelectOperand(Op, FV, this); @@ -599,7 +603,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { unsigned NumPHIValues = PN->getNumIncomingValues(); if (NumPHIValues == 0) return 0; - + // We normally only transform phis with a single use. However, if a PHI has // multiple uses and they are all the same operation, we can fold *all* of the // uses into the PHI. @@ -613,7 +617,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { } // Otherwise, we can replace *all* users with the new PHI we form. } - + // Check to see if all of the operands of the PHI are simple constants // (constantint/constantfp/undef). If there is one non-constant value, // remember the BB it is in. If there is more than one or if *it* is a PHI, @@ -627,7 +631,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { if (isa<PHINode>(InVal)) return 0; // Itself a phi. if (NonConstBB) return 0; // More than one non-const value. - + NonConstBB = PN->getIncomingBlock(i); // If the InVal is an invoke at the end of the pred block, then we can't @@ -635,14 +639,14 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { if (InvokeInst *II = dyn_cast<InvokeInst>(InVal)) if (II->getParent() == NonConstBB) return 0; - + // If the incoming non-constant value is in I's block, we will remove one // instruction, but insert another equivalent one, leading to infinite // instcombine. if (NonConstBB == I.getParent()) return 0; } - + // If there is exactly one non-constant value, we can insert a copy of the // operation in that block. However, if this is a critical edge, we would be // inserting the computation one some other paths (e.g. inside a loop). Only @@ -656,12 +660,12 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { PHINode *NewPN = PHINode::Create(I.getType(), PN->getNumIncomingValues()); InsertNewInstBefore(NewPN, *PN); NewPN->takeName(PN); - + // If we are going to have to insert a new computation, do so right before the // predecessors terminator. if (NonConstBB) Builder->SetInsertPoint(NonConstBB->getTerminator()); - + // Next, add all of the operands to the PHI. 
if (SelectInst *SI = dyn_cast<SelectInst>(&I)) { // We only currently try to fold the condition of a select when it is a phi, @@ -706,20 +710,20 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { PN->getIncomingValue(i), C, "phitmp"); NewPN->addIncoming(InV, PN->getIncomingBlock(i)); } - } else { + } else { CastInst *CI = cast<CastInst>(&I); Type *RetTy = CI->getType(); for (unsigned i = 0; i != NumPHIValues; ++i) { Value *InV; if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i))) InV = ConstantExpr::getCast(CI->getOpcode(), InC, RetTy); - else + else InV = Builder->CreateCast(CI->getOpcode(), PN->getIncomingValue(i), I.getType(), "phitmp"); NewPN->addIncoming(InV, PN->getIncomingBlock(i)); } } - + for (Value::use_iterator UI = PN->use_begin(), E = PN->use_end(); UI != E; ) { Instruction *User = cast<Instruction>(*UI++); @@ -734,11 +738,11 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { /// or not there is a sequence of GEP indices into the type that will land us at /// the specified offset. If so, fill them into NewIndices and return the /// resultant element type, otherwise return null. -Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset, +Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset, SmallVectorImpl<Value*> &NewIndices) { if (!TD) return 0; if (!Ty->isSized()) return 0; - + // Start with the index over the outer type. Note that the type size // might be zero (even if the offset isn't zero) if the indexed type // is something like [0 x {int, int}] @@ -747,7 +751,7 @@ Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset, if (int64_t TySize = TD->getTypeAllocSize(Ty)) { FirstIdx = Offset/TySize; Offset -= FirstIdx*TySize; - + // Handle hosts where % returns negative instead of values [0..TySize). if (Offset < 0) { --FirstIdx; @@ -756,24 +760,24 @@ Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset, } assert((uint64_t)Offset < (uint64_t)TySize && "Out of range offset"); } - + NewIndices.push_back(ConstantInt::get(IntPtrTy, FirstIdx)); - + // Index into the types. If we fail, set OrigBase to null. while (Offset) { // Indexing into tail padding between struct/array elements. if (uint64_t(Offset*8) >= TD->getTypeSizeInBits(Ty)) return 0; - + if (StructType *STy = dyn_cast<StructType>(Ty)) { const StructLayout *SL = TD->getStructLayout(STy); assert(Offset < (int64_t)SL->getSizeInBytes() && "Offset must stay within the indexed type"); - + unsigned Elt = SL->getElementContainingOffset(Offset); NewIndices.push_back(ConstantInt::get(Type::getInt32Ty(Ty->getContext()), Elt)); - + Offset -= SL->getElementOffset(Elt); Ty = STy->getElementType(Elt); } else if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) { @@ -787,7 +791,7 @@ Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset, return 0; } } - + return Ty; } @@ -948,7 +952,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { Res->setIsInBounds(GEP.isInBounds()); return Res; } - + if (ArrayType *XATy = dyn_cast<ArrayType>(StrippedPtrTy->getElementType())){ // GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ... ? 
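FindElementAtOffset above turns a byte offset into GEP indices: divide by the allocation size of the element to get the array index, then let the struct layout pick the field containing the remainder. The same arithmetic at the source level, with a hypothetical Pair type (illustration, not part of the patch):

#include <cassert>
#include <cstddef>

struct Pair { int a; int b; };  // hypothetical example type

int main() {
  // Byte offset 4 into a Pair array: array element 0, then field b
  // (assumes sizeof(int) == 4 and no padding, as on common targets).
  size_t Offset = 4;
  size_t Elt    = Offset / sizeof(Pair);  // FirstIdx in the real code
  size_t Within = Offset % sizeof(Pair);  // remaining offset in the element
  assert(Elt == 0 && Within == offsetof(Pair, b));
  return 0;
}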
@@ -981,16 +985,16 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // V and GEP are both pointer types --> BitCast return new BitCastInst(NewGEP, GEP.getType()); } - + // Transform things like: // getelementptr i8* bitcast ([100 x double]* X to i8*), i32 %tmp // (where tmp = 8*tmp2) into: // getelementptr [100 x double]* %arr, i32 0, i32 %tmp2; bitcast - + if (TD && SrcElTy->isArrayTy() && ResElTy->isIntegerTy(8)) { uint64_t ArrayEltSize = TD->getTypeAllocSize(cast<ArrayType>(SrcElTy)->getElementType()); - + // Check to see if "tmp" is a scale by a multiple of ArrayEltSize. We // allow either a mul, shift, or constant here. Value *NewIdx = 0; @@ -1015,7 +1019,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { NewIdx = Inst->getOperand(0); } } - + // If the index will be to exactly the right offset with the scale taken // out, perform the transformation. Note, we don't know whether Scale is // signed or not. We'll use unsigned version of division/modulo @@ -1054,10 +1058,9 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { !isa<BitCastInst>(BCI->getOperand(0)) && GEP.hasAllConstantIndices() && StrippedPtrTy->getAddressSpace() == GEP.getPointerAddressSpace()) { - // Determine how much the GEP moves the pointer. We are guaranteed to get - // a constant back from EmitGEPOffset. - ConstantInt *OffsetV = cast<ConstantInt>(EmitGEPOffset(&GEP)); - int64_t Offset = OffsetV->getSExtValue(); + // Determine how much the GEP moves the pointer. + SmallVector<Value*, 8> Ops(GEP.idx_begin(), GEP.idx_end()); + int64_t Offset = TD->getIndexedOffset(GEP.getPointerOperandType(), Ops); // If this GEP instruction doesn't move the pointer, just replace the GEP // with a bitcast of the real input to the dest type. @@ -1065,7 +1068,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // If the bitcast is of an allocation, and the allocation will be // converted to match the type of the cast, don't touch this. if (isa<AllocaInst>(BCI->getOperand(0)) || - isMalloc(BCI->getOperand(0))) { + isAllocationFn(BCI->getOperand(0))) { // See if the bitcast simplifies, if so, don't nuke this GEP yet. if (Instruction *I = visitBitCast(*BCI)) { if (I != BCI) { @@ -1078,7 +1081,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { } return new BitCastInst(BCI->getOperand(0), GEP.getType()); } - + // Otherwise, if the offset is non-zero, we need to find out if there is a // field at Offset in 'A's type. If so, we can pull the cast through the // GEP. @@ -1089,68 +1092,103 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { Value *NGEP = GEP.isInBounds() ? 
Builder->CreateInBoundsGEP(BCI->getOperand(0), NewIndices) : Builder->CreateGEP(BCI->getOperand(0), NewIndices); - + if (NGEP->getType() == GEP.getType()) return ReplaceInstUsesWith(GEP, NGEP); NGEP->takeName(&GEP); return new BitCastInst(NGEP, GEP.getType()); } } - } - + } + return 0; } -static bool IsOnlyNullComparedAndFreed(Value *V, SmallVectorImpl<WeakVH> &Users, - int Depth = 0) { - if (Depth == 8) - return false; +static bool +isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakVH> &Users) { + SmallVector<Instruction*, 4> Worklist; + Worklist.push_back(AI); - for (Value::use_iterator UI = V->use_begin(), UE = V->use_end(); - UI != UE; ++UI) { - User *U = *UI; - if (isFreeCall(U)) { - Users.push_back(U); - continue; - } - if (ICmpInst *ICI = dyn_cast<ICmpInst>(U)) { - if (ICI->isEquality() && isa<ConstantPointerNull>(ICI->getOperand(1))) { - Users.push_back(ICI); + do { + Instruction *PI = Worklist.pop_back_val(); + for (Value::use_iterator UI = PI->use_begin(), UE = PI->use_end(); UI != UE; + ++UI) { + Instruction *I = cast<Instruction>(*UI); + switch (I->getOpcode()) { + default: + // Give up the moment we see something we can't handle. + return false; + + case Instruction::BitCast: + case Instruction::GetElementPtr: + Users.push_back(I); + Worklist.push_back(I); continue; - } - } - if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) { - if (IsOnlyNullComparedAndFreed(BCI, Users, Depth+1)) { - Users.push_back(BCI); + + case Instruction::ICmp: { + ICmpInst *ICI = cast<ICmpInst>(I); + // We can fold eq/ne comparisons with null to false/true, respectively. + if (!ICI->isEquality() || !isa<ConstantPointerNull>(ICI->getOperand(1))) + return false; + Users.push_back(I); continue; } - } - if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) { - if (IsOnlyNullComparedAndFreed(GEPI, Users, Depth+1)) { - Users.push_back(GEPI); + + case Instruction::Call: + // Ignore no-op and store intrinsics. + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + default: + return false; + + case Intrinsic::memmove: + case Intrinsic::memcpy: + case Intrinsic::memset: { + MemIntrinsic *MI = cast<MemIntrinsic>(II); + if (MI->isVolatile() || MI->getRawDest() != PI) + return false; + } + // fall through + case Intrinsic::dbg_declare: + case Intrinsic::dbg_value: + case Intrinsic::invariant_start: + case Intrinsic::invariant_end: + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + case Intrinsic::objectsize: + Users.push_back(I); + continue; + } + } + + if (isFreeCall(I)) { + Users.push_back(I); + continue; + } + return false; + + case Instruction::Store: { + StoreInst *SI = cast<StoreInst>(I); + if (SI->isVolatile() || SI->getPointerOperand() != PI) + return false; + Users.push_back(I); continue; } - } - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) { - if (II->getIntrinsicID() == Intrinsic::lifetime_start || - II->getIntrinsicID() == Intrinsic::lifetime_end) { - Users.push_back(II); - continue; } + llvm_unreachable("missing a return?"); } - return false; - } + } while (!Worklist.empty()); return true; } -Instruction *InstCombiner::visitMalloc(Instruction &MI) { +Instruction *InstCombiner::visitAllocSite(Instruction &MI) { // If we have a malloc call which is only used in any amount of comparisons // to null and free calls, delete the calls and replace the comparisons with // true or false as appropriate. 
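The isAllocSiteRemovable walk above generalizes the old IsOnlyNullComparedAndFreed check: besides null compares and frees it now tolerates bitcasts, GEPs, non-volatile stores into the allocation, and a whitelist of no-op or memory intrinsics whose destination is the allocation, giving up on anything else. At the source level, a function shaped like the following (illustration, not part of the patch) now folds away completely:

#include <cstdlib>

// Nothing about the allocation escapes or is observable, so InstCombine
// can erase the whole pattern: the null test folds to a constant, the
// store into the dead memory is dropped, and malloc/free disappear.
static void no_observable_use(void) {
  int *p = static_cast<int *>(std::malloc(sizeof(int)));
  if (p != nullptr) {
    *p = 42;  // store whose pointer operand is the allocation itself
    std::free(p);
  }
}

int main() { no_observable_use(); return 0; }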
SmallVector<WeakVH, 64> Users; - if (IsOnlyNullComparedAndFreed(&MI, Users)) { + if (isAllocSiteRemovable(&MI, Users)) { for (unsigned i = 0, e = Users.size(); i != e; ++i) { Instruction *I = cast_or_null<Instruction>(&*Users[i]); if (!I) continue; @@ -1161,9 +1199,23 @@ Instruction *InstCombiner::visitMalloc(Instruction &MI) { C->isFalseWhenEqual())); } else if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) { ReplaceInstUsesWith(*I, UndefValue::get(I->getType())); + } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + if (II->getIntrinsicID() == Intrinsic::objectsize) { + ConstantInt *CI = cast<ConstantInt>(II->getArgOperand(1)); + uint64_t DontKnow = CI->isZero() ? -1ULL : 0; + ReplaceInstUsesWith(*I, ConstantInt::get(I->getType(), DontKnow)); + } } EraseInstFromFunction(*I); } + + if (InvokeInst *II = dyn_cast<InvokeInst>(&MI)) { + // Replace invoke with a NOP intrinsic to maintain the original CFG + Module *M = II->getParent()->getParent()->getParent(); + Function *F = Intrinsic::getDeclaration(M, Intrinsic::donothing); + InvokeInst::Create(F, II->getNormalDest(), II->getUnwindDest(), + ArrayRef<Value *>(), "", II->getParent()); + } return EraseInstFromFunction(MI); } return 0; @@ -1181,7 +1233,7 @@ Instruction *InstCombiner::visitFree(CallInst &FI) { UndefValue::get(Type::getInt1PtrTy(FI.getContext()))); return EraseInstFromFunction(FI); } - + // If we have 'free null' delete the instruction. This can happen in stl code // when lots of inlining happens. if (isa<ConstantPointerNull>(Op)) @@ -1207,14 +1259,14 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { // Cannonicalize fcmp_one -> fcmp_oeq FCmpInst::Predicate FPred; Value *Y; - if (match(&BI, m_Br(m_FCmp(FPred, m_Value(X), m_Value(Y)), + if (match(&BI, m_Br(m_FCmp(FPred, m_Value(X), m_Value(Y)), TrueDest, FalseDest)) && BI.getCondition()->hasOneUse()) if (FPred == FCmpInst::FCMP_ONE || FPred == FCmpInst::FCMP_OLE || FPred == FCmpInst::FCMP_OGE) { FCmpInst *Cond = cast<FCmpInst>(BI.getCondition()); Cond->setPredicate(FCmpInst::getInversePredicate(FPred)); - + // Swap Destinations and condition. BI.swapSuccessors(); Worklist.Add(Cond); @@ -1280,7 +1332,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { } return 0; // Can't handle other constants } - + if (InsertValueInst *IV = dyn_cast<InsertValueInst>(Agg)) { // We're extracting from an insertvalue instruction, compare the indices const unsigned *exti, *exte, *insi, *inse; @@ -1329,7 +1381,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { // %E = extractvalue { i32, { i32 } } %I, 1, 0 // with // %E extractvalue { i32 } { i32 42 }, 0 - return ExtractValueInst::Create(IV->getInsertedValueOperand(), + return ExtractValueInst::Create(IV->getInsertedValueOperand(), makeArrayRef(exti, exte)); } if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Agg)) { @@ -1349,7 +1401,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { EraseInstFromFunction(*II); return BinaryOperator::CreateAdd(LHS, RHS); } - + // If the normal result of the add is dead, and the RHS is a constant, // we can transform this into a range comparison. // overflow = uadd a, -4 --> overflow = icmp ugt a, 3 @@ -1798,7 +1850,7 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { /// many instructions are dead or constant). Additionally, if we find a branch /// whose condition is a known constant, we only visit the reachable successors. 
/// -static bool AddReachableCodeToWorklist(BasicBlock *BB, +static bool AddReachableCodeToWorklist(BasicBlock *BB, SmallPtrSet<BasicBlock*, 64> &Visited, InstCombiner &IC, const TargetData *TD, @@ -1812,13 +1864,13 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, do { BB = Worklist.pop_back_val(); - + // We have now visited this block! If we've already been here, ignore it. if (!Visited.insert(BB)) continue; for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) { Instruction *Inst = BBI++; - + // DCE instruction if trivially dead. if (isInstructionTriviallyDead(Inst)) { ++NumDeadInst; @@ -1826,7 +1878,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, Inst->eraseFromParent(); continue; } - + // ConstantProp instruction if trivially constant. if (!Inst->use_empty() && isa<Constant>(Inst->getOperand(0))) if (Constant *C = ConstantFoldInstruction(Inst, TD, TLI)) { @@ -1837,7 +1889,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, Inst->eraseFromParent(); continue; } - + if (TD) { // See if we can constant fold its operands. for (User::op_iterator i = Inst->op_begin(), e = Inst->op_end(); @@ -1881,17 +1933,17 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, Worklist.push_back(ReachableBB); continue; } - + // Otherwise it is the default destination. Worklist.push_back(SI->getDefaultDest()); continue; } } - + for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) Worklist.push_back(TI->getSuccessor(i)); } while (!Worklist.empty()); - + // Once we've found all of the instructions to add to instcombine's worklist, // add them in reverse order. This way instcombine will visit from the top // of the function down. This jives well with the way that it adds all uses @@ -1899,13 +1951,13 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, // some N^2 behavior in pathological cases. IC.Worklist.AddInitialGroup(&InstrsForInstCombineWorklist[0], InstrsForInstCombineWorklist.size()); - + return MadeIRChange; } bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { MadeIRChange = false; - + DEBUG(errs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on " << F.getName() << "\n"); @@ -1976,13 +2028,13 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { BasicBlock *BB = I->getParent(); Instruction *UserInst = cast<Instruction>(I->use_back()); BasicBlock *UserParent; - + // Get the block the use occurs in. if (PHINode *PN = dyn_cast<PHINode>(UserInst)) UserParent = PN->getIncomingBlock(I->use_begin().getUse()); else UserParent = UserInst->getParent(); - + if (UserParent != BB) { bool UserIsSuccessor = false; // See if the user is one of our successors. @@ -2004,7 +2056,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { // Now that we have an instruction, try combining it to simplify it. Builder->SetInsertPoint(I->getParent(), I); Builder->SetCurrentDebugLocation(I->getDebugLoc()); - + #ifndef NDEBUG std::string OrigI; #endif @@ -2069,14 +2121,14 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { bool InstCombiner::runOnFunction(Function &F) { TD = getAnalysisIfAvailable<TargetData>(); TLI = &getAnalysis<TargetLibraryInfo>(); - + /// Builder - This is an IRBuilder that automatically inserts new /// instructions into the worklist when they are created. 
- IRBuilder<true, TargetFolder, InstCombineIRInserter> + IRBuilder<true, TargetFolder, InstCombineIRInserter> TheBuilder(F.getContext(), TargetFolder(TD), InstCombineIRInserter(Worklist)); Builder = &TheBuilder; - + bool EverMadeChange = false; // Lower dbg.declare intrinsics otherwise their value may be clobbered @@ -2087,7 +2139,7 @@ bool InstCombiner::runOnFunction(Function &F) { unsigned Iteration = 0; while (DoOneIteration(F, Iteration++)) EverMadeChange = true; - + Builder = 0; return EverMadeChange; } diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 7fb33f7..3368026 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -16,6 +16,13 @@ #define DEBUG_TYPE "asan" #include "FunctionBlackList.h" +#include "llvm/Function.h" +#include "llvm/IRBuilder.h" +#include "llvm/InlineAsm.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/LLVMContext.h" +#include "llvm/Module.h" +#include "llvm/Type.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/SmallSet.h" @@ -23,14 +30,9 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Triple.h" -#include "llvm/Function.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/LLVMContext.h" -#include "llvm/Module.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/system_error.h" #include "llvm/Target/TargetData.h" @@ -38,7 +40,6 @@ #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/ModuleUtils.h" -#include "llvm/Type.h" #include <string> #include <algorithm> @@ -72,6 +73,9 @@ static const int kAsanStackMidRedzoneMagic = 0xf2; static const int kAsanStackRightRedzoneMagic = 0xf3; static const int kAsanStackPartialRedzoneMagic = 0xf4; +// Access sizes are powers of two: 1, 2, 4, 8, 16. +static const size_t kNumberOfAccessSizes = 5; + // Command-line flags. // This flag may need to be replaced with -f[no-]asan-reads. @@ -79,6 +83,20 @@ static cl::opt<bool> ClInstrumentReads("asan-instrument-reads", cl::desc("instrument read instructions"), cl::Hidden, cl::init(true)); static cl::opt<bool> ClInstrumentWrites("asan-instrument-writes", cl::desc("instrument write instructions"), cl::Hidden, cl::init(true)); +static cl::opt<bool> ClInstrumentAtomics("asan-instrument-atomics", + cl::desc("instrument atomic instructions (rmw, cmpxchg)"), + cl::Hidden, cl::init(true)); +static cl::opt<bool> ClMergeCallbacks("asan-merge-callbacks", + cl::desc("merge __asan_report_ callbacks to create fewer BBs"), + cl::Hidden, cl::init(false)); +// This flag limits the number of instructions to be instrumented +// in any given BB. Normally, this should be set to unlimited (INT_MAX), +// but due to http://llvm.org/bugs/show_bug.cgi?id=12652 we temporarily +// set it to 10000. +static cl::opt<int> ClMaxInsnsToInstrumentPerBB("asan-max-ins-per-bb", + cl::init(10000), + cl::desc("maximal number of instructions to instrument in any given BB"), + cl::Hidden); // This flag may need to be replaced with -f[no]asan-stack.
static cl::opt<bool> ClStack("asan-stack", cl::desc("Handle stack memory"), cl::Hidden, cl::init(true)); @@ -127,18 +145,42 @@ static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug man inst"), namespace { +/// When the crash callbacks are merged, they receive some amount of arguments +/// that are merged in a PHI node. This struct represents arguments from one +/// call site. +struct CrashArg { + Value *Arg1; + Value *Arg2; +}; + +/// An object of this type is created while instrumenting every function. +struct AsanFunctionContext { + AsanFunctionContext(Function &Function) : F(Function), CrashBlock() { } + + Function &F; + // These are initially zero. If we require at least one call to + // __asan_report_{read,write}{1,2,4,8,16}, an appropriate BB is created. + BasicBlock *CrashBlock[2][kNumberOfAccessSizes]; + typedef SmallVector<CrashArg, 8> CrashArgsVec; + CrashArgsVec CrashArgs[2][kNumberOfAccessSizes]; +}; + /// AddressSanitizer: instrument the code in module to find memory bugs. struct AddressSanitizer : public ModulePass { AddressSanitizer(); virtual const char *getPassName() const; - void instrumentMop(Instruction *I); - void instrumentAddress(Instruction *OrigIns, IRBuilder<> &IRB, + void instrumentMop(AsanFunctionContext &AFC, Instruction *I); + void instrumentAddress(AsanFunctionContext &AFC, + Instruction *OrigIns, IRBuilder<> &IRB, Value *Addr, uint32_t TypeSize, bool IsWrite); - Instruction *generateCrashCode(IRBuilder<> &IRB, Value *Addr, - bool IsWrite, uint32_t TypeSize); - bool instrumentMemIntrinsic(MemIntrinsic *MI); - void instrumentMemIntrinsicParam(Instruction *OrigIns, Value *Addr, - Value *Size, + Value *createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong, + Value *ShadowValue, uint32_t TypeSize); + Instruction *generateCrashCode(BasicBlock *BB, Value *Addr, Value *PC, + bool IsWrite, size_t AccessSizeIndex); + bool instrumentMemIntrinsic(AsanFunctionContext &AFC, MemIntrinsic *MI); + void instrumentMemIntrinsicParam(AsanFunctionContext &AFC, + Instruction *OrigIns, Value *Addr, + Value *Size, Instruction *InsertBefore, bool IsWrite); Value *memToShadow(Value *Shadow, IRBuilder<> &IRB); bool handleFunction(Module &M, Function &F); @@ -146,7 +188,6 @@ struct AddressSanitizer : public ModulePass { bool poisonStackInFunction(Module &M, Function &F); virtual bool runOnModule(Module &M); bool insertGlobalRedzones(Module &M); - BranchInst *splitBlockAndInsertIfThen(Instruction *SplitBefore, Value *Cmp); static char ID; // Pass identification, replacement for typeid private: @@ -165,11 +206,11 @@ struct AddressSanitizer : public ModulePass { return getAlignedSize(SizeInBytes); } + Function *checkInterfaceFunction(Constant *FuncOrBitcast); void PoisonStack(const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> IRB, Value *ShadowBase, bool DoPoison); bool LooksLikeCodeInBug11395(Instruction *I); - Module *CurrentModule; LLVMContext *C; TargetData *TD; uint64_t MappingOffset; @@ -182,7 +223,11 @@ struct AddressSanitizer : public ModulePass { Function *AsanInitFunction; Instruction *CtorInsertBefore; OwningPtr<FunctionBlackList> BL; + // This array is indexed by AccessIsWrite and log2(AccessSize). 
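That two-dimensional indexing (AccessIsWrite picks the row, log2 of the access size in bytes picks the column) is what TypeSizeToSizeIndex, defined just below, computes via CountTrailingZeros_32. The mapping in plain C++, with the LLVM helper replaced by a loop (illustration, not part of the patch):

#include <cassert>
#include <cstdint>

// Mirrors TypeSizeToSizeIndex: bits -> bytes -> log2(bytes).
static unsigned sizeIndex(uint32_t TypeSizeInBits) {
  unsigned Idx = 0;
  for (uint32_t Bytes = TypeSizeInBits / 8; Bytes > 1; Bytes >>= 1)
    ++Idx;
  return Idx;
}

int main() {
  // 1-, 2-, 4-, 8- and 16-byte accesses map to indices 0..4, matching
  // kNumberOfAccessSizes == 5.
  assert(sizeIndex(8) == 0 && sizeIndex(16) == 1 && sizeIndex(32) == 2);
  assert(sizeIndex(64) == 3 && sizeIndex(128) == 4);
  return 0;
}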
+ Function *AsanErrorCallback[2][kNumberOfAccessSizes]; + InlineAsm *EmptyAsm; }; + } // namespace char AddressSanitizer::ID = 0; @@ -198,6 +243,12 @@ const char *AddressSanitizer::getPassName() const { return "AddressSanitizer"; } +static size_t TypeSizeToSizeIndex(uint32_t TypeSize) { + size_t Res = CountTrailingZeros_32(TypeSize / 8); + assert(Res < kNumberOfAccessSizes); + return Res; +} + // Create a constant for Str so that we can pass it to the run-time lib. static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str) { Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str); @@ -208,29 +259,32 @@ static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str) { // Split the basic block and insert an if-then code. // Before: // Head -// SplitBefore +// Cmp // Tail // After: // Head // if (Cmp) -// NewBasicBlock -// SplitBefore +// ThenBlock // Tail // -// Returns the NewBasicBlock's terminator. -BranchInst *AddressSanitizer::splitBlockAndInsertIfThen( - Instruction *SplitBefore, Value *Cmp) { +// If ThenBlock is zero, a new block is created and its terminator is returned. +// Otherwize 0 is returned. +static BranchInst *splitBlockAndInsertIfThen(Value *Cmp, + BasicBlock *ThenBlock = 0) { + Instruction *SplitBefore = cast<Instruction>(Cmp)->getNextNode(); BasicBlock *Head = SplitBefore->getParent(); BasicBlock *Tail = Head->splitBasicBlock(SplitBefore); TerminatorInst *HeadOldTerm = Head->getTerminator(); - BasicBlock *NewBasicBlock = - BasicBlock::Create(*C, "", Head->getParent()); - BranchInst *HeadNewTerm = BranchInst::Create(/*ifTrue*/NewBasicBlock, - /*ifFalse*/Tail, - Cmp); + BranchInst *CheckTerm = 0; + if (!ThenBlock) { + LLVMContext &C = Head->getParent()->getParent()->getContext(); + ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail); + CheckTerm = BranchInst::Create(Tail, ThenBlock); + } + BranchInst *HeadNewTerm = + BranchInst::Create(/*ifTrue*/ThenBlock, /*ifFalse*/Tail, Cmp); ReplaceInstWithInst(HeadOldTerm, HeadNewTerm); - BranchInst *CheckTerm = BranchInst::Create(Tail, NewBasicBlock); return CheckTerm; } @@ -244,12 +298,13 @@ Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) { MappingOffset)); } -void AddressSanitizer::instrumentMemIntrinsicParam(Instruction *OrigIns, +void AddressSanitizer::instrumentMemIntrinsicParam( + AsanFunctionContext &AFC, Instruction *OrigIns, Value *Addr, Value *Size, Instruction *InsertBefore, bool IsWrite) { // Check the first byte. { IRBuilder<> IRB(InsertBefore); - instrumentAddress(OrigIns, IRB, Addr, 8, IsWrite); + instrumentAddress(AFC, OrigIns, IRB, Addr, 8, IsWrite); } // Check the last byte. { @@ -259,15 +314,16 @@ void AddressSanitizer::instrumentMemIntrinsicParam(Instruction *OrigIns, SizeMinusOne = IRB.CreateIntCast(SizeMinusOne, IntptrTy, false); Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); Value *AddrPlusSizeMinisOne = IRB.CreateAdd(AddrLong, SizeMinusOne); - instrumentAddress(OrigIns, IRB, AddrPlusSizeMinisOne, 8, IsWrite); + instrumentAddress(AFC, OrigIns, IRB, AddrPlusSizeMinisOne, 8, IsWrite); } } // Instrument memset/memmove/memcpy -bool AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) { +bool AddressSanitizer::instrumentMemIntrinsic(AsanFunctionContext &AFC, + MemIntrinsic *MI) { Value *Dst = MI->getDest(); MemTransferInst *MemTran = dyn_cast<MemTransferInst>(MI); - Value *Src = MemTran ? MemTran->getSource() : NULL; + Value *Src = MemTran ? 
MemTran->getSource() : 0; Value *Length = MI->getLength(); Constant *ConstLength = dyn_cast<Constant>(Length); @@ -279,26 +335,46 @@ bool AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) { IRBuilder<> IRB(InsertBefore); Value *Cmp = IRB.CreateICmpNE(Length, - Constant::getNullValue(Length->getType())); - InsertBefore = splitBlockAndInsertIfThen(InsertBefore, Cmp); + Constant::getNullValue(Length->getType())); + InsertBefore = splitBlockAndInsertIfThen(Cmp); } - instrumentMemIntrinsicParam(MI, Dst, Length, InsertBefore, true); + instrumentMemIntrinsicParam(AFC, MI, Dst, Length, InsertBefore, true); if (Src) - instrumentMemIntrinsicParam(MI, Src, Length, InsertBefore, false); + instrumentMemIntrinsicParam(AFC, MI, Src, Length, InsertBefore, false); return true; } -static Value *getLDSTOperand(Instruction *I) { +// If I is an interesting memory access, return the PointerOperand +// and set IsWrite. Otherwise return NULL. +static Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite) { if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + if (!ClInstrumentReads) return NULL; + *IsWrite = false; return LI->getPointerOperand(); } - return cast<StoreInst>(*I).getPointerOperand(); + if (StoreInst *SI = dyn_cast<StoreInst>(I)) { + if (!ClInstrumentWrites) return NULL; + *IsWrite = true; + return SI->getPointerOperand(); + } + if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) { + if (!ClInstrumentAtomics) return NULL; + *IsWrite = true; + return RMW->getPointerOperand(); + } + if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) { + if (!ClInstrumentAtomics) return NULL; + *IsWrite = true; + return XCHG->getPointerOperand(); + } + return NULL; } -void AddressSanitizer::instrumentMop(Instruction *I) { - int IsWrite = isa<StoreInst>(*I); - Value *Addr = getLDSTOperand(I); +void AddressSanitizer::instrumentMop(AsanFunctionContext &AFC, Instruction *I) { + bool IsWrite; + Value *Addr = isInterestingMemoryAccess(I, &IsWrite); + assert(Addr); if (ClOpt && ClOptGlobals && isa<GlobalVariable>(Addr)) { // We are accessing a global scalar variable. Nothing to catch here. return; @@ -316,22 +392,57 @@ void AddressSanitizer::instrumentMop(Instruction *I) { } IRBuilder<> IRB(I); - instrumentAddress(I, IRB, Addr, TypeSize, IsWrite); + instrumentAddress(AFC, I, IRB, Addr, TypeSize, IsWrite); +} + +// Validate the result of Module::getOrInsertFunction called for an interface +// function of AddressSanitizer. If the instrumented module defines a function +// with the same name, their prototypes must match, otherwise +// getOrInsertFunction returns a bitcast. +Function *AddressSanitizer::checkInterfaceFunction(Constant *FuncOrBitcast) { + if (isa<Function>(FuncOrBitcast)) return cast<Function>(FuncOrBitcast); + FuncOrBitcast->dump(); + report_fatal_error("trying to redefine an AddressSanitizer " + "interface function"); } Instruction *AddressSanitizer::generateCrashCode( - IRBuilder<> &IRB, Value *Addr, bool IsWrite, uint32_t TypeSize) { - // IsWrite and TypeSize are encoded in the function name. - std::string FunctionName = std::string(kAsanReportErrorTemplate) + - (IsWrite ? 
"store" : "load") + itostr(TypeSize / 8); - Value *ReportWarningFunc = CurrentModule->getOrInsertFunction( - FunctionName, IRB.getVoidTy(), IntptrTy, NULL); - CallInst *Call = IRB.CreateCall(ReportWarningFunc, Addr); - Call->setDoesNotReturn(); + BasicBlock *BB, Value *Addr, Value *PC, + bool IsWrite, size_t AccessSizeIndex) { + IRBuilder<> IRB(BB->getFirstNonPHI()); + CallInst *Call; + if (PC) + Call = IRB.CreateCall2(AsanErrorCallback[IsWrite][AccessSizeIndex], + Addr, PC); + else + Call = IRB.CreateCall(AsanErrorCallback[IsWrite][AccessSizeIndex], Addr); + // We don't do Call->setDoesNotReturn() because the BB already has + // UnreachableInst at the end. + // This EmptyAsm is required to avoid callback merge. + IRB.CreateCall(EmptyAsm); return Call; } -void AddressSanitizer::instrumentAddress(Instruction *OrigIns, +Value *AddressSanitizer::createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong, + Value *ShadowValue, + uint32_t TypeSize) { + size_t Granularity = 1 << MappingScale; + // Addr & (Granularity - 1) + Value *LastAccessedByte = IRB.CreateAnd( + AddrLong, ConstantInt::get(IntptrTy, Granularity - 1)); + // (Addr & (Granularity - 1)) + size - 1 + if (TypeSize / 8 > 1) + LastAccessedByte = IRB.CreateAdd( + LastAccessedByte, ConstantInt::get(IntptrTy, TypeSize / 8 - 1)); + // (uint8_t) ((Addr & (Granularity-1)) + size - 1) + LastAccessedByte = IRB.CreateIntCast( + LastAccessedByte, IRB.getInt8Ty(), false); + // ((uint8_t) ((Addr & (Granularity-1)) + size - 1)) >= ShadowValue + return IRB.CreateICmpSGE(LastAccessedByte, ShadowValue); +} + +void AddressSanitizer::instrumentAddress(AsanFunctionContext &AFC, + Instruction *OrigIns, IRBuilder<> &IRB, Value *Addr, uint32_t TypeSize, bool IsWrite) { Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); @@ -346,31 +457,47 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, Value *Cmp = IRB.CreateICmpNE(ShadowValue, CmpVal); - Instruction *CheckTerm = splitBlockAndInsertIfThen( - cast<Instruction>(Cmp)->getNextNode(), Cmp); - IRBuilder<> IRB2(CheckTerm); + BasicBlock *CrashBlock = 0; + if (ClMergeCallbacks) { + size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize); + BasicBlock **Cached = &AFC.CrashBlock[IsWrite][AccessSizeIndex]; + if (!*Cached) { + std::string BBName("crash_bb-"); + BBName += (IsWrite ? "w-" : "r-") + itostr(1 << AccessSizeIndex); + BasicBlock *BB = BasicBlock::Create(*C, BBName, &AFC.F); + new UnreachableInst(*C, BB); + *Cached = BB; + } + CrashBlock = *Cached; + // We need to pass the PC as the second parameter to __asan_report_*. + // There are few problems: + // - Some architectures (e.g. x86_32) don't have a cheap way to get the PC. + // - LLVM doesn't have the appropriate intrinsic. + // For now, put a random number into the PC, just to allow experiments. 
+ Value *PC = ConstantInt::get(IntptrTy, rand()); + CrashArg Arg = {AddrLong, PC}; + AFC.CrashArgs[IsWrite][AccessSizeIndex].push_back(Arg); + } else { + CrashBlock = BasicBlock::Create(*C, "crash_bb", &AFC.F); + new UnreachableInst(*C, CrashBlock); + size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize); + Instruction *Crash = + generateCrashCode(CrashBlock, AddrLong, 0, IsWrite, AccessSizeIndex); + Crash->setDebugLoc(OrigIns->getDebugLoc()); + } size_t Granularity = 1 << MappingScale; if (TypeSize < 8 * Granularity) { - // Addr & (Granularity - 1) - Value *Lower3Bits = IRB2.CreateAnd( - AddrLong, ConstantInt::get(IntptrTy, Granularity - 1)); - // (Addr & (Granularity - 1)) + size - 1 - Value *LastAccessedByte = IRB2.CreateAdd( - Lower3Bits, ConstantInt::get(IntptrTy, TypeSize / 8 - 1)); - // (uint8_t) ((Addr & (Granularity-1)) + size - 1) - LastAccessedByte = IRB2.CreateIntCast( - LastAccessedByte, IRB.getInt8Ty(), false); - // ((uint8_t) ((Addr & (Granularity-1)) + size - 1)) >= ShadowValue - Value *Cmp2 = IRB2.CreateICmpSGE(LastAccessedByte, ShadowValue); - - CheckTerm = splitBlockAndInsertIfThen(CheckTerm, Cmp2); - } - - IRBuilder<> IRB1(CheckTerm); - Instruction *Crash = generateCrashCode(IRB1, AddrLong, IsWrite, TypeSize); - Crash->setDebugLoc(OrigIns->getDebugLoc()); - ReplaceInstWithInst(CheckTerm, new UnreachableInst(*C)); + BranchInst *CheckTerm = splitBlockAndInsertIfThen(Cmp); + assert(CheckTerm->isUnconditional()); + BasicBlock *NextBB = CheckTerm->getSuccessor(0); + IRB.SetInsertPoint(CheckTerm); + Value *Cmp2 = createSlowPathCmp(IRB, AddrLong, ShadowValue, TypeSize); + BranchInst *NewTerm = BranchInst::Create(CrashBlock, NextBB, Cmp2); + ReplaceInstWithInst(CheckTerm, NewTerm); + } else { + splitBlockAndInsertIfThen(Cmp, CrashBlock); + } } // This function replaces all global variables with new variables that have @@ -475,7 +602,7 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) { // Create a new global variable with enough space for a redzone. 
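insertGlobalRedzones, continued below, swaps each eligible global for a copy padded with a right redzone and has a generated module constructor hand a descriptor array to the runtime, with a matching destructor to unregister it. A rough single-global picture, with stand-in runtime calls and an illustrative redzone size, since the descriptor layout and the exact size math live outside this excerpt:

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Stand-in for __asan_register_globals so the sketch runs without the runtime.
static void register_globals(uintptr_t Descriptors, uintptr_t N) {
  std::printf("registering %llu global(s)\n",
              static_cast<unsigned long long>(N));
}

constexpr std::size_t kRZ = 32;    // illustrative; the pass uses RedzoneSize
struct alignas(kRZ) PaddedGlobal { // NewGlobal gets setAlignment(RedzoneSize)
  int g;                           // the original global's data
  char rz[kRZ - sizeof(int)];      // right redzone, poisoned by the runtime
};
static PaddedGlobal g_padded;
static uintptr_t descriptors[] = { // shape only; the real descriptors differ
    reinterpret_cast<uintptr_t>(&g_padded)};

__attribute__((constructor)) static void module_ctor() { // the ASan module ctor
  register_globals(reinterpret_cast<uintptr_t>(descriptors), 1);
}

int main() { return 0; }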
GlobalVariable *NewGlobal = new GlobalVariable( M, NewTy, G->isConstant(), G->getLinkage(), - NewInitializer, "", G, G->isThreadLocal()); + NewInitializer, "", G, G->getThreadLocalMode()); NewGlobal->copyAttributesFrom(G); NewGlobal->setAlignment(RedzoneSize); @@ -503,7 +630,7 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) { M, ArrayOfGlobalStructTy, false, GlobalVariable::PrivateLinkage, ConstantArray::get(ArrayOfGlobalStructTy, Initializers), ""); - Function *AsanRegisterGlobals = cast<Function>(M.getOrInsertFunction( + Function *AsanRegisterGlobals = checkInterfaceFunction(M.getOrInsertFunction( kAsanRegisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); AsanRegisterGlobals->setLinkage(Function::ExternalLinkage); @@ -518,8 +645,10 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) { GlobalValue::InternalLinkage, kAsanModuleDtorName, &M); BasicBlock *AsanDtorBB = BasicBlock::Create(*C, "", AsanDtorFunction); IRBuilder<> IRB_Dtor(ReturnInst::Create(*C, AsanDtorBB)); - Function *AsanUnregisterGlobals = cast<Function>(M.getOrInsertFunction( - kAsanUnregisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); + Function *AsanUnregisterGlobals = + checkInterfaceFunction(M.getOrInsertFunction( + kAsanUnregisterGlobalsName, + IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); AsanUnregisterGlobals->setLinkage(Function::ExternalLinkage); IRB_Dtor.CreateCall2(AsanUnregisterGlobals, @@ -539,7 +668,6 @@ bool AddressSanitizer::runOnModule(Module &M) { return false; BL.reset(new FunctionBlackList(ClBlackListFile)); - CurrentModule = &M; C = &(M.getContext()); LongSize = TD->getPointerSizeInBits(); IntptrTy = Type::getIntNTy(*C, LongSize); @@ -553,11 +681,33 @@ bool AddressSanitizer::runOnModule(Module &M) { // call __asan_init in the module ctor. IRBuilder<> IRB(CtorInsertBefore); - AsanInitFunction = cast<Function>( + AsanInitFunction = checkInterfaceFunction( M.getOrInsertFunction(kAsanInitName, IRB.getVoidTy(), NULL)); AsanInitFunction->setLinkage(Function::ExternalLinkage); IRB.CreateCall(AsanInitFunction); + // Create __asan_report* callbacks. + for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) { + for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes; + AccessSizeIndex++) { + // IsWrite and TypeSize are encoded in the function name. + std::string FunctionName = std::string(kAsanReportErrorTemplate) + + (AccessIsWrite ? "store" : "load") + itostr(1 << AccessSizeIndex); + // If we are merging crash callbacks, they have two parameters. + if (ClMergeCallbacks) + AsanErrorCallback[AccessIsWrite][AccessSizeIndex] = cast<Function>( + M.getOrInsertFunction(FunctionName, IRB.getVoidTy(), IntptrTy, + IntptrTy, NULL)); + else + AsanErrorCallback[AccessIsWrite][AccessSizeIndex] = cast<Function>( + M.getOrInsertFunction(FunctionName, IRB.getVoidTy(), IntptrTy, NULL)); + } + } + // We insert an empty inline asm after __asan_report* to avoid callback merge. + EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false), + StringRef(""), StringRef(""), + /*hasSideEffects=*/true); + llvm::Triple targetTriple(M.getTargetTriple()); bool isAndroid = targetTriple.getEnvironment() == llvm::Triple::ANDROIDEABI; @@ -645,17 +795,17 @@ bool AddressSanitizer::handleFunction(Module &M, Function &F) { SmallSet<Value*, 16> TempsToInstrument; SmallVector<Instruction*, 16> ToInstrument; SmallVector<Instruction*, 8> NoReturnCalls; + bool IsWrite; // Fill the set of memory operations to instrument. 
for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { TempsToInstrument.clear(); + int NumInsnsPerBB = 0; for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ++BI) { if (LooksLikeCodeInBug11395(BI)) return false; - if ((isa<LoadInst>(BI) && ClInstrumentReads) || - (isa<StoreInst>(BI) && ClInstrumentWrites)) { - Value *Addr = getLDSTOperand(BI); + if (Value *Addr = isInterestingMemoryAccess(BI, &IsWrite)) { if (ClOpt && ClOptSameTemp) { if (!TempsToInstrument.insert(Addr)) continue; // We've seen this temp in the current BB. @@ -673,23 +823,55 @@ bool AddressSanitizer::handleFunction(Module &M, Function &F) { continue; } ToInstrument.push_back(BI); + NumInsnsPerBB++; + if (NumInsnsPerBB >= ClMaxInsnsToInstrumentPerBB) + break; } } + AsanFunctionContext AFC(F); + // Instrument. int NumInstrumented = 0; for (size_t i = 0, n = ToInstrument.size(); i != n; i++) { Instruction *Inst = ToInstrument[i]; if (ClDebugMin < 0 || ClDebugMax < 0 || (NumInstrumented >= ClDebugMin && NumInstrumented <= ClDebugMax)) { - if (isa<StoreInst>(Inst) || isa<LoadInst>(Inst)) - instrumentMop(Inst); + if (isInterestingMemoryAccess(Inst, &IsWrite)) + instrumentMop(AFC, Inst); else - instrumentMemIntrinsic(cast<MemIntrinsic>(Inst)); + instrumentMemIntrinsic(AFC, cast<MemIntrinsic>(Inst)); } NumInstrumented++; } + // Create PHI nodes and crash callbacks if we are merging crash callbacks. + if (NumInstrumented) { + for (size_t IsWrite = 0; IsWrite <= 1; IsWrite++) { + for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes; + AccessSizeIndex++) { + BasicBlock *BB = AFC.CrashBlock[IsWrite][AccessSizeIndex]; + if (!BB) continue; + assert(ClMergeCallbacks); + AsanFunctionContext::CrashArgsVec &Args = + AFC.CrashArgs[IsWrite][AccessSizeIndex]; + IRBuilder<> IRB(BB->getFirstNonPHI()); + size_t n = Args.size(); + PHINode *PN1 = IRB.CreatePHI(IntptrTy, n); + PHINode *PN2 = IRB.CreatePHI(IntptrTy, n); + // We need to match crash parameters and the predecessors. + for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); + PI != PE; ++PI) { + n--; + PN1->addIncoming(Args[n].Arg1, *PI); + PN2->addIncoming(Args[n].Arg2, *PI); + } + assert(n == 0); + generateCrashCode(BB, PN1, PN2, IsWrite, AccessSizeIndex); + } + } + } + DEBUG(dbgs() << F); bool ChangedStack = poisonStackInFunction(M, F); diff --git a/lib/Transforms/Instrumentation/BoundsChecking.cpp b/lib/Transforms/Instrumentation/BoundsChecking.cpp new file mode 100644 index 0000000..09e0f14 --- /dev/null +++ b/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -0,0 +1,209 @@ +//===- BoundsChecking.cpp - Instrumentation for run-time bounds checking --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a pass that instruments the code to perform run-time +// bounds checking on loads, stores, and other memory intrinsics. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "bounds-checking" +#include "llvm/IRBuilder.h" +#include "llvm/Intrinsics.h" +#include "llvm/Pass.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/InstIterator.h" +#include "llvm/Support/TargetFolder.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Instrumentation.h" +using namespace llvm; + +static cl::opt<bool> SingleTrapBB("bounds-checking-single-trap", + cl::desc("Use one trap block per function")); + +STATISTIC(ChecksAdded, "Bounds checks added"); +STATISTIC(ChecksSkipped, "Bounds checks skipped"); +STATISTIC(ChecksUnable, "Bounds checks unable to add"); + +typedef IRBuilder<true, TargetFolder> BuilderTy; + +namespace { + struct BoundsChecking : public FunctionPass { + static char ID; + + BoundsChecking(unsigned _Penalty = 5) : FunctionPass(ID), Penalty(_Penalty){ + initializeBoundsCheckingPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnFunction(Function &F); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetData>(); + } + + private: + const TargetData *TD; + ObjectSizeOffsetEvaluator *ObjSizeEval; + BuilderTy *Builder; + Instruction *Inst; + BasicBlock *TrapBB; + unsigned Penalty; + + BasicBlock *getTrapBB(); + void emitBranchToTrap(Value *Cmp = 0); + bool computeAllocSize(Value *Ptr, APInt &Offset, Value* &OffsetValue, + APInt &Size, Value* &SizeValue); + bool instrument(Value *Ptr, Value *Val); + }; +} + +char BoundsChecking::ID = 0; +INITIALIZE_PASS(BoundsChecking, "bounds-checking", "Run-time bounds checking", + false, false) + + +/// getTrapBB - create a basic block that traps. All overflowing conditions +/// branch to this block. There's only one trap block per function. +BasicBlock *BoundsChecking::getTrapBB() { + if (TrapBB && SingleTrapBB) + return TrapBB; + + Function *Fn = Inst->getParent()->getParent(); + BasicBlock::iterator PrevInsertPoint = Builder->GetInsertPoint(); + TrapBB = BasicBlock::Create(Fn->getContext(), "trap", Fn); + Builder->SetInsertPoint(TrapBB); + + llvm::Value *F = Intrinsic::getDeclaration(Fn->getParent(), Intrinsic::trap); + CallInst *TrapCall = Builder->CreateCall(F); + TrapCall->setDoesNotReturn(); + TrapCall->setDoesNotThrow(); + TrapCall->setDebugLoc(Inst->getDebugLoc()); + Builder->CreateUnreachable(); + + Builder->SetInsertPoint(PrevInsertPoint); + return TrapBB; +} + + +/// emitBranchToTrap - emit a branch instruction to a trap block. +/// If Cmp is non-null, perform a jump only if its value evaluates to true. +void BoundsChecking::emitBranchToTrap(Value *Cmp) { + // check if the comparison is always false + ConstantInt *C = dyn_cast_or_null<ConstantInt>(Cmp); + if (C) { + ++ChecksSkipped; + if (!C->getZExtValue()) + return; + else + Cmp = 0; // unconditional branch + } + + Instruction *Inst = Builder->GetInsertPoint(); + BasicBlock *OldBB = Inst->getParent(); + BasicBlock *Cont = OldBB->splitBasicBlock(Inst); + OldBB->getTerminator()->eraseFromParent(); + + if (Cmp) + BranchInst::Create(getTrapBB(), Cont, Cmp, OldBB); + else + BranchInst::Create(getTrapBB(), OldBB); +} + + +/// instrument - adds run-time bounds checks to memory accessing instructions. +/// Ptr is the pointer that will be read/written, and InstVal is either the +/// result from the load or the value being stored. 
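instrument(), whose doc comment continues just below, guards an access with the three unsigned comparisons its body spells out: Offset >= 0, Size >= Offset, and Size - Offset >= NeededSize, with the first check skipped when the size is a known non-negative constant. The same predicate in plain C++:

#include <cstdint>

// True if the guarded access must branch to the trap block. Size and Offset
// come from ObjectSizeOffsetEvaluator; NeededSize is the type's store size.
bool mustTrap(int64_t Size, int64_t Offset, uint64_t NeededSize,
              bool SizeKnownNonNegative) {
  uint64_t ObjSize = uint64_t(Size) - uint64_t(Offset); // Size - Offset
  bool Bad = uint64_t(Size) < uint64_t(Offset) ||       // Cmp2, unsigned
             ObjSize < NeededSize;                      // Cmp3, unsigned
  if (!SizeKnownNonNegative)
    Bad |= Offset < 0;                                  // Cmp1, signed
  return Bad;
}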
It is used to determine the +/// size of memory block that is touched. +/// Returns true if any change was made to the IR, false otherwise. +bool BoundsChecking::instrument(Value *Ptr, Value *InstVal) { + uint64_t NeededSize = TD->getTypeStoreSize(InstVal->getType()); + DEBUG(dbgs() << "Instrument " << *Ptr << " for " << Twine(NeededSize) + << " bytes\n"); + + SizeOffsetEvalType SizeOffset = ObjSizeEval->compute(Ptr); + + if (!ObjSizeEval->bothKnown(SizeOffset)) { + ++ChecksUnable; + return false; + } + + Value *Size = SizeOffset.first; + Value *Offset = SizeOffset.second; + ConstantInt *SizeCI = dyn_cast<ConstantInt>(Size); + + IntegerType *IntTy = TD->getIntPtrType(Inst->getContext()); + Value *NeededSizeVal = ConstantInt::get(IntTy, NeededSize); + + // three checks are required to ensure safety: + // . Offset >= 0 (since the offset is given from the base ptr) + // . Size >= Offset (unsigned) + // . Size - Offset >= NeededSize (unsigned) + // + // optimization: if Size >= 0 (signed), skip 1st check + // FIXME: add NSW/NUW here? -- we dont care if the subtraction overflows + Value *ObjSize = Builder->CreateSub(Size, Offset); + Value *Cmp2 = Builder->CreateICmpULT(Size, Offset); + Value *Cmp3 = Builder->CreateICmpULT(ObjSize, NeededSizeVal); + Value *Or = Builder->CreateOr(Cmp2, Cmp3); + if (!SizeCI || SizeCI->getValue().slt(0)) { + Value *Cmp1 = Builder->CreateICmpSLT(Offset, ConstantInt::get(IntTy, 0)); + Or = Builder->CreateOr(Cmp1, Or); + } + emitBranchToTrap(Or); + + ++ChecksAdded; + return true; +} + +bool BoundsChecking::runOnFunction(Function &F) { + TD = &getAnalysis<TargetData>(); + + TrapBB = 0; + BuilderTy TheBuilder(F.getContext(), TargetFolder(TD)); + Builder = &TheBuilder; + ObjectSizeOffsetEvaluator TheObjSizeEval(TD, F.getContext()); + ObjSizeEval = &TheObjSizeEval; + + // check HANDLE_MEMORY_INST in include/llvm/Instruction.def for memory + // touching instructions + std::vector<Instruction*> WorkList; + for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { + Instruction *I = &*i; + if (isa<LoadInst>(I) || isa<StoreInst>(I) || isa<AtomicCmpXchgInst>(I) || + isa<AtomicRMWInst>(I)) + WorkList.push_back(I); + } + + bool MadeChange = false; + for (std::vector<Instruction*>::iterator i = WorkList.begin(), + e = WorkList.end(); i != e; ++i) { + Inst = *i; + + Builder->SetInsertPoint(Inst); + if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + MadeChange |= instrument(LI->getPointerOperand(), LI); + } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + MadeChange |= instrument(SI->getPointerOperand(), SI->getValueOperand()); + } else if (AtomicCmpXchgInst *AI = dyn_cast<AtomicCmpXchgInst>(Inst)) { + MadeChange |= instrument(AI->getPointerOperand(),AI->getCompareOperand()); + } else if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst)) { + MadeChange |= instrument(AI->getPointerOperand(), AI->getValOperand()); + } else { + llvm_unreachable("unknown Instruction type"); + } + } + return MadeChange; +} + +FunctionPass *llvm::createBoundsCheckingPass(unsigned Penalty) { + return new BoundsChecking(Penalty); +} diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt index e4c8cf1..00de882 100644 --- a/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/lib/Transforms/Instrumentation/CMakeLists.txt @@ -1,5 +1,6 @@ add_llvm_library(LLVMInstrumentation AddressSanitizer.cpp + BoundsChecking.cpp EdgeProfiling.cpp FunctionBlackList.cpp GCOVProfiling.cpp @@ -9,3 +10,5 @@ add_llvm_library(LLVMInstrumentation 
ProfilingUtils.cpp ThreadSanitizer.cpp ) + +add_dependencies(LLVMInstrumentation intrinsics_gen) diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 96e5d5b..264a6a6 100644 --- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -18,22 +18,23 @@ #include "ProfilingUtils.h" #include "llvm/Transforms/Instrumentation.h" -#include "llvm/Analysis/DebugInfo.h" +#include "llvm/DebugInfo.h" +#include "llvm/IRBuilder.h" +#include "llvm/Instructions.h" #include "llvm/Module.h" #include "llvm/Pass.h" -#include "llvm/Instructions.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/DebugLoc.h" -#include "llvm/Support/InstIterator.h" -#include "llvm/Support/IRBuilder.h" -#include "llvm/Support/PathV2.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/Statistic.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/UniqueVector.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/DebugLoc.h" +#include "llvm/Support/InstIterator.h" +#include "llvm/Support/PathV2.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" #include <string> #include <utility> using namespace llvm; @@ -57,7 +58,6 @@ namespace { virtual const char *getPassName() const { return "GCOV Profiler"; } - private: bool runOnModule(Module &M); @@ -90,6 +90,7 @@ namespace { // list. void insertCounterWriteout(SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> &); + void insertIndirectCounterIncrement(); std::string mangleName(DICompileUnit CU, std::string NewStem); @@ -421,6 +422,7 @@ bool GCOVProfiler::emitProfileArcs() { if (!CU_Nodes) return false; bool Result = false; + bool InsertIndCounterIncrCode = false; for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) { DICompileUnit CU(CU_Nodes->getOperand(i)); DIArray SPs = CU.getSubprograms(); @@ -446,7 +448,7 @@ bool GCOVProfiler::emitProfileArcs() { new GlobalVariable(*M, CounterTy, false, GlobalValue::InternalLinkage, Constant::getNullValue(CounterTy), - "__llvm_gcov_ctr", 0, false, 0); + "__llvm_gcov_ctr"); CountersBySP.push_back(std::make_pair(Counters, (MDNode*)SP)); UniqueVector<BasicBlock *> ComplexEdgePreds; @@ -507,15 +509,21 @@ bool GCOVProfiler::emitProfileArcs() { Value *CounterPtrArray = Builder.CreateConstInBoundsGEP2_64(EdgeTable, 0, i * ComplexEdgePreds.size()); + + // Build code to increment the counter. 
+ InsertIndCounterIncrCode = true; Builder.CreateCall2(getIncrementIndirectCounterFunc(), EdgeState, CounterPtrArray); - // clear the predecessor number - Builder.CreateStore(ConstantInt::get(Int32Ty, 0xffffffff), EdgeState); } } } + insertCounterWriteout(CountersBySP); } + + if (InsertIndCounterIncrCode) + insertIndirectCounterIncrement(); + return Result; } @@ -574,13 +582,14 @@ Constant *GCOVProfiler::getStartFileFunc() { } Constant *GCOVProfiler::getIncrementIndirectCounterFunc() { + Type *Int32Ty = Type::getInt32Ty(*Ctx); + Type *Int64Ty = Type::getInt64Ty(*Ctx); Type *Args[] = { - Type::getInt32PtrTy(*Ctx), // uint32_t *predecessor - Type::getInt64PtrTy(*Ctx)->getPointerTo(), // uint64_t **state_table_row + Int32Ty->getPointerTo(), // uint32_t *predecessor + Int64Ty->getPointerTo()->getPointerTo() // uint64_t **counters }; - FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), - Args, false); - return M->getOrInsertFunction("llvm_gcda_increment_indirect_counter", FTy); + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false); + return M->getOrInsertFunction("__llvm_gcov_indirect_counter_increment", FTy); } Constant *GCOVProfiler::getEmitFunctionFunc() { @@ -588,8 +597,7 @@ Constant *GCOVProfiler::getEmitFunctionFunc() { Type::getInt32Ty(*Ctx), // uint32_t ident Type::getInt8PtrTy(*Ctx), // const char *function_name }; - FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), - Args, false); + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false); return M->getOrInsertFunction("llvm_gcda_emit_function", FTy); } @@ -665,5 +673,75 @@ void GCOVProfiler::insertCounterWriteout( } Builder.CreateRetVoid(); - InsertProfilingShutdownCall(WriteoutF, M); + // Create a small bit of code that registers the "__llvm_gcov_writeout" + // function to be executed at exit. + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false); + Function *F = Function::Create(FTy, GlobalValue::InternalLinkage, + "__llvm_gcov_init", M); + F->setUnnamedAddr(true); + F->setLinkage(GlobalValue::InternalLinkage); + F->addFnAttr(Attribute::NoInline); + + BB = BasicBlock::Create(*Ctx, "entry", F); + Builder.SetInsertPoint(BB); + + FTy = FunctionType::get(Type::getInt32Ty(*Ctx), + PointerType::get(FTy, 0), false); + Constant *AtExitFn = M->getOrInsertFunction("atexit", FTy); + Builder.CreateCall(AtExitFn, WriteoutF); + Builder.CreateRetVoid(); + + appendToGlobalCtors(*M, F, 0); +} + +void GCOVProfiler::insertIndirectCounterIncrement() { + Function *Fn = + cast<Function>(GCOVProfiler::getIncrementIndirectCounterFunc()); + Fn->setUnnamedAddr(true); + Fn->setLinkage(GlobalValue::InternalLinkage); + Fn->addFnAttr(Attribute::NoInline); + + Type *Int32Ty = Type::getInt32Ty(*Ctx); + Type *Int64Ty = Type::getInt64Ty(*Ctx); + Constant *NegOne = ConstantInt::get(Int32Ty, 0xffffffff); + + // Create basic blocks for function. 
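The IR built below for __llvm_gcov_indirect_counter_increment is already described piecewise by its C comments; gathered into one function, the logic is simply:

#include <cstdint>

// C rendition of the generated __llvm_gcov_indirect_counter_increment.
void indirectCounterIncrement(uint32_t *predecessor, uint64_t **counters) {
  uint32_t pred = *predecessor;
  if (pred == 0xffffffff)
    return;                    // no predecessor recorded for this edge
  uint64_t *counter = counters[pred];
  if (!counter)
    return;                    // this (pred, succ) pair has no counter cell
  ++*counter;
}

The same hunk also replaces InsertProfilingShutdownCall with a synthesized __llvm_gcov_init constructor that registers __llvm_gcov_writeout through atexit.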
+ BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", Fn); + IRBuilder<> Builder(BB); + + BasicBlock *PredNotNegOne = BasicBlock::Create(*Ctx, "", Fn); + BasicBlock *CounterEnd = BasicBlock::Create(*Ctx, "", Fn); + BasicBlock *Exit = BasicBlock::Create(*Ctx, "exit", Fn); + + // uint32_t pred = *predecessor; + // if (pred == 0xffffffff) return; + Argument *Arg = Fn->arg_begin(); + Arg->setName("predecessor"); + Value *Pred = Builder.CreateLoad(Arg, "pred"); + Value *Cond = Builder.CreateICmpEQ(Pred, NegOne); + BranchInst::Create(Exit, PredNotNegOne, Cond, BB); + + Builder.SetInsertPoint(PredNotNegOne); + + // uint64_t *counter = counters[pred]; + // if (!counter) return; + Value *ZExtPred = Builder.CreateZExt(Pred, Int64Ty); + Arg = llvm::next(Fn->arg_begin()); + Arg->setName("counters"); + Value *GEP = Builder.CreateGEP(Arg, ZExtPred); + Value *Counter = Builder.CreateLoad(GEP, "counter"); + Cond = Builder.CreateICmpEQ(Counter, + Constant::getNullValue(Int64Ty->getPointerTo())); + Builder.CreateCondBr(Cond, Exit, CounterEnd); + + // ++*counter; + Builder.SetInsertPoint(CounterEnd); + Value *Add = Builder.CreateAdd(Builder.CreateLoad(Counter), + ConstantInt::get(Int64Ty, 1)); + Builder.CreateStore(Add, Counter); + Builder.CreateBr(Exit); + + // Fill in the exit block. + Builder.SetInsertPoint(Exit); + Builder.CreateRetVoid(); } diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp index c7266e2..1e0b4a3 100644 --- a/lib/Transforms/Instrumentation/Instrumentation.cpp +++ b/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -20,11 +20,12 @@ using namespace llvm; /// initializeInstrumentation - Initialize all passes in the TransformUtils /// library. void llvm::initializeInstrumentation(PassRegistry &Registry) { + initializeAddressSanitizerPass(Registry); + initializeBoundsCheckingPass(Registry); initializeEdgeProfilerPass(Registry); + initializeGCOVProfilerPass(Registry); initializeOptimalEdgeProfilerPass(Registry); initializePathProfilerPass(Registry); - initializeGCOVProfilerPass(Registry); - initializeAddressSanitizerPass(Registry); initializeThreadSanitizerPass(Registry); } diff --git a/lib/Transforms/Instrumentation/PathProfiling.cpp b/lib/Transforms/Instrumentation/PathProfiling.cpp index b214796..cc27146 100644 --- a/lib/Transforms/Instrumentation/PathProfiling.cpp +++ b/lib/Transforms/Instrumentation/PathProfiling.cpp @@ -55,11 +55,11 @@ #include "llvm/LLVMContext.h" #include "llvm/Module.h" #include "llvm/Pass.h" +#include "llvm/TypeBuilder.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/CFG.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/TypeBuilder.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Instrumentation.h" diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 8bb337e..dc0fa71 100644 --- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -22,73 +22,73 @@ #define DEBUG_TYPE "tsan" #include "FunctionBlackList.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/Intrinsics.h" #include "llvm/Function.h" +#include "llvm/IRBuilder.h" +#include "llvm/Intrinsics.h" #include "llvm/LLVMContext.h" #include "llvm/Metadata.h" #include "llvm/Module.h" +#include 
"llvm/Type.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetData.h" #include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/ModuleUtils.h" -#include "llvm/Type.h" using namespace llvm; static cl::opt<std::string> ClBlackListFile("tsan-blacklist", cl::desc("Blacklist file"), cl::Hidden); -static cl::opt<bool> ClPrintStats("tsan-print-stats", - cl::desc("Print ThreadSanitizer instrumentation stats"), cl::Hidden); +STATISTIC(NumInstrumentedReads, "Number of instrumented reads"); +STATISTIC(NumInstrumentedWrites, "Number of instrumented writes"); +STATISTIC(NumOmittedReadsBeforeWrite, + "Number of reads ignored due to following writes"); +STATISTIC(NumAccessesWithBadSize, "Number of accesses with bad size"); +STATISTIC(NumInstrumentedVtableWrites, "Number of vtable ptr writes"); +STATISTIC(NumOmittedReadsFromConstantGlobals, + "Number of reads from constant globals"); +STATISTIC(NumOmittedReadsFromVtable, "Number of vtable reads"); namespace { -// Stats counters for ThreadSanitizer instrumentation. -struct ThreadSanitizerStats { - size_t NumInstrumentedReads; - size_t NumInstrumentedWrites; - size_t NumOmittedReadsBeforeWrite; - size_t NumAccessesWithBadSize; - size_t NumInstrumentedVtableWrites; - size_t NumOmittedReadsFromConstantGlobals; - size_t NumOmittedReadsFromVtable; -}; - /// ThreadSanitizer: instrument the code in module to find races. struct ThreadSanitizer : public FunctionPass { ThreadSanitizer(); + const char *getPassName() const; bool runOnFunction(Function &F); bool doInitialization(Module &M); - bool doFinalization(Module &M); - bool instrumentLoadOrStore(Instruction *I); static char ID; // Pass identification, replacement for typeid. private: - void choseInstructionsToInstrument(SmallVectorImpl<Instruction*> &Local, - SmallVectorImpl<Instruction*> &All); + bool instrumentLoadOrStore(Instruction *I); + bool instrumentAtomic(Instruction *I); + void chooseInstructionsToInstrument(SmallVectorImpl<Instruction*> &Local, + SmallVectorImpl<Instruction*> &All); bool addrPointsToConstantData(Value *Addr); + int getMemoryAccessFuncIndex(Value *Addr); TargetData *TD; OwningPtr<FunctionBlackList> BL; + IntegerType *OrdTy; // Callbacks to run-time library are computed in doInitialization. - Value *TsanFuncEntry; - Value *TsanFuncExit; + Function *TsanFuncEntry; + Function *TsanFuncExit; // Accesses sizes are powers of two: 1, 2, 4, 8, 16. static const size_t kNumberOfAccessSizes = 5; - Value *TsanRead[kNumberOfAccessSizes]; - Value *TsanWrite[kNumberOfAccessSizes]; - Value *TsanVptrUpdate; - - // Stats are modified w/o synchronization. 
- ThreadSanitizerStats stats; + Function *TsanRead[kNumberOfAccessSizes]; + Function *TsanWrite[kNumberOfAccessSizes]; + Function *TsanAtomicLoad[kNumberOfAccessSizes]; + Function *TsanAtomicStore[kNumberOfAccessSizes]; + Function *TsanVptrUpdate; }; } // namespace @@ -97,6 +97,10 @@ INITIALIZE_PASS(ThreadSanitizer, "tsan", "ThreadSanitizer: detects data races.", false, false) +const char *ThreadSanitizer::getPassName() const { + return "ThreadSanitizer"; +} + ThreadSanitizer::ThreadSanitizer() : FunctionPass(ID), TD(NULL) { @@ -106,12 +110,18 @@ FunctionPass *llvm::createThreadSanitizerPass() { return new ThreadSanitizer(); } +static Function *checkInterfaceFunction(Constant *FuncOrBitcast) { + if (Function *F = dyn_cast<Function>(FuncOrBitcast)) + return F; + FuncOrBitcast->dump(); + report_fatal_error("ThreadSanitizer interface function redefined"); +} + bool ThreadSanitizer::doInitialization(Module &M) { TD = getAnalysisIfAvailable<TargetData>(); if (!TD) return false; BL.reset(new FunctionBlackList(ClBlackListFile)); - memset(&stats, 0, sizeof(stats)); // Always insert a call to __tsan_init into the module's CTORs. IRBuilder<> IRB(M.getContext()); @@ -120,38 +130,38 @@ bool ThreadSanitizer::doInitialization(Module &M) { appendToGlobalCtors(M, cast<Function>(TsanInit), 0); // Initialize the callbacks. - TsanFuncEntry = M.getOrInsertFunction("__tsan_func_entry", IRB.getVoidTy(), - IRB.getInt8PtrTy(), NULL); - TsanFuncExit = M.getOrInsertFunction("__tsan_func_exit", IRB.getVoidTy(), - NULL); + TsanFuncEntry = checkInterfaceFunction(M.getOrInsertFunction( + "__tsan_func_entry", IRB.getVoidTy(), IRB.getInt8PtrTy(), NULL)); + TsanFuncExit = checkInterfaceFunction(M.getOrInsertFunction( + "__tsan_func_exit", IRB.getVoidTy(), NULL)); + OrdTy = IRB.getInt32Ty(); for (size_t i = 0; i < kNumberOfAccessSizes; ++i) { - SmallString<32> ReadName("__tsan_read"); - ReadName += itostr(1 << i); - TsanRead[i] = M.getOrInsertFunction(ReadName, IRB.getVoidTy(), - IRB.getInt8PtrTy(), NULL); - SmallString<32> WriteName("__tsan_write"); - WriteName += itostr(1 << i); - TsanWrite[i] = M.getOrInsertFunction(WriteName, IRB.getVoidTy(), - IRB.getInt8PtrTy(), NULL); - } - TsanVptrUpdate = M.getOrInsertFunction("__tsan_vptr_update", IRB.getVoidTy(), - IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), - NULL); - return true; -} + const size_t ByteSize = 1 << i; + const size_t BitSize = ByteSize * 8; + SmallString<32> ReadName("__tsan_read" + itostr(ByteSize)); + TsanRead[i] = checkInterfaceFunction(M.getOrInsertFunction( + ReadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), NULL)); -bool ThreadSanitizer::doFinalization(Module &M) { - if (ClPrintStats) { - errs() << "ThreadSanitizerStats " << M.getModuleIdentifier() - << ": wr " << stats.NumInstrumentedWrites - << "; rd " << stats.NumInstrumentedReads - << "; vt " << stats.NumInstrumentedVtableWrites - << "; bs " << stats.NumAccessesWithBadSize - << "; rbw " << stats.NumOmittedReadsBeforeWrite - << "; rcg " << stats.NumOmittedReadsFromConstantGlobals - << "; rvt " << stats.NumOmittedReadsFromVtable - << "\n"; + SmallString<32> WriteName("__tsan_write" + itostr(ByteSize)); + TsanWrite[i] = checkInterfaceFunction(M.getOrInsertFunction( + WriteName, IRB.getVoidTy(), IRB.getInt8PtrTy(), NULL)); + + Type *Ty = Type::getIntNTy(M.getContext(), BitSize); + Type *PtrTy = Ty->getPointerTo(); + SmallString<32> AtomicLoadName("__tsan_atomic" + itostr(BitSize) + + "_load"); + TsanAtomicLoad[i] = checkInterfaceFunction(M.getOrInsertFunction( + AtomicLoadName, Ty, PtrTy, OrdTy, NULL)); + + 
SmallString<32> AtomicStoreName("__tsan_atomic" + itostr(BitSize) + + "_store"); + TsanAtomicStore[i] = checkInterfaceFunction(M.getOrInsertFunction( + AtomicStoreName, IRB.getVoidTy(), PtrTy, Ty, OrdTy, + NULL)); } + TsanVptrUpdate = checkInterfaceFunction(M.getOrInsertFunction( + "__tsan_vptr_update", IRB.getVoidTy(), IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), NULL)); return true; } @@ -173,13 +183,13 @@ bool ThreadSanitizer::addrPointsToConstantData(Value *Addr) { if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) { if (GV->isConstant()) { // Reads from constant globals can not race with any writes. - stats.NumOmittedReadsFromConstantGlobals++; + NumOmittedReadsFromConstantGlobals++; return true; } } else if(LoadInst *L = dyn_cast<LoadInst>(Addr)) { if (isVtableAccess(L)) { // Reads from a vtable pointer can not race with any writes. - stats.NumOmittedReadsFromVtable++; + NumOmittedReadsFromVtable++; return true; } } @@ -197,7 +207,7 @@ bool ThreadSanitizer::addrPointsToConstantData(Value *Addr) { // // 'Local' is a vector of insns within the same BB (no calls between). // 'All' is a vector of insns that will be instrumented. -void ThreadSanitizer::choseInstructionsToInstrument( +void ThreadSanitizer::chooseInstructionsToInstrument( SmallVectorImpl<Instruction*> &Local, SmallVectorImpl<Instruction*> &All) { SmallSet<Value*, 8> WriteTargets; @@ -212,7 +222,7 @@ void ThreadSanitizer::choseInstructionsToInstrument( Value *Addr = Load->getPointerOperand(); if (WriteTargets.count(Addr)) { // We will write to this temp, so no reason to analyze the read. - stats.NumOmittedReadsBeforeWrite++; + NumOmittedReadsBeforeWrite++; continue; } if (addrPointsToConstantData(Addr)) { @@ -225,12 +235,27 @@ void ThreadSanitizer::choseInstructionsToInstrument( Local.clear(); } +static bool isAtomic(Instruction *I) { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + return LI->isAtomic() && LI->getSynchScope() == CrossThread; + if (StoreInst *SI = dyn_cast<StoreInst>(I)) + return SI->isAtomic() && SI->getSynchScope() == CrossThread; + if (isa<AtomicRMWInst>(I)) + return true; + if (isa<AtomicCmpXchgInst>(I)) + return true; + if (FenceInst *FI = dyn_cast<FenceInst>(I)) + return FI->getSynchScope() == CrossThread; + return false; +} + bool ThreadSanitizer::runOnFunction(Function &F) { if (!TD) return false; if (BL->isIn(F)) return false; SmallVector<Instruction*, 8> RetVec; SmallVector<Instruction*, 8> AllLoadsAndStores; SmallVector<Instruction*, 8> LocalLoadsAndStores; + SmallVector<Instruction*, 8> AtomicAccesses; bool Res = false; bool HasCalls = false; @@ -240,16 +265,18 @@ bool ThreadSanitizer::runOnFunction(Function &F) { BasicBlock &BB = *FI; for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE; ++BI) { - if (isa<LoadInst>(BI) || isa<StoreInst>(BI)) + if (isAtomic(BI)) + AtomicAccesses.push_back(BI); + else if (isa<LoadInst>(BI) || isa<StoreInst>(BI)) LocalLoadsAndStores.push_back(BI); else if (isa<ReturnInst>(BI)) RetVec.push_back(BI); else if (isa<CallInst>(BI) || isa<InvokeInst>(BI)) { HasCalls = true; - choseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores); + chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores); } } - choseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores); + chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores); } // We have collected all loads and stores. 
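chooseInstructionsToInstrument (renamed above from choseInstructionsToInstrument) walks each call-free run of a block backwards, keeping every write but dropping a read whose address is written later in the same run, since any race on that location will already be caught at the write; reads from constant globals and vtable loads are dropped as well. The core of that filter, with simplified types:

#include <set>
#include <vector>

struct Access { const void *Addr; bool IsWrite; };

// Mirrors the per-run filtering: scan in reverse, remembering write targets.
std::vector<Access> chooseToInstrument(const std::vector<Access> &Local) {
  std::set<const void *> WriteTargets;
  std::vector<Access> All;
  for (auto It = Local.rbegin(); It != Local.rend(); ++It) {
    if (It->IsWrite)
      WriteTargets.insert(It->Addr);
    else if (WriteTargets.count(It->Addr))
      continue; // counted as NumOmittedReadsBeforeWrite
    All.push_back(*It);
  }
  return All;
}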
@@ -261,6 +288,11 @@ bool ThreadSanitizer::runOnFunction(Function &F) { Res |= instrumentLoadOrStore(AllLoadsAndStores[i]); } + // Instrument atomic memory accesses. + for (size_t i = 0, n = AtomicAccesses.size(); i < n; ++i) { + Res |= instrumentAtomic(AtomicAccesses[i]); + } + // Instrument function entry/exit points if there were instrumented accesses. if (Res || HasCalls) { IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI()); @@ -283,29 +315,98 @@ bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I) { Value *Addr = IsWrite ? cast<StoreInst>(I)->getPointerOperand() : cast<LoadInst>(I)->getPointerOperand(); - Type *OrigPtrTy = Addr->getType(); - Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType(); - assert(OrigTy->isSized()); - uint32_t TypeSize = TD->getTypeStoreSizeInBits(OrigTy); - if (TypeSize != 8 && TypeSize != 16 && - TypeSize != 32 && TypeSize != 64 && TypeSize != 128) { - stats.NumAccessesWithBadSize++; - // Ignore all unusual sizes. + int Idx = getMemoryAccessFuncIndex(Addr); + if (Idx < 0) return false; - } if (IsWrite && isVtableAccess(I)) { + DEBUG(dbgs() << " VPTR : " << *I << "\n"); Value *StoredValue = cast<StoreInst>(I)->getValueOperand(); + // StoredValue does not necessary have a pointer type. + if (isa<IntegerType>(StoredValue->getType())) + StoredValue = IRB.CreateIntToPtr(StoredValue, IRB.getInt8PtrTy()); + // Call TsanVptrUpdate. IRB.CreateCall2(TsanVptrUpdate, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), IRB.CreatePointerCast(StoredValue, IRB.getInt8PtrTy())); - stats.NumInstrumentedVtableWrites++; + NumInstrumentedVtableWrites++; return true; } - size_t Idx = CountTrailingZeros_32(TypeSize / 8); - assert(Idx < kNumberOfAccessSizes); Value *OnAccessFunc = IsWrite ? TsanWrite[Idx] : TsanRead[Idx]; IRB.CreateCall(OnAccessFunc, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy())); - if (IsWrite) stats.NumInstrumentedWrites++; - else stats.NumInstrumentedReads++; + if (IsWrite) NumInstrumentedWrites++; + else NumInstrumentedReads++; + return true; +} + +static ConstantInt *createOrdering(IRBuilder<> *IRB, AtomicOrdering ord) { + uint32_t v = 0; + switch (ord) { + case NotAtomic: assert(false); + case Unordered: // Fall-through. + case Monotonic: v = 1 << 0; break; + // case Consume: v = 1 << 1; break; // Not specified yet. 
+ case Acquire: v = 1 << 2; break; + case Release: v = 1 << 3; break; + case AcquireRelease: v = 1 << 4; break; + case SequentiallyConsistent: v = 1 << 5; break; + } + return IRB->getInt32(v); +} + +bool ThreadSanitizer::instrumentAtomic(Instruction *I) { + IRBuilder<> IRB(I); + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + Value *Addr = LI->getPointerOperand(); + int Idx = getMemoryAccessFuncIndex(Addr); + if (Idx < 0) + return false; + const size_t ByteSize = 1 << Idx; + const size_t BitSize = ByteSize * 8; + Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize); + Type *PtrTy = Ty->getPointerTo(); + Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy), + createOrdering(&IRB, LI->getOrdering())}; + CallInst *C = CallInst::Create(TsanAtomicLoad[Idx], + ArrayRef<Value*>(Args)); + ReplaceInstWithInst(I, C); + + } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) { + Value *Addr = SI->getPointerOperand(); + int Idx = getMemoryAccessFuncIndex(Addr); + if (Idx < 0) + return false; + const size_t ByteSize = 1 << Idx; + const size_t BitSize = ByteSize * 8; + Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize); + Type *PtrTy = Ty->getPointerTo(); + Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy), + IRB.CreateIntCast(SI->getValueOperand(), Ty, false), + createOrdering(&IRB, SI->getOrdering())}; + CallInst *C = CallInst::Create(TsanAtomicStore[Idx], + ArrayRef<Value*>(Args)); + ReplaceInstWithInst(I, C); + } else if (isa<AtomicRMWInst>(I)) { + // FIXME: Not yet supported. + } else if (isa<AtomicCmpXchgInst>(I)) { + // FIXME: Not yet supported. + } else if (isa<FenceInst>(I)) { + // FIXME: Not yet supported. + } return true; } + +int ThreadSanitizer::getMemoryAccessFuncIndex(Value *Addr) { + Type *OrigPtrTy = Addr->getType(); + Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType(); + assert(OrigTy->isSized()); + uint32_t TypeSize = TD->getTypeStoreSizeInBits(OrigTy); + if (TypeSize != 8 && TypeSize != 16 && + TypeSize != 32 && TypeSize != 64 && TypeSize != 128) { + NumAccessesWithBadSize++; + // Ignore all unusual sizes. + return -1; + } + size_t Idx = CountTrailingZeros_32(TypeSize / 8); + assert(Idx < kNumberOfAccessSizes); + return Idx; +} diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp index ba214d1..b344952 100644 --- a/lib/Transforms/Scalar/ADCE.cpp +++ b/lib/Transforms/Scalar/ADCE.cpp @@ -9,7 +9,7 @@ // // This file implements the Aggressive Dead Code Elimination pass. This pass // optimistically assumes that all instructions are dead until proven otherwise, -// allowing it to eliminate dead computations that other DCE passes do not +// allowing it to eliminate dead computations that other DCE passes do not // catch, particularly involving loop computations. // //===----------------------------------------------------------------------===// @@ -36,13 +36,13 @@ namespace { ADCE() : FunctionPass(ID) { initializeADCEPass(*PassRegistry::getPassRegistry()); } - + virtual bool runOnFunction(Function& F); - + virtual void getAnalysisUsage(AnalysisUsage& AU) const { AU.setPreservesCFG(); } - + }; } @@ -52,7 +52,7 @@ INITIALIZE_PASS(ADCE, "adce", "Aggressive Dead Code Elimination", false, false) bool ADCE::runOnFunction(Function& F) { SmallPtrSet<Instruction*, 128> alive; SmallVector<Instruction*, 128> worklist; - + // Collect the set of "root" instructions that are known live. 
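The ADCE hunk beginning above is whitespace-only, but its comments state the algorithm compactly: seed a worklist with the root instructions collected in the loop below (terminators and instructions with side effects), propagate liveness backwards through operands, and delete everything never reached. The mark phase over a generic def-use graph, as a sketch:

#include <unordered_set>
#include <vector>

struct Node {
  bool IsRoot = false;          // terminator, side effects, etc.
  std::vector<Node *> Operands; // values this node uses
};

// Optimistic DCE: every node is presumed dead until reached from a root.
std::unordered_set<Node *> markLive(const std::vector<Node *> &Nodes) {
  std::unordered_set<Node *> Alive;
  std::vector<Node *> Worklist;
  for (Node *N : Nodes)
    if (N->IsRoot && Alive.insert(N).second)
      Worklist.push_back(N);
  while (!Worklist.empty()) {
    Node *Curr = Worklist.back();
    Worklist.pop_back();
    for (Node *Op : Curr->Operands)
      if (Alive.insert(Op).second) // operand newly proven live
        Worklist.push_back(Op);
  }
  return Alive; // the complement of Alive is safe to delete
}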
for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) if (isa<TerminatorInst>(I.getInstructionIterator()) || @@ -62,7 +62,7 @@ bool ADCE::runOnFunction(Function& F) { alive.insert(I.getInstructionIterator()); worklist.push_back(I.getInstructionIterator()); } - + // Propagate liveness backwards to operands. while (!worklist.empty()) { Instruction* curr = worklist.pop_back_val(); @@ -72,7 +72,7 @@ bool ADCE::runOnFunction(Function& F) { if (alive.insert(Inst)) worklist.push_back(Inst); } - + // The inverse of the live set is the dead set. These are those instructions // which have no side effects and do not influence the control flow or return // value of the function, and may therefore be deleted safely. @@ -82,7 +82,7 @@ bool ADCE::runOnFunction(Function& F) { worklist.push_back(I.getInstructionIterator()); I->dropAllReferences(); } - + for (SmallVector<Instruction*, 1024>::iterator I = worklist.begin(), E = worklist.end(); I != E; ++I) { ++NumRemoved; diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index d660c72..a01e066 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -32,3 +32,5 @@ add_llvm_library(LLVMScalarOpts Sink.cpp TailRecursionElimination.cpp ) + +add_dependencies(LLVMScalarOpts intrinsics_gen) diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp index 9a5423f..277c4d5 100644 --- a/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -18,32 +18,32 @@ #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" +#include "llvm/IRBuilder.h" #include "llvm/InlineAsm.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/Pass.h" -#include "llvm/Analysis/Dominators.h" -#include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/ProfileInfo.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Transforms/Utils/AddrModeMatcher.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/ProfileInfo.h" #include "llvm/Assembly/Writer.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/PatternMatch.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/ValueHandle.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Transforms/Utils/AddrModeMatcher.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/BuildLibCalls.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; using namespace llvm::PatternMatch; @@ -60,6 +60,7 @@ STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads"); STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized"); STATISTIC(NumRetsDup, "Number of return instructions duplicated"); STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved"); 
+STATISTIC(NumSelectsExpanded, "Number of selects turned into branches"); static cl::opt<bool> DisableBranchOpts( "disable-cgp-branch-opts", cl::Hidden, cl::init(false), @@ -70,6 +71,10 @@ static cl::opt<bool> DisableDeleteDeadBlocks( "disable-cgp-delete-dead-blocks", cl::Hidden, cl::init(false), cl::desc("Disable deleting dead blocks in CodeGenPrepare")); +static cl::opt<bool> DisableSelectToBranch( + "disable-cgp-select2branch", cl::Hidden, cl::init(false), + cl::desc("Disable select to branch conversion.")); + namespace { class CodeGenPrepare : public FunctionPass { /// TLI - Keep a pointer of a TargetLowering to consult for determining @@ -78,7 +83,7 @@ namespace { const TargetLibraryInfo *TLInfo; DominatorTree *DT; ProfileInfo *PFI; - + /// CurInstIterator - As we scan instructions optimizing them, this is the /// next instruction to optimize. Xforms that can invalidate this should /// update it. @@ -93,6 +98,9 @@ namespace { /// be updated. bool ModifiedDT; + /// OptSize - True if optimizing for size. + bool OptSize; + public: static char ID; // Pass identification, replacement for typeid explicit CodeGenPrepare(const TargetLowering *tli = 0) @@ -118,6 +126,7 @@ namespace { bool OptimizeCallInst(CallInst *CI); bool MoveExtToFormExtLoad(Instruction *I); bool OptimizeExtUses(Instruction *I); + bool OptimizeSelectInst(SelectInst *SI); bool DupRetToEnableTailCallOpts(ReturnInst *RI); bool PlaceDbgValues(Function &F); }; @@ -141,13 +150,14 @@ bool CodeGenPrepare::runOnFunction(Function &F) { TLInfo = &getAnalysis<TargetLibraryInfo>(); DT = getAnalysisIfAvailable<DominatorTree>(); PFI = getAnalysisIfAvailable<ProfileInfo>(); + OptSize = F.hasFnAttr(Attribute::OptimizeForSize); // First pass, eliminate blocks that contain only PHI nodes and an // unconditional branch. EverMadeChange |= EliminateMostlyEmptyBlocks(F); // llvm.dbg.value is far away from the value then iSel may not be able - // handle it properly. iSel will drop llvm.dbg.value if it can not + // handle it properly. iSel will drop llvm.dbg.value if it can not // find a node corresponding to the value. EverMadeChange |= PlaceDbgValues(F); @@ -326,7 +336,7 @@ void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) { if (isEntry && BB != &BB->getParent()->getEntryBlock()) BB->moveBefore(&BB->getParent()->getEntryBlock()); - + DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n"); return; } @@ -537,7 +547,7 @@ protected: bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { BasicBlock *BB = CI->getParent(); - + // Lower inline assembly if we can. // If we found an inline asm expession, and if the target knows how to // lower it to normal LLVM code, do so now. @@ -554,19 +564,19 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { if (OptimizeInlineAsmInst(CI)) return true; } - + // Lower all uses of llvm.objectsize.* IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI); if (II && II->getIntrinsicID() == Intrinsic::objectsize) { bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1); Type *ReturnTy = CI->getType(); - Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL); - + Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL); + // Substituting this can cause recursive simplifications, which can // invalidate our iterator. Use a WeakVH to hold onto it in case this // happens. WeakVH IterHandle(CurInstIterator); - + replaceAndRecursivelySimplify(CI, RetVal, TLI ? TLI->getTargetData() : 0, TLInfo, ModifiedDT ? 
0 : DT); @@ -594,7 +604,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { // We'll need TargetData from here on out. const TargetData *TD = TLI ? TLI->getTargetData() : 0; if (!TD) return false; - + // Lower all default uses of _chk calls. This is very similar // to what InstCombineCalls does, but here we are only lowering calls // that have the default "don't know" as the objectsize. Anything else @@ -750,13 +760,13 @@ static bool IsNonLocalValue(Value *V, BasicBlock *BB) { bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Type *AccessTy) { Value *Repl = Addr; - - // Try to collapse single-value PHI nodes. This is necessary to undo + + // Try to collapse single-value PHI nodes. This is necessary to undo // unprofitable PRE transformations. SmallVector<Value*, 8> worklist; SmallPtrSet<Value*, 16> Visited; worklist.push_back(Addr); - + // Use a worklist to iteratively look through PHI nodes, and ensure that // the addressing mode obtained from the non-PHI roots of the graph // are equivalent. @@ -768,20 +778,20 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, while (!worklist.empty()) { Value *V = worklist.back(); worklist.pop_back(); - + // Break use-def graph loops. if (!Visited.insert(V)) { Consensus = 0; break; } - + // For a PHI node, push all of its incoming values. if (PHINode *P = dyn_cast<PHINode>(V)) { for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i) worklist.push_back(P->getIncomingValue(i)); continue; } - + // For non-PHIs, determine the addressing mode being computed. SmallVector<Instruction*, 16> NewAddrModeInsts; ExtAddrMode NewAddrMode = @@ -816,15 +826,15 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, } continue; } - + Consensus = 0; break; } - + // If the addressing mode couldn't be determined, or if multiple different // ones were determined, bail out now. if (!Consensus) return false; - + // Check to see if any of the instructions supersumed by this addr mode are // non-local to I's BB. bool AnyNonLocal = false; @@ -933,7 +943,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // Use a WeakVH to hold onto it in case this happens. WeakVH IterHandle(CurInstIterator); BasicBlock *BB = CurInstIterator->getParent(); - + RecursivelyDeleteTriviallyDeadInstructions(Repl); if (IterHandle != CurInstIterator) { @@ -945,7 +955,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // This address is now available for reassignment, so erase the table // entry; we don't want to match some completely different instruction. SunkAddrs[Addr] = 0; - } + } } ++NumMemoryInsts; return true; @@ -957,12 +967,12 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, bool CodeGenPrepare::OptimizeInlineAsmInst(CallInst *CS) { bool MadeChange = false; - TargetLowering::AsmOperandInfoVector + TargetLowering::AsmOperandInfoVector TargetConstraints = TLI->ParseConstraints(CS); unsigned ArgNo = 0; for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; - + // Compute the constraint code and ConstraintType to use. TLI->ComputeConstraintToUse(OpInfo, SDValue()); @@ -1091,6 +1101,79 @@ bool CodeGenPrepare::OptimizeExtUses(Instruction *I) { return MadeChange; } +/// isFormingBranchFromSelectProfitable - Returns true if a SelectInst should be +/// turned into an explicit branch. 
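OptimizeSelectInst, defined next, performs that rewrite when the target reports selects as expensive and the heuristic above fires (a one-use compare against a one-use loaded value, so a cmov would stall on the load). The source-level effect, illustratively:

// Before: codegen would emit a cmov for  v = cond ? a : b;
// After: StartBlock ends in a real branch; the true value reaches
// select.end directly, the false value flows through select.mid, and a
// PHI node merges the two where the select used to be.
int selectAsBranch(bool cond, int a, int b) {
  if (cond)
    return a; // StartBlock -> select.end with the true value
  return b;   // StartBlock -> select.mid -> select.end with the false value
}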
+static bool isFormingBranchFromSelectProfitable(SelectInst *SI) {
+  // FIXME: This should use the same heuristics as IfConversion to determine
+  // whether a select is better represented as a branch. This requires that
+  // branch probability metadata is preserved for the select, which is not the
+  // case currently.
+
+  CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition());
+
+  // If the branch is predicted right, an out of order CPU can avoid blocking on
+  // the compare. Emit cmovs on compares with a memory operand as branches to
+  // avoid stalls on the load from memory. If the compare has more than one use
+  // there's probably another cmov or setcc around so it's not worth emitting a
+  // branch.
+  if (!Cmp)
+    return false;
+
+  Value *CmpOp0 = Cmp->getOperand(0);
+  Value *CmpOp1 = Cmp->getOperand(1);
+
+  // We check that the memory operand has one use to avoid uses of the loaded
+  // value directly after the compare, making branches unprofitable.
+  return Cmp->hasOneUse() &&
+         ((isa<LoadInst>(CmpOp0) && CmpOp0->hasOneUse()) ||
+          (isa<LoadInst>(CmpOp1) && CmpOp1->hasOneUse()));
+}
+
+
+bool CodeGenPrepare::OptimizeSelectInst(SelectInst *SI) {
+  // If we have a SelectInst that will likely profit from branch prediction,
+  // turn it into a branch.
+  if (DisableSelectToBranch || OptSize || !TLI ||
+      !TLI->isPredictableSelectExpensive())
+    return false;
+
+  if (!SI->getCondition()->getType()->isIntegerTy(1) ||
+      !isFormingBranchFromSelectProfitable(SI))
+    return false;
+
+  ModifiedDT = true;
+
+  // First, we split the block containing the select into 2 blocks.
+  BasicBlock *StartBlock = SI->getParent();
+  BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(SI));
+  BasicBlock *NextBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
+
+  // Create a new block serving as the landing pad for the branch.
+  BasicBlock *SmallBlock = BasicBlock::Create(SI->getContext(), "select.mid",
+                                              NextBlock->getParent(), NextBlock);
+
+  // Move the unconditional branch from the block with the select in it into our
+  // landing pad block.
+  StartBlock->getTerminator()->eraseFromParent();
+  BranchInst::Create(NextBlock, SmallBlock);
+
+  // Insert the real conditional branch based on the original condition.
+  BranchInst::Create(NextBlock, SmallBlock, SI->getCondition(), SI);
+
+  // The select itself is replaced with a PHI Node.
+  PHINode *PN = PHINode::Create(SI->getType(), 2, "", NextBlock->begin());
+  PN->takeName(SI);
+  PN->addIncoming(SI->getTrueValue(), StartBlock);
+  PN->addIncoming(SI->getFalseValue(), SmallBlock);
+  SI->replaceAllUsesWith(PN);
+  SI->eraseFromParent();
+
+  // Instruct OptimizeBlock to skip to the next block.
+  CurInstIterator = StartBlock->end();
+  ++NumSelectsExpanded;
+  return true;
+}
+
 bool CodeGenPrepare::OptimizeInst(Instruction *I) {
   if (PHINode *P = dyn_cast<PHINode>(I)) {
     // It is possible for very late stage optimizations (such as SimplifyCFG)
@@ -1104,7 +1187,7 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
     }
     return false;
   }
-
+
   if (CastInst *CI = dyn_cast<CastInst>(I)) {
     // If the source of the cast is a constant, then this should have
     // already been constant folded.
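
// [Illustration] A rough source-level analogue of what OptimizeSelectInst
// produces (the pass of course rewrites IR blocks, not C++; block names in
// the comments correspond to the names used above):

// Before: a select, typically lowered to a conditional move.
int selectForm(bool Cond, int TrueVal, int FalseVal) {
  return Cond ? TrueVal : FalseVal;
}

// After: an explicit branch. The else-path corresponds to the new
// "select.mid" block, and the return point to "select.end", where the
// PHI node merges the two incoming values.
int branchForm(bool Cond, int TrueVal, int FalseVal) {
  int Result;
  if (Cond)                // conditional branch on the original condition
    Result = TrueVal;      // value flowing in from the start block
  else
    Result = FalseVal;     // value flowing in from "select.mid"
  return Result;           // "select.end": the PHI node
}
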
The only reason NOT to constant fold @@ -1124,23 +1207,23 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) { } return false; } - + if (CmpInst *CI = dyn_cast<CmpInst>(I)) return OptimizeCmpExpression(CI); - + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { if (TLI) return OptimizeMemoryInst(I, I->getOperand(0), LI->getType()); return false; } - + if (StoreInst *SI = dyn_cast<StoreInst>(I)) { if (TLI) return OptimizeMemoryInst(I, SI->getOperand(1), SI->getOperand(0)->getType()); return false; } - + if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) { if (GEPI->hasAllZeroIndices()) { /// The GEP operand must be a pointer, so must its result -> BitCast @@ -1154,13 +1237,16 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) { } return false; } - + if (CallInst *CI = dyn_cast<CallInst>(I)) return OptimizeCallInst(CI); if (ReturnInst *RI = dyn_cast<ReturnInst>(I)) return DupRetToEnableTailCallOpts(RI); + if (SelectInst *SI = dyn_cast<SelectInst>(I)) + return OptimizeSelectInst(SI); + return false; } @@ -1179,7 +1265,7 @@ bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) { } // llvm.dbg.value is far away from the value then iSel may not be able -// handle it properly. iSel will drop llvm.dbg.value if it can not +// handle it properly. iSel will drop llvm.dbg.value if it can not // find a node corresponding to the value. bool CodeGenPrepare::PlaceDbgValues(Function &F) { bool MadeChange = false; diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index c8c5360..5eff0e5 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -32,7 +32,7 @@ #include "llvm/Target/TargetData.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Support/Debug.h" -#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/STLExtras.h" using namespace llvm; @@ -71,7 +71,7 @@ namespace { bool HandleFree(CallInst *F); bool handleEndBlock(BasicBlock &BB); void RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, - SmallPtrSet<Value*, 16> &DeadStackObjects); + SmallSetVector<Value*, 16> &DeadStackObjects); virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); @@ -106,7 +106,7 @@ FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); } /// static void DeleteDeadInstruction(Instruction *I, MemoryDependenceAnalysis &MD, - SmallPtrSet<Value*, 16> *ValueSet = 0) { + SmallSetVector<Value*, 16> *ValueSet = 0) { SmallVector<Instruction*, 32> NowDeadInsts; NowDeadInsts.push_back(I); @@ -136,7 +136,7 @@ static void DeleteDeadInstruction(Instruction *I, DeadInst->eraseFromParent(); - if (ValueSet) ValueSet->erase(DeadInst); + if (ValueSet) ValueSet->remove(DeadInst); } while (!NowDeadInsts.empty()); } @@ -248,7 +248,7 @@ static bool isShortenable(Instruction *I) { // Don't shorten stores for now if (isa<StoreInst>(I)) return false; - + IntrinsicInst *II = cast<IntrinsicInst>(I); switch (II->getIntrinsicID()) { default: return false; @@ -275,33 +275,9 @@ static Value *getStoredPointerOperand(Instruction *I) { } static uint64_t getPointerSize(const Value *V, AliasAnalysis &AA) { - const TargetData *TD = AA.getTargetData(); - - if (const CallInst *CI = extractMallocCall(V)) { - if (const ConstantInt *C = dyn_cast<ConstantInt>(CI->getArgOperand(0))) - return C->getZExtValue(); - } - - if (TD == 0) - return AliasAnalysis::UnknownSize; - - if (const AllocaInst *A = dyn_cast<AllocaInst>(V)) { - // 
Get size information for the alloca - if (const ConstantInt *C = dyn_cast<ConstantInt>(A->getArraySize())) - return C->getZExtValue() * TD->getTypeAllocSize(A->getAllocatedType()); - } - - if (const Argument *A = dyn_cast<Argument>(V)) { - if (A->hasByValAttr()) - if (PointerType *PT = dyn_cast<PointerType>(A->getType())) - return TD->getTypeAllocSize(PT->getElementType()); - } - - if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) { - if (!GV->mayBeOverridden()) - return TD->getTypeAllocSize(GV->getType()->getElementType()); - } - + uint64_t Size; + if (getObjectSize(V, Size, AA.getTargetData())) + return Size; return AliasAnalysis::UnknownSize; } @@ -316,7 +292,7 @@ namespace { /// isOverwrite - Return 'OverwriteComplete' if a store to the 'Later' location /// completely overwrites a store to the 'Earlier' location. -/// 'OverwriteEnd' if the end of the 'Earlier' location is completely +/// 'OverwriteEnd' if the end of the 'Earlier' location is completely /// overwritten by 'Later', or 'OverwriteUnknown' if nothing can be determined static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, const AliasAnalysis::Location &Earlier, @@ -339,7 +315,7 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, if (AA.getTargetData() == 0 && Later.Ptr->getType() == Earlier.Ptr->getType()) return OverwriteComplete; - + return OverwriteUnknown; } @@ -405,7 +381,7 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, Later.Size > Earlier.Size && uint64_t(EarlierOff - LaterOff) + Earlier.Size <= Later.Size) return OverwriteComplete; - + // The other interesting case is if the later store overwrites the end of // the earlier store // @@ -544,11 +520,11 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { // If we find a write that is a) removable (i.e., non-volatile), b) is // completely obliterated by the store to 'Loc', and c) which we know that // 'Inst' doesn't load from, then we can remove it. - if (isRemovable(DepWrite) && + if (isRemovable(DepWrite) && !isPossibleSelfRead(Inst, Loc, DepWrite, *AA)) { - int64_t InstWriteOffset, DepWriteOffset; - OverwriteResult OR = isOverwrite(Loc, DepLoc, *AA, - DepWriteOffset, InstWriteOffset); + int64_t InstWriteOffset, DepWriteOffset; + OverwriteResult OR = isOverwrite(Loc, DepLoc, *AA, + DepWriteOffset, InstWriteOffset); if (OR == OverwriteComplete) { DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DepWrite << "\n KILLER: " << *Inst << '\n'); @@ -557,7 +533,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { DeleteDeadInstruction(DepWrite, *MD); ++NumFastStores; MadeChange = true; - + // DeleteDeadInstruction can delete the current instruction in loop // cases, reset BBI. 
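
// [Illustration] The OverwriteComplete answer in isOverwrite above reduces to
// a byte-interval containment test. A standalone sketch with a hypothetical
// helper name (the real code also handles equal pointers and unknown sizes
// separately):

#include <cassert>
#include <cstdint>

// True when the later store's byte range [LaterOff, LaterOff+LaterSize)
// covers the earlier store's entire range, making the earlier store dead.
static bool laterCompletelyOverwrites(int64_t EarlierOff, uint64_t EarlierSize,
                                      int64_t LaterOff, uint64_t LaterSize) {
  return LaterOff <= EarlierOff &&
         uint64_t(EarlierOff - LaterOff) + EarlierSize <= LaterSize;
}

int main() {
  assert(laterCompletelyOverwrites(4, 4, 0, 8));  // earlier word inside later i64
  assert(!laterCompletelyOverwrites(4, 8, 0, 8)); // earlier runs past the end
  return 0;
}
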
BBI = Inst; @@ -575,16 +551,16 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { unsigned DepWriteAlign = DepIntrinsic->getAlignment(); if (llvm::isPowerOf2_64(InstWriteOffset) || ((DepWriteAlign != 0) && InstWriteOffset % DepWriteAlign == 0)) { - + DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW END: " - << *DepWrite << "\n KILLER (offset " - << InstWriteOffset << ", " + << *DepWrite << "\n KILLER (offset " + << InstWriteOffset << ", " << DepLoc.Size << ")" << *Inst << '\n'); - + Value* DepWriteLength = DepIntrinsic->getLength(); Value* TrimmedLength = ConstantInt::get(DepWriteLength->getType(), - InstWriteOffset - + InstWriteOffset - DepWriteOffset); DepIntrinsic->setLength(TrimmedLength); MadeChange = true; @@ -694,19 +670,18 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // Keep track of all of the stack objects that are dead at the end of the // function. - SmallPtrSet<Value*, 16> DeadStackObjects; + SmallSetVector<Value*, 16> DeadStackObjects; // Find all of the alloca'd pointers in the entry block. BasicBlock *Entry = BB.getParent()->begin(); for (BasicBlock::iterator I = Entry->begin(), E = Entry->end(); I != E; ++I) { - if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) - DeadStackObjects.insert(AI); + if (isa<AllocaInst>(I)) + DeadStackObjects.insert(I); // Okay, so these are dead heap objects, but if the pointer never escapes // then it's leaked by this function anyways. - if (CallInst *CI = extractMallocCall(I)) - if (!PointerMayBeCaptured(CI, true, true)) - DeadStackObjects.insert(CI); + else if (isAllocLikeFn(I) && !PointerMayBeCaptured(I, true, true)) + DeadStackObjects.insert(I); } // Treat byval arguments the same, stores to them are dead at the end of the @@ -723,14 +698,30 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // If we find a store, check to see if it points into a dead stack value. if (hasMemoryWrite(BBI) && isRemovable(BBI)) { // See through pointer-to-pointer bitcasts - Value *Pointer = GetUnderlyingObject(getStoredPointerOperand(BBI)); + SmallVector<Value *, 4> Pointers; + GetUnderlyingObjects(getStoredPointerOperand(BBI), Pointers); // Stores to stack values are valid candidates for removal. - if (DeadStackObjects.count(Pointer)) { + bool AllDead = true; + for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(), + E = Pointers.end(); I != E; ++I) + if (!DeadStackObjects.count(*I)) { + AllDead = false; + break; + } + + if (AllDead) { Instruction *Dead = BBI++; DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: " - << *Dead << "\n Object: " << *Pointer << '\n'); + << *Dead << "\n Objects: "; + for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(), + E = Pointers.end(); I != E; ++I) { + dbgs() << **I; + if (llvm::next(I) != E) + dbgs() << ", "; + } + dbgs() << '\n'); // DCE instructions only used to calculate that store. DeleteDeadInstruction(Dead, *MD, &DeadStackObjects); @@ -749,13 +740,8 @@ bool DSE::handleEndBlock(BasicBlock &BB) { continue; } - if (AllocaInst *A = dyn_cast<AllocaInst>(BBI)) { - DeadStackObjects.erase(A); - continue; - } - - if (CallInst *CI = extractMallocCall(BBI)) { - DeadStackObjects.erase(CI); + if (isa<AllocaInst>(BBI) || isAllocLikeFn(BBI)) { + DeadStackObjects.remove(BBI); continue; } @@ -768,7 +754,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // If the call might load from any of our allocas, then any store above // the call is live. 
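
// [Illustration] The "OW END" case above keeps only the prefix of the earlier
// memset/memcpy that the later store does not cover; the trimmed length is
// simply the distance between the two write offsets. A toy sketch
// (hypothetical helper name):

#include <cstdint>

// Earlier intrinsic writes from DepWriteOffset; a later store overwrites
// everything from InstWriteOffset onward, so only the prefix survives.
static uint64_t trimmedLength(int64_t DepWriteOffset, int64_t InstWriteOffset) {
  return uint64_t(InstWriteOffset - DepWriteOffset);
}
// e.g. a memset starting at offset 0 whose bytes from offset 24 onward are
// overwritten is shortened to 24 bytes: trimmedLength(0, 24) == 24.
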
SmallVector<Value*, 8> LiveAllocas; - for (SmallPtrSet<Value*, 16>::iterator I = DeadStackObjects.begin(), + for (SmallSetVector<Value*, 16>::iterator I = DeadStackObjects.begin(), E = DeadStackObjects.end(); I != E; ++I) { // See if the call site touches it. AliasAnalysis::ModRefResult A = @@ -780,7 +766,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { for (SmallVector<Value*, 8>::iterator I = LiveAllocas.begin(), E = LiveAllocas.end(); I != E; ++I) - DeadStackObjects.erase(*I); + DeadStackObjects.remove(*I); // If all of the allocas were clobbered by the call then we're not going // to find anything else to process. @@ -827,7 +813,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { /// of the stack objects in the DeadStackObjects set. If so, they become live /// because the location is being loaded. void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, - SmallPtrSet<Value*, 16> &DeadStackObjects) { + SmallSetVector<Value*, 16> &DeadStackObjects) { const Value *UnderlyingPointer = GetUnderlyingObject(LoadedLoc.Ptr); // A constant can't be in the dead pointer set. @@ -837,12 +823,12 @@ void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, // If the kill pointer can be easily reduced to an alloca, don't bother doing // extraneous AA queries. if (isa<AllocaInst>(UnderlyingPointer) || isa<Argument>(UnderlyingPointer)) { - DeadStackObjects.erase(const_cast<Value*>(UnderlyingPointer)); + DeadStackObjects.remove(const_cast<Value*>(UnderlyingPointer)); return; } SmallVector<Value*, 16> NowLive; - for (SmallPtrSet<Value*, 16>::iterator I = DeadStackObjects.begin(), + for (SmallSetVector<Value*, 16>::iterator I = DeadStackObjects.begin(), E = DeadStackObjects.end(); I != E; ++I) { // See if the loaded location could alias the stack location. AliasAnalysis::Location StackLoc(*I, getPointerSize(*I, *AA)); @@ -852,5 +838,5 @@ void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, for (SmallVector<Value*, 16>::iterator I = NowLive.begin(), E = NowLive.end(); I != E; ++I) - DeadStackObjects.erase(*I); + DeadStackObjects.remove(*I); } diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index f3c92d6..9759549 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -39,7 +39,7 @@ static unsigned getHash(const void *V) { } //===----------------------------------------------------------------------===// -// SimpleValue +// SimpleValue //===----------------------------------------------------------------------===// namespace { @@ -47,16 +47,16 @@ namespace { /// scoped hash table. struct SimpleValue { Instruction *Inst; - + SimpleValue(Instruction *I) : Inst(I) { assert((isSentinel() || canHandle(I)) && "Inst can't be handled!"); } - + bool isSentinel() const { return Inst == DenseMapInfo<Instruction*>::getEmptyKey() || Inst == DenseMapInfo<Instruction*>::getTombstoneKey(); } - + static bool canHandle(Instruction *Inst) { // This can only handle non-void readnone functions. if (CallInst *CI = dyn_cast<CallInst>(Inst)) @@ -90,7 +90,7 @@ template<> struct DenseMapInfo<SimpleValue> { unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) { Instruction *Inst = Val.Inst; - + // Hash in all of the operands as pointers. 
unsigned Res = 0; for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) @@ -126,13 +126,13 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { if (LHS.isSentinel() || RHS.isSentinel()) return LHSI == RHSI; - + if (LHSI->getOpcode() != RHSI->getOpcode()) return false; return LHSI->isIdenticalTo(RHSI); } //===----------------------------------------------------------------------===// -// CallValue +// CallValue //===----------------------------------------------------------------------===// namespace { @@ -140,21 +140,21 @@ namespace { /// the scoped hash table. struct CallValue { Instruction *Inst; - + CallValue(Instruction *I) : Inst(I) { assert((isSentinel() || canHandle(I)) && "Inst can't be handled!"); } - + bool isSentinel() const { return Inst == DenseMapInfo<Instruction*>::getEmptyKey() || Inst == DenseMapInfo<Instruction*>::getTombstoneKey(); } - + static bool canHandle(Instruction *Inst) { // Don't value number anything that returns void. if (Inst->getType()->isVoidTy()) return false; - + CallInst *CI = dyn_cast<CallInst>(Inst); if (CI == 0 || !CI->onlyReadsMemory()) return false; @@ -168,7 +168,7 @@ namespace llvm { template<> struct isPodLike<CallValue> { static const bool value = true; }; - + template<> struct DenseMapInfo<CallValue> { static inline CallValue getEmptyKey() { return DenseMapInfo<Instruction*>::getEmptyKey(); @@ -189,7 +189,7 @@ unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) { "Cannot value number calls with metadata operands"); Res ^= getHash(Inst->getOperand(i)) << (i & 0xF); } - + // Mix in the opcode. return (Res << 1) ^ Inst->getOpcode(); } @@ -203,11 +203,11 @@ bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) { //===----------------------------------------------------------------------===// -// EarlyCSE pass. +// EarlyCSE pass. //===----------------------------------------------------------------------===// namespace { - + /// EarlyCSE - This pass does a simple depth-first walk over the dominator /// tree, eliminating trivially redundant instructions and using instsimplify /// to canonicalize things as it goes. It is intended to be fast and catch @@ -223,14 +223,14 @@ public: ScopedHashTableVal<SimpleValue, Value*> > AllocatorTy; typedef ScopedHashTable<SimpleValue, Value*, DenseMapInfo<SimpleValue>, AllocatorTy> ScopedHTType; - + /// AvailableValues - This scoped hash table contains the current values of /// all of our simple scalar expressions. As we walk down the domtree, we /// look to see if instructions are in this: if so, we replace them with what /// we find, otherwise we insert them so that dominated values can succeed in /// their lookup. ScopedHTType *AvailableValues; - + /// AvailableLoads - This scoped hash table contains the current values /// of loads. This allows us to get efficient access to dominating loads when /// we have a fully redundant load. In addition to the most recent load, we @@ -243,15 +243,15 @@ public: typedef ScopedHashTable<Value*, std::pair<Value*, unsigned>, DenseMapInfo<Value*>, LoadMapAllocator> LoadHTType; LoadHTType *AvailableLoads; - + /// AvailableCalls - This scoped hash table contains the current values /// of read-only call values. It uses the same generation count as loads. typedef ScopedHashTable<CallValue, std::pair<Value*, unsigned> > CallHTType; CallHTType *AvailableCalls; - + /// CurrentGeneration - This is the current generation of the memory value. 
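
// [Illustration] The getHashValue specializations above follow one mixing
// scheme: hash each operand's identity at a staggered shift, then fold the
// opcode in last. A self-contained approximation (hashPtr is a stand-in for
// the DenseMapInfo pointer hash the real code delegates to):

#include <cstddef>
#include <cstdint>

static unsigned hashPtr(const void *V) {          // stand-in pointer hash
  uintptr_t P = reinterpret_cast<uintptr_t>(V);
  return unsigned(P ^ (P >> 9));
}

static unsigned hashOperation(unsigned Opcode, const void *const *Ops,
                              size_t NumOps) {
  unsigned Res = 0;
  for (size_t i = 0; i != NumOps; ++i)
    Res ^= hashPtr(Ops[i]) << (i & 0xF);          // stagger by operand index
  return (Res << 1) ^ Opcode;                     // mix in the opcode
}
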
  unsigned CurrentGeneration;
-
+
  static char ID;
  explicit EarlyCSE() : FunctionPass(ID) {
    initializeEarlyCSEPass(*PassRegistry::getPassRegistry());
  }
@@ -326,7 +326,7 @@ private:
  };

  bool processNode(DomTreeNode *Node);
-
+
  // This transformation requires dominator and postdominator info
  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
    AU.addRequired<DominatorTree>();
@@ -350,7 +350,7 @@ INITIALIZE_PASS_END(EarlyCSE, "early-cse", "Early CSE", false, false)

bool EarlyCSE::processNode(DomTreeNode *Node) {
  BasicBlock *BB = Node->getBlock();
-
+
  // If this block has a single predecessor, then the predecessor is the parent
  // of the domtree node and all of the live out memory values are still current
  // in this block. If this block has multiple predecessors, then they could
  // have invalidated the live-out memory values of our parent value. For now,
  // just be conservative and invalidate memory if this block has multiple
  // predecessors.
  if (BB->getSinglePredecessor() == 0)
    ++CurrentGeneration;
-
+
  /// LastStore - Keep track of the last non-volatile store that we saw... for
  /// as long as there is no instruction that reads memory. If we see a store
  /// to the same location, we delete the dead store. This zaps trivial dead
  /// stores which can occur in bitfield code among other things.
  StoreInst *LastStore = 0;
-
+
  bool Changed = false;

  // See if any instructions in the block can be eliminated. If so, do it. If
  // not, add them to AvailableValues.
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
    Instruction *Inst = I++;
-
+
    // Dead instructions should just be removed.
    if (isInstructionTriviallyDead(Inst)) {
      DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n');
@@ -381,7 +381,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
      ++NumSimplify;
      continue;
    }
-
+
    // If the instruction can be simplified (e.g. X+0 = X) then replace it with
    // its simpler value.
    if (Value *V = SimplifyInstruction(Inst, TD, TLI, DT)) {
@@ -392,7 +392,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
      ++NumSimplify;
      continue;
    }
-
+
    // If this is a simple instruction that we can value number, process it.
    if (SimpleValue::canHandle(Inst)) {
      // See if the instruction has an available value. If so, use it.
@@ -404,12 +404,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
        ++NumCSE;
        continue;
      }
-
+
      // Otherwise, just remember that this value is available.
      AvailableValues->insert(Inst, Inst);
      continue;
    }
-
+
    // If this is a non-volatile load, process it.
    if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
      // Ignore volatile loads.
      if (!LI->isSimple()) {
        LastStore = 0;
        continue;
      }
-
+
      // If we have an available version of this load, and if it is the right
      // generation, replace this instruction.
      std::pair<Value*, unsigned> InVal =
@@ -431,18 +431,18 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
        ++NumCSELoad;
        continue;
      }
-
+
      // Otherwise, remember that we have this instruction.
      AvailableLoads->insert(Inst->getOperand(0),
                             std::pair<Value*, unsigned>(Inst, CurrentGeneration));
      LastStore = 0;
      continue;
    }
-
+
    // If this instruction may read from memory, forget LastStore.
    if (Inst->mayReadFromMemory())
      LastStore = 0;
-
+
    // If this is a read-only call, process it.
    if (CallValue::canHandle(Inst)) {
      // If we have an available version of this call, and if it is the right
@@ -457,19 +457,19 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
        ++NumCSECall;
        continue;
      }
-
+
      // Otherwise, remember that we have this instruction.
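
// [Illustration] The CurrentGeneration counter makes invalidation O(1):
// available loads and calls are tagged with the generation in which they were
// seen, and any may-write instruction bumps the counter so stale entries
// simply fail the comparison. A toy model of just that aspect (ignoring the
// scoped hash table and dominator-tree walk):

#include <map>
#include <string>

struct LoadTable {
  unsigned CurrentGeneration;
  // location -> (known value, generation it was recorded in)
  std::map<std::string, std::pair<int, unsigned> > Available;

  LoadTable() : CurrentGeneration(0) {}

  void mayWriteMemory() { ++CurrentGeneration; }   // invalidates everything

  void recordLoad(const std::string &Loc, int V) {
    Available[Loc] = std::make_pair(V, CurrentGeneration);
  }

  bool reuse(const std::string &Loc, int &V) {     // CSE hit only if same gen
    std::map<std::string, std::pair<int, unsigned> >::iterator I =
        Available.find(Loc);
    if (I == Available.end() || I->second.second != CurrentGeneration)
      return false;
    V = I->second.first;
    return true;
  }
};
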
AvailableCalls->insert(Inst, std::pair<Value*, unsigned>(Inst, CurrentGeneration)); continue; } - + // Okay, this isn't something we can CSE at all. Check to see if it is // something that could modify memory. If so, our available memory values // cannot be used so bump the generation count. if (Inst->mayWriteToMemory()) { ++CurrentGeneration; - + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { // We do a trivial form of DSE if there are two stores to the same // location with no intervening loads. Delete the earlier store. @@ -483,7 +483,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { LastStore = 0; continue; } - + // Okay, we just invalidated anything we knew about loaded values. Try // to salvage *something* by remembering that the stored value is a live // version of the pointer. It is safe to forward from volatile stores @@ -491,7 +491,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // the store. AvailableLoads->insert(SI->getPointerOperand(), std::pair<Value*, unsigned>(SI->getValueOperand(), CurrentGeneration)); - + // Remember that this was the last store we saw for DSE. if (SI->isSimple()) LastStore = SI; @@ -509,7 +509,7 @@ bool EarlyCSE::runOnFunction(Function &F) { TD = getAnalysisIfAvailable<TargetData>(); TLI = &getAnalysis<TargetLibraryInfo>(); DT = &getAnalysis<DominatorTree>(); - + // Tables that the pass uses when walking the domtree. ScopedHTType AVTable; AvailableValues = &AVTable; @@ -517,7 +517,7 @@ bool EarlyCSE::runOnFunction(Function &F) { AvailableLoads = &LoadTable; CallHTType CallTable; AvailableCalls = &CallTable; - + CurrentGeneration = 0; bool Changed = false; diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index fb733ad..140864d 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -18,8 +18,15 @@ #define DEBUG_TYPE "gvn" #include "llvm/Transforms/Scalar.h" #include "llvm/GlobalVariable.h" +#include "llvm/IRBuilder.h" #include "llvm/IntrinsicInst.h" #include "llvm/LLVMContext.h" +#include "llvm/Metadata.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/Dominators.h" @@ -30,20 +37,14 @@ #include "llvm/Analysis/PHITransAddr.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Assembly/Writer.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/SSAUpdater.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/Hashing.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/Statistic.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/PatternMatch.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; using namespace PatternMatch; @@ -59,6 +60,11 @@ static cl::opt<bool> EnablePRE("enable-pre", cl::init(true), cl::Hidden); static cl::opt<bool> EnableLoadPRE("enable-load-pre", cl::init(true)); +// Maximum allowed recursion depth. 
+static cl::opt<uint32_t>
+MaxRecurseDepth("max-recurse-depth", cl::Hidden, cl::init(1000), cl::ZeroOrMore,
+                cl::desc("Max recurse depth (default = 1000)"));
+
 //===----------------------------------------------------------------------===//
 //                         ValueTable Class
 //===----------------------------------------------------------------------===//
@@ -167,7 +173,7 @@ Expression ValueTable::create_expression(Instruction *I) {
     if (e.varargs[0] > e.varargs[1])
       std::swap(e.varargs[0], e.varargs[1]);
   }
-
+
   if (CmpInst *C = dyn_cast<CmpInst>(I)) {
     // Sort the operand value numbers so x<y and y>x get the same value number.
     CmpInst::Predicate Predicate = C->getPredicate();
@@ -181,7 +187,7 @@ Expression ValueTable::create_expression(Instruction *I) {
          II != IE; ++II)
       e.varargs.push_back(*II);
   }
-
+
   return e;
 }
@@ -385,7 +391,7 @@ uint32_t ValueTable::lookup_or_add(Value *V) {
     valueNumbering[V] = nextValueNumber;
     return nextValueNumber++;
   }
-
+
   Instruction* I = cast<Instruction>(V);
   Expression exp;
   switch (I->getOpcode()) {
@@ -501,7 +507,7 @@ namespace {
     const TargetLibraryInfo *TLI;

     ValueTable VN;
-
+
     /// LeaderTable - A mapping from value numbers to lists of Value*'s that
     /// have that value number. Use findLeader to query it.
     struct LeaderTableEntry {
@@ -511,7 +517,7 @@ namespace {
     };
     DenseMap<uint32_t, LeaderTableEntry> LeaderTable;
     BumpPtrAllocator TableAllocator;
-
+
     SmallVector<Instruction*, 8> InstrsToErase;
   public:
     static char ID; // Pass identification, replacement for typeid
@@ -521,14 +527,14 @@ namespace {
     }

     bool runOnFunction(Function &F);
-
+
     /// markInstructionForDeletion - This removes the specified instruction from
     /// our various maps and marks it for deletion.
     void markInstructionForDeletion(Instruction *I) {
       VN.erase(I);
       InstrsToErase.push_back(I);
     }
-
+
     const TargetData *getTargetData() const { return TD; }
     DominatorTree &getDominatorTree() const { return *DT; }
     AliasAnalysis *getAliasAnalysis() const { return VN.getAliasAnalysis(); }
@@ -543,25 +549,25 @@ namespace {
         Curr.BB = BB;
         return;
       }
-
+
       LeaderTableEntry *Node = TableAllocator.Allocate<LeaderTableEntry>();
       Node->Val = V;
       Node->BB = BB;
       Node->Next = Curr.Next;
       Curr.Next = Node;
     }
-
+
     /// removeFromLeaderTable - Scan the list of values corresponding to a given
-    /// value number, and remove the given value if encountered.
-    void removeFromLeaderTable(uint32_t N, Value *V, BasicBlock *BB) {
+    /// value number, and remove the given instruction if encountered.
+    void removeFromLeaderTable(uint32_t N, Instruction *I, BasicBlock *BB) {
       LeaderTableEntry* Prev = 0;
       LeaderTableEntry* Curr = &LeaderTable[N];

-      while (Curr->Val != V || Curr->BB != BB) {
+      while (Curr->Val != I || Curr->BB != BB) {
         Prev = Curr;
         Curr = Curr->Next;
       }
-
+
       if (Prev) {
         Prev->Next = Curr->Next;
       } else {
@@ -591,7 +597,7 @@ namespace {
       AU.addPreserved<DominatorTree>();
       AU.addPreserved<AliasAnalysis>();
     }
-
+
     // Helper functions
     // FIXME: eliminate or document these better
@@ -647,7 +653,11 @@ void GVN::dump(DenseMap<uint32_t, Value*>& d) {
 ///   3) we are speculating for this block and have used that to speculate for
 ///      other blocks.
 static bool IsValueFullyAvailableInBlock(BasicBlock *BB,
-                            DenseMap<BasicBlock*, char> &FullyAvailableBlocks) {
+                            DenseMap<BasicBlock*, char> &FullyAvailableBlocks,
+                            uint32_t RecurseDepth) {
+  if (RecurseDepth > MaxRecurseDepth)
+    return false;
+
   // Optimistically assume that the block is fully available and check to see
   // if we already know about this block in one lookup.
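
// [Illustration] The new RecurseDepth parameter caps the speculative walk
// over predecessors. A condensed standalone model of the optimistic search
// with that cap (toy Block type; the real code tracks four per-block states
// and repairs wrong speculation in a separate SpeculationFailure pass):

#include <map>
#include <vector>

struct Block {
  std::vector<Block*> Preds;
  bool DefinesValue;
};

static bool fullyAvailable(Block *BB, std::map<Block*, char> &State,
                           unsigned Depth, unsigned MaxDepth) {
  if (Depth > MaxDepth)
    return false;                                  // give up conservatively
  std::pair<std::map<Block*, char>::iterator, bool> R =
      State.insert(std::make_pair(BB, char(0)));   // 0 = assumed available
  if (!R.second)
    return R.first->second == 0;                   // already decided/in progress
  if (BB->DefinesValue)
    return true;
  if (BB->Preds.empty()) {                         // reached entry: unavailable
    R.first->second = 1;
    return false;
  }
  for (size_t i = 0; i != BB->Preds.size(); ++i)
    if (!fullyAvailable(BB->Preds[i], State, Depth + 1, MaxDepth)) {
      R.first->second = 1;                         // undo optimistic assumption
      return false;
    }
  return true;
}
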
std::pair<DenseMap<BasicBlock*, char>::iterator, char> IV = @@ -673,7 +683,7 @@ static bool IsValueFullyAvailableInBlock(BasicBlock *BB, // If the value isn't fully available in one of our predecessors, then it // isn't fully available in this block either. Undo our previous // optimistic assumption and bail out. - if (!IsValueFullyAvailableInBlock(*PI, FullyAvailableBlocks)) + if (!IsValueFullyAvailableInBlock(*PI, FullyAvailableBlocks,RecurseDepth+1)) goto SpeculationFailure; return true; @@ -725,15 +735,15 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, StoredVal->getType()->isStructTy() || StoredVal->getType()->isArrayTy()) return false; - + // The store has to be at least as big as the load. if (TD.getTypeSizeInBits(StoredVal->getType()) < TD.getTypeSizeInBits(LoadTy)) return false; - + return true; } - + /// CoerceAvailableValueToLoadType - If we saw a store of a value to memory, and /// then a load from a must-aliased pointer of a different type, try to coerce @@ -741,80 +751,80 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, /// InsertPt is the place to insert new instructions. /// /// If we can't do it, return null. -static Value *CoerceAvailableValueToLoadType(Value *StoredVal, +static Value *CoerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy, Instruction *InsertPt, const TargetData &TD) { if (!CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, TD)) return 0; - + // If this is already the right type, just return it. Type *StoredValTy = StoredVal->getType(); - + uint64_t StoreSize = TD.getTypeSizeInBits(StoredValTy); uint64_t LoadSize = TD.getTypeSizeInBits(LoadedTy); - + // If the store and reload are the same size, we can always reuse it. if (StoreSize == LoadSize) { // Pointer to Pointer -> use bitcast. if (StoredValTy->isPointerTy() && LoadedTy->isPointerTy()) return new BitCastInst(StoredVal, LoadedTy, "", InsertPt); - + // Convert source pointers to integers, which can be bitcast. if (StoredValTy->isPointerTy()) { StoredValTy = TD.getIntPtrType(StoredValTy->getContext()); StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt); } - + Type *TypeToCastTo = LoadedTy; if (TypeToCastTo->isPointerTy()) TypeToCastTo = TD.getIntPtrType(StoredValTy->getContext()); - + if (StoredValTy != TypeToCastTo) StoredVal = new BitCastInst(StoredVal, TypeToCastTo, "", InsertPt); - + // Cast to pointer if the load needs a pointer type. if (LoadedTy->isPointerTy()) StoredVal = new IntToPtrInst(StoredVal, LoadedTy, "", InsertPt); - + return StoredVal; } - + // If the loaded value is smaller than the available value, then we can // extract out a piece from it. If the available value is too small, then we // can't do anything. assert(StoreSize >= LoadSize && "CanCoerceMustAliasedValueToLoad fail"); - + // Convert source pointers to integers, which can be manipulated. if (StoredValTy->isPointerTy()) { StoredValTy = TD.getIntPtrType(StoredValTy->getContext()); StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt); } - + // Convert vectors and fp to integer, which can be manipulated. if (!StoredValTy->isIntegerTy()) { StoredValTy = IntegerType::get(StoredValTy->getContext(), StoreSize); StoredVal = new BitCastInst(StoredVal, StoredValTy, "", InsertPt); } - + // If this is a big-endian system, we need to shift the value down to the low // bits so that a truncate will work. 
  if (TD.isBigEndian()) {
    Constant *Val = ConstantInt::get(StoredVal->getType(), StoreSize-LoadSize);
    StoredVal = BinaryOperator::CreateLShr(StoredVal, Val, "tmp", InsertPt);
  }
-
+
  // Truncate the integer to the right size now.
  Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadSize);
  StoredVal = new TruncInst(StoredVal, NewIntTy, "trunc", InsertPt);
-
+
  if (LoadedTy == NewIntTy)
    return StoredVal;
-
+
  // If the result is a pointer, inttoptr.
  if (LoadedTy->isPointerTy())
    return new IntToPtrInst(StoredVal, LoadedTy, "inttoptr", InsertPt);
-
+
  // Otherwise, bitcast.
  return new BitCastInst(StoredVal, LoadedTy, "bitcast", InsertPt);
}
@@ -835,13 +845,13 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
  // to transform them. We need to be able to bitcast to integer.
  if (LoadTy->isStructTy() || LoadTy->isArrayTy())
    return -1;
-
+
  int64_t StoreOffset = 0, LoadOffset = 0;
  Value *StoreBase = GetPointerBaseWithConstantOffset(WritePtr, StoreOffset,TD);
  Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, TD);
  if (StoreBase != LoadBase)
    return -1;
-
+
  // If the load and store are to the exact same address, they should have been
  // a must alias. AA must have gotten confused.
  // FIXME: Study to see if/when this happens. One case is forwarding a memset
  // to a load of the base of the memset.
@@ -856,18 +866,18 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
      abort();
    }
  }
#endif
-
+
  // If the load and store don't overlap at all, the store doesn't provide
  // anything to the load. In this case, they really don't alias at all, AA
  // must have gotten confused.
  uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy);
-
+
  if ((WriteSizeInBits & 7) | (LoadSize & 7))
    return -1;

  uint64_t StoreSize = WriteSizeInBits >> 3;  // Convert to bytes.
  LoadSize >>= 3;
-
-
+
+
  bool isAAFailure = false;
  if (StoreOffset < LoadOffset)
    isAAFailure = StoreOffset+int64_t(StoreSize) <= LoadOffset;
@@ -885,7 +895,7 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
#endif
    return -1;
  }
-
+
  // If the Load isn't completely contained within the stored bits, we don't
  // have all the bits to feed it. We could do something crazy in the future
  // (issue a smaller load then merge the bits in) but this seems unlikely to be
  // valuable.
  if (StoreOffset > LoadOffset ||
      StoreOffset+StoreSize < LoadOffset+LoadSize)
    return -1;
-
+
  // Okay, we can do this transformation. Return the number of bytes into the
  // store that the load is.
  return LoadOffset-StoreOffset;
-}
+}

/// AnalyzeLoadFromClobberingStore - This function is called when we have a
/// memdep query of a load that ends up being a clobbering store.
@@ -923,23 +933,23 @@ static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr,
  // Cannot handle reading from store of first-class aggregate yet.
  if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
    return -1;
-
+
  Value *DepPtr = DepLI->getPointerOperand();
  uint64_t DepSize = TD.getTypeSizeInBits(DepLI->getType());
  int R = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, TD);
  if (R != -1) return R;
-
+
  // If we have a load/load clobber and DepLI can be widened to cover this load,
  // then we should widen it!
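
// [Illustration] Stripped of the IR plumbing, the decision at the end of
// AnalyzeLoadFromClobberingWrite reduces to: the load must lie wholly inside
// the store, and the result is its byte offset within the stored bytes.
// A standalone sketch (hypothetical helper name):

#include <cstdint>

// Returns the load's byte offset inside the store, or -1 when the store does
// not fully provide the loaded bytes.
static int coveredLoadOffset(int64_t StoreOff, uint64_t StoreSize,
                             int64_t LoadOff, uint64_t LoadSize) {
  if (StoreOff > LoadOff ||
      StoreOff + int64_t(StoreSize) < LoadOff + int64_t(LoadSize))
    return -1;
  return int(LoadOff - StoreOff);
}
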
int64_t LoadOffs = 0; const Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, TD); unsigned LoadSize = TD.getTypeStoreSize(LoadTy); - + unsigned Size = MemoryDependenceAnalysis:: getLoadLoadClobberFullWidthSize(LoadBase, LoadOffs, LoadSize, DepLI, TD); if (Size == 0) return -1; - + return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size*8, TD); } @@ -958,29 +968,29 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, if (MI->getIntrinsicID() == Intrinsic::memset) return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(), MemSizeInBits, TD); - + // If we have a memcpy/memmove, the only case we can handle is if this is a // copy from constant memory. In that case, we can read directly from the // constant memory. MemTransferInst *MTI = cast<MemTransferInst>(MI); - + Constant *Src = dyn_cast<Constant>(MTI->getSource()); if (Src == 0) return -1; - + GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, &TD)); if (GV == 0 || !GV->isConstant()) return -1; - + // See if the access is within the bounds of the transfer. int Offset = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(), MemSizeInBits, TD); if (Offset == -1) return Offset; - + // Otherwise, see if we can constant fold a load from the constant with the // offset applied as appropriate. Src = ConstantExpr::getBitCast(Src, llvm::Type::getInt8PtrTy(Src->getContext())); - Constant *OffsetCst = + Constant *OffsetCst = ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); Src = ConstantExpr::getGetElementPtr(Src, OffsetCst); Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy)); @@ -988,7 +998,7 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, return Offset; return -1; } - + /// GetStoreValueForLoad - This function is called when we have a /// memdep query of a load that ends up being a clobbering store. This means @@ -999,32 +1009,32 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy, Instruction *InsertPt, const TargetData &TD){ LLVMContext &Ctx = SrcVal->getType()->getContext(); - + uint64_t StoreSize = (TD.getTypeSizeInBits(SrcVal->getType()) + 7) / 8; uint64_t LoadSize = (TD.getTypeSizeInBits(LoadTy) + 7) / 8; - + IRBuilder<> Builder(InsertPt->getParent(), InsertPt); - + // Compute which bits of the stored value are being used by the load. Convert // to an integer type to start with. if (SrcVal->getType()->isPointerTy()) SrcVal = Builder.CreatePtrToInt(SrcVal, TD.getIntPtrType(Ctx)); if (!SrcVal->getType()->isIntegerTy()) SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8)); - + // Shift the bits to the least significant depending on endianness. unsigned ShiftAmt; if (TD.isLittleEndian()) ShiftAmt = Offset*8; else ShiftAmt = (StoreSize-LoadSize-Offset)*8; - + if (ShiftAmt) SrcVal = Builder.CreateLShr(SrcVal, ShiftAmt); - + if (LoadSize != StoreSize) SrcVal = Builder.CreateTrunc(SrcVal, IntegerType::get(Ctx, LoadSize*8)); - + return CoerceAvailableValueToLoadType(SrcVal, LoadTy, InsertPt, TD); } @@ -1051,14 +1061,14 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, NewLoadSize = NextPowerOf2(NewLoadSize); Value *PtrVal = SrcVal->getPointerOperand(); - + // Insert the new load after the old load. This ensures that subsequent // memdep queries will find the new load. We can't easily remove the old // load completely because it is already in the value numbering table. 
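
// [Illustration] GetStoreValueForLoad's shift/truncate sequence has a compact
// arithmetic core: pick the shift from the endianness, shift right, truncate.
// The same computation on host integers (the pass emits lshr/trunc IR
// instead):

#include <cassert>
#include <cstdint>

// Extract LoadSize bytes at byte offset Offset from a StoreSize-byte store.
static uint64_t extractBytes(uint64_t StoredVal, unsigned StoreSize,
                             unsigned LoadSize, unsigned Offset,
                             bool LittleEndian) {
  unsigned ShiftAmt = LittleEndian
                          ? Offset * 8                           // LE: low bytes first
                          : (StoreSize - LoadSize - Offset) * 8; // BE: from the top
  uint64_t V = StoredVal >> ShiftAmt;                            // lshr
  if (LoadSize < 8)
    V &= (uint64_t(1) << (LoadSize * 8)) - 1;                    // trunc
  return V;
}

int main() {
  // Forward the byte at offset 1 of a little-endian i32 store of 0xAABBCCDD.
  assert(extractBytes(0xAABBCCDDu, 4, 1, 1, true) == 0xCC);
  return 0;
}
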
IRBuilder<> Builder(SrcVal->getParent(), ++BasicBlock::iterator(SrcVal)); - Type *DestPTy = + Type *DestPTy = IntegerType::get(LoadTy->getContext(), NewLoadSize*8); - DestPTy = PointerType::get(DestPTy, + DestPTy = PointerType::get(DestPTy, cast<PointerType>(PtrVal->getType())->getAddressSpace()); Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc()); PtrVal = Builder.CreateBitCast(PtrVal, DestPTy); @@ -1068,7 +1078,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n"); DEBUG(dbgs() << "TO: " << *NewLoad << "\n"); - + // Replace uses of the original load with the wider load. On a big endian // system, we need to shift down to get the relevant bits. Value *RV = NewLoad; @@ -1077,7 +1087,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, NewLoadSize*8-SrcVal->getType()->getPrimitiveSizeInBits()); RV = Builder.CreateTrunc(RV, SrcVal->getType()); SrcVal->replaceAllUsesWith(RV); - + // We would like to use gvn.markInstructionForDeletion here, but we can't // because the load is already memoized into the leader map table that GVN // tracks. It is potentially possible to remove the load from the table, @@ -1086,7 +1096,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, gvn.getMemDep().removeInstruction(SrcVal); SrcVal = NewLoad; } - + return GetStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, TD); } @@ -1100,7 +1110,7 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy)/8; IRBuilder<> Builder(InsertPt->getParent(), InsertPt); - + // We know that this method is only called when the mem transfer fully // provides the bits for the load. if (MemSetInst *MSI = dyn_cast<MemSetInst>(SrcInst)) { @@ -1109,9 +1119,9 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, Value *Val = MSI->getValue(); if (LoadSize != 1) Val = Builder.CreateZExt(Val, IntegerType::get(Ctx, LoadSize*8)); - + Value *OneElt = Val; - + // Splat the value out to the right number of bits. for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize; ) { // If we can double the number of bytes set, do it. @@ -1121,16 +1131,16 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, NumBytesSet <<= 1; continue; } - + // Otherwise insert one byte at a time. Value *ShVal = Builder.CreateShl(Val, 1*8); Val = Builder.CreateOr(OneElt, ShVal); ++NumBytesSet; } - + return CoerceAvailableValueToLoadType(Val, LoadTy, InsertPt, TD); } - + // Otherwise, this is a memcpy/memmove from a constant global. MemTransferInst *MTI = cast<MemTransferInst>(SrcInst); Constant *Src = cast<Constant>(MTI->getSource()); @@ -1139,7 +1149,7 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, // offset applied as appropriate. Src = ConstantExpr::getBitCast(Src, llvm::Type::getInt8PtrTy(Src->getContext())); - Constant *OffsetCst = + Constant *OffsetCst = ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); Src = ConstantExpr::getGetElementPtr(Src, OffsetCst); Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy)); @@ -1156,13 +1166,13 @@ struct AvailableValueInBlock { LoadVal, // A value produced by a load. MemIntrin // A memory intrinsic which is loaded from. }; - + /// V - The value that is live out of the block. PointerIntPair<Value *, 2, ValType> Val; - + /// Offset - The byte offset in Val that is interesting for the load query. 
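
// [Illustration] The splat loop in GetMemInstValueForLoad doubles the
// materialized bytes while that still fits, then appends single bytes. The
// same computation on host integers (illustrative; the pass builds shl/or IR):

#include <cassert>
#include <cstdint>

static uint64_t splatByte(uint8_t Byte, unsigned LoadSize) { // LoadSize <= 8
  uint64_t Val = Byte;
  uint64_t OneElt = Byte;
  for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize;) {
    if (NumBytesSet * 2 <= LoadSize) {      // double the bytes already set
      Val |= Val << (NumBytesSet * 8);
      NumBytesSet <<= 1;
      continue;
    }
    Val = OneElt | (Val << 8);              // otherwise add one byte at a time
    ++NumBytesSet;
  }
  return Val;
}

int main() {
  assert(splatByte(0xAB, 3) == 0xABABABu);  // a 3-byte load from memset(0xAB)
  return 0;
}
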
unsigned Offset; - + static AvailableValueInBlock get(BasicBlock *BB, Value *V, unsigned Offset = 0) { AvailableValueInBlock Res; @@ -1182,7 +1192,7 @@ struct AvailableValueInBlock { Res.Offset = Offset; return Res; } - + static AvailableValueInBlock getLoad(BasicBlock *BB, LoadInst *LI, unsigned Offset = 0) { AvailableValueInBlock Res; @@ -1201,17 +1211,17 @@ struct AvailableValueInBlock { assert(isSimpleValue() && "Wrong accessor"); return Val.getPointer(); } - + LoadInst *getCoercedLoadValue() const { assert(isCoercedLoadValue() && "Wrong accessor"); return cast<LoadInst>(Val.getPointer()); } - + MemIntrinsic *getMemIntrinValue() const { assert(isMemIntrinValue() && "Wrong accessor"); return cast<MemIntrinsic>(Val.getPointer()); } - + /// MaterializeAdjustedValue - Emit code into this block to adjust the value /// defined here to the specified type. This handles various coercion cases. Value *MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const { @@ -1223,7 +1233,7 @@ struct AvailableValueInBlock { assert(TD && "Need target data to handle type mismatch case"); Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(), *TD); - + DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " " << *getSimpleValue() << '\n' << *Res << '\n' << "\n\n\n"); @@ -1235,7 +1245,7 @@ struct AvailableValueInBlock { } else { Res = GetLoadValueForLoad(Load, Offset, LoadTy, BB->getTerminator(), gvn); - + DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << " " << *getCoercedLoadValue() << '\n' << *Res << '\n' << "\n\n\n"); @@ -1258,12 +1268,12 @@ struct AvailableValueInBlock { /// ConstructSSAForLoadSet - Given a set of loads specified by ValuesPerBlock, /// construct SSA form, allowing us to eliminate LI. This returns the value /// that should be used at LI's definition site. -static Value *ConstructSSAForLoadSet(LoadInst *LI, +static Value *ConstructSSAForLoadSet(LoadInst *LI, SmallVectorImpl<AvailableValueInBlock> &ValuesPerBlock, GVN &gvn) { // Check for the fully redundant, dominating load case. In this case, we can // just use the dominating value directly. - if (ValuesPerBlock.size() == 1 && + if (ValuesPerBlock.size() == 1 && gvn.getDominatorTree().properlyDominates(ValuesPerBlock[0].BB, LI->getParent())) return ValuesPerBlock[0].MaterializeAdjustedValue(LI->getType(), gvn); @@ -1272,29 +1282,29 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, SmallVector<PHINode*, 8> NewPHIs; SSAUpdater SSAUpdate(&NewPHIs); SSAUpdate.Initialize(LI->getType(), LI->getName()); - + Type *LoadTy = LI->getType(); - + for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) { const AvailableValueInBlock &AV = ValuesPerBlock[i]; BasicBlock *BB = AV.BB; - + if (SSAUpdate.HasValueForBlock(BB)) continue; SSAUpdate.AddAvailableValue(BB, AV.MaterializeAdjustedValue(LoadTy, gvn)); } - + // Perform PHI construction. Value *V = SSAUpdate.GetValueInMiddleOfBlock(LI->getParent()); - + // If new PHI nodes were created, notify alias analysis. if (V->getType()->isPointerTy()) { AliasAnalysis *AA = gvn.getAliasAnalysis(); - + for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) AA->copyValue(LI, NewPHIs[i]); - + // Now that we've copied information to the new PHIs, scan through // them again and inform alias analysis that we've added potentially // escaping uses to any values that are operands to these PHIs. @@ -1366,7 +1376,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { // the pointer operand of the load if PHI translation occurs. Make sure // to consider the right address. 
Value *Address = Deps[i].getAddress(); - + // If the dependence is to a store that writes to a superset of the bits // read by the load, we can extract the bits we need for the load from the // stored value. @@ -1382,7 +1392,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { } } } - + // Check to see if we have something like this: // load i32* P // load i8* (P+1) @@ -1394,7 +1404,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { int Offset = AnalyzeLoadFromClobberingLoad(LI->getType(), LI->getPointerOperand(), DepLI, *TD); - + if (Offset != -1) { ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB,DepLI, Offset)); @@ -1413,10 +1423,10 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { ValuesPerBlock.push_back(AvailableValueInBlock::getMI(DepBB, DepMI, Offset)); continue; - } + } } } - + UnavailableBlocks.push_back(DepBB); continue; } @@ -1426,14 +1436,14 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { Instruction *DepInst = DepInfo.getInst(); // Loading the allocation -> undef. - if (isa<AllocaInst>(DepInst) || isMalloc(DepInst) || + if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst) || // Loading immediately after lifetime begin -> undef. isLifetimeStart(DepInst)) { ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB, UndefValue::get(LI->getType()))); continue; } - + if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) { // Reject loads and stores that are to the same address but are of // different types if we have to. @@ -1451,7 +1461,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { S->getValueOperand())); continue; } - + if (LoadInst *LD = dyn_cast<LoadInst>(DepInst)) { // If the types mismatch and we can't handle it, reject reuse of the load. if (LD->getType() != LI->getType()) { @@ -1460,12 +1470,12 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { if (TD == 0 || !CanCoerceMustAliasedValueToLoad(LD, LI->getType(),*TD)){ UnavailableBlocks.push_back(DepBB); continue; - } + } } ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB, LD)); continue; } - + UnavailableBlocks.push_back(DepBB); continue; } @@ -1479,7 +1489,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { // its value. Insert PHIs and remove the fully redundant value now. if (UnavailableBlocks.empty()) { DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n'); - + // Perform PHI construction. Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this); LI->replaceAllUsesWith(V); @@ -1522,10 +1532,10 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { return false; if (Blockers.count(TmpBB)) return false; - + // If any of these blocks has more than one successor (i.e. if the edge we - // just traversed was critical), then there are other paths through this - // block along which the load may not be anticipated. Hoisting the load + // just traversed was critical), then there are other paths through this + // block along which the load may not be anticipated. Hoisting the load // above this block would be adding the load to execution paths along // which it was not previously executed. 
if (TmpBB->getTerminator()->getNumSuccessors() != 1) @@ -1570,7 +1580,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB); PI != E; ++PI) { BasicBlock *Pred = *PI; - if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks)) { + if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0)) { continue; } PredLoads[Pred] = 0; @@ -1603,7 +1613,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { unsigned NumUnavailablePreds = PredLoads.size(); assert(NumUnavailablePreds != 0 && "Fully available value should be eliminated above!"); - + // If this load is unavailable in multiple predecessors, reject it. // FIXME: If we could restructure the CFG, we could make a common pred with // all the preds that don't have an available LI and insert a new load into @@ -1680,10 +1690,10 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { DEBUG(if (!NewInsts.empty()) dbgs() << "INSERTED " << NewInsts.size() << " INSTS: " << *NewInsts.back() << '\n'); - + // Assign value numbers to the new instructions. for (unsigned i = 0, e = NewInsts.size(); i != e; ++i) { - // FIXME: We really _ought_ to insert these value numbers into their + // FIXME: We really _ought_ to insert these value numbers into their // parent's availability map. However, in doing so, we risk getting into // ordering issues. If a block hasn't been processed yet, we would be // marking a value as AVAIL-IN, which isn't what we intend. @@ -1725,6 +1735,53 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { return true; } +static void patchReplacementInstruction(Value *Repl, Instruction *I) { + // Patch the replacement so that it is not more restrictive than the value + // being replaced. + BinaryOperator *Op = dyn_cast<BinaryOperator>(I); + BinaryOperator *ReplOp = dyn_cast<BinaryOperator>(Repl); + if (Op && ReplOp && isa<OverflowingBinaryOperator>(Op) && + isa<OverflowingBinaryOperator>(ReplOp)) { + if (ReplOp->hasNoSignedWrap() && !Op->hasNoSignedWrap()) + ReplOp->setHasNoSignedWrap(false); + if (ReplOp->hasNoUnsignedWrap() && !Op->hasNoUnsignedWrap()) + ReplOp->setHasNoUnsignedWrap(false); + } + if (Instruction *ReplInst = dyn_cast<Instruction>(Repl)) { + SmallVector<std::pair<unsigned, MDNode*>, 4> Metadata; + ReplInst->getAllMetadataOtherThanDebugLoc(Metadata); + for (int i = 0, n = Metadata.size(); i < n; ++i) { + unsigned Kind = Metadata[i].first; + MDNode *IMD = I->getMetadata(Kind); + MDNode *ReplMD = Metadata[i].second; + switch(Kind) { + default: + ReplInst->setMetadata(Kind, NULL); // Remove unknown metadata + break; + case LLVMContext::MD_dbg: + llvm_unreachable("getAllMetadataOtherThanDebugLoc returned a MD_dbg"); + case LLVMContext::MD_tbaa: + ReplInst->setMetadata(Kind, MDNode::getMostGenericTBAA(IMD, ReplMD)); + break; + case LLVMContext::MD_range: + ReplInst->setMetadata(Kind, MDNode::getMostGenericRange(IMD, ReplMD)); + break; + case LLVMContext::MD_prof: + llvm_unreachable("MD_prof in a non terminator instruction"); + break; + case LLVMContext::MD_fpmath: + ReplInst->setMetadata(Kind, MDNode::getMostGenericFPMath(IMD, ReplMD)); + break; + } + } + } +} + +static void patchAndReplaceAllUsesWith(Value *Repl, Instruction *I) { + patchReplacementInstruction(Repl, I); + I->replaceAllUsesWith(Repl); +} + /// processLoad - Attempt to eliminate a load, first by eliminating it /// locally, and then attempting non-local elimination if that fails. 
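
// [Illustration] patchReplacementInstruction enforces a simple rule: the
// surviving instruction may only keep a guarantee (nsw/nuw, or mergeable
// metadata) if both instructions had it. Reduced to the wrap flags alone:

struct WrapFlags {
  bool NoSignedWrap;
  bool NoUnsignedWrap;
};

// The replacement must not be more restrictive than the instruction it
// replaces, so the merged guarantees are the conjunction of the two.
static WrapFlags mergeForReplacement(WrapFlags Repl, WrapFlags Orig) {
  WrapFlags Merged;
  Merged.NoSignedWrap = Repl.NoSignedWrap && Orig.NoSignedWrap;
  Merged.NoUnsignedWrap = Repl.NoUnsignedWrap && Orig.NoUnsignedWrap;
  return Merged;
}
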
bool GVN::processLoad(LoadInst *L) { @@ -1738,7 +1795,7 @@ bool GVN::processLoad(LoadInst *L) { markInstructionForDeletion(L); return true; } - + // ... to a pointer that has been loaded from before... MemDepResult Dep = MD->getDependency(L); @@ -1764,7 +1821,7 @@ bool GVN::processLoad(LoadInst *L) { AvailVal = GetStoreValueForLoad(DepSI->getValueOperand(), Offset, L->getType(), L, *TD); } - + // Check to see if we have something like this: // load i32* P // load i8* (P+1) @@ -1774,14 +1831,14 @@ bool GVN::processLoad(LoadInst *L) { // we have the first instruction in the entry block. if (DepLI == L) return false; - + int Offset = AnalyzeLoadFromClobberingLoad(L->getType(), L->getPointerOperand(), DepLI, *TD); if (Offset != -1) AvailVal = GetLoadValueForLoad(DepLI, Offset, L->getType(), L, *this); } - + // If the clobbering value is a memset/memcpy/memmove, see if we can forward // a value on from it. if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(Dep.getInst())) { @@ -1791,11 +1848,11 @@ bool GVN::processLoad(LoadInst *L) { if (Offset != -1) AvailVal = GetMemInstValueForLoad(DepMI, Offset, L->getType(), L, *TD); } - + if (AvailVal) { DEBUG(dbgs() << "GVN COERCED INST:\n" << *Dep.getInst() << '\n' << *AvailVal << '\n' << *L << "\n\n\n"); - + // Replace the load! L->replaceAllUsesWith(AvailVal); if (AvailVal->getType()->isPointerTy()) @@ -1805,7 +1862,7 @@ bool GVN::processLoad(LoadInst *L) { return true; } } - + // If the value isn't available, don't do anything! if (Dep.isClobber()) { DEBUG( @@ -1835,7 +1892,7 @@ bool GVN::processLoad(LoadInst *L) { Instruction *DepInst = Dep.getInst(); if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) { Value *StoredVal = DepSI->getValueOperand(); - + // The store and load are to a must-aliased pointer, but they may not // actually have the same type. See if we know how to reuse the stored // value (depending on its type). @@ -1845,11 +1902,11 @@ bool GVN::processLoad(LoadInst *L) { L, *TD); if (StoredVal == 0) return false; - + DEBUG(dbgs() << "GVN COERCED STORE:\n" << *DepSI << '\n' << *StoredVal << '\n' << *L << "\n\n\n"); } - else + else return false; } @@ -1864,7 +1921,7 @@ bool GVN::processLoad(LoadInst *L) { if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInst)) { Value *AvailableVal = DepLI; - + // The loads are of a must-aliased pointer, but they may not actually have // the same type. See if we know how to reuse the previously loaded value // (depending on its type). @@ -1874,16 +1931,16 @@ bool GVN::processLoad(LoadInst *L) { L, *TD); if (AvailableVal == 0) return false; - + DEBUG(dbgs() << "GVN COERCED LOAD:\n" << *DepLI << "\n" << *AvailableVal << "\n" << *L << "\n\n\n"); } - else + else return false; } - + // Remove it! - L->replaceAllUsesWith(AvailableVal); + patchAndReplaceAllUsesWith(AvailableVal, L); if (DepLI->getType()->isPointerTy()) MD->invalidateCachedPointerInfo(DepLI); markInstructionForDeletion(L); @@ -1894,13 +1951,13 @@ bool GVN::processLoad(LoadInst *L) { // If this load really doesn't depend on anything, then we must be loading an // undef value. This can happen when loading for a fresh allocation with no // intervening stores, for example. - if (isa<AllocaInst>(DepInst) || isMalloc(DepInst)) { + if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst)) { L->replaceAllUsesWith(UndefValue::get(L->getType())); markInstructionForDeletion(L); ++NumGVNLoad; return true; } - + // If this load occurs either right after a lifetime begin, // then the loaded value is undefined. 
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(DepInst)) { @@ -1915,28 +1972,28 @@ bool GVN::processLoad(LoadInst *L) { return false; } -// findLeader - In order to find a leader for a given value number at a +// findLeader - In order to find a leader for a given value number at a // specific basic block, we first obtain the list of all Values for that number, -// and then scan the list to find one whose block dominates the block in +// and then scan the list to find one whose block dominates the block in // question. This is fast because dominator tree queries consist of only // a few comparisons of DFS numbers. Value *GVN::findLeader(BasicBlock *BB, uint32_t num) { LeaderTableEntry Vals = LeaderTable[num]; if (!Vals.Val) return 0; - + Value *Val = 0; if (DT->dominates(Vals.BB, BB)) { Val = Vals.Val; if (isa<Constant>(Val)) return Val; } - + LeaderTableEntry* Next = Vals.Next; while (Next) { if (DT->dominates(Next->BB, BB)) { if (isa<Constant>(Next->Val)) return Next->Val; if (!Val) Val = Next->Val; } - + Next = Next->Next; } @@ -2012,9 +2069,15 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, BasicBlock *Root) { DT->properlyDominates(cast<Instruction>(RHS)->getParent(), Root)) && "Instruction doesn't dominate scope!"); - // If value numbering later deduces that an instruction in the scope is equal - // to 'LHS' then ensure it will be turned into 'RHS'. - addToLeaderTable(LVN, RHS, Root); + // If value numbering later sees that an instruction in the scope is equal + // to 'LHS' then ensure it will be turned into 'RHS'. In order to preserve + // the invariant that instructions only occur in the leader table for their + // own value number (this is used by removeFromLeaderTable), do not do this + // if RHS is an instruction (if an instruction in the scope is morphed into + // LHS then it will be turned into RHS by the next GVN iteration anyway, so + // using the leader table is about compiling faster, not optimizing better). + if (!isa<Instruction>(RHS)) + addToLeaderTable(LVN, RHS, Root); // Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope. As // LHS always has at least one use that is not dominated by Root, this will @@ -2180,7 +2243,7 @@ bool GVN::processInstruction(Instruction *I) { // Instructions with void type don't return a value, so there's // no point in trying to find redundancies in them. if (I->getType()->isVoidTy()) return false; - + uint32_t NextNum = VN.getNextUnusedValueNumber(); unsigned Num = VN.lookup_or_add(I); @@ -2198,7 +2261,7 @@ bool GVN::processInstruction(Instruction *I) { addToLeaderTable(Num, I, I->getParent()); return false; } - + // Perform fast-path value-number based elimination of values inherited from // dominators. Value *repl = findLeader(I->getParent(), Num); @@ -2207,9 +2270,9 @@ bool GVN::processInstruction(Instruction *I) { addToLeaderTable(Num, I, I->getParent()); return false; } - + // Remove it! - I->replaceAllUsesWith(repl); + patchAndReplaceAllUsesWith(repl, I); if (MD && repl->getType()->isPointerTy()) MD->invalidateCachedPointerInfo(repl); markInstructionForDeletion(I); @@ -2234,7 +2297,7 @@ bool GVN::runOnFunction(Function& F) { // optimization opportunities. for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) { BasicBlock *BB = FI++; - + bool removedBlock = MergeBlockIntoPredecessor(BB, this); if (removedBlock) ++NumGVNBlocks; @@ -2391,7 +2454,7 @@ bool GVN::performPRE(Function &F) { // we would need to insert instructions in more than one pred. 
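For readers unfamiliar with the leader table, here is a minimal standalone sketch of the scan findLeader performs, using invented stand-in types rather than LLVM's: among the entries whose block dominates the query block, a constant always wins, otherwise the first dominating definition does.

struct Entry { int Block; bool IsConstant; int Value; Entry *Next; };

// Placeholder dominance test; the real pass asks the dominator tree, which
// answers with a few comparisons of DFS numbers.
static bool dominates(int A, int B) { return A <= B; }

static const Entry *findLeader(const Entry *Vals, int Block) {
  const Entry *Found = nullptr;
  for (const Entry *E = Vals; E; E = E->Next) {
    if (!dominates(E->Block, Block))
      continue;
    if (E->IsConstant)
      return E;        // a constant is always the preferred leader
    if (!Found)
      Found = E;       // otherwise keep the first dominating definition
  }
  return Found;
}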
if (NumWithout != 1 || NumWith == 0) continue; - + // Don't do PRE across indirect branch. if (isa<IndirectBrInst>(PREPred->getTerminator())) continue; @@ -2467,7 +2530,7 @@ bool GVN::performPRE(Function &F) { unsigned jj = PHINode::getOperandNumForIncomingValue(ii); VN.getAliasAnalysis()->addEscapingUse(Phi->getOperandUse(jj)); } - + if (MD) MD->invalidateCachedPointerInfo(Phi); } @@ -2504,7 +2567,7 @@ bool GVN::splitCriticalEdges() { /// iterateOnFunction - Executes one iteration of GVN bool GVN::iterateOnFunction(Function &F) { cleanupGlobalSets(); - + // Top-down walk of the dominator tree bool Changed = false; #if 0 @@ -2539,7 +2602,7 @@ void GVN::verifyRemoved(const Instruction *Inst) const { I = LeaderTable.begin(), E = LeaderTable.end(); I != E; ++I) { const LeaderTableEntry *Node = &I->second; assert(Node->Val != Inst && "Inst still in value numbering scope!"); - + while (Node->Next) { Node = Node->Next; assert(Node->Val != Inst && "Inst still in value numbering scope!"); diff --git a/lib/Transforms/Scalar/GlobalMerge.cpp b/lib/Transforms/Scalar/GlobalMerge.cpp index c2bd6e6..b36a3cb 100644 --- a/lib/Transforms/Scalar/GlobalMerge.cpp +++ b/lib/Transforms/Scalar/GlobalMerge.cpp @@ -12,7 +12,7 @@ // global). Such a transformation can significantly reduce the register pressure // when many globals are involved. // -// For example, consider the code which touches several global variables at +// For example, consider the code which touches several global variables at // once: // // static int foo[N], bar[N], baz[N]; @@ -208,8 +208,8 @@ bool GlobalMerge::doInitialization(Module &M) { if (BSSGlobals.size() > 1) Changed |= doMerge(BSSGlobals, M, false); - // FIXME: This currently breaks the EH processing due to way how the - // typeinfo detection works. We might want to detect the TIs and ignore + // FIXME: This currently breaks the EH processing due to way how the + // typeinfo detection works. We might want to detect the TIs and ignore // them in the future. // if (ConstGlobals.size() > 1) // Changed |= doMerge(ConstGlobals, M, true); diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index a9ba657..37f8bdf 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -1215,21 +1215,26 @@ static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) { return 0; } -/// needsLFTR - LinearFunctionTestReplace policy. Return true unless we can show -/// that the current exit test is already sufficiently canonical. -static bool needsLFTR(Loop *L, DominatorTree *DT) { +/// Return the compare guarding the loop latch, or NULL for unrecognized tests. +static ICmpInst *getLoopTest(Loop *L) { assert(L->getExitingBlock() && "expected loop exit"); BasicBlock *LatchBlock = L->getLoopLatch(); // Don't bother with LFTR if the loop is not properly simplified. if (!LatchBlock) - return false; + return 0; BranchInst *BI = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator()); assert(BI && "expected exit branch"); + return dyn_cast<ICmpInst>(BI->getCondition()); +} + +/// needsLFTR - LinearFunctionTestReplace policy. Return true unless we can show +/// that the current exit test is already sufficiently canonical. +static bool needsLFTR(Loop *L, DominatorTree *DT) { // Do LFTR to simplify the exit condition to an ICMP. 
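A hypothetical source-level picture of what LFTR is after, not code from this patch: replace a derived exit test with a canonical compare against the trip count. The equivalence below assumes N < UINT_MAX so neither test wraps.

unsigned sumBefore(const unsigned *A, unsigned N) {
  unsigned S = 0;
  for (unsigned I = 0; I + 1 <= N; ++I)  // exit test on a derived expression
    S += A[I];
  return S;
}

unsigned sumAfter(const unsigned *A, unsigned N) {
  unsigned S = 0;
  for (unsigned I = 0; I != N; ++I)      // canonical ICMP_NE exit test
    S += A[I];
  return S;
}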
-  ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
+  ICmpInst *Cond = getLoopTest(L);
   if (!Cond)
     return true;
@@ -1259,6 +1264,48 @@ static bool needsLFTR(Loop *L, DominatorTree *DT) {
   return Phi != getLoopPhiForCounter(IncV, L, DT);
 }

+/// Recursive helper for hasConcreteDef(). Unfortunately, this currently boils
+/// down to checking that all operands are constant and listing instructions
+/// that may hide undef.
+static bool hasConcreteDefImpl(Value *V, SmallPtrSet<Value*, 8> &Visited,
+                               unsigned Depth) {
+  if (isa<Constant>(V))
+    return !isa<UndefValue>(V);
+
+  if (Depth >= 6)
+    return false;
+
+  // Conservatively handle non-constant non-instructions. For example, Arguments
+  // may be undef.
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I)
+    return false;
+
+  // Load and return values may be undef.
+  if (I->mayReadFromMemory() || isa<CallInst>(I) || isa<InvokeInst>(I))
+    return false;
+
+  // Optimistically handle other instructions.
+  for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) {
+    if (!Visited.insert(*OI))
+      continue;
+    if (!hasConcreteDefImpl(*OI, Visited, Depth+1))
+      return false;
+  }
+  return true;
+}
+
+/// Return true if the given value is concrete. We must prove that undef can
+/// never reach it.
+///
+/// TODO: If we decide that this is a good approach to checking for undef, we
+/// may factor it into a common location.
+static bool hasConcreteDef(Value *V) {
+  SmallPtrSet<Value*, 8> Visited;
+  Visited.insert(V);
+  return hasConcreteDefImpl(V, Visited, 0);
+}
+
 /// AlmostDeadIV - Return true if this IV has any uses other than the (soon to
 /// be rewritten) loop exit test.
 static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) {
@@ -1283,6 +1330,8 @@ static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) {
 /// valid count without scaling the address stride, so it remains a pointer
 /// expression as far as SCEV is concerned.
 ///
+/// Currently only valid for LFTR. See the comments on hasConcreteDef below.
+///
 /// FIXME: Accept -1 stride and set IVLimit = IVInit - BECount
 ///
 /// FIXME: Accept non-unit stride as long as SCEV can reduce BECount * Stride.
@@ -1331,6 +1380,19 @@ FindLoopCounter(Loop *L, const SCEV *BECount,
     if (getLoopPhiForCounter(IncV, L, DT) != Phi)
       continue;

+    // Avoid reusing a potentially undef value to compute other values that may
+    // have originally had a concrete definition.
+    if (!hasConcreteDef(Phi)) {
+      // We explicitly allow unknown phis as long as they are already used by
+      // the loop test. In this case we assume that performing LFTR could not
+      // increase the number of undef users.
+      if (ICmpInst *Cond = getLoopTest(L)) {
+        if (Phi != getLoopPhiForCounter(Cond->getOperand(0), L, DT)
+            && Phi != getLoopPhiForCounter(Cond->getOperand(1), L, DT)) {
+          continue;
+        }
+      }
+    }
     const SCEV *Init = AR->getStart();

     if (BestPhi && !AlmostDeadIV(BestPhi, LatchBlock, Cond)) {
@@ -1347,7 +1409,7 @@ FindLoopCounter(Loop *L, const SCEV *BECount,
       // If two IVs both count from zero or both count from nonzero then the
       // narrower is likely a dead phi that has been widened. Use the wider phi
       // to allow the other to be eliminated.
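A self-contained analog of the hasConcreteDef walk above, over an invented Node type instead of llvm::Value, for readers who want to experiment with the rule: a value is provably undef-free when every operand path bottoms out in non-undef constants, giving up at depth six and at anything that may read memory.

#include <unordered_set>
#include <vector>

struct Node {
  bool IsConstant, IsUndef, MayReadMemory;
  std::vector<Node *> Ops;
};

static bool provablyConcrete(Node *V, std::unordered_set<Node *> &Visited,
                             unsigned Depth) {
  if (V->IsConstant)
    return !V->IsUndef;
  if (Depth >= 6 || V->MayReadMemory)  // bound the walk; loads may hide undef
    return false;
  if (V->Ops.empty())
    return false;                      // unknown leaf, e.g. a function argument
  for (Node *Op : V->Ops) {
    if (!Visited.insert(Op).second)
      continue;                        // operand already visited, skip it
    if (!provablyConcrete(Op, Visited, Depth + 1))
      return false;
  }
  return true;
}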
- if (PhiWidth <= SE->getTypeSizeInBits(BestPhi->getType())) + else if (PhiWidth <= SE->getTypeSizeInBits(BestPhi->getType())) continue; } BestPhi = Phi; diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index 429b61b..dd42c59 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -670,6 +670,8 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Terminator)) { Condition = SI->getCondition(); } else if (IndirectBrInst *IB = dyn_cast<IndirectBrInst>(Terminator)) { + // Can't thread indirect branch with no successors. + if (IB->getNumSuccessors() == 0) return false; Condition = IB->getAddress()->stripPointerCasts(); Preference = WantBlockAddress; } else { @@ -859,7 +861,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // If all of the loads and stores that feed the value have the same TBAA tag, // then we can propagate it onto any newly inserted loads. - MDNode *TBAATag = LI->getMetadata(LLVMContext::MD_tbaa); + MDNode *TBAATag = LI->getMetadata(LLVMContext::MD_tbaa); SmallPtrSet<BasicBlock*, 8> PredsScanned; typedef SmallVector<std::pair<BasicBlock*, Value*>, 8> AvailablePredsTy; @@ -885,7 +887,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { OneUnavailablePred = PredBB; continue; } - + // If tbaa tags disagree or are not present, forget about them. if (TBAATag != ThisTBAATag) TBAATag = 0; @@ -949,7 +951,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { NewVal->setDebugLoc(LI->getDebugLoc()); if (TBAATag) NewVal->setMetadata(LLVMContext::MD_tbaa, TBAATag); - + AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal)); } diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 8795cd8..582948e 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -618,6 +618,11 @@ bool LICM::isGuaranteedToExecute(Instruction &Inst) { if (!DT->dominates(Inst.getParent(), ExitBlocks[i])) return false; + // As a degenerate case, if the loop is statically infinite then we haven't + // proven anything since there are no exit blocks. + if (ExitBlocks.empty()) + return false; + return true; } diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp index f7f3298..3771f5a 100644 --- a/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/lib/Transforms/Scalar/LoopDeletion.cpp @@ -32,10 +32,10 @@ namespace { LoopDeletion() : LoopPass(ID) { initializeLoopDeletionPass(*PassRegistry::getPassRegistry()); } - + // Possibly eliminate loop L if it is dead. bool runOnLoop(Loop* L, LPPassManager& LPM); - + bool IsLoopDead(Loop* L, SmallVector<BasicBlock*, 4>& exitingBlocks, SmallVector<BasicBlock*, 4>& exitBlocks, bool &Changed, BasicBlock *Preheader); @@ -46,7 +46,7 @@ namespace { AU.addRequired<ScalarEvolution>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); - + AU.addPreserved<ScalarEvolution>(); AU.addPreserved<DominatorTree>(); AU.addPreserved<LoopInfo>(); @@ -55,7 +55,7 @@ namespace { } }; } - + char LoopDeletion::ID = 0; INITIALIZE_PASS_BEGIN(LoopDeletion, "loop-deletion", "Delete dead loops", false, false) @@ -79,7 +79,7 @@ bool LoopDeletion::IsLoopDead(Loop* L, SmallVector<BasicBlock*, 4>& exitBlocks, bool &Changed, BasicBlock *Preheader) { BasicBlock* exitBlock = exitBlocks[0]; - + // Make sure that all PHI entries coming from the loop are loop invariant. 
// Because the code is in LCSSA form, any values used outside of the loop // must pass through a PHI in the exit block, meaning that this check is @@ -97,14 +97,14 @@ bool LoopDeletion::IsLoopDead(Loop* L, if (incoming != P->getIncomingValueForBlock(exitingBlocks[i])) return false; } - + if (Instruction* I = dyn_cast<Instruction>(incoming)) if (!L->makeLoopInvariant(I, Changed, Preheader->getTerminator())) return false; ++BI; } - + // Make sure that no instructions in the block have potential side-effects. // This includes instructions that could write to memory, and loads that are // marked volatile. This could be made more aggressive by using aliasing @@ -117,23 +117,23 @@ bool LoopDeletion::IsLoopDead(Loop* L, return false; } } - + return true; } /// runOnLoop - Remove dead loops, by which we mean loops that do not impact the -/// observable behavior of the program other than finite running time. Note +/// observable behavior of the program other than finite running time. Note /// we do ensure that this never remove a loop that might be infinite, as doing /// so could change the halting/non-halting nature of a program. /// NOTE: This entire process relies pretty heavily on LoopSimplify and LCSSA /// in order to make various safety checks work. bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) { - // We can only remove the loop if there is a preheader that we can + // We can only remove the loop if there is a preheader that we can // branch from after removing it. BasicBlock* preheader = L->getLoopPreheader(); if (!preheader) return false; - + // If LoopSimplify form is not available, stay out of trouble. if (!L->hasDedicatedExits()) return false; @@ -142,36 +142,36 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) { // they would already have been removed in earlier executions of this pass. if (L->begin() != L->end()) return false; - + SmallVector<BasicBlock*, 4> exitingBlocks; L->getExitingBlocks(exitingBlocks); - + SmallVector<BasicBlock*, 4> exitBlocks; L->getUniqueExitBlocks(exitBlocks); - + // We require that the loop only have a single exit block. Otherwise, we'd // be in the situation of needing to be able to solve statically which exit // block will be branched to, or trying to preserve the branching logic in // a loop invariant manner. if (exitBlocks.size() != 1) return false; - + // Finally, we have to check that the loop really is dead. bool Changed = false; if (!IsLoopDead(L, exitingBlocks, exitBlocks, Changed, preheader)) return Changed; - + // Don't remove loops for which we can't solve the trip count. // They could be infinite, in which case we'd be changing program behavior. ScalarEvolution& SE = getAnalysis<ScalarEvolution>(); const SCEV *S = SE.getMaxBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(S)) return Changed; - + // Now that we know the removal is safe, remove the loop by changing the - // branch from the preheader to go to the single exit block. + // branch from the preheader to go to the single exit block. BasicBlock* exitBlock = exitBlocks[0]; - + // Because we're deleting a large chunk of code at once, the sequence in which // we remove things is very important to avoid invalidation issues. Don't // mess with this unless you have good reason and know what you're doing. @@ -197,7 +197,7 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) { P->removeIncomingValue(exitingBlocks[i]); ++BI; } - + // Update the dominator tree and remove the instructions and blocks that will // be deleted from the reference counting scheme. 
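Two hypothetical source-level loops, not from this patch, that mark out the boundary these checks draw: the first is deletable, the second is not.

void deletable(int N) {
  int T = 0;
  for (int I = 0; I < N; ++I)   // single exit, trip count computable by SCEV,
    T += I;                     // and T is dead afterwards: the loop can go
}

void notDeletable(volatile int *Flag) {
  while (*Flag) {               // the volatile load is a side effect and the
  }                             // trip count is unknowable: deleting this could
}                               // change the halting behavior of the program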
DominatorTree& DT = getAnalysis<DominatorTree>(); @@ -211,7 +211,7 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) { DE = ChildNodes.end(); DI != DE; ++DI) { DT.changeImmediateDominator(*DI, DT[preheader]); } - + ChildNodes.clear(); DT.eraseNode(*LI); @@ -219,7 +219,7 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) { // delete it freely later. (*LI)->dropAllReferences(); } - + // Erase the instructions and the blocks without having to worry // about ordering because we already dropped the references. // NOTE: This iteration is safe because erasing the block does not remove its @@ -236,13 +236,13 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) { for (SmallPtrSet<BasicBlock*,8>::iterator I = blocks.begin(), E = blocks.end(); I != E; ++I) loopInfo.removeBlock(*I); - + // The last step is to inform the loop pass manager that we've // eliminated this loop. LPM.deleteLoopFromQueue(L); Changed = true; - + ++NumDeleted; - + return Changed; } diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index ad15cbb..ac1082c 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -43,20 +43,20 @@ #define DEBUG_TYPE "loop-idiom" #include "llvm/Transforms/Scalar.h" +#include "llvm/IRBuilder.h" #include "llvm/IntrinsicInst.h" #include "llvm/Module.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/IRBuilder.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/Statistic.h" using namespace llvm; STATISTIC(NumMemSet, "Number of memset's formed from loop stores"); @@ -173,7 +173,7 @@ static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE) { bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { CurLoop = L; - // Disable loop idiom recognition if the function's name is a common idiom. + // Disable loop idiom recognition if the function's name is a common idiom. StringRef Name = L->getHeader()->getParent()->getName(); if (Name == "memset" || Name == "memcpy") return false; diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp index f0f05e6..982400c 100644 --- a/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -48,7 +48,7 @@ namespace { } }; } - + char LoopInstSimplify::ID = 0; INITIALIZE_PASS_BEGIN(LoopInstSimplify, "loop-instsimplify", "Simplify instructions in loops", false, false) diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index 59aace9..7eeb152 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -418,12 +418,13 @@ bool LoopRotate::rotateLoop(Loop *L) { } // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and - // thus is not a preheader anymore. Split the edge to form a real preheader. + // thus is not a preheader anymore. + // Split the edge to form a real preheader. 
BasicBlock *NewPH = SplitCriticalEdge(OrigPreheader, NewHeader, this); NewPH->setName(NewHeader->getName() + ".lr.ph"); - // Preserve canonical loop form, which means that 'Exit' should have only one - // predecessor. + // Preserve canonical loop form, which means that 'Exit' should have only + // one predecessor. BasicBlock *ExitSplit = SplitCriticalEdge(L->getLoopLatch(), Exit, this); ExitSplit->moveBefore(Exit); } else { diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index fe4700b..b14a713 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -1308,8 +1308,8 @@ static bool isLegalUse(const TargetLowering::AddrMode &AM, return !AM.BaseGV && AM.Scale == 0 && AM.BaseOffs == 0; case LSRUse::Special: - // Only handle -1 scales, or no scale. - return AM.Scale == 0 || AM.Scale == -1; + // Special case Basic to handle -1 scales. + return !AM.BaseGV && (AM.Scale == 0 || AM.Scale == -1) && AM.BaseOffs == 0; } llvm_unreachable("Invalid LSRUse Kind!"); @@ -1439,7 +1439,41 @@ struct IVInc { // IVChain - The list of IV increments in program order. // We typically add the head of a chain without finding subsequent links. -typedef SmallVector<IVInc,1> IVChain; +struct IVChain { + SmallVector<IVInc,1> Incs; + const SCEV *ExprBase; + + IVChain() : ExprBase(0) {} + + IVChain(const IVInc &Head, const SCEV *Base) + : Incs(1, Head), ExprBase(Base) {} + + typedef SmallVectorImpl<IVInc>::const_iterator const_iterator; + + // begin - return the first increment in the chain. + const_iterator begin() const { + assert(!Incs.empty()); + return llvm::next(Incs.begin()); + } + const_iterator end() const { + return Incs.end(); + } + + // hasIncs - Returns true if this chain contains any increments. + bool hasIncs() const { return Incs.size() >= 2; } + + // add - Add an IVInc to the end of this chain. + void add(const IVInc &X) { Incs.push_back(X); } + + // tailUserInst - Returns the last UserInst in the chain. + Instruction *tailUserInst() const { return Incs.back().UserInst; } + + // isProfitableIncrement - Returns true if IncExpr can be profitably added to + // this chain. + bool isProfitableIncrement(const SCEV *OperExpr, + const SCEV *IncExpr, + ScalarEvolution&); +}; /// ChainUsers - Helper for CollectChains to track multiple IV increment uses. /// Distinguish between FarUsers that definitely cross IV increments and @@ -2160,7 +2194,7 @@ LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF, return &LU; // This is the formula where all the registers and symbols matched; // there aren't going to be any others. Since we declined it, we - // can skip the rest of the formulae and procede to the next LSRUse. + // can skip the rest of the formulae and proceed to the next LSRUse. break; } } @@ -2319,41 +2353,23 @@ static const SCEV *getExprBase(const SCEV *S) { /// increment will be an offset relative to the same base. We allow such offsets /// to potentially be used as chain increment as long as it's not obviously /// expensive to expand using real instructions. -static const SCEV * -getProfitableChainIncrement(Value *NextIV, Value *PrevIV, - const IVChain &Chain, Loop *L, - ScalarEvolution &SE, const TargetLowering *TLI) { - // Prune the solution space aggressively by checking that both IV operands - // are expressions that operate on the same unscaled SCEVUnknown. This - // "base" will be canceled by the subsequent getMinusSCEV call. Checking first - // avoids creating extra SCEV expressions. 
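A rough source-level picture of the IV chains this code is being reorganized around: all three addresses below are reachable from one another by the same loop-invariant increment, so LSR can keep one pointer live and bump it rather than materialize three addressing expressions per iteration. Illustrative only:

void scale(float *A, int N) {
  for (int I = 0; I + 2 < N; I += 3) {
    A[I]     *= 2.0f;   // chain head: the base address
    A[I + 1] *= 2.0f;   // head plus a constant 4-byte increment
    A[I + 2] *= 2.0f;   // previous link plus the same increment
  }
}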
-  const SCEV *OperExpr = SE.getSCEV(NextIV);
-  const SCEV *PrevExpr = SE.getSCEV(PrevIV);
-  if (getExprBase(OperExpr) != getExprBase(PrevExpr) && !StressIVChain)
-    return 0;
-
-  const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
-  if (!SE.isLoopInvariant(IncExpr, L))
-    return 0;
-
-  // We are not able to expand an increment unless it is loop invariant,
-  // however, the following checks are purely for profitability.
+bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
+                                    const SCEV *IncExpr,
+                                    ScalarEvolution &SE) {
+  // Aggressively form chains when -stress-ivchain.
   if (StressIVChain)
-    return IncExpr;
+    return true;

   // Do not replace a constant offset from IV head with a nonconstant IV
   // increment.
   if (!isa<SCEVConstant>(IncExpr)) {
-    const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Chain[0].IVOperand));
+    const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
     if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
       return 0;
   }

   SmallPtrSet<const SCEV*, 8> Processed;
-  if (isHighCostExpansion(IncExpr, Processed, SE))
-    return 0;
-
-  return IncExpr;
+  return !isHighCostExpansion(IncExpr, Processed, SE);
 }

 /// Return true if the number of registers needed for the chain is estimated to
@@ -2372,18 +2388,18 @@ isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users,
   if (StressIVChain)
     return true;

-  if (Chain.size() <= 2)
+  if (!Chain.hasIncs())
     return false;

   if (!Users.empty()) {
-    DEBUG(dbgs() << "Chain: " << *Chain[0].UserInst << " users:\n";
+    DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
           for (SmallPtrSet<Instruction*, 4>::const_iterator I = Users.begin(),
                  E = Users.end(); I != E; ++I) {
             dbgs() << " " << **I << "\n";
           });
     return false;
   }
-  assert(!Chain.empty() && "empty IV chains are not allowed");
+  assert(!Chain.Incs.empty() && "empty IV chains are not allowed");

   // The chain itself may require a register, so initialize cost to 1.
   int cost = 1;
@@ -2391,15 +2407,15 @@ isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users,
   // A complete chain likely eliminates the need for keeping the original IV in
   // a register. LSR does not currently know how to form a complete chain unless
   // the header phi already exists.
-  if (isa<PHINode>(Chain.back().UserInst)
-      && SE.getSCEV(Chain.back().UserInst) == Chain[0].IncExpr) {
+  if (isa<PHINode>(Chain.tailUserInst())
+      && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
     --cost;
   }
   const SCEV *LastIncExpr = 0;
   unsigned NumConstIncrements = 0;
   unsigned NumVarIncrements = 0;
   unsigned NumReusedIncrements = 0;
-  for (IVChain::const_iterator I = llvm::next(Chain.begin()), E = Chain.end();
+  for (IVChain::const_iterator I = Chain.begin(), E = Chain.end();
        I != E; ++I) {

     if (I->IncExpr->isZero())
@@ -2435,7 +2451,8 @@ isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users,
   // the stride.
   cost -= NumReusedIncrements;

-  DEBUG(dbgs() << "Chain: " << *Chain[0].UserInst << " Cost: " << cost << "\n");
+  DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
+               << "\n");

   return cost < 0;
 }
@@ -2446,25 +2463,39 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
                                    SmallVectorImpl<ChainUsers> &ChainUsersVec) {
   // When IVs are used as types of varying widths, they are generally converted
   // to a wider type with some uses remaining narrow under a (free) trunc.
- Value *NextIV = getWideOperand(IVOper); + Value *const NextIV = getWideOperand(IVOper); + const SCEV *const OperExpr = SE.getSCEV(NextIV); + const SCEV *const OperExprBase = getExprBase(OperExpr); // Visit all existing chains. Check if its IVOper can be computed as a // profitable loop invariant increment from the last link in the Chain. unsigned ChainIdx = 0, NChains = IVChainVec.size(); const SCEV *LastIncExpr = 0; for (; ChainIdx < NChains; ++ChainIdx) { - Value *PrevIV = getWideOperand(IVChainVec[ChainIdx].back().IVOperand); + IVChain &Chain = IVChainVec[ChainIdx]; + + // Prune the solution space aggressively by checking that both IV operands + // are expressions that operate on the same unscaled SCEVUnknown. This + // "base" will be canceled by the subsequent getMinusSCEV call. Checking + // first avoids creating extra SCEV expressions. + if (!StressIVChain && Chain.ExprBase != OperExprBase) + continue; + + Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand); if (!isCompatibleIVType(PrevIV, NextIV)) continue; // A phi node terminates a chain. - if (isa<PHINode>(UserInst) - && isa<PHINode>(IVChainVec[ChainIdx].back().UserInst)) + if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst())) + continue; + + // The increment must be loop-invariant so it can be kept in a register. + const SCEV *PrevExpr = SE.getSCEV(PrevIV); + const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr); + if (!SE.isLoopInvariant(IncExpr, L)) continue; - if (const SCEV *IncExpr = - getProfitableChainIncrement(NextIV, PrevIV, IVChainVec[ChainIdx], - L, SE, TLI)) { + if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) { LastIncExpr = IncExpr; break; } @@ -2478,24 +2509,24 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, DEBUG(dbgs() << "IV Chain Limit\n"); return; } - LastIncExpr = SE.getSCEV(NextIV); + LastIncExpr = OperExpr; // IVUsers may have skipped over sign/zero extensions. We don't currently // attempt to form chains involving extensions unless they can be hoisted // into this loop's AddRec. if (!isa<SCEVAddRecExpr>(LastIncExpr)) return; ++NChains; - IVChainVec.resize(NChains); + IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr), + OperExprBase)); ChainUsersVec.resize(NChains); - DEBUG(dbgs() << "IV Head: (" << *UserInst << ") IV=" << *LastIncExpr - << "\n"); + DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst + << ") IV=" << *LastIncExpr << "\n"); + } else { + DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst + << ") IV+" << *LastIncExpr << "\n"); + // Add this IV user to the end of the chain. + IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr)); } - else - DEBUG(dbgs() << "IV Inc: (" << *UserInst << ") IV+" << *LastIncExpr - << "\n"); - - // Add this IV user to the end of the chain. - IVChainVec[ChainIdx].push_back(IVInc(UserInst, IVOper, LastIncExpr)); SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers; // This chain's NearUsers become FarUsers. @@ -2551,6 +2582,7 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, /// loop latch. This will discover chains on side paths, but requires /// maintaining multiple copies of the Chains state. 
void LSRInstance::CollectChains() { + DEBUG(dbgs() << "Collecting IV Chains.\n"); SmallVector<ChainUsers, 8> ChainUsersVec; SmallVector<BasicBlock *,8> LatchPath; @@ -2622,10 +2654,10 @@ void LSRInstance::CollectChains() { } void LSRInstance::FinalizeChain(IVChain &Chain) { - assert(!Chain.empty() && "empty IV chains are not allowed"); - DEBUG(dbgs() << "Final Chain: " << *Chain[0].UserInst << "\n"); + assert(!Chain.Incs.empty() && "empty IV chains are not allowed"); + DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n"); - for (IVChain::const_iterator I = llvm::next(Chain.begin()), E = Chain.end(); + for (IVChain::const_iterator I = Chain.begin(), E = Chain.end(); I != E; ++I) { DEBUG(dbgs() << " Inc: " << *I->UserInst << "\n"); User::op_iterator UseI = @@ -2659,7 +2691,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, SmallVectorImpl<WeakVH> &DeadInsts) { // Find the new IVOperand for the head of the chain. It may have been replaced // by LSR. - const IVInc &Head = Chain[0]; + const IVInc &Head = Chain.Incs[0]; User::op_iterator IVOpEnd = Head.UserInst->op_end(); User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(), IVOpEnd, L, SE); @@ -2691,7 +2723,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, Type *IVTy = IVSrc->getType(); Type *IntTy = SE.getEffectiveSCEVType(IVTy); const SCEV *LeftOverExpr = 0; - for (IVChain::const_iterator IncI = llvm::next(Chain.begin()), + for (IVChain::const_iterator IncI = Chain.begin(), IncE = Chain.end(); IncI != IncE; ++IncI) { Instruction *InsertPt = IncI->UserInst; @@ -2736,7 +2768,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, } // If LSR created a new, wider phi, we may also replace its postinc. We only // do this if we also found a wide value for the head of the chain. - if (isa<PHINode>(Chain.back().UserInst)) { + if (isa<PHINode>(Chain.tailUserInst())) { for (BasicBlock::iterator I = L->getHeader()->begin(); PHINode *Phi = dyn_cast<PHINode>(I); ++I) { if (!isCompatibleIVType(Phi, IVSrc)) @@ -2804,7 +2836,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { // x == y --> x - y == 0 const SCEV *N = SE.getSCEV(NV); - if (SE.isLoopInvariant(N, L)) { + if (SE.isLoopInvariant(N, L) && isSafeToExpand(N)) { // S is normalized, so normalize N before folding it into S // to keep the result normalized. N = TransformForPostIncUse(Normalize, N, CI, 0, @@ -2974,42 +3006,64 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { /// CollectSubexprs - Split S into subexpressions which can be pulled out into /// separate registers. If C is non-null, multiply each subexpression by C. -static void CollectSubexprs(const SCEV *S, const SCEVConstant *C, - SmallVectorImpl<const SCEV *> &Ops, - const Loop *L, - ScalarEvolution &SE) { +/// +/// Return remainder expression after factoring the subexpressions captured by +/// Ops. If Ops is complete, return NULL. +static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C, + SmallVectorImpl<const SCEV *> &Ops, + const Loop *L, + ScalarEvolution &SE, + unsigned Depth = 0) { + // Arbitrarily cap recursion to protect compile time. + if (Depth >= 3) + return S; + if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { // Break out add operands. 
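A toy analog of the new CollectSubexprs contract, over an invented Expr type rather than SCEV: push the subexpressions that can stand alone into Ops, and hand back whatever could not be split off, returning nullptr when the input was fully decomposed.

#include <vector>

struct Expr { bool IsAdd; std::vector<Expr *> Ops; };

static Expr *collectSubexprs(Expr *S, std::vector<Expr *> &Out,
                             unsigned Depth = 0) {
  if (Depth >= 3)
    return S;               // cap recursion to protect compile time
  if (!S->IsAdd)
    return S;               // not an add: hand it back as the remainder
  for (Expr *Op : S->Ops)
    if (Expr *Rem = collectSubexprs(Op, Out, Depth + 1))
      Out.push_back(Rem);   // each remainder becomes its own register
  return nullptr;           // the add itself was fully consumed
}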
for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); - I != E; ++I) - CollectSubexprs(*I, C, Ops, L, SE); - return; + I != E; ++I) { + const SCEV *Remainder = CollectSubexprs(*I, C, Ops, L, SE, Depth+1); + if (Remainder) + Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder); + } + return NULL; } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { // Split a non-zero base out of an addrec. - if (!AR->getStart()->isZero()) { - CollectSubexprs(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0), - AR->getStepRecurrence(SE), - AR->getLoop(), - //FIXME: AR->getNoWrapFlags(SCEV::FlagNW) - SCEV::FlagAnyWrap), - C, Ops, L, SE); - CollectSubexprs(AR->getStart(), C, Ops, L, SE); - return; + if (AR->getStart()->isZero()) + return S; + + const SCEV *Remainder = CollectSubexprs(AR->getStart(), + C, Ops, L, SE, Depth+1); + // Split the non-zero AddRec unless it is part of a nested recurrence that + // does not pertain to this loop. + if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) { + Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder); + Remainder = NULL; + } + if (Remainder != AR->getStart()) { + if (!Remainder) + Remainder = SE.getConstant(AR->getType(), 0); + return SE.getAddRecExpr(Remainder, + AR->getStepRecurrence(SE), + AR->getLoop(), + //FIXME: AR->getNoWrapFlags(SCEV::FlagNW) + SCEV::FlagAnyWrap); } } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) { // Break (C * (a + b + c)) into C*a + C*b + C*c. - if (Mul->getNumOperands() == 2) - if (const SCEVConstant *Op0 = - dyn_cast<SCEVConstant>(Mul->getOperand(0))) { - CollectSubexprs(Mul->getOperand(1), - C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0, - Ops, L, SE); - return; - } + if (Mul->getNumOperands() != 2) + return S; + if (const SCEVConstant *Op0 = + dyn_cast<SCEVConstant>(Mul->getOperand(0))) { + C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0; + const SCEV *Remainder = + CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1); + if (Remainder) + Ops.push_back(SE.getMulExpr(C, Remainder)); + return NULL; + } } - - // Otherwise use the value itself, optionally with a scale applied. - Ops.push_back(C ? SE.getMulExpr(C, S) : S); + return S; } /// GenerateReassociations - Split out subexpressions from adds and the bases of @@ -3024,7 +3078,9 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, const SCEV *BaseReg = Base.BaseRegs[i]; SmallVector<const SCEV *, 8> AddOps; - CollectSubexprs(BaseReg, 0, AddOps, L, SE); + const SCEV *Remainder = CollectSubexprs(BaseReg, 0, AddOps, L, SE); + if (Remainder) + AddOps.push_back(Remainder); if (AddOps.size() == 1) continue; @@ -4108,7 +4164,7 @@ LSRInstance::HoistInsertPosition(BasicBlock::iterator IP, // Attempt to find an insert position in the middle of the block, // instead of at the end, so that it can be used for other expansions. if (IDom == Inst->getParent() && - (!BetterPos || DT.dominates(BetterPos, Inst))) + (!BetterPos || !DT.dominates(Inst, BetterPos))) BetterPos = llvm::next(BasicBlock::iterator(Inst)); } if (!AllDominate) @@ -4236,13 +4292,6 @@ Value *LSRInstance::Expand(const LSRFixup &LF, Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP))); } - // Flush the operand list to suppress SCEVExpander hoisting. - if (!Ops.empty()) { - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); - Ops.clear(); - Ops.push_back(SE.getUnknown(FullV)); - } - // Expand the ScaledReg portion. 
Value *ICmpScaledV = 0; if (F.AM.Scale != 0) { @@ -4264,23 +4313,34 @@ Value *LSRInstance::Expand(const LSRFixup &LF, } else { // Otherwise just expand the scaled register and an explicit scale, // which is expected to be matched as part of the address. + + // Flush the operand list to suppress SCEVExpander hoisting address modes. + if (!Ops.empty() && LU.Kind == LSRUse::Address) { + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); + Ops.clear(); + Ops.push_back(SE.getUnknown(FullV)); + } ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP)); ScaledS = SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.AM.Scale)); Ops.push_back(ScaledS); - - // Flush the operand list to suppress SCEVExpander hoisting. - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); - Ops.clear(); - Ops.push_back(SE.getUnknown(FullV)); } } // Expand the GV portion. if (F.AM.BaseGV) { + // Flush the operand list to suppress SCEVExpander hoisting. + if (!Ops.empty()) { + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); + Ops.clear(); + Ops.push_back(SE.getUnknown(FullV)); + } Ops.push_back(SE.getUnknown(F.AM.BaseGV)); + } - // Flush the operand list to suppress SCEVExpander hoisting. + // Flush the operand list to suppress SCEVExpander hoisting of both folded and + // unfolded offsets. LSR assumes they both live next to their uses. + if (!Ops.empty()) { Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); @@ -4485,7 +4545,7 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, // Mark phi nodes that terminate chains so the expander tries to reuse them. for (SmallVectorImpl<IVChain>::const_iterator ChainI = IVChainVec.begin(), ChainE = IVChainVec.end(); ChainI != ChainE; ++ChainI) { - if (PHINode *PN = dyn_cast<PHINode>(ChainI->back().UserInst)) + if (PHINode *PN = dyn_cast<PHINode>(ChainI->tailUserInst())) Rewriter.setChainedPhi(PN); } diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index 00ecc74..58f7739 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -409,15 +409,6 @@ bool LoopUnswitch::processCurrentLoop() { if (!currentLoop->isSafeToClone()) return false; - // Loops with invokes, whose unwind edge escapes the loop, cannot be - // unswitched because splitting their edges are non-trivial and don't preserve - // loop simplify information. - for (Loop::block_iterator I = currentLoop->block_begin(), - E = currentLoop->block_end(); I != E; ++I) - if (const InvokeInst *II = dyn_cast<InvokeInst>((*I)->getTerminator())) - if (!currentLoop->contains(II->getUnwindDest())) - return false; - // Without dedicated exits, splitting the exit edge may fail. if (!currentLoop->hasDedicatedExits()) return false; @@ -633,11 +624,10 @@ bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val, /// LoopCond == Val to simplify the loop. If we decide that this is profitable, /// unswitch the loop, reprocess the pieces, then return true. bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val) { - Function *F = loopHeader->getParent(); - Constant *CondVal = 0; BasicBlock *ExitBlock = 0; + if (IsTrivialUnswitchCondition(LoopCond, &CondVal, &ExitBlock)) { // If the condition is trivial, always unswitch. There is no code growth // for this case. 
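A hypothetical source-level shape of a trivially unswitchable condition, illustrative rather than taken from this patch: the test is loop invariant and one arm leaves the loop at once, so hoisting it duplicates no loop body.

int sum(const int *A, int N, bool Abort) {
  int S = 0;
  for (int I = 0; I < N; ++I) {
    if (Abort)      // invariant: depends on neither I nor loop memory
      return -1;    // exits the loop immediately, so unswitching grows no code
    S += A[I];
  }
  return S;
}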
@@ -697,8 +687,8 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, // If either edge is critical, split it. This helps preserve LoopSimplify // form for enclosing loops. - SplitCriticalEdge(BI, 0, this); - SplitCriticalEdge(BI, 1, this); + SplitCriticalEdge(BI, 0, this, false, false, true); + SplitCriticalEdge(BI, 1, this, false, false, true); } /// UnswitchTrivialCondition - Given a loop that has a trivial unswitchable @@ -1224,8 +1214,8 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { // See if instruction simplification can hack this up. This is common for // things like "select false, X, Y" after unswitching made the condition be - // 'false'. - if (Value *V = SimplifyInstruction(I, 0, 0, DT)) + // 'false'. TODO: update the domtree properly so we can pass it here. + if (Value *V = SimplifyInstruction(I)) if (LI->replacementPreservesLCSSAForm(I, V)) { ReplaceUsesOfWith(I, V, Worklist, L, LPM); continue; diff --git a/lib/Transforms/Scalar/LowerAtomic.cpp b/lib/Transforms/Scalar/LowerAtomic.cpp index 689bbe9..7419a65 100644 --- a/lib/Transforms/Scalar/LowerAtomic.cpp +++ b/lib/Transforms/Scalar/LowerAtomic.cpp @@ -15,9 +15,9 @@ #define DEBUG_TYPE "loweratomic" #include "llvm/Transforms/Scalar.h" #include "llvm/Function.h" +#include "llvm/IRBuilder.h" #include "llvm/IntrinsicInst.h" #include "llvm/Pass.h" -#include "llvm/Support/IRBuilder.h" using namespace llvm; static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { @@ -25,12 +25,12 @@ static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { Value *Ptr = CXI->getPointerOperand(); Value *Cmp = CXI->getCompareOperand(); Value *Val = CXI->getNewValOperand(); - + LoadInst *Orig = Builder.CreateLoad(Ptr); Value *Equal = Builder.CreateICmpEQ(Orig, Cmp); Value *Res = Builder.CreateSelect(Equal, Val, Orig); Builder.CreateStore(Res, Ptr); - + CXI->replaceAllUsesWith(Orig); CXI->eraseFromParent(); return true; diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index a87cce3..2a5ee33 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -15,21 +15,21 @@ #define DEBUG_TYPE "memcpyopt" #include "llvm/Transforms/Scalar.h" #include "llvm/GlobalVariable.h" -#include "llvm/IntrinsicInst.h" +#include "llvm/IRBuilder.h" #include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Support/Debug.h" #include "llvm/Support/GetElementPtrTypeIterator.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/Local.h" #include <list> using namespace llvm; @@ -44,7 +44,7 @@ static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx, gep_type_iterator GTI = gep_type_begin(GEP); for (unsigned i = 1; i != Idx; ++i, ++GTI) /*skip along*/; - + // Compute the offset implied by the rest of the indices. 
int64_t Offset = 0; for (unsigned i = Idx, e = GEP->getNumOperands(); i != e; ++i, ++GTI) { @@ -58,7 +58,7 @@ static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx, Offset += TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue()); continue; } - + // Otherwise, we have a sequential type like an array or vector. Multiply // the index by the ElementSize. uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType()); @@ -77,7 +77,7 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, Ptr2 = Ptr2->stripPointerCasts(); GetElementPtrInst *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1); GetElementPtrInst *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2); - + bool VariableIdxFound = false; // If one pointer is a GEP and the other isn't, then see if the GEP is a @@ -91,7 +91,7 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, Offset = GetOffsetFromIndex(GEP2, 1, VariableIdxFound, TD); return !VariableIdxFound; } - + // Right now we handle the case when Ptr1/Ptr2 are both GEPs with an identical // base. After that base, they may have some number of common (and // potentially variable) indices. After that they handle some constant @@ -99,7 +99,7 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, // handle no other case. if (!GEP1 || !GEP2 || GEP1->getOperand(0) != GEP2->getOperand(0)) return false; - + // Skip any common indices and track the GEP types. unsigned Idx = 1; for (; Idx != GEP1->getNumOperands() && Idx != GEP2->getNumOperands(); ++Idx) @@ -109,7 +109,7 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, TD); int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, TD); if (VariableIdxFound) return false; - + Offset = Offset2-Offset1; return true; } @@ -128,19 +128,19 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, namespace { struct MemsetRange { // Start/End - A semi range that describes the span that this range covers. - // The range is closed at the start and open at the end: [Start, End). + // The range is closed at the start and open at the end: [Start, End). int64_t Start, End; /// StartPtr - The getelementptr instruction that points to the start of the /// range. Value *StartPtr; - + /// Alignment - The known alignment of the first store. unsigned Alignment; - + /// TheStores - The actual stores that make up this range. SmallVector<Instruction*, 16> TheStores; - + bool isProfitableToUseMemset(const TargetData &TD) const; }; @@ -152,17 +152,17 @@ bool MemsetRange::isProfitableToUseMemset(const TargetData &TD) const { // If there is nothing to merge, don't do anything. if (TheStores.size() < 2) return false; - + // If any of the stores are a memset, then it is always good to extend the // memset. for (unsigned i = 0, e = TheStores.size(); i != e; ++i) if (!isa<StoreInst>(TheStores[i])) return true; - + // Assume that the code generator is capable of merging pairs of stores // together if it wants to. if (TheStores.size() == 2) return false; - + // If we have fewer than 8 stores, it can still be worthwhile to do this. // For example, merging 4 i8 stores into an i32 store is useful almost always. // However, merging 2 32-bit stores isn't useful on a 32-bit architecture (the @@ -175,15 +175,15 @@ bool MemsetRange::isProfitableToUseMemset(const TargetData &TD) const { // actually reducing the number of stores used. 
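To see the heuristic that follows with concrete numbers, assume TD.getPointerSize() is 4. Sixteen adjacent i8 stores give Bytes = 16, NumPointerStores = 4 and NumByteStores = 0; 16 > 4 holds, so the range is worth turning into a memset. Four adjacent i32 stores also give Bytes = 16 and NumPointerStores = 4, but TheStores.size() is 4 and 4 > 4 fails, so that run is left alone, since the code generator can already merge register-width stores on its own.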
   unsigned Bytes = unsigned(End-Start);
   unsigned NumPointerStores = Bytes/TD.getPointerSize();
-  
+
   // Assume the remaining bytes if any are done a byte at a time.
   unsigned NumByteStores = Bytes - NumPointerStores*TD.getPointerSize();
-  
+
   // If we will reduce the # stores (according to this heuristic), do the
   // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
   // etc.
   return TheStores.size() > NumPointerStores+NumByteStores;
-} 
+}

 namespace {
@@ -195,12 +195,12 @@ class MemsetRanges {
   const TargetData &TD;
 public:
   MemsetRanges(const TargetData &td) : TD(td) {}
-  
+
   typedef std::list<MemsetRange>::const_iterator const_iterator;
   const_iterator begin() const { return Ranges.begin(); }
   const_iterator end() const { return Ranges.end(); }
   bool empty() const { return Ranges.empty(); }
-  
+
   void addInst(int64_t OffsetFromFirst, Instruction *Inst) {
     if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
       addStore(OffsetFromFirst, SI);
@@ -210,21 +210,21 @@ public:

   void addStore(int64_t OffsetFromFirst, StoreInst *SI) {
     int64_t StoreSize = TD.getTypeStoreSize(SI->getOperand(0)->getType());
-    
+
     addRange(OffsetFromFirst, StoreSize, SI->getPointerOperand(),
              SI->getAlignment(), SI);
   }
-  
+
   void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) {
     int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue();
     addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getAlignment(), MSI);
   }
-  
+
   void addRange(int64_t Start, int64_t Size, Value *Ptr,
                 unsigned Alignment, Instruction *Inst);

 };
-  
+
 } // end anon namespace
@@ -240,10 +240,10 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
                             unsigned Alignment, Instruction *Inst) {
   int64_t End = Start+Size;
   range_iterator I = Ranges.begin(), E = Ranges.end();
-  
+
   while (I != E && Start > I->End)
     ++I;
-  
+
   // We now know that I == E, in which case we didn't find anything to merge
   // with, or that Start <= I->End. If End < I->Start or I == E, then we need
   // to insert a new range. Handle this now.
@@ -256,18 +256,18 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
     R.TheStores.push_back(Inst);
     return;
   }
-  
+
   // This store overlaps with I, add it.
   I->TheStores.push_back(Inst);
-  
+
   // At this point, we may have an interval that completely contains our store.
   // If so, just add it to the interval and return.
   if (I->Start <= Start && I->End >= End)
     return;
-  
+
   // Now we know that Start <= I->End and End >= I->Start so the range overlaps
   // but is not entirely contained within the range.
-  
+
   // See if the range extends the start of the range. In this case, it couldn't
   // possibly cause it to join the prior range, because otherwise we would have
   // stopped on *it*.
@@ -276,7 +276,7 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
     I->StartPtr = Ptr;
     I->Alignment = Alignment;
   }
-  
+
   // Now we know that Start <= I->End and Start >= I->Start (so the startpoint
   // is in or right at the end of I), and that End >= I->Start. Extend I out to
   // End.
@@ -325,7 +325,7 @@ namespace {
       AU.addPreserved<AliasAnalysis>();
       AU.addPreserved<MemoryDependenceAnalysis>();
     }
-  
+
     // Helper functions
     bool processStore(StoreInst *SI, BasicBlock::iterator &BBI);
     bool processMemSet(MemSetInst *SI, BasicBlock::iterator &BBI);
@@ -341,7 +341,7 @@ namespace {
     bool iterateOnFunction(Function &F);
   };
-  
+
   char MemCpyOpt::ID = 0;
 }
@@ -361,16 +361,16 @@ INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
 /// some other patterns to fold away. In particular, this looks for stores to
 /// neighboring locations of memory.
If it sees enough consecutive ones, it /// attempts to merge them together into a memcpy/memset. -Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, +Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, Value *StartPtr, Value *ByteVal) { if (TD == 0) return 0; - + // Okay, so we now have a single store that can be splatable. Scan to find // all subsequent stores of the same value to offset from the same pointer. // Join these together into ranges, so we can decide whether contiguous blocks // are stored. MemsetRanges Ranges(*TD); - + BasicBlock::iterator BI = StartInst; for (++BI; !isa<TerminatorInst>(BI); ++BI) { if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) { @@ -381,43 +381,43 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, break; continue; } - + if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) { // If this is a store, see if we can merge it in. if (!NextStore->isSimple()) break; - + // Check to see if this stored value is of the same byte-splattable value. if (ByteVal != isBytewiseValue(NextStore->getOperand(0))) break; - + // Check to see if this store is to a constant offset from the start ptr. int64_t Offset; if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset, *TD)) break; - + Ranges.addStore(Offset, NextStore); } else { MemSetInst *MSI = cast<MemSetInst>(BI); - + if (MSI->isVolatile() || ByteVal != MSI->getValue() || !isa<ConstantInt>(MSI->getLength())) break; - + // Check to see if this store is to a constant offset from the start ptr. int64_t Offset; if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, *TD)) break; - + Ranges.addMemSet(Offset, MSI); } } - + // If we have no ranges, then we just had a single store with nothing that // could be merged in. This is a very common case of course. if (Ranges.empty()) return 0; - + // If we had at least one store that could be merged in, add the starting // store as well. We try to avoid this unless there is at least something // interesting as a small compile-time optimization. @@ -434,28 +434,28 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, for (MemsetRanges::const_iterator I = Ranges.begin(), E = Ranges.end(); I != E; ++I) { const MemsetRange &Range = *I; - + if (Range.TheStores.size() == 1) continue; - + // If it is profitable to lower this range to memset, do so now. if (!Range.isProfitableToUseMemset(*TD)) continue; - + // Otherwise, we do want to transform this! Create a new memset. // Get the starting pointer of the block. 
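Source-level picture of the rewrite being performed here, purely illustrative: a run of adjacent splat stores collapses into one memset call.

void clearHeader(unsigned char *P) {
  P[0] = 0; P[1] = 0; P[2] = 0; P[3] = 0;
  P[4] = 0; P[5] = 0; P[6] = 0; P[7] = 0;  // -> memset(P, 0, 8)
}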
StartPtr = Range.StartPtr; - + // Determine alignment unsigned Alignment = Range.Alignment; if (Alignment == 0) { - Type *EltType = + Type *EltType = cast<PointerType>(StartPtr->getType())->getElementType(); Alignment = TD->getABITypeAlignment(EltType); } - - AMemSet = + + AMemSet = Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment); - + DEBUG(dbgs() << "Replace stores:\n"; for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i) dbgs() << *Range.TheStores[i] << '\n'; @@ -473,14 +473,14 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, } ++NumMemSetInfer; } - + return AMemSet; } bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (!SI->isSimple()) return false; - + if (TD == 0) return false; // Detect cases where we're performing call slot forwarding, but @@ -510,7 +510,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (C) { bool changed = performCallSlotOptzn(LI, - SI->getPointerOperand()->stripPointerCasts(), + SI->getPointerOperand()->stripPointerCasts(), LI->getPointerOperand()->stripPointerCasts(), TD->getTypeStoreSize(SI->getOperand(0)->getType()), C); if (changed) { @@ -524,10 +524,10 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { } } } - + // There are two cases that are interesting for this code to handle: memcpy // and memset. Right now we only handle memset. - + // Ensure that the value being stored is something that can be memset'able a // byte at a time like "0" or "-1" or any width, as well as things like // 0xA0A0A0A0 and 0.0. @@ -537,7 +537,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { BBI = I; // Don't invalidate iterator. return true; } - + return false; } @@ -662,7 +662,11 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, // the use analysis, we also need to know that it does not sneakily // access dest. We rely on AA to figure this out for us. AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); - if (AA.getModRefInfo(C, cpyDest, srcSize) != AliasAnalysis::NoModRef) + AliasAnalysis::ModRefResult MR = AA.getModRefInfo(C, cpyDest, srcSize); + // If necessary, perform additional analysis. + if (MR != AliasAnalysis::NoModRef) + MR = AA.callCapturesBefore(C, cpyDest, srcSize, &DT); + if (MR != AliasAnalysis::NoModRef) return false; // All the checks have passed, so do the transformation. @@ -676,7 +680,7 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, if (CS.getArgument(i)->getType() == cpyDest->getType()) CS.setArgument(i, cpyDest); else - CS.setArgument(i, CastInst::CreatePointerCast(cpyDest, + CS.setArgument(i, CastInst::CreatePointerCast(cpyDest, CS.getArgument(i)->getType(), cpyDest->getName(), C)); } @@ -697,14 +701,14 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, /// processMemCpyMemCpyDependence - We've found that the (upward scanning) /// memory dependence of memcpy 'M' is the memcpy 'MDep'. Try to simplify M to /// copy from MDep's input if we can. MSize is the size of M's copy. -/// +/// bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, uint64_t MSize) { // We can only transforms memcpy's where the dest of one is the source of the // other. if (M->getSource() != MDep->getDest() || MDep->isVolatile()) return false; - + // If dep instruction is reading from our current input, then it is a noop // transfer and substituting the input won't change this instruction. Just // ignore the input and let someone else zap MDep. 
This handles cases like: @@ -712,14 +716,14 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, // memcpy(b <- a) if (M->getSource() == MDep->getSource()) return false; - + // Second, the length of the memcpy's must be the same, or the preceding one // must be larger than the following one. ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength()); ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength()); if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue()) return false; - + AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); // Verify that the copied-from memory doesn't change in between the two @@ -739,23 +743,23 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, false, M, M->getParent()); if (!SourceDep.isClobber() || SourceDep.getInst() != MDep) return false; - + // If the dest of the second might alias the source of the first, then the // source and dest might overlap. We still want to eliminate the intermediate // value, but we have to generate a memmove instead of memcpy. bool UseMemMove = false; if (!AA.isNoAlias(AA.getLocationForDest(M), AA.getLocationForSource(MDep))) UseMemMove = true; - + // If all checks passed, then we can transform M. - + // Make sure to use the lesser of the alignment of the source and the dest // since we're changing where we're reading from, but don't want to increase // the alignment past what can be read from or written to. // TODO: Is this worth it if we're creating a less aligned memcpy? For // example we could be moving from movaps -> movq on x86. unsigned Align = std::min(MDep->getAlignment(), M->getAlignment()); - + IRBuilder<> Builder(M); if (UseMemMove) Builder.CreateMemMove(M->getRawDest(), MDep->getRawSource(), M->getLength(), @@ -835,13 +839,13 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) { if (!TLI->has(LibFunc::memmove)) return false; - + // See if the pointers alias. if (!AA.isNoAlias(AA.getLocationForDest(M), AA.getLocationForSource(M))) return false; - + DEBUG(dbgs() << "MemCpyOpt: Optimizing memmove -> memcpy: " << *M << "\n"); - + // If not, then we know we can transform this. Module *Mod = M->getParent()->getParent()->getParent(); Type *ArgTys[3] = { M->getRawDest()->getType(), @@ -857,7 +861,7 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) { ++NumMoveToCpy; return true; } - + /// processByValArgument - This is called on every byval argument in call sites. bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { if (TD == 0) return false; @@ -880,7 +884,7 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { if (MDep == 0 || MDep->isVolatile() || ByValArg->stripPointerCasts() != MDep->getDest()) return false; - + // The length of the memcpy must be larger or equal to the size of the byval. ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength()); if (C1 == 0 || C1->getValue().getZExtValue() < ByValSize) @@ -890,13 +894,13 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { // then it is some target specific value that we can't know. unsigned ByValAlign = CS.getParamAlignment(ArgNo+1); if (ByValAlign == 0) return false; - + // If it is greater than the memcpy, then we check to see if we can force the // source of the memcpy to the alignment we need. If we fail, we bail out. 
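A small illustration of the memcpy-to-memcpy forwarding handled by processMemCpyMemCpyDependence (function and variable names here are hypothetical): when the second copy reads exactly the bytes the first one wrote, it can read from the original source instead, and the intermediate buffer becomes dead.

#include <cstddef>
#include <cstring>

void forwardCopies(char *a, char *b, char *c, std::size_t n) {
  std::memcpy(b, a, n);   // MDep: b <- a
  std::memcpy(c, b, n);   // M: c <- b, rewritable to c <- a as long as 'a'
                          // is not modified between the two copies
}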
if (MDep->getAlignment() < ByValAlign && getOrEnforceKnownAlignment(MDep->getSource(),ByValAlign, TD) < ByValAlign) return false; - + // Verify that the copied-from memory doesn't change in between the memcpy and // the byval call. // memcpy(a <- b) @@ -911,16 +915,16 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { false, CS.getInstruction(), MDep->getParent()); if (!SourceDep.isClobber() || SourceDep.getInst() != MDep) return false; - + Value *TmpCast = MDep->getSource(); if (MDep->getSource()->getType() != ByValArg->getType()) TmpCast = new BitCastInst(MDep->getSource(), ByValArg->getType(), "tmpcast", CS.getInstruction()); - + DEBUG(dbgs() << "MemCpyOpt: Forwarding memcpy to byval:\n" << " " << *MDep << "\n" << " " << *CS.getInstruction() << "\n"); - + // Otherwise we're good! Update the byval argument. CS.setArgument(ArgNo, TmpCast); ++NumMemCpyInstr; @@ -936,9 +940,9 @@ bool MemCpyOpt::iterateOnFunction(Function &F) { for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { // Avoid invalidating the iterator. Instruction *I = BI++; - + bool RepeatInstruction = false; - + if (StoreInst *SI = dyn_cast<StoreInst>(I)) MadeChange |= processStore(SI, BI); else if (MemSetInst *M = dyn_cast<MemSetInst>(I)) @@ -960,7 +964,7 @@ bool MemCpyOpt::iterateOnFunction(Function &F) { } } } - + return MadeChange; } @@ -972,19 +976,19 @@ bool MemCpyOpt::runOnFunction(Function &F) { MD = &getAnalysis<MemoryDependenceAnalysis>(); TD = getAnalysisIfAvailable<TargetData>(); TLI = &getAnalysis<TargetLibraryInfo>(); - + // If we don't have at least memset and memcpy, there is little point of doing // anything here. These are required by a freestanding implementation, so if // even they are disabled, there is no point in trying hard. if (!TLI->has(LibFunc::memset) || !TLI->has(LibFunc::memcpy)) return false; - + while (1) { if (!iterateOnFunction(F)) break; MadeChange = true; } - + MD = 0; return MadeChange; } diff --git a/lib/Transforms/Scalar/ObjCARC.cpp b/lib/Transforms/Scalar/ObjCARC.cpp index 29234da..3222f20 100644 --- a/lib/Transforms/Scalar/ObjCARC.cpp +++ b/lib/Transforms/Scalar/ObjCARC.cpp @@ -20,7 +20,7 @@ // This file also defines a simple ARC-aware AliasAnalysis. // // WARNING: This file knows about certain library functions. It recognizes them -// by name, and hardwires knowedge of their semantics. +// by name, and hardwires knowledge of their semantics. // // WARNING: This file knows about how certain Objective-C library functions are // used. Naive LLVM IR transformations which would otherwise be @@ -29,18 +29,8 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "objc-arc" -#include "llvm/Function.h" -#include "llvm/Intrinsics.h" -#include "llvm/GlobalVariable.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Module.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Support/CallSite.h" #include "llvm/Support/CommandLine.h" -#include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/STLExtras.h" using namespace llvm; // A handy option to enable/disable all optimizations in this file. @@ -141,6 +131,13 @@ namespace { // ARC Utilities. 
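Before the ObjCARC changes continue, a sketch of the byval forwarding performed by processByValArgument above (illustrative names; at the IR level the pass-by-value argument is an explicit byval copy): if the argument's backing memory was just filled by a memcpy from some source, and that source is unchanged up to the call, the callee can be handed the source memory directly.

#include <cstring>

struct Payload { int data[16]; };
void callee(Payload p);                    // pass-by-value: a byval copy in IR

void caller(const Payload *src) {
  Payload tmp;
  std::memcpy(&tmp, src, sizeof(Payload)); // MDep
  callee(tmp);                             // candidate: build the byval copy
                                           // straight from *src
}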
//===----------------------------------------------------------------------===// +#include "llvm/Intrinsics.h" +#include "llvm/Module.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Support/CallSite.h" +#include "llvm/ADT/StringSwitch.h" + namespace { /// InstructionClass - A simple classification for instructions. enum InstructionClass { @@ -299,22 +296,23 @@ static InstructionClass GetInstructionClass(const Value *V) { // None of the intrinsic functions do objc_release. For intrinsics, the // only question is whether or not they may be users. switch (F->getIntrinsicID()) { - case 0: break; - case Intrinsic::bswap: case Intrinsic::ctpop: - case Intrinsic::ctlz: case Intrinsic::cttz: case Intrinsic::returnaddress: case Intrinsic::frameaddress: case Intrinsic::stacksave: case Intrinsic::stackrestore: case Intrinsic::vastart: case Intrinsic::vacopy: case Intrinsic::vaend: + case Intrinsic::objectsize: case Intrinsic::prefetch: + case Intrinsic::stackprotector: + case Intrinsic::eh_return_i32: case Intrinsic::eh_return_i64: + case Intrinsic::eh_typeid_for: case Intrinsic::eh_dwarf_cfa: + case Intrinsic::eh_sjlj_lsda: case Intrinsic::eh_sjlj_functioncontext: + case Intrinsic::init_trampoline: case Intrinsic::adjust_trampoline: + case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: + case Intrinsic::invariant_start: case Intrinsic::invariant_end: // Don't let dbg info affect our results. case Intrinsic::dbg_declare: case Intrinsic::dbg_value: // Short cut: Some intrinsics obviously don't use ObjC pointers. return IC_None; default: - for (Function::const_arg_iterator AI = F->arg_begin(), - AE = F->arg_end(); AI != AE; ++AI) - if (IsPotentialUse(AI)) - return IC_User; - return IC_None; + break; } } return GetCallSiteClass(CI); @@ -382,14 +380,14 @@ static InstructionClass GetBasicInstructionClass(const Value *V) { return isa<InvokeInst>(V) ? IC_CallOrUser : IC_User; } -/// IsRetain - Test if the the given class is objc_retain or +/// IsRetain - Test if the given class is objc_retain or /// equivalent. static bool IsRetain(InstructionClass Class) { return Class == IC_Retain || Class == IC_RetainRV; } -/// IsAutorelease - Test if the the given class is objc_autorelease or +/// IsAutorelease - Test if the given class is objc_autorelease or /// equivalent. static bool IsAutorelease(InstructionClass Class) { return Class == IC_Autorelease || @@ -444,7 +442,7 @@ static bool IsNoThrow(InstructionClass Class) { Class == IC_AutoreleasepoolPop; } -/// EraseInstruction - Erase the given instruction. ObjC calls return their +/// EraseInstruction - Erase the given instruction. Many ObjC calls return their /// argument verbatim, so if it's such a call and the return value has users, /// replace them with the argument value. static void EraseInstruction(Instruction *CI) { @@ -565,9 +563,8 @@ static const Value *FindSingleUseIdentifiedObject(const Value *Arg) { return Arg; } - // If we found an identifiable object but it has multiple uses, but they - // are trivial uses, we can still consider this to be a single-use - // value. + // If we found an identifiable object but it has multiple uses, but they are + // trivial uses, we can still consider this to be a single-use value. if (IsObjCIdentifiedObject(Arg)) { for (Value::const_use_iterator UI = Arg->use_begin(), UE = Arg->use_end(); UI != UE; ++UI) { @@ -692,7 +689,7 @@ namespace { /// specified pass info. 
virtual void *getAdjustedAnalysisPointer(const void *PI) { if (PI == &AliasAnalysis::ID) - return (AliasAnalysis*)this; + return static_cast<AliasAnalysis *>(this); return this; } @@ -815,7 +812,7 @@ ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, const Location &Loc) { case IC_FusedRetainAutorelease: case IC_FusedRetainAutoreleaseRV: // These functions don't access any memory visible to the compiler. - // Note that this doesn't include objc_retainBlock, becuase it updates + // Note that this doesn't include objc_retainBlock, because it updates // pointers when it copies block data. return NoModRef; default: @@ -915,6 +912,7 @@ bool ObjCARCExpand::runOnFunction(Function &F) { //===----------------------------------------------------------------------===// #include "llvm/Constants.h" +#include "llvm/ADT/STLExtras.h" namespace { /// ObjCARCAPElim - Autorelease pool elimination. @@ -922,8 +920,8 @@ namespace { virtual void getAnalysisUsage(AnalysisUsage &AU) const; virtual bool runOnModule(Module &M); - bool MayAutorelease(CallSite CS, unsigned Depth = 0); - bool OptimizeBB(BasicBlock *BB); + static bool MayAutorelease(ImmutableCallSite CS, unsigned Depth = 0); + static bool OptimizeBB(BasicBlock *BB); public: static char ID; @@ -949,15 +947,16 @@ void ObjCARCAPElim::getAnalysisUsage(AnalysisUsage &AU) const { /// MayAutorelease - Interprocedurally determine if calls made by the /// given call site can possibly produce autoreleases. -bool ObjCARCAPElim::MayAutorelease(CallSite CS, unsigned Depth) { - if (Function *Callee = CS.getCalledFunction()) { +bool ObjCARCAPElim::MayAutorelease(ImmutableCallSite CS, unsigned Depth) { + if (const Function *Callee = CS.getCalledFunction()) { if (Callee->isDeclaration() || Callee->mayBeOverridden()) return true; - for (Function::iterator I = Callee->begin(), E = Callee->end(); + for (Function::const_iterator I = Callee->begin(), E = Callee->end(); I != E; ++I) { - BasicBlock *BB = I; - for (BasicBlock::iterator J = BB->begin(), F = BB->end(); J != F; ++J) - if (CallSite JCS = CallSite(J)) + const BasicBlock *BB = I; + for (BasicBlock::const_iterator J = BB->begin(), F = BB->end(); + J != F; ++J) + if (ImmutableCallSite JCS = ImmutableCallSite(J)) // This recursion depth limit is arbitrary. It's just great // enough to cover known interesting testcases. if (Depth < 3 && @@ -992,7 +991,7 @@ bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) { Push = 0; break; case IC_CallOrUser: - if (MayAutorelease(CallSite(Inst))) + if (MayAutorelease(ImmutableCallSite(Inst))) Push = 0; break; default: @@ -1093,14 +1092,10 @@ bool ObjCARCAPElim::runOnModule(Module &M) { // TODO: Delete release+retain pairs (rare). -#include "llvm/GlobalAlias.h" -#include "llvm/Constants.h" #include "llvm/LLVMContext.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/CFG.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/DenseSet.h" STATISTIC(NumNoops, "Number of no-op objc calls eliminated"); STATISTIC(NumPartialNoops, "Number of partially no-op objc calls eliminated"); @@ -1148,22 +1143,13 @@ bool ProvenanceAnalysis::relatedSelect(const SelectInst *A, const Value *B) { // If the values are Selects with the same condition, we can do a more precise // check: just check for relations between the values on corresponding arms. 
   if (const SelectInst *SB = dyn_cast<SelectInst>(B))
-    if (A->getCondition() == SB->getCondition()) {
-      if (related(A->getTrueValue(), SB->getTrueValue()))
-        return true;
-      if (related(A->getFalseValue(), SB->getFalseValue()))
-        return true;
-      return false;
-    }
+    if (A->getCondition() == SB->getCondition())
+      return related(A->getTrueValue(), SB->getTrueValue()) ||
+             related(A->getFalseValue(), SB->getFalseValue());
 
   // Check both arms of the Select node individually.
-  if (related(A->getTrueValue(), B))
-    return true;
-  if (related(A->getFalseValue(), B))
-    return true;
-
-  // The arms both checked out.
-  return false;
+  return related(A->getTrueValue(), B) ||
+         related(A->getFalseValue(), B);
 }
 
 bool ProvenanceAnalysis::relatedPHI(const PHINode *A, const Value *B) {
@@ -1361,12 +1347,6 @@ namespace {
     /// with the "tail" keyword.
     bool IsTailCallRelease;
 
-    /// Partial - True of we've seen an opportunity for partial RR elimination,
-    /// such as pushing calls into a CFG triangle or into one side of a
-    /// CFG diamond.
-    /// TODO: Consider moving this to PtrState.
-    bool Partial;
-
     /// ReleaseMetadata - If the Calls are objc_release calls and they all have
     /// a clang.imprecise_release tag, this is the metadata tag.
     MDNode *ReleaseMetadata;
@@ -1381,7 +1361,7 @@ namespace {
 
     RRInfo() :
       KnownSafe(false), IsRetainBlock(false),
-      IsTailCallRelease(false), Partial(false),
+      IsTailCallRelease(false),
       ReleaseMetadata(0) {}
 
     void clear();
@@ -1392,7 +1372,6 @@ void RRInfo::clear() {
   KnownSafe = false;
   IsRetainBlock = false;
   IsTailCallRelease = false;
-  Partial = false;
   ReleaseMetadata = 0;
   Calls.clear();
   ReverseInsertPts.clear();
@@ -1402,36 +1381,39 @@ namespace {
   /// PtrState - This class summarizes several per-pointer runtime properties
   /// which are propagated through the flow graph.
   class PtrState {
-    /// RefCount - The known minimum number of reference count increments.
-    unsigned RefCount;
-
     /// NestCount - The known minimum level of retain+release nesting.
     unsigned NestCount;
 
+    /// KnownPositiveRefCount - True if the reference count is known to
+    /// be incremented.
+    bool KnownPositiveRefCount;
+
+    /// Partial - True if we've seen an opportunity for partial RR elimination,
+    /// such as pushing calls into a CFG triangle or into one side of a
+    /// CFG diamond.
+    bool Partial;
+
     /// Seq - The current position in the sequence.
-    Sequence Seq;
+    Sequence Seq : 8;
 
   public:
     /// RRI - Unidirectional information about the current sequence.
     /// TODO: Encapsulate this better.
RRInfo RRI; - PtrState() : RefCount(0), NestCount(0), Seq(S_None) {} - - void SetAtLeastOneRefCount() { - if (RefCount == 0) RefCount = 1; - } + PtrState() : NestCount(0), KnownPositiveRefCount(false), Partial(false), + Seq(S_None) {} - void IncrementRefCount() { - if (RefCount != UINT_MAX) ++RefCount; + void SetKnownPositiveRefCount() { + KnownPositiveRefCount = true; } - void DecrementRefCount() { - if (RefCount != 0) --RefCount; + void ClearRefCount() { + KnownPositiveRefCount = false; } bool IsKnownIncremented() const { - return RefCount > 0; + return KnownPositiveRefCount; } void IncrementNestCount() { @@ -1455,7 +1437,12 @@ namespace { } void ClearSequenceProgress() { - Seq = S_None; + ResetSequenceProgress(S_None); + } + + void ResetSequenceProgress(Sequence NewSeq) { + Seq = NewSeq; + Partial = false; RRI.clear(); } @@ -1466,7 +1453,7 @@ namespace { void PtrState::Merge(const PtrState &Other, bool TopDown) { Seq = MergeSeqs(Seq, Other.Seq, TopDown); - RefCount = std::min(RefCount, Other.RefCount); + KnownPositiveRefCount = KnownPositiveRefCount && Other.KnownPositiveRefCount; NestCount = std::min(NestCount, Other.NestCount); // We can't merge a plain objc_retain with an objc_retainBlock. @@ -1475,31 +1462,31 @@ PtrState::Merge(const PtrState &Other, bool TopDown) { // If we're not in a sequence (anymore), drop all associated state. if (Seq == S_None) { + Partial = false; RRI.clear(); - } else if (RRI.Partial || Other.RRI.Partial) { + } else if (Partial || Other.Partial) { // If we're doing a merge on a path that's previously seen a partial // merge, conservatively drop the sequence, to avoid doing partial // RR elimination. If the branch predicates for the two merge differ, // mixing them is unsafe. - Seq = S_None; - RRI.clear(); + ClearSequenceProgress(); } else { // Conservatively merge the ReleaseMetadata information. if (RRI.ReleaseMetadata != Other.RRI.ReleaseMetadata) RRI.ReleaseMetadata = 0; RRI.KnownSafe = RRI.KnownSafe && Other.RRI.KnownSafe; - RRI.IsTailCallRelease = RRI.IsTailCallRelease && Other.RRI.IsTailCallRelease; + RRI.IsTailCallRelease = RRI.IsTailCallRelease && + Other.RRI.IsTailCallRelease; RRI.Calls.insert(Other.RRI.Calls.begin(), Other.RRI.Calls.end()); // Merge the insert point sets. If there are any differences, // that makes this a partial merge. - RRI.Partial = RRI.ReverseInsertPts.size() != - Other.RRI.ReverseInsertPts.size(); + Partial = RRI.ReverseInsertPts.size() != Other.RRI.ReverseInsertPts.size(); for (SmallPtrSet<Instruction *, 2>::const_iterator I = Other.RRI.ReverseInsertPts.begin(), E = Other.RRI.ReverseInsertPts.end(); I != E; ++I) - RRI.Partial |= RRI.ReverseInsertPts.insert(*I); + Partial |= RRI.ReverseInsertPts.insert(*I); } } @@ -1525,6 +1512,11 @@ namespace { /// known about a pointer at the top of each block. MapTy PerPtrBottomUp; + /// Preds, Succs - Effective successors and predecessors of the current + /// block (this ignores ignorable edges and ignored backedges). + SmallVector<BasicBlock *, 2> Preds; + SmallVector<BasicBlock *, 2> Succs; + public: BBState() : TopDownPathCount(0), BottomUpPathCount(0) {} @@ -1582,14 +1574,22 @@ namespace { /// entry to an exit which pass through this block. This is only valid /// after both the top-down and bottom-up traversals are complete. 
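A reduced model of the merge semantics after this change (only the two fields shown above are modeled; this is an assumption-laden sketch, not the pass's real class): the old counted RefCount, merged with min(), becomes a known-positive bit merged with logical AND, which is the same lattice meet restricted to the two points "zero" and "at least one".

struct PtrStateModel {
  bool KnownPositiveRefCount;   // replaces the old 'unsigned RefCount'
  unsigned NestCount;

  void merge(const PtrStateModel &Other) {
    // min(a, b) > 0 iff a > 0 && b > 0, so the bit form loses nothing
    // that the optimizer actually consumed.
    KnownPositiveRefCount = KnownPositiveRefCount &&
                            Other.KnownPositiveRefCount;
    NestCount = NestCount < Other.NestCount ? NestCount : Other.NestCount;
  }
};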
unsigned GetAllPathCount() const { + assert(TopDownPathCount != 0); + assert(BottomUpPathCount != 0); return TopDownPathCount * BottomUpPathCount; } - /// IsVisitedTopDown - Test whether the block for this BBState has been - /// visited by the top-down portion of the algorithm. - bool isVisitedTopDown() const { - return TopDownPathCount != 0; - } + // Specialized CFG utilities. + typedef SmallVectorImpl<BasicBlock *>::const_iterator edge_iterator; + edge_iterator pred_begin() { return Preds.begin(); } + edge_iterator pred_end() { return Preds.end(); } + edge_iterator succ_begin() { return Succs.begin(); } + edge_iterator succ_end() { return Succs.end(); } + + void addSucc(BasicBlock *Succ) { Succs.push_back(Succ); } + void addPred(BasicBlock *Pred) { Preds.push_back(Pred); } + + bool isExit() const { return Succs.empty(); } }; } @@ -1787,12 +1787,9 @@ Constant *ObjCARCOpt::getRetainRVCallee(Module *M) { if (!RetainRVCallee) { LLVMContext &C = M->getContext(); Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - std::vector<Type *> Params; - Params.push_back(I8X); - FunctionType *FTy = - FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttrListPtr Attributes; - Attributes.addAttr(~0u, Attribute::NoUnwind); + Type *Params[] = { I8X }; + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); RetainRVCallee = M->getOrInsertFunction("objc_retainAutoreleasedReturnValue", FTy, Attributes); @@ -1804,12 +1801,9 @@ Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) { if (!AutoreleaseRVCallee) { LLVMContext &C = M->getContext(); Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - std::vector<Type *> Params; - Params.push_back(I8X); - FunctionType *FTy = - FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttrListPtr Attributes; - Attributes.addAttr(~0u, Attribute::NoUnwind); + Type *Params[] = { I8X }; + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); AutoreleaseRVCallee = M->getOrInsertFunction("objc_autoreleaseReturnValue", FTy, Attributes); @@ -1820,10 +1814,8 @@ Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) { Constant *ObjCARCOpt::getReleaseCallee(Module *M) { if (!ReleaseCallee) { LLVMContext &C = M->getContext(); - std::vector<Type *> Params; - Params.push_back(PointerType::getUnqual(Type::getInt8Ty(C))); - AttrListPtr Attributes; - Attributes.addAttr(~0u, Attribute::NoUnwind); + Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; + AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); ReleaseCallee = M->getOrInsertFunction( "objc_release", @@ -1836,10 +1828,8 @@ Constant *ObjCARCOpt::getReleaseCallee(Module *M) { Constant *ObjCARCOpt::getRetainCallee(Module *M) { if (!RetainCallee) { LLVMContext &C = M->getContext(); - std::vector<Type *> Params; - Params.push_back(PointerType::getUnqual(Type::getInt8Ty(C))); - AttrListPtr Attributes; - Attributes.addAttr(~0u, Attribute::NoUnwind); + Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; + AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); RetainCallee = M->getOrInsertFunction( "objc_retain", @@ -1852,16 +1842,14 @@ Constant *ObjCARCOpt::getRetainCallee(Module *M) { Constant *ObjCARCOpt::getRetainBlockCallee(Module *M) { if (!RetainBlockCallee) { LLVMContext &C = M->getContext(); - std::vector<Type *> Params; - 
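A worked example for the GetAllPathCount product above, on an assumed diamond CFG entry -> {A, B} -> exit:

// TopDownPathCount counts entry-to-block paths:
//   entry = 1, A = 1, B = 1, exit = 1 + 1 = 2.
// BottomUpPathCount counts block-to-exit paths:
//   exit = 1, A = 1, B = 1, entry = 2.
// GetAllPathCount() for A is therefore 1 * 1 = 1 (one complete path runs
// through A), while for entry or exit it is 1 * 2 = 2, the total number of
// paths. The new asserts hold because both traversals visit every block
// before the product is taken.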
Params.push_back(PointerType::getUnqual(Type::getInt8Ty(C))); - AttrListPtr Attributes; + Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; // objc_retainBlock is not nounwind because it calls user copy constructors // which could theoretically throw. RetainBlockCallee = M->getOrInsertFunction( "objc_retainBlock", FunctionType::get(Params[0], Params, /*isVarArg=*/false), - Attributes); + AttrListPtr()); } return RetainBlockCallee; } @@ -1869,10 +1857,8 @@ Constant *ObjCARCOpt::getRetainBlockCallee(Module *M) { Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) { if (!AutoreleaseCallee) { LLVMContext &C = M->getContext(); - std::vector<Type *> Params; - Params.push_back(PointerType::getUnqual(Type::getInt8Ty(C))); - AttrListPtr Attributes; - Attributes.addAttr(~0u, Attribute::NoUnwind); + Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; + AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); AutoreleaseCallee = M->getOrInsertFunction( "objc_autorelease", @@ -2157,13 +2143,13 @@ static bool isNoopInstruction(const Instruction *I) { /// objc_retainAutoreleasedReturnValue if the operand is a return value. void ObjCARCOpt::OptimizeRetainCall(Function &F, Instruction *Retain) { - CallSite CS(GetObjCArg(Retain)); - Instruction *Call = CS.getInstruction(); + ImmutableCallSite CS(GetObjCArg(Retain)); + const Instruction *Call = CS.getInstruction(); if (!Call) return; if (Call->getParent() != Retain->getParent()) return; // Check that the call is next to the retain. - BasicBlock::iterator I = Call; + BasicBlock::const_iterator I = Call; ++I; while (isNoopInstruction(I)) ++I; if (&*I != Retain) @@ -2176,25 +2162,24 @@ ObjCARCOpt::OptimizeRetainCall(Function &F, Instruction *Retain) { } /// OptimizeRetainRVCall - Turn objc_retainAutoreleasedReturnValue into -/// objc_retain if the operand is not a return value. Or, if it can be -/// paired with an objc_autoreleaseReturnValue, delete the pair and -/// return true. +/// objc_retain if the operand is not a return value. Or, if it can be paired +/// with an objc_autoreleaseReturnValue, delete the pair and return true. bool ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { // Check for the argument being from an immediately preceding call or invoke. - Value *Arg = GetObjCArg(RetainRV); - CallSite CS(Arg); - if (Instruction *Call = CS.getInstruction()) { + const Value *Arg = GetObjCArg(RetainRV); + ImmutableCallSite CS(Arg); + if (const Instruction *Call = CS.getInstruction()) { if (Call->getParent() == RetainRV->getParent()) { - BasicBlock::iterator I = Call; + BasicBlock::const_iterator I = Call; ++I; while (isNoopInstruction(I)) ++I; if (&*I == RetainRV) return false; - } else if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { + } else if (const InvokeInst *II = dyn_cast<InvokeInst>(Call)) { BasicBlock *RetainRVParent = RetainRV->getParent(); if (II->getNormalDest() == RetainRVParent) { - BasicBlock::iterator I = RetainRVParent->begin(); + BasicBlock::const_iterator I = RetainRVParent->begin(); while (isNoopInstruction(I)) ++I; if (&*I == RetainRV) return false; @@ -2422,7 +2407,8 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { // These can always be moved up. break; case IC_Release: - // These can't be moved across things that care about the retain count. + // These can't be moved across things that care about the retain + // count. 
FindDependencies(NeedsPositiveRetainCount, Arg, Inst->getParent(), Inst, DependingInstructions, Visited, PA); @@ -2504,13 +2490,14 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, for (; SI != SE; ++SI) { Sequence SuccSSeq = S_None; bool SuccSRRIKnownSafe = false; - // If VisitBottomUp has visited this successor, take what we know about it. - DenseMap<const BasicBlock *, BBState>::iterator BBI = BBStates.find(*SI); - if (BBI != BBStates.end()) { - const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg); - SuccSSeq = SuccS.GetSeq(); - SuccSRRIKnownSafe = SuccS.RRI.KnownSafe; - } + // If VisitBottomUp has pointer information for this successor, take + // what we know about it. + DenseMap<const BasicBlock *, BBState>::iterator BBI = + BBStates.find(*SI); + assert(BBI != BBStates.end()); + const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg); + SuccSSeq = SuccS.GetSeq(); + SuccSRRIKnownSafe = SuccS.RRI.KnownSafe; switch (SuccSSeq) { case S_None: case S_CanRelease: { @@ -2557,13 +2544,14 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, for (; SI != SE; ++SI) { Sequence SuccSSeq = S_None; bool SuccSRRIKnownSafe = false; - // If VisitBottomUp has visited this successor, take what we know about it. - DenseMap<const BasicBlock *, BBState>::iterator BBI = BBStates.find(*SI); - if (BBI != BBStates.end()) { - const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg); - SuccSSeq = SuccS.GetSeq(); - SuccSRRIKnownSafe = SuccS.RRI.KnownSafe; - } + // If VisitBottomUp has pointer information for this successor, take + // what we know about it. + DenseMap<const BasicBlock *, BBState>::iterator BBI = + BBStates.find(*SI); + assert(BBI != BBStates.end()); + const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg); + SuccSSeq = SuccS.GetSeq(); + SuccSRRIKnownSafe = SuccS.RRI.KnownSafe; switch (SuccSSeq) { case S_None: { if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) { @@ -2621,16 +2609,13 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, if (S.GetSeq() == S_Release || S.GetSeq() == S_MovableRelease) NestingDetected = true; - S.RRI.clear(); - MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind); - S.SetSeq(ReleaseMetadata ? S_MovableRelease : S_Release); + S.ResetSequenceProgress(ReleaseMetadata ? S_MovableRelease : S_Release); S.RRI.ReleaseMetadata = ReleaseMetadata; S.RRI.KnownSafe = S.IsKnownNested() || S.IsKnownIncremented(); S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall(); S.RRI.Calls.insert(Inst); - S.IncrementRefCount(); S.IncrementNestCount(); break; } @@ -2645,8 +2630,7 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, Arg = GetObjCArg(Inst); PtrState &S = MyStates.getPtrBottomUpState(Arg); - S.DecrementRefCount(); - S.SetAtLeastOneRefCount(); + S.SetKnownPositiveRefCount(); S.DecrementNestCount(); switch (S.GetSeq()) { @@ -2696,7 +2680,7 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, // Check for possible releases. if (CanAlterRefCount(Inst, Ptr, PA, Class)) { - S.DecrementRefCount(); + S.ClearRefCount(); switch (Seq) { case S_Use: S.SetSeq(S_CanRelease); @@ -2763,37 +2747,20 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, // Merge the states from each successor to compute the initial state // for the current block. 
-  const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
-  succ_const_iterator SI(TI), SE(TI, false);
-  if (SI == SE)
-    MyStates.SetAsExit();
-  else {
-    // If the terminator is an invoke marked with the
-    // clang.arc.no_objc_arc_exceptions metadata, the unwind edge can be
-    // ignored, for ARC purposes.
-    if (isa<InvokeInst>(TI) && TI->getMetadata(NoObjCARCExceptionsMDKind))
-      --SE;
-
-    do {
-      const BasicBlock *Succ = *SI++;
-      if (Succ == BB)
-        continue;
-      DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ);
-      // If we haven't seen this node yet, then we've found a CFG cycle.
-      // Be optimistic here; it's CheckForCFGHazards' job detect trouble.
-      if (I == BBStates.end())
-        continue;
-      MyStates.InitFromSucc(I->second);
-      while (SI != SE) {
-        Succ = *SI++;
-        if (Succ != BB) {
-          I = BBStates.find(Succ);
-          if (I != BBStates.end())
-            MyStates.MergeSucc(I->second);
-        }
-      }
-      break;
-    } while (SI != SE);
+  for (BBState::edge_iterator SI(MyStates.succ_begin()),
+       SE(MyStates.succ_end()); SI != SE; ++SI) {
+    const BasicBlock *Succ = *SI;
+    DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ);
+    assert(I != BBStates.end());
+    MyStates.InitFromSucc(I->second);
+    ++SI;
+    for (; SI != SE; ++SI) {
+      Succ = *SI;
+      I = BBStates.find(Succ);
+      assert(I != BBStates.end());
+      MyStates.MergeSucc(I->second);
+    }
+    break;
   }
 
   // Visit all the instructions, bottom-up.
@@ -2807,15 +2774,14 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
     NestingDetected |= VisitInstructionBottomUp(Inst, BB, Retains, MyStates);
   }
 
-  // If there's a predecessor with an invoke, visit the invoke as
-  // if it were part of this block, since we can't insert code after
-  // an invoke in its own block, and we don't want to split critical
-  // edges.
-  for (pred_iterator PI(BB), PE(BB, false); PI != PE; ++PI) {
+  // If there's a predecessor with an invoke, visit the invoke as if it were
+  // part of this block, since we can't insert code after an invoke in its own
+  // block, and we don't want to split critical edges.
+  for (BBState::edge_iterator PI(MyStates.pred_begin()),
+       PE(MyStates.pred_end()); PI != PE; ++PI) {
     BasicBlock *Pred = *PI;
-    TerminatorInst *PredTI = cast<TerminatorInst>(&Pred->back());
-    if (isa<InvokeInst>(PredTI))
-      NestingDetected |= VisitInstructionBottomUp(PredTI, BB, Retains, MyStates);
+    if (InvokeInst *II = dyn_cast<InvokeInst>(&Pred->back()))
+      NestingDetected |= VisitInstructionBottomUp(II, BB, Retains, MyStates);
   }
 
   return NestingDetected;
@@ -2855,25 +2821,23 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
       if (S.GetSeq() == S_Retain)
         NestingDetected = true;
 
-      S.SetSeq(S_Retain);
-      S.RRI.clear();
+      S.ResetSequenceProgress(S_Retain);
       S.RRI.IsRetainBlock = Class == IC_RetainBlock;
-      // Don't check S.IsKnownIncremented() here because it's not
-      // sufficient.
+      // Don't check S.IsKnownIncremented() here because it's not sufficient.
       S.RRI.KnownSafe = S.IsKnownNested();
       S.RRI.Calls.insert(Inst);
     }
 
-    S.SetAtLeastOneRefCount();
-    S.IncrementRefCount();
     S.IncrementNestCount();
-    return NestingDetected;
+
+    // A retain can be a potential use; proceed to the generic checking
+    // code below.
+    break;
  }
  case IC_Release: {
    Arg = GetObjCArg(Inst);
    PtrState &S = MyStates.getPtrTopDownState(Arg);
 
-    S.DecrementRefCount();
    S.DecrementNestCount();
 
    switch (S.GetSeq()) {
@@ -2920,7 +2884,7 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
 
    // Check for possible releases.
if (CanAlterRefCount(Inst, Ptr, PA, Class)) { - S.DecrementRefCount(); + S.ClearRefCount(); switch (Seq) { case S_Retain: S.SetSeq(S_CanRelease); @@ -2971,41 +2935,21 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, // Merge the states from each predecessor to compute the initial state // for the current block. - const_pred_iterator PI(BB), PE(BB, false); - if (PI == PE) - MyStates.SetAsEntry(); - else - do { - unsigned OperandNo = PI.getOperandNo(); - const Use &Us = PI.getUse(); - ++PI; - - // Skip invoke unwind edges on invoke instructions marked with - // clang.arc.no_objc_arc_exceptions. - if (const InvokeInst *II = dyn_cast<InvokeInst>(Us.getUser())) - if (OperandNo == II->getNumArgOperands() + 2 && - II->getMetadata(NoObjCARCExceptionsMDKind)) - continue; - - const BasicBlock *Pred = cast<TerminatorInst>(Us.getUser())->getParent(); - if (Pred == BB) - continue; - DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred); - // If we haven't seen this node yet, then we've found a CFG cycle. - // Be optimistic here; it's CheckForCFGHazards' job detect trouble. - if (I == BBStates.end() || !I->second.isVisitedTopDown()) - continue; - MyStates.InitFromPred(I->second); - while (PI != PE) { - Pred = *PI++; - if (Pred != BB) { - I = BBStates.find(Pred); - if (I != BBStates.end() && I->second.isVisitedTopDown()) - MyStates.MergePred(I->second); - } - } - break; - } while (PI != PE); + for (BBState::edge_iterator PI(MyStates.pred_begin()), + PE(MyStates.pred_end()); PI != PE; ++PI) { + const BasicBlock *Pred = *PI; + DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred); + assert(I != BBStates.end()); + MyStates.InitFromPred(I->second); + ++PI; + for (; PI != PE; ++PI) { + Pred = *PI; + I = BBStates.find(Pred); + assert(I != BBStates.end()); + MyStates.MergePred(I->second); + } + break; + } // Visit all the instructions, top-down. for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { @@ -3020,73 +2964,82 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, static void ComputePostOrders(Function &F, SmallVectorImpl<BasicBlock *> &PostOrder, - SmallVectorImpl<BasicBlock *> &ReverseCFGPostOrder) { - /// Backedges - Backedges detected in the DFS. These edges will be - /// ignored in the reverse-CFG DFS, so that loops with multiple exits will be - /// traversed in the desired order. - DenseSet<std::pair<BasicBlock *, BasicBlock *> > Backedges; - + SmallVectorImpl<BasicBlock *> &ReverseCFGPostOrder, + unsigned NoObjCARCExceptionsMDKind, + DenseMap<const BasicBlock *, BBState> &BBStates) { /// Visited - The visited set, for doing DFS walks. SmallPtrSet<BasicBlock *, 16> Visited; // Do DFS, computing the PostOrder. SmallPtrSet<BasicBlock *, 16> OnStack; SmallVector<std::pair<BasicBlock *, succ_iterator>, 16> SuccStack; + + // Functions always have exactly one entry block, and we don't have + // any other block that we treat like an entry block. 
   BasicBlock *EntryBB = &F.getEntryBlock();
-  SuccStack.push_back(std::make_pair(EntryBB, succ_begin(EntryBB)));
+  BBState &MyStates = BBStates[EntryBB];
+  MyStates.SetAsEntry();
+  TerminatorInst *EntryTI = cast<TerminatorInst>(&EntryBB->back());
+  SuccStack.push_back(std::make_pair(EntryBB, succ_iterator(EntryTI)));
   Visited.insert(EntryBB);
   OnStack.insert(EntryBB);
   do {
   dfs_next_succ:
-    TerminatorInst *TI = cast<TerminatorInst>(&SuccStack.back().first->back());
-    succ_iterator End = succ_iterator(TI, true);
-    while (SuccStack.back().second != End) {
-      BasicBlock *BB = *SuccStack.back().second++;
-      if (Visited.insert(BB)) {
-        SuccStack.push_back(std::make_pair(BB, succ_begin(BB)));
-        OnStack.insert(BB);
+    BasicBlock *CurrBB = SuccStack.back().first;
+    TerminatorInst *TI = cast<TerminatorInst>(&CurrBB->back());
+    succ_iterator SE(TI, false);
+
+    // If the terminator is an invoke marked with the
+    // clang.arc.no_objc_arc_exceptions metadata, the unwind edge can be
+    // ignored, for ARC purposes.
+    if (isa<InvokeInst>(TI) && TI->getMetadata(NoObjCARCExceptionsMDKind))
+      --SE;
+
+    while (SuccStack.back().second != SE) {
+      BasicBlock *SuccBB = *SuccStack.back().second++;
+      if (Visited.insert(SuccBB)) {
+        TerminatorInst *TI = cast<TerminatorInst>(&SuccBB->back());
+        SuccStack.push_back(std::make_pair(SuccBB, succ_iterator(TI)));
+        BBStates[CurrBB].addSucc(SuccBB);
+        BBState &SuccStates = BBStates[SuccBB];
+        SuccStates.addPred(CurrBB);
+        OnStack.insert(SuccBB);
         goto dfs_next_succ;
       }
-      if (OnStack.count(BB))
-        Backedges.insert(std::make_pair(SuccStack.back().first, BB));
+
+      if (!OnStack.count(SuccBB)) {
+        BBStates[CurrBB].addSucc(SuccBB);
+        BBStates[SuccBB].addPred(CurrBB);
+      }
     }
-    OnStack.erase(SuccStack.back().first);
-    PostOrder.push_back(SuccStack.pop_back_val().first);
+    OnStack.erase(CurrBB);
+    PostOrder.push_back(CurrBB);
+    SuccStack.pop_back();
   } while (!SuccStack.empty());
   Visited.clear();
 
-  // Compute the exits, which are the starting points for reverse-CFG DFS.
-  // This includes blocks where all the successors are backedges that
-  // we're skipping.
-  SmallVector<BasicBlock *, 4> Exits;
+  // Do reverse-CFG DFS, computing the reverse-CFG PostOrder.
+  // Functions may have many exits, and there are also blocks which we treat
+  // as exits due to ignored edges.
+  SmallVector<std::pair<BasicBlock *, BBState::edge_iterator>, 16> PredStack;
   for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
-    BasicBlock *BB = I;
-    TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
-    for (succ_iterator SI(TI), SE(TI, true); SI != SE; ++SI)
-      if (!Backedges.count(std::make_pair(BB, *SI)))
-        goto HasNonBackedgeSucc;
-    Exits.push_back(BB);
-  HasNonBackedgeSucc:;
-  }
+    BasicBlock *ExitBB = I;
+    BBState &MyStates = BBStates[ExitBB];
+    if (!MyStates.isExit())
+      continue;
 
-  // Do reverse-CFG DFS, computing the reverse-CFG PostOrder.
- SmallVector<std::pair<BasicBlock *, pred_iterator>, 16> PredStack; - for (SmallVectorImpl<BasicBlock *>::iterator I = Exits.begin(), E = Exits.end(); - I != E; ++I) { - BasicBlock *ExitBB = *I; - PredStack.push_back(std::make_pair(ExitBB, pred_begin(ExitBB))); + MyStates.SetAsExit(); + + PredStack.push_back(std::make_pair(ExitBB, MyStates.pred_begin())); Visited.insert(ExitBB); while (!PredStack.empty()) { reverse_dfs_next_succ: - pred_iterator End = pred_end(PredStack.back().first); - while (PredStack.back().second != End) { + BBState::edge_iterator PE = BBStates[PredStack.back().first].pred_end(); + while (PredStack.back().second != PE) { BasicBlock *BB = *PredStack.back().second++; - // Skip backedges detected in the forward-CFG DFS. - if (Backedges.count(std::make_pair(BB, PredStack.back().first))) - continue; if (Visited.insert(BB)) { - PredStack.push_back(std::make_pair(BB, pred_begin(BB))); + PredStack.push_back(std::make_pair(BB, BBStates[BB].pred_begin())); goto reverse_dfs_next_succ; } } @@ -3109,7 +3062,9 @@ ObjCARCOpt::Visit(Function &F, // function exit point, and we want to ignore selected cycle edges. SmallVector<BasicBlock *, 16> PostOrder; SmallVector<BasicBlock *, 16> ReverseCFGPostOrder; - ComputePostOrders(F, PostOrder, ReverseCFGPostOrder); + ComputePostOrders(F, PostOrder, ReverseCFGPostOrder, + NoObjCARCExceptionsMDKind, + BBStates); // Use reverse-postorder on the reverse CFG for bottom-up. bool BottomUpNestingDetected = false; @@ -3218,7 +3173,7 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> // not being managed by ObjC reference counting, so we can delete pairs // regardless of what possible decrements or uses lie between them. bool KnownSafe = isa<Constant>(Arg) || isa<AllocaInst>(Arg); - + // A constant pointer can't be pointing to an object on the heap. It may // be reference-counted, but it won't be deleted. if (const LoadInst *LI = dyn_cast<LoadInst>(Arg)) @@ -3379,6 +3334,7 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> // Ok, everything checks out and we're all set. Let's move some code! Changed = true; + assert(OldCount != 0 && "Unreachable code?"); AnyPairsCompletelyEliminated = NewCount == 0; NumRRs += OldCount - NewCount; MoveCalls(Arg, RetainsToMove, ReleasesToMove, @@ -3519,7 +3475,7 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { if (AllocaInst *Alloca = dyn_cast<AllocaInst>(Arg)) { for (Value::use_iterator UI = Alloca->use_begin(), UE = Alloca->use_end(); UI != UE; ++UI) { - Instruction *UserInst = cast<Instruction>(*UI); + const Instruction *UserInst = cast<Instruction>(*UI); switch (GetBasicInstructionClass(UserInst)) { case IC_InitWeak: case IC_StoreWeak: @@ -3533,8 +3489,18 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { for (Value::use_iterator UI = Alloca->use_begin(), UE = Alloca->use_end(); UI != UE; ) { CallInst *UserInst = cast<CallInst>(*UI++); - if (!UserInst->use_empty()) - UserInst->replaceAllUsesWith(UserInst->getArgOperand(0)); + switch (GetBasicInstructionClass(UserInst)) { + case IC_InitWeak: + case IC_StoreWeak: + // These functions return their second argument. + UserInst->replaceAllUsesWith(UserInst->getArgOperand(1)); + break; + case IC_DestroyWeak: + // No return value. 
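A self-contained sketch of the iterative post-order DFS that ComputePostOrders implements above, on a toy adjacency-list graph; the types are simplified stand-ins for the LLVM ones, and the backedge/ignored-edge handling is omitted.

#include <cstddef>
#include <utility>
#include <vector>

std::vector<int> postOrder(const std::vector<std::vector<int> > &succs,
                           int entry) {
  std::vector<int> order;
  std::vector<bool> visited(succs.size(), false);
  // Each stack entry pairs a node with the index of its next unexplored
  // successor, mirroring the (BasicBlock *, succ_iterator) SuccStack.
  std::vector<std::pair<int, std::size_t> > stack;
  stack.push_back(std::make_pair(entry, 0));
  visited[entry] = true;
  while (!stack.empty()) {
    int node = stack.back().first;
    if (stack.back().second < succs[node].size()) {
      int next = succs[node][stack.back().second++];
      if (!visited[next]) {
        visited[next] = true;
        stack.push_back(std::make_pair(next, 0));
      }
    } else {
      order.push_back(node);   // all successors finished: emit in post-order
      stack.pop_back();
    }
  }
  return order;
}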
+ break; + default: + llvm_unreachable("alloca really is used!"); + } UserInst->eraseFromParent(); } Alloca->eraseFromParent(); @@ -3602,8 +3568,7 @@ void ObjCARCOpt::OptimizeReturns(Function &F) { dyn_cast_or_null<CallInst>(*DependingInstructions.begin()); if (!Autorelease) goto next_block; - InstructionClass AutoreleaseClass = - GetBasicInstructionClass(Autorelease); + InstructionClass AutoreleaseClass = GetBasicInstructionClass(Autorelease); if (!IsAutorelease(AutoreleaseClass)) goto next_block; if (GetObjCArg(Autorelease) != Arg) @@ -3694,7 +3659,7 @@ bool ObjCARCOpt::doInitialization(Module &M) { // Intuitively, objc_retain and others are nocapture, however in practice // they are not, because they return their argument value. And objc_release - // calls finalizers. + // calls finalizers which can have arbitrary side effects. // These are initialized lazily. RetainRVCallee = 0; @@ -3746,8 +3711,8 @@ bool ObjCARCOpt::runOnFunction(Function &F) { while (OptimizeSequences(F)) {} // Optimizations if objc_autorelease is used. - if (UsedInThisFunction & - ((1 << IC_Autorelease) | (1 << IC_AutoreleaseRV))) + if (UsedInThisFunction & ((1 << IC_Autorelease) | + (1 << IC_AutoreleaseRV))) OptimizeReturns(F); return Changed; @@ -3795,7 +3760,7 @@ namespace { /// StoreStrongCalls - The set of inserted objc_storeStrong calls. If /// at the end of walking the function we have found no alloca /// instructions, these calls can be marked "tail". - DenseSet<CallInst *> StoreStrongCalls; + SmallPtrSet<CallInst *, 8> StoreStrongCalls; Constant *getStoreStrongCallee(Module *M); Constant *getRetainAutoreleaseCallee(Module *M); @@ -3846,13 +3811,11 @@ Constant *ObjCARCContract::getStoreStrongCallee(Module *M) { LLVMContext &C = M->getContext(); Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); Type *I8XX = PointerType::getUnqual(I8X); - std::vector<Type *> Params; - Params.push_back(I8XX); - Params.push_back(I8X); + Type *Params[] = { I8XX, I8X }; - AttrListPtr Attributes; - Attributes.addAttr(~0u, Attribute::NoUnwind); - Attributes.addAttr(1, Attribute::NoCapture); + AttrListPtr Attributes = AttrListPtr() + .addAttr(~0u, Attribute::NoUnwind) + .addAttr(1, Attribute::NoCapture); StoreStrongCallee = M->getOrInsertFunction( @@ -3867,12 +3830,9 @@ Constant *ObjCARCContract::getRetainAutoreleaseCallee(Module *M) { if (!RetainAutoreleaseCallee) { LLVMContext &C = M->getContext(); Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - std::vector<Type *> Params; - Params.push_back(I8X); - FunctionType *FTy = - FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttrListPtr Attributes; - Attributes.addAttr(~0u, Attribute::NoUnwind); + Type *Params[] = { I8X }; + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); RetainAutoreleaseCallee = M->getOrInsertFunction("objc_retainAutorelease", FTy, Attributes); } @@ -3883,12 +3843,9 @@ Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) { if (!RetainAutoreleaseRVCallee) { LLVMContext &C = M->getContext(); Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - std::vector<Type *> Params; - Params.push_back(I8X); - FunctionType *FTy = - FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttrListPtr Attributes; - Attributes.addAttr(~0u, Attribute::NoUnwind); + Type *Params[] = { I8X }; + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); 
RetainAutoreleaseRVCallee = M->getOrInsertFunction("objc_retainAutoreleaseReturnValue", FTy, Attributes); @@ -3896,8 +3853,7 @@ Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) { return RetainAutoreleaseRVCallee; } -/// ContractAutorelease - Merge an autorelease with a retain into a fused -/// call. +/// ContractAutorelease - Merge an autorelease with a retain into a fused call. bool ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease, InstructionClass Class, @@ -3958,18 +3914,41 @@ void ObjCARCContract::ContractRelease(Instruction *Release, BasicBlock *BB = Release->getParent(); if (Load->getParent() != BB) return; - // Walk down to find the store. + // Walk down to find the store and the release, which may be in either order. BasicBlock::iterator I = Load, End = BB->end(); ++I; AliasAnalysis::Location Loc = AA->getLocation(Load); - while (I != End && - (&*I == Release || - IsRetain(GetBasicInstructionClass(I)) || - !(AA->getModRefInfo(I, Loc) & AliasAnalysis::Mod))) - ++I; - StoreInst *Store = dyn_cast<StoreInst>(I); - if (!Store || !Store->isSimple()) return; - if (Store->getPointerOperand() != Loc.Ptr) return; + StoreInst *Store = 0; + bool SawRelease = false; + for (; !Store || !SawRelease; ++I) { + if (I == End) + return; + + Instruction *Inst = I; + if (Inst == Release) { + SawRelease = true; + continue; + } + + InstructionClass Class = GetBasicInstructionClass(Inst); + + // Unrelated retains are harmless. + if (IsRetain(Class)) + continue; + + if (Store) { + // The store is the point where we're going to put the objc_storeStrong, + // so make sure there are no uses after it. + if (CanUse(Inst, Load, PA, Class)) + return; + } else if (AA->getModRefInfo(Inst, Loc) & AliasAnalysis::Mod) { + // We are moving the load down to the store, so check for anything + // else which writes to the memory between the load and the store. + Store = dyn_cast<StoreInst>(Inst); + if (!Store || !Store->isSimple()) return; + if (Store->getPointerOperand() != Loc.Ptr) return; + } + } Value *New = StripPointerCastsAndObjCCalls(Store->getValueOperand()); @@ -4057,7 +4036,8 @@ bool ObjCARCContract::runOnFunction(Function &F) { // It seems that functions which "return twice" are also unsafe for the // "tail" argument, because they are setjmp, which could need to // return to an earlier stack state. - bool TailOkForStoreStrongs = !F.isVarArg() && !F.callsFunctionThatReturnsTwice(); + bool TailOkForStoreStrongs = !F.isVarArg() && + !F.callsFunctionThatReturnsTwice(); // For ObjC library calls which return their argument, replace uses of the // argument with uses of the call return value, if it dominates the use. This @@ -4087,8 +4067,22 @@ bool ObjCARCContract::runOnFunction(Function &F) { if (!RetainRVMarker) break; BasicBlock::iterator BBI = Inst; - --BBI; - while (isNoopInstruction(BBI)) --BBI; + BasicBlock *InstParent = Inst->getParent(); + + // Step up to see if the call immediately precedes the RetainRV call. + // If it's an invoke, we have to cross a block boundary. And we have + // to carefully dodge no-op instructions. 
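A hedged sketch of the load/store/release pattern that ContractRelease above matches; the runtime entry points are real ObjC runtime functions, but the void* signatures are simplified stand-ins for the id-typed originals, and the two forms are equivalent only modulo the retain of the incoming value, which objc_storeStrong performs itself.

extern "C" void objc_release(void *);
extern "C" void objc_storeStrong(void **, void *);

void beforeContraction(void **slot, void *newValue) {
  void *old = *slot;     // the load
  *slot = newValue;      // the store (newValue already retained upstream)
  objc_release(old);     // the release being contracted
}

void afterContraction(void **slot, void *newValue) {
  objc_storeStrong(slot, newValue);  // the fused form the pass emits
}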
+ do { + if (&*BBI == InstParent->begin()) { + BasicBlock *Pred = InstParent->getSinglePredecessor(); + if (!Pred) + goto decline_rv_optimization; + BBI = Pred->getTerminator(); + break; + } + --BBI; + } while (isNoopInstruction(BBI)); + if (&*BBI == GetObjCArg(Inst)) { Changed = true; InlineAsm *IA = @@ -4098,6 +4092,7 @@ bool ObjCARCContract::runOnFunction(Function &F) { /*Constraints=*/"", /*hasSideEffects=*/true); CallInst::Create(IA, "", Inst); } + decline_rv_optimization: break; } case IC_InitWeak: { @@ -4147,25 +4142,21 @@ bool ObjCARCContract::runOnFunction(Function &F) { // trivially dominate itself, which would lead us to rewriting its // argument in terms of its return value, which would lead to // infinite loops in GetObjCArg. - if (DT->isReachableFromEntry(U) && - DT->dominates(Inst, U)) { + if (DT->isReachableFromEntry(U) && DT->dominates(Inst, U)) { Changed = true; Instruction *Replacement = Inst; Type *UseTy = U.get()->getType(); if (PHINode *PHI = dyn_cast<PHINode>(U.getUser())) { // For PHI nodes, insert the bitcast in the predecessor block. - unsigned ValNo = - PHINode::getIncomingValueNumForOperand(OperandNo); - BasicBlock *BB = - PHI->getIncomingBlock(ValNo); + unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo); + BasicBlock *BB = PHI->getIncomingBlock(ValNo); if (Replacement->getType() != UseTy) Replacement = new BitCastInst(Replacement, UseTy, "", &BB->back()); // While we're here, rewrite all edges for this PHI, rather // than just one use at a time, to minimize the number of // bitcasts we emit. - for (unsigned i = 0, e = PHI->getNumIncomingValues(); - i != e; ++i) + for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) if (PHI->getIncomingBlock(i) == BB) { // Keep the UI iterator valid. if (&PHI->getOperandUse( @@ -4183,8 +4174,7 @@ bool ObjCARCContract::runOnFunction(Function &F) { } } - // If Arg is a no-op casted pointer, strip one level of casts and - // iterate. + // If Arg is a no-op casted pointer, strip one level of casts and iterate. if (const BitCastInst *BI = dyn_cast<BitCastInst>(Arg)) Arg = BI->getOperand(0); else if (isa<GEPOperator>(Arg) && @@ -4201,7 +4191,7 @@ bool ObjCARCContract::runOnFunction(Function &F) { // If this function has no escaping allocas or suspicious vararg usage, // objc_storeStrong calls can be marked with the "tail" keyword. 
if (TailOkForStoreStrongs) - for (DenseSet<CallInst *>::iterator I = StoreStrongCalls.begin(), + for (SmallPtrSet<CallInst *, 8>::iterator I = StoreStrongCalls.begin(), E = StoreStrongCalls.end(); I != E; ++I) (*I)->setTailCall(); StoreStrongCalls.clear(); diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index cb408a1..ffcf97c 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -26,21 +26,23 @@ #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" +#include "llvm/IRBuilder.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/Pass.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Assembly/Writer.h" #include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/DenseMap.h" #include <algorithm> using namespace llvm; -STATISTIC(NumLinear , "Number of insts linearized"); STATISTIC(NumChanged, "Number of insts reassociated"); STATISTIC(NumAnnihil, "Number of expr tree annihilated"); STATISTIC(NumFactor , "Number of multiplies factored"); @@ -70,13 +72,51 @@ static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) { } } #endif - + +namespace { + /// \brief Utility class representing a base and exponent pair which form one + /// factor of some product. + struct Factor { + Value *Base; + unsigned Power; + + Factor(Value *Base, unsigned Power) : Base(Base), Power(Power) {} + + /// \brief Sort factors by their Base. + struct BaseSorter { + bool operator()(const Factor &LHS, const Factor &RHS) { + return LHS.Base < RHS.Base; + } + }; + + /// \brief Compare factors for equal bases. + struct BaseEqual { + bool operator()(const Factor &LHS, const Factor &RHS) { + return LHS.Base == RHS.Base; + } + }; + + /// \brief Sort factors in descending order by their power. + struct PowerDescendingSorter { + bool operator()(const Factor &LHS, const Factor &RHS) { + return LHS.Power > RHS.Power; + } + }; + + /// \brief Compare factors for equal powers. 
+ struct PowerEqual { + bool operator()(const Factor &LHS, const Factor &RHS) { + return LHS.Power == RHS.Power; + } + }; + }; +} + namespace { class Reassociate : public FunctionPass { DenseMap<BasicBlock*, unsigned> RankMap; DenseMap<AssertingVH<Value>, unsigned> ValueRankMap; - SmallVector<WeakVH, 8> RedoInsts; - SmallVector<WeakVH, 8> DeadInsts; + SetVector<AssertingVH<Instruction> > RedoInsts; bool MadeChange; public: static char ID; // Pass identification, replacement for typeid @@ -92,18 +132,19 @@ namespace { private: void BuildRankMap(Function &F); unsigned getRank(Value *V); - Value *ReassociateExpression(BinaryOperator *I); - void RewriteExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops, - unsigned Idx = 0); + void ReassociateExpression(BinaryOperator *I); + void RewriteExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops); Value *OptimizeExpression(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops); Value *OptimizeAdd(Instruction *I, SmallVectorImpl<ValueEntry> &Ops); - void LinearizeExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops); - void LinearizeExpr(BinaryOperator *I); + bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops, + SmallVectorImpl<Factor> &Factors); + Value *buildMinimalMultiplyDAG(IRBuilder<> &Builder, + SmallVectorImpl<Factor> &Factors); + Value *OptimizeMul(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops); Value *RemoveFactorFromExpression(Value *V, Value *Factor); - void ReassociateInst(BasicBlock::iterator &BBI); - - void RemoveDeadBinaryOp(Value *V); + void EraseInst(Instruction *I); + void OptimizeInst(Instruction *I); }; } @@ -114,28 +155,24 @@ INITIALIZE_PASS(Reassociate, "reassociate", // Public interface to the Reassociate pass FunctionPass *llvm::createReassociatePass() { return new Reassociate(); } -void Reassociate::RemoveDeadBinaryOp(Value *V) { - Instruction *Op = dyn_cast<Instruction>(V); - if (!Op || !isa<BinaryOperator>(Op)) - return; - - Value *LHS = Op->getOperand(0), *RHS = Op->getOperand(1); - - ValueRankMap.erase(Op); - DeadInsts.push_back(Op); - RemoveDeadBinaryOp(LHS); - RemoveDeadBinaryOp(RHS); +/// isReassociableOp - Return true if V is an instruction of the specified +/// opcode and if it only has one use. +static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) { + if (V->hasOneUse() && isa<Instruction>(V) && + cast<Instruction>(V)->getOpcode() == Opcode) + return cast<BinaryOperator>(V); + return 0; } - static bool isUnmovableInstruction(Instruction *I) { if (I->getOpcode() == Instruction::PHI || + I->getOpcode() == Instruction::LandingPad || I->getOpcode() == Instruction::Alloca || I->getOpcode() == Instruction::Load || I->getOpcode() == Instruction::Invoke || (I->getOpcode() == Instruction::Call && !isa<DbgInfoIntrinsic>(I)) || - I->getOpcode() == Instruction::UDiv || + I->getOpcode() == Instruction::UDiv || I->getOpcode() == Instruction::SDiv || I->getOpcode() == Instruction::FDiv || I->getOpcode() == Instruction::URem || @@ -198,211 +235,570 @@ unsigned Reassociate::getRank(Value *V) { return ValueRankMap[I] = Rank; } -/// isReassociableOp - Return true if V is an instruction of the specified -/// opcode and if it only has one use. -static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) { - if ((V->hasOneUse() || V->use_empty()) && isa<Instruction>(V) && - cast<Instruction>(V)->getOpcode() == Opcode) - return cast<BinaryOperator>(V); - return 0; -} - /// LowerNegateToMultiply - Replace 0-X with X*-1. 
/// -static Instruction *LowerNegateToMultiply(Instruction *Neg, - DenseMap<AssertingVH<Value>, unsigned> &ValueRankMap) { +static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) { Constant *Cst = Constant::getAllOnesValue(Neg->getType()); - Instruction *Res = BinaryOperator::CreateMul(Neg->getOperand(1), Cst, "",Neg); - ValueRankMap.erase(Neg); + BinaryOperator *Res = + BinaryOperator::CreateMul(Neg->getOperand(1), Cst, "",Neg); + Neg->setOperand(1, Constant::getNullValue(Neg->getType())); // Drop use of op. Res->takeName(Neg); Neg->replaceAllUsesWith(Res); Res->setDebugLoc(Neg->getDebugLoc()); - Neg->eraseFromParent(); return Res; } -// Given an expression of the form '(A+B)+(D+C)', turn it into '(((A+B)+C)+D)'. -// Note that if D is also part of the expression tree that we recurse to -// linearize it as well. Besides that case, this does not recurse into A,B, or -// C. -void Reassociate::LinearizeExpr(BinaryOperator *I) { - BinaryOperator *LHS = cast<BinaryOperator>(I->getOperand(0)); - BinaryOperator *RHS = cast<BinaryOperator>(I->getOperand(1)); - assert(isReassociableOp(LHS, I->getOpcode()) && - isReassociableOp(RHS, I->getOpcode()) && - "Not an expression that needs linearization?"); - - DEBUG(dbgs() << "Linear" << *LHS << '\n' << *RHS << '\n' << *I << '\n'); - - // Move the RHS instruction to live immediately before I, avoiding breaking - // dominator properties. - RHS->moveBefore(I); - - // Move operands around to do the linearization. - I->setOperand(1, RHS->getOperand(0)); - RHS->setOperand(0, LHS); - I->setOperand(0, RHS); - - // Conservatively clear all the optional flags, which may not hold - // after the reassociation. - I->clearSubclassOptionalData(); - LHS->clearSubclassOptionalData(); - RHS->clearSubclassOptionalData(); - - ++NumLinear; - MadeChange = true; - DEBUG(dbgs() << "Linearized: " << *I << '\n'); - - // If D is part of this expression tree, tail recurse. - if (isReassociableOp(I->getOperand(1), I->getOpcode())) - LinearizeExpr(I); +/// CarmichaelShift - Returns k such that lambda(2^Bitwidth) = 2^k, where lambda +/// is the Carmichael function. This means that x^(2^k) === 1 mod 2^Bitwidth for +/// every odd x, i.e. x^(2^k) = 1 for every odd x in Bitwidth-bit arithmetic. +/// Note that 0 <= k < Bitwidth, and if Bitwidth > 3 then x^(2^k) = 0 for every +/// even x in Bitwidth-bit arithmetic. +static unsigned CarmichaelShift(unsigned Bitwidth) { + if (Bitwidth < 3) + return Bitwidth - 1; + return Bitwidth - 2; +} + +/// IncorporateWeight - Add the extra weight 'RHS' to the existing weight 'LHS', +/// reducing the combined weight using any special properties of the operation. +/// The existing weight LHS represents the computation X op X op ... op X where +/// X occurs LHS times. The combined weight represents X op X op ... op X with +/// X occurring LHS + RHS times. If op is "Xor" for example then the combined +/// operation is equivalent to X if LHS + RHS is odd, or 0 if LHS + RHS is even; +/// the routine returns 1 in LHS in the first case, and 0 in LHS in the second. +static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) { + // If we were working with infinite precision arithmetic then the combined + // weight would be LHS + RHS. 
But we are using finite precision arithmetic, + // and the APInt sum LHS + RHS may not be correct if it wraps (it is correct + // for nilpotent operations and addition, but not for idempotent operations + // and multiplication), so it is important to correctly reduce the combined + // weight back into range if wrapping would be wrong. + + // If RHS is zero then the weight didn't change. + if (RHS.isMinValue()) + return; + // If LHS is zero then the combined weight is RHS. + if (LHS.isMinValue()) { + LHS = RHS; + return; + } + // From this point on we know that neither LHS nor RHS is zero. + + if (Instruction::isIdempotent(Opcode)) { + // Idempotent means X op X === X, so any non-zero weight is equivalent to a + // weight of 1. Keeping weights at zero or one also means that wrapping is + // not a problem. + assert(LHS == 1 && RHS == 1 && "Weights not reduced!"); + return; // Return a weight of 1. + } + if (Instruction::isNilpotent(Opcode)) { + // Nilpotent means X op X === 0, so reduce weights modulo 2. + assert(LHS == 1 && RHS == 1 && "Weights not reduced!"); + LHS = 0; // 1 + 1 === 0 modulo 2. + return; + } + if (Opcode == Instruction::Add) { + // TODO: Reduce the weight by exploiting nsw/nuw? + LHS += RHS; + return; + } + + assert(Opcode == Instruction::Mul && "Unknown associative operation!"); + unsigned Bitwidth = LHS.getBitWidth(); + // If CM is the Carmichael number then a weight W satisfying W >= CM+Bitwidth + // can be replaced with W-CM. That's because x^W=x^(W-CM) for every Bitwidth + // bit number x, since either x is odd in which case x^CM = 1, or x is even in + // which case both x^W and x^(W - CM) are zero. By subtracting off multiples + // of CM like this weights can always be reduced to the range [0, CM+Bitwidth) + // which by a happy accident means that they can always be represented using + // Bitwidth bits. + // TODO: Reduce the weight by exploiting nsw/nuw? (Could do much better than + // the Carmichael number). + if (Bitwidth > 3) { + /// CM - The value of Carmichael's lambda function. + APInt CM = APInt::getOneBitSet(Bitwidth, CarmichaelShift(Bitwidth)); + // Any weight W >= Threshold can be replaced with W - CM. + APInt Threshold = CM + Bitwidth; + assert(LHS.ult(Threshold) && RHS.ult(Threshold) && "Weights not reduced!"); + // For Bitwidth 4 or more the following sum does not overflow. + LHS += RHS; + while (LHS.uge(Threshold)) + LHS -= CM; + } else { + // To avoid problems with overflow do everything the same as above but using + // a larger type. + unsigned CM = 1U << CarmichaelShift(Bitwidth); + unsigned Threshold = CM + Bitwidth; + assert(LHS.getZExtValue() < Threshold && RHS.getZExtValue() < Threshold && + "Weights not reduced!"); + unsigned Total = LHS.getZExtValue() + RHS.getZExtValue(); + while (Total >= Threshold) + Total -= CM; + LHS = Total; + } } +/// EvaluateRepeatedConstant - Compute C op C op ... op C where the constant C +/// is repeated Weight times. +static Constant *EvaluateRepeatedConstant(unsigned Opcode, Constant *C, + APInt Weight) { + // For addition the result can be efficiently computed as the product of the + // constant and the weight. + if (Opcode == Instruction::Add) + return ConstantExpr::getMul(C, ConstantInt::get(C->getContext(), Weight)); + + // The weight might be huge, so compute by repeated squaring to ensure that + // compile time is proportional to the logarithm of the weight. + Constant *Result = 0; + Constant *Power = C; // Successively C, C op C, (C op C) op (C op C) etc. + // Visit the bits in Weight. 
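// (Editor's illustrative aside, not part of the patch.) The loop below is
// standard exponentiation by squaring: Power runs through C, C^2, C^4, ...
// and is folded into Result whenever the corresponding bit of Weight is set.
// For Weight = 5 = 0b101 it forms Result = C * C^4 = C^5, so the number of
// operations is logarithmic in Weight rather than linear.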
+ while (Weight != 0) { + // If the current bit in Weight is non-zero do Result = Result op Power. + if (Weight[0]) + Result = Result ? ConstantExpr::get(Opcode, Result, Power) : Power; + // Move on to the next bit if any more are non-zero. + Weight = Weight.lshr(1); + if (Weight.isMinValue()) + break; + // Square the power. + Power = ConstantExpr::get(Opcode, Power, Power); + } + + assert(Result && "Only positive weights supported!"); + return Result; +} -/// LinearizeExprTree - Given an associative binary expression tree, traverse -/// all of the uses putting it into canonical form. This forces a left-linear -/// form of the expression (((a+b)+c)+d), and collects information about the -/// rank of the non-tree operands. +typedef std::pair<Value*, APInt> RepeatedValue; + +/// LinearizeExprTree - Given an associative binary expression, return the leaf +/// nodes in Ops along with their weights (how many times the leaf occurs). The +/// original expression is the same as +/// (Ops[0].first op Ops[0].first op ... Ops[0].first) <- Ops[0].second times +/// op +/// (Ops[1].first op Ops[1].first op ... Ops[1].first) <- Ops[1].second times +/// op +/// ... +/// op +/// (Ops[N].first op Ops[N].first op ... Ops[N].first) <- Ops[N].second times +/// +/// Note that the values Ops[0].first, ..., Ops[N].first are all distinct, and +/// they are all non-constant except possibly for the last one, which if it is +/// constant will have weight one (Ops[N].second === 1). +/// +/// This routine may modify the function, in which case it returns 'true'. The +/// changes it makes may well be destructive, changing the value computed by 'I' +/// to something completely different. Thus if the routine returns 'true' then +/// you MUST either replace I with a new expression computed from the Ops array, +/// or use RewriteExprTree to put the values back in. +/// +/// A leaf node is either not a binary operation of the same kind as the root +/// node 'I' (i.e. is not a binary operator at all, or is, but with a different +/// opcode), or is the same kind of binary operator but has a use which either +/// does not belong to the expression, or does belong to the expression but is +/// a leaf node. Every leaf node has at least one use that is a non-leaf node +/// of the expression, while for non-leaf nodes (except for the root 'I') every +/// use is a non-leaf node of the expression. +/// +/// For example: +/// expression graph node names +/// +/// + | I +/// / \ | +/// + + | A, B +/// / \ / \ | +/// * + * | C, D, E +/// / \ / \ / \ | +/// + * | F, G /// -/// NOTE: These intentionally destroys the expression tree operands (turning -/// them into undef values) to reduce #uses of the values. This means that the -/// caller MUST use something like RewriteExprTree to put the values back in. +/// The leaf nodes are C, E, F and G. The Ops array will contain (maybe not in +/// that order) (C, 1), (E, 1), (F, 2), (G, 2). /// -void Reassociate::LinearizeExprTree(BinaryOperator *I, - SmallVectorImpl<ValueEntry> &Ops) { - Value *LHS = I->getOperand(0), *RHS = I->getOperand(1); +/// The expression is maximal: if some instruction is a binary operator of the +/// same kind as 'I', and all of its uses are non-leaf nodes of the expression, +/// then the instruction also belongs to the expression, is not a leaf node of +/// it, and its operands also belong to the expression (but may be leaf nodes). 
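/// (Editor's illustrative aside.) As a concrete instance of the weights: for
/// I = X + X with X = A + B, where X has no uses other than the two in I, the
/// routine returns (A, 2) and (B, 2), because each of A and B is reachable
/// from I along exactly two paths.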
+/// +/// NOTE: This routine will set operands of non-leaf non-root nodes to undef in +/// order to ensure that every non-root node in the expression has *exactly one* +/// use by a non-leaf node of the expression. This destruction means that the +/// caller MUST either replace 'I' with a new expression or use something like +/// RewriteExprTree to put the values back in if the routine indicates that it +/// made a change by returning 'true'. +/// +/// In the above example either the right operand of A or the left operand of B +/// will be replaced by undef. If it is B's operand then this gives: +/// +/// + | I +/// / \ | +/// + + | A, B - operand of B replaced with undef +/// / \ \ | +/// * + * | C, D, E +/// / \ / \ / \ | +/// + * | F, G +/// +/// Note that such undef operands can only be reached by passing through 'I'. +/// For example, if you visit operands recursively starting from a leaf node +/// then you will never see such an undef operand unless you get back to 'I', +/// which requires passing through a phi node. +/// +/// Note that this routine may also mutate binary operators of the wrong type +/// that have all uses inside the expression (i.e. only used by non-leaf nodes +/// of the expression) if it can turn them into binary operators of the right +/// type and thus make the expression bigger. + +static bool LinearizeExprTree(BinaryOperator *I, + SmallVectorImpl<RepeatedValue> &Ops) { + DEBUG(dbgs() << "LINEARIZE: " << *I << '\n'); + unsigned Bitwidth = I->getType()->getScalarType()->getPrimitiveSizeInBits(); unsigned Opcode = I->getOpcode(); + assert(Instruction::isAssociative(Opcode) && + Instruction::isCommutative(Opcode) && + "Expected an associative and commutative operation!"); + // If we see an absorbing element then the entire expression must be equal to + // it. For example, if this is a multiplication expression and zero occurs as + // an operand somewhere in it then the result of the expression must be zero. + Constant *Absorber = ConstantExpr::getBinOpAbsorber(Opcode, I->getType()); + + // Visit all operands of the expression, keeping track of their weight (the + // number of paths from the expression root to the operand, or if you like + // the number of times that operand occurs in the linearized expression). + // For example, if I = X + A, where X = A + B, then I, X and B have weight 1 + // while A has weight two. + + // Worklist of non-leaf nodes (their operands are in the expression too) along + // with their weights, representing a certain number of paths to the operator. + // If an operator occurs in the worklist multiple times then we found multiple + // ways to get to it. + SmallVector<std::pair<BinaryOperator*, APInt>, 8> Worklist; // (Op, Weight) + Worklist.push_back(std::make_pair(I, APInt(Bitwidth, 1))); + bool MadeChange = false; + + // Leaves of the expression are values that either aren't the right kind of + // operation (eg: a constant, or a multiply in an add tree), or are, but have + // some uses that are not inside the expression. For example, in I = X + X, + // X = A + B, the value X has two uses (by I) that are in the expression. If + // X has any other uses, for example in a return instruction, then we consider + // X to be a leaf, and won't analyze it further. When we first visit a value, + // if it has more than one use then at first we conservatively consider it to + // be a leaf. Later, as the expression is explored, we may discover some more + // uses of the value from inside the expression. 
If all uses turn out to be + // from within the expression (and the value is a binary operator of the right + // kind) then the value is no longer considered to be a leaf, and its operands + // are explored. + + // Leaves - Keeps track of the set of putative leaves as well as the number of + // paths to each leaf seen so far. + typedef DenseMap<Value*, APInt> LeafMap; + LeafMap Leaves; // Leaf -> Total weight so far. + SmallVector<Value*, 8> LeafOrder; // Ensure deterministic leaf output order. + +#ifndef NDEBUG + SmallPtrSet<Value*, 8> Visited; // For sanity checking the iteration scheme. +#endif + while (!Worklist.empty()) { + std::pair<BinaryOperator*, APInt> P = Worklist.pop_back_val(); + I = P.first; // We examine the operands of this binary operator. + + for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx) { // Visit operands. + Value *Op = I->getOperand(OpIdx); + APInt Weight = P.second; // Number of paths to this operand. + DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n"); + assert(!Op->use_empty() && "No uses, so how did we get to it?!"); + + // If the expression contains an absorbing element then there is no need + // to analyze it further: it must evaluate to the absorbing element. + if (Op == Absorber && !Weight.isMinValue()) { + Ops.push_back(std::make_pair(Absorber, APInt(Bitwidth, 1))); + return MadeChange; + } - // First step, linearize the expression if it is in ((A+B)+(C+D)) form. - BinaryOperator *LHSBO = isReassociableOp(LHS, Opcode); - BinaryOperator *RHSBO = isReassociableOp(RHS, Opcode); + // If this is a binary operation of the right kind with only one use then + // add its operands to the expression. + if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) { + assert(Visited.insert(Op) && "Not first visit!"); + DEBUG(dbgs() << "DIRECT ADD: " << *Op << " (" << Weight << ")\n"); + Worklist.push_back(std::make_pair(BO, Weight)); + continue; + } - // If this is a multiply expression tree and it contains internal negations, - // transform them into multiplies by -1 so they can be reassociated. - if (I->getOpcode() == Instruction::Mul) { - if (!LHSBO && LHS->hasOneUse() && BinaryOperator::isNeg(LHS)) { - LHS = LowerNegateToMultiply(cast<Instruction>(LHS), ValueRankMap); - LHSBO = isReassociableOp(LHS, Opcode); - } - if (!RHSBO && RHS->hasOneUse() && BinaryOperator::isNeg(RHS)) { - RHS = LowerNegateToMultiply(cast<Instruction>(RHS), ValueRankMap); - RHSBO = isReassociableOp(RHS, Opcode); + // Appears to be a leaf. Is the operand already in the set of leaves? + LeafMap::iterator It = Leaves.find(Op); + if (It == Leaves.end()) { + // Not in the leaf map. Must be the first time we saw this operand. + assert(Visited.insert(Op) && "Not first visit!"); + if (!Op->hasOneUse()) { + // This value has uses not accounted for by the expression, so it is + // not safe to modify. Mark it as being a leaf. + DEBUG(dbgs() << "ADD USES LEAF: " << *Op << " (" << Weight << ")\n"); + LeafOrder.push_back(Op); + Leaves[Op] = Weight; + continue; + } + // No uses outside the expression, try morphing it. + } else if (It != Leaves.end()) { + // Already in the leaf map. + assert(Visited.count(Op) && "In leaf map but not visited!"); + + // Update the number of paths to the leaf. + IncorporateWeight(It->second, Weight, Opcode); + + // The leaf already has one use from inside the expression. As we want + // exactly one such use, drop this new use of the leaf. 
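// (Editor's note.) Overwriting the operand with undef just below is safe
// only because of the contract documented above LinearizeExprTree: once the
// routine reports a change, the caller must rewrite 'I', e.g. via
// RewriteExprTree, which re-populates these operands.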
+ assert(!Op->hasOneUse() && "Only one use, but we got here twice!"); + I->setOperand(OpIdx, UndefValue::get(I->getType())); + MadeChange = true; + + // If the leaf is a binary operation of the right kind and we now see + // that its multiple original uses were in fact all by nodes belonging + // to the expression, then no longer consider it to be a leaf and add + // its operands to the expression. + if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) { + DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n"); + Worklist.push_back(std::make_pair(BO, It->second)); + Leaves.erase(It); + continue; + } + + // If we still have uses that are not accounted for by the expression + // then it is not safe to modify the value. + if (!Op->hasOneUse()) + continue; + + // No uses outside the expression, try morphing it. + Weight = It->second; + Leaves.erase(It); // Since the value may be morphed below. + } + + // At this point we have a value which, first of all, is not a binary + // expression of the right kind, and secondly, is only used inside the + // expression. This means that it can safely be modified. See if we + // can usefully morph it into an expression of the right kind. + assert((!isa<Instruction>(Op) || + cast<Instruction>(Op)->getOpcode() != Opcode) && + "Should have been handled above!"); + assert(Op->hasOneUse() && "Has uses outside the expression tree!"); + + // If this is a multiply expression, turn any internal negations into + // multiplies by -1 so they can be reassociated. + BinaryOperator *BO = dyn_cast<BinaryOperator>(Op); + if (Opcode == Instruction::Mul && BO && BinaryOperator::isNeg(BO)) { + DEBUG(dbgs() << "MORPH LEAF: " << *Op << " (" << Weight << ") TO "); + BO = LowerNegateToMultiply(BO); + DEBUG(dbgs() << *BO << '\n'); + Worklist.push_back(std::make_pair(BO, Weight)); + MadeChange = true; + continue; + } + + // Failed to morph into an expression of the right type. This really is + // a leaf. + DEBUG(dbgs() << "ADD LEAF: " << *Op << " (" << Weight << ")\n"); + assert(!isReassociableOp(Op, Opcode) && "Value was morphed?"); + LeafOrder.push_back(Op); + Leaves[Op] = Weight; } } - if (!LHSBO) { - if (!RHSBO) { - // Neither the LHS or RHS as part of the tree, thus this is a leaf. As - // such, just remember these operands and their rank. - Ops.push_back(ValueEntry(getRank(LHS), LHS)); - Ops.push_back(ValueEntry(getRank(RHS), RHS)); - - // Clear the leaves out. - I->setOperand(0, UndefValue::get(I->getType())); - I->setOperand(1, UndefValue::get(I->getType())); - return; + // The leaves, repeated according to their weights, represent the linearized + // form of the expression. + Constant *Cst = 0; // Accumulate constants here. + for (unsigned i = 0, e = LeafOrder.size(); i != e; ++i) { + Value *V = LeafOrder[i]; + LeafMap::iterator It = Leaves.find(V); + if (It == Leaves.end()) + // Node initially thought to be a leaf wasn't. + continue; + assert(!isReassociableOp(V, Opcode) && "Shouldn't be a leaf!"); + APInt Weight = It->second; + if (Weight.isMinValue()) + // Leaf already output or weight reduction eliminated it. + continue; + // Ensure the leaf is only output once. + It->second = 0; + // Glob all constants together into Cst. + if (Constant *C = dyn_cast<Constant>(V)) { + C = EvaluateRepeatedConstant(Opcode, C, Weight); + Cst = Cst ?
ConstantExpr::get(Opcode, Cst, C) : C; + continue; } - - // Turn X+(Y+Z) -> (Y+Z)+X - std::swap(LHSBO, RHSBO); - std::swap(LHS, RHS); - bool Success = !I->swapOperands(); - assert(Success && "swapOperands failed"); - (void)Success; - MadeChange = true; - } else if (RHSBO) { - // Turn (A+B)+(C+D) -> (((A+B)+C)+D). This guarantees the RHS is not - // part of the expression tree. - LinearizeExpr(I); - LHS = LHSBO = cast<BinaryOperator>(I->getOperand(0)); - RHS = I->getOperand(1); - RHSBO = 0; + // Add non-constant + Ops.push_back(std::make_pair(V, Weight)); } - // Okay, now we know that the LHS is a nested expression and that the RHS is - // not. Perform reassociation. - assert(!isReassociableOp(RHS, Opcode) && "LinearizeExpr failed!"); - - // Move LHS right before I to make sure that the tree expression dominates all - // values. - LHSBO->moveBefore(I); + // Add any constants back into Ops, all globbed together and reduced to having + // weight 1 for the convenience of users. + Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, I->getType()); + if (Cst && Cst != Identity) { + // If combining multiple constants resulted in the absorber then the entire + // expression must evaluate to the absorber. + if (Cst == Absorber) + Ops.clear(); + Ops.push_back(std::make_pair(Cst, APInt(Bitwidth, 1))); + } - // Linearize the expression tree on the LHS. - LinearizeExprTree(LHSBO, Ops); + // For nilpotent operations or addition there may be no operands, for example + // because the expression was "X xor X" or consisted of 2^Bitwidth additions: + // in both cases the weight reduces to 0 causing the value to be skipped. + if (Ops.empty()) { + assert(Identity && "Associative operation without identity!"); + Ops.push_back(std::make_pair(Identity, APInt(Bitwidth, 1))); + } - // Remember the RHS operand and its rank. - Ops.push_back(ValueEntry(getRank(RHS), RHS)); - - // Clear the RHS leaf out. - I->setOperand(1, UndefValue::get(I->getType())); + return MadeChange; } // RewriteExprTree - Now that the operands for this expression tree are -// linearized and optimized, emit them in-order. This function is written to be -// tail recursive. +// linearized and optimized, emit them in-order. void Reassociate::RewriteExprTree(BinaryOperator *I, - SmallVectorImpl<ValueEntry> &Ops, - unsigned i) { - if (i+2 == Ops.size()) { - if (I->getOperand(0) != Ops[i].Op || - I->getOperand(1) != Ops[i+1].Op) { - Value *OldLHS = I->getOperand(0); - DEBUG(dbgs() << "RA: " << *I << '\n'); - I->setOperand(0, Ops[i].Op); - I->setOperand(1, Ops[i+1].Op); - - // Clear all the optional flags, which may not hold after the - // reassociation if the expression involved more than just this operation. - if (Ops.size() != 2) - I->clearSubclassOptionalData(); - - DEBUG(dbgs() << "TO: " << *I << '\n'); + SmallVectorImpl<ValueEntry> &Ops) { + assert(Ops.size() > 1 && "Single values should be used directly!"); + + // Since our optimizations should never increase the number of operations, the new + // expression can usually be written by reusing the existing binary operators + // from the original expression tree, without creating any new instructions, + // though the rewritten expression may have a completely different topology. + // We take care to not change anything if the new expression will be the same + // as the original. If more than trivial changes (like commuting operands) + // were made then we are obliged to clear out any optional subclass data like + // nsw flags.
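// (Editor's illustrative aside, not part of the patch.) For example, with an
// original tree I = (A*B)*(C*D) and Ops = [D, C, B, A] (highest rank first),
// the loop below re-emits the left-linear form I = ((B*A)*C)*D by overwriting
// the operands of I and of the two inner multiplies, so no new instructions
// are needed in the common case.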
+ + /// NodesToRewrite - Nodes from the original expression available for writing + /// the new expression into. + SmallVector<BinaryOperator*, 8> NodesToRewrite; + unsigned Opcode = I->getOpcode(); + BinaryOperator *Op = I; + + // ExpressionChanged - Non-null if the rewritten expression differs from the + // original in some non-trivial way, requiring the clearing of optional flags. + // Flags are cleared from the operator in ExpressionChanged up to I inclusive. + BinaryOperator *ExpressionChanged = 0; + for (unsigned i = 0; ; ++i) { + // The last operation (which comes earliest in the IR) is special as both + // operands will come from Ops, rather than just one with the other being + // a subexpression. + if (i+2 == Ops.size()) { + Value *NewLHS = Ops[i].Op; + Value *NewRHS = Ops[i+1].Op; + Value *OldLHS = Op->getOperand(0); + Value *OldRHS = Op->getOperand(1); + + if (NewLHS == OldLHS && NewRHS == OldRHS) + // Nothing changed, leave it alone. + break; + + if (NewLHS == OldRHS && NewRHS == OldLHS) { + // The order of the operands was reversed. Swap them. + DEBUG(dbgs() << "RA: " << *Op << '\n'); + Op->swapOperands(); + DEBUG(dbgs() << "TO: " << *Op << '\n'); + MadeChange = true; + ++NumChanged; + break; + } + + // The new operation differs non-trivially from the original. Overwrite + // the old operands with the new ones. + DEBUG(dbgs() << "RA: " << *Op << '\n'); + if (NewLHS != OldLHS) { + if (BinaryOperator *BO = isReassociableOp(OldLHS, Opcode)) + NodesToRewrite.push_back(BO); + Op->setOperand(0, NewLHS); + } + if (NewRHS != OldRHS) { + if (BinaryOperator *BO = isReassociableOp(OldRHS, Opcode)) + NodesToRewrite.push_back(BO); + Op->setOperand(1, NewRHS); + } + DEBUG(dbgs() << "TO: " << *Op << '\n'); + + ExpressionChanged = Op; + MadeChange = true; + ++NumChanged; + + break; + } + + // Not the last operation. The left-hand side will be a sub-expression + // while the right-hand side will be the current element of Ops. + Value *NewRHS = Ops[i].Op; + if (NewRHS != Op->getOperand(1)) { + DEBUG(dbgs() << "RA: " << *Op << '\n'); + if (NewRHS == Op->getOperand(0)) { + // The new right-hand side was already present as the left operand. If + // we are lucky then swapping the operands will sort out both of them. + Op->swapOperands(); + } else { + // Overwrite with the new right-hand side. + if (BinaryOperator *BO = isReassociableOp(Op->getOperand(1), Opcode)) + NodesToRewrite.push_back(BO); + Op->setOperand(1, NewRHS); + ExpressionChanged = Op; + } + DEBUG(dbgs() << "TO: " << *Op << '\n'); MadeChange = true; ++NumChanged; - - // If we reassociated a tree to fewer operands (e.g. (1+a+2) -> (a+3) - // delete the extra, now dead, nodes. - RemoveDeadBinaryOp(OldLHS); } - return; - } - assert(i+2 < Ops.size() && "Ops index out of range!"); - if (I->getOperand(1) != Ops[i].Op) { - DEBUG(dbgs() << "RA: " << *I << '\n'); - I->setOperand(1, Ops[i].Op); + // Now deal with the left-hand side. If this is already an operation node + // from the original expression then just rewrite the rest of the expression + // into it. + if (BinaryOperator *BO = isReassociableOp(Op->getOperand(0), Opcode)) { + Op = BO; + continue; + } - // Conservatively clear all the optional flags, which may not hold - // after the reassociation. - I->clearSubclassOptionalData(); + // Otherwise, grab a spare node from the original expression and use that as + // the left-hand side. If there are no nodes left then the optimizers made + // an expression with more nodes than the original! 
This usually means that + // they did something stupid but it might mean that the problem was just too + // hard (finding the minimal number of multiplications needed to realize a + // multiplication expression is NP-complete). Whatever the reason, smart or + // stupid, create a new node if there are none left. + BinaryOperator *NewOp; + if (NodesToRewrite.empty()) { + Constant *Undef = UndefValue::get(I->getType()); + NewOp = BinaryOperator::Create(Instruction::BinaryOps(Opcode), + Undef, Undef, "", I); + } else { + NewOp = NodesToRewrite.pop_back_val(); + } - DEBUG(dbgs() << "TO: " << *I << '\n'); + DEBUG(dbgs() << "RA: " << *Op << '\n'); + Op->setOperand(0, NewOp); + DEBUG(dbgs() << "TO: " << *Op << '\n'); + ExpressionChanged = Op; MadeChange = true; ++NumChanged; + Op = NewOp; } - - BinaryOperator *LHS = cast<BinaryOperator>(I->getOperand(0)); - assert(LHS->getOpcode() == I->getOpcode() && - "Improper expression tree!"); - - // Compactify the tree instructions together with each other to guarantee - // that the expression tree is dominated by all of Ops. - LHS->moveBefore(I); - RewriteExprTree(LHS, Ops, i+1); -} - + // If the expression changed non-trivially then clear out all subclass data + // starting from the operator specified in ExpressionChanged, and compactify + // the operators to just before the expression root to guarantee that the + // expression tree is dominated by all of Ops. + if (ExpressionChanged) + do { + ExpressionChanged->clearSubclassOptionalData(); + if (ExpressionChanged == I) + break; + ExpressionChanged->moveBefore(I); + ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->use_begin()); + } while (1); + + // Throw away any left over nodes from the original expression. + for (unsigned i = 0, e = NodesToRewrite.size(); i != e; ++i) + RedoInsts.insert(NodesToRewrite[i]); +} -// NegateValue - Insert instructions before the instruction pointed to by BI, -// that computes the negative version of the value specified. The negative -// version of the value is returned, and BI is left pointing at the instruction -// that should be processed next by the reassociation pass. -// +/// NegateValue - Insert instructions before the instruction pointed to by BI, +/// that computes the negative version of the value specified. The negative +/// version of the value is returned, and BI is left pointing at the instruction +/// that should be processed next by the reassociation pass. static Value *NegateValue(Value *V, Instruction *BI) { if (Constant *C = dyn_cast<Constant>(V)) return ConstantExpr::getNeg(C); - + // We are trying to expose opportunity for reassociation. One of the things // that we want to do to achieve this is to push a negation as deep into an // expression chain as possible, to expose the add instructions. In practice, @@ -412,22 +808,21 @@ static Value *NegateValue(Value *V, Instruction *BI) { // the constants. We assume that instcombine will clean up the mess later if // we introduce tons of unnecessary negation instructions. // - if (Instruction *I = dyn_cast<Instruction>(V)) - if (I->getOpcode() == Instruction::Add && I->hasOneUse()) { - // Push the negates through the add. - I->setOperand(0, NegateValue(I->getOperand(0), BI)); - I->setOperand(1, NegateValue(I->getOperand(1), BI)); - - // We must move the add instruction here, because the neg instructions do - // not dominate the old add instruction in general. By moving it, we are - // assured that the neg instructions we just inserted dominate the - // instruction we are about to insert after them.
- // - I->moveBefore(BI); - I->setName(I->getName()+".neg"); - return I; - } - + if (BinaryOperator *I = isReassociableOp(V, Instruction::Add)) { + // Push the negates through the add. + I->setOperand(0, NegateValue(I->getOperand(0), BI)); + I->setOperand(1, NegateValue(I->getOperand(1), BI)); + + // We must move the add instruction here, because the neg instructions do + // not dominate the old add instruction in general. By moving it, we are + // assured that the neg instructions we just inserted dominate the + // instruction we are about to insert after them. + // + I->moveBefore(BI); + I->setName(I->getName()+".neg"); + return I; + } + // Okay, we need to materialize a negated version of V with an instruction. // Scan the use lists of V to see if we have one already. for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;++UI){ @@ -443,7 +838,7 @@ static Value *NegateValue(Value *V, Instruction *BI) { // Verify that the negate is in this function, V might be a constant expr. if (TheNeg->getParent()->getParent() != BI->getParent()->getParent()) continue; - + BasicBlock::iterator InsertPt; if (Instruction *InstInput = dyn_cast<Instruction>(V)) { if (InvokeInst *II = dyn_cast<InvokeInst>(InstInput)) { @@ -471,7 +866,7 @@ static bool ShouldBreakUpSubtract(Instruction *Sub) { // If this is a negation, we can't split it up! if (BinaryOperator::isNeg(Sub)) return false; - + // Don't bother to break this up unless either the LHS is an associable add or // subtract or if this is only used by one. if (isReassociableOp(Sub->getOperand(0), Instruction::Add) || @@ -480,19 +875,18 @@ static bool ShouldBreakUpSubtract(Instruction *Sub) { if (isReassociableOp(Sub->getOperand(1), Instruction::Add) || isReassociableOp(Sub->getOperand(1), Instruction::Sub)) return true; - if (Sub->hasOneUse() && + if (Sub->hasOneUse() && (isReassociableOp(Sub->use_back(), Instruction::Add) || isReassociableOp(Sub->use_back(), Instruction::Sub))) return true; - + return false; } /// BreakUpSubtract - If we have (X-Y), and if either X is an add, or if this is /// only used by an add, transform this into (X+(0-Y)) to promote better /// reassociation. -static Instruction *BreakUpSubtract(Instruction *Sub, - DenseMap<AssertingVH<Value>, unsigned> &ValueRankMap) { +static BinaryOperator *BreakUpSubtract(Instruction *Sub) { // Convert a subtract into an add and a neg instruction. This allows sub // instructions to be commuted with other add instructions. // @@ -500,15 +894,15 @@ static Instruction *BreakUpSubtract(Instruction *Sub, // and set it as the RHS of the add instruction we just made. // Value *NegVal = NegateValue(Sub->getOperand(1), Sub); - Instruction *New = + BinaryOperator *New = BinaryOperator::CreateAdd(Sub->getOperand(0), NegVal, "", Sub); + Sub->setOperand(0, Constant::getNullValue(Sub->getType())); // Drop use of op. + Sub->setOperand(1, Constant::getNullValue(Sub->getType())); // Drop use of op. New->takeName(Sub); // Everyone now refers to the add instruction. - ValueRankMap.erase(Sub); Sub->replaceAllUsesWith(New); New->setDebugLoc(Sub->getDebugLoc()); - Sub->eraseFromParent(); DEBUG(dbgs() << "Negated: " << *New << '\n'); return New; @@ -517,32 +911,23 @@ static Instruction *BreakUpSubtract(Instruction *Sub, /// ConvertShiftToMul - If this is a shift of a reassociable multiply or is used /// by one, change this into a multiply by a constant to assist with further /// reassociation. 
-static Instruction *ConvertShiftToMul(Instruction *Shl, - DenseMap<AssertingVH<Value>, unsigned> &ValueRankMap) { - // If an operand of this shift is a reassociable multiply, or if the shift - // is used by a reassociable multiply or add, turn into a multiply. - if (isReassociableOp(Shl->getOperand(0), Instruction::Mul) || - (Shl->hasOneUse() && - (isReassociableOp(Shl->use_back(), Instruction::Mul) || - isReassociableOp(Shl->use_back(), Instruction::Add)))) { - Constant *MulCst = ConstantInt::get(Shl->getType(), 1); - MulCst = ConstantExpr::getShl(MulCst, cast<Constant>(Shl->getOperand(1))); - - Instruction *Mul = - BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl); - ValueRankMap.erase(Shl); - Mul->takeName(Shl); - Shl->replaceAllUsesWith(Mul); - Mul->setDebugLoc(Shl->getDebugLoc()); - Shl->eraseFromParent(); - return Mul; - } - return 0; +static BinaryOperator *ConvertShiftToMul(Instruction *Shl) { + Constant *MulCst = ConstantInt::get(Shl->getType(), 1); + MulCst = ConstantExpr::getShl(MulCst, cast<Constant>(Shl->getOperand(1))); + + BinaryOperator *Mul = + BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl); + Shl->setOperand(0, UndefValue::get(Shl->getType())); // Drop use of op. + Mul->takeName(Shl); + Shl->replaceAllUsesWith(Mul); + Mul->setDebugLoc(Shl->getDebugLoc()); + return Mul; } -// Scan backwards and forwards among values with the same rank as element i to -// see if X exists. If X does not exist, return i. This is useful when -// scanning for 'x' when we see '-x' because they both get the same rank. +/// FindInOperandList - Scan backwards and forwards among values with the same +/// rank as element i to see if X exists. If X does not exist, return i. This +/// is useful when scanning for 'x' when we see '-x' because they both get the +/// same rank. static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i, Value *X) { unsigned XRank = Ops[i].Rank; @@ -559,24 +944,32 @@ static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i, /// EmitAddTreeOfValues - Emit a tree of add instructions, summing Ops together /// and returning the result. Insert the tree before I. -static Value *EmitAddTreeOfValues(Instruction *I, SmallVectorImpl<Value*> &Ops){ +static Value *EmitAddTreeOfValues(Instruction *I, + SmallVectorImpl<WeakVH> &Ops){ if (Ops.size() == 1) return Ops.back(); - + Value *V1 = Ops.back(); Ops.pop_back(); Value *V2 = EmitAddTreeOfValues(I, Ops); return BinaryOperator::CreateAdd(V2, V1, "tmp", I); } -/// RemoveFactorFromExpression - If V is an expression tree that is a +/// RemoveFactorFromExpression - If V is an expression tree that is a /// multiplication sequence, and if this sequence contains a multiply by Factor, /// remove Factor from the tree and return the new tree. 
Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { BinaryOperator *BO = isReassociableOp(V, Instruction::Mul); if (!BO) return 0; - + + SmallVector<RepeatedValue, 8> Tree; + MadeChange |= LinearizeExprTree(BO, Tree); SmallVector<ValueEntry, 8> Factors; - LinearizeExprTree(BO, Factors); + Factors.reserve(Tree.size()); + for (unsigned i = 0, e = Tree.size(); i != e; ++i) { + RepeatedValue E = Tree[i]; + Factors.append(E.second.getZExtValue(), + ValueEntry(getRank(E.first), E.first)); + } bool FoundFactor = false; bool NeedsNegate = false; @@ -586,7 +979,7 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { Factors.erase(Factors.begin()+i); break; } - + // If this is a negative version of this factor, remove it. if (ConstantInt *FC1 = dyn_cast<ConstantInt>(Factor)) if (ConstantInt *FC2 = dyn_cast<ConstantInt>(Factors[i].Op)) @@ -596,29 +989,28 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { break; } } - + if (!FoundFactor) { // Make sure to restore the operands to the expression tree. RewriteExprTree(BO, Factors); return 0; } - + BasicBlock::iterator InsertPt = BO; ++InsertPt; - + // If this was just a single multiply, remove the multiply and return the only // remaining operand. if (Factors.size() == 1) { - ValueRankMap.erase(BO); - DeadInsts.push_back(BO); + RedoInsts.insert(BO); V = Factors[0].Op; } else { RewriteExprTree(BO, Factors); V = BO; } - + if (NeedsNegate) V = BinaryOperator::CreateNeg(V, "neg", InsertPt); - + return V; } @@ -628,31 +1020,16 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { /// Ops is the top-level list of add operands we're trying to factor. static void FindSingleUseMultiplyFactors(Value *V, SmallVectorImpl<Value*> &Factors, - const SmallVectorImpl<ValueEntry> &Ops, - bool IsRoot) { - BinaryOperator *BO; - if (!(V->hasOneUse() || V->use_empty()) || // More than one use. - !(BO = dyn_cast<BinaryOperator>(V)) || - BO->getOpcode() != Instruction::Mul) { + const SmallVectorImpl<ValueEntry> &Ops) { + BinaryOperator *BO = isReassociableOp(V, Instruction::Mul); + if (!BO) { Factors.push_back(V); return; } - - // If this value has a single use because it is another input to the add - // tree we're reassociating and we dropped its use, it actually has two - // uses and we can't factor it. - if (!IsRoot) { - for (unsigned i = 0, e = Ops.size(); i != e; ++i) - if (Ops[i].Op == V) { - Factors.push_back(V); - return; - } - } - - + // Otherwise, add the LHS and RHS to the list of factors. - FindSingleUseMultiplyFactors(BO->getOperand(1), Factors, Ops, false); - FindSingleUseMultiplyFactors(BO->getOperand(0), Factors, Ops, false); + FindSingleUseMultiplyFactors(BO->getOperand(1), Factors, Ops); + FindSingleUseMultiplyFactors(BO->getOperand(0), Factors, Ops); } /// OptimizeAndOrXor - Optimize a series of operands to an 'and', 'or', or 'xor' @@ -672,12 +1049,12 @@ static Value *OptimizeAndOrXor(unsigned Opcode, if (FoundX != i) { if (Opcode == Instruction::And) // ...&X&~X = 0 return Constant::getNullValue(X->getType()); - + if (Opcode == Instruction::Or) // ...|X|~X = -1 return Constant::getAllOnesValue(X->getType()); } } - + // Next, check for duplicate pairs of values, which we assume are next to // each other, due to our sorting criteria. assert(i < Ops.size()); @@ -689,12 +1066,12 @@ static Value *OptimizeAndOrXor(unsigned Opcode, ++NumAnnihil; continue; } - + // Drop pairs of values for Xor. 
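// (Editor's note.) E.g. "A ^ X ^ X" reduces to "A" here, while a bare
// "X ^ X" is the e == 2 case below and folds to zero.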
assert(Opcode == Instruction::Xor); if (e == 2) return Constant::getNullValue(Ops[0].Op->getType()); - + // Y ^ X^X -> Y Ops.erase(Ops.begin()+i, Ops.begin()+i+2); i -= 1; e -= 2; @@ -727,46 +1104,46 @@ Value *Reassociate::OptimizeAdd(Instruction *I, Ops.erase(Ops.begin()+i); ++NumFound; } while (i != Ops.size() && Ops[i].Op == TheOp); - + DEBUG(errs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n'); ++NumFactor; - + // Insert a new multiply. Value *Mul = ConstantInt::get(cast<IntegerType>(I->getType()), NumFound); Mul = BinaryOperator::CreateMul(TheOp, Mul, "factor", I); - + // Now that we have inserted a multiply, optimize it. This allows us to // handle cases that require multiple factoring steps, such as this: // (X*2) + (X*2) + (X*2) -> (X*2)*3 -> X*6 - RedoInsts.push_back(Mul); - + RedoInsts.insert(cast<Instruction>(Mul)); + // If every add operand was a duplicate, return the multiply. if (Ops.empty()) return Mul; - + // Otherwise, we had some input that didn't have the dupe, such as // "A + A + B" -> "A*2 + B". Add the new multiply to the list of // things being added by this operation. Ops.insert(Ops.begin(), ValueEntry(getRank(Mul), Mul)); - + --i; e = Ops.size(); continue; } - + // Check for X and -X in the operand list. if (!BinaryOperator::isNeg(TheOp)) continue; - + Value *X = BinaryOperator::getNegArgument(TheOp); unsigned FoundX = FindInOperandList(Ops, i, X); if (FoundX == i) continue; - + // Remove X and -X from the operand list. if (Ops.size() == 2) return Constant::getNullValue(X->getType()); - + Ops.erase(Ops.begin()+i); if (i < FoundX) --FoundX; @@ -777,37 +1154,37 @@ Value *Reassociate::OptimizeAdd(Instruction *I, --i; // Revisit element. e -= 2; // Removed two elements. } - + // Scan the operand list, checking to see if there are any common factors // between operands. Consider something like A*A+A*B*C+D. We would like to // reassociate this to A*(A+B*C)+D, which reduces the number of multiplies. // To efficiently find this, we count the number of times a factor occurs // for any ADD operands that are MULs. DenseMap<Value*, unsigned> FactorOccurrences; - + // Keep track of each multiply we see, to avoid triggering on (X*4)+(X*4) // where they are actually the same multiply. unsigned MaxOcc = 0; Value *MaxOccVal = 0; for (unsigned i = 0, e = Ops.size(); i != e; ++i) { - BinaryOperator *BOp = dyn_cast<BinaryOperator>(Ops[i].Op); - if (BOp == 0 || BOp->getOpcode() != Instruction::Mul || !BOp->use_empty()) + BinaryOperator *BOp = isReassociableOp(Ops[i].Op, Instruction::Mul); + if (!BOp) continue; - + // Compute all of the factors of this added value. SmallVector<Value*, 8> Factors; - FindSingleUseMultiplyFactors(BOp, Factors, Ops, true); + FindSingleUseMultiplyFactors(BOp, Factors, Ops); assert(Factors.size() > 1 && "Bad linearize!"); - + // Add one to FactorOccurrences for each unique factor in this op. SmallPtrSet<Value*, 8> Duplicates; for (unsigned i = 0, e = Factors.size(); i != e; ++i) { Value *Factor = Factors[i]; if (!Duplicates.insert(Factor)) continue; - + unsigned Occ = ++FactorOccurrences[Factor]; if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factor; } - + // If Factor is a negative constant, add the negated value as a factor // because we can percolate the negate out. Watch for minint, which // cannot be positivified. 
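// (Editor's illustrative aside.) E.g. given the operands A*(-2) and B*2, the
// constant -2 is also counted as +2, so the factor 2 occurs twice and the sum
// can be factored as 2*(B - A). INT_MIN is skipped because negating it wraps
// back to INT_MIN.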
@@ -816,13 +1193,13 @@ Value *Reassociate::OptimizeAdd(Instruction *I, Factor = ConstantInt::get(CI->getContext(), -CI->getValue()); assert(!Duplicates.count(Factor) && "Shouldn't have two constant factors, missed a canonicalize"); - + unsigned Occ = ++FactorOccurrences[Factor]; if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factor; } } } } - + // If any factor occurred more than one time, we can pull it out. if (MaxOcc > 1) { DEBUG(errs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n'); @@ -830,16 +1207,16 @@ // Create a new instruction that uses the MaxOccVal twice. If we don't do // this, we could otherwise run into situations where removing a factor - // from an expression will drop a use of maxocc, and this can cause + // from an expression will drop a use of maxocc, and this can cause // RemoveFactorFromExpression on successive values to behave differently. Instruction *DummyInst = BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal); - SmallVector<Value*, 4> NewMulOps; + SmallVector<WeakVH, 4> NewMulOps; for (unsigned i = 0; i != Ops.size(); ++i) { // Only try to remove factors from expressions we're allowed to. - BinaryOperator *BOp = dyn_cast<BinaryOperator>(Ops[i].Op); - if (BOp == 0 || BOp->getOpcode() != Instruction::Mul || !BOp->use_empty()) + BinaryOperator *BOp = isReassociableOp(Ops[i].Op, Instruction::Mul); + if (!BOp) continue; - + if (Value *V = RemoveFactorFromExpression(Ops[i].Op, MaxOccVal)) { // The factorized operand may occur several times. Convert them all in // one fell swoop. @@ -853,7 +1230,7 @@ --i; } } - + // No need for extra uses anymore. delete DummyInst; @@ -865,26 +1242,201 @@ // A*A*B + A*A*C --> A*(A*B+A*C) --> A*(A*(B+C)) assert(NumAddedValues > 1 && "Each occurrence should contribute a value"); (void)NumAddedValues; - V = ReassociateExpression(cast<BinaryOperator>(V)); + if (Instruction *VI = dyn_cast<Instruction>(V)) + RedoInsts.insert(VI); // Create the multiply. - Value *V2 = BinaryOperator::CreateMul(V, MaxOccVal, "tmp", I); + Instruction *V2 = BinaryOperator::CreateMul(V, MaxOccVal, "tmp", I); // Rerun associate on the multiply in case the inner expression turned into // a multiply. We want to make sure that we keep things in canonical form. - V2 = ReassociateExpression(cast<BinaryOperator>(V2)); - + RedoInsts.insert(V2); + // If every add operand included the factor (e.g. "A*B + A*C"), then the // entire result expression is just the multiply "A*(B+C)". if (Ops.empty()) return V2; - + // Otherwise, we had some input that didn't have the factor, such as // "A*B + A*C + D" -> "A*(B+C) + D". Add the new multiply to the list of // things being added by this operation. Ops.insert(Ops.begin(), ValueEntry(getRank(V2), V2)); } - + + return 0; +} + +namespace { + /// \brief Predicate tests whether a ValueEntry's op is in a map. + struct IsValueInMap { + const DenseMap<Value *, unsigned> &Map; + + IsValueInMap(const DenseMap<Value *, unsigned> &Map) : Map(Map) {} + + bool operator()(const ValueEntry &Entry) { + return Map.find(Entry.Op) != Map.end(); + } + }; +} + +/// \brief Build up a vector of value/power pairs factoring a product. +/// +/// Given a series of multiplication operands, build a vector of factors and +/// the powers each is raised to when forming the final product. Sort them in +/// the order of descending power.
+/// +/// (x*x) -> [(x, 2)] +/// ((x*x)*x) -> [(x, 3)] +/// ((((x*y)*x)*y)*x) -> [(x, 3), (y, 2)] +/// +/// \returns Whether any factors have a power greater than one. +bool Reassociate::collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops, + SmallVectorImpl<Factor> &Factors) { + // FIXME: Have Ops be (ValueEntry, Multiplicity) pairs, simplifying this. + // Compute the sum of powers of simplifiable factors. + unsigned FactorPowerSum = 0; + for (unsigned Idx = 1, Size = Ops.size(); Idx < Size; ++Idx) { + Value *Op = Ops[Idx-1].Op; + + // Count the number of occurrences of this value. + unsigned Count = 1; + for (; Idx < Size && Ops[Idx].Op == Op; ++Idx) + ++Count; + // Track for simplification all factors which occur 2 or more times. + if (Count > 1) + FactorPowerSum += Count; + } + + // We can only simplify factors if the sum of the powers of our simplifiable + // factors is 4 or higher. When that is the case, we will *always* have + // a simplification. This is an important invariant to prevent cyclically + // trying to simplify already minimal formations. + if (FactorPowerSum < 4) + return false; + + // Now gather the simplifiable factors, removing them from Ops. + FactorPowerSum = 0; + for (unsigned Idx = 1; Idx < Ops.size(); ++Idx) { + Value *Op = Ops[Idx-1].Op; + + // Count the number of occurrences of this value. + unsigned Count = 1; + for (; Idx < Ops.size() && Ops[Idx].Op == Op; ++Idx) + ++Count; + if (Count == 1) + continue; + // Move an even number of occurrences to Factors. + Count &= ~1U; + Idx -= Count; + FactorPowerSum += Count; + Factors.push_back(Factor(Op, Count)); + Ops.erase(Ops.begin()+Idx, Ops.begin()+Idx+Count); + } + + // None of the adjustments above should have reduced the sum of factor powers + // below our minimum of '4'. + assert(FactorPowerSum >= 4); + + std::sort(Factors.begin(), Factors.end(), Factor::PowerDescendingSorter()); + return true; +} + +/// \brief Build a tree of multiplies, computing the product of Ops. +static Value *buildMultiplyTree(IRBuilder<> &Builder, + SmallVectorImpl<Value*> &Ops) { + if (Ops.size() == 1) + return Ops.back(); + + Value *LHS = Ops.pop_back_val(); + do { + LHS = Builder.CreateMul(LHS, Ops.pop_back_val()); + } while (!Ops.empty()); + + return LHS; +} + +/// \brief Build a minimal multiplication DAG for (a^x)*(b^y)*(c^z)*... +/// +/// Given a vector of values raised to various powers, where no two values are +/// equal and the powers are sorted in decreasing order, compute the minimal +/// DAG of multiplies to compute the final product, and return that product +/// value. +Value *Reassociate::buildMinimalMultiplyDAG(IRBuilder<> &Builder, + SmallVectorImpl<Factor> &Factors) { + assert(Factors[0].Power); + SmallVector<Value *, 4> OuterProduct; + for (unsigned LastIdx = 0, Idx = 1, Size = Factors.size(); + Idx < Size && Factors[Idx].Power > 0; ++Idx) { + if (Factors[Idx].Power != Factors[LastIdx].Power) { + LastIdx = Idx; + continue; + } + + // We want to multiply across all the factors with the same power so that + // we can raise them to that power as a single entity. Build a mini tree + // for that. + SmallVector<Value *, 4> InnerProduct; + InnerProduct.push_back(Factors[LastIdx].Base); + do { + InnerProduct.push_back(Factors[Idx].Base); + ++Idx; + } while (Idx < Size && Factors[Idx].Power == Factors[LastIdx].Power); + + // Reset the base value of the first factor to the new expression tree. + // We'll remove all the factors with the same power in a second pass.
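// (Editor's worked example, not part of the patch.) For Factors =
// [(x,3), (y,3), (z,2)], the equal-power run x, y folds into ((x*y), 3),
// leaving [(x*y, 3), (z, 2)]. The odd power contributes x*y to the outer
// product, the powers halve to [(x*y, 1), (z, 1)], and the recursive call
// yields (x*y)*z, so the result is (x*y) * ((x*y)*z) * ((x*y)*z)
// = x^3 * y^3 * z^2 in 4 multiplies instead of the 7 a linear chain needs.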
+ Value *M = Factors[LastIdx].Base = buildMultiplyTree(Builder, InnerProduct); + if (Instruction *MI = dyn_cast<Instruction>(M)) + RedoInsts.insert(MI); + + LastIdx = Idx; + } + // Unique factors with equal powers -- we've folded them into the first one's + // base. + Factors.erase(std::unique(Factors.begin(), Factors.end(), + Factor::PowerEqual()), + Factors.end()); + + // Iteratively collect the base of each factor with an odd power into the + // outer product, and halve each power in preparation for squaring the + // expression. + for (unsigned Idx = 0, Size = Factors.size(); Idx != Size; ++Idx) { + if (Factors[Idx].Power & 1) + OuterProduct.push_back(Factors[Idx].Base); + Factors[Idx].Power >>= 1; + } + if (Factors[0].Power) { + Value *SquareRoot = buildMinimalMultiplyDAG(Builder, Factors); + OuterProduct.push_back(SquareRoot); + OuterProduct.push_back(SquareRoot); + } + if (OuterProduct.size() == 1) + return OuterProduct.front(); + + Value *V = buildMultiplyTree(Builder, OuterProduct); + return V; +} + +Value *Reassociate::OptimizeMul(BinaryOperator *I, + SmallVectorImpl<ValueEntry> &Ops) { + // We can only optimize the multiplies when there is a chain of more than + // three, such that a balanced tree might require fewer total multiplies. + if (Ops.size() < 4) + return 0; + + // Try to turn linear trees of multiplies without other uses of the + // intermediate stages into minimal multiply DAGs with perfect sub-expression + // re-use. + SmallVector<Factor, 4> Factors; + if (!collectMultiplyFactors(Ops, Factors)) + return 0; // All distinct factors, so nothing left for us to do. + + IRBuilder<> Builder(I); + Value *V = buildMinimalMultiplyDAG(Builder, Factors); + if (Ops.empty()) + return V; + + ValueEntry NewEntry = ValueEntry(getRank(V), V); + Ops.insert(std::lower_bound(Ops.begin(), Ops.end(), NewEntry), NewEntry); return 0; } @@ -892,95 +1444,105 @@ Value *Reassociate::OptimizeExpression(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops) { // Now that we have the linearized expression tree, try to optimize it. // Start by folding any constants that we found. - bool IterateOptimization = false; if (Ops.size() == 1) return Ops[0].Op; unsigned Opcode = I->getOpcode(); - - if (Constant *V1 = dyn_cast<Constant>(Ops[Ops.size()-2].Op)) - if (Constant *V2 = dyn_cast<Constant>(Ops.back().Op)) { - Ops.pop_back(); - Ops.back().Op = ConstantExpr::get(Opcode, V1, V2); - return OptimizeExpression(I, Ops); - } - - // Check for destructive annihilation due to a constant being used. - if (ConstantInt *CstVal = dyn_cast<ConstantInt>(Ops.back().Op)) - switch (Opcode) { - default: break; - case Instruction::And: - if (CstVal->isZero()) // X & 0 -> 0 - return CstVal; - if (CstVal->isAllOnesValue()) // X & -1 -> X - Ops.pop_back(); - break; - case Instruction::Mul: - if (CstVal->isZero()) { // X * 0 -> 0 - ++NumAnnihil; - return CstVal; - } - - if (cast<ConstantInt>(CstVal)->isOne()) - Ops.pop_back(); // X * 1 -> X - break; - case Instruction::Or: - if (CstVal->isAllOnesValue()) // X | -1 -> -1 - return CstVal; - // FALLTHROUGH! - case Instruction::Add: - case Instruction::Xor: - if (CstVal->isZero()) // X [|^+] 0 -> X - Ops.pop_back(); - break; - } - if (Ops.size() == 1) return Ops[0].Op; // Handle destructive annihilation due to identities between elements in the // argument list here.
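// (Editor's note.) "Destructive annihilation" covers folds such as
// X & ~X -> 0, X | ~X -> -1, X ^ X -> 0 and X + (-X) -> 0, all dispatched
// by opcode in the switch below.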
+ unsigned NumOps = Ops.size(); switch (Opcode) { default: break; case Instruction::And: case Instruction::Or: - case Instruction::Xor: { - unsigned NumOps = Ops.size(); + case Instruction::Xor: if (Value *Result = OptimizeAndOrXor(Opcode, Ops)) return Result; - IterateOptimization |= Ops.size() != NumOps; break; - } - case Instruction::Add: { - unsigned NumOps = Ops.size(); + case Instruction::Add: if (Value *Result = OptimizeAdd(I, Ops)) return Result; - IterateOptimization |= Ops.size() != NumOps; - } + break; + case Instruction::Mul: + if (Value *Result = OptimizeMul(I, Ops)) + return Result; break; - //case Instruction::Mul: } - if (IterateOptimization) + if (Ops.size() != NumOps) return OptimizeExpression(I, Ops); return 0; } +/// EraseInst - Zap the given instruction, adding interesting operands to the +/// work list. +void Reassociate::EraseInst(Instruction *I) { + assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!"); + SmallVector<Value*, 8> Ops(I->op_begin(), I->op_end()); + // Erase the dead instruction. + ValueRankMap.erase(I); + RedoInsts.remove(I); + I->eraseFromParent(); + // Optimize its operands. + SmallPtrSet<Instruction *, 8> Visited; // Detect self-referential nodes. + for (unsigned i = 0, e = Ops.size(); i != e; ++i) + if (Instruction *Op = dyn_cast<Instruction>(Ops[i])) { + // If this is a node in an expression tree, climb to the expression root + // and add that since that's where optimization actually happens. + unsigned Opcode = Op->getOpcode(); + while (Op->hasOneUse() && Op->use_back()->getOpcode() == Opcode && + Visited.insert(Op)) + Op = Op->use_back(); + RedoInsts.insert(Op); + } +} + +/// OptimizeInst - Inspect and optimize the given instruction. Note that erasing +/// instructions is not allowed. +void Reassociate::OptimizeInst(Instruction *I) { + // Only consider operations that we understand. + if (!isa<BinaryOperator>(I)) + return; -/// ReassociateInst - Inspect and reassociate the instruction at the -/// given position, post-incrementing the position. -void Reassociate::ReassociateInst(BasicBlock::iterator &BBI) { - Instruction *BI = BBI++; - if (BI->getOpcode() == Instruction::Shl && - isa<ConstantInt>(BI->getOperand(1))) - if (Instruction *NI = ConvertShiftToMul(BI, ValueRankMap)) { + if (I->getOpcode() == Instruction::Shl && + isa<ConstantInt>(I->getOperand(1))) + // If an operand of this shift is a reassociable multiply, or if the shift + // is used by a reassociable multiply or add, turn into a multiply. + if (isReassociableOp(I->getOperand(0), Instruction::Mul) || + (I->hasOneUse() && + (isReassociableOp(I->use_back(), Instruction::Mul) || + isReassociableOp(I->use_back(), Instruction::Add)))) { + Instruction *NI = ConvertShiftToMul(I); + RedoInsts.insert(I); MadeChange = true; - BI = NI; + I = NI; + } + + // Floating point binary operators are not associative, but we can still + // commute (some) of them, to canonicalize the order of their operands. + // This can potentially expose more CSE opportunities, and makes writing + // other transformations simpler. + if ((I->getType()->isFloatingPointTy() || I->getType()->isVectorTy())) { + // FAdd and FMul can be commuted. + if (I->getOpcode() != Instruction::FMul && + I->getOpcode() != Instruction::FAdd) + return; + + Value *LHS = I->getOperand(0); + Value *RHS = I->getOperand(1); + unsigned LHSRank = getRank(LHS); + unsigned RHSRank = getRank(RHS); + + // Sort the operands by rank. 
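// (Editor's note.) Ranks impose a deterministic total order, so e.g.
// "fmul %b, %a" and "fmul %a, %b" elsewhere in the function end up with the
// same operand order, exposing them to CSE even though FP operations are
// never reassociated here.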
+ if (RHSRank < LHSRank) { + I->setOperand(0, RHS); + I->setOperand(1, LHS); } - // Reject cases where it is pointless to do this. - if (!isa<BinaryOperator>(BI) || BI->getType()->isFloatingPointTy() || - BI->getType()->isVectorTy()) - return; // Floating point ops are not associative. + return; + } // Do not reassociate boolean (i1) expressions. We want to preserve the // original order of evaluation for short-circuited comparisons that @@ -988,58 +1550,66 @@ void Reassociate::ReassociateInst(BasicBlock::iterator &BBI) { // is not further optimized, it is likely to be transformed back to a // short-circuited form for code gen, and the source order may have been // optimized for the most likely conditions. - if (BI->getType()->isIntegerTy(1)) + if (I->getType()->isIntegerTy(1)) return; // If this is a subtract instruction which is not already in negate form, // see if we can convert it to X+-Y. - if (BI->getOpcode() == Instruction::Sub) { - if (ShouldBreakUpSubtract(BI)) { - BI = BreakUpSubtract(BI, ValueRankMap); - // Reset the BBI iterator in case BreakUpSubtract changed the - // instruction it points to. - BBI = BI; - ++BBI; + if (I->getOpcode() == Instruction::Sub) { + if (ShouldBreakUpSubtract(I)) { + Instruction *NI = BreakUpSubtract(I); + RedoInsts.insert(I); MadeChange = true; - } else if (BinaryOperator::isNeg(BI)) { + I = NI; + } else if (BinaryOperator::isNeg(I)) { // Otherwise, this is a negation. See if the operand is a multiply tree // and if this is not an inner node of a multiply tree. - if (isReassociableOp(BI->getOperand(1), Instruction::Mul) && - (!BI->hasOneUse() || - !isReassociableOp(BI->use_back(), Instruction::Mul))) { - BI = LowerNegateToMultiply(BI, ValueRankMap); + if (isReassociableOp(I->getOperand(1), Instruction::Mul) && + (!I->hasOneUse() || + !isReassociableOp(I->use_back(), Instruction::Mul))) { + Instruction *NI = LowerNegateToMultiply(I); + RedoInsts.insert(I); MadeChange = true; + I = NI; } } } - // If this instruction is a commutative binary operator, process it. - if (!BI->isAssociative()) return; - BinaryOperator *I = cast<BinaryOperator>(BI); + // If this instruction is an associative binary operator, process it. + if (!I->isAssociative()) return; + BinaryOperator *BO = cast<BinaryOperator>(I); // If this is an interior node of a reassociable tree, ignore it until we // get to the root of the tree, to avoid N^2 analysis. - if (I->hasOneUse() && isReassociableOp(I->use_back(), I->getOpcode())) + unsigned Opcode = BO->getOpcode(); + if (BO->hasOneUse() && BO->use_back()->getOpcode() == Opcode) return; - // If this is an add tree that is used by a sub instruction, ignore it + // If this is an add tree that is used by a sub instruction, ignore it // until we process the subtract. - if (I->hasOneUse() && I->getOpcode() == Instruction::Add && - cast<Instruction>(I->use_back())->getOpcode() == Instruction::Sub) + if (BO->hasOneUse() && BO->getOpcode() == Instruction::Add && + cast<Instruction>(BO->use_back())->getOpcode() == Instruction::Sub) return; - ReassociateExpression(I); + ReassociateExpression(BO); } -Value *Reassociate::ReassociateExpression(BinaryOperator *I) { - +void Reassociate::ReassociateExpression(BinaryOperator *I) { + // First, walk the expression tree, linearizing the tree, collecting the // operand information. 
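// (Editor's note.) Each RepeatedValue (V, W) returned by LinearizeExprTree
// is expanded below into W identical ValueEntry elements, so the sorting and
// optimization code can stay oblivious to weights.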
+ SmallVector<RepeatedValue, 8> Tree; + MadeChange |= LinearizeExprTree(I, Tree); SmallVector<ValueEntry, 8> Ops; - LinearizeExprTree(I, Ops); - + Ops.reserve(Tree.size()); + for (unsigned i = 0, e = Tree.size(); i != e; ++i) { + RepeatedValue E = Tree[i]; + Ops.append(E.second.getZExtValue(), + ValueEntry(getRank(E.first), E.first)); + } + DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n'); - + // Now that we have linearized the tree to a list and have gathered all of // the operands and their ranks, sort the operands by their rank. Use a // stable_sort so that values with equal ranks will have their relative @@ -1047,21 +1617,24 @@ Value *Reassociate::ReassociateExpression(BinaryOperator *I) { // this sorts so that the highest ranking values end up at the beginning of // the vector. std::stable_sort(Ops.begin(), Ops.end()); - + // OptimizeExpression - Now that we have the expression tree in a convenient // sorted form, optimize it globally if possible. if (Value *V = OptimizeExpression(I, Ops)) { + if (V == I) + // Self-referential expression in unreachable code. + return; // This expression tree simplified to something that isn't a tree, // eliminate it. DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n'); I->replaceAllUsesWith(V); if (Instruction *VI = dyn_cast<Instruction>(V)) VI->setDebugLoc(I->getDebugLoc()); - RemoveDeadBinaryOp(I); + RedoInsts.insert(I); ++NumAnnihil; - return V; + return; } - + // We want to sink immediates as deeply as possible except in the case where // this is a multiply tree used only by an add, and the immediate is a -1. // In this case we reassociate to put the negation on the outside so that we @@ -1073,51 +1646,57 @@ Value *Reassociate::ReassociateExpression(BinaryOperator *I) { ValueEntry Tmp = Ops.pop_back_val(); Ops.insert(Ops.begin(), Tmp); } - + DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n'); - + if (Ops.size() == 1) { + if (Ops[0].Op == I) + // Self-referential expression in unreachable code. + return; + // This expression tree simplified to something that isn't a tree, // eliminate it. I->replaceAllUsesWith(Ops[0].Op); if (Instruction *OI = dyn_cast<Instruction>(Ops[0].Op)) OI->setDebugLoc(I->getDebugLoc()); - RemoveDeadBinaryOp(I); - return Ops[0].Op; + RedoInsts.insert(I); + return; } - + // Now that we ordered and optimized the expressions, splat them back into // the expression tree, removing any unneeded nodes. RewriteExprTree(I, Ops); - return I; } - bool Reassociate::runOnFunction(Function &F) { - // Recalculate the rank map for F + // Calculate the rank map for F BuildRankMap(F); MadeChange = false; - for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) - for (BasicBlock::iterator BBI = FI->begin(); BBI != FI->end(); ) - ReassociateInst(BBI); - - // Now that we're done, revisit any instructions which are likely to - // have secondary reassociation opportunities. - while (!RedoInsts.empty()) - if (Value *V = RedoInsts.pop_back_val()) { - BasicBlock::iterator BBI = cast<Instruction>(V); - ReassociateInst(BBI); - } + for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { + // Optimize every instruction in the basic block. + for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE; ) + if (isInstructionTriviallyDead(II)) { + EraseInst(II++); + } else { + OptimizeInst(II); + assert(II->getParent() == BI && "Moved to a different block!"); + ++II; + } - // Now that we're done, delete any instructions which are no longer used. 
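+    // EraseInst queues the roots of any affected expression trees on
+    // RedoInsts, so the old separate DeadInsts sweep is no longer needed;
+    // dead instructions are erased as the worklist below is drained.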
- while (!DeadInsts.empty()) - if (Value *V = DeadInsts.pop_back_val()) - RecursivelyDeleteTriviallyDeadInstructions(V); + // If this produced extra instructions to optimize, handle them now. + while (!RedoInsts.empty()) { + Instruction *I = RedoInsts.pop_back_val(); + if (isInstructionTriviallyDead(I)) + EraseInst(I); + else + OptimizeInst(I); + } + } // We are done with the rank map. RankMap.clear(); ValueRankMap.clear(); + return MadeChange; } - diff --git a/lib/Transforms/Scalar/Reg2Mem.cpp b/lib/Transforms/Scalar/Reg2Mem.cpp index 47afc77..ea1de63 100644 --- a/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/lib/Transforms/Scalar/Reg2Mem.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This file demotes all registers to memory references. It is intented to be +// This file demotes all registers to memory references. It is intended to be // the inverse of PromoteMemoryToRegister. By converting to loads, the only // values live across basic blocks are allocas and loads before phi nodes. // It is intended that this should make CFG hacking much easier. @@ -59,7 +59,7 @@ namespace { virtual bool runOnFunction(Function &F); }; } - + char RegToMem::ID = 0; INITIALIZE_PASS_BEGIN(RegToMem, "reg2mem", "Demote all values to stack slots", false, false) @@ -68,25 +68,25 @@ INITIALIZE_PASS_END(RegToMem, "reg2mem", "Demote all values to stack slots", false, false) bool RegToMem::runOnFunction(Function &F) { - if (F.isDeclaration()) + if (F.isDeclaration()) return false; - + // Insert all new allocas into entry block. BasicBlock *BBEntry = &F.getEntryBlock(); assert(pred_begin(BBEntry) == pred_end(BBEntry) && "Entry block to function must not have predecessors!"); - + // Find first non-alloca instruction and create insertion point. This is // safe if the block is well-formed: it always has a terminator; otherwise // we'll get an assertion. BasicBlock::iterator I = BBEntry->begin(); while (isa<AllocaInst>(I)) ++I; - + CastInst *AllocaInsertionPoint = new BitCastInst(Constant::getNullValue(Type::getInt32Ty(F.getContext())), Type::getInt32Ty(F.getContext()), "reg2mem alloca point", I); - + // Find the escaped instructions. But don't create stack slots for // allocas in entry block. std::list<Instruction*> WorkList; @@ -99,15 +99,15 @@ bool RegToMem::runOnFunction(Function &F) { WorkList.push_front(&*iib); } } - + // Demote escaped instructions NumRegsDemoted += WorkList.size(); - for (std::list<Instruction*>::iterator ilb = WorkList.begin(), + for (std::list<Instruction*>::iterator ilb = WorkList.begin(), ile = WorkList.end(); ilb != ile; ++ilb) DemoteRegToStack(**ilb, false, AllocaInsertionPoint); - + WorkList.clear(); - + // Find all phi's for (Function::iterator ibb = F.begin(), ibe = F.end(); ibb != ibe; ++ibb) @@ -115,19 +115,18 @@ bool RegToMem::runOnFunction(Function &F) { iib != iie; ++iib) if (isa<PHINode>(iib)) WorkList.push_front(&*iib); - + // Demote phi nodes NumPhisDemoted += WorkList.size(); - for (std::list<Instruction*>::iterator ilb = WorkList.begin(), + for (std::list<Instruction*>::iterator ilb = WorkList.begin(), ile = WorkList.end(); ilb != ile; ++ilb) DemotePHIToStack(cast<PHINode>(*ilb), AllocaInsertionPoint); - + return true; } // createDemoteRegisterToMemory - Provide an entry point to create this pass.
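For reference, the demotion this pass performs can be sketched with the public utilities alone. A minimal illustration (not part of the patch; usedOutsideDefiningBlock is a simplified stand-in for the pass's escape test, which additionally attributes phi uses to their incoming blocks):

    #include "llvm/Function.h"
    #include "llvm/Instructions.h"
    #include "llvm/Transforms/Utils/Local.h"
    #include <vector>
    using namespace llvm;

    // Simplified escape test (illustration only): a value must live in memory
    // if any user sits in another block.
    static bool usedOutsideDefiningBlock(Instruction &I) {
      for (Value::use_iterator U = I.use_begin(), E = I.use_end(); U != E; ++U)
        if (cast<Instruction>(*U)->getParent() != I.getParent())
          return true;
      return false;
    }

    static void demoteEscapees(Function &F, Instruction *AllocaInsertPt) {
      std::vector<Instruction*> Worklist; // collect first; demotion rewrites uses
      for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE; ++BB)
        for (BasicBlock::iterator I = BB->begin(), IE = BB->end(); I != IE; ++I)
          if (!isa<AllocaInst>(I) && usedOutsideDefiningBlock(*I))
            Worklist.push_back(&*I);
      for (unsigned i = 0, e = Worklist.size(); i != e; ++i)
        // Creates one alloca, a store after the definition, and a load in
        // front of each use.
        DemoteRegToStack(*Worklist[i], false, AllocaInsertPt);
    }

Phi nodes themselves are demoted separately with DemotePHIToStack, exactly as in runOnFunction above.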
-// char &llvm::DemoteRegisterToMemoryID = RegToMem::ID; FunctionPass *llvm::createDemoteRegisterToMemoryPass() { return new RegToMem(); diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 16b64a5..2c39aab 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -409,7 +409,7 @@ private: if (Constant *C = dyn_cast<Constant>(V)) { Constant *Elt = C->getAggregateElement(i); - + if (Elt == 0) LV.markOverdefined(); // Unknown sort of constant. else if (isa<UndefValue>(Elt)) diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index 7d65bcc..48318c8 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This file implements common infrastructure for libLLVMScalarOpts.a, which +// This file implements common infrastructure for libLLVMScalarOpts.a, which // implements several scalar transformations over the LLVM intermediate // representation, including the C bindings for that library. // @@ -24,7 +24,7 @@ using namespace llvm; -/// initializeScalarOptsPasses - Initialize all passes linked into the +/// initializeScalarOptsPasses - Initialize all passes linked into the /// ScalarOpts library. void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeADCEPass(Registry); diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index 026fea1..ec835b1 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -22,33 +22,34 @@ #define DEBUG_TYPE "scalarrepl" #include "llvm/Transforms/Scalar.h" #include "llvm/Constants.h" +#include "llvm/DIBuilder.h" +#include "llvm/DebugInfo.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" #include "llvm/GlobalVariable.h" +#include "llvm/IRBuilder.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/LLVMContext.h" #include "llvm/Module.h" +#include "llvm/Operator.h" #include "llvm/Pass.h" -#include "llvm/Analysis/DebugInfo.h" -#include "llvm/Analysis/DIBuilder.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Transforms/Utils/PromoteMemToReg.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GetElementPtrTypeIterator.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; STATISTIC(NumReplaced, "Number of allocas broken up"); @@ -59,12 +60,25 @@ STATISTIC(NumGlobals, "Number of allocas copied from constant global"); namespace { struct SROA : public FunctionPass { - SROA(int T, bool hasDT, char &ID) + SROA(int T, bool hasDT, char &ID, int ST, int AT, int SLT) : FunctionPass(ID), HasDomTree(hasDT) { if (T == -1) SRThreshold = 128; else SRThreshold = T; + if 
(ST == -1) + StructMemberThreshold = 32; + else + StructMemberThreshold = ST; + if (AT == -1) + ArrayElementThreshold = 8; + else + ArrayElementThreshold = AT; + if (SLT == -1) + // Do not limit the scalar integer load size if no threshold is given. + ScalarLoadThreshold = -1; + else + ScalarLoadThreshold = SLT; } bool runOnFunction(Function &F); @@ -86,11 +100,11 @@ namespace { struct AllocaInfo { /// The alloca to promote. AllocaInst *AI; - + /// CheckedPHIs - This is a set of verified PHI nodes, to prevent infinite /// looping and avoid redundant work. SmallPtrSet<PHINode*, 8> CheckedPHIs; - + /// isUnsafe - This is set to true if the alloca cannot be SROA'd. bool isUnsafe : 1; @@ -104,19 +118,32 @@ namespace { /// ever accessed, or false if the alloca is only accessed with mem /// intrinsics or load/store that only access the entire alloca at once. bool hasSubelementAccess : 1; - + /// hasALoadOrStore - This is true if there are any loads or stores to it. /// The alloca may just be accessed with memcpy, for example, which would /// not set this. bool hasALoadOrStore : 1; - + explicit AllocaInfo(AllocaInst *ai) : AI(ai), isUnsafe(false), isMemCpySrc(false), isMemCpyDst(false), hasSubelementAccess(false), hasALoadOrStore(false) {} }; + /// SRThreshold - The maximum alloca size to be considered for SROA. unsigned SRThreshold; + /// StructMemberThreshold - The maximum number of members a struct can + /// contain to be considered for SROA. + unsigned StructMemberThreshold; + + /// ArrayElementThreshold - The maximum number of elements an array can + /// have to be considered for SROA. + unsigned ArrayElementThreshold; + + /// ScalarLoadThreshold - The maximum size in bits of scalars to load when + /// converting to a scalar. + unsigned ScalarLoadThreshold; + void MarkUnsafe(AllocaInfo &I, Instruction *User) { I.isUnsafe = true; DEBUG(dbgs() << "  Transformation preventing inst: " << *User << '\n'); @@ -155,19 +182,21 @@ namespace { SmallVector<AllocaInst*, 32> &NewElts); void RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, SmallVector<AllocaInst*, 32> &NewElts); + bool ShouldAttemptScalarRepl(AllocaInst *AI); static MemTransferInst *isOnlyCopiedFromConstantGlobal( AllocaInst *AI, SmallVector<Instruction*, 4> &ToDelete); }; - + // SROA_DT - SROA that uses DominatorTree. struct SROA_DT : public SROA { static char ID; public: - SROA_DT(int T = -1) : SROA(T, true, ID) { + SROA_DT(int T = -1, int ST = -1, int AT = -1, int SLT = -1) : + SROA(T, true, ID, ST, AT, SLT) { initializeSROA_DTPass(*PassRegistry::getPassRegistry()); } - + // getAnalysisUsage - This pass does not require any passes, but we know it // will not alter the CFG, so say so. virtual void getAnalysisUsage(AnalysisUsage &AU) const { @@ -175,22 +204,23 @@ namespace { AU.setPreservesCFG(); } }; - + // SROA_SSAUp - SROA that uses SSAUpdater. struct SROA_SSAUp : public SROA { static char ID; public: - SROA_SSAUp(int T = -1) : SROA(T, false, ID) { + SROA_SSAUp(int T = -1, int ST = -1, int AT = -1, int SLT = -1) : + SROA(T, false, ID, ST, AT, SLT) { initializeSROA_SSAUpPass(*PassRegistry::getPassRegistry()); } - + // getAnalysisUsage - This pass does not require any passes, but we know it // will not alter the CFG, so say so.
virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); } }; - + } char SROA_DT::ID = 0; @@ -209,10 +239,15 @@ INITIALIZE_PASS_END(SROA_SSAUp, "scalarrepl-ssa", // Public interface to the ScalarReplAggregates pass FunctionPass *llvm::createScalarReplAggregatesPass(int Threshold, - bool UseDomTree) { + bool UseDomTree, + int StructMemberThreshold, + int ArrayElementThreshold, + int ScalarLoadThreshold) { if (UseDomTree) - return new SROA_DT(Threshold); - return new SROA_SSAUp(Threshold); + return new SROA_DT(Threshold, StructMemberThreshold, ArrayElementThreshold, + ScalarLoadThreshold); + return new SROA_SSAUp(Threshold, StructMemberThreshold, + ArrayElementThreshold, ScalarLoadThreshold); } @@ -228,6 +263,7 @@ class ConvertToScalarInfo { /// AllocaSize - The size of the alloca being considered in bytes. unsigned AllocaSize; const TargetData &TD; + unsigned ScalarLoadThreshold; /// IsNotTrivial - This is set to true if there is some access to the object /// which means that mem2reg can't promote it. @@ -258,28 +294,38 @@ class ConvertToScalarInfo { /// isn't possible to turn into a vector type, it gets set to VoidTy. VectorType *VectorTy; - /// HadNonMemTransferAccess - True if there is at least one access to the + /// HadNonMemTransferAccess - True if there is at least one access to the /// alloca that is not a MemTransferInst. We don't want to turn structs into /// large integers unless there is some potential for optimization. bool HadNonMemTransferAccess; + /// HadDynamicAccess - True if some element of this alloca was dynamic. + /// We don't yet have support for turning a dynamic access into a large + /// integer. + bool HadDynamicAccess; + public: - explicit ConvertToScalarInfo(unsigned Size, const TargetData &td) - : AllocaSize(Size), TD(td), IsNotTrivial(false), ScalarKind(Unknown), - VectorTy(0), HadNonMemTransferAccess(false) { } + explicit ConvertToScalarInfo(unsigned Size, const TargetData &td, + unsigned SLT) + : AllocaSize(Size), TD(td), ScalarLoadThreshold(SLT), IsNotTrivial(false), + ScalarKind(Unknown), VectorTy(0), HadNonMemTransferAccess(false), + HadDynamicAccess(false) { } AllocaInst *TryConvert(AllocaInst *AI); private: - bool CanConvertToScalar(Value *V, uint64_t Offset); + bool CanConvertToScalar(Value *V, uint64_t Offset, Value* NonConstantIdx); void MergeInTypeForLoadOrStore(Type *In, uint64_t Offset); bool MergeInVectorType(VectorType *VInTy, uint64_t Offset); - void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset); + void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset, + Value *NonConstantIdx); Value *ConvertScalar_ExtractValue(Value *NV, Type *ToType, - uint64_t Offset, IRBuilder<> &Builder); + uint64_t Offset, Value* NonConstantIdx, + IRBuilder<> &Builder); Value *ConvertScalar_InsertValue(Value *StoredVal, Value *ExistingVal, - uint64_t Offset, IRBuilder<> &Builder); + uint64_t Offset, Value* NonConstantIdx, + IRBuilder<> &Builder); }; } // end anonymous namespace. @@ -290,7 +336,7 @@ private: AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { // If we can't convert this scalar, or if mem2reg can trivially do it, bail // out. - if (!CanConvertToScalar(AI, 0) || !IsNotTrivial) + if (!CanConvertToScalar(AI, 0, 0) || !IsNotTrivial) return 0; // If an alloca has only memset / memcpy uses, it may still have an Unknown @@ -315,16 +361,27 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { NewTy = VectorTy; // Use the vector type. 
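+    // (A dynamic index is fine on this path: vectors support extractelement
+    // and insertelement with a variable index, which is exactly what the
+    // rewrite emits.)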
} else { unsigned BitWidth = AllocaSize * 8; + + // Do not convert to scalar integer if the alloca size exceeds the + // scalar load threshold. + if (BitWidth > ScalarLoadThreshold) + return 0; + if ((ScalarKind == ImplicitVector || ScalarKind == Integer) && !HadNonMemTransferAccess && !TD.fitsInLegalInteger(BitWidth)) return 0; + // Dynamic accesses on integers aren't yet supported. They need us to shift + // by a dynamic amount which could be difficult to work out as we might not + // know whether to use a left or right shift. + if (ScalarKind == Integer && HadDynamicAccess) + return 0; DEBUG(dbgs() << "CONVERT TO SCALAR INTEGER: " << *AI << "\n"); // Create and insert the integer alloca. NewTy = IntegerType::get(AI->getContext(), BitWidth); } AllocaInst *NewAI = new AllocaInst(NewTy, 0, "", AI->getParent()->begin()); - ConvertUsesToScalar(AI, NewAI, 0); + ConvertUsesToScalar(AI, NewAI, 0, 0); return NewAI; } @@ -411,7 +468,8 @@ bool ConvertToScalarInfo::MergeInVectorType(VectorType *VInTy, /// /// If we see at least one access to the value that is as a vector type, set the /// SawVec flag. -bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { +bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset, + Value* NonConstantIdx) { for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) { Instruction *User = cast<Instruction>(*UI); @@ -441,24 +499,35 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) { if (!onlyUsedByLifetimeMarkers(BCI)) IsNotTrivial = true; // Can't be mem2reg'd. - if (!CanConvertToScalar(BCI, Offset)) + if (!CanConvertToScalar(BCI, Offset, NonConstantIdx)) return false; continue; } if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) { // If this is a GEP with a variable indices, we can't handle it. - if (!GEP->hasAllConstantIndices()) + PointerType* PtrTy = dyn_cast<PointerType>(GEP->getPointerOperandType()); + if (!PtrTy) return false; // Compute the offset that this GEP adds to the pointer. SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); - if (!GEP->getPointerOperandType()->isPointerTy()) - return false; - uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(), + Value *GEPNonConstantIdx = 0; + if (!GEP->hasAllConstantIndices()) { + if (!isa<VectorType>(PtrTy->getElementType())) + return false; + if (NonConstantIdx) + return false; + GEPNonConstantIdx = Indices.pop_back_val(); + if (!GEPNonConstantIdx->getType()->isIntegerTy(32)) + return false; + HadDynamicAccess = true; + } else + GEPNonConstantIdx = NonConstantIdx; + uint64_t GEPOffset = TD.getIndexedOffset(PtrTy, Indices); // See if all uses can be converted. - if (!CanConvertToScalar(GEP, Offset+GEPOffset)) + if (!CanConvertToScalar(GEP, Offset+GEPOffset, GEPNonConstantIdx)) return false; IsNotTrivial = true; // Can't be mem2reg'd. HadNonMemTransferAccess = true; @@ -468,6 +537,9 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { // If this is a constant sized memset of a constant value (e.g. 0) we can // handle it. if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) { + // Store to dynamic index. + if (NonConstantIdx) + return false; // Store of constant value. 
if (!isa<ConstantInt>(MSI->getValue())) return false; @@ -492,6 +564,9 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { // If this is a memcpy or memmove into or out of the whole allocation, we // can handle it like a load or store of the scalar type. if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) { + // Store to dynamic index. + if (NonConstantIdx) + return false; ConstantInt *Len = dyn_cast<ConstantInt>(MTI->getLength()); if (Len == 0 || Len->getZExtValue() != AllocaSize || Offset != 0) return false; @@ -523,12 +598,13 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { /// Offset is an offset from the original alloca, in bits that need to be /// shifted to the right. By the end of this, there should be no uses of Ptr. void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, - uint64_t Offset) { + uint64_t Offset, + Value* NonConstantIdx) { while (!Ptr->use_empty()) { Instruction *User = cast<Instruction>(Ptr->use_back()); if (BitCastInst *CI = dyn_cast<BitCastInst>(User)) { - ConvertUsesToScalar(CI, NewAI, Offset); + ConvertUsesToScalar(CI, NewAI, Offset, NonConstantIdx); CI->eraseFromParent(); continue; } @@ -536,9 +612,11 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) { // Compute the offset that this GEP adds to the pointer. SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); + if (!GEP->hasAllConstantIndices()) + NonConstantIdx = Indices.pop_back_val(); uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(), Indices); - ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8); + ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8, NonConstantIdx); GEP->eraseFromParent(); continue; } @@ -549,7 +627,8 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, // The load is a bit extract from NewAI shifted right by Offset bits. Value *LoadedVal = Builder.CreateLoad(NewAI); Value *NewLoadVal - = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset, Builder); + = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset, + NonConstantIdx, Builder); LI->replaceAllUsesWith(NewLoadVal); LI->eraseFromParent(); continue; @@ -559,7 +638,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, assert(SI->getOperand(0) != Ptr && "Consistency error!"); Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in"); Value *New = ConvertScalar_InsertValue(SI->getOperand(0), Old, Offset, - Builder); + NonConstantIdx, Builder); Builder.CreateStore(New, NewAI); SI->eraseFromParent(); @@ -574,6 +653,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, // transform it into a store of the expanded constant value. 
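+  // For example, a memset of the byte 0xAB over an i32-sized alloca becomes
+  // a store of the splatted constant 0xABABABAB.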
if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) { assert(MSI->getRawDest() == Ptr && "Consistency error!"); + assert(!NonConstantIdx && "Cannot replace dynamic memset with insert"); int64_t SNumBytes = cast<ConstantInt>(MSI->getLength())->getSExtValue(); if (SNumBytes > 0 && (SNumBytes >> 32) == 0) { unsigned NumBytes = static_cast<unsigned>(SNumBytes); @@ -590,7 +670,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in"); Value *New = ConvertScalar_InsertValue( ConstantInt::get(User->getContext(), APVal), - Old, Offset, Builder); + Old, Offset, 0, Builder); Builder.CreateStore(New, NewAI); // If the load we just inserted is now dead, then the memset overwrote @@ -606,6 +686,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, // can handle it like a load or store of the scalar type. if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) { assert(Offset == 0 && "must be store to start of alloca"); + assert(!NonConstantIdx && "Cannot replace dynamic transfer with insert"); // If the source and destination are both to the same alloca, then this is // a noop copy-to-self, just delete it. Otherwise, emit a load and store @@ -678,7 +759,8 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, /// shifted to the right. Value *ConvertToScalarInfo:: ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, - uint64_t Offset, IRBuilder<> &Builder) { + uint64_t Offset, Value* NonConstantIdx, + IRBuilder<> &Builder) { // If the load is of the whole new alloca, no conversion is needed. Type *FromType = FromVal->getType(); if (FromType == ToType && Offset == 0) @@ -700,7 +782,17 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, assert(EltSize*Elt == Offset && "Invalid modulus in validity checking"); } // Return the element extracted out of it. - Value *V = Builder.CreateExtractElement(FromVal, Builder.getInt32(Elt)); + Value *Idx; + if (NonConstantIdx) { + if (Elt) + Idx = Builder.CreateAdd(NonConstantIdx, + Builder.getInt32(Elt), + "dyn.offset"); + else + Idx = NonConstantIdx; + } else + Idx = Builder.getInt32(Elt); + Value *V = Builder.CreateExtractElement(FromVal, Idx); if (V->getType() != ToType) V = Builder.CreateBitCast(V, ToType); return V; @@ -709,23 +801,27 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, // If ToType is a first class aggregate, extract out each of the pieces and // use insertvalue's to form the FCA. 
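+  // For example, for ToType = {i32, i32} each element is recursively
+  // extracted at its StructLayout offset and the results are stitched
+  // together with insertvalue instructions.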
if (StructType *ST = dyn_cast<StructType>(ToType)) { + assert(!NonConstantIdx && + "Dynamic indexing into struct types not supported"); const StructLayout &Layout = *TD.getStructLayout(ST); Value *Res = UndefValue::get(ST); for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i), Offset+Layout.getElementOffsetInBits(i), - Builder); + 0, Builder); Res = Builder.CreateInsertValue(Res, Elt, i); } return Res; } if (ArrayType *AT = dyn_cast<ArrayType>(ToType)) { + assert(!NonConstantIdx && + "Dynamic indexing into array types not supported"); uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType()); Value *Res = UndefValue::get(AT); for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(), - Offset+i*EltSize, Builder); + Offset+i*EltSize, 0, Builder); Res = Builder.CreateInsertValue(Res, Elt, i); } return Res; @@ -791,9 +887,14 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, /// /// Offset is an offset from the original alloca, in bits that need to be /// shifted to the right. +/// +/// NonConstantIdx is an index value if there was a GEP with a non-constant +/// index value. If this is 0 then all GEPs used to find this insert address +/// are constant. Value *ConvertToScalarInfo:: ConvertScalar_InsertValue(Value *SV, Value *Old, - uint64_t Offset, IRBuilder<> &Builder) { + uint64_t Offset, Value* NonConstantIdx, + IRBuilder<> &Builder) { // Convert the stored type to the actual type, shift it left to insert // then 'or' into place. Type *AllocaType = Old->getType(); @@ -814,26 +915,40 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, SV = Builder.CreateBitCast(SV, EltTy); uint64_t EltSize = TD.getTypeAllocSizeInBits(EltTy); unsigned Elt = Offset/EltSize; - return Builder.CreateInsertElement(Old, SV, Builder.getInt32(Elt)); + Value *Idx; + if (NonConstantIdx) { + if (Elt) + Idx = Builder.CreateAdd(NonConstantIdx, + Builder.getInt32(Elt), + "dyn.offset"); + else + Idx = NonConstantIdx; + } else + Idx = Builder.getInt32(Elt); + return Builder.CreateInsertElement(Old, SV, Idx); } // If SV is a first-class aggregate value, insert each value recursively. if (StructType *ST = dyn_cast<StructType>(SV->getType())) { + assert(!NonConstantIdx && + "Dynamic indexing into struct types not supported"); const StructLayout &Layout = *TD.getStructLayout(ST); for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { Value *Elt = Builder.CreateExtractValue(SV, i); Old = ConvertScalar_InsertValue(Elt, Old, Offset+Layout.getElementOffsetInBits(i), - Builder); + 0, Builder); } return Old; } if (ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) { + assert(!NonConstantIdx && + "Dynamic indexing into array types not supported"); uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType()); for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { Value *Elt = Builder.CreateExtractValue(SV, i); - Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, Builder); + Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, 0, Builder); } return Old; } @@ -935,7 +1050,7 @@ public: AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, DIBuilder *DB) : LoadAndStorePromoter(Insts, S), AI(0), DIB(DB) {} - + void run(AllocaInst *AI, const SmallVectorImpl<Instruction*> &Insts) { // Remember which alloca we're promoting (for isInstInList). 
this->AI = AI; @@ -950,18 +1065,18 @@ public: LoadAndStorePromoter::run(Insts); AI->eraseFromParent(); - for (SmallVector<DbgDeclareInst *, 4>::iterator I = DDIs.begin(), + for (SmallVector<DbgDeclareInst *, 4>::iterator I = DDIs.begin(), E = DDIs.end(); I != E; ++I) { DbgDeclareInst *DDI = *I; DDI->eraseFromParent(); } - for (SmallVector<DbgValueInst *, 4>::iterator I = DVIs.begin(), + for (SmallVector<DbgValueInst *, 4>::iterator I = DVIs.begin(), E = DVIs.end(); I != E; ++I) { DbgValueInst *DVI = *I; DVI->eraseFromParent(); } } - + virtual bool isInstInList(Instruction *I, const SmallVectorImpl<Instruction*> &Insts) const { if (LoadInst *LI = dyn_cast<LoadInst>(I)) @@ -970,7 +1085,7 @@ public: } virtual void updateDebugInfo(Instruction *Inst) const { - for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(), + for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(), E = DDIs.end(); I != E; ++I) { DbgDeclareInst *DDI = *I; if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) @@ -978,7 +1093,7 @@ public: else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) ConvertDebugDeclareToDebugValue(DDI, LI, *DIB); } - for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(), + for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(), E = DVIs.end(); I != E; ++I) { DbgValueInst *DVI = *I; Value *Arg = NULL; @@ -1021,12 +1136,12 @@ public: static bool isSafeSelectToSpeculate(SelectInst *SI, const TargetData *TD) { bool TDerefable = SI->getTrueValue()->isDereferenceablePointer(); bool FDerefable = SI->getFalseValue()->isDereferenceablePointer(); - + for (Value::use_iterator UI = SI->use_begin(), UE = SI->use_end(); UI != UE; ++UI) { LoadInst *LI = dyn_cast<LoadInst>(*UI); if (LI == 0 || !LI->isSimple()) return false; - + // Both operands to the select need to be dereferencable, either absolutely // (e.g. allocas) or at this point because we can see other accesses to it. if (!TDerefable && !isSafeToLoadUnconditionally(SI->getTrueValue(), LI, @@ -1036,7 +1151,7 @@ static bool isSafeSelectToSpeculate(SelectInst *SI, const TargetData *TD) { LI->getAlignment(), TD)) return false; } - + return true; } @@ -1067,20 +1182,20 @@ static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) { UI != UE; ++UI) { LoadInst *LI = dyn_cast<LoadInst>(*UI); if (LI == 0 || !LI->isSimple()) return false; - + // For now we only allow loads in the same block as the PHI. This is a // common case that happens when instcombine merges two loads through a PHI. if (LI->getParent() != BB) return false; - + // Ensure that there are no instructions between the PHI and the load that // could store. for (BasicBlock::iterator BBI = PN; &*BBI != LI; ++BBI) if (BBI->mayWriteToMemory()) return false; - + MaxAlign = std::max(MaxAlign, LI->getAlignment()); } - + // Okay, we know that we have one or more loads in the same block as the PHI. // We can transform this if it is safe to push the loads into the predecessor // blocks. 
The only thing to watch out for is that we can't put a possibly @@ -1108,10 +1223,10 @@ static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) { if (InVal->isDereferenceablePointer() || isSafeToLoadUnconditionally(InVal, Pred->getTerminator(), MaxAlign, TD)) continue; - + return false; } - + return true; } @@ -1123,7 +1238,7 @@ static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) { static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { SetVector<Instruction*, SmallVector<Instruction*, 4>, SmallPtrSet<Instruction*, 4> > InstsToRewrite; - + for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end(); UI != UE; ++UI) { User *U = *UI; @@ -1132,7 +1247,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { return false; continue; } - + if (StoreInst *SI = dyn_cast<StoreInst>(U)) { if (SI->getOperand(0) == AI || !SI->isSimple()) return false; // Don't allow a store OF the AI, only INTO the AI. @@ -1146,7 +1261,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { Value *Result = SI->getOperand(1+CI->isZero()); SI->replaceAllUsesWith(Result); SI->eraseFromParent(); - + // This is very rare and we just scrambled the use list of AI, start // over completely. return tryToMakeAllocaBePromotable(AI, TD); @@ -1156,33 +1271,33 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { // loads, then we can transform this by rewriting the select. if (!isSafeSelectToSpeculate(SI, TD)) return false; - + InstsToRewrite.insert(SI); continue; } - + if (PHINode *PN = dyn_cast<PHINode>(U)) { if (PN->use_empty()) { // Dead PHIs can be stripped. InstsToRewrite.insert(PN); continue; } - + // If it is safe to turn "load (phi [AI, ptr, ...])" into a PHI of loads // in the pred blocks, then we can transform this by rewriting the PHI. if (!isSafePHIToSpeculate(PN, TD)) return false; - + InstsToRewrite.insert(PN); continue; } - + if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) { if (onlyUsedByLifetimeMarkers(BCI)) { InstsToRewrite.insert(BCI); continue; } } - + return false; } @@ -1190,7 +1305,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { // we're done! if (InstsToRewrite.empty()) return true; - + // If we have instructions that need to be rewritten for this to be promotable // take care of it now. for (unsigned i = 0, e = InstsToRewrite.size(); i != e; ++i) { @@ -1211,13 +1326,13 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { // loads with a new select. while (!SI->use_empty()) { LoadInst *LI = cast<LoadInst>(SI->use_back()); - + IRBuilder<> Builder(LI); - LoadInst *TrueLoad = + LoadInst *TrueLoad = Builder.CreateLoad(SI->getTrueValue(), LI->getName()+".t"); - LoadInst *FalseLoad = + LoadInst *FalseLoad = Builder.CreateLoad(SI->getFalseValue(), LI->getName()+".f"); - + // Transfer alignment and TBAA info if present. TrueLoad->setAlignment(LI->getAlignment()); FalseLoad->setAlignment(LI->getAlignment()); @@ -1225,18 +1340,18 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { TrueLoad->setMetadata(LLVMContext::MD_tbaa, Tag); FalseLoad->setMetadata(LLVMContext::MD_tbaa, Tag); } - + Value *V = Builder.CreateSelect(SI->getCondition(), TrueLoad, FalseLoad); V->takeName(LI); LI->replaceAllUsesWith(V); LI->eraseFromParent(); } - + // Now that all the loads are gone, the select is gone too. 
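+      // Net effect: "load (select %c, %p, %q)" has become
+      // "select %c, (load %p), (load %q)", leaving only direct loads and
+      // stores of the alloca for mem2reg to promote.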
SI->eraseFromParent(); continue; } - + // Otherwise, we have a PHI node which allows us to push the loads into the // predecessors. PHINode *PN = cast<PHINode>(InstsToRewrite[i]); @@ -1244,7 +1359,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { PN->eraseFromParent(); continue; } - + Type *LoadTy = cast<PointerType>(PN->getType())->getElementType(); PHINode *NewPN = PHINode::Create(LoadTy, PN->getNumIncomingValues(), PN->getName()+".ld", PN); @@ -1254,18 +1369,18 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { LoadInst *SomeLoad = cast<LoadInst>(PN->use_back()); MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); unsigned Align = SomeLoad->getAlignment(); - + // Rewrite all loads of the PN to use the new PHI. while (!PN->use_empty()) { LoadInst *LI = cast<LoadInst>(PN->use_back()); LI->replaceAllUsesWith(NewPN); LI->eraseFromParent(); } - + // Inject loads into all of the pred blocks. Keep track of which blocks we // insert them into in case we have multiple edges from the same block. DenseMap<BasicBlock*, LoadInst*> InsertedLoads; - + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { BasicBlock *Pred = PN->getIncomingBlock(i); LoadInst *&Load = InsertedLoads[Pred]; @@ -1276,13 +1391,13 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { Load->setAlignment(Align); if (TBAATag) Load->setMetadata(LLVMContext::MD_tbaa, TBAATag); } - + NewPN->addIncoming(Load, Pred); } - + PN->eraseFromParent(); } - + ++NumAdjusted; return true; } @@ -1315,7 +1430,7 @@ bool SROA::performPromotion(Function &F) { SSAUpdater SSA; for (unsigned i = 0, e = Allocas.size(); i != e; ++i) { AllocaInst *AI = Allocas[i]; - + // Build list of instructions to promote. for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E; ++UI) @@ -1334,18 +1449,36 @@ bool SROA::performPromotion(Function &F) { /// ShouldAttemptScalarRepl - Decide if an alloca is a good candidate for /// SROA. It must be a struct or array type with a small number of elements. -static bool ShouldAttemptScalarRepl(AllocaInst *AI) { +bool SROA::ShouldAttemptScalarRepl(AllocaInst *AI) { Type *T = AI->getAllocatedType(); - // Do not promote any struct into more than 32 separate vars. + // Do not promote any struct that has too many members. if (StructType *ST = dyn_cast<StructType>(T)) - return ST->getNumElements() <= 32; - // Arrays are much less likely to be safe for SROA; only consider - // them if they are very small. + return ST->getNumElements() <= StructMemberThreshold; + // Do not promote any array that has too many elements. if (ArrayType *AT = dyn_cast<ArrayType>(T)) - return AT->getNumElements() <= 8; + return AT->getNumElements() <= ArrayElementThreshold; return false; } +/// getPointeeAlignment - Compute the minimum alignment of the value pointed +/// to by the given pointer. 
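+/// For example, for a memcpy source that is a defined global variable this
+/// returns the target's preferred alignment for that global, which is what
+/// performScalarRepl compares against the alloca's own alignment below.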
+static unsigned getPointeeAlignment(Value *V, const TargetData &TD) { + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) + if (CE->getOpcode() == Instruction::BitCast || + (CE->getOpcode() == Instruction::GetElementPtr && + cast<GEPOperator>(CE)->hasAllZeroIndices())) + return getPointeeAlignment(CE->getOperand(0), TD); + + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) + if (!GV->isDeclaration()) + return TD.getPreferredAlignment(GV); + + if (PointerType *PT = dyn_cast<PointerType>(V->getType())) + return TD.getABITypeAlignment(PT->getElementType()); + + return 0; +} + // performScalarRepl - This algorithm is a simple worklist driven algorithm, // which runs on all of the alloca instructions in the function, removing them @@ -1379,23 +1512,26 @@ bool SROA::performScalarRepl(Function &F) { continue; // Check to see if this allocation is only modified by a memcpy/memmove from - // a constant global. If this is the case, we can change all users to use + // a constant global whose alignment is equal to or exceeds that of the + // allocation. If this is the case, we can change all users to use // the constant global instead. This is commonly produced by the CFE by // constructs like "void foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A' // is only subsequently read. SmallVector<Instruction *, 4> ToDelete; if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(AI, ToDelete)) { - DEBUG(dbgs() << "Found alloca equal to global: " << *AI << '\n'); - DEBUG(dbgs() << " memcpy = " << *Copy << '\n'); - for (unsigned i = 0, e = ToDelete.size(); i != e; ++i) - ToDelete[i]->eraseFromParent(); - Constant *TheSrc = cast<Constant>(Copy->getSource()); - AI->replaceAllUsesWith(ConstantExpr::getBitCast(TheSrc, AI->getType())); - Copy->eraseFromParent(); // Don't mutate the global. - AI->eraseFromParent(); - ++NumGlobals; - Changed = true; - continue; + if (AI->getAlignment() <= getPointeeAlignment(Copy->getSource(), *TD)) { + DEBUG(dbgs() << "Found alloca equal to global: " << *AI << '\n'); + DEBUG(dbgs() << " memcpy = " << *Copy << '\n'); + for (unsigned i = 0, e = ToDelete.size(); i != e; ++i) + ToDelete[i]->eraseFromParent(); + Constant *TheSrc = cast<Constant>(Copy->getSource()); + AI->replaceAllUsesWith(ConstantExpr::getBitCast(TheSrc, AI->getType())); + Copy->eraseFromParent(); // Don't mutate the global. + AI->eraseFromParent(); + ++NumGlobals; + Changed = true; + continue; + } } // Check to see if we can perform the core SROA transformation. We cannot @@ -1425,8 +1561,8 @@ bool SROA::performScalarRepl(Function &F) { // promoted itself. If so, we don't want to transform it needlessly. Note // that we can't just check based on the type: the alloca may be of an i32 // but that has pointer arithmetic to set byte 3 of it or something. 
- if (AllocaInst *NewAI = - ConvertToScalarInfo((unsigned)AllocaSize, *TD).TryConvert(AI)) { + if (AllocaInst *NewAI = ConvertToScalarInfo( + (unsigned)AllocaSize, *TD, ScalarLoadThreshold).TryConvert(AI)) { NewAI->takeName(AI); AI->eraseFromParent(); ++NumConverted; @@ -1531,12 +1667,12 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType), LIType, false, Info, LI, true /*AllowWholeAccess*/); Info.hasALoadOrStore = true; - + } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) { // Store is ok if storing INTO the pointer, not storing the pointer if (!SI->isSimple() || SI->getOperand(0) == I) return MarkUnsafe(Info, User); - + Type *SIType = SI->getOperand(0)->getType(); isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType), SIType, true, Info, SI, true /*AllowWholeAccess*/); @@ -1553,7 +1689,7 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, if (Info.isUnsafe) return; } } - + /// isSafePHIUseForScalarRepl - If we see a PHI node or select using a pointer /// derived from the alloca, we can often still split the alloca into elements. @@ -1570,10 +1706,10 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset, if (PHINode *PN = dyn_cast<PHINode>(I)) if (!Info.CheckedPHIs.insert(PN)) return; - + for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E; ++UI) { Instruction *User = cast<Instruction>(*UI); - + if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) { isSafePHISelectUseForScalarRepl(BC, Offset, Info); } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) { @@ -1590,12 +1726,12 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset, isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType), LIType, false, Info, LI, false /*AllowWholeAccess*/); Info.hasALoadOrStore = true; - + } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) { // Store is ok if storing INTO the pointer, not storing the pointer if (!SI->isSimple() || SI->getOperand(0) == I) return MarkUnsafe(Info, User); - + Type *SIType = SI->getOperand(0)->getType(); isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType), SIType, true, Info, SI, false /*AllowWholeAccess*/); @@ -1619,6 +1755,8 @@ void SROA::isSafeGEP(GetElementPtrInst *GEPI, gep_type_iterator GEPIt = gep_type_begin(GEPI), E = gep_type_end(GEPI); if (GEPIt == E) return; + bool NonConstant = false; + unsigned NonConstantIdxSize = 0; // Walk through the GEP type indices, checking the types that this indexes // into. @@ -1628,15 +1766,30 @@ void SROA::isSafeGEP(GetElementPtrInst *GEPI, continue; ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPIt.getOperand()); - if (!IdxVal) - return MarkUnsafe(Info, GEPI); + if (!IdxVal) { + // Non-constant GEPs are only a problem on arrays, structs, and pointers. + // Vectors can be dynamically indexed. + // FIXME: Add support for dynamic indexing on arrays. This should be + // ok on any subarrays of the alloca array, e.g., a[0][i] is ok, but a[i][0] + // isn't. + if (!(*GEPIt)->isVectorTy()) + return MarkUnsafe(Info, GEPI); + NonConstant = true; + NonConstantIdxSize = TD->getTypeAllocSize(*GEPIt); + } } // Compute the offset due to this GEP and check if the alloca has a // component element at that offset. SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end()); + // If this GEP is non-constant then the last operand must have been a + // dynamic index into a vector. Pop this now as it has no impact on the + // constant part of the offset.
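+    // For example, in "getelementptr [4 x <4 x float>]* %p, i32 0, i32 1,
+    // i32 %i" only the trailing %i is non-constant; the leading constant
+    // indices still yield a fixed byte offset for the component check.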
+ if (NonConstant) + Indices.pop_back(); Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), Indices); - if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset, 0)) + if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset, + NonConstantIdxSize)) MarkUnsafe(Info, GEPI); } @@ -1741,6 +1894,12 @@ bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size) { if (Offset >= AT->getNumElements() * EltSize) return false; Offset %= EltSize; + } else if (VectorType *VT = dyn_cast<VectorType>(T)) { + EltTy = VT->getElementType(); + EltSize = TD->getTypeAllocSize(EltTy); + if (Offset >= VT->getNumElements() * EltSize) + return false; + Offset %= EltSize; } else { return false; } @@ -1766,12 +1925,12 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, RewriteBitCast(BC, AI, Offset, NewElts); continue; } - + if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) { RewriteGEP(GEPI, AI, Offset, NewElts); continue; } - + if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) { ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength()); uint64_t MemSize = Length->getZExtValue(); @@ -1790,10 +1949,10 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, } continue; } - + if (LoadInst *LI = dyn_cast<LoadInst>(User)) { Type *LIType = LI->getType(); - + if (isCompatibleAggregate(LIType, AI->getAllocatedType())) { // Replace: // %res = load { i32, i32 }* %alloc @@ -1819,7 +1978,7 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, } continue; } - + if (StoreInst *SI = dyn_cast<StoreInst>(User)) { Value *Val = SI->getOperand(0); Type *SIType = Val->getType(); @@ -1846,16 +2005,16 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, } continue; } - + if (isa<SelectInst>(User) || isa<PHINode>(User)) { - // If we have a PHI user of the alloca itself (as opposed to a GEP or + // If we have a PHI user of the alloca itself (as opposed to a GEP or // bitcast) we have to rewrite it. GEP and bitcast uses will be RAUW'd to // the new pointer. if (!isa<AllocaInst>(I)) continue; - + assert(Offset == 0 && NewElts[0] && "Direct alloca use should have a zero offset"); - + // If we have a use of the alloca, we know the derived uses will be // utilizing just the first element of the scalarized result. Insert a // bitcast of the first alloca before the user as required. @@ -1908,9 +2067,16 @@ uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, Offset -= Layout->getElementOffset(Idx); IdxTy = Type::getInt32Ty(T->getContext()); return Idx; + } else if (ArrayType *AT = dyn_cast<ArrayType>(T)) { + T = AT->getElementType(); + uint64_t EltSize = TD->getTypeAllocSize(T); + Idx = Offset / EltSize; + Offset -= Idx * EltSize; + IdxTy = Type::getInt64Ty(T->getContext()); + return Idx; } - ArrayType *AT = cast<ArrayType>(T); - T = AT->getElementType(); + VectorType *VT = cast<VectorType>(T); + T = VT->getElementType(); uint64_t EltSize = TD->getTypeAllocSize(T); Idx = Offset / EltSize; Offset -= Idx * EltSize; @@ -1925,6 +2091,13 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, SmallVector<AllocaInst*, 32> &NewElts) { uint64_t OldOffset = Offset; SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end()); + // If the GEP was dynamic then it must have been a dynamic vector lookup. 
+ // In this case, it must be the last GEP operand which is dynamic so keep that + // aside until we've found the constant GEP offset then add it back in at the + // end. + Value* NonConstantIdx = 0; + if (!GEPI->hasAllConstantIndices()) + NonConstantIdx = Indices.pop_back_val(); Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), Indices); RewriteForScalarRepl(GEPI, AI, Offset, NewElts); @@ -1951,6 +2124,17 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, uint64_t EltIdx = FindElementAndOffset(T, EltOffset, IdxTy); NewArgs.push_back(ConstantInt::get(IdxTy, EltIdx)); } + if (NonConstantIdx) { + Type* GepTy = T; + // This GEP has a dynamic index. We need to add "i32 0" to index through + // any structs or arrays in the original type until we get to the vector + // to index. + while (!isa<VectorType>(GepTy)) { + NewArgs.push_back(Constant::getNullValue(i32Ty)); + GepTy = cast<CompositeType>(GepTy)->getTypeAtIndex(0U); + } + NewArgs.push_back(NonConstantIdx); + } Instruction *Val = NewElts[Idx]; if (NewArgs.size() > 1) { Val = GetElementPtrInst::CreateInBounds(Val, NewArgs, "", GEPI); @@ -2202,7 +2386,7 @@ void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy); IRBuilder<> Builder(SI); - + // Handle tail padding by extending the operand if (TD->getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits) SrcVal = Builder.CreateZExt(SrcVal, @@ -2464,7 +2648,7 @@ bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) { return false; } } - + return true; } diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp index a66b3e3..d13e4ab 100644 --- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -67,7 +67,7 @@ static void ChangeToUnreachable(Instruction *I, bool UseLLVMTrap) { // nodes. for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) (*SI)->removePredecessor(BB); - + // Insert a call to llvm.trap right before this. This turns the undefined // behavior into a hard fail instead of falling through into random code. if (UseLLVMTrap) { @@ -77,7 +77,7 @@ static void ChangeToUnreachable(Instruction *I, bool UseLLVMTrap) { CallTrap->setDebugLoc(I->getDebugLoc()); } new UnreachableInst(I->getContext(), I); - + // All instructions after this are dead. BasicBlock::iterator BBI = I, BBE = BB->end(); while (BBI != BBE) { @@ -89,7 +89,6 @@ static void ChangeToUnreachable(Instruction *I, bool UseLLVMTrap) { /// ChangeToCall - Convert the specified invoke into a normal call. 
static void ChangeToCall(InvokeInst *II) { - BasicBlock *BB = II->getParent(); SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, "", II); NewCall->takeName(II); @@ -102,19 +101,19 @@ static void ChangeToCall(InvokeInst *II) { BranchInst::Create(II->getNormalDest(), II); // Update PHI nodes in the unwind destination - II->getUnwindDest()->removePredecessor(BB); - BB->getInstList().erase(II); + II->getUnwindDest()->removePredecessor(II->getParent()); + II->eraseFromParent(); } static bool MarkAliveBlocks(BasicBlock *BB, SmallPtrSet<BasicBlock*, 128> &Reachable) { - + SmallVector<BasicBlock*, 128> Worklist; Worklist.push_back(BB); bool Changed = false; do { BB = Worklist.pop_back_val(); - + if (!Reachable.insert(BB)) continue; @@ -136,7 +135,7 @@ static bool MarkAliveBlocks(BasicBlock *BB, break; } } - + // Store to undef and store to null are undefined and used to signal that // they should be changed to unreachable by passes that can't modify the // CFG. @@ -145,7 +144,7 @@ static bool MarkAliveBlocks(BasicBlock *BB, if (SI->isVolatile()) continue; Value *Ptr = SI->getOperand(1); - + if (isa<UndefValue>(Ptr) || (isa<ConstantPointerNull>(Ptr) && SI->getPointerAddressSpace() == 0)) { @@ -157,11 +156,22 @@ static bool MarkAliveBlocks(BasicBlock *BB, } // Turn invokes that call 'nounwind' functions into ordinary calls. - if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) - if (II->doesNotThrow()) { - ChangeToCall(II); + if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) { + Value *Callee = II->getCalledValue(); + if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) { + ChangeToUnreachable(II, true); + Changed = true; + } else if (II->doesNotThrow()) { + if (II->use_empty() && II->onlyReadsMemory()) { + // jump to the normal destination branch. + BranchInst::Create(II->getNormalDest(), II); + II->getUnwindDest()->removePredecessor(II->getParent()); + II->eraseFromParent(); + } else + ChangeToCall(II); Changed = true; } + } Changed |= ConstantFoldTerminator(BB, true); for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) @@ -170,38 +180,38 @@ static bool MarkAliveBlocks(BasicBlock *BB, return Changed; } -/// RemoveUnreachableBlocksFromFn - Remove blocks that are not reachable, even -/// if they are in a dead cycle. Return true if a change was made, false +/// RemoveUnreachableBlocksFromFn - Remove blocks that are not reachable, even +/// if they are in a dead cycle. Return true if a change was made, false /// otherwise. static bool RemoveUnreachableBlocksFromFn(Function &F) { SmallPtrSet<BasicBlock*, 128> Reachable; bool Changed = MarkAliveBlocks(F.begin(), Reachable); - + // If there are unreachable blocks in the CFG... if (Reachable.size() == F.size()) return Changed; - + assert(Reachable.size() < F.size()); NumSimpl += F.size()-Reachable.size(); - + // Loop over all of the basic blocks that are not reachable, dropping all of // their internal references... 
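+  // (The references must be dropped before any block is deleted: unreachable
+  // blocks can form cycles that still refer to one another, so a direct
+  // erase would delete values that other dead blocks still use.)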
for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) { if (Reachable.count(BB)) continue; - + for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) if (Reachable.count(*SI)) (*SI)->removePredecessor(BB); BB->dropAllReferences(); } - + for (Function::iterator I = ++F.begin(); I != F.end();) if (!Reachable.count(I)) I = F.getBasicBlockList().erase(I); else ++I; - + return true; } @@ -209,17 +219,17 @@ static bool RemoveUnreachableBlocksFromFn(Function &F) { /// node) return blocks, merge them together to promote recursive block merging. static bool MergeEmptyReturnBlocks(Function &F) { bool Changed = false; - + BasicBlock *RetBlock = 0; - + // Scan all the blocks in the function, looking for empty return blocks. for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; ) { BasicBlock &BB = *BBI++; - + // Only look at return blocks. ReturnInst *Ret = dyn_cast<ReturnInst>(BB.getTerminator()); if (Ret == 0) continue; - + // Only look at the block if it is empty or the only other thing in it is a // single PHI node that is the operand to the return. if (Ret != &BB.front()) { @@ -241,21 +251,21 @@ static bool MergeEmptyReturnBlocks(Function &F) { RetBlock = &BB; continue; } - + // Otherwise, we found a duplicate return block. Merge the two. Changed = true; - + // Case when there is no input to the return or when the returned values // agree is trivial. Note that they can't agree if there are phis in the // blocks. if (Ret->getNumOperands() == 0 || - Ret->getOperand(0) == + Ret->getOperand(0) == cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0)) { BB.replaceAllUsesWith(RetBlock); BB.eraseFromParent(); continue; } - + // If the canonical return block has no PHI node, create one now. PHINode *RetBlockPHI = dyn_cast<PHINode>(RetBlock->begin()); if (RetBlockPHI == 0) { @@ -264,12 +274,12 @@ static bool MergeEmptyReturnBlocks(Function &F) { RetBlockPHI = PHINode::Create(Ret->getOperand(0)->getType(), std::distance(PB, PE), "merge", &RetBlock->front()); - + for (pred_iterator PI = PB; PI != PE; ++PI) RetBlockPHI->addIncoming(InVal, *PI); RetBlock->getTerminator()->setOperand(0, RetBlockPHI); } - + // Turn BB into a block that just unconditionally branches to the return // block. This handles the case when the two return blocks have a common // predecessor but that return different things. @@ -277,7 +287,7 @@ static bool MergeEmptyReturnBlocks(Function &F) { BB.getTerminator()->eraseFromParent(); BranchInst::Create(RetBlock, &BB); } - + return Changed; } @@ -288,7 +298,7 @@ static bool MergeEmptyReturnBlocks(Function &F) { bool LocalChange = true; while (LocalChange) { LocalChange = false; - + // Loop over all of the basic blocks and remove them if they are unneeded... // for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) { @@ -317,7 +327,7 @@ bool CFGSimplifyPass::runOnFunction(Function &F) { // IterativeSimplifyCFG can (rarely) make some loops dead. If this happens, // RemoveUnreachableBlocksFromFn is needed to nuke them, which means we should // iterate between the two optimizations. We structure the code like this to - // avoid reruning IterativeSimplifyCFG if the second pass of + // avoid rerunning IterativeSimplifyCFG if the second pass of
if (!RemoveUnreachableBlocksFromFn(F)) return true; diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp index f7b6941..a1a8a41 100644 --- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -18,20 +18,20 @@ #define DEBUG_TYPE "simplify-libcalls" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" +#include "llvm/IRBuilder.h" #include "llvm/Intrinsics.h" #include "llvm/LLVMContext.h" #include "llvm/Module.h" #include "llvm/Pass.h" -#include "llvm/Support/IRBuilder.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/StringMap.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Config/config.h" // FIXME: Shouldn't depend on host! using namespace llvm; @@ -100,7 +100,7 @@ static bool IsOnlyUsedInZeroEqualityComparison(Value *V) { } return true; } - + static bool CallHasFloatingPointArgument(const CallInst *CI) { for (CallInst::const_op_iterator it = CI->op_begin(), e = CI->op_end(); it != e; ++it) { @@ -256,7 +256,7 @@ struct StrChrOpt : public LibCallOptimization { ConstantInt::get(TD->getIntPtrType(*Context), Len), B, TD); } - + // Otherwise, the character is a constant, see if the first argument is // a string literal. If so, we can constant fold. StringRef Str; @@ -459,6 +459,50 @@ struct StrCpyOpt : public LibCallOptimization { }; //===---------------------------------------===// +// 'stpcpy' Optimizations + +struct StpCpyOpt: public LibCallOptimization { + bool OptChkCall; // True if it's optimizing a __stpcpy_chk libcall. + + StpCpyOpt(bool c) : OptChkCall(c) {} + + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Verify the "stpcpy" function prototype. + unsigned NumParams = OptChkCall ? 3 : 2; + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != NumParams || + FT->getReturnType() != FT->getParamType(0) || + FT->getParamType(0) != FT->getParamType(1) || + FT->getParamType(0) != B.getInt8PtrTy()) + return 0; + + // These optimizations require TargetData. + if (!TD) return 0; + + Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); + if (Dst == Src) // stpcpy(x,x) -> x+strlen(x) + return B.CreateInBoundsGEP(Dst, EmitStrLen(Src, B, TD)); + + // See if we can get the length of the input string. + uint64_t Len = GetStringLength(Src); + if (Len == 0) return 0; + + Value *LenV = ConstantInt::get(TD->getIntPtrType(*Context), Len); + Value *DstEnd = B.CreateGEP(Dst, + ConstantInt::get(TD->getIntPtrType(*Context), + Len - 1)); + + // We have enough information to now generate the memcpy call to do the + // copy for us. Make a memcpy to copy the nul byte with align = 1. 
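// (For reference: stpcpy returns a pointer to the terminating NUL it
//  stores, not to Dst. A minimal worked example with Src == "abc", where
//  GetStringLength reports Len == 4:
//    memcpy(Dst, "abc", 4);   // LenV bytes, i.e. the NUL is copied too
//    return Dst + 3;          // DstEnd == Dst + (Len - 1), the NUL's address
//  which is exactly what the code below emits.)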
+ if (OptChkCall) + EmitMemCpyChk(Dst, Src, LenV, CI->getArgOperand(2), B, TD); + else + B.CreateMemCpy(Dst, Src, LenV, 1); + return DstEnd; + } +}; + +//===---------------------------------------===// // 'strncpy' Optimizations struct StrNCpyOpt : public LibCallOptimization { @@ -1470,12 +1514,15 @@ namespace { /// class SimplifyLibCalls : public FunctionPass { TargetLibraryInfo *TLI; - + StringMap<LibCallOptimization*> Optimizations; // String and Memory LibCall Optimizations StrCatOpt StrCat; StrNCatOpt StrNCat; StrChrOpt StrChr; StrRChrOpt StrRChr; - StrCmpOpt StrCmp; StrNCmpOpt StrNCmp; StrCpyOpt StrCpy; StrCpyOpt StrCpyChk; - StrNCpyOpt StrNCpy; StrLenOpt StrLen; StrPBrkOpt StrPBrk; + StrCmpOpt StrCmp; StrNCmpOpt StrNCmp; + StrCpyOpt StrCpy; StrCpyOpt StrCpyChk; + StpCpyOpt StpCpy; StpCpyOpt StpCpyChk; + StrNCpyOpt StrNCpy; + StrLenOpt StrLen; StrPBrkOpt StrPBrk; StrToOpt StrTo; StrSpnOpt StrSpn; StrCSpnOpt StrCSpn; StrStrOpt StrStr; MemCmpOpt MemCmp; MemCpyOpt MemCpy; MemMoveOpt MemMove; MemSetOpt MemSet; // Math Library Optimizations @@ -1487,11 +1534,12 @@ namespace { SPrintFOpt SPrintF; PrintFOpt PrintF; FWriteOpt FWrite; FPutsOpt FPuts; FPrintFOpt FPrintF; PutsOpt Puts; - + bool Modified; // This is only used by doInitialization. public: static char ID; // Pass identification - SimplifyLibCalls() : FunctionPass(ID), StrCpy(false), StrCpyChk(true) { + SimplifyLibCalls() : FunctionPass(ID), StrCpy(false), StrCpyChk(true), + StpCpy(false), StpCpyChk(true) { initializeSimplifyLibCallsPass(*PassRegistry::getPassRegistry()); } void AddOpt(LibFunc::Func F, LibCallOptimization* Opt); @@ -1542,6 +1590,7 @@ void SimplifyLibCalls::InitOptimizations() { Optimizations["strncmp"] = &StrNCmp; Optimizations["strcpy"] = &StrCpy; Optimizations["strncpy"] = &StrNCpy; + Optimizations["stpcpy"] = &StpCpy; Optimizations["strlen"] = &StrLen; Optimizations["strpbrk"] = &StrPBrk; Optimizations["strtol"] = &StrTo; @@ -1561,6 +1610,7 @@ void SimplifyLibCalls::InitOptimizations() { // _chk variants of String and Memory LibCall Optimizations. 
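// (The _chk entries are the _FORTIFY_SOURCE variants: they take the
//  destination object size as an extra argument, which is why StpCpyOpt
//  above emits EmitMemCpyChk, keeping the runtime size check, instead of
//  a plain memcpy when OptChkCall is set.)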
Optimizations["__strcpy_chk"] = &StrCpyChk; + Optimizations["__stpcpy_chk"] = &StpCpyChk; // Math Library Optimizations Optimizations["cosf"] = &Cos; @@ -1717,7 +1767,7 @@ void SimplifyLibCalls::setDoesNotAlias(Function &F, unsigned n) { void SimplifyLibCalls::inferPrototypeAttributes(Function &F) { FunctionType *FTy = F.getFunctionType(); - + StringRef Name = F.getName(); switch (Name[0]) { case 's': @@ -1746,6 +1796,7 @@ void SimplifyLibCalls::inferPrototypeAttributes(Function &F) { Name == "strtold" || Name == "strncat" || Name == "strncpy" || + Name == "stpncpy" || Name == "strtoull") { if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) @@ -2406,10 +2457,6 @@ bool SimplifyLibCalls::doInitialization(Module &M) { // * sqrt(Nroot(x)) -> pow(x,1/(2*N)) // * sqrt(pow(x,y)) -> pow(|x|,y*0.5) // -// stpcpy: -// * stpcpy(str, "literal") -> -// llvm.memcpy(str,"literal",strlen("literal")+1,1) -// // strchr: // * strchr(p, 0) -> strlen(p) // tan, tanf, tanl: diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp index ef65c0a..34f1d6c 100644 --- a/lib/Transforms/Scalar/Sink.cpp +++ b/lib/Transforms/Scalar/Sink.cpp @@ -27,6 +27,7 @@ using namespace llvm; STATISTIC(NumSunk, "Number of instructions sunk"); +STATISTIC(NumSinkIter, "Number of sinking iterations"); namespace { class Sinking : public FunctionPass { @@ -39,9 +40,9 @@ namespace { Sinking() : FunctionPass(ID) { initializeSinkingPass(*PassRegistry::getPassRegistry()); } - + virtual bool runOnFunction(Function &F); - + virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); FunctionPass::getAnalysisUsage(AU); @@ -55,9 +56,10 @@ namespace { bool ProcessBlock(BasicBlock &BB); bool SinkInstruction(Instruction *I, SmallPtrSet<Instruction *, 8> &Stores); bool AllUsesDominatedByBlock(Instruction *Inst, BasicBlock *BB) const; + bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo) const; }; } // end anonymous namespace - + char Sinking::ID = 0; INITIALIZE_PASS_BEGIN(Sinking, "sink", "Code sinking", false, false) INITIALIZE_PASS_DEPENDENCY(LoopInfo) @@ -69,7 +71,7 @@ FunctionPass *llvm::createSinkingPass() { return new Sinking(); } /// AllUsesDominatedByBlock - Return true if all uses of the specified value /// occur in blocks dominated by the specified block. -bool Sinking::AllUsesDominatedByBlock(Instruction *Inst, +bool Sinking::AllUsesDominatedByBlock(Instruction *Inst, BasicBlock *BB) const { // Ignoring debug uses is necessary so debug info doesn't affect the code. // This may leave a referencing dbg_value in the original block, before @@ -98,20 +100,19 @@ bool Sinking::runOnFunction(Function &F) { LI = &getAnalysis<LoopInfo>(); AA = &getAnalysis<AliasAnalysis>(); - bool EverMadeChange = false; - - while (1) { - bool MadeChange = false; + bool MadeChange, EverMadeChange = false; + do { + MadeChange = false; + DEBUG(dbgs() << "Sinking iteration " << NumSinkIter << "\n"); // Process all basic blocks. - for (Function::iterator I = F.begin(), E = F.end(); + for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) MadeChange |= ProcessBlock(*I); - - // If this iteration over the code changed anything, keep iterating. - if (!MadeChange) break; - EverMadeChange = true; - } + EverMadeChange |= MadeChange; + NumSinkIter++; + } while (MadeChange); + return EverMadeChange; } @@ -120,8 +121,8 @@ bool Sinking::ProcessBlock(BasicBlock &BB) { if (BB.getTerminator()->getNumSuccessors() <= 1 || BB.empty()) return false; // Don't bother sinking code out of unreachable blocks. 
In addition to being - // unprofitable, it can also lead to infinite looping, because in an unreachable - // loop there may be nowhere to stop. + // unprofitable, it can also lead to infinite looping, because in an + // unreachable loop there may be nowhere to stop. if (!DT->isReachableFromEntry(&BB)) return false; bool MadeChange = false; @@ -133,7 +134,7 @@ bool Sinking::ProcessBlock(BasicBlock &BB) { SmallPtrSet<Instruction *, 8> Stores; do { Instruction *Inst = I; // The instruction to sink. - + // Predecrement I (if it's not begin) so that it isn't invalidated by // sinking. ProcessedBegin = I == BB.begin(); @@ -145,10 +146,10 @@ bool Sinking::ProcessBlock(BasicBlock &BB) { if (SinkInstruction(Inst, Stores)) ++NumSunk, MadeChange = true; - + // If we just processed the first instruction in the block, we're done. } while (!ProcessedBegin); - + return MadeChange; } @@ -174,6 +175,45 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA, return true; } +/// IsAcceptableTarget - Return true if it is possible to sink the instruction +/// in the specified basic block. +bool Sinking::IsAcceptableTarget(Instruction *Inst, + BasicBlock *SuccToSinkTo) const { + assert(Inst && "Instruction to be sunk is null"); + assert(SuccToSinkTo && "Candidate sink target is null"); + + // It is not possible to sink an instruction into its own block. This can + // happen with loops. + if (Inst->getParent() == SuccToSinkTo) + return false; + + // If the block has multiple predecessors, this would introduce computation + // on different code paths. We could split the critical edge, but for now we + // just punt. + // FIXME: Split critical edges if not backedges. + if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) { + // We cannot sink a load across a critical edge - there may be stores in + // other code paths. + if (!isSafeToSpeculativelyExecute(Inst)) + return false; + + // We don't want to sink across a critical edge if we don't dominate the + // successor. We could be introducing calculations to new code paths. + if (!DT->dominates(Inst->getParent(), SuccToSinkTo)) + return false; + + // Don't sink instructions into a loop. + Loop *succ = LI->getLoopFor(SuccToSinkTo); + Loop *cur = LI->getLoopFor(Inst->getParent()); + if (succ != 0 && succ != cur) + return false; + } + + // Finally, check that all the uses of the instruction are actually + // dominated by the candidate + return AllUsesDominatedByBlock(Inst, SuccToSinkTo); +} + /// SinkInstruction - Determine whether it is safe to sink the specified machine /// instruction out of its current block into a successor. bool Sinking::SinkInstruction(Instruction *Inst, @@ -181,7 +221,7 @@ bool Sinking::SinkInstruction(Instruction *Inst, // Check if it's safe to move the instruction. if (!isSafeToMove(Inst, AA, Stores)) return false; - + // FIXME: This should include support for sinking instructions within the // block they are currently in to shorten the live ranges. We often get // instructions sunk into the top of a large block, but it would be better to @@ -189,86 +229,42 @@ bool Sinking::SinkInstruction(Instruction *Inst, // be careful not to *increase* register pressure though, e.g. sinking // "x = y + z" down if it kills y and z would increase the live ranges of y // and z and only shrink the live range of x. - - // Loop over all the operands of the specified instruction. If there is - // anything we can't handle, bail out. 
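// (Candidate search used by the rewrite below, in outline: first walk the
//  dominator-tree children of Inst's block and take the first child whose
//  immediate dominator is that block and which IsAcceptableTarget accepts;
//  only if that finds nothing are the plain CFG successors scanned.)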
- BasicBlock *ParentBlock = Inst->getParent(); - + // SuccToSinkTo - This is the successor to sink this instruction to, once we // decide. BasicBlock *SuccToSinkTo = 0; - - // FIXME: This picks a successor to sink into based on having one - // successor that dominates all the uses. However, there are cases where - // sinking can happen but where the sink point isn't a successor. For - // example: - // x = computation - // if () {} else {} - // use x - // the instruction could be sunk over the whole diamond for the - // if/then/else (or loop, etc), allowing it to be sunk into other blocks - // after that. - + // Instructions can only be sunk if all their uses are in blocks // dominated by one of the successors. - // Look at all the successors and decide which one - // we should sink to. - for (succ_iterator SI = succ_begin(ParentBlock), - E = succ_end(ParentBlock); SI != E; ++SI) { - if (AllUsesDominatedByBlock(Inst, *SI)) { - SuccToSinkTo = *SI; - break; - } + // Look at all the postdominators and see if we can sink it in one. + DomTreeNode *DTN = DT->getNode(Inst->getParent()); + for (DomTreeNode::iterator I = DTN->begin(), E = DTN->end(); + I != E && SuccToSinkTo == 0; ++I) { + BasicBlock *Candidate = (*I)->getBlock(); + if ((*I)->getIDom()->getBlock() == Inst->getParent() && + IsAcceptableTarget(Inst, Candidate)) + SuccToSinkTo = Candidate; + } + + // If no suitable postdominator was found, look at all the successors and + // decide which one we should sink to, if any. + for (succ_iterator I = succ_begin(Inst->getParent()), + E = succ_end(Inst->getParent()); I != E && SuccToSinkTo == 0; ++I) { + if (IsAcceptableTarget(Inst, *I)) + SuccToSinkTo = *I; } - + // If we couldn't find a block to sink to, ignore this instruction. if (SuccToSinkTo == 0) return false; - - // It is not possible to sink an instruction into its own block. This can - // happen with loops. - if (Inst->getParent() == SuccToSinkTo) - return false; - - DEBUG(dbgs() << "Sink instr " << *Inst); - DEBUG(dbgs() << "to block "; - WriteAsOperand(dbgs(), SuccToSinkTo, false)); - - // If the block has multiple predecessors, this would introduce computation on - // a path that it doesn't already exist. We could split the critical edge, - // but for now we just punt. - // FIXME: Split critical edges if not backedges. - if (SuccToSinkTo->getUniquePredecessor() != ParentBlock) { - // We cannot sink a load across a critical edge - there may be stores in - // other code paths. - if (!isSafeToSpeculativelyExecute(Inst)) { - DEBUG(dbgs() << " *** PUNTING: Wont sink load along critical edge.\n"); - return false; - } - // We don't want to sink across a critical edge if we don't dominate the - // successor. We could be introducing calculations to new code paths. - if (!DT->dominates(ParentBlock, SuccToSinkTo)) { - DEBUG(dbgs() << " *** PUNTING: Critical edge found\n"); - return false; - } - - // Don't sink instructions into a loop. - if (LI->isLoopHeader(SuccToSinkTo)) { - DEBUG(dbgs() << " *** PUNTING: Loop header found\n"); - return false; - } + DEBUG(dbgs() << "Sink" << *Inst << " ("; + WriteAsOperand(dbgs(), Inst->getParent(), false); + dbgs() << " -> "; + WriteAsOperand(dbgs(), SuccToSinkTo, false); + dbgs() << ")\n"); - // Otherwise we are OK with sinking along a critical edge. - DEBUG(dbgs() << "Sinking along critical edge.\n"); - } - - // Determine where to insert into. Skip phi nodes. 
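// (The hand-rolled scan below is replaced by
//    Inst->moveBefore(SuccToSinkTo->getFirstInsertionPt());
//  getFirstInsertionPt() skips leading PHI nodes and also refuses to place
//  anything before a landingpad instruction.)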
- BasicBlock::iterator InsertPos = SuccToSinkTo->begin(); - while (InsertPos != SuccToSinkTo->end() && isa<PHINode>(InsertPos)) - ++InsertPos; - // Move the instruction. - Inst->moveBefore(InsertPos); + Inst->moveBefore(SuccToSinkTo->getFirstInsertionPt()); return true; } diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index e21eb9d..6557d63 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -172,7 +172,7 @@ bool TailCallElim::runOnFunction(Function &F) { FunctionContainsEscapingAllocas |= CheckForEscapingAllocas(BB, CannotTCETailMarkedCall); } - + /// FIXME: The code generator produces really bad code when an 'escaping /// alloca' is changed from being a static alloca to being a dynamic alloca. /// Until this is resolved, disable this transformation if that would ever @@ -234,7 +234,7 @@ bool TailCallElim::CanMoveAboveCall(Instruction *I, CallInst *CI) { // call does not mod/ref the memory location being processed. if (I->mayHaveSideEffects()) // This also handles volatile loads. return false; - + if (LoadInst *L = dyn_cast<LoadInst>(I)) { // Loads may always be moved above calls without side effects. if (CI->mayHaveSideEffects()) { @@ -364,7 +364,7 @@ TailCallElim::FindTRECandidate(Instruction *TI, if (&BB->front() == TI) // Make sure there is something before the terminator. return 0; - + // Scan backwards from the return, checking to see if there is a tail call in // this block. If so, set CI to it. CallInst *CI = 0; @@ -388,10 +388,10 @@ TailCallElim::FindTRECandidate(Instruction *TI, // double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call // and disable this xform in this case, because the code generator will // lower the call to fabs into inline code. - if (BB == &F->getEntryBlock() && + if (BB == &F->getEntryBlock() && FirstNonDbg(BB->front()) == CI && FirstNonDbg(llvm::next(BB->begin())) == TI && - callIsSmall(F)) { + callIsSmall(CI)) { // A single-block function with just a call and a return. Check that // the arguments match. CallSite::arg_iterator I = CallSite(CI).arg_begin(), @@ -432,7 +432,7 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, BasicBlock::iterator BBI = CI; for (++BBI; &*BBI != Ret; ++BBI) { if (CanMoveAboveCall(BBI, CI)) continue; - + // If we can't move the instruction above the call, it might be because it // is an associative and commutative operation that could be transformed // using accumulator recursion elimination. Check to see if this is the diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp index 3859a1a..5576432 100644 --- a/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -671,12 +671,3 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB, return cast<ReturnInst>(NewRet); } -/// GetFirstDebugLocInBasicBlock - Return first valid DebugLoc entry in a -/// given basic block. -DebugLoc llvm::GetFirstDebugLocInBasicBlock(const BasicBlock *BB) { - if (const Instruction *I = BB->getFirstNonPHI()) - return I->getDebugLoc(); - // Scanning entire block may be too expensive, if the first instruction - // does not have valid location info. 
- return DebugLoc(); -} diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp index f752d79..6b04e3d 100644 --- a/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -117,33 +117,38 @@ bool llvm::isCriticalEdge(const TerminatorInst *TI, unsigned SuccNum, return false; } -/// CreatePHIsForSplitLoopExit - When a loop exit edge is split, LCSSA form +/// createPHIsForSplitLoopExit - When a loop exit edge is split, LCSSA form /// may require new PHIs in the new exit block. This function inserts the -/// new PHIs, as needed. Preds is a list of preds inside the loop, SplitBB +/// new PHIs, as needed. Preds is a list of preds inside the loop, SplitBB /// is the new loop exit block, and DestBB is the old loop exit, now the /// successor of SplitBB. -static void CreatePHIsForSplitLoopExit(SmallVectorImpl<BasicBlock *> &Preds, +static void createPHIsForSplitLoopExit(ArrayRef<BasicBlock *> Preds, BasicBlock *SplitBB, BasicBlock *DestBB) { // SplitBB shouldn't have anything non-trivial in it yet. - assert(SplitBB->getFirstNonPHI() == SplitBB->getTerminator() && - "SplitBB has non-PHI nodes!"); + assert((SplitBB->getFirstNonPHI() == SplitBB->getTerminator() || + SplitBB->isLandingPad()) && "SplitBB has non-PHI nodes!"); - // For each PHI in the destination block... + // For each PHI in the destination block. for (BasicBlock::iterator I = DestBB->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I) { unsigned Idx = PN->getBasicBlockIndex(SplitBB); Value *V = PN->getIncomingValue(Idx); + // If the input is a PHI which already satisfies LCSSA, don't create // a new one. if (const PHINode *VP = dyn_cast<PHINode>(V)) if (VP->getParent() == SplitBB) continue; + // Otherwise a new PHI is needed. Create one and populate it. - PHINode *NewPN = PHINode::Create(PN->getType(), Preds.size(), "split", - SplitBB->getTerminator()); + PHINode *NewPN = + PHINode::Create(PN->getType(), Preds.size(), "split", + SplitBB->isLandingPad() ? + SplitBB->begin() : SplitBB->getTerminator()); for (unsigned i = 0, e = Preds.size(); i != e; ++i) NewPN->addIncoming(V, Preds[i]); + // Update the original PHI. PN->setIncomingValue(Idx, NewPN); } @@ -168,7 +173,8 @@ static void CreatePHIsForSplitLoopExit(SmallVectorImpl<BasicBlock *> &Preds, /// BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, Pass *P, bool MergeIdenticalEdges, - bool DontDeleteUselessPhis) { + bool DontDeleteUselessPhis, + bool SplitLandingPads) { if (!isCriticalEdge(TI, SuccNum, MergeIdenticalEdges)) return 0; assert(!isa<IndirectBrInst>(TI) && @@ -335,11 +341,8 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, "Split point for loop exit is contained in loop!"); // Update LCSSA form in the newly created exit block. - if (P->mustPreserveAnalysisID(LCSSAID)) { - SmallVector<BasicBlock *, 1> OrigPred; - OrigPred.push_back(TIBB); - CreatePHIsForSplitLoopExit(OrigPred, NewBB, DestBB); - } + if (P->mustPreserveAnalysisID(LCSSAID)) + createPHIsForSplitLoopExit(TIBB, NewBB, DestBB); // For each unique exit block... // FIXME: This code is functionally equivalent to the corresponding @@ -371,10 +374,19 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, // getUniqueExitBlocks above because that depends on LoopSimplify // form, which we're in the process of restoring! 
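// (Two cases are handled below: an ordinary exit block is split with
//  SplitBlockPredecessors, while a landing-pad exit needs
//  SplitLandingPadPredecessors, which produces up to two new blocks; the
//  LCSSA PHIs are then recreated in the first of them, NewBBs[0].)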
if (!Preds.empty() && HasPredOutsideOfLoop) { - BasicBlock *NewExitBB = - SplitBlockPredecessors(Exit, Preds, "split", P); - if (P->mustPreserveAnalysisID(LCSSAID)) - CreatePHIsForSplitLoopExit(Preds, NewExitBB, Exit); + if (!Exit->isLandingPad()) { + BasicBlock *NewExitBB = + SplitBlockPredecessors(Exit, Preds, "split", P); + if (P->mustPreserveAnalysisID(LCSSAID)) + createPHIsForSplitLoopExit(Preds, NewExitBB, Exit); + } else if (SplitLandingPads) { + SmallVector<BasicBlock*, 8> NewBBs; + SplitLandingPadPredecessors(Exit, Preds, + ".split1", ".split2", + P, NewBBs); + if (P->mustPreserveAnalysisID(LCSSAID)) + createPHIsForSplitLoopExit(Preds, NewBBs[0], Exit); + } } } } diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp index a808303..27f7724 100644 --- a/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/lib/Transforms/Utils/BuildLibCalls.cpp @@ -12,18 +12,18 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/BuildLibCalls.h" -#include "llvm/Type.h" #include "llvm/Constants.h" #include "llvm/Function.h" +#include "llvm/IRBuilder.h" +#include "llvm/Intrinsics.h" #include "llvm/Intrinsics.h" #include "llvm/LLVMContext.h" +#include "llvm/LLVMContext.h" #include "llvm/Module.h" -#include "llvm/Support/IRBuilder.h" +#include "llvm/Type.h" +#include "llvm/ADT/SmallString.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/LLVMContext.h" -#include "llvm/Intrinsics.h" -#include "llvm/ADT/SmallString.h" using namespace llvm; @@ -42,7 +42,7 @@ Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const TargetData *TD) { Attribute::NoUnwind); LLVMContext &Context = B.GetInsertBlock()->getContext(); - Constant *StrLen = M->getOrInsertFunction("strlen", AttrListPtr::get(AWI, 2), + Constant *StrLen = M->getOrInsertFunction("strlen", AttrListPtr::get(AWI), TD->getIntPtrType(Context), B.getInt8PtrTy(), NULL); @@ -64,7 +64,7 @@ Value *llvm::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B, Type *I8Ptr = B.getInt8PtrTy(); Type *I32Ty = B.getInt32Ty(); - Constant *StrChr = M->getOrInsertFunction("strchr", AttrListPtr::get(&AWI, 1), + Constant *StrChr = M->getOrInsertFunction("strchr", AttrListPtr::get(AWI), I8Ptr, I8Ptr, I32Ty, NULL); CallInst *CI = B.CreateCall2(StrChr, CastToCStr(Ptr, B), ConstantInt::get(I32Ty, C), "strchr"); @@ -84,7 +84,7 @@ Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, Attribute::NoUnwind); LLVMContext &Context = B.GetInsertBlock()->getContext(); - Value *StrNCmp = M->getOrInsertFunction("strncmp", AttrListPtr::get(AWI, 3), + Value *StrNCmp = M->getOrInsertFunction("strncmp", AttrListPtr::get(AWI), B.getInt32Ty(), B.getInt8PtrTy(), B.getInt8PtrTy(), @@ -107,7 +107,7 @@ Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B, AWI[0] = AttributeWithIndex::get(2, Attribute::NoCapture); AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); Type *I8Ptr = B.getInt8PtrTy(); - Value *StrCpy = M->getOrInsertFunction(Name, AttrListPtr::get(AWI, 2), + Value *StrCpy = M->getOrInsertFunction(Name, AttrListPtr::get(AWI), I8Ptr, I8Ptr, I8Ptr, NULL); CallInst *CI = B.CreateCall2(StrCpy, CastToCStr(Dst, B), CastToCStr(Src, B), Name); @@ -125,7 +125,7 @@ Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len, AWI[0] = AttributeWithIndex::get(2, Attribute::NoCapture); AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); Type *I8Ptr = B.getInt8PtrTy(); - Value *StrNCpy = M->getOrInsertFunction(Name, AttrListPtr::get(AWI, 
2), + Value *StrNCpy = M->getOrInsertFunction(Name, AttrListPtr::get(AWI), I8Ptr, I8Ptr, I8Ptr, Len->getType(), NULL); CallInst *CI = B.CreateCall3(StrNCpy, CastToCStr(Dst, B), CastToCStr(Src, B), @@ -145,7 +145,7 @@ Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize, AWI = AttributeWithIndex::get(~0u, Attribute::NoUnwind); LLVMContext &Context = B.GetInsertBlock()->getContext(); Value *MemCpy = M->getOrInsertFunction("__memcpy_chk", - AttrListPtr::get(&AWI, 1), + AttrListPtr::get(AWI), B.getInt8PtrTy(), B.getInt8PtrTy(), B.getInt8PtrTy(), @@ -167,7 +167,7 @@ Value *llvm::EmitMemChr(Value *Ptr, Value *Val, AttributeWithIndex AWI; AWI = AttributeWithIndex::get(~0u, Attribute::ReadOnly | Attribute::NoUnwind); LLVMContext &Context = B.GetInsertBlock()->getContext(); - Value *MemChr = M->getOrInsertFunction("memchr", AttrListPtr::get(&AWI, 1), + Value *MemChr = M->getOrInsertFunction("memchr", AttrListPtr::get(AWI), B.getInt8PtrTy(), B.getInt8PtrTy(), B.getInt32Ty(), @@ -192,7 +192,7 @@ Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2, Attribute::NoUnwind); LLVMContext &Context = B.GetInsertBlock()->getContext(); - Value *MemCmp = M->getOrInsertFunction("memcmp", AttrListPtr::get(AWI, 3), + Value *MemCmp = M->getOrInsertFunction("memcmp", AttrListPtr::get(AWI), B.getInt32Ty(), B.getInt8PtrTy(), B.getInt8PtrTy(), @@ -260,7 +260,7 @@ void llvm::EmitPutS(Value *Str, IRBuilder<> &B, const TargetData *TD) { AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture); AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); - Value *PutS = M->getOrInsertFunction("puts", AttrListPtr::get(AWI, 2), + Value *PutS = M->getOrInsertFunction("puts", AttrListPtr::get(AWI), B.getInt32Ty(), B.getInt8PtrTy(), NULL); @@ -280,7 +280,7 @@ void llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B, AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); Constant *F; if (File->getType()->isPointerTy()) - F = M->getOrInsertFunction("fputc", AttrListPtr::get(AWI, 2), + F = M->getOrInsertFunction("fputc", AttrListPtr::get(AWI), B.getInt32Ty(), B.getInt32Ty(), File->getType(), NULL); @@ -309,7 +309,7 @@ void llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B, StringRef FPutsName = TLI->getName(LibFunc::fputs); Constant *F; if (File->getType()->isPointerTy()) - F = M->getOrInsertFunction(FPutsName, AttrListPtr::get(AWI, 3), + F = M->getOrInsertFunction(FPutsName, AttrListPtr::get(AWI), B.getInt32Ty(), B.getInt8PtrTy(), File->getType(), NULL); @@ -337,7 +337,7 @@ void llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File, StringRef FWriteName = TLI->getName(LibFunc::fwrite); Constant *F; if (File->getType()->isPointerTy()) - F = M->getOrInsertFunction(FWriteName, AttrListPtr::get(AWI, 3), + F = M->getOrInsertFunction(FWriteName, AttrListPtr::get(AWI), TD->getIntPtrType(Context), B.getInt8PtrTy(), TD->getIntPtrType(Context), diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt index 7f5cb5e..4ff31ca 100644 --- a/lib/Transforms/Utils/CMakeLists.txt +++ b/lib/Transforms/Utils/CMakeLists.txt @@ -29,3 +29,5 @@ add_llvm_library(LLVMTransformUtils Utils.cpp ValueMapper.cpp ) + +add_dependencies(LLVMTransformUtils intrinsics_gen) diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index 20052a4..99237b8 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -15,6 +15,7 @@ #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" 
#include "llvm/DerivedTypes.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" @@ -28,7 +29,6 @@ #include "llvm/Transforms/Utils/ValueMapper.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/ADT/SmallVector.h" #include <map> using namespace llvm; diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp index a0e027b..1dac6b5 100644 --- a/lib/Transforms/Utils/CloneModule.cpp +++ b/lib/Transforms/Utils/CloneModule.cpp @@ -53,7 +53,7 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { I->isConstant(), I->getLinkage(), (Constant*) 0, I->getName(), (GlobalVariable*) 0, - I->isThreadLocal(), + I->getThreadLocalMode(), I->getType()->getAddressSpace()); GV->copyAttributesFrom(I); VMap[I] = GV; diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp index e8c0b80..c545cd6 100644 --- a/lib/Transforms/Utils/CodeExtractor.cpp +++ b/lib/Transforms/Utils/CodeExtractor.cpp @@ -13,7 +13,7 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Utils/FunctionUtils.h" +#include "llvm/Transforms/Utils/CodeExtractor.h" #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/Instructions.h" @@ -23,6 +23,8 @@ #include "llvm/Pass.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/RegionInfo.h" +#include "llvm/Analysis/RegionIterator.h" #include "llvm/Analysis/Verifier.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Support/CommandLine.h" @@ -43,61 +45,139 @@ static cl::opt<bool> AggregateArgsOpt("aggregate-extracted-args", cl::Hidden, cl::desc("Aggregate arguments to code-extracted functions")); -namespace { - class CodeExtractor { - typedef SetVector<Value*> Values; - SetVector<BasicBlock*> BlocksToExtract; - DominatorTree* DT; - bool AggregateArgs; - unsigned NumExitBlocks; - Type *RetTy; - public: - CodeExtractor(DominatorTree* dt = 0, bool AggArgs = false) - : DT(dt), AggregateArgs(AggArgs||AggregateArgsOpt), NumExitBlocks(~0U) {} - - Function *ExtractCodeRegion(ArrayRef<BasicBlock*> code); - - bool isEligible(ArrayRef<BasicBlock*> code); - - private: - /// definedInRegion - Return true if the specified value is defined in the - /// extracted region. - bool definedInRegion(Value *V) const { - if (Instruction *I = dyn_cast<Instruction>(V)) - if (BlocksToExtract.count(I->getParent())) - return true; - return false; - } +/// \brief Test whether a block is valid for extraction. +static bool isBlockValidForExtraction(const BasicBlock &BB) { + // Landing pads must be in the function where they were inserted for cleanup. + if (BB.isLandingPad()) + return false; - /// definedInCaller - Return true if the specified value is defined in the - /// function being code extracted, but not in the region being extracted. - /// These values must be passed in as live-ins to the function. - bool definedInCaller(Value *V) const { - if (isa<Argument>(V)) return true; - if (Instruction *I = dyn_cast<Instruction>(V)) - if (!BlocksToExtract.count(I->getParent())) - return true; + // Don't hoist code containing allocas, invokes, or vastarts. 
+ for (BasicBlock::const_iterator I = BB.begin(), E = BB.end(); I != E; ++I) { + if (isa<AllocaInst>(I) || isa<InvokeInst>(I)) return false; + if (const CallInst *CI = dyn_cast<CallInst>(I)) + if (const Function *F = CI->getCalledFunction()) + if (F->getIntrinsicID() == Intrinsic::vastart) + return false; + } + + return true; +} + +/// \brief Build a set of blocks to extract if the input blocks are viable. +template <typename IteratorT> +static SetVector<BasicBlock *> buildExtractionBlockSet(IteratorT BBBegin, + IteratorT BBEnd) { + SetVector<BasicBlock *> Result; + + assert(BBBegin != BBEnd); + + // Loop over the blocks, adding them to our set-vector, and aborting with an + // empty set if we encounter invalid blocks. + for (IteratorT I = BBBegin, E = BBEnd; I != E; ++I) { + if (!Result.insert(*I)) + llvm_unreachable("Repeated basic blocks in extraction input"); + + if (!isBlockValidForExtraction(**I)) { + Result.clear(); + return Result; } + } + +#ifndef NDEBUG + for (SetVector<BasicBlock *>::iterator I = llvm::next(Result.begin()), + E = Result.end(); + I != E; ++I) + for (pred_iterator PI = pred_begin(*I), PE = pred_end(*I); + PI != PE; ++PI) + assert(Result.count(*PI) && + "No blocks in this region may have entries from outside the region" + " except for the first block!"); +#endif + + return Result; +} + +/// \brief Helper to call buildExtractionBlockSet with an ArrayRef. +static SetVector<BasicBlock *> +buildExtractionBlockSet(ArrayRef<BasicBlock *> BBs) { + return buildExtractionBlockSet(BBs.begin(), BBs.end()); +} + +/// \brief Helper to call buildExtractionBlockSet with a RegionNode. +static SetVector<BasicBlock *> +buildExtractionBlockSet(const RegionNode &RN) { + if (!RN.isSubRegion()) + // Just a single BasicBlock. + return buildExtractionBlockSet(RN.getNodeAs<BasicBlock>()); - void severSplitPHINodes(BasicBlock *&Header); - void splitReturnBlocks(); - void findInputsOutputs(Values &inputs, Values &outputs); + const Region &R = *RN.getNodeAs<Region>(); - Function *constructFunction(const Values &inputs, - const Values &outputs, - BasicBlock *header, - BasicBlock *newRootNode, BasicBlock *newHeader, - Function *oldFunction, Module *M); + return buildExtractionBlockSet(R.block_begin(), R.block_end()); +} - void moveCodeToFunction(Function *newFunction); +CodeExtractor::CodeExtractor(BasicBlock *BB, bool AggregateArgs) + : DT(0), AggregateArgs(AggregateArgs||AggregateArgsOpt), + Blocks(buildExtractionBlockSet(BB)), NumExitBlocks(~0U) {} + +CodeExtractor::CodeExtractor(ArrayRef<BasicBlock *> BBs, DominatorTree *DT, + bool AggregateArgs) + : DT(DT), AggregateArgs(AggregateArgs||AggregateArgsOpt), + Blocks(buildExtractionBlockSet(BBs)), NumExitBlocks(~0U) {} + +CodeExtractor::CodeExtractor(DominatorTree &DT, Loop &L, bool AggregateArgs) + : DT(&DT), AggregateArgs(AggregateArgs||AggregateArgsOpt), + Blocks(buildExtractionBlockSet(L.getBlocks())), NumExitBlocks(~0U) {} + +CodeExtractor::CodeExtractor(DominatorTree &DT, const RegionNode &RN, + bool AggregateArgs) + : DT(&DT), AggregateArgs(AggregateArgs||AggregateArgsOpt), + Blocks(buildExtractionBlockSet(RN)), NumExitBlocks(~0U) {} + +/// definedInRegion - Return true if the specified value is defined in the +/// extracted region. 
+static bool definedInRegion(const SetVector<BasicBlock *> &Blocks, Value *V) { + if (Instruction *I = dyn_cast<Instruction>(V)) + if (Blocks.count(I->getParent())) + return true; + return false; +} - void emitCallAndSwitchStatement(Function *newFunction, - BasicBlock *newHeader, - Values &inputs, - Values &outputs); +/// definedInCaller - Return true if the specified value is defined in the +/// function being code extracted, but not in the region being extracted. +/// These values must be passed in as live-ins to the function. +static bool definedInCaller(const SetVector<BasicBlock *> &Blocks, Value *V) { + if (isa<Argument>(V)) return true; + if (Instruction *I = dyn_cast<Instruction>(V)) + if (!Blocks.count(I->getParent())) + return true; + return false; +} - }; +void CodeExtractor::findInputsOutputs(ValueSet &Inputs, + ValueSet &Outputs) const { + for (SetVector<BasicBlock *>::const_iterator I = Blocks.begin(), + E = Blocks.end(); + I != E; ++I) { + BasicBlock *BB = *I; + + // If a used value is defined outside the region, it's an input. If an + // instruction is used outside the region, it's an output. + for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); + II != IE; ++II) { + for (User::op_iterator OI = II->op_begin(), OE = II->op_end(); + OI != OE; ++OI) + if (definedInCaller(Blocks, *OI)) + Inputs.insert(*OI); + + for (Value::use_iterator UI = II->use_begin(), UE = II->use_end(); + UI != UE; ++UI) + if (!definedInRegion(Blocks, *UI)) { + Outputs.insert(II); + break; + } + } + } } /// severSplitPHINodes - If a PHI node has multiple inputs from outside of the @@ -115,7 +195,7 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) { // than one entry from outside the region. If so, we need to sever the // header block into two. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) - if (BlocksToExtract.count(PN->getIncomingBlock(i))) + if (Blocks.count(PN->getIncomingBlock(i))) ++NumPredsFromRegion; else ++NumPredsOutsideRegion; @@ -136,8 +216,8 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) { // We only want to code extract the second block now, and it becomes the new // header of the region. BasicBlock *OldPred = Header; - BlocksToExtract.remove(OldPred); - BlocksToExtract.insert(NewBB); + Blocks.remove(OldPred); + Blocks.insert(NewBB); Header = NewBB; // Okay, update dominator sets. The blocks that dominate the new one are the @@ -152,7 +232,7 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) { // Loop over all of the predecessors of OldPred that are in the region, // changing them to branch to NewBB instead. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) - if (BlocksToExtract.count(PN->getIncomingBlock(i))) { + if (Blocks.count(PN->getIncomingBlock(i))) { TerminatorInst *TI = PN->getIncomingBlock(i)->getTerminator(); TI->replaceUsesOfWith(OldPred, NewBB); } @@ -170,7 +250,7 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) { // Loop over all of the incoming value in PN, moving them to NewPN if they // are from the extracted region. 
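// (The effect, roughly: a header PHI such as
//    %p = phi [ %a, %outside1 ], [ %b, %outside2 ], [ %c, %inside ]
//  keeps its outside entries in the caller-side half of the split header,
//  while a new PHI in the region's new header merges that result with the
//  region-internal entries, so the region is entered along a single edge.)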
for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) { - if (BlocksToExtract.count(PN->getIncomingBlock(i))) { + if (Blocks.count(PN->getIncomingBlock(i))) { NewPN->addIncoming(PN->getIncomingValue(i), PN->getIncomingBlock(i)); PN->removeIncomingValue(i); --i; @@ -181,8 +261,8 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) { } void CodeExtractor::splitReturnBlocks() { - for (SetVector<BasicBlock*>::iterator I = BlocksToExtract.begin(), - E = BlocksToExtract.end(); I != E; ++I) + for (SetVector<BasicBlock *>::iterator I = Blocks.begin(), E = Blocks.end(); + I != E; ++I) if (ReturnInst *RI = dyn_cast<ReturnInst>((*I)->getTerminator())) { BasicBlock *New = (*I)->splitBasicBlock(RI, (*I)->getName()+".ret"); if (DT) { @@ -203,45 +283,11 @@ void CodeExtractor::splitReturnBlocks() { } } -// findInputsOutputs - Find inputs to, outputs from the code region. -// -void CodeExtractor::findInputsOutputs(Values &inputs, Values &outputs) { - std::set<BasicBlock*> ExitBlocks; - for (SetVector<BasicBlock*>::const_iterator ci = BlocksToExtract.begin(), - ce = BlocksToExtract.end(); ci != ce; ++ci) { - BasicBlock *BB = *ci; - - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { - // If a used value is defined outside the region, it's an input. If an - // instruction is used outside the region, it's an output. - for (User::op_iterator O = I->op_begin(), E = I->op_end(); O != E; ++O) - if (definedInCaller(*O)) - inputs.insert(*O); - - // Consider uses of this instruction (outputs). - for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); - UI != E; ++UI) - if (!definedInRegion(*UI)) { - outputs.insert(I); - break; - } - } // for: insts - - // Keep track of the exit blocks from the region. - TerminatorInst *TI = BB->getTerminator(); - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) - if (!BlocksToExtract.count(TI->getSuccessor(i))) - ExitBlocks.insert(TI->getSuccessor(i)); - } // for: basic blocks - - NumExitBlocks = ExitBlocks.size(); -} - /// constructFunction - make a function based on inputs and outputs, as follows: /// f(in0, ..., inN, out0, ..., outN) /// -Function *CodeExtractor::constructFunction(const Values &inputs, - const Values &outputs, +Function *CodeExtractor::constructFunction(const ValueSet &inputs, + const ValueSet &outputs, BasicBlock *header, BasicBlock *newRootNode, BasicBlock *newHeader, @@ -261,15 +307,15 @@ Function *CodeExtractor::constructFunction(const Values &inputs, std::vector<Type*> paramTy; // Add the types of the input values to the function's argument list - for (Values::const_iterator i = inputs.begin(), - e = inputs.end(); i != e; ++i) { + for (ValueSet::const_iterator i = inputs.begin(), e = inputs.end(); + i != e; ++i) { const Value *value = *i; DEBUG(dbgs() << "value used in func: " << *value << "\n"); paramTy.push_back(value->getType()); } // Add the types of the output values to the function's argument list. 
- for (Values::const_iterator I = outputs.begin(), E = outputs.end(); + for (ValueSet::const_iterator I = outputs.begin(), E = outputs.end(); I != E; ++I) { DEBUG(dbgs() << "instr used in func: " << **I << "\n"); if (AggregateArgs) @@ -326,7 +372,7 @@ Function *CodeExtractor::constructFunction(const Values &inputs, for (std::vector<User*>::iterator use = Users.begin(), useE = Users.end(); use != useE; ++use) if (Instruction* inst = dyn_cast<Instruction>(*use)) - if (BlocksToExtract.count(inst->getParent())) + if (Blocks.count(inst->getParent())) inst->replaceUsesOfWith(inputs[i], RewriteVal); } @@ -347,7 +393,7 @@ Function *CodeExtractor::constructFunction(const Values &inputs, // The BasicBlock which contains the branch is not in the region // modify the branch target to a new block if (TerminatorInst *TI = dyn_cast<TerminatorInst>(Users[i])) - if (!BlocksToExtract.count(TI->getParent()) && + if (!Blocks.count(TI->getParent()) && TI->getParent()->getParent() == oldFunction) TI->replaceUsesOfWith(header, newHeader); @@ -373,7 +419,7 @@ static BasicBlock* FindPhiPredForUseInBlock(Value* Used, BasicBlock* BB) { /// necessary. void CodeExtractor:: emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, - Values &inputs, Values &outputs) { + ValueSet &inputs, ValueSet &outputs) { // Emit a call to the new function, passing in: *pointer to struct (if // aggregating parameters), or plan inputs and allocated memory for outputs std::vector<Value*> params, StructValues, ReloadOutputs, Reloads; @@ -381,14 +427,14 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, LLVMContext &Context = newFunction->getContext(); // Add inputs as params, or to be filled into the struct - for (Values::iterator i = inputs.begin(), e = inputs.end(); i != e; ++i) + for (ValueSet::iterator i = inputs.begin(), e = inputs.end(); i != e; ++i) if (AggregateArgs) StructValues.push_back(*i); else params.push_back(*i); // Create allocas for the outputs - for (Values::iterator i = outputs.begin(), e = outputs.end(); i != e; ++i) { + for (ValueSet::iterator i = outputs.begin(), e = outputs.end(); i != e; ++i) { if (AggregateArgs) { StructValues.push_back(*i); } else { @@ -403,7 +449,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, AllocaInst *Struct = 0; if (AggregateArgs && (inputs.size() + outputs.size() > 0)) { std::vector<Type*> ArgTypes; - for (Values::iterator v = StructValues.begin(), + for (ValueSet::iterator v = StructValues.begin(), ve = StructValues.end(); v != ve; ++v) ArgTypes.push_back((*v)->getType()); @@ -458,7 +504,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, std::vector<User*> Users(outputs[i]->use_begin(), outputs[i]->use_end()); for (unsigned u = 0, e = Users.size(); u != e; ++u) { Instruction *inst = cast<Instruction>(Users[u]); - if (!BlocksToExtract.count(inst->getParent())) + if (!Blocks.count(inst->getParent())) inst->replaceUsesOfWith(outputs[i], load); } } @@ -476,11 +522,11 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, std::map<BasicBlock*, BasicBlock*> ExitBlockMap; unsigned switchVal = 0; - for (SetVector<BasicBlock*>::const_iterator i = BlocksToExtract.begin(), - e = BlocksToExtract.end(); i != e; ++i) { + for (SetVector<BasicBlock*>::const_iterator i = Blocks.begin(), + e = Blocks.end(); i != e; ++i) { TerminatorInst *TI = (*i)->getTerminator(); for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) - if 
(!BlocksToExtract.count(TI->getSuccessor(i))) { + if (!Blocks.count(TI->getSuccessor(i))) { BasicBlock *OldTarget = TI->getSuccessor(i); // add a new basic block which returns the appropriate value BasicBlock *&NewTarget = ExitBlockMap[OldTarget]; @@ -618,18 +664,19 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, TheSwitch->setCondition(call); TheSwitch->setDefaultDest(TheSwitch->getSuccessor(NumExitBlocks)); // Remove redundant case - TheSwitch->removeCase(SwitchInst::CaseIt(TheSwitch, NumExitBlocks-1)); + SwitchInst::CaseIt ToBeRemoved(TheSwitch, NumExitBlocks-1); + TheSwitch->removeCase(ToBeRemoved); break; } } void CodeExtractor::moveCodeToFunction(Function *newFunction) { - Function *oldFunc = (*BlocksToExtract.begin())->getParent(); + Function *oldFunc = (*Blocks.begin())->getParent(); Function::BasicBlockListType &oldBlocks = oldFunc->getBasicBlockList(); Function::BasicBlockListType &newBlocks = newFunction->getBasicBlockList(); - for (SetVector<BasicBlock*>::const_iterator i = BlocksToExtract.begin(), - e = BlocksToExtract.end(); i != e; ++i) { + for (SetVector<BasicBlock*>::const_iterator i = Blocks.begin(), + e = Blocks.end(); i != e; ++i) { // Delete the basic block from the old function, and the list of blocks oldBlocks.remove(*i); @@ -638,47 +685,15 @@ void CodeExtractor::moveCodeToFunction(Function *newFunction) { } } -/// ExtractRegion - Removes a loop from a function, replaces it with a call to -/// new function. Returns pointer to the new function. -/// -/// algorithm: -/// -/// find inputs and outputs for the region -/// -/// for inputs: add to function as args, map input instr* to arg# -/// for outputs: add allocas for scalars, -/// add to func as args, map output instr* to arg# -/// -/// rewrite func to use argument #s instead of instr* -/// -/// for each scalar output in the function: at every exit, store intermediate -/// computed result back into memory. -/// -Function *CodeExtractor:: -ExtractCodeRegion(ArrayRef<BasicBlock*> code) { - if (!isEligible(code)) +Function *CodeExtractor::extractCodeRegion() { + if (!isEligible()) return 0; - // 1) Find inputs, outputs - // 2) Construct new function - // * Add allocas for defs, pass as args by reference - // * Pass in uses as args - // 3) Move code region, add call instr to func - // - BlocksToExtract.insert(code.begin(), code.end()); - - Values inputs, outputs; + ValueSet inputs, outputs; // Assumption: this is a single-entry code region, and the header is the first // block in the region. - BasicBlock *header = code[0]; - - for (unsigned i = 1, e = code.size(); i != e; ++i) - for (pred_iterator PI = pred_begin(code[i]), E = pred_end(code[i]); - PI != E; ++PI) - assert(BlocksToExtract.count(*PI) && - "No blocks in this region may have entries from outside the region" - " except for the first block!"); + BasicBlock *header = *Blocks.begin(); // If we have to split PHI nodes or the entry block, do so now. severSplitPHINodes(header); @@ -703,6 +718,14 @@ ExtractCodeRegion(ArrayRef<BasicBlock*> code) { // Find inputs to, outputs from the code region. findInputsOutputs(inputs, outputs); + SmallPtrSet<BasicBlock *, 1> ExitBlocks; + for (SetVector<BasicBlock *>::iterator I = Blocks.begin(), E = Blocks.end(); + I != E; ++I) + for (succ_iterator SI = succ_begin(*I), SE = succ_end(*I); SI != SE; ++SI) + if (!Blocks.count(*SI)) + ExitBlocks.insert(*SI); + NumExitBlocks = ExitBlocks.size(); + // Construct new function based on inputs/outputs & add allocas for all defs. 
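// (constructFunction builds the callee as f(in0, ..., inN, out0, ..., outN):
//  each input becomes an argument and each output becomes a pointer
//  argument the callee stores results through; with AggregateArgs set, the
//  same values travel through a single struct pointer instead.)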
Function *newFunction = constructFunction(inputs, outputs, header, newFuncRoot, @@ -718,7 +741,7 @@ ExtractCodeRegion(ArrayRef<BasicBlock*> code) { for (BasicBlock::iterator I = header->begin(); isa<PHINode>(I); ++I) { PHINode *PN = cast<PHINode>(I); for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) - if (!BlocksToExtract.count(PN->getIncomingBlock(i))) + if (!Blocks.count(PN->getIncomingBlock(i))) PN->setIncomingBlock(i, newFuncRoot); } @@ -732,7 +755,7 @@ ExtractCodeRegion(ArrayRef<BasicBlock*> code) { PHINode *PN = cast<PHINode>(I); std::set<BasicBlock*> ProcessedPreds; for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) - if (BlocksToExtract.count(PN->getIncomingBlock(i))) { + if (Blocks.count(PN->getIncomingBlock(i))) { if (ProcessedPreds.insert(PN->getIncomingBlock(i)).second) PN->setIncomingBlock(i, codeReplacer); else { @@ -754,44 +777,3 @@ ExtractCodeRegion(ArrayRef<BasicBlock*> code) { report_fatal_error("verifyFunction failed!")); return newFunction; } - -bool CodeExtractor::isEligible(ArrayRef<BasicBlock*> code) { - // Deny a single basic block that's a landing pad block. - if (code.size() == 1 && code[0]->isLandingPad()) - return false; - - // Deny code region if it contains allocas or vastarts. - for (ArrayRef<BasicBlock*>::iterator BB = code.begin(), e=code.end(); - BB != e; ++BB) - for (BasicBlock::const_iterator I = (*BB)->begin(), Ie = (*BB)->end(); - I != Ie; ++I) - if (isa<AllocaInst>(*I)) - return false; - else if (const CallInst *CI = dyn_cast<CallInst>(I)) - if (const Function *F = CI->getCalledFunction()) - if (F->getIntrinsicID() == Intrinsic::vastart) - return false; - return true; -} - - -/// ExtractCodeRegion - Slurp a sequence of basic blocks into a brand new -/// function. -/// -Function* llvm::ExtractCodeRegion(DominatorTree &DT, - ArrayRef<BasicBlock*> code, - bool AggregateArgs) { - return CodeExtractor(&DT, AggregateArgs).ExtractCodeRegion(code); -} - -/// ExtractLoop - Slurp a natural loop into a brand new function. -/// -Function* llvm::ExtractLoop(DominatorTree &DT, Loop *L, bool AggregateArgs) { - return CodeExtractor(&DT, AggregateArgs).ExtractCodeRegion(L->getBlocks()); -} - -/// ExtractBasicBlock - Slurp a basic block into a brand new function. 
-/// -Function* llvm::ExtractBasicBlock(ArrayRef<BasicBlock*> BBs, bool AggregateArgs){ - return CodeExtractor(0, AggregateArgs).ExtractCodeRegion(BBs); -} diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index d2b167a..89e89e7 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -13,22 +13,22 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Attributes.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" #include "llvm/DerivedTypes.h" -#include "llvm/Module.h" +#include "llvm/IRBuilder.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/Intrinsics.h" -#include "llvm/Attributes.h" +#include "llvm/Module.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/CallGraph.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Support/CallSite.h" #include "llvm/Target/TargetData.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/Support/CallSite.h" -#include "llvm/Support/IRBuilder.h" using namespace llvm; bool llvm::InlineFunction(CallInst *CI, InlineFunctionInfo &IFI, @@ -43,10 +43,10 @@ bool llvm::InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI, namespace { /// A class for recording information about inlining through an invoke. class InvokeInliningInfo { - BasicBlock *OuterResumeDest; //< Destination of the invoke's unwind. - BasicBlock *InnerResumeDest; //< Destination for the callee's resume. - LandingPadInst *CallerLPad; //< LandingPadInst associated with the invoke. - PHINode *InnerEHValuesPHI; //< PHI for EH values from landingpad insts. + BasicBlock *OuterResumeDest; ///< Destination of the invoke's unwind. + BasicBlock *InnerResumeDest; ///< Destination for the callee's resume. + LandingPadInst *CallerLPad; ///< LandingPadInst associated with the invoke. + PHINode *InnerEHValuesPHI; ///< PHI for EH values from landingpad insts. 
SmallVector<Value*, 8> UnwindDestPHIValues; public: diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index d1c4d59..bed7d72 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -14,31 +14,31 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/Constants.h" +#include "llvm/DIBuilder.h" +#include "llvm/DebugInfo.h" +#include "llvm/DerivedTypes.h" #include "llvm/GlobalAlias.h" #include "llvm/GlobalVariable.h" -#include "llvm/DerivedTypes.h" +#include "llvm/IRBuilder.h" #include "llvm/Instructions.h" -#include "llvm/Intrinsics.h" #include "llvm/IntrinsicInst.h" +#include "llvm/Intrinsics.h" #include "llvm/Metadata.h" #include "llvm/Operator.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/Analysis/DebugInfo.h" -#include "llvm/Analysis/DIBuilder.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ProfileInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Target/TargetData.h" #include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" #include "llvm/Support/GetElementPtrTypeIterator.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetData.h" using namespace llvm; //===----------------------------------------------------------------------===// @@ -169,16 +169,21 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions) { // Otherwise, we can fold this switch into a conditional branch // instruction if it has only one non-default destination. SwitchInst::CaseIt FirstCase = SI->case_begin(); - Value *Cond = Builder.CreateICmpEQ(SI->getCondition(), - FirstCase.getCaseValue(), "cond"); - - // Insert the new branch. - Builder.CreateCondBr(Cond, FirstCase.getCaseSuccessor(), - SI->getDefaultDest()); - - // Delete the old switch. - SI->eraseFromParent(); - return true; + IntegersSubset& Case = FirstCase.getCaseValueEx(); + if (Case.isSingleNumber()) { + // FIXME: Currently work with ConstantInt based numbers. + Value *Cond = Builder.CreateICmpEQ(SI->getCondition(), + Case.getSingleNumber(0).toConstantInt(), + "cond"); + + // Insert the new branch. + Builder.CreateCondBr(Cond, FirstCase.getCaseSuccessor(), + SI->getDefaultDest()); + + // Delete the old switch. + SI->eraseFromParent(); + return true; + } } return false; } @@ -260,7 +265,7 @@ bool llvm::isInstructionTriviallyDead(Instruction *I) { return isa<UndefValue>(II->getArgOperand(1)); } - if (extractMallocCall(I)) return true; + if (isAllocLikeFn(I)) return true; if (CallInst *CI = isFreeCall(I)) if (Constant *C = dyn_cast<Constant>(CI->getArgOperand(0))) @@ -700,7 +705,7 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) { CollisionMap[PN] = Old; break; } - // Procede to the next PHI in the list. + // Proceed to the next PHI in the list. OtherPN = I->second; } } diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp index e15497a..2023750 100644 --- a/lib/Transforms/Utils/LoopUnroll.cpp +++ b/lib/Transforms/Utils/LoopUnroll.cpp @@ -95,9 +95,11 @@ static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, // Erase basic block from the function... // ScalarEvolution holds references to loop exit blocks. 
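// (ScalarEvolution caches backedge-taken counts and similar data that
//  refer to these blocks, so the loop must be forgotten before a block is
//  erased. The LPM guard added below also lets this utility run without a
//  LoopPassManager; with a null LPM there is no ScalarEvolution to fetch,
//  and no invalidation is attempted.)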
- if (ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>()) { - if (Loop *L = LI->getLoopFor(BB)) - SE->forgetLoop(L); + if (LPM) { + if (ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>()) { + if (Loop *L = LI->getLoopFor(BB)) + SE->forgetLoop(L); + } } LI->removeBlock(BB); BB->eraseFromParent(); @@ -204,9 +206,11 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, // Notify ScalarEvolution that the loop will be substantially changed, // if not outright eliminated. - ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); - if (SE) - SE->forgetLoop(L); + if (LPM) { + ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); + if (SE) + SE->forgetLoop(L); + } // If we know the trip count, we know the multiple... unsigned BreakoutTrip = 0; @@ -405,24 +409,26 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, } } - // FIXME: Reconstruct dom info, because it is not preserved properly. - // Incrementally updating domtree after loop unrolling would be easy. - if (DominatorTree *DT = LPM->getAnalysisIfAvailable<DominatorTree>()) - DT->runOnFunction(*L->getHeader()->getParent()); - - // Simplify any new induction variables in the partially unrolled loop. - if (SE && !CompletelyUnroll) { - SmallVector<WeakVH, 16> DeadInsts; - simplifyLoopIVs(L, SE, LPM, DeadInsts); - - // Aggressively clean up dead instructions that simplifyLoopIVs already - // identified. Any remaining should be cleaned up below. - while (!DeadInsts.empty()) - if (Instruction *Inst = - dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val())) - RecursivelyDeleteTriviallyDeadInstructions(Inst); + if (LPM) { + // FIXME: Reconstruct dom info, because it is not preserved properly. + // Incrementally updating domtree after loop unrolling would be easy. + if (DominatorTree *DT = LPM->getAnalysisIfAvailable<DominatorTree>()) + DT->runOnFunction(*L->getHeader()->getParent()); + + // Simplify any new induction variables in the partially unrolled loop. + ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); + if (SE && !CompletelyUnroll) { + SmallVector<WeakVH, 16> DeadInsts; + simplifyLoopIVs(L, SE, LPM, DeadInsts); + + // Aggressively clean up dead instructions that simplifyLoopIVs already + // identified. Any remaining should be cleaned up below. + while (!DeadInsts.empty()) + if (Instruction *Inst = + dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val())) + RecursivelyDeleteTriviallyDeadInstructions(Inst); + } } - // At this point, the code is well formed. We now do a quick sweep over the // inserted code, doing constant propagation and dead code elimination as we // go. diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 3aa6bef..67e17f4 100644 --- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -131,7 +131,7 @@ static void ConnectProlog(Loop *L, Value *TripCount, unsigned Count, /// There are two value maps that are defined and used. VMap is /// for the values in the current loop instance. LVMap contains /// the values from the last loop instance. We need the LVMap values -/// to update the inital values for the current loop instance. +/// to update the initial values for the current loop instance. /// static void CloneLoopBlocks(Loop *L, bool FirstCopy, @@ -237,6 +237,8 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, // Use Scalar Evolution to compute the trip count. 
This allows more // loops to be unrolled than relying on induction var simplification + if (!LPM) + return false; ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); if (SE == 0) return false; diff --git a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp index c70ced1..02bdcda 100644 --- a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp +++ b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp @@ -12,18 +12,19 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "lower-expect-intrinsic" +#include "llvm/BasicBlock.h" #include "llvm/Constants.h" #include "llvm/Function.h" -#include "llvm/BasicBlock.h" -#include "llvm/LLVMContext.h" #include "llvm/Instructions.h" #include "llvm/Intrinsics.h" +#include "llvm/LLVMContext.h" +#include "llvm/MDBuilder.h" #include "llvm/Metadata.h" #include "llvm/Pass.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/ADT/Statistic.h" #include <vector> using namespace llvm; @@ -70,24 +71,18 @@ bool LowerExpectIntrinsic::HandleSwitchExpect(SwitchInst *SI) { if (!ExpectedValue) return false; - LLVMContext &Context = CI->getContext(); - Type *Int32Ty = Type::getInt32Ty(Context); - SwitchInst::CaseIt Case = SI->findCaseValue(ExpectedValue); - std::vector<Value *> Vec; - unsigned n = SI->getNumCases(); - Vec.resize(n + 1 + 1); // +1 for MDString and +1 for default case - - Vec[0] = MDString::get(Context, "branch_weights"); - Vec[1] = ConstantInt::get(Int32Ty, Case == SI->case_default() ? - LikelyBranchWeight : UnlikelyBranchWeight); - for (unsigned i = 0; i < n; ++i) { - Vec[i + 1 + 1] = ConstantInt::get(Int32Ty, i == Case.getCaseIndex() ? - LikelyBranchWeight : UnlikelyBranchWeight); - } + unsigned n = SI->getNumCases(); // +1 for default case. + std::vector<uint32_t> Weights(n + 1); - MDNode *WeightsNode = llvm::MDNode::get(Context, Vec); - SI->setMetadata(LLVMContext::MD_prof, WeightsNode); + Weights[0] = Case == SI->case_default() ? LikelyBranchWeight + : UnlikelyBranchWeight; + for (unsigned i = 0; i != n; ++i) + Weights[i + 1] = i == Case.getCaseIndex() ? LikelyBranchWeight + : UnlikelyBranchWeight; + + SI->setMetadata(LLVMContext::MD_prof, + MDBuilder(CI->getContext()).createBranchWeights(Weights)); SI->setCondition(ArgValue); return true; @@ -120,20 +115,17 @@ bool LowerExpectIntrinsic::HandleIfExpect(BranchInst *BI) { if (!ExpectedValue) return false; - LLVMContext &Context = CI->getContext(); - Type *Int32Ty = Type::getInt32Ty(Context); - bool Likely = ExpectedValue->isOne(); + MDBuilder MDB(CI->getContext()); + MDNode *Node; // If expect value is equal to 1 it means that we are more likely to take // branch 0, in other case more likely is branch 1. - Value *Ops[] = { - MDString::get(Context, "branch_weights"), - ConstantInt::get(Int32Ty, Likely ? LikelyBranchWeight : UnlikelyBranchWeight), - ConstantInt::get(Int32Ty, Likely ? 
UnlikelyBranchWeight : LikelyBranchWeight) - }; + if (ExpectedValue->isOne()) + Node = MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight); + else + Node = MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight); - MDNode *WeightsNode = MDNode::get(Context, Ops); - BI->setMetadata(LLVMContext::MD_prof, WeightsNode); + BI->setMetadata(LLVMContext::MD_prof, Node); CmpI->setOperand(0, ArgValue); return true; diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp index a16130d..1547439 100644 --- a/lib/Transforms/Utils/LowerSwitch.cpp +++ b/lib/Transforms/Utils/LowerSwitch.cpp @@ -66,18 +66,6 @@ namespace { BasicBlock* OrigBlock, BasicBlock* Default); unsigned Clusterify(CaseVector& Cases, SwitchInst *SI); }; - - /// The comparison function for sorting the switch case values in the vector. - /// WARNING: Case ranges should be disjoint! - struct CaseCmp { - bool operator () (const LowerSwitch::CaseRange& C1, - const LowerSwitch::CaseRange& C2) { - - const ConstantInt* CI1 = cast<const ConstantInt>(C1.Low); - const ConstantInt* CI2 = cast<const ConstantInt>(C2.High); - return CI1->getValue().slt(CI2->getValue()); - } - }; } char LowerSwitch::ID = 0; @@ -159,7 +147,7 @@ BasicBlock* LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, Function::iterator FI = OrigBlock; F->getBasicBlockList().insert(++FI, NewNode); - ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_SLT, + ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_ULT, Val, Pivot.Low, "Pivot"); NewNode->getInstList().push_back(Comp); BranchInst::Create(LBranch, RBranch, Comp, NewNode); @@ -234,40 +222,34 @@ BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val, // Clusterify - Transform simple list of Cases into list of CaseRange's unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) { - unsigned numCmps = 0; + + IntegersSubsetToBB TheClusterifier; // Start with "simple" cases - for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) - Cases.push_back(CaseRange(i.getCaseValue(), i.getCaseValue(), - i.getCaseSuccessor())); + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); + i != e; ++i) { + BasicBlock *SuccBB = i.getCaseSuccessor(); + IntegersSubset CaseRanges = i.getCaseValueEx(); + TheClusterifier.add(CaseRanges, SuccBB); + } - std::sort(Cases.begin(), Cases.end(), CaseCmp()); - - // Merge case into clusters - if (Cases.size()>=2) - for (CaseItr I=Cases.begin(), J=llvm::next(Cases.begin()); J!=Cases.end(); ) { - int64_t nextValue = cast<ConstantInt>(J->Low)->getSExtValue(); - int64_t currentValue = cast<ConstantInt>(I->High)->getSExtValue(); - BasicBlock* nextBB = J->BB; - BasicBlock* currentBB = I->BB; - - // If the two neighboring cases go to the same destination, merge them - // into a single case. - if ((nextValue-currentValue==1) && (currentBB == nextBB)) { - I->High = J->High; - J = Cases.erase(J); - } else { - I = J++; - } - } - - for (CaseItr I=Cases.begin(), E=Cases.end(); I!=E; ++I, ++numCmps) { - if (I->Low != I->High) + TheClusterifier.optimize(); + + size_t numCmps = 0; + for (IntegersSubsetToBB::RangeIterator i = TheClusterifier.begin(), + e = TheClusterifier.end(); i != e; ++i, ++numCmps) { + IntegersSubsetToBB::Cluster &C = *i; + + // FIXME: Currently work with ConstantInt based numbers. + // Changing it to APInt based is pretty heavy for this commit.
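The LowerExpectIntrinsic hunks above replace hand-built branch_weights nodes (an MDString plus ConstantInt operands) with MDBuilder. A minimal sketch of the resulting idiom, with illustrative weights:

    MDBuilder MDB(BI->getContext());
    // The first weight belongs to successor 0 (the edge the hint favors).
    BI->setMetadata(LLVMContext::MD_prof,
                    MDB.createBranchWeights(/*TrueWeight=*/64,
                                            /*FalseWeight=*/4));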
+ Cases.push_back(CaseRange(C.first.getLow().toConstantInt(), + C.first.getHigh().toConstantInt(), C.second)); + if (C.first.isSingleNumber()) // A range counts double, since it requires two compares. ++numCmps; } - return numCmps; + return numCmps; } // processSwitchInst - Replace the specified switch instruction with a sequence diff --git a/lib/Transforms/Utils/ModuleUtils.cpp b/lib/Transforms/Utils/ModuleUtils.cpp index 8491c55..dbcf3b2 100644 --- a/lib/Transforms/Utils/ModuleUtils.cpp +++ b/lib/Transforms/Utils/ModuleUtils.cpp @@ -14,8 +14,8 @@ #include "llvm/Transforms/Utils/ModuleUtils.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" +#include "llvm/IRBuilder.h" #include "llvm/Module.h" -#include "llvm/Support/IRBuilder.h" using namespace llvm; diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index 2357d81..dd5e20e 100644 --- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -28,14 +28,14 @@ #define DEBUG_TYPE "mem2reg" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" #include "llvm/DerivedTypes.h" +#include "llvm/DIBuilder.h" #include "llvm/Function.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/Metadata.h" #include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Analysis/DebugInfo.h" -#include "llvm/Analysis/DIBuilder.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ValueTracking.h" diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp index e60a41b..b3f5289 100644 --- a/lib/Transforms/Utils/SSAUpdater.cpp +++ b/lib/Transforms/Utils/SSAUpdater.cpp @@ -190,8 +190,11 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { return V; } - // Set DebugLoc. - InsertedPHI->setDebugLoc(GetFirstDebugLocInBasicBlock(BB)); + // Set the DebugLoc of the inserted PHI, if available. + DebugLoc DL; + if (const Instruction *I = BB->getFirstNonPHI()) + DL = I->getDebugLoc(); + InsertedPHI->setDebugLoc(DL); // If the client wants to know about all new instructions, tell it. if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI); @@ -230,28 +233,6 @@ void SSAUpdater::RewriteUseAfterInsertions(Use &U) { U.set(V); } -/// PHIiter - Iterator for PHI operands. This is used for the PHI_iterator -/// in the SSAUpdaterImpl template. -namespace { - class PHIiter { - private: - PHINode *PHI; - unsigned idx; - - public: - explicit PHIiter(PHINode *P) // begin iterator - : PHI(P), idx(0) {} - PHIiter(PHINode *P, bool) // end iterator - : PHI(P), idx(PHI->getNumIncomingValues()) {} - - PHIiter &operator++() { ++idx; return *this; } - bool operator==(const PHIiter& x) const { return idx == x.idx; } - bool operator!=(const PHIiter& x) const { return !operator==(x); } - Value *getIncomingValue() { return PHI->getIncomingValue(idx); } - BasicBlock *getIncomingBlock() { return PHI->getIncomingBlock(idx); } - }; -} - /// SSAUpdaterTraits<SSAUpdater> - Traits for the SSAUpdaterImpl template, /// specialized for SSAUpdater. 
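The SSAUpdater.cpp hunk below folds the former file-local PHIiter into SSAUpdaterTraits<SSAUpdater> as a nested PHI_iterator, removing the anonymous-namespace indirection. A usage sketch under that naming, where PN is an assumed PHINode*:

    typedef SSAUpdaterTraits<SSAUpdater>::PHI_iterator PHI_iterator;
    for (PHI_iterator I(PN), E(PN, true); I != E; ++I)
      dbgs() << I.getIncomingBlock()->getName() << " -> "
             << *I.getIncomingValue() << "\n";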
namespace llvm { @@ -266,9 +247,26 @@ public: static BlkSucc_iterator BlkSucc_begin(BlkT *BB) { return succ_begin(BB); } static BlkSucc_iterator BlkSucc_end(BlkT *BB) { return succ_end(BB); } - typedef PHIiter PHI_iterator; - static inline PHI_iterator PHI_begin(PhiT *PHI) { return PHI_iterator(PHI); } - static inline PHI_iterator PHI_end(PhiT *PHI) { + class PHI_iterator { + private: + PHINode *PHI; + unsigned idx; + + public: + explicit PHI_iterator(PHINode *P) // begin iterator + : PHI(P), idx(0) {} + PHI_iterator(PHINode *P, bool) // end iterator + : PHI(P), idx(PHI->getNumIncomingValues()) {} + + PHI_iterator &operator++() { ++idx; return *this; } + bool operator==(const PHI_iterator& x) const { return idx == x.idx; } + bool operator!=(const PHI_iterator& x) const { return !operator==(x); } + Value *getIncomingValue() { return PHI->getIncomingValue(idx); } + BasicBlock *getIncomingBlock() { return PHI->getIncomingBlock(idx); } + }; + + static PHI_iterator PHI_begin(PhiT *PHI) { return PHI_iterator(PHI); } + static PHI_iterator PHI_end(PhiT *PHI) { return PHI_iterator(PHI, true); } diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index 66dd2c9..518df7c 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -16,29 +16,30 @@ #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/GlobalVariable.h" +#include "llvm/IRBuilder.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/LLVMContext.h" +#include "llvm/MDBuilder.h" #include "llvm/Metadata.h" #include "llvm/Operator.h" #include "llvm/Type.h" -#include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Support/CFG.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ConstantRange.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/NoFolder.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include <algorithm> #include <set> #include <map> @@ -55,12 +56,26 @@ DupRet("simplifycfg-dup-ret", cl::Hidden, cl::init(false), STATISTIC(NumSpeculations, "Number of speculative executed instructions"); namespace { + /// ValueEqualityComparisonCase - Represents a case of a switch. + struct ValueEqualityComparisonCase { + ConstantInt *Value; + BasicBlock *Dest; + + ValueEqualityComparisonCase(ConstantInt *Value, BasicBlock *Dest) + : Value(Value), Dest(Dest) {} + + bool operator<(ValueEqualityComparisonCase RHS) const { + // Comparing pointers is ok as we only rely on the order for uniquing. 
+ return Value < RHS.Value; + } + }; + class SimplifyCFGOpt { const TargetData *const TD; Value *isValueEqualityComparison(TerminatorInst *TI); BasicBlock *GetValueEqualityComparisonCases(TerminatorInst *TI, - std::vector<std::pair<ConstantInt*, BasicBlock*> > &Cases); + std::vector<ValueEqualityComparisonCase> &Cases); bool SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, BasicBlock *Pred, IRBuilder<> &Builder); @@ -107,6 +122,47 @@ static bool SafeToMergeTerminators(TerminatorInst *SI1, TerminatorInst *SI2) { return true; } +/// isProfitableToFoldUnconditional - Return true if it is safe and profitable +/// to merge these two terminator instructions together, where SI1 is an +/// unconditional branch. PhiNodes will store all PHI nodes in common +/// successors. +/// +static bool isProfitableToFoldUnconditional(BranchInst *SI1, + BranchInst *SI2, + Instruction *Cond, + SmallVectorImpl<PHINode*> &PhiNodes) { + if (SI1 == SI2) return false; // Can't merge with self! + assert(SI1->isUnconditional() && SI2->isConditional()); + + // We fold the unconditional branch if we can easily update all PHI nodes in + // common successors: + // 1> We have a constant incoming value for the conditional branch; + // 2> We have "Cond" as the incoming value for the unconditional branch; + // 3> SI2->getCondition() and Cond have same operands. + CmpInst *Ci2 = dyn_cast<CmpInst>(SI2->getCondition()); + if (!Ci2) return false; + if (!(Cond->getOperand(0) == Ci2->getOperand(0) && + Cond->getOperand(1) == Ci2->getOperand(1)) && + !(Cond->getOperand(0) == Ci2->getOperand(1) && + Cond->getOperand(1) == Ci2->getOperand(0))) + return false; + + BasicBlock *SI1BB = SI1->getParent(); + BasicBlock *SI2BB = SI2->getParent(); + SmallPtrSet<BasicBlock*, 16> SI1Succs(succ_begin(SI1BB), succ_end(SI1BB)); + for (succ_iterator I = succ_begin(SI2BB), E = succ_end(SI2BB); I != E; ++I) + if (SI1Succs.count(*I)) + for (BasicBlock::iterator BBI = (*I)->begin(); + isa<PHINode>(BBI); ++BBI) { + PHINode *PN = cast<PHINode>(BBI); + if (PN->getIncomingValueForBlock(SI1BB) != Cond || + !isa<ConstantInt>(PN->getIncomingValueForBlock(SI2BB))) + return false; + PhiNodes.push_back(PN); + } + return true; +} + /// AddPredecessorToBlock - Update PHI nodes in Succ to indicate that there will /// now be entries in it from the 'NewPred' block. The values that will be /// flowing into the PHI nodes will be the same as those coming in from @@ -476,21 +532,22 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) { /// decode all of the 'cases' that it represents and return the 'default' block. 
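The SimplifyCFG.cpp hunks below thread the new ValueEqualityComparisonCase struct through every place that previously used std::pair<ConstantInt*, BasicBlock*>. Because the struct carries its own operator< (pointer order, used only for uniquing), the existing sort-based helpers keep working unchanged. A sketch, with CI and Succ assumed:

    std::vector<ValueEqualityComparisonCase> Cases;
    Cases.push_back(ValueEqualityComparisonCase(CI, Succ));
    array_pod_sort(Cases.begin(), Cases.end()); // uses the struct's operator<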
BasicBlock *SimplifyCFGOpt:: GetValueEqualityComparisonCases(TerminatorInst *TI, - std::vector<std::pair<ConstantInt*, - BasicBlock*> > &Cases) { + std::vector<ValueEqualityComparisonCase> + &Cases) { if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { Cases.reserve(SI->getNumCases()); for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) - Cases.push_back(std::make_pair(i.getCaseValue(), - i.getCaseSuccessor())); + Cases.push_back(ValueEqualityComparisonCase(i.getCaseValue(), + i.getCaseSuccessor())); return SI->getDefaultDest(); } BranchInst *BI = cast<BranchInst>(TI); ICmpInst *ICI = cast<ICmpInst>(BI->getCondition()); - Cases.push_back(std::make_pair(GetConstantInt(ICI->getOperand(1), TD), - BI->getSuccessor(ICI->getPredicate() == - ICmpInst::ICMP_NE))); + BasicBlock *Succ = BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_NE); + Cases.push_back(ValueEqualityComparisonCase(GetConstantInt(ICI->getOperand(1), + TD), + Succ)); return BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_EQ); } @@ -498,9 +555,9 @@ GetValueEqualityComparisonCases(TerminatorInst *TI, /// EliminateBlockCases - Given a vector of bb/value pairs, remove any entries /// in the list that match the specified block. static void EliminateBlockCases(BasicBlock *BB, - std::vector<std::pair<ConstantInt*, BasicBlock*> > &Cases) { + std::vector<ValueEqualityComparisonCase> &Cases) { for (unsigned i = 0, e = Cases.size(); i != e; ++i) - if (Cases[i].second == BB) { + if (Cases[i].Dest == BB) { Cases.erase(Cases.begin()+i); --i; --e; } @@ -509,9 +566,9 @@ static void EliminateBlockCases(BasicBlock *BB, /// ValuesOverlap - Return true if there are any keys in C1 that exist in C2 as /// well. static bool -ValuesOverlap(std::vector<std::pair<ConstantInt*, BasicBlock*> > &C1, - std::vector<std::pair<ConstantInt*, BasicBlock*> > &C2) { - std::vector<std::pair<ConstantInt*, BasicBlock*> > *V1 = &C1, *V2 = &C2; +ValuesOverlap(std::vector<ValueEqualityComparisonCase> &C1, + std::vector<ValueEqualityComparisonCase > &C2) { + std::vector<ValueEqualityComparisonCase> *V1 = &C1, *V2 = &C2; // Make V1 be smaller than V2. if (V1->size() > V2->size()) @@ -520,9 +577,9 @@ ValuesOverlap(std::vector<std::pair<ConstantInt*, BasicBlock*> > &C1, if (V1->size() == 0) return false; if (V1->size() == 1) { // Just scan V2. - ConstantInt *TheVal = (*V1)[0].first; + ConstantInt *TheVal = (*V1)[0].Value; for (unsigned i = 0, e = V2->size(); i != e; ++i) - if (TheVal == (*V2)[i].first) + if (TheVal == (*V2)[i].Value) return true; } @@ -531,9 +588,9 @@ ValuesOverlap(std::vector<std::pair<ConstantInt*, BasicBlock*> > &C1, array_pod_sort(V2->begin(), V2->end()); unsigned i1 = 0, i2 = 0, e1 = V1->size(), e2 = V2->size(); while (i1 != e1 && i2 != e2) { - if ((*V1)[i1].first == (*V2)[i2].first) + if ((*V1)[i1].Value == (*V2)[i2].Value) return true; - if ((*V1)[i1].first < (*V2)[i2].first) + if ((*V1)[i1].Value < (*V2)[i2].Value) ++i1; else ++i2; @@ -559,13 +616,13 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, if (ThisVal != PredVal) return false; // Different predicates. // Find out information about when control will move from Pred to TI's block. - std::vector<std::pair<ConstantInt*, BasicBlock*> > PredCases; + std::vector<ValueEqualityComparisonCase> PredCases; BasicBlock *PredDef = GetValueEqualityComparisonCases(Pred->getTerminator(), PredCases); EliminateBlockCases(PredDef, PredCases); // Remove default from cases. // Find information about how control leaves this block. 
- std::vector<std::pair<ConstantInt*, BasicBlock*> > ThisCases; + std::vector<ValueEqualityComparisonCase> ThisCases; BasicBlock *ThisDef = GetValueEqualityComparisonCases(TI, ThisCases); EliminateBlockCases(ThisDef, ThisCases); // Remove default from cases. @@ -587,7 +644,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, (void) NI; // Remove PHI node entries for the dead edge. - ThisCases[0].second->removePredecessor(TI->getParent()); + ThisCases[0].Dest->removePredecessor(TI->getParent()); DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator() << "Through successor TI: " << *TI << "Leaving: " << *NI << "\n"); @@ -600,7 +657,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, // Okay, TI has cases that are statically dead, prune them away. SmallPtrSet<Constant*, 16> DeadCases; for (unsigned i = 0, e = PredCases.size(); i != e; ++i) - DeadCases.insert(PredCases[i].first); + DeadCases.insert(PredCases[i].Value); DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator() << "Through successor TI: " << *TI); @@ -622,10 +679,10 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, ConstantInt *TIV = 0; BasicBlock *TIBB = TI->getParent(); for (unsigned i = 0, e = PredCases.size(); i != e; ++i) - if (PredCases[i].second == TIBB) { + if (PredCases[i].Dest == TIBB) { if (TIV != 0) return false; // Cannot handle multiple values coming to this block. - TIV = PredCases[i].first; + TIV = PredCases[i].Value; } assert(TIV && "No edge from pred to succ?"); @@ -633,8 +690,8 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, // BB. Find out which successor will unconditionally be branched to. BasicBlock *TheRealDest = 0; for (unsigned i = 0, e = ThisCases.size(); i != e; ++i) - if (ThisCases[i].first == TIV) { - TheRealDest = ThisCases[i].second; + if (ThisCases[i].Value == TIV) { + TheRealDest = ThisCases[i].Dest; break; } @@ -702,10 +759,10 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, if (PCV == CV && SafeToMergeTerminators(TI, PTI)) { // Figure out which 'cases' to copy from SI to PSI. - std::vector<std::pair<ConstantInt*, BasicBlock*> > BBCases; + std::vector<ValueEqualityComparisonCase> BBCases; BasicBlock *BBDefault = GetValueEqualityComparisonCases(TI, BBCases); - std::vector<std::pair<ConstantInt*, BasicBlock*> > PredCases; + std::vector<ValueEqualityComparisonCase> PredCases; BasicBlock *PredDefault = GetValueEqualityComparisonCases(PTI, PredCases); // Based on whether the default edge from PTI goes to BB or not, fill in @@ -718,8 +775,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, // that don't occur in PTI, or that branch to BB will be activated. std::set<ConstantInt*, ConstantIntOrdering> PTIHandled; for (unsigned i = 0, e = PredCases.size(); i != e; ++i) - if (PredCases[i].second != BB) - PTIHandled.insert(PredCases[i].first); + if (PredCases[i].Dest != BB) + PTIHandled.insert(PredCases[i].Value); else { // The default destination is BB, we don't need explicit targets. 
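The context that follows completes the O(1) swap-and-pop erase used on both sides of this hunk instead of vector::erase; spelled out:

    std::swap(PredCases[i], PredCases.back()); // move the victim to the end
    PredCases.pop_back();                      // O(1) removal, order not kept
    --i; --e;                                  // revisit the element now in slot i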
std::swap(PredCases[i], PredCases.back()); @@ -734,10 +791,10 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, NewSuccessors.push_back(BBDefault); } for (unsigned i = 0, e = BBCases.size(); i != e; ++i) - if (!PTIHandled.count(BBCases[i].first) && - BBCases[i].second != BBDefault) { + if (!PTIHandled.count(BBCases[i].Value) && + BBCases[i].Dest != BBDefault) { PredCases.push_back(BBCases[i]); - NewSuccessors.push_back(BBCases[i].second); + NewSuccessors.push_back(BBCases[i].Dest); } } else { @@ -746,8 +803,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, // activated. std::set<ConstantInt*, ConstantIntOrdering> PTIHandled; for (unsigned i = 0, e = PredCases.size(); i != e; ++i) - if (PredCases[i].second == BB) { - PTIHandled.insert(PredCases[i].first); + if (PredCases[i].Dest == BB) { + PTIHandled.insert(PredCases[i].Value); std::swap(PredCases[i], PredCases.back()); PredCases.pop_back(); --i; --e; @@ -756,11 +813,11 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, // Okay, now we know which constants were sent to BB from the // predecessor. Figure out where they will all go now. for (unsigned i = 0, e = BBCases.size(); i != e; ++i) - if (PTIHandled.count(BBCases[i].first)) { + if (PTIHandled.count(BBCases[i].Value)) { // If this is one we are capable of getting... PredCases.push_back(BBCases[i]); - NewSuccessors.push_back(BBCases[i].second); - PTIHandled.erase(BBCases[i].first);// This constant is taken care of + NewSuccessors.push_back(BBCases[i].Dest); + PTIHandled.erase(BBCases[i].Value);// This constant is taken care of } // If there are any constants vectored to BB that TI doesn't handle, @@ -768,7 +825,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, for (std::set<ConstantInt*, ConstantIntOrdering>::iterator I = PTIHandled.begin(), E = PTIHandled.end(); I != E; ++I) { - PredCases.push_back(std::make_pair(*I, BBDefault)); + PredCases.push_back(ValueEqualityComparisonCase(*I, BBDefault)); NewSuccessors.push_back(BBDefault); } } @@ -792,7 +849,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, PredCases.size()); NewSI->setDebugLoc(PTI->getDebugLoc()); for (unsigned i = 0, e = PredCases.size(); i != e; ++i) - NewSI->addCase(PredCases[i].first, PredCases[i].second); + NewSI->addCase(PredCases[i].Value, PredCases[i].Dest); EraseTerminatorInstAndDCECond(PTI); @@ -1273,7 +1330,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) { return false; } - // If we folded the the first phi, PN dangles at this point. Refresh it. If + // If we folded the first phi, PN dangles at this point. Refresh it. If // we ran out of PHIs then we simplified them all. PN = dyn_cast<PHINode>(BB->begin()); if (PN == 0) return true; @@ -1490,6 +1547,23 @@ static APInt MultiplyAndLosePrecision(APInt &A, APInt &B, APInt &C, APInt &D, return Result; } +/// checkCSEInPredecessor - Return true if the given instruction is available +/// in its predecessor block. If yes, the instruction will be removed. +/// +static bool checkCSEInPredecessor(Instruction *Inst, BasicBlock *PB) { + if (!isa<BinaryOperator>(Inst) && !isa<CmpInst>(Inst)) + return false; + for (BasicBlock::iterator I = PB->begin(), E = PB->end(); I != E; I++) { + Instruction *PBI = &*I; + // Check whether Inst and PBI generate the same value. 
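checkCSEInPredecessor relies on Instruction::isIdenticalTo, which demands the same opcode, types, and operands, making this a deliberately local form of CSE. The IR shape it targets, with hypothetical values:

    // PB:  %c  = icmp slt i32 %x, %y
    //      br i1 %c, label %BB, label %Succ
    // BB:  %c2 = icmp slt i32 %x, %y   ; identical to %c
    //      br label %Succ
    // %c2 is RAUW'd with %c and erased, leaving BB with only its compare
    // and branch, the shape the unconditional-branch folding below needs.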
+ if (Inst->isIdenticalTo(PBI)) { + Inst->replaceAllUsesWith(PBI); + Inst->eraseFromParent(); + return true; + } + } + return false; +} /// FoldBranchToCommonDest - If this basic block is simple enough, and if a /// predecessor branches to us and one of our successors, fold the block into @@ -1497,7 +1571,36 @@ static APInt MultiplyAndLosePrecision(APInt &A, APInt &B, APInt &C, APInt &D, bool llvm::FoldBranchToCommonDest(BranchInst *BI) { BasicBlock *BB = BI->getParent(); - Instruction *Cond = dyn_cast<Instruction>(BI->getCondition()); + Instruction *Cond = 0; + if (BI->isConditional()) + Cond = dyn_cast<Instruction>(BI->getCondition()); + else { + // For unconditional branch, check for a simple CFG pattern, where + // BB has a single predecessor and BB's successor is also its predecessor's + // successor. If such a pattern exists, check for CSE between BB and its + // predecessor. + if (BasicBlock *PB = BB->getSinglePredecessor()) + if (BranchInst *PBI = dyn_cast<BranchInst>(PB->getTerminator())) + if (PBI->isConditional() && + (BI->getSuccessor(0) == PBI->getSuccessor(0) || + BI->getSuccessor(0) == PBI->getSuccessor(1))) { + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); + I != E; ) { + Instruction *Curr = I++; + if (isa<CmpInst>(Curr)) { + Cond = Curr; + break; + } + // Quit if we can't remove this instruction. + if (!checkCSEInPredecessor(Curr, PB)) + return false; + } + } + + if (Cond == 0) + return false; + } + if (Cond == 0 || (!isa<CmpInst>(Cond) && !isa<BinaryOperator>(Cond)) || Cond->getParent() != BB || !Cond->hasOneUse()) return false; @@ -1549,7 +1652,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { // Finally, don't infinitely unroll conditional loops. BasicBlock *TrueDest = BI->getSuccessor(0); - BasicBlock *FalseDest = BI->getSuccessor(1); + BasicBlock *FalseDest = (BI->isConditional()) ? BI->getSuccessor(1) : 0; if (TrueDest == BB || FalseDest == BB) return false; @@ -1560,23 +1663,33 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { // Check that we have two conditional branches. If there is a PHI node in // the common successor, verify that the same value flows in from both // blocks. - if (PBI == 0 || PBI->isUnconditional() || !SafeToMergeTerminators(BI, PBI)) + SmallVector<PHINode*, 4> PHIs; + if (PBI == 0 || PBI->isUnconditional() || + (BI->isConditional() && + !SafeToMergeTerminators(BI, PBI)) || + (!BI->isConditional() && + !isProfitableToFoldUnconditional(BI, PBI, Cond, PHIs))) continue; // Determine if the two branches share a common destination. Instruction::BinaryOps Opc; bool InvertPredCond = false; - if (PBI->getSuccessor(0) == TrueDest) - Opc = Instruction::Or; - else if (PBI->getSuccessor(1) == FalseDest) - Opc = Instruction::And; - else if (PBI->getSuccessor(0) == FalseDest) - Opc = Instruction::And, InvertPredCond = true; - else if (PBI->getSuccessor(1) == TrueDest) - Opc = Instruction::Or, InvertPredCond = true; - else - continue; + if (BI->isConditional()) { + if (PBI->getSuccessor(0) == TrueDest) + Opc = Instruction::Or; + else if (PBI->getSuccessor(1) == FalseDest) + Opc = Instruction::And; + else if (PBI->getSuccessor(0) == FalseDest) + Opc = Instruction::And, InvertPredCond = true; + else if (PBI->getSuccessor(1) == TrueDest) + Opc = Instruction::Or, InvertPredCond = true; + else + continue; + } else { + if (PBI->getSuccessor(0) != TrueDest && PBI->getSuccessor(1) != TrueDest) + continue; + } // Ensure that any values used in the bonus instruction are also used // by the terminator of the predecessor.
This means that those values @@ -1652,17 +1765,69 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { New->takeName(Cond); Cond->setName(New->getName()+".old"); - Instruction *NewCond = - cast<Instruction>(Builder.CreateBinOp(Opc, PBI->getCondition(), + if (BI->isConditional()) { + Instruction *NewCond = + cast<Instruction>(Builder.CreateBinOp(Opc, PBI->getCondition(), New, "or.cond")); - PBI->setCondition(NewCond); - if (PBI->getSuccessor(0) == BB) { - AddPredecessorToBlock(TrueDest, PredBlock, BB); - PBI->setSuccessor(0, TrueDest); - } - if (PBI->getSuccessor(1) == BB) { - AddPredecessorToBlock(FalseDest, PredBlock, BB); - PBI->setSuccessor(1, FalseDest); + PBI->setCondition(NewCond); + + if (PBI->getSuccessor(0) == BB) { + AddPredecessorToBlock(TrueDest, PredBlock, BB); + PBI->setSuccessor(0, TrueDest); + } + if (PBI->getSuccessor(1) == BB) { + AddPredecessorToBlock(FalseDest, PredBlock, BB); + PBI->setSuccessor(1, FalseDest); + } + } else { + // Update PHI nodes in the common successors. + for (unsigned i = 0, e = PHIs.size(); i != e; ++i) { + ConstantInt *PBI_C = cast<ConstantInt>( + PHIs[i]->getIncomingValueForBlock(PBI->getParent())); + assert(PBI_C->getType()->isIntegerTy(1)); + Instruction *MergedCond = 0; + if (PBI->getSuccessor(0) == TrueDest) { + // Create (PBI_Cond and PBI_C) or (!PBI_Cond and BI_Value) + // PBI_C is true: PBI_Cond or (!PBI_Cond and BI_Value) + // is false: !PBI_Cond and BI_Value + Instruction *NotCond = + cast<Instruction>(Builder.CreateNot(PBI->getCondition(), + "not.cond")); + MergedCond = + cast<Instruction>(Builder.CreateBinOp(Instruction::And, + NotCond, New, + "and.cond")); + if (PBI_C->isOne()) + MergedCond = + cast<Instruction>(Builder.CreateBinOp(Instruction::Or, + PBI->getCondition(), MergedCond, + "or.cond")); + } else { + // Create (PBI_Cond and BI_Value) or (!PBI_Cond and PBI_C) + // PBI_C is true: (PBI_Cond and BI_Value) or (!PBI_Cond) + // is false: PBI_Cond and BI_Value + MergedCond = + cast<Instruction>(Builder.CreateBinOp(Instruction::And, + PBI->getCondition(), New, + "and.cond")); + if (PBI_C->isOne()) { + Instruction *NotCond = + cast<Instruction>(Builder.CreateNot(PBI->getCondition(), + "not.cond")); + MergedCond = + cast<Instruction>(Builder.CreateBinOp(Instruction::Or, + NotCond, MergedCond, + "or.cond")); + } + } + // Update PHI Node. + PHIs[i]->setIncomingValue(PHIs[i]->getBasicBlockIndex(PBI->getParent()), + MergedCond); + } + // Change PBI from Conditional to Unconditional. + BranchInst *New_PBI = BranchInst::Create(TrueDest, PBI); + EraseTerminatorInstAndDCECond(PBI); + PBI = New_PBI; } // TODO: If BB is reachable from all paths through PredBlock, then we @@ -1670,7 +1835,8 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { // Merge probability data into PredBlock's branch. 
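The tail of the hunk below renormalizes the merged weights by their greatest common divisor, so that truncating the APInts to 32 bits for the metadata loses as little precision as possible. A condensed sketch, assuming the GCD comes from APIntOps::GreatestCommonDivisor:

    APInt GCD = APIntOps::GreatestCommonDivisor(ProbTrue, ProbFalse);
    ProbTrue = ProbTrue.udiv(GCD);
    ProbFalse = ProbFalse.udiv(GCD);
    MDBuilder MDB(BI->getContext());
    PBI->setMetadata(LLVMContext::MD_prof,
                     MDB.createBranchWeights(ProbTrue.getZExtValue(),
                                             ProbFalse.getZExtValue()));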
APInt A, B, C, D; - if (ExtractBranchMetadata(PBI, C, D) && ExtractBranchMetadata(BI, A, B)) { + if (PBI->isConditional() && BI->isConditional() && + ExtractBranchMetadata(PBI, C, D) && ExtractBranchMetadata(BI, A, B)) { // Given IR which does: // bbA: // br i1 %x, label %bbB, label %bbC @@ -1740,12 +1906,10 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { ProbTrue = ProbTrue.udiv(GCD); ProbFalse = ProbFalse.udiv(GCD); - LLVMContext &Context = BI->getContext(); - Value *Ops[3]; - Ops[0] = BI->getMetadata(LLVMContext::MD_prof)->getOperand(0); - Ops[1] = ConstantInt::get(Context, ProbTrue); - Ops[2] = ConstantInt::get(Context, ProbFalse); - PBI->setMetadata(LLVMContext::MD_prof, MDNode::get(Context, Ops)); + MDBuilder MDB(BI->getContext()); + MDNode *N = MDB.createBranchWeights(ProbTrue.getZExtValue(), + ProbFalse.getZExtValue()); + PBI->setMetadata(LLVMContext::MD_prof, N); } else { PBI->setMetadata(LLVMContext::MD_prof, NULL); } @@ -2758,6 +2922,12 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){ return true; } + // If this basic block is ONLY a compare and a branch, and if a predecessor + // branches to us and our successor, fold the comparison into the + // predecessor and use logical operations to update the incoming value + // for PHI nodes in common successor. + if (FoldBranchToCommonDest(BI)) + return SimplifyCFG(BB) | true; return false; } diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp index 4030bef..5d673f1 100644 --- a/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -16,7 +16,6 @@ #define DEBUG_TYPE "indvars" #include "llvm/Instructions.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/IVUsers.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" @@ -44,7 +43,6 @@ namespace { class SimplifyIndvar { Loop *L; LoopInfo *LI; - DominatorTree *DT; ScalarEvolution *SE; const TargetData *TD; // May be NULL diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp index 9d62306..62d23cb 100644 --- a/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/lib/Transforms/Vectorize/BBVectorize.cpp @@ -23,6 +23,7 @@ #include "llvm/IntrinsicInst.h" #include "llvm/Intrinsics.h" #include "llvm/LLVMContext.h" +#include "llvm/Metadata.h" #include "llvm/Pass.h" #include "llvm/Type.h" #include "llvm/ADT/DenseMap.h" @@ -41,6 +42,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Vectorize.h" #include <algorithm> #include <map> @@ -66,6 +68,10 @@ static cl::opt<unsigned> MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden, cl::desc("The maximum number of pairing iterations")); +static cl::opt<bool> +Pow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden, + cl::desc("Don't try to form non-2^n-length vectors")); + static cl::opt<unsigned> MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden, cl::desc("The maximum number of pairable instructions per group")); @@ -76,6 +82,10 @@ MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200), " a full cycle check")); static cl::opt<bool> +NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize boolean (i1) values")); + +static cl::opt<bool> NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize integer 
values")); @@ -104,6 +114,10 @@ NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize select instructions")); static cl::opt<bool> +NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize comparison instructions")); + +static cl::opt<bool> NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize getelementptr instructions")); @@ -182,12 +196,12 @@ namespace { // FIXME: const correct? - bool vectorizePairs(BasicBlock &BB); + bool vectorizePairs(BasicBlock &BB, bool NonPow2Len = false); bool getCandidatePairs(BasicBlock &BB, BasicBlock::iterator &Start, std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts); + std::vector<Value *> &PairableInsts, bool NonPow2Len); void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs, std::vector<Value *> &PairableInsts, @@ -211,7 +225,7 @@ namespace { bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore); bool areInstsCompatible(Instruction *I, Instruction *J, - bool IsSimpleLoadStore); + bool IsSimpleLoadStore, bool NonPow2Len); bool trackUsesOfI(DenseSet<Value *> &Users, AliasSetTracker &WriteSet, Instruction *I, @@ -263,26 +277,32 @@ namespace { bool UseCycleCheck); Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I, - Instruction *J, unsigned o, bool &FlipMemInputs); + Instruction *J, unsigned o, bool FlipMemInputs); void fillNewShuffleMask(LLVMContext& Context, Instruction *J, - unsigned NumElem, unsigned MaskOffset, unsigned NumInElem, - unsigned IdxOffset, std::vector<Constant*> &Mask); + unsigned MaskOffset, unsigned NumInElem, + unsigned NumInElem1, unsigned IdxOffset, + std::vector<Constant*> &Mask); Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I, Instruction *J); + bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J, + unsigned o, Value *&LOp, unsigned numElemL, + Type *ArgTypeL, Type *ArgTypeR, + unsigned IdxOff = 0); + Value *getReplacementInput(LLVMContext& Context, Instruction *I, Instruction *J, unsigned o, bool FlipMemInputs); void getReplacementInputsForPair(LLVMContext& Context, Instruction *I, Instruction *J, SmallVector<Value *, 3> &ReplacedOperands, - bool &FlipMemInputs); + bool FlipMemInputs); void replaceOutputsOfPair(LLVMContext& Context, Instruction *I, Instruction *J, Instruction *K, Instruction *&InsertionPt, Instruction *&K1, - Instruction *&K2, bool &FlipMemInputs); + Instruction *&K2, bool FlipMemInputs); void collectPairLoadMoveSet(BasicBlock &BB, DenseMap<Value *, Value *> &ChosenPairs, @@ -294,6 +314,10 @@ namespace { DenseMap<Value *, Value *> &ChosenPairs, std::multimap<Value *, Value *> &LoadMoveSet); + void collectPtrInfo(std::vector<Value *> &PairableInsts, + DenseMap<Value *, Value *> &ChosenPairs, + DenseSet<Value *> &LowPtrInsts); + bool canMoveUsesOfIAfterJ(BasicBlock &BB, std::multimap<Value *, Value *> &LoadMoveSet, Instruction *I, Instruction *J); @@ -303,12 +327,15 @@ namespace { Instruction *&InsertionPt, Instruction *I, Instruction *J); + void combineMetadata(Instruction *K, const Instruction *J); + bool vectorizeBB(BasicBlock &BB) { bool changed = false; // Iterate a sufficient number of times to merge types of size 1 bit, // then 2 bits, then 4, etc. up to half of the target vector width of the // target vector register. 
- for (unsigned v = 2, n = 1; + unsigned n = 1; + for (unsigned v = 2; v <= Config.VectorBits && (!Config.MaxIter || n <= Config.MaxIter); v *= 2, ++n) { DEBUG(dbgs() << "BBV: fusing loop #" << n << @@ -320,6 +347,16 @@ namespace { break; } + if (changed && !Pow2LenOnly) { + ++n; + for (; !Config.MaxIter || n <= Config.MaxIter; ++n) { + DEBUG(dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " << + n << " for " << BB.getName() << " in " << + BB.getParent()->getName() << "...\n"); + if (!vectorizePairs(BB, true)) break; + } + } + DEBUG(dbgs() << "BBV: done!\n"); return changed; } @@ -341,15 +378,43 @@ namespace { AU.setPreservesCFG(); } - // This returns the vector type that holds a pair of the provided type. - // If the provided type is already a vector, then its length is doubled. - static inline VectorType *getVecTypeForPair(Type *ElemTy) { + static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) { + assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() && + "Cannot form vector from incompatible scalar types"); + Type *STy = ElemTy->getScalarType(); + + unsigned numElem; if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) { - unsigned numElem = VTy->getNumElements(); - return VectorType::get(ElemTy->getScalarType(), numElem*2); + numElem = VTy->getNumElements(); + } else { + numElem = 1; } - return VectorType::get(ElemTy, 2); + if (VectorType *VTy = dyn_cast<VectorType>(Elem2Ty)) { + numElem += VTy->getNumElements(); + } else { + numElem += 1; + } + + return VectorType::get(STy, numElem); + } + + static inline void getInstructionTypes(Instruction *I, + Type *&T1, Type *&T2) { + if (isa<StoreInst>(I)) { + // For stores, it is the value type, not the pointer type that matters + // because the value is what will come from a vector register. + + Value *IVal = cast<StoreInst>(I)->getValueOperand(); + T1 = IVal->getType(); + } else { + T1 = I->getType(); + } + + if (I->isCast()) + T2 = cast<CastInst>(I)->getSrcTy(); + else + T2 = T1; } // Returns the weight associated with the provided value. A chain of @@ -385,8 +450,7 @@ namespace { // true if the offset could be determined to be some constant value. // For example, if OffsetInElmts == 1, then J accesses the memory directly // after I; if OffsetInElmts == -1 then I accesses the memory - // directly after J. This function assumes that both instructions - // have the same type. + // directly after J. bool getPairPtrInfo(Instruction *I, Instruction *J, Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment, int64_t &OffsetInElmts) { @@ -418,7 +482,12 @@ namespace { Type *VTy = cast<PointerType>(IPtr->getType())->getElementType(); int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy); - assert(VTy == cast<PointerType>(JPtr->getType())->getElementType()); + Type *VTy2 = cast<PointerType>(JPtr->getType())->getElementType(); + if (VTy != VTy2 && Offset < 0) { + int64_t VTy2TSS = (int64_t) TD->getTypeStoreSize(VTy2); + OffsetInElmts = Offset/VTy2TSS; + return (abs64(Offset) % VTy2TSS) == 0; + } OffsetInElmts = Offset/VTyTSS; return (abs64(Offset) % VTyTSS) == 0; @@ -471,7 +540,7 @@ namespace { // This function implements one vectorization iteration on the provided // basic block. It returns true if the block is changed. 
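The generalized getVecTypeForPair above concatenates the two element counts instead of doubling a single length, which is what makes non-power-of-two fusion possible. For example, with hypothetical types:

    Type *FloatTy = Type::getFloatTy(Context);
    VectorType *VT = getVecTypeForPair(VectorType::get(FloatTy, 2),
                                       VectorType::get(FloatTy, 3));
    // VT is <5 x float>; the assert requires only matching scalar types,
    // no longer matching lengths.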
- bool BBVectorize::vectorizePairs(BasicBlock &BB) { + bool BBVectorize::vectorizePairs(BasicBlock &BB, bool NonPow2Len) { bool ShouldContinue; BasicBlock::iterator Start = BB.getFirstInsertionPt(); @@ -482,7 +551,7 @@ namespace { std::vector<Value *> PairableInsts; std::multimap<Value *, Value *> CandidatePairs; ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs, - PairableInsts); + PairableInsts, NonPow2Len); if (PairableInsts.empty()) continue; // Now we have a map of all of the pairable instructions and we need to @@ -529,6 +598,10 @@ namespace { // passes should coalesce the build/extract combinations. fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs); + + // It is important to cleanup here so that future iterations of this + // function have less work to do. + (void) SimplifyInstructionsInBlock(&BB, TD); return true; } @@ -567,6 +640,9 @@ namespace { } else if (isa<SelectInst>(I)) { if (!Config.VectorizeSelect) return false; + } else if (isa<CmpInst>(I)) { + if (!Config.VectorizeCmp) + return false; } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) { if (!Config.VectorizeGEP) return false; @@ -584,41 +660,39 @@ namespace { return false; Type *T1, *T2; - if (isa<StoreInst>(I)) { - // For stores, it is the value type, not the pointer type that matters - // because the value is what will come from a vector register. - - Value *IVal = cast<StoreInst>(I)->getValueOperand(); - T1 = IVal->getType(); - } else { - T1 = I->getType(); - } - - if (I->isCast()) - T2 = cast<CastInst>(I)->getSrcTy(); - else - T2 = T1; + getInstructionTypes(I, T1, T2); // Not every type can be vectorized... if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) || !(VectorType::isValidElementType(T2) || T2->isVectorTy())) return false; - if (!Config.VectorizeInts - && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy())) - return false; - + if (T1->getScalarSizeInBits() == 1 && T2->getScalarSizeInBits() == 1) { + if (!Config.VectorizeBools) + return false; + } else { + if (!Config.VectorizeInts + && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy())) + return false; + } + if (!Config.VectorizeFloats && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy())) return false; + // Don't vectorize target-specific types. + if (T1->isX86_FP80Ty() || T1->isPPC_FP128Ty() || T1->isX86_MMXTy()) + return false; + if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy()) + return false; + if ((!Config.VectorizePointers || TD == 0) && (T1->getScalarType()->isPointerTy() || T2->getScalarType()->isPointerTy())) return false; - if (T1->getPrimitiveSizeInBits() > Config.VectorBits/2 || - T2->getPrimitiveSizeInBits() > Config.VectorBits/2) + if (T1->getPrimitiveSizeInBits() >= Config.VectorBits || + T2->getPrimitiveSizeInBits() >= Config.VectorBits) return false; return true; @@ -629,36 +703,25 @@ namespace { // that I has already been determined to be vectorizable and that J is not // in the use tree of I. bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J, - bool IsSimpleLoadStore) { + bool IsSimpleLoadStore, bool NonPow2Len) { DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I << " <-> " << *J << "\n"); // Loads and stores can be merged if they have different alignments, // but are otherwise the same. 
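The rewrite below collapses the hand-written per-opcode load/store field comparisons into a single isSameOperationAs query parameterized by comparison flags. Condensed, with the flags as used in the hunk:

    unsigned Flags = Instruction::CompareIgnoringAlignment;
    if (NonPow2Len)   // compare by scalar type, so differing vector lengths pair
      Flags |= Instruction::CompareUsingScalarTypes;
    if (!J->isSameOperationAs(I, Flags))
      return false;   // opcode, operand types, or attributes differ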
- LoadInst *LI, *LJ; - StoreInst *SI, *SJ; - if ((LI = dyn_cast<LoadInst>(I)) && (LJ = dyn_cast<LoadInst>(J))) { - if (I->getType() != J->getType()) - return false; + if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment | + (NonPow2Len ? Instruction::CompareUsingScalarTypes : 0))) + return false; - if (LI->getPointerOperand()->getType() != - LJ->getPointerOperand()->getType() || - LI->isVolatile() != LJ->isVolatile() || - LI->getOrdering() != LJ->getOrdering() || - LI->getSynchScope() != LJ->getSynchScope()) - return false; - } else if ((SI = dyn_cast<StoreInst>(I)) && (SJ = dyn_cast<StoreInst>(J))) { - if (SI->getValueOperand()->getType() != - SJ->getValueOperand()->getType() || - SI->getPointerOperand()->getType() != - SJ->getPointerOperand()->getType() || - SI->isVolatile() != SJ->isVolatile() || - SI->getOrdering() != SJ->getOrdering() || - SI->getSynchScope() != SJ->getSynchScope()) - return false; - } else if (!J->isSameOperationAs(I)) { + Type *IT1, *IT2, *JT1, *JT2; + getInstructionTypes(I, IT1, IT2); + getInstructionTypes(J, JT1, JT2); + unsigned MaxTypeBits = std::max( + IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(), + IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits()); + if (MaxTypeBits > Config.VectorBits) return false; - } + // FIXME: handle addsub-type operations! if (IsSimpleLoadStore) { @@ -668,8 +731,11 @@ namespace { if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, OffsetInElmts) && abs64(OffsetInElmts) == 1) { if (Config.AlignedOnly) { - Type *aType = isa<StoreInst>(I) ? + Type *aTypeI = isa<StoreInst>(I) ? cast<StoreInst>(I)->getValueOperand()->getType() : I->getType(); + Type *aTypeJ = isa<StoreInst>(J) ? + cast<StoreInst>(J)->getValueOperand()->getType() : J->getType(); + // An aligned load or store is possible only if the instruction // with the lower offset has an alignment suitable for the // vector type. @@ -677,7 +743,7 @@ namespace { unsigned BottomAlignment = IAlignment; if (OffsetInElmts < 0) BottomAlignment = JAlignment; - Type *VType = getVecTypeForPair(aType); + Type *VType = getVecTypeForPair(aTypeI, aTypeJ); unsigned VecAlignment = TD->getPrefTypeAlignment(VType); if (BottomAlignment < VecAlignment) return false; @@ -685,11 +751,6 @@ namespace { } else { return false; } - } else if (isa<ShuffleVectorInst>(I)) { - // Only merge two shuffles if they're both constant - return isa<Constant>(I->getOperand(2)) && - isa<Constant>(J->getOperand(2)); - // FIXME: We may want to vectorize non-constant shuffles also. } // The powi intrinsic is special because only the first argument is @@ -772,7 +833,7 @@ namespace { bool BBVectorize::getCandidatePairs(BasicBlock &BB, BasicBlock::iterator &Start, std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts) { + std::vector<Value *> &PairableInsts, bool NonPow2Len) { BasicBlock::iterator E = BB.end(); if (Start == E) return false; @@ -808,7 +869,7 @@ namespace { // J does not use I, and comes before the first use of I, so it can be // merged with I if the instructions are compatible. - if (!areInstsCompatible(I, J, IsSimpleLoadStore)) continue; + if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len)) continue; // J is a candidate for merging with I. if (!PairableInsts.size() || @@ -1430,24 +1491,27 @@ namespace { // instruction that fuses I with J. 
Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context, Instruction *I, Instruction *J, unsigned o, - bool &FlipMemInputs) { + bool FlipMemInputs) { Value *IPtr, *JPtr; unsigned IAlignment, JAlignment; int64_t OffsetInElmts; + + // Note: the analysis might fail here, that is why FlipMemInputs has + // been precomputed (OffsetInElmts must be unused here). (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, OffsetInElmts); // The pointer value is taken to be the one with the lowest offset. Value *VPtr; - if (OffsetInElmts > 0) { + if (!FlipMemInputs) { VPtr = IPtr; } else { - FlipMemInputs = true; VPtr = JPtr; } - Type *ArgType = cast<PointerType>(IPtr->getType())->getElementType(); - Type *VArgType = getVecTypeForPair(ArgType); + Type *ArgTypeI = cast<PointerType>(IPtr->getType())->getElementType(); + Type *ArgTypeJ = cast<PointerType>(JPtr->getType())->getElementType(); + Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); Type *VArgPtrType = PointerType::get(VArgType, cast<PointerType>(IPtr->getType())->getAddressSpace()); return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o), @@ -1455,15 +1519,17 @@ namespace { } void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J, - unsigned NumElem, unsigned MaskOffset, unsigned NumInElem, - unsigned IdxOffset, std::vector<Constant*> &Mask) { - for (unsigned v = 0; v < NumElem/2; ++v) { + unsigned MaskOffset, unsigned NumInElem, + unsigned NumInElem1, unsigned IdxOffset, + std::vector<Constant*> &Mask) { + unsigned NumElem1 = cast<VectorType>(J->getType())->getNumElements(); + for (unsigned v = 0; v < NumElem1; ++v) { int m = cast<ShuffleVectorInst>(J)->getMaskValue(v); if (m < 0) { Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context)); } else { unsigned mm = m + (int) IdxOffset; - if (m >= (int) NumInElem) + if (m >= (int) NumInElem1) mm += (int) NumInElem; Mask[v+MaskOffset] = @@ -1479,8 +1545,11 @@ namespace { // This is the shuffle mask. We need to append the second // mask to the first, and the numbers need to be adjusted. - Type *ArgType = I->getType(); - Type *VArgType = getVecTypeForPair(ArgType); + Type *ArgTypeI = I->getType(); + Type *ArgTypeJ = J->getType(); + Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); + + unsigned NumElemI = cast<VectorType>(ArgTypeI)->getNumElements(); // Get the total number of elements in the fused vector type. // By definition, this must equal the number of elements in @@ -1488,19 +1557,81 @@ namespace { unsigned NumElem = cast<VectorType>(VArgType)->getNumElements(); std::vector<Constant*> Mask(NumElem); - Type *OpType = I->getOperand(0)->getType(); - unsigned NumInElem = cast<VectorType>(OpType)->getNumElements(); + Type *OpTypeI = I->getOperand(0)->getType(); + unsigned NumInElemI = cast<VectorType>(OpTypeI)->getNumElements(); + Type *OpTypeJ = J->getOperand(0)->getType(); + unsigned NumInElemJ = cast<VectorType>(OpTypeJ)->getNumElements(); + + // The fused vector will be: + // ----------------------------------------------------- + // | NumInElemI | NumInElemJ | NumInElemI | NumInElemJ | + // ----------------------------------------------------- + // from which we'll extract NumElem total elements (where the first NumElemI + // of them come from the mask in I and the remainder come from the mask + // in J. // For the mask from the first pair... - fillNewShuffleMask(Context, I, NumElem, 0, NumInElem, 0, Mask); + fillNewShuffleMask(Context, I, 0, NumInElemJ, NumInElemI, + 0, Mask); // For the mask from the second pair... 
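A worked instance of the layout diagram above, with hypothetical sizes NumInElemI == 2 and NumInElemJ == 3, clarifies the second fillNewShuffleMask call below:

    // Fused shuffle inputs span ten lanes:
    //   | I.op0 : 0-1 | J.op0 : 2-4 | I.op1 : 5-6 | J.op1 : 7-9 |
    // For J's mask entries: mm = m + IdxOffset (= 2); if m >= NumInElem1 (= 3),
    // i.e. m selects from J's second input, add NumInElem (= 2) more.
    //   m = 0 -> lane 2 (start of J.op0),  m = 3 -> lane 7 (start of J.op1).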
- fillNewShuffleMask(Context, J, NumElem, NumElem/2, NumInElem, NumInElem, - Mask); + fillNewShuffleMask(Context, J, NumElemI, NumInElemI, NumInElemJ, + NumInElemI, Mask); return ConstantVector::get(Mask); } + bool BBVectorize::expandIEChain(LLVMContext& Context, Instruction *I, + Instruction *J, unsigned o, Value *&LOp, + unsigned numElemL, + Type *ArgTypeL, Type *ArgTypeH, + unsigned IdxOff) { + bool ExpandedIEChain = false; + if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) { + // If we have a pure insertelement chain, then this can be rewritten + // into a chain that directly builds the larger type. + bool PureChain = true; + InsertElementInst *LIENext = LIE; + do { + if (!isa<UndefValue>(LIENext->getOperand(0)) && + !isa<InsertElementInst>(LIENext->getOperand(0))) { + PureChain = false; + break; + } + } while ((LIENext = + dyn_cast<InsertElementInst>(LIENext->getOperand(0)))); + + if (PureChain) { + SmallVector<Value *, 8> VectElemts(numElemL, + UndefValue::get(ArgTypeL->getScalarType())); + InsertElementInst *LIENext = LIE; + do { + unsigned Idx = + cast<ConstantInt>(LIENext->getOperand(2))->getSExtValue(); + VectElemts[Idx] = LIENext->getOperand(1); + } while ((LIENext = + dyn_cast<InsertElementInst>(LIENext->getOperand(0)))); + + LIENext = 0; + Value *LIEPrev = UndefValue::get(ArgTypeH); + for (unsigned i = 0; i < numElemL; ++i) { + if (isa<UndefValue>(VectElemts[i])) continue; + LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i], + ConstantInt::get(Type::getInt32Ty(Context), + i + IdxOff), + getReplacementName(I, true, o, i+1)); + LIENext->insertBefore(J); + LIEPrev = LIENext; + } + + LOp = LIENext ? (Value*) LIENext : UndefValue::get(ArgTypeH); + ExpandedIEChain = true; + } + } + + return ExpandedIEChain; + } + // Returns the value to be used as the specified operand of the vector // instruction that fuses I with J. Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I, @@ -1508,84 +1639,333 @@ namespace { Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); - // Compute the fused vector type for this operand - Type *ArgType = I->getOperand(o)->getType(); - VectorType *VArgType = getVecTypeForPair(ArgType); + // Compute the fused vector type for this operand + Type *ArgTypeI = I->getOperand(o)->getType(); + Type *ArgTypeJ = J->getOperand(o)->getType(); + VectorType *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); Instruction *L = I, *H = J; + Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ; if (FlipMemInputs) { L = J; H = I; + ArgTypeL = ArgTypeJ; + ArgTypeH = ArgTypeI; } - if (ArgType->isVectorTy()) { - unsigned numElem = cast<VectorType>(VArgType)->getNumElements(); - std::vector<Constant*> Mask(numElem); - for (unsigned v = 0; v < numElem; ++v) - Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); + unsigned numElemL; + if (ArgTypeL->isVectorTy()) + numElemL = cast<VectorType>(ArgTypeL)->getNumElements(); + else + numElemL = 1; - Instruction *BV = new ShuffleVectorInst(L->getOperand(o), - H->getOperand(o), - ConstantVector::get(Mask), - getReplacementName(I, true, o)); - BV->insertBefore(J); - return BV; + unsigned numElemH; + if (ArgTypeH->isVectorTy()) + numElemH = cast<VectorType>(ArgTypeH)->getNumElements(); + else + numElemH = 1; + + Value *LOp = L->getOperand(o); + Value *HOp = H->getOperand(o); + unsigned numElem = VArgType->getNumElements(); + + // First, we check if we can reuse the "original" vector outputs (if these + // exist). 
We might need a shuffle. + ExtractElementInst *LEE = dyn_cast<ExtractElementInst>(LOp); + ExtractElementInst *HEE = dyn_cast<ExtractElementInst>(HOp); + ShuffleVectorInst *LSV = dyn_cast<ShuffleVectorInst>(LOp); + ShuffleVectorInst *HSV = dyn_cast<ShuffleVectorInst>(HOp); + + // FIXME: If we're fusing shuffle instructions, then we can't apply this + // optimization. The input vectors to the shuffle might be a different + // length from the shuffle outputs. Unfortunately, the replacement + // shuffle mask has already been formed, and the mask entries are sensitive + // to the sizes of the inputs. + bool IsSizeChangeShuffle = + isa<ShuffleVectorInst>(L) && + (LOp->getType() != L->getType() || HOp->getType() != H->getType()); + + if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) { + // We can have at most two unique vector inputs. + bool CanUseInputs = true; + Value *I1, *I2 = 0; + if (LEE) { + I1 = LEE->getOperand(0); + } else { + I1 = LSV->getOperand(0); + I2 = LSV->getOperand(1); + if (I2 == I1 || isa<UndefValue>(I2)) + I2 = 0; + } + + if (HEE) { + Value *I3 = HEE->getOperand(0); + if (!I2 && I3 != I1) + I2 = I3; + else if (I3 != I1 && I3 != I2) + CanUseInputs = false; + } else { + Value *I3 = HSV->getOperand(0); + if (!I2 && I3 != I1) + I2 = I3; + else if (I3 != I1 && I3 != I2) + CanUseInputs = false; + + if (CanUseInputs) { + Value *I4 = HSV->getOperand(1); + if (!isa<UndefValue>(I4)) { + if (!I2 && I4 != I1) + I2 = I4; + else if (I4 != I1 && I4 != I2) + CanUseInputs = false; + } + } + } + + if (CanUseInputs) { + unsigned LOpElem = + cast<VectorType>(cast<Instruction>(LOp)->getOperand(0)->getType()) + ->getNumElements(); + unsigned HOpElem = + cast<VectorType>(cast<Instruction>(HOp)->getOperand(0)->getType()) + ->getNumElements(); + + // We have one or two input vectors. We need to map each index of the + // operands to the index of the original vector. + SmallVector<std::pair<int, int>, 8> II(numElem); + for (unsigned i = 0; i < numElemL; ++i) { + int Idx, INum; + if (LEE) { + Idx = + cast<ConstantInt>(LEE->getOperand(1))->getSExtValue(); + INum = LEE->getOperand(0) == I1 ? 0 : 1; + } else { + Idx = LSV->getMaskValue(i); + if (Idx < (int) LOpElem) { + INum = LSV->getOperand(0) == I1 ? 0 : 1; + } else { + Idx -= LOpElem; + INum = LSV->getOperand(1) == I1 ? 0 : 1; + } + } + + II[i] = std::pair<int, int>(Idx, INum); + } + for (unsigned i = 0; i < numElemH; ++i) { + int Idx, INum; + if (HEE) { + Idx = + cast<ConstantInt>(HEE->getOperand(1))->getSExtValue(); + INum = HEE->getOperand(0) == I1 ? 0 : 1; + } else { + Idx = HSV->getMaskValue(i); + if (Idx < (int) HOpElem) { + INum = HSV->getOperand(0) == I1 ? 0 : 1; + } else { + Idx -= HOpElem; + INum = HSV->getOperand(1) == I1 ? 0 : 1; + } + } + + II[i + numElemL] = std::pair<int, int>(Idx, INum); + } + + // We now have an array which tells us from which index of which + // input vector each element of the operand comes. + VectorType *I1T = cast<VectorType>(I1->getType()); + unsigned I1Elem = I1T->getNumElements(); + + if (!I2) { + // In this case there is only one underlying vector input. Check for + // the trivial case where we can use the input directly. + if (I1Elem == numElem) { + bool ElemInOrder = true; + for (unsigned i = 0; i < numElem; ++i) { + if (II[i].first != (int) i && II[i].first != -1) { + ElemInOrder = false; + break; + } + } + + if (ElemInOrder) + return I1; + } + + // A shuffle is needed. 
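          // (For example, with hypothetical values: if I1 is <4 x float> and
          // II maps the fused operand's four lanes to I1 indices 2, 3, 0, 1,
          // the loop below builds the mask <i32 2, i32 3, i32 0, i32 1>; a
          // lane recorded as -1 becomes an undef mask entry instead.)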
+ std::vector<Constant *> Mask(numElem); + for (unsigned i = 0; i < numElem; ++i) { + int Idx = II[i].first; + if (Idx == -1) + Mask[i] = UndefValue::get(Type::getInt32Ty(Context)); + else + Mask[i] = ConstantInt::get(Type::getInt32Ty(Context), Idx); + } + + Instruction *S = + new ShuffleVectorInst(I1, UndefValue::get(I1T), + ConstantVector::get(Mask), + getReplacementName(I, true, o)); + S->insertBefore(J); + return S; + } + + VectorType *I2T = cast<VectorType>(I2->getType()); + unsigned I2Elem = I2T->getNumElements(); + + // This input comes from two distinct vectors. The first step is to + // make sure that both vectors are the same length. If not, the + // smaller one will need to grow before they can be shuffled together. + if (I1Elem < I2Elem) { + std::vector<Constant *> Mask(I2Elem); + unsigned v = 0; + for (; v < I1Elem; ++v) + Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); + for (; v < I2Elem; ++v) + Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); + + Instruction *NewI1 = + new ShuffleVectorInst(I1, UndefValue::get(I1T), + ConstantVector::get(Mask), + getReplacementName(I, true, o, 1)); + NewI1->insertBefore(J); + I1 = NewI1; + I1T = I2T; + I1Elem = I2Elem; + } else if (I1Elem > I2Elem) { + std::vector<Constant *> Mask(I1Elem); + unsigned v = 0; + for (; v < I2Elem; ++v) + Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); + for (; v < I1Elem; ++v) + Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); + + Instruction *NewI2 = + new ShuffleVectorInst(I2, UndefValue::get(I2T), + ConstantVector::get(Mask), + getReplacementName(I, true, o, 1)); + NewI2->insertBefore(J); + I2 = NewI2; + I2T = I1T; + I2Elem = I1Elem; + } + + // Now that both I1 and I2 are the same length we can shuffle them + // together (and use the result). + std::vector<Constant *> Mask(numElem); + for (unsigned v = 0; v < numElem; ++v) { + if (II[v].first == -1) { + Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); + } else { + int Idx = II[v].first + II[v].second * I1Elem; + Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx); + } + } + + Instruction *NewOp = + new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask), + getReplacementName(I, true, o)); + NewOp->insertBefore(J); + return NewOp; + } } - // If these two inputs are the output of another vector instruction, - // then we should use that output directly. It might be necessary to - // permute it first. [When pairings are fused recursively, you can - // end up with cases where a large vector is decomposed into scalars - // using extractelement instructions, then built into size-2 - // vectors using insertelement and the into larger vectors using - // shuffles. InstCombine does not simplify all of these cases well, - // and so we make sure that shuffles are generated here when possible. 
- ExtractElementInst *LEE - = dyn_cast<ExtractElementInst>(L->getOperand(o)); - ExtractElementInst *HEE - = dyn_cast<ExtractElementInst>(H->getOperand(o)); - - if (LEE && HEE && - LEE->getOperand(0)->getType() == HEE->getOperand(0)->getType()) { - VectorType *EEType = cast<VectorType>(LEE->getOperand(0)->getType()); - unsigned LowIndx = cast<ConstantInt>(LEE->getOperand(1))->getZExtValue(); - unsigned HighIndx = cast<ConstantInt>(HEE->getOperand(1))->getZExtValue(); - if (LEE->getOperand(0) == HEE->getOperand(0)) { - if (LowIndx == 0 && HighIndx == 1) - return LEE->getOperand(0); - - std::vector<Constant*> Mask(2); - Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx); - Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx); - - Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0), - UndefValue::get(EEType), - ConstantVector::get(Mask), - getReplacementName(I, true, o)); - BV->insertBefore(J); - return BV; + Type *ArgType = ArgTypeL; + if (numElemL < numElemH) { + if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH, + ArgTypeL, VArgType, 1)) { + // This is another short-circuit case: we're combining a scalar into + // a vector that is formed by an IE chain. We've just expanded the IE + // chain, now insert the scalar and we're done. + + Instruction *S = InsertElementInst::Create(HOp, LOp, CV0, + getReplacementName(I, true, o)); + S->insertBefore(J); + return S; + } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL, + ArgTypeH)) { + // The two vector inputs to the shuffle must be the same length, + // so extend the smaller vector to be the same length as the larger one. + Instruction *NLOp; + if (numElemL > 1) { + + std::vector<Constant *> Mask(numElemH); + unsigned v = 0; + for (; v < numElemL; ++v) + Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); + for (; v < numElemH; ++v) + Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); + + NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL), + ConstantVector::get(Mask), + getReplacementName(I, true, o, 1)); + } else { + NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0, + getReplacementName(I, true, o, 1)); + } + + NLOp->insertBefore(J); + LOp = NLOp; } - std::vector<Constant*> Mask(2); - HighIndx += EEType->getNumElements(); - Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx); - Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx); + ArgType = ArgTypeH; + } else if (numElemL > numElemH) { + if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL, + ArgTypeH, VArgType)) { + Instruction *S = + InsertElementInst::Create(LOp, HOp, + ConstantInt::get(Type::getInt32Ty(Context), + numElemL), + getReplacementName(I, true, o)); + S->insertBefore(J); + return S; + } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH, + ArgTypeL)) { + Instruction *NHOp; + if (numElemH > 1) { + std::vector<Constant *> Mask(numElemL); + unsigned v = 0; + for (; v < numElemH; ++v) + Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); + for (; v < numElemL; ++v) + Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); + + NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH), + ConstantVector::get(Mask), + getReplacementName(I, true, o, 1)); + } else { + NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0, + getReplacementName(I, true, o, 1)); + } + + NHOp->insertBefore(J); + HOp = NHOp; + } + } - Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0), - HEE->getOperand(0), - 
ConstantVector::get(Mask), - getReplacementName(I, true, o)); + if (ArgType->isVectorTy()) { + unsigned numElem = cast<VectorType>(VArgType)->getNumElements(); + std::vector<Constant*> Mask(numElem); + for (unsigned v = 0; v < numElem; ++v) { + unsigned Idx = v; + // If the low vector was expanded, we need to skip the extra + // undefined entries. + if (v >= numElemL && numElemH > numElemL) + Idx += (numElemH - numElemL); + Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx); + } + + Instruction *BV = new ShuffleVectorInst(LOp, HOp, + ConstantVector::get(Mask), + getReplacementName(I, true, o)); BV->insertBefore(J); return BV; } Instruction *BV1 = InsertElementInst::Create( - UndefValue::get(VArgType), - L->getOperand(o), CV0, + UndefValue::get(VArgType), LOp, CV0, getReplacementName(I, true, o, 1)); BV1->insertBefore(I); - Instruction *BV2 = InsertElementInst::Create(BV1, H->getOperand(o), - CV1, + Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1, getReplacementName(I, true, o, 2)); BV2->insertBefore(J); return BV2; @@ -1596,8 +1976,7 @@ namespace { void BBVectorize::getReplacementInputsForPair(LLVMContext& Context, Instruction *I, Instruction *J, SmallVector<Value *, 3> &ReplacedOperands, - bool &FlipMemInputs) { - FlipMemInputs = false; + bool FlipMemInputs) { unsigned NumOperands = I->getNumOperands(); for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) { @@ -1616,10 +1995,10 @@ namespace { BasicBlock &BB = *I->getParent(); Module *M = BB.getParent()->getParent(); - Type *ArgType = I->getType(); - Type *VArgType = getVecTypeForPair(ArgType); + Type *ArgTypeI = I->getType(); + Type *ArgTypeJ = J->getType(); + Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); - // FIXME: is it safe to do this here? ReplacedOperands[o] = Intrinsic::getDeclaration(M, (Intrinsic::ID) IID, VArgType); continue; @@ -1648,36 +2027,60 @@ namespace { Instruction *J, Instruction *K, Instruction *&InsertionPt, Instruction *&K1, Instruction *&K2, - bool &FlipMemInputs) { - Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); - Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); - + bool FlipMemInputs) { if (isa<StoreInst>(I)) { AA->replaceWithNewValue(I, K); AA->replaceWithNewValue(J, K); } else { Type *IType = I->getType(); - Type *VType = getVecTypeForPair(IType); + Type *JType = J->getType(); + + VectorType *VType = getVecTypeForPair(IType, JType); + unsigned numElem = VType->getNumElements(); + + unsigned numElemI, numElemJ; + if (IType->isVectorTy()) + numElemI = cast<VectorType>(IType)->getNumElements(); + else + numElemI = 1; + + if (JType->isVectorTy()) + numElemJ = cast<VectorType>(JType)->getNumElements(); + else + numElemJ = 1; if (IType->isVectorTy()) { - unsigned numElem = cast<VectorType>(IType)->getNumElements(); - std::vector<Constant*> Mask1(numElem), Mask2(numElem); - for (unsigned v = 0; v < numElem; ++v) { - Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); - Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElem+v); - } + std::vector<Constant*> Mask1(numElemI), Mask2(numElemI); + for (unsigned v = 0; v < numElemI; ++v) { + Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); + Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ+v); + } - K1 = new ShuffleVectorInst(K, UndefValue::get(VType), - ConstantVector::get( - FlipMemInputs ? Mask2 : Mask1), - getReplacementName(K, false, 1)); - K2 = new ShuffleVectorInst(K, UndefValue::get(VType), - ConstantVector::get( - FlipMemInputs ? 
Mask1 : Mask2), - getReplacementName(K, false, 2)); + K1 = new ShuffleVectorInst(K, UndefValue::get(VType), + ConstantVector::get( + FlipMemInputs ? Mask2 : Mask1), + getReplacementName(K, false, 1)); } else { + Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); + Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1); K1 = ExtractElementInst::Create(K, FlipMemInputs ? CV1 : CV0, getReplacementName(K, false, 1)); + } + + if (JType->isVectorTy()) { + std::vector<Constant*> Mask1(numElemJ), Mask2(numElemJ); + for (unsigned v = 0; v < numElemJ; ++v) { + Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); + Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI+v); + } + + K2 = new ShuffleVectorInst(K, UndefValue::get(VType), + ConstantVector::get( + FlipMemInputs ? Mask1 : Mask2), + getReplacementName(K, false, 2)); + } else { + Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); + Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1); K2 = ExtractElementInst::Create(K, FlipMemInputs ? CV0 : CV1, getReplacementName(K, false, 2)); } @@ -1778,6 +2181,61 @@ namespace { } } + // As with the aliasing information, SCEV can also change because of + // vectorization. This information is used to compute relative pointer + // offsets; the necessary information will be cached here prior to + // fusion. + void BBVectorize::collectPtrInfo(std::vector<Value *> &PairableInsts, + DenseMap<Value *, Value *> &ChosenPairs, + DenseSet<Value *> &LowPtrInsts) { + for (std::vector<Value *>::iterator PI = PairableInsts.begin(), + PIE = PairableInsts.end(); PI != PIE; ++PI) { + DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI); + if (P == ChosenPairs.end()) continue; + + Instruction *I = cast<Instruction>(P->first); + Instruction *J = cast<Instruction>(P->second); + + if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) + continue; + + Value *IPtr, *JPtr; + unsigned IAlignment, JAlignment; + int64_t OffsetInElmts; + if (!getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, + OffsetInElmts) || abs64(OffsetInElmts) != 1) + llvm_unreachable("Pre-fusion pointer analysis failed"); + + Value *LowPI = (OffsetInElmts > 0) ? I : J; + LowPtrInsts.insert(LowPI); + } + } + + // When the first instruction in each pair is cloned, it will inherit its + // parent's metadata. This metadata must be combined with that of the other + // instruction in a safe way. 
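  // (A small concrete case, for illustration: if I carries
  // !fpmath !{double 2.5} and J carries !fpmath !{double 5.0}, the fused
  // instruction keeps the stricter 2.5 ulp bound, which is what
  // MDNode::getMostGenericFPMath returns; TBAA tags are merged to a common
  // ancestor by MDNode::getMostGenericTBAA; and any metadata kind this pass
  // does not understand is dropped, the conservatively safe choice.)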
+  void BBVectorize::combineMetadata(Instruction *K, const Instruction *J) {
+    SmallVector<std::pair<unsigned, MDNode*>, 4> Metadata;
+    K->getAllMetadataOtherThanDebugLoc(Metadata);
+    for (unsigned i = 0, n = Metadata.size(); i < n; ++i) {
+      unsigned Kind = Metadata[i].first;
+      MDNode *JMD = J->getMetadata(Kind);
+      MDNode *KMD = Metadata[i].second;
+
+      switch (Kind) {
+      default:
+        K->setMetadata(Kind, 0); // Remove unknown metadata
+        break;
+      case LLVMContext::MD_tbaa:
+        K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD));
+        break;
+      case LLVMContext::MD_fpmath:
+        K->setMetadata(Kind, MDNode::getMostGenericFPMath(JMD, KMD));
+        break;
+      }
+    }
+  }
+
   // This function fuses the chosen instruction pairs into vector instructions,
   // taking care to preserve any needed scalar outputs and, then, it reorders
   // the remaining instructions as needed (users of the first member of the pair
@@ -1804,6 +2262,9 @@ namespace {
     std::multimap<Value *, Value *> LoadMoveSet;
     collectLoadMoveSet(BB, PairableInsts, ChosenPairs, LoadMoveSet);
 
+    DenseSet<Value *> LowPtrInsts;
+    collectPtrInfo(PairableInsts, ChosenPairs, LowPtrInsts);
+
     DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n");
 
     for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) {
@@ -1843,7 +2304,10 @@ namespace {
         continue;
       }
 
-      bool FlipMemInputs;
+      bool FlipMemInputs = false;
+      if (isa<LoadInst>(I) || isa<StoreInst>(I))
+        FlipMemInputs = (LowPtrInsts.find(I) == LowPtrInsts.end());
+
       unsigned NumOperands = I->getNumOperands();
       SmallVector<Value *, 3> ReplacedOperands(NumOperands);
       getReplacementInputsForPair(Context, I, J, ReplacedOperands,
@@ -1855,7 +2319,9 @@ namespace {
       if (I->hasName()) K->takeName(I);
 
       if (!isa<StoreInst>(K))
-        K->mutateType(getVecTypeForPair(I->getType()));
+        K->mutateType(getVecTypeForPair(I->getType(), J->getType()));
+
+      combineMetadata(K, J);
 
       for (unsigned o = 0; o < NumOperands; ++o)
         K->setOperand(o, ReplacedOperands[o]);
@@ -1947,6 +2413,7 @@ llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) {
 //===----------------------------------------------------------------------===//
 VectorizeConfig::VectorizeConfig() {
   VectorBits = ::VectorBits;
+  VectorizeBools = !::NoBools;
   VectorizeInts = !::NoInts;
   VectorizeFloats = !::NoFloats;
   VectorizePointers = !::NoPointers;
@@ -1954,6 +2421,7 @@ VectorizeConfig::VectorizeConfig() {
   VectorizeMath = !::NoMath;
   VectorizeFMA = !::NoFMA;
   VectorizeSelect = !::NoSelect;
+  VectorizeCmp = !::NoCmp;
   VectorizeGEP = !::NoGEP;
   VectorizeMemOps = !::NoMemOps;
   AlignedOnly = ::AlignedOnly;
@@ -1963,6 +2431,7 @@ VectorizeConfig::VectorizeConfig() {
   SplatBreaksChain = ::SplatBreaksChain;
   MaxInsts = ::MaxInsts;
   MaxIter = ::MaxIter;
+  Pow2LenOnly = ::Pow2LenOnly;
   NoMemOpBoost = ::NoMemOpBoost;
   FastDep = ::FastDep;
 }
diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt
index 4b66930..06cf1e4 100644
--- a/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/lib/Transforms/Vectorize/CMakeLists.txt
@@ -2,3 +2,5 @@ add_llvm_library(LLVMVectorize
   BBVectorize.cpp
   Vectorize.cpp
   )
+
+add_dependencies(LLVMVectorize intrinsics_gen)
diff --git a/lib/VMCore/AsmWriter.cpp b/lib/VMCore/AsmWriter.cpp
index 7b39efb..aedb86b 100644
--- a/lib/VMCore/AsmWriter.cpp
+++ b/lib/VMCore/AsmWriter.cpp
@@ -20,6 +20,7 @@
 #include "llvm/LLVMContext.h"
 #include "llvm/CallingConv.h"
 #include "llvm/Constants.h"
+#include "llvm/DebugInfo.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/InlineAsm.h"
 #include "llvm/IntrinsicInst.h"
@@ -99,7 +100,11 @@ static void
PrintLLVMName(raw_ostream &OS, StringRef Name, PrefixType Prefix) { bool NeedsQuotes = isdigit(Name[0]); if (!NeedsQuotes) { for (unsigned i = 0, e = Name.size(); i != e; ++i) { - char C = Name[i]; + // By making this unsigned, the value passed in to isalnum will always be + // in the range 0-255. This is important when building with MSVC because + // its implementation will assert. This situation can arise when dealing + // with UTF-8 multibyte characters. + unsigned char C = Name[i]; if (!isalnum(C) && C != '-' && C != '.' && C != '_') { NeedsQuotes = true; break; @@ -708,8 +713,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, } if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) { - if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEhalf || - &CFP->getValueAPF().getSemantics() == &APFloat::IEEEsingle || + if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEsingle || &CFP->getValueAPF().getSemantics() == &APFloat::IEEEdouble) { // We would like to output the FP constant value in exponential notation, // but we cannot do this if doing so will lose precision. Check here to @@ -759,16 +763,20 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, return; } - // Some form of long double. These appear as a magic letter identifying - // the type, then a fixed number of hex digits. + // Either half, or some form of long double. + // These appear as a magic letter identifying the type, then a + // fixed number of hex digits. Out << "0x"; + // Bit position, in the current word, of the next nibble to print. + int shiftcount; + if (&CFP->getValueAPF().getSemantics() == &APFloat::x87DoubleExtended) { Out << 'K'; // api needed to prevent premature destruction APInt api = CFP->getValueAPF().bitcastToAPInt(); const uint64_t* p = api.getRawData(); uint64_t word = p[1]; - int shiftcount=12; + shiftcount = 12; int width = api.getBitWidth(); for (int j=0; j<width; j+=4, shiftcount-=4) { unsigned int nibble = (word>>shiftcount) & 15; @@ -784,17 +792,21 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, } } return; - } else if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEquad) + } else if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEquad) { + shiftcount = 60; Out << 'L'; - else if (&CFP->getValueAPF().getSemantics() == &APFloat::PPCDoubleDouble) + } else if (&CFP->getValueAPF().getSemantics() == &APFloat::PPCDoubleDouble) { + shiftcount = 60; Out << 'M'; - else + } else if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEhalf) { + shiftcount = 12; + Out << 'H'; + } else llvm_unreachable("Unsupported floating point type"); // api needed to prevent premature destruction APInt api = CFP->getValueAPF().bitcastToAPInt(); const uint64_t* p = api.getRawData(); uint64_t word = *p; - int shiftcount=60; int width = api.getBitWidth(); for (int j=0; j<width; j+=4, shiftcount-=4) { unsigned int nibble = (word>>shiftcount) & 15; @@ -1369,6 +1381,26 @@ static void PrintVisibility(GlobalValue::VisibilityTypes Vis, } } +static void PrintThreadLocalModel(GlobalVariable::ThreadLocalMode TLM, + formatted_raw_ostream &Out) { + switch (TLM) { + case GlobalVariable::NotThreadLocal: + break; + case GlobalVariable::GeneralDynamicTLSModel: + Out << "thread_local "; + break; + case GlobalVariable::LocalDynamicTLSModel: + Out << "thread_local(localdynamic) "; + break; + case GlobalVariable::InitialExecTLSModel: + Out << "thread_local(initialexec) "; + break; + case GlobalVariable::LocalExecTLSModel: + Out << "thread_local(localexec) 
"; + break; + } +} + void AssemblyWriter::printGlobal(const GlobalVariable *GV) { if (GV->isMaterializable()) Out << "; Materializable\n"; @@ -1381,8 +1413,8 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) { PrintLinkage(GV->getLinkage(), Out); PrintVisibility(GV->getVisibility(), Out); + PrintThreadLocalModel(GV->getThreadLocalMode(), Out); - if (GV->isThreadLocal()) Out << "thread_local "; if (unsigned AddressSpace = GV->getType()->getAddressSpace()) Out << "addrspace(" << AddressSpace << ") "; if (GV->hasUnnamedAddr()) Out << "unnamed_addr "; @@ -2004,19 +2036,22 @@ static void WriteMDNodeComment(const MDNode *Node, formatted_raw_ostream &Out) { if (Node->getNumOperands() < 1) return; - ConstantInt *CI = dyn_cast_or_null<ConstantInt>(Node->getOperand(0)); - if (!CI) return; - APInt Val = CI->getValue(); - APInt Tag = Val & ~APInt(Val.getBitWidth(), LLVMDebugVersionMask); - if (Val.ult(LLVMDebugVersion11)) + + Value *Op = Node->getOperand(0); + if (!Op || !isa<ConstantInt>(Op) || cast<ConstantInt>(Op)->getBitWidth() < 32) + return; + + DIDescriptor Desc(Node); + if (Desc.getVersion() < LLVMDebugVersion11) return; + unsigned Tag = Desc.getTag(); Out.PadToColumn(50); - if (Tag == dwarf::DW_TAG_user_base) + if (dwarf::TagString(Tag)) { + Out << "; "; + Desc.print(Out); + } else if (Tag == dwarf::DW_TAG_user_base) { Out << "; [ DW_TAG_user_base ]"; - else if (Tag.isIntN(32)) { - if (const char *TagName = dwarf::TagString(Tag.getZExtValue())) - Out << "; [ " << TagName << " ]"; } } diff --git a/lib/VMCore/Attributes.cpp b/lib/VMCore/Attributes.cpp index c05132b..d466ac6 100644 --- a/lib/VMCore/Attributes.cpp +++ b/lib/VMCore/Attributes.cpp @@ -131,8 +131,8 @@ class AttributeListImpl : public FoldingSetNode { public: SmallVector<AttributeWithIndex, 4> Attrs; - AttributeListImpl(const AttributeWithIndex *Attr, unsigned NumAttrs) - : Attrs(Attr, Attr+NumAttrs) { + AttributeListImpl(ArrayRef<AttributeWithIndex> attrs) + : Attrs(attrs.begin(), attrs.end()) { RefCount = 0; } @@ -150,13 +150,12 @@ public: } void Profile(FoldingSetNodeID &ID) const { - Profile(ID, Attrs.data(), Attrs.size()); + Profile(ID, Attrs); } - static void Profile(FoldingSetNodeID &ID, const AttributeWithIndex *Attr, - unsigned NumAttrs) { - for (unsigned i = 0; i != NumAttrs; ++i) { - ID.AddInteger(Attr[i].Attrs.Raw()); - ID.AddInteger(Attr[i].Index); + static void Profile(FoldingSetNodeID &ID, ArrayRef<AttributeWithIndex> Attrs){ + for (unsigned i = 0, e = Attrs.size(); i != e; ++i) { + ID.AddInteger(Attrs[i].Attrs.Raw()); + ID.AddInteger(Attrs[i].Index); } } }; @@ -168,13 +167,13 @@ AttributeListImpl::~AttributeListImpl() { } -AttrListPtr AttrListPtr::get(const AttributeWithIndex *Attrs, unsigned NumAttrs) { +AttrListPtr AttrListPtr::get(ArrayRef<AttributeWithIndex> Attrs) { // If there are no attributes then return a null AttributesList pointer. - if (NumAttrs == 0) + if (Attrs.empty()) return AttrListPtr(); #ifndef NDEBUG - for (unsigned i = 0; i != NumAttrs; ++i) { + for (unsigned i = 0, e = Attrs.size(); i != e; ++i) { assert(Attrs[i].Attrs != Attribute::None && "Pointless attribute!"); assert((!i || Attrs[i-1].Index < Attrs[i].Index) && @@ -184,7 +183,7 @@ AttrListPtr AttrListPtr::get(const AttributeWithIndex *Attrs, unsigned NumAttrs) // Otherwise, build a key to look up the existing attributes. 
FoldingSetNodeID ID; - AttributeListImpl::Profile(ID, Attrs, NumAttrs); + AttributeListImpl::Profile(ID, Attrs); void *InsertPos; sys::SmartScopedLock<true> Lock(*ALMutex); @@ -195,7 +194,7 @@ AttrListPtr AttrListPtr::get(const AttributeWithIndex *Attrs, unsigned NumAttrs) // If we didn't find any existing attributes of the same shape then // create a new one and insert it. if (!PAL) { - PAL = new AttributeListImpl(Attrs, NumAttrs); + PAL = new AttributeListImpl(Attrs); AttributesLists->InsertNode(PAL, InsertPos); } @@ -308,7 +307,7 @@ AttrListPtr AttrListPtr::addAttr(unsigned Idx, Attributes Attrs) const { OldAttrList.begin()+i, OldAttrList.end()); } - return get(NewAttrList.data(), NewAttrList.size()); + return get(NewAttrList); } AttrListPtr AttrListPtr::removeAttr(unsigned Idx, Attributes Attrs) const { @@ -343,7 +342,7 @@ AttrListPtr AttrListPtr::removeAttr(unsigned Idx, Attributes Attrs) const { NewAttrList.insert(NewAttrList.end(), OldAttrList.begin()+i, OldAttrList.end()); - return get(NewAttrList.data(), NewAttrList.size()); + return get(NewAttrList); } void AttrListPtr::dump() const { diff --git a/lib/VMCore/AutoUpgrade.cpp b/lib/VMCore/AutoUpgrade.cpp index 2e16372..094ca75 100644 --- a/lib/VMCore/AutoUpgrade.cpp +++ b/lib/VMCore/AutoUpgrade.cpp @@ -14,17 +14,32 @@ #include "llvm/AutoUpgrade.h" #include "llvm/Constants.h" #include "llvm/Function.h" +#include "llvm/IRBuilder.h" #include "llvm/Instruction.h" +#include "llvm/IntrinsicInst.h" #include "llvm/LLVMContext.h" #include "llvm/Module.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Support/CallSite.h" #include "llvm/Support/CFG.h" +#include "llvm/Support/CallSite.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/IRBuilder.h" #include <cstring> using namespace llvm; +// Upgrade the declarations of the SSE4.1 functions whose arguments have +// changed their type from v4f32 to v2i64. +static bool UpgradeSSE41Function(Function* F, Intrinsic::ID IID, + Function *&NewFn) { + // Check whether this is an old version of the function, which received + // v4f32 arguments. + Type *Arg0Type = F->getFunctionType()->getParamType(0); + if (Arg0Type != VectorType::get(Type::getFloatTy(F->getContext()), 4)) + return false; + + // Yes, it's old, replace it with new version. + F->setName(F->getName() + ".old"); + NewFn = Intrinsic::getDeclaration(F->getParent(), IID); + return true; +} static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { assert(F && "Illegal to upgrade a non-existent Function."); @@ -37,6 +52,27 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { switch (Name[0]) { default: break; + case 'a': { + if (Name.startswith("arm.neon.vclz")) { + Type* args[2] = { + F->arg_begin()->getType(), + Type::getInt1Ty(F->getContext()) + }; + // Can't use Intrinsic::getDeclaration here as it adds a ".i1" to + // the end of the name. Change name from llvm.arm.neon.vclz.* to + // llvm.ctlz.* + FunctionType* fType = FunctionType::get(F->getReturnType(), args, false); + NewFn = Function::Create(fType, F->getLinkage(), + "llvm.ctlz." 
+ Name.substr(14), F->getParent()); + return true; + } + if (Name.startswith("arm.neon.vcnt")) { + NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctpop, + F->arg_begin()->getType()); + return true; + } + break; + } case 'c': { if (Name.startswith("ctlz.") && F->arg_size() == 1) { F->setName(Name + ".old"); @@ -57,17 +93,49 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { Name.startswith("x86.sse2.pcmpgt.") || Name.startswith("x86.avx2.pcmpeq.") || Name.startswith("x86.avx2.pcmpgt.") || - Name.startswith("x86.avx.vpermil.")) { + Name.startswith("x86.avx.vpermil.") || + Name == "x86.avx.movnt.dq.256" || + Name == "x86.avx.movnt.pd.256" || + Name == "x86.avx.movnt.ps.256" || + (Name.startswith("x86.xop.vpcom") && F->arg_size() == 2)) { NewFn = 0; return true; } + // SSE4.1 ptest functions may have an old signature. + if (Name.startswith("x86.sse41.ptest")) { + if (Name == "x86.sse41.ptestc") + return UpgradeSSE41Function(F, Intrinsic::x86_sse41_ptestc, NewFn); + if (Name == "x86.sse41.ptestz") + return UpgradeSSE41Function(F, Intrinsic::x86_sse41_ptestz, NewFn); + if (Name == "x86.sse41.ptestnzc") + return UpgradeSSE41Function(F, Intrinsic::x86_sse41_ptestnzc, NewFn); + } + // frcz.ss/sd may need to have an argument dropped + if (Name.startswith("x86.xop.vfrcz.ss") && F->arg_size() == 2) { + F->setName(Name + ".old"); + NewFn = Intrinsic::getDeclaration(F->getParent(), + Intrinsic::x86_xop_vfrcz_ss); + return true; + } + if (Name.startswith("x86.xop.vfrcz.sd") && F->arg_size() == 2) { + F->setName(Name + ".old"); + NewFn = Intrinsic::getDeclaration(F->getParent(), + Intrinsic::x86_xop_vfrcz_sd); + return true; + } + // Fix the FMA4 intrinsics to remove the 4 + if (Name.startswith("x86.fma4.")) { + F->setName("llvm.x86.fma" + Name.substr(8)); + NewFn = F; + return true; + } break; } } - // This may not belong here. This function is effectively being overloaded - // to both detect an intrinsic which needs upgrading, and to provide the - // upgraded form of the intrinsic. We should perhaps have two separate + // This may not belong here. This function is effectively being overloaded + // to both detect an intrinsic which needs upgrading, and to provide the + // upgraded form of the intrinsic. We should perhaps have two separate // functions for this. return false; } @@ -89,8 +157,8 @@ bool llvm::UpgradeGlobalVariable(GlobalVariable *GV) { return false; } -// UpgradeIntrinsicCall - Upgrade a call to an old intrinsic to be a call the -// upgraded intrinsic. All argument and return casting must be provided in +// UpgradeIntrinsicCall - Upgrade a call to an old intrinsic to be a call the +// upgraded intrinsic. All argument and return casting must be provided in // order to seamlessly integrate with existing context. 
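// (Sketch of the intended call pattern, for illustration only; the real
// driver lives in the bitcode/IR readers and may differ in detail:
//
//   Function *NewFn;
//   if (UpgradeIntrinsicFunction(F, NewFn))
//     for (Value::use_iterator UI = F->use_begin(), UE = F->use_end();
//          UI != UE; )
//       if (CallInst *CI = dyn_cast<CallInst>(*UI++))
//         UpgradeIntrinsicCall(CI, NewFn);
//
// A null NewFn means the call is rewritten in place from its old name;
// otherwise the call is retargeted at NewFn, with whatever casts the new
// signature requires inserted around it.)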
void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Function *F = CI->getCalledFunction(); @@ -118,15 +186,85 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { "pcmpgt"); // need to sign extend since icmp returns vector of i1 Rep = Builder.CreateSExt(Rep, CI->getType(), ""); + } else if (Name == "llvm.x86.avx.movnt.dq.256" || + Name == "llvm.x86.avx.movnt.ps.256" || + Name == "llvm.x86.avx.movnt.pd.256") { + IRBuilder<> Builder(C); + Builder.SetInsertPoint(CI->getParent(), CI); + + Module *M = F->getParent(); + SmallVector<Value *, 1> Elts; + Elts.push_back(ConstantInt::get(Type::getInt32Ty(C), 1)); + MDNode *Node = MDNode::get(C, Elts); + + Value *Arg0 = CI->getArgOperand(0); + Value *Arg1 = CI->getArgOperand(1); + + // Convert the type of the pointer to a pointer to the stored type. + Value *BC = Builder.CreateBitCast(Arg0, + PointerType::getUnqual(Arg1->getType()), + "cast"); + StoreInst *SI = Builder.CreateStore(Arg1, BC); + SI->setMetadata(M->getMDKindID("nontemporal"), Node); + SI->setAlignment(16); + + // Remove intrinsic. + CI->eraseFromParent(); + return; + } else if (Name.startswith("llvm.x86.xop.vpcom")) { + Intrinsic::ID intID; + if (Name.endswith("ub")) + intID = Intrinsic::x86_xop_vpcomub; + else if (Name.endswith("uw")) + intID = Intrinsic::x86_xop_vpcomuw; + else if (Name.endswith("ud")) + intID = Intrinsic::x86_xop_vpcomud; + else if (Name.endswith("uq")) + intID = Intrinsic::x86_xop_vpcomuq; + else if (Name.endswith("b")) + intID = Intrinsic::x86_xop_vpcomb; + else if (Name.endswith("w")) + intID = Intrinsic::x86_xop_vpcomw; + else if (Name.endswith("d")) + intID = Intrinsic::x86_xop_vpcomd; + else if (Name.endswith("q")) + intID = Intrinsic::x86_xop_vpcomq; + else + llvm_unreachable("Unknown suffix"); + + Name = Name.substr(18); // strip off "llvm.x86.xop.vpcom" + unsigned Imm; + if (Name.startswith("lt")) + Imm = 0; + else if (Name.startswith("le")) + Imm = 1; + else if (Name.startswith("gt")) + Imm = 2; + else if (Name.startswith("ge")) + Imm = 3; + else if (Name.startswith("eq")) + Imm = 4; + else if (Name.startswith("ne")) + Imm = 5; + else if (Name.startswith("true")) + Imm = 6; + else if (Name.startswith("false")) + Imm = 7; + else + llvm_unreachable("Unknown condition"); + + Function *VPCOM = Intrinsic::getDeclaration(F->getParent(), intID); + Rep = Builder.CreateCall3(VPCOM, CI->getArgOperand(0), + CI->getArgOperand(1), Builder.getInt8(Imm)); } else { bool PD128 = false, PD256 = false, PS128 = false, PS256 = false; - if (Name.startswith("llvm.x86.avx.vpermil.pd.256")) + if (Name == "llvm.x86.avx.vpermil.pd.256") PD256 = true; - else if (Name.startswith("llvm.x86.avx.vpermil.pd")) + else if (Name == "llvm.x86.avx.vpermil.pd") PD128 = true; - else if (Name.startswith("llvm.x86.avx.vpermil.ps.256")) + else if (Name == "llvm.x86.avx.vpermil.ps.256") PS256 = true; - else if (Name.startswith("llvm.x86.avx.vpermil.ps")) + else if (Name == "llvm.x86.avx.vpermil.ps") PS128 = true; if (PD256 || PD128 || PS256 || PS128) { @@ -162,6 +300,9 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { return; } + std::string Name = CI->getName().str(); + CI->setName(Name + ".old"); + switch (NewFn->getIntrinsicID()) { default: llvm_unreachable("Unknown function for CallInst upgrade."); @@ -170,12 +311,60 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { case Intrinsic::cttz: assert(CI->getNumArgOperands() == 1 && "Mismatch between function args and call args"); - StringRef Name = CI->getName(); - CI->setName(Name + 
".old"); CI->replaceAllUsesWith(Builder.CreateCall2(NewFn, CI->getArgOperand(0), Builder.getFalse(), Name)); CI->eraseFromParent(); return; + + case Intrinsic::arm_neon_vclz: { + // Change name from llvm.arm.neon.vclz.* to llvm.ctlz.* + CI->replaceAllUsesWith(Builder.CreateCall2(NewFn, CI->getArgOperand(0), + Builder.getFalse(), + "llvm.ctlz." + Name.substr(14))); + CI->eraseFromParent(); + return; + } + case Intrinsic::ctpop: { + CI->replaceAllUsesWith(Builder.CreateCall(NewFn, CI->getArgOperand(0))); + CI->eraseFromParent(); + return; + } + + case Intrinsic::x86_xop_vfrcz_ss: + case Intrinsic::x86_xop_vfrcz_sd: + CI->replaceAllUsesWith(Builder.CreateCall(NewFn, CI->getArgOperand(1), + Name)); + CI->eraseFromParent(); + return; + + case Intrinsic::x86_sse41_ptestc: + case Intrinsic::x86_sse41_ptestz: + case Intrinsic::x86_sse41_ptestnzc: { + // The arguments for these intrinsics used to be v4f32, and changed + // to v2i64. This is purely a nop, since those are bitwise intrinsics. + // So, the only thing required is a bitcast for both arguments. + // First, check the arguments have the old type. + Value *Arg0 = CI->getArgOperand(0); + if (Arg0->getType() != VectorType::get(Type::getFloatTy(C), 4)) + return; + + // Old intrinsic, add bitcasts + Value *Arg1 = CI->getArgOperand(1); + + Value *BC0 = + Builder.CreateBitCast(Arg0, + VectorType::get(Type::getInt64Ty(C), 2), + "cast"); + Value *BC1 = + Builder.CreateBitCast(Arg1, + VectorType::get(Type::getInt64Ty(C), 2), + "cast"); + + CallInst* NewCall = Builder.CreateCall2(NewFn, BC0, BC1, Name); + CI->replaceAllUsesWith(NewCall); + CI->eraseFromParent(); + return; + } } } diff --git a/lib/VMCore/CMakeLists.txt b/lib/VMCore/CMakeLists.txt index e1efcda..648ccbd 100644 --- a/lib/VMCore/CMakeLists.txt +++ b/lib/VMCore/CMakeLists.txt @@ -8,7 +8,9 @@ add_llvm_library(LLVMCore ConstantFold.cpp Constants.cpp Core.cpp + DebugInfo.cpp DebugLoc.cpp + DIBuilder.cpp Dominators.cpp Function.cpp GCOV.cpp @@ -36,3 +38,14 @@ add_llvm_library(LLVMCore ValueTypes.cpp Verifier.cpp ) + +# Workaround: It takes over 20 minutes to compile with msvc10. +# FIXME: Suppressing optimizations to core libraries would not be good thing. +if( MSVC_VERSION EQUAL 1600 ) +set_property( + SOURCE Function.cpp + PROPERTY COMPILE_FLAGS "/Og-" + ) +endif() + +add_dependencies(LLVMCore intrinsics_gen) diff --git a/lib/VMCore/ConstantFold.cpp b/lib/VMCore/ConstantFold.cpp index b743287..8e82876 100644 --- a/lib/VMCore/ConstantFold.cpp +++ b/lib/VMCore/ConstantFold.cpp @@ -55,13 +55,12 @@ static Constant *BitCastConstantVector(Constant *CV, VectorType *DstTy) { Type *DstEltTy = DstTy->getElementType(); - // Check to verify that all elements of the input are simple. 
SmallVector<Constant*, 16> Result; + Type *Ty = IntegerType::get(CV->getContext(), 32); for (unsigned i = 0; i != NumElts; ++i) { - Constant *C = CV->getAggregateElement(i); - if (C == 0) return 0; + Constant *C = + ConstantExpr::getExtractElement(CV, ConstantInt::get(Ty, i)); C = ConstantExpr::getBitCast(C, DstEltTy); - if (isa<ConstantExpr>(C)) return 0; Result.push_back(C); } @@ -553,9 +552,12 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V, SmallVector<Constant*, 16> res; VectorType *DestVecTy = cast<VectorType>(DestTy); Type *DstEltTy = DestVecTy->getElementType(); - for (unsigned i = 0, e = V->getType()->getVectorNumElements(); i != e; ++i) - res.push_back(ConstantExpr::getCast(opc, - V->getAggregateElement(i), DstEltTy)); + Type *Ty = IntegerType::get(V->getContext(), 32); + for (unsigned i = 0, e = V->getType()->getVectorNumElements(); i != e; ++i) { + Constant *C = + ConstantExpr::getExtractElement(V, ConstantInt::get(Ty, i)); + res.push_back(ConstantExpr::getCast(opc, C, DstEltTy)); + } return ConstantVector::get(res); } @@ -696,12 +698,13 @@ Constant *llvm::ConstantFoldSelectInstruction(Constant *Cond, // If the condition is a vector constant, fold the result elementwise. if (ConstantVector *CondV = dyn_cast<ConstantVector>(Cond)) { SmallVector<Constant*, 16> Result; + Type *Ty = IntegerType::get(CondV->getContext(), 32); for (unsigned i = 0, e = V1->getType()->getVectorNumElements(); i != e;++i){ ConstantInt *Cond = dyn_cast<ConstantInt>(CondV->getOperand(i)); if (Cond == 0) break; - Constant *Res = (Cond->getZExtValue() ? V2 : V1)->getAggregateElement(i); - if (Res == 0) break; + Constant *V = Cond->isNullValue() ? V2 : V1; + Constant *Res = ConstantExpr::getExtractElement(V, ConstantInt::get(Ty, i)); Result.push_back(Res); } @@ -721,12 +724,12 @@ Constant *llvm::ConstantFoldSelectInstruction(Constant *Cond, if (ConstantExpr *TrueVal = dyn_cast<ConstantExpr>(V1)) { if (TrueVal->getOpcode() == Instruction::Select) if (TrueVal->getOperand(0) == Cond) - return ConstantExpr::getSelect(Cond, TrueVal->getOperand(1), V2); + return ConstantExpr::getSelect(Cond, TrueVal->getOperand(1), V2); } if (ConstantExpr *FalseVal = dyn_cast<ConstantExpr>(V2)) { if (FalseVal->getOpcode() == Instruction::Select) if (FalseVal->getOperand(0) == Cond) - return ConstantExpr::getSelect(Cond, V1, FalseVal->getOperand(2)); + return ConstantExpr::getSelect(Cond, V1, FalseVal->getOperand(2)); } return 0; @@ -760,16 +763,16 @@ Constant *llvm::ConstantFoldInsertElementInstruction(Constant *Val, const APInt &IdxVal = CIdx->getValue(); SmallVector<Constant*, 16> Result; + Type *Ty = IntegerType::get(Val->getContext(), 32); for (unsigned i = 0, e = Val->getType()->getVectorNumElements(); i != e; ++i){ if (i == IdxVal) { Result.push_back(Elt); continue; } - if (Constant *C = Val->getAggregateElement(i)) - Result.push_back(C); - else - return 0; + Constant *C = + ConstantExpr::getExtractElement(Val, ConstantInt::get(Ty, i)); + Result.push_back(C); } return ConstantVector::get(Result); @@ -801,11 +804,15 @@ Constant *llvm::ConstantFoldShuffleVectorInstruction(Constant *V1, Constant *InElt; if (unsigned(Elt) >= SrcNumElts*2) InElt = UndefValue::get(EltTy); - else if (unsigned(Elt) >= SrcNumElts) - InElt = V2->getAggregateElement(Elt - SrcNumElts); - else - InElt = V1->getAggregateElement(Elt); - if (InElt == 0) return 0; + else if (unsigned(Elt) >= SrcNumElts) { + Type *Ty = IntegerType::get(V2->getContext(), 32); + InElt = + ConstantExpr::getExtractElement(V2, + ConstantInt::get(Ty, Elt - 
SrcNumElts)); + } else { + Type *Ty = IntegerType::get(V1->getContext(), 32); + InElt = ConstantExpr::getExtractElement(V1, ConstantInt::get(Ty, Elt)); + } Result.push_back(InElt); } @@ -1130,16 +1137,17 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, } else if (VectorType *VTy = dyn_cast<VectorType>(C1->getType())) { // Perform elementwise folding. SmallVector<Constant*, 16> Result; + Type *Ty = IntegerType::get(VTy->getContext(), 32); for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) { - Constant *LHS = C1->getAggregateElement(i); - Constant *RHS = C2->getAggregateElement(i); - if (LHS == 0 || RHS == 0) break; + Constant *LHS = + ConstantExpr::getExtractElement(C1, ConstantInt::get(Ty, i)); + Constant *RHS = + ConstantExpr::getExtractElement(C2, ConstantInt::get(Ty, i)); Result.push_back(ConstantExpr::get(Opcode, LHS, RHS)); } - if (Result.size() == VTy->getNumElements()) - return ConstantVector::get(Result); + return ConstantVector::get(Result); } if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(C1)) { @@ -1697,17 +1705,18 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred, // If we can constant fold the comparison of each element, constant fold // the whole vector comparison. SmallVector<Constant*, 4> ResElts; + Type *Ty = IntegerType::get(C1->getContext(), 32); // Compare the elements, producing an i1 result or constant expr. for (unsigned i = 0, e = C1->getType()->getVectorNumElements(); i != e;++i){ - Constant *C1E = C1->getAggregateElement(i); - Constant *C2E = C2->getAggregateElement(i); - if (C1E == 0 || C2E == 0) break; + Constant *C1E = + ConstantExpr::getExtractElement(C1, ConstantInt::get(Ty, i)); + Constant *C2E = + ConstantExpr::getExtractElement(C2, ConstantInt::get(Ty, i)); ResElts.push_back(ConstantExpr::getCompare(pred, C1E, C2E)); } - if (ResElts.size() == C1->getType()->getVectorNumElements()) - return ConstantVector::get(ResElts); + return ConstantVector::get(ResElts); } if (C1->getType()->isFloatingPointTy()) { diff --git a/lib/VMCore/Constants.cpp b/lib/VMCore/Constants.cpp index 6dbc144..a4e21e1 100644 --- a/lib/VMCore/Constants.cpp +++ b/lib/VMCore/Constants.cpp @@ -46,7 +46,7 @@ bool Constant::isNegativeZeroValue() const { // Floating point values have an explicit -0.0 value. if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this)) return CFP->isZero() && CFP->isNegative(); - + // Otherwise, just use +0.0. return isNullValue(); } @@ -55,7 +55,7 @@ bool Constant::isNullValue() const { // 0 is null. if (const ConstantInt *CI = dyn_cast<ConstantInt>(this)) return CI->isZero(); - + // +0.0 is null. if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this)) return CFP->isZero() && !CFP->isNegative(); @@ -161,19 +161,19 @@ Constant *Constant::getAllOnesValue(Type *Ty) { Constant *Constant::getAggregateElement(unsigned Elt) const { if (const ConstantStruct *CS = dyn_cast<ConstantStruct>(this)) return Elt < CS->getNumOperands() ? CS->getOperand(Elt) : 0; - + if (const ConstantArray *CA = dyn_cast<ConstantArray>(this)) return Elt < CA->getNumOperands() ? CA->getOperand(Elt) : 0; - + if (const ConstantVector *CV = dyn_cast<ConstantVector>(this)) return Elt < CV->getNumOperands() ? CV->getOperand(Elt) : 0; - + if (const ConstantAggregateZero *CAZ =dyn_cast<ConstantAggregateZero>(this)) return CAZ->getElementValue(Elt); - + if (const UndefValue *UV = dyn_cast<UndefValue>(this)) return UV->getElementValue(Elt); - + if (const ConstantDataSequential *CDS =dyn_cast<ConstantDataSequential>(this)) return Elt < CDS->getNumElements() ? 
CDS->getElementAsConstant(Elt) : 0; return 0; @@ -222,10 +222,10 @@ bool Constant::canTrap() const { // The only thing that could possibly trap are constant exprs. const ConstantExpr *CE = dyn_cast<ConstantExpr>(this); if (!CE) return false; - - // ConstantExpr traps if any operands can trap. + + // ConstantExpr traps if any operands can trap. for (unsigned i = 0, e = getNumOperands(); i != e; ++i) - if (CE->getOperand(i)->canTrap()) + if (CE->getOperand(i)->canTrap()) return true; // Otherwise, only specific operations can trap. @@ -252,7 +252,7 @@ bool Constant::isConstantUsed() const { const Constant *UC = dyn_cast<Constant>(*UI); if (UC == 0 || isa<GlobalValue>(UC)) return true; - + if (UC->isConstantUsed()) return true; } @@ -302,12 +302,12 @@ Constant::PossibleRelocationsTy Constant::getRelocationInfo() const { cast<BlockAddress>(RHS->getOperand(0))->getFunction()) return NoRelocation; } - + PossibleRelocationsTy Result = NoRelocation; for (unsigned i = 0, e = getNumOperands(); i != e; ++i) Result = std::max(Result, cast<Constant>(getOperand(i))->getRelocationInfo()); - + return Result; } @@ -316,14 +316,14 @@ Constant::PossibleRelocationsTy Constant::getRelocationInfo() const { /// constantexpr. static bool removeDeadUsersOfConstant(const Constant *C) { if (isa<GlobalValue>(C)) return false; // Cannot remove this - + while (!C->use_empty()) { const Constant *User = dyn_cast<Constant>(C->use_back()); if (!User) return false; // Non-constant usage; if (!removeDeadUsersOfConstant(User)) return false; // Constant wasn't dead } - + const_cast<Constant*>(C)->destroyConstant(); return true; } @@ -343,7 +343,7 @@ void Constant::removeDeadConstantUsers() const { ++I; continue; } - + if (!removeDeadUsersOfConstant(User)) { // If the constant wasn't dead, remember that this was the last live use // and move on to the next constant. @@ -351,7 +351,7 @@ void Constant::removeDeadConstantUsers() const { ++I; continue; } - + // If the constant was dead, then the iterator is invalidated. if (LastNonDeadUser == E) { I = use_begin(); @@ -485,7 +485,7 @@ static const fltSemantics *TypeToFloatSemantics(Type *Ty) { return &APFloat::x87DoubleExtended; else if (Ty->isFP128Ty()) return &APFloat::IEEEquad; - + assert(Ty->isPPC_FP128Ty() && "Unknown FP format"); return &APFloat::PPCDoubleDouble; } @@ -497,7 +497,7 @@ void ConstantFP::anchor() { } /// 2.0/1.0 etc, that are known-valid both as double and as the target format. Constant *ConstantFP::get(Type *Ty, double V) { LLVMContext &Context = Ty->getContext(); - + APFloat FV(V); bool ignored; FV.convert(*TypeToFloatSemantics(Ty->getScalarType()), @@ -550,11 +550,11 @@ Constant *ConstantFP::getZeroValueForNegation(Type *Ty) { // ConstantFP accessors. ConstantFP* ConstantFP::get(LLVMContext &Context, const APFloat& V) { DenseMapAPFloatKeyInfo::KeyTy Key(V); - + LLVMContextImpl* pImpl = Context.pImpl; - + ConstantFP *&Slot = pImpl->FPConstants[Key]; - + if (!Slot) { Type *Ty; if (&V.getSemantics() == &APFloat::IEEEhalf) @@ -574,7 +574,7 @@ ConstantFP* ConstantFP::get(LLVMContext &Context, const APFloat& V) { } Slot = new ConstantFP(Ty, V); } - + return Slot; } @@ -695,7 +695,7 @@ Constant *ConstantArray::get(ArrayType *Ty, ArrayRef<Constant*> V) { "Wrong type in array element initializer"); } LLVMContextImpl *pImpl = Ty->getContext().pImpl; - + // If this is an all-zero array, return a ConstantAggregateZero object. If // all undef, return an UndefValue, if "all simple", then return a // ConstantDataArray. 
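  // (Concretely, for illustration: an element list of four zero i8s collapses
  // to a single ConstantAggregateZero; four undefs collapse to one UndefValue;
  // [i8 1, i8 2, i8 3, i8 4] is packed into a ConstantDataArray; only element
  // lists containing things like ConstantExprs fall through to a real
  // ConstantArray node.)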
@@ -751,7 +751,7 @@ Constant *ConstantArray::get(ArrayType *Ty, ArrayRef<Constant*> V) { return ConstantDataArray::get(C->getContext(), Elts); } } - + if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) { if (CFP->getType()->isFloatTy()) { SmallVector<float, 16> Elts; @@ -788,7 +788,7 @@ StructType *ConstantStruct::getTypeForElements(LLVMContext &Context, SmallVector<Type*, 16> EltTypes(VecSize); for (unsigned i = 0; i != VecSize; ++i) EltTypes[i] = V[i]->getType(); - + return StructType::get(Context, EltTypes, Packed); } @@ -833,12 +833,12 @@ Constant *ConstantStruct::get(StructType *ST, ArrayRef<Constant*> V) { isUndef = false; } } - } + } if (isZero) return ConstantAggregateZero::get(ST); if (isUndef) return UndefValue::get(ST); - + return ST->getContext().pImpl->StructConstants.getOrCreate(ST, V); } @@ -881,12 +881,12 @@ Constant *ConstantVector::get(ArrayRef<Constant*> V) { break; } } - + if (isZero) return ConstantAggregateZero::get(T); if (isUndef) return UndefValue::get(T); - + // Check to see if all of the elements are ConstantFP or ConstantInt and if // the element type is compatible with ConstantDataVector. If so, use it. if (ConstantDataSequential::isElementTypeCompatible(C->getType())) { @@ -932,7 +932,7 @@ Constant *ConstantVector::get(ArrayRef<Constant*> V) { return ConstantDataVector::get(C->getContext(), Elts); } } - + if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) { if (CFP->getType()->isFloatTy()) { SmallVector<float, 16> Elts; @@ -955,7 +955,7 @@ Constant *ConstantVector::get(ArrayRef<Constant*> V) { } } } - + // Otherwise, the element type isn't compatible with ConstantDataVector, or // the operand list constants a ConstantExpr or something else strange. return pImpl->VectorConstants.getOrCreate(T, V); @@ -967,7 +967,7 @@ Constant *ConstantVector::getSplat(unsigned NumElts, Constant *V) { if ((isa<ConstantFP>(V) || isa<ConstantInt>(V)) && ConstantDataSequential::isElementTypeCompatible(V->getType())) return ConstantDataVector::getSplat(NumElts, V); - + SmallVector<Constant*, 32> Elts(NumElts, V); return get(Elts); } @@ -1039,7 +1039,7 @@ ConstantExpr::getWithOperandReplaced(unsigned OpNo, Constant *Op) const { SmallVector<Constant*, 8> NewOps; for (unsigned i = 0, e = getNumOperands(); i != e; ++i) NewOps.push_back(i == OpNo ? Op : getOperand(i)); - + return getWithOperands(NewOps); } @@ -1052,7 +1052,7 @@ getWithOperands(ArrayRef<Constant*> Ops, Type *Ty) const { bool AnyChange = Ty != getType(); for (unsigned i = 0; i != Ops.size(); ++i) AnyChange |= Ops[i] != getOperand(i); - + if (!AnyChange) // No operands changed, return self. 
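  // (Callers rely on this identity property: because constants are uniqued,
  // an unchanged operand list must hand back the very same node, so pointer
  // equality keeps working.)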
return const_cast<ConstantExpr*>(this); @@ -1177,7 +1177,7 @@ ConstantAggregateZero *ConstantAggregateZero::get(Type *Ty) { ConstantAggregateZero *&Entry = Ty->getContext().pImpl->CAZConstants[Ty]; if (Entry == 0) Entry = new ConstantAggregateZero(Ty); - + return Entry; } @@ -1232,7 +1232,7 @@ ConstantPointerNull *ConstantPointerNull::get(PointerType *Ty) { ConstantPointerNull *&Entry = Ty->getContext().pImpl->CPNConstants[Ty]; if (Entry == 0) Entry = new ConstantPointerNull(Ty); - + return Entry; } @@ -1252,7 +1252,7 @@ UndefValue *UndefValue::get(Type *Ty) { UndefValue *&Entry = Ty->getContext().pImpl->UVConstants[Ty]; if (Entry == 0) Entry = new UndefValue(Ty); - + return Entry; } @@ -1277,7 +1277,7 @@ BlockAddress *BlockAddress::get(Function *F, BasicBlock *BB) { F->getContext().pImpl->BlockAddresses[std::make_pair(F, BB)]; if (BA == 0) BA = new BlockAddress(F, BB); - + assert(BA->getFunction() == F && "Basic block moved between functions"); return BA; } @@ -1305,19 +1305,19 @@ void BlockAddress::replaceUsesOfWithOnConstant(Value *From, Value *To, Use *U) { // case, we have to remove the map entry. Function *NewF = getFunction(); BasicBlock *NewBB = getBasicBlock(); - + if (U == &Op<0>()) NewF = cast<Function>(To); else NewBB = cast<BasicBlock>(To); - + // See if the 'new' entry already exists, if not, just update this in place // and return early. BlockAddress *&NewBA = getContext().pImpl->BlockAddresses[std::make_pair(NewF, NewBB)]; if (NewBA == 0) { getBasicBlock()->AdjustBlockAddressRefCount(-1); - + // Remove the old entry, this can't cause the map to rehash (just a // tombstone will get added). getContext().pImpl->BlockAddresses.erase(std::make_pair(getFunction(), @@ -1331,10 +1331,10 @@ void BlockAddress::replaceUsesOfWithOnConstant(Value *From, Value *To, Use *U) { // Otherwise, I do need to replace this with an existing value. assert(NewBA != this && "I didn't contain From!"); - + // Everyone using this now uses the replacement. replaceAllUsesWith(NewBA); - + destroyConstant(); } @@ -1355,10 +1355,10 @@ static inline Constant *getFoldedCast( // Look up the constant in the table first to ensure uniqueness std::vector<Constant*> argVec(1, C); ExprMapKeyType Key(opc, argVec); - + return pImpl->ExprConstants.getOrCreate(Ty, Key); } - + Constant *ConstantExpr::getCast(unsigned oc, Constant *C, Type *Ty) { Instruction::CastOps opc = Instruction::CastOps(oc); assert(Instruction::isCast(opc) && "opcode out of range"); @@ -1381,7 +1381,7 @@ Constant *ConstantExpr::getCast(unsigned oc, Constant *C, Type *Ty) { case Instruction::IntToPtr: return getIntToPtr(C, Ty); case Instruction::BitCast: return getBitCast(C, Ty); } -} +} Constant *ConstantExpr::getZExtOrBitCast(Constant *C, Type *Ty) { if (C->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits()) @@ -1572,11 +1572,11 @@ Constant *ConstantExpr::getIntToPtr(Constant *C, Type *DstTy) { Constant *ConstantExpr::getBitCast(Constant *C, Type *DstTy) { assert(CastInst::castIsValid(Instruction::BitCast, C, DstTy) && "Invalid constantexpr bitcast!"); - + // It is common to ask for a bitcast of a value to its own type, handle this // speedily. 
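  // (For example, ConstantExpr::getBitCast(C, C->getType()) just returns C;
  // any other bitcast falls through to getFoldedCast, which first tries to
  // constant-fold the cast and only then uniques a new constant-expression
  // node.)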
if (C->getType() == DstTy) return C; - + return getFoldedCast(Instruction::BitCast, C, DstTy); } @@ -1588,7 +1588,7 @@ Constant *ConstantExpr::get(unsigned Opcode, Constant *C1, Constant *C2, "Invalid opcode in binary constant expression"); assert(C1->getType() == C2->getType() && "Operand types in binary constant expression should match"); - + #ifndef NDEBUG switch (Opcode) { case Instruction::Add: @@ -1649,11 +1649,11 @@ Constant *ConstantExpr::get(unsigned Opcode, Constant *C1, Constant *C2, if (Constant *FC = ConstantFoldBinaryInstruction(Opcode, C1, C2)) return FC; // Fold a few common cases. - + std::vector<Constant*> argVec(1, C1); argVec.push_back(C2); ExprMapKeyType Key(Opcode, argVec, 0, Flags); - + LLVMContextImpl *pImpl = C1->getContext().pImpl; return pImpl->ExprConstants.getOrCreate(C1->getType(), Key); } @@ -1703,7 +1703,7 @@ Constant *ConstantExpr::getOffsetOf(Type* Ty, Constant *FieldNo) { Constant *ConstantExpr::getCompare(unsigned short Predicate, Constant *C1, Constant *C2) { assert(C1->getType() == C2->getType() && "Op types should be identical!"); - + switch (Predicate) { default: llvm_unreachable("Invalid CmpInst predicate"); case CmpInst::FCMP_FALSE: case CmpInst::FCMP_OEQ: case CmpInst::FCMP_OGT: @@ -1713,7 +1713,7 @@ Constant *ConstantExpr::getCompare(unsigned short Predicate, case CmpInst::FCMP_ULT: case CmpInst::FCMP_ULE: case CmpInst::FCMP_UNE: case CmpInst::FCMP_TRUE: return getFCmp(Predicate, C1, C2); - + case CmpInst::ICMP_EQ: case CmpInst::ICMP_NE: case CmpInst::ICMP_UGT: case CmpInst::ICMP_UGE: case CmpInst::ICMP_ULT: case CmpInst::ICMP_ULE: case CmpInst::ICMP_SGT: case CmpInst::ICMP_SGE: case CmpInst::ICMP_SLT: @@ -1732,7 +1732,7 @@ Constant *ConstantExpr::getSelect(Constant *C, Constant *V1, Constant *V2) { argVec[1] = V1; argVec[2] = V2; ExprMapKeyType Key(Instruction::Select, argVec); - + LLVMContextImpl *pImpl = C->getContext().pImpl; return pImpl->ExprConstants.getOrCreate(V1->getType(), Key); } @@ -1747,7 +1747,7 @@ Constant *ConstantExpr::getGetElementPtr(Constant *C, ArrayRef<Value *> Idxs, assert(Ty && "GEP indices invalid!"); unsigned AS = C->getType()->getPointerAddressSpace(); Type *ReqTy = Ty->getPointerTo(AS); - + assert(C->getType()->isPointerTy() && "Non-pointer type for constant GetElementPtr expression"); // Look up the constant in the table first to ensure uniqueness @@ -1758,7 +1758,7 @@ Constant *ConstantExpr::getGetElementPtr(Constant *C, ArrayRef<Value *> Idxs, ArgVec.push_back(cast<Constant>(Idxs[i])); const ExprMapKeyType Key(Instruction::GetElementPtr, ArgVec, 0, InBounds ? GEPOperator::IsInBounds : 0); - + LLVMContextImpl *pImpl = C->getContext().pImpl; return pImpl->ExprConstants.getOrCreate(ReqTy, Key); } @@ -1815,15 +1815,15 @@ Constant *ConstantExpr::getExtractElement(Constant *Val, Constant *Idx) { "Tried to create extractelement operation on non-vector type!"); assert(Idx->getType()->isIntegerTy(32) && "Extractelement index must be i32 type!"); - + if (Constant *FC = ConstantFoldExtractElementInstruction(Val, Idx)) return FC; // Fold a few common cases. 
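  // (Illustration: extractelement <2 x i32> <i32 7, i32 9>, i32 1 folds to
  // i32 9 right here; an extract whose lane is not a simple constant, e.g.
  // one holding a global's address, falls through and is uniqued as an
  // extractelement constant expression below. That guarantee is what lets
  // the ConstantFold.cpp changes above use getExtractElement per lane
  // instead of bailing out when getAggregateElement returned null.)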
- + // Look up the constant in the table first to ensure uniqueness std::vector<Constant*> ArgVec(1, Val); ArgVec.push_back(Idx); const ExprMapKeyType Key(Instruction::ExtractElement,ArgVec); - + LLVMContextImpl *pImpl = Val->getContext().pImpl; Type *ReqTy = Val->getType()->getVectorElementType(); return pImpl->ExprConstants.getOrCreate(ReqTy, Key); @@ -1845,7 +1845,7 @@ Constant *ConstantExpr::getInsertElement(Constant *Val, Constant *Elt, ArgVec.push_back(Elt); ArgVec.push_back(Idx); const ExprMapKeyType Key(Instruction::InsertElement,ArgVec); - + LLVMContextImpl *pImpl = Val->getContext().pImpl; return pImpl->ExprConstants.getOrCreate(Val->getType(), Key); } @@ -1867,7 +1867,7 @@ Constant *ConstantExpr::getShuffleVector(Constant *V1, Constant *V2, ArgVec.push_back(V2); ArgVec.push_back(Mask); const ExprMapKeyType Key(Instruction::ShuffleVector,ArgVec); - + LLVMContextImpl *pImpl = ShufTy->getContext().pImpl; return pImpl->ExprConstants.getOrCreate(ShufTy, Key); } @@ -1892,7 +1892,7 @@ Constant *ConstantExpr::getExtractValue(Constant *Agg, Type *ReqTy = ExtractValueInst::getIndexedType(Agg->getType(), Idxs); (void)ReqTy; assert(ReqTy && "extractvalue indices invalid!"); - + assert(Agg->getType()->isFirstClassType() && "Non-first-class type for constant extractvalue expression"); Constant *FC = ConstantFoldExtractValueInstruction(Agg, Idxs); @@ -2007,6 +2007,47 @@ Constant *ConstantExpr::getAShr(Constant *C1, Constant *C2, bool isExact) { isExact ? PossiblyExactOperator::IsExact : 0); } +/// getBinOpIdentity - Return the identity for the given binary operation, +/// i.e. a constant C such that X op C = X and C op X = X for every X. It +/// returns null if the operator doesn't have an identity. +Constant *ConstantExpr::getBinOpIdentity(unsigned Opcode, Type *Ty) { + switch (Opcode) { + default: + // Doesn't have an identity. + return 0; + + case Instruction::Add: + case Instruction::Or: + case Instruction::Xor: + return Constant::getNullValue(Ty); + + case Instruction::Mul: + return ConstantInt::get(Ty, 1); + + case Instruction::And: + return Constant::getAllOnesValue(Ty); + } +} + +/// getBinOpAbsorber - Return the absorbing element for the given binary +/// operation, i.e. a constant C such that X op C = C and C op X = C for +/// every X. For example, this returns zero for integer multiplication. +/// It returns null if the operator doesn't have an absorbing element. +Constant *ConstantExpr::getBinOpAbsorber(unsigned Opcode, Type *Ty) { + switch (Opcode) { + default: + // Doesn't have an absorber. + return 0; + + case Instruction::Or: + return Constant::getAllOnesValue(Ty); + + case Instruction::And: + case Instruction::Mul: + return Constant::getNullValue(Ty); + } +} + // destroyConstant - Remove the constant from the constant table... // void ConstantExpr::destroyConstant() { @@ -2107,7 +2148,7 @@ Constant *ConstantDataSequential::getImpl(StringRef Elements, Type *Ty) { // Do a lookup to see if we have already formed one of these. StringMap<ConstantDataSequential*>::MapEntryTy &Slot = Ty->getContext().pImpl->CDSConstants.GetOrCreateValue(Elements); - + // The bucket can point to a linked list of different CDS's that have the same // body but different types. For example, 0,0,0,1 could be a 4 element array // of i8, or a 1-element array of i32. They'll both end up in the same @@ -2117,7 +2158,7 @@ Constant *ConstantDataSequential::getImpl(StringRef Elements, Type *Ty) { Entry = &Node->Next, Node = *Entry) if (Node->getType() == Ty) return Node; - + // Okay, we didn't get a hit. 
Create a node of the right class, link it in, // and return it. if (isa<ArrayType>(Ty)) @@ -2131,7 +2172,7 @@ void ConstantDataSequential::destroyConstant() { // Remove the constant from the StringMap. StringMap<ConstantDataSequential*> &CDSConstants = getType()->getContext().pImpl->CDSConstants; - + StringMap<ConstantDataSequential*>::iterator Slot = CDSConstants.find(getRawDataValues()); @@ -2158,11 +2199,11 @@ void ConstantDataSequential::destroyConstant() { } } } - + // If we were part of a list, make sure that we don't delete the list that is // still owned by the uniquing map. Next = 0; - + // Finally, actually delete it. destroyConstantImpl(); } @@ -2172,27 +2213,33 @@ void ConstantDataSequential::destroyConstant() { /// can return a ConstantAggregateZero object. Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<uint8_t> Elts) { Type *Ty = ArrayType::get(Type::getInt8Ty(Context), Elts.size()); - return getImpl(StringRef((char*)Elts.data(), Elts.size()*1), Ty); + const char *Data = reinterpret_cast<const char *>(Elts.data()); + return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*1), Ty); } Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<uint16_t> Elts){ Type *Ty = ArrayType::get(Type::getInt16Ty(Context), Elts.size()); - return getImpl(StringRef((char*)Elts.data(), Elts.size()*2), Ty); + const char *Data = reinterpret_cast<const char *>(Elts.data()); + return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*2), Ty); } Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<uint32_t> Elts){ Type *Ty = ArrayType::get(Type::getInt32Ty(Context), Elts.size()); - return getImpl(StringRef((char*)Elts.data(), Elts.size()*4), Ty); + const char *Data = reinterpret_cast<const char *>(Elts.data()); + return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*4), Ty); } Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<uint64_t> Elts){ Type *Ty = ArrayType::get(Type::getInt64Ty(Context), Elts.size()); - return getImpl(StringRef((char*)Elts.data(), Elts.size()*8), Ty); + const char *Data = reinterpret_cast<const char *>(Elts.data()); + return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*8), Ty); } Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<float> Elts) { Type *Ty = ArrayType::get(Type::getFloatTy(Context), Elts.size()); - return getImpl(StringRef((char*)Elts.data(), Elts.size()*4), Ty); + const char *Data = reinterpret_cast<const char *>(Elts.data()); + return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*4), Ty); } Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<double> Elts) { Type *Ty = ArrayType::get(Type::getDoubleTy(Context), Elts.size()); - return getImpl(StringRef((char*)Elts.data(), Elts.size()*8), Ty); + const char *Data = reinterpret_cast<const char *>(Elts.data()); + return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*8), Ty); } /// getString - This method constructs a CDS and initializes it with a text @@ -2202,9 +2249,12 @@ Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<double> Elts) { /// to disable this behavior. 
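
The getBinOpIdentity/getBinOpAbsorber helpers added above exist for consumers such as instruction simplifiers. A minimal sketch of a hypothetical caller (trySimplifyBinOp is illustrative, not part of this patch); it leans on the fact that LLVM constants are uniqued, so pointer equality suffices:

static Value *trySimplifyBinOp(unsigned Opcode, Value *X, Constant *C) {
  Type *Ty = C->getType();
  // X op Identity == X, e.g. X + 0, X * 1, X & ~0.
  if (Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, Ty))
    if (C == Identity)
      return X;
  // X op Absorber == Absorber, e.g. X * 0, X & 0, X | ~0.
  if (Constant *Absorber = ConstantExpr::getBinOpAbsorber(Opcode, Ty))
    if (C == Absorber)
      return Absorber;
  return 0; // No simplification applies.
}
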
Constant *ConstantDataArray::getString(LLVMContext &Context, StringRef Str, bool AddNull) { - if (!AddNull) - return get(Context, ArrayRef<uint8_t>((uint8_t*)Str.data(), Str.size())); - + if (!AddNull) { + const uint8_t *Data = reinterpret_cast<const uint8_t *>(Str.data()); + return get(Context, ArrayRef<uint8_t>(const_cast<uint8_t *>(Data), + Str.size())); + } + SmallVector<uint8_t, 64> ElementVals; ElementVals.append(Str.begin(), Str.end()); ElementVals.push_back(0); @@ -2216,27 +2266,33 @@ Constant *ConstantDataArray::getString(LLVMContext &Context, /// can return a ConstantAggregateZero object. Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<uint8_t> Elts){ Type *Ty = VectorType::get(Type::getInt8Ty(Context), Elts.size()); - return getImpl(StringRef((char*)Elts.data(), Elts.size()*1), Ty); + const char *Data = reinterpret_cast<const char *>(Elts.data()); + return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*1), Ty); } Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<uint16_t> Elts){ Type *Ty = VectorType::get(Type::getInt16Ty(Context), Elts.size()); - return getImpl(StringRef((char*)Elts.data(), Elts.size()*2), Ty); + const char *Data = reinterpret_cast<const char *>(Elts.data()); + return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*2), Ty); } Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<uint32_t> Elts){ Type *Ty = VectorType::get(Type::getInt32Ty(Context), Elts.size()); - return getImpl(StringRef((char*)Elts.data(), Elts.size()*4), Ty); + const char *Data = reinterpret_cast<const char *>(Elts.data()); + return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*4), Ty); } Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<uint64_t> Elts){ Type *Ty = VectorType::get(Type::getInt64Ty(Context), Elts.size()); - return getImpl(StringRef((char*)Elts.data(), Elts.size()*8), Ty); + const char *Data = reinterpret_cast<const char *>(Elts.data()); + return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*8), Ty); } Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<float> Elts) { Type *Ty = VectorType::get(Type::getFloatTy(Context), Elts.size()); - return getImpl(StringRef((char*)Elts.data(), Elts.size()*4), Ty); + const char *Data = reinterpret_cast<const char *>(Elts.data()); + return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*4), Ty); } Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<double> Elts) { Type *Ty = VectorType::get(Type::getDoubleTy(Context), Elts.size()); - return getImpl(StringRef((char*)Elts.data(), Elts.size()*8), Ty); + const char *Data = reinterpret_cast<const char *>(Elts.data()); + return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*8), Ty); } Constant *ConstantDataVector::getSplat(unsigned NumElts, Constant *V) { @@ -2281,15 +2337,19 @@ uint64_t ConstantDataSequential::getElementAsInteger(unsigned Elt) const { assert(isa<IntegerType>(getElementType()) && "Accessor can only be used when element is an integer"); const char *EltPtr = getElementPointer(Elt); - + // The data is stored in host byte order, make sure to cast back to the right // type to load with the right endianness. 
switch (getElementType()->getIntegerBitWidth()) { default: llvm_unreachable("Invalid bitwidth for CDS"); - case 8: return *(uint8_t*)EltPtr; - case 16: return *(uint16_t*)EltPtr; - case 32: return *(uint32_t*)EltPtr; - case 64: return *(uint64_t*)EltPtr; + case 8: + return *const_cast<uint8_t *>(reinterpret_cast<const uint8_t *>(EltPtr)); + case 16: + return *const_cast<uint16_t *>(reinterpret_cast<const uint16_t *>(EltPtr)); + case 32: + return *const_cast<uint32_t *>(reinterpret_cast<const uint32_t *>(EltPtr)); + case 64: + return *const_cast<uint64_t *>(reinterpret_cast<const uint64_t *>(EltPtr)); } } @@ -2301,8 +2361,14 @@ APFloat ConstantDataSequential::getElementAsAPFloat(unsigned Elt) const { switch (getElementType()->getTypeID()) { default: llvm_unreachable("Accessor can only be used when element is float/double!"); - case Type::FloatTyID: return APFloat(*(float*)EltPtr); - case Type::DoubleTyID: return APFloat(*(double*)EltPtr); + case Type::FloatTyID: { + const float *FloatPtr = reinterpret_cast<const float *>(EltPtr); + return APFloat(*const_cast<float *>(FloatPtr)); + } + case Type::DoubleTyID: { + const double *DoublePtr = reinterpret_cast<const double *>(EltPtr); + return APFloat(*const_cast<double *>(DoublePtr)); + } } } @@ -2311,7 +2377,8 @@ APFloat ConstantDataSequential::getElementAsAPFloat(unsigned Elt) const { float ConstantDataSequential::getElementAsFloat(unsigned Elt) const { assert(getElementType()->isFloatTy() && "Accessor can only be used when element is a 'float'"); - return *(float*)getElementPointer(Elt); + const float *EltPtr = reinterpret_cast<const float *>(getElementPointer(Elt)); + return *const_cast<float *>(EltPtr); } /// getElementAsDouble - If this is a sequential container of doubles, return @@ -2319,7 +2386,9 @@ float ConstantDataSequential::getElementAsFloat(unsigned Elt) const { double ConstantDataSequential::getElementAsDouble(unsigned Elt) const { assert(getElementType()->isDoubleTy() && "Accessor can only be used when element is a 'double'"); - return *(double*)getElementPointer(Elt); + const double *EltPtr = + reinterpret_cast<const double *>(getElementPointer(Elt)); + return *const_cast<double *>(EltPtr); } /// getElementAsConstant - Return a Constant for a specified index's element. @@ -2328,7 +2397,7 @@ double ConstantDataSequential::getElementAsDouble(unsigned Elt) const { Constant *ConstantDataSequential::getElementAsConstant(unsigned Elt) const { if (getElementType()->isFloatTy() || getElementType()->isDoubleTy()) return ConstantFP::get(getContext(), getElementAsAPFloat(Elt)); - + return ConstantInt::get(getElementType(), getElementAsInteger(Elt)); } @@ -2342,12 +2411,12 @@ bool ConstantDataSequential::isString() const { bool ConstantDataSequential::isCString() const { if (!isString()) return false; - + StringRef Str = getAsString(); - + // The last value must be nul. if (Str.back() != 0) return false; - + // Other elements must be non-nul. return Str.drop_back().find(0) == StringRef::npos; } @@ -2356,13 +2425,13 @@ bool ConstantDataSequential::isCString() const { /// elements have the same value, return that value. Otherwise return NULL. Constant *ConstantDataVector::getSplatValue() const { const char *Base = getRawDataValues().data(); - + // Compare elements 1+ to the 0'th element. unsigned EltSize = getElementByteSize(); for (unsigned i = 1, e = getNumElements(); i != e; ++i) if (memcmp(Base, Base+i*EltSize, EltSize)) return 0; - + // If they're all the same, return the 0th one as a representative.
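
A round-trip sketch of the ConstantData API touched above (illustrative only; Ctx is an assumed existing LLVMContext): the get() overloads pack the elements into a host-byte-order byte string, getImpl() uniques on that string, and the getElementAs* accessors decode the bytes again.

uint32_t Vals[] = { 7, 7, 7, 7 };
Constant *C = ConstantDataArray::get(Ctx, ArrayRef<uint32_t>(Vals));
// An all-zero input would have produced a ConstantAggregateZero instead.
ConstantDataArray *CDA = cast<ConstantDataArray>(C);
uint64_t Second = CDA->getElementAsInteger(1); // 7, decoded from the raw bytes
Constant *Elt = CDA->getElementAsConstant(1);  // the uniqued i32 7
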
return getElementAsConstant(0); } @@ -2393,10 +2462,10 @@ void ConstantArray::replaceUsesOfWithOnConstant(Value *From, Value *To, Lookup.first = cast<ArrayType>(getType()); Values.reserve(getNumOperands()); // Build replacement array. - // Fill values with the modified operands of the constant array. Also, + // Fill values with the modified operands of the constant array. Also, // compute whether this turns into an all-zeros array. unsigned NumUpdated = 0; - + // Keep track of whether all the values in the array are "ToC". bool AllSame = true; for (Use *O = OperandList, *E = OperandList+getNumOperands(); O != E; ++O) { @@ -2408,7 +2477,7 @@ void ConstantArray::replaceUsesOfWithOnConstant(Value *From, Value *To, Values.push_back(Val); AllSame &= Val == ToC; } - + Constant *Replacement = 0; if (AllSame && ToC->isNullValue()) { Replacement = ConstantAggregateZero::get(getType()); @@ -2419,7 +2488,7 @@ void ConstantArray::replaceUsesOfWithOnConstant(Value *From, Value *To, Lookup.second = makeArrayRef(Values); LLVMContextImpl::ArrayConstantsTy::MapTy::iterator I = pImpl->ArrayConstants.find(Lookup); - + if (I != pImpl->ArrayConstants.map_end()) { Replacement = I->first; } else { @@ -2428,7 +2497,7 @@ void ConstantArray::replaceUsesOfWithOnConstant(Value *From, Value *To, // old with the new, then deleting the old... just update the current one // in place! pImpl->ArrayConstants.remove(this); - + // Update to the new value. Optimize for the case when we have a single // operand that we're changing, but handle bulk updates efficiently. if (NumUpdated == 1) { @@ -2445,13 +2514,13 @@ void ConstantArray::replaceUsesOfWithOnConstant(Value *From, Value *To, return; } } - + // Otherwise, I do need to replace this with an existing value. assert(Replacement != this && "I didn't contain From!"); - + // Everyone using this now uses the replacement. replaceAllUsesWith(Replacement); - + // Delete the old constant! destroyConstant(); } @@ -2468,8 +2537,8 @@ void ConstantStruct::replaceUsesOfWithOnConstant(Value *From, Value *To, LLVMContextImpl::StructConstantsTy::LookupKey Lookup; Lookup.first = cast<StructType>(getType()); Values.reserve(getNumOperands()); // Build replacement struct. - - // Fill values with the modified operands of the constant struct. Also, + + // Fill values with the modified operands of the constant struct. Also, // compute whether this turns into an all-zeros struct. bool isAllZeros = false; bool isAllUndef = false; @@ -2492,9 +2561,9 @@ void ConstantStruct::replaceUsesOfWithOnConstant(Value *From, Value *To, Values.push_back(cast<Constant>(O->get())); } Values[OperandToUpdate] = ToC; - + LLVMContextImpl *pImpl = getContext().pImpl; - + Constant *Replacement = 0; if (isAllZeros) { Replacement = ConstantAggregateZero::get(getType()); @@ -2505,7 +2574,7 @@ void ConstantStruct::replaceUsesOfWithOnConstant(Value *From, Value *To, Lookup.second = makeArrayRef(Values); LLVMContextImpl::StructConstantsTy::MapTy::iterator I = pImpl->StructConstants.find(Lookup); - + if (I != pImpl->StructConstants.map_end()) { Replacement = I->first; } else { @@ -2514,19 +2583,19 @@ void ConstantStruct::replaceUsesOfWithOnConstant(Value *From, Value *To, // old with the new, then deleting the old... just update the current one // in place! pImpl->StructConstants.remove(this); - + // Update to the new value. setOperand(OperandToUpdate, ToC); pImpl->StructConstants.insert(this); return; } } - + assert(Replacement != this && "I didn't contain From!"); - + // Everyone using this now uses the replacement. 
replaceAllUsesWith(Replacement); - + // Delete the old constant! destroyConstant(); } @@ -2534,7 +2603,7 @@ void ConstantStruct::replaceUsesOfWithOnConstant(Value *From, Value *To, void ConstantVector::replaceUsesOfWithOnConstant(Value *From, Value *To, Use *U) { assert(isa<Constant>(To) && "Cannot make Constant refer to non-constant!"); - + SmallVector<Constant*, 8> Values; Values.reserve(getNumOperands()); // Build replacement array... for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { @@ -2542,13 +2611,13 @@ void ConstantVector::replaceUsesOfWithOnConstant(Value *From, Value *To, if (Val == From) Val = cast<Constant>(To); Values.push_back(Val); } - + Constant *Replacement = get(Values); assert(Replacement != this && "I didn't contain From!"); - + // Everyone using this now uses the replacement. replaceAllUsesWith(Replacement); - + // Delete the old constant! destroyConstant(); } @@ -2557,19 +2626,19 @@ void ConstantExpr::replaceUsesOfWithOnConstant(Value *From, Value *ToV, Use *U) { assert(isa<Constant>(ToV) && "Cannot make Constant refer to non-constant!"); Constant *To = cast<Constant>(ToV); - + SmallVector<Constant*, 8> NewOps; for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { Constant *Op = getOperand(i); NewOps.push_back(Op == From ? To : Op); } - + Constant *Replacement = getWithOperands(NewOps); assert(Replacement != this && "I didn't contain From!"); - + // Everyone using this now uses the replacement. replaceAllUsesWith(Replacement); - + // Delete the old constant! destroyConstant(); } diff --git a/lib/VMCore/Core.cpp b/lib/VMCore/Core.cpp index a9cca22..972db3c 100644 --- a/lib/VMCore/Core.cpp +++ b/lib/VMCore/Core.cpp @@ -115,6 +115,25 @@ void LLVMDumpModule(LLVMModuleRef M) { unwrap(M)->dump(); } +LLVMBool LLVMPrintModuleToFile(LLVMModuleRef M, const char *Filename, + char **ErrorMessage) { + std::string error; + raw_fd_ostream dest(Filename, error); + if (!error.empty()) { + *ErrorMessage = strdup(error.c_str()); + return true; + } + + unwrap(M)->print(dest, NULL); + + if (!error.empty()) { + *ErrorMessage = strdup(error.c_str()); + return true; + } + dest.flush(); + return false; +} + /*--.. 
Operations on inline assembler ......................................--*/ void LLVMSetModuleInlineAsm(LLVMModuleRef M, const char *Asm) { unwrap(M)->setModuleInlineAsm(StringRef(Asm)); @@ -1191,7 +1210,7 @@ LLVMValueRef LLVMAddGlobalInAddressSpace(LLVMModuleRef M, LLVMTypeRef Ty, unsigned AddressSpace) { return wrap(new GlobalVariable(*unwrap(M), unwrap(Ty), false, GlobalValue::ExternalLinkage, 0, Name, 0, - false, AddressSpace)); + GlobalVariable::NotThreadLocal, AddressSpace)); } LLVMValueRef LLVMGetNamedGlobal(LLVMModuleRef M, const char *Name) { diff --git a/lib/Analysis/DIBuilder.cpp b/lib/VMCore/DIBuilder.cpp index 85913b1..f5894e9 100644 --- a/lib/Analysis/DIBuilder.cpp +++ b/lib/VMCore/DIBuilder.cpp @@ -11,9 +11,9 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Analysis/DIBuilder.h" -#include "llvm/Analysis/DebugInfo.h" +#include "llvm/DIBuilder.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" #include "llvm/IntrinsicInst.h" #include "llvm/Module.h" #include "llvm/ADT/STLExtras.h" @@ -47,16 +47,16 @@ void DIBuilder::finalize() { DIType(TempSubprograms).replaceAllUsesWith(SPs); for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) { DISubprogram SP(SPs.getElement(i)); + SmallVector<Value *, 4> Variables; if (NamedMDNode *NMD = getFnSpecificMDNode(M, SP)) { - SmallVector<Value *, 4> Variables; for (unsigned ii = 0, ee = NMD->getNumOperands(); ii != ee; ++ii) Variables.push_back(NMD->getOperand(ii)); - if (MDNode *Temp = SP.getVariablesNodes()) { - DIArray AV = getOrCreateArray(Variables); - DIType(Temp).replaceAllUsesWith(AV); - } NMD->eraseFromParent(); } + if (MDNode *Temp = SP.getVariablesNodes()) { + DIArray AV = getOrCreateArray(Variables); + DIType(Temp).replaceAllUsesWith(AV); + } } DIArray GVs = getOrCreateArray(AllGVs); @@ -101,7 +101,7 @@ void DIBuilder::createCompileUnit(unsigned Lang, StringRef Filename, Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_compile_unit), - llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)), ConstantInt::get(Type::getInt32Ty(VMContext), Lang), MDString::get(VMContext, Filename), MDString::get(VMContext, Directory), @@ -163,7 +163,7 @@ DIType DIBuilder::createNullPtrType(StringRef Name) { ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags; - ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Encoding + ConstantInt::get(Type::getInt32Ty(VMContext), 0) // Encoding }; return DIType(MDNode::get(VMContext, Elts)); } @@ -229,12 +229,13 @@ DIType DIBuilder::createPointerType(DIType PointeeTy, uint64_t SizeInBits, return DIType(MDNode::get(VMContext, Elts)); } -/// createReferenceType - Create debugging information entry for a reference. -DIType DIBuilder::createReferenceType(DIType RTy) { +/// createReferenceType - Create debugging information entry for a reference +/// type. +DIType DIBuilder::createReferenceType(unsigned Tag, DIType RTy) { assert(RTy.Verify() && "Unable to create reference type"); // References are encoded in DIDerivedType format. Value *Elts[] = { - GetTagConstant(VMContext, dwarf::DW_TAG_reference_type), + GetTagConstant(VMContext, Tag), NULL, // TheCU, NULL, // Name NULL, // Filename @@ -387,11 +388,11 @@ DIType DIBuilder::createObjCIVar(StringRef Name, /// createObjCProperty - Create debugging information entry for Objective-C /// property. 
DIObjCProperty DIBuilder::createObjCProperty(StringRef Name, - DIFile File, unsigned LineNumber, + DIFile File, unsigned LineNumber, StringRef GetterName, StringRef SetterName, unsigned PropertyAttributes, - DIType Ty) { + DIType Ty) { Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_APPLE_property), MDString::get(VMContext, Name), @@ -405,33 +406,6 @@ DIObjCProperty DIBuilder::createObjCProperty(StringRef Name, return DIObjCProperty(MDNode::get(VMContext, Elts)); } -/// createClassType - Create debugging information entry for a class. -DIType DIBuilder::createClassType(DIDescriptor Context, StringRef Name, - DIFile File, unsigned LineNumber, - uint64_t SizeInBits, uint64_t AlignInBits, - uint64_t OffsetInBits, unsigned Flags, - DIType DerivedFrom, DIArray Elements, - MDNode *VTableHolder, MDNode *TemplateParams) { - // TAG_class_type is encoded in DICompositeType format. - Value *Elts[] = { - GetTagConstant(VMContext, dwarf::DW_TAG_class_type), - getNonCompileUnitScope(Context), - MDString::get(VMContext, Name), - File, - ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber), - ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits), - ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits), - ConstantInt::get(Type::getInt32Ty(VMContext), OffsetInBits), - ConstantInt::get(Type::getInt32Ty(VMContext), Flags), - DerivedFrom, - Elements, - ConstantInt::get(Type::getInt32Ty(VMContext), 0), - VTableHolder, - TemplateParams - }; - return DIType(MDNode::get(VMContext, Elts)); -} - /// createTemplateTypeParameter - Create debugging information for template /// type parameter. DITemplateTypeParameter @@ -470,6 +444,34 @@ DIBuilder::createTemplateValueParameter(DIDescriptor Context, StringRef Name, return DITemplateValueParameter(MDNode::get(VMContext, Elts)); } +/// createClassType - Create debugging information entry for a class. +DIType DIBuilder::createClassType(DIDescriptor Context, StringRef Name, + DIFile File, unsigned LineNumber, + uint64_t SizeInBits, uint64_t AlignInBits, + uint64_t OffsetInBits, unsigned Flags, + DIType DerivedFrom, DIArray Elements, + MDNode *VTableHolder, + MDNode *TemplateParams) { + // TAG_class_type is encoded in DICompositeType format. + Value *Elts[] = { + GetTagConstant(VMContext, dwarf::DW_TAG_class_type), + getNonCompileUnitScope(Context), + MDString::get(VMContext, Name), + File, + ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber), + ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits), + ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits), + ConstantInt::get(Type::getInt32Ty(VMContext), OffsetInBits), + ConstantInt::get(Type::getInt32Ty(VMContext), Flags), + DerivedFrom, + Elements, + ConstantInt::get(Type::getInt32Ty(VMContext), 0), + VTableHolder, + TemplateParams + }; + return DIType(MDNode::get(VMContext, Elts)); +} + /// createStructType - Create debugging information entry for a struct. 
DIType DIBuilder::createStructType(DIDescriptor Context, StringRef Name, DIFile File, unsigned LineNumber, @@ -490,7 +492,7 @@ DIType DIBuilder::createStructType(DIDescriptor Context, StringRef Name, NULL, Elements, ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeLang), - llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)) }; return DIType(MDNode::get(VMContext, Elts)); } @@ -515,7 +517,7 @@ DIType DIBuilder::createUnionType(DIDescriptor Scope, StringRef Name, NULL, Elements, ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeLang), - llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)) }; return DIType(MDNode::get(VMContext, Elts)); } @@ -525,9 +527,9 @@ DIType DIBuilder::createSubroutineType(DIFile File, DIArray ParameterTypes) { // TAG_subroutine_type is encoded in DICompositeType format. Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_subroutine_type), - llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)), MDString::get(VMContext, ""), - llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)), ConstantInt::get(Type::getInt32Ty(VMContext), 0), ConstantInt::get(Type::getInt64Ty(VMContext), 0), ConstantInt::get(Type::getInt64Ty(VMContext), 0), @@ -536,7 +538,7 @@ DIType DIBuilder::createSubroutineType(DIFile File, DIArray ParameterTypes) { NULL, ParameterTypes, ConstantInt::get(Type::getInt32Ty(VMContext), 0), - llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)) }; return DIType(MDNode::get(VMContext, Elts)); } @@ -547,7 +549,8 @@ DIType DIBuilder::createEnumerationType(DIDescriptor Scope, StringRef Name, DIFile File, unsigned LineNumber, uint64_t SizeInBits, uint64_t AlignInBits, - DIArray Elements) { + DIArray Elements, + DIType ClassType, unsigned Flags) { // TAG_enumeration_type is encoded in DICompositeType format. 
Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_enumeration_type), @@ -558,11 +561,11 @@ DIType DIBuilder::createEnumerationType(DIDescriptor Scope, StringRef Name, ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits), ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits), ConstantInt::get(Type::getInt32Ty(VMContext), 0), - ConstantInt::get(Type::getInt32Ty(VMContext), 0), - NULL, + ConstantInt::get(Type::getInt32Ty(VMContext), Flags), + ClassType, Elements, ConstantInt::get(Type::getInt32Ty(VMContext), 0), - llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)) }; MDNode *Node = MDNode::get(VMContext, Elts); AllEnumTypes.push_back(Node); @@ -586,7 +589,7 @@ DIType DIBuilder::createArrayType(uint64_t Size, uint64_t AlignInBits, Ty, Subscripts, ConstantInt::get(Type::getInt32Ty(VMContext), 0), - llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)) }; return DIType(MDNode::get(VMContext, Elts)); } @@ -608,7 +611,7 @@ DIType DIBuilder::createVectorType(uint64_t Size, uint64_t AlignInBits, Ty, Subscripts, ConstantInt::get(Type::getInt32Ty(VMContext), 0), - llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)) }; return DIType(MDNode::get(VMContext, Elts)); } @@ -677,12 +680,13 @@ DIType DIBuilder::createTemporaryType(DIFile F) { /// createForwardDecl - Create a temporary forward-declared type that /// can be RAUW'd if the full type is seen. -DIType DIBuilder::createForwardDecl(unsigned Tag, StringRef Name, DIFile F, +DIType DIBuilder::createForwardDecl(unsigned Tag, StringRef Name, + DIDescriptor Scope, DIFile F, unsigned Line, unsigned RuntimeLang) { // Create a temporary MDNode. Value *Elts[] = { GetTagConstant(VMContext, Tag), - NULL, // TheCU + getNonCompileUnitScope(Scope), MDString::get(VMContext, Name), F, ConstantInt::get(Type::getInt32Ty(VMContext), Line), @@ -703,7 +707,7 @@ DIType DIBuilder::createForwardDecl(unsigned Tag, StringRef Name, DIFile F, /// getOrCreateArray - Get a DIArray, create one if required. DIArray DIBuilder::getOrCreateArray(ArrayRef<Value *> Elements) { if (Elements.empty()) { - Value *Null = llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)); + Value *Null = Constant::getNullValue(Type::getInt32Ty(VMContext)); return DIArray(MDNode::get(VMContext, Null)); } return DIArray(MDNode::get(VMContext, Elements)); @@ -724,10 +728,10 @@ DISubrange DIBuilder::getOrCreateSubrange(int64_t Lo, int64_t Hi) { /// createGlobalVariable - Create a new descriptor for the specified global. 
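
A sketch of how a frontend might drive the two signatures changed above, createForwardDecl (which now takes an explicit Scope) and createEnumerationType (which now takes an underlying ClassType and Flags). DIB, CU, File, EnumElts, and UnderlyingTy are assumed, illustrative names, not part of this patch:

// Forward-declare a struct inside CU rather than at an unknown scope.
DIType Fwd = DIB.createForwardDecl(dwarf::DW_TAG_structure_type, "S",
                                   CU, File, /*Line=*/10, /*RuntimeLang=*/0);
// An enumeration can now record a fixed underlying type and flags.
DIType E = DIB.createEnumerationType(CU, "E", File, /*LineNumber=*/12,
                                     /*SizeInBits=*/32, /*AlignInBits=*/32,
                                     EnumElts, /*ClassType=*/UnderlyingTy,
                                     /*Flags=*/0);
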
DIGlobalVariable DIBuilder:: createGlobalVariable(StringRef Name, DIFile F, unsigned LineNumber, - DIType Ty, bool isLocalToUnit, llvm::Value *Val) { + DIType Ty, bool isLocalToUnit, Value *Val) { Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_variable), - llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)), NULL, // TheCU, MDString::get(VMContext, Name), MDString::get(VMContext, Name), @@ -749,10 +753,10 @@ createGlobalVariable(StringRef Name, DIFile F, unsigned LineNumber, DIGlobalVariable DIBuilder:: createStaticVariable(DIDescriptor Context, StringRef Name, StringRef LinkageName, DIFile F, unsigned LineNumber, - DIType Ty, bool isLocalToUnit, llvm::Value *Val) { + DIType Ty, bool isLocalToUnit, Value *Val) { Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_variable), - llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)), getNonCompileUnitScope(Context), MDString::get(VMContext, Name), MDString::get(VMContext, Name), @@ -783,7 +787,7 @@ DIVariable DIBuilder::createLocalVariable(unsigned Tag, DIDescriptor Scope, ConstantInt::get(Type::getInt32Ty(VMContext), (LineNo | (ArgNo << 24))), Ty, ConstantInt::get(Type::getInt32Ty(VMContext), Flags), - Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)) }; MDNode *Node = MDNode::get(VMContext, Elts); if (AlwaysPreserve) { @@ -812,8 +816,8 @@ DIVariable DIBuilder::createComplexVariable(unsigned Tag, DIDescriptor Scope, Elts.push_back(ConstantInt::get(Type::getInt32Ty(VMContext), (LineNo | (ArgNo << 24)))); Elts.push_back(Ty); - Elts.push_back(llvm::Constant::getNullValue(Type::getInt32Ty(VMContext))); - Elts.push_back(llvm::Constant::getNullValue(Type::getInt32Ty(VMContext))); + Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext))); + Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext))); Elts.append(Addr.begin(), Addr.end()); return DIVariable(MDNode::get(VMContext, Elts)); @@ -838,7 +842,7 @@ DISubprogram DIBuilder::createFunction(DIDescriptor Context, Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_subprogram), - llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)), getNonCompileUnitScope(Context), MDString::get(VMContext, Name), MDString::get(VMContext, Name), @@ -887,7 +891,7 @@ DISubprogram DIBuilder::createMethod(DIDescriptor Context, Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_subprogram), - llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)), getNonCompileUnitScope(Context), MDString::get(VMContext, Name), MDString::get(VMContext, Name), @@ -904,9 +908,9 @@ DISubprogram DIBuilder::createMethod(DIDescriptor Context, ConstantInt::get(Type::getInt1Ty(VMContext), isOptimized), Fn, TParam, - llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), + Constant::getNullValue(Type::getInt32Ty(VMContext)), THolder, - // FIXME: Do we want to use a different scope lines? + // FIXME: Do we want to use different scope/lines? 
ConstantInt::get(Type::getInt32Ty(VMContext), LineNo) }; MDNode *Node = MDNode::get(VMContext, Elts); diff --git a/lib/Analysis/DebugInfo.cpp b/lib/VMCore/DebugInfo.cpp index f61a8f3..c8f8f7d 100644 --- a/lib/Analysis/DebugInfo.cpp +++ b/lib/VMCore/DebugInfo.cpp @@ -12,7 +12,7 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Analysis/DebugInfo.h" +#include "llvm/DebugInfo.h" #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/Intrinsics.h" @@ -112,16 +112,16 @@ Function *DIDescriptor::getFunctionField(unsigned Elt) const { } unsigned DIVariable::getNumAddrElements() const { - if (getVersion() <= llvm::LLVMDebugVersion8) + if (getVersion() <= LLVMDebugVersion8) return DbgNode->getNumOperands()-6; - if (getVersion() == llvm::LLVMDebugVersion9) + if (getVersion() == LLVMDebugVersion9) return DbgNode->getNumOperands()-7; return DbgNode->getNumOperands()-8; } /// getInlinedAt - If this variable is inlined then return inline location. MDNode *DIVariable::getInlinedAt() const { - if (getVersion() <= llvm::LLVMDebugVersion9) + if (getVersion() <= LLVMDebugVersion9) return NULL; return dyn_cast_or_null<MDNode>(DbgNode->getOperand(7)); } @@ -150,6 +150,7 @@ bool DIDescriptor::isDerivedType() const { case dwarf::DW_TAG_typedef: case dwarf::DW_TAG_pointer_type: case dwarf::DW_TAG_reference_type: + case dwarf::DW_TAG_rvalue_reference_type: case dwarf::DW_TAG_const_type: case dwarf::DW_TAG_volatile_type: case dwarf::DW_TAG_restrict_type: @@ -399,11 +400,13 @@ bool DIType::Verify() const { unsigned Tag = getTag(); if (!isBasicType() && Tag != dwarf::DW_TAG_const_type && Tag != dwarf::DW_TAG_volatile_type && Tag != dwarf::DW_TAG_pointer_type && - Tag != dwarf::DW_TAG_reference_type && Tag != dwarf::DW_TAG_restrict_type - && Tag != dwarf::DW_TAG_vector_type && Tag != dwarf::DW_TAG_array_type - && Tag != dwarf::DW_TAG_enumeration_type - && Tag != dwarf::DW_TAG_subroutine_type - && getFilename().empty()) + Tag != dwarf::DW_TAG_reference_type && + Tag != dwarf::DW_TAG_rvalue_reference_type && + Tag != dwarf::DW_TAG_restrict_type && Tag != dwarf::DW_TAG_vector_type && + Tag != dwarf::DW_TAG_array_type && + Tag != dwarf::DW_TAG_enumeration_type && + Tag != dwarf::DW_TAG_subroutine_type && + getFilename().empty()) return false; return true; } @@ -500,27 +503,28 @@ bool DINameSpace::Verify() const { uint64_t DIDerivedType::getOriginalTypeSize() const { unsigned Tag = getTag(); - if (Tag == dwarf::DW_TAG_member || Tag == dwarf::DW_TAG_typedef || - Tag == dwarf::DW_TAG_const_type || Tag == dwarf::DW_TAG_volatile_type || - Tag == dwarf::DW_TAG_restrict_type) { - DIType BaseType = getTypeDerivedFrom(); - // If this type is not derived from any type then take conservative - // approach. - if (!BaseType.isValid()) - return getSizeInBits(); - // If this is a derived type, go ahead and get the base type, unless - // it's a reference then it's just the size of the field. Pointer types - // have no need of this since they're a different type of qualification - // on the type. 
- if (BaseType.getTag() == dwarf::DW_TAG_reference_type) - return getSizeInBits(); - else if (BaseType.isDerivedType()) - return DIDerivedType(BaseType).getOriginalTypeSize(); - else - return BaseType.getSizeInBits(); - } + if (Tag != dwarf::DW_TAG_member && Tag != dwarf::DW_TAG_typedef && + Tag != dwarf::DW_TAG_const_type && Tag != dwarf::DW_TAG_volatile_type && + Tag != dwarf::DW_TAG_restrict_type) + return getSizeInBits(); + + DIType BaseType = getTypeDerivedFrom(); + + // If this type is not derived from any type then take conservative approach. + if (!BaseType.isValid()) + return getSizeInBits(); + + // If this is a derived type, go ahead and get the base type, unless it's a + // reference then it's just the size of the field. Pointer types have no need + // of this since they're a different type of qualification on the type. + if (BaseType.getTag() == dwarf::DW_TAG_reference_type || + BaseType.getTag() == dwarf::DW_TAG_rvalue_reference_type) + return getSizeInBits(); + + if (BaseType.isDerivedType()) + return DIDerivedType(BaseType).getOriginalTypeSize(); - return getSizeInBits(); + return BaseType.getSizeInBits(); } /// getObjCProperty - Return property node, if this ivar is associated with one. @@ -538,7 +542,7 @@ bool DIVariable::isInlinedFnArgument(const Function *CurFn) { return false; // This variable is not inlined function argument if its scope // does not describe current function. - return !(DISubprogram(getContext()).describes(CurFn)); + return !DISubprogram(getContext()).describes(CurFn); } /// describes - Return true if this subprogram provides debugging @@ -660,257 +664,6 @@ DIArray DICompileUnit::getGlobalVariables() const { return DIArray(); } -//===----------------------------------------------------------------------===// -// DIDescriptor: vtable anchors for all descriptors. -//===----------------------------------------------------------------------===// - -void DIScope::anchor() { } - -void DICompileUnit::anchor() { } - -void DIFile::anchor() { } - -void DIType::anchor() { } - -void DIBasicType::anchor() { } - -void DIDerivedType::anchor() { } - -void DICompositeType::anchor() { } - -void DISubprogram::anchor() { } - -void DILexicalBlock::anchor() { } - -void DINameSpace::anchor() { } - -void DILexicalBlockFile::anchor() { } - -//===----------------------------------------------------------------------===// -// DIDescriptor: dump routines for all descriptors. -//===----------------------------------------------------------------------===// - - -/// print - Print descriptor. -void DIDescriptor::print(raw_ostream &OS) const { - OS << "[" << dwarf::TagString(getTag()) << "] "; - OS.write_hex((intptr_t) &*DbgNode) << ']'; -} - -/// print - Print compile unit. -void DICompileUnit::print(raw_ostream &OS) const { - if (getLanguage()) - OS << " [" << dwarf::LanguageString(getLanguage()) << "] "; - - OS << " [" << getDirectory() << "/" << getFilename() << "]"; -} - -/// print - Print type. 
-void DIType::print(raw_ostream &OS) const { - if (!DbgNode) return; - - StringRef Res = getName(); - if (!Res.empty()) - OS << " [" << Res << "] "; - - unsigned Tag = getTag(); - OS << " [" << dwarf::TagString(Tag) << "] "; - - // TODO : Print context - OS << " [" - << "line " << getLineNumber() << ", " - << getSizeInBits() << " bits, " - << getAlignInBits() << " bit alignment, " - << getOffsetInBits() << " bit offset" - << "] "; - - if (isPrivate()) - OS << " [private] "; - else if (isProtected()) - OS << " [protected] "; - - if (isForwardDecl()) - OS << " [fwd] "; - - if (isBasicType()) - DIBasicType(DbgNode).print(OS); - else if (isDerivedType()) { - DIDerivedType DTy = DIDerivedType(DbgNode); - DTy.print(OS); - DICompositeType CTy = getDICompositeType(DTy); - if (CTy.Verify()) - CTy.print(OS); - } - else if (isCompositeType()) - DICompositeType(DbgNode).print(OS); - else { - OS << "Invalid DIType\n"; - return; - } - - OS << "\n"; -} - -/// print - Print basic type. -void DIBasicType::print(raw_ostream &OS) const { - OS << " [" << dwarf::AttributeEncodingString(getEncoding()) << "] "; -} - -/// print - Print derived type. -void DIDerivedType::print(raw_ostream &OS) const { - OS << "\n\t Derived From: "; - getTypeDerivedFrom().print(OS); - OS << "\n\t"; -} - -/// print - Print composite type. -void DICompositeType::print(raw_ostream &OS) const { - DIArray A = getTypeArray(); - OS << " [" << A.getNumElements() << " elements]"; -} - -/// print - Print subprogram. -void DISubprogram::print(raw_ostream &OS) const { - StringRef Res = getName(); - if (!Res.empty()) - OS << " [" << Res << "] "; - - unsigned Tag = getTag(); - OS << " [" << dwarf::TagString(Tag) << "] "; - - // TODO : Print context - OS << " [" << getLineNumber() << "] "; - - if (isLocalToUnit()) - OS << " [local] "; - - if (isDefinition()) - OS << " [def] "; - - if (getScopeLineNumber() != getLineNumber()) - OS << " [Scope: " << getScopeLineNumber() << "] "; - - OS << "\n"; -} - -/// print - Print global variable. -void DIGlobalVariable::print(raw_ostream &OS) const { - OS << " ["; - StringRef Res = getName(); - if (!Res.empty()) - OS << " [" << Res << "] "; - - unsigned Tag = getTag(); - OS << " [" << dwarf::TagString(Tag) << "] "; - - // TODO : Print context - OS << " [" << getLineNumber() << "] "; - - if (isLocalToUnit()) - OS << " [local] "; - - if (isDefinition()) - OS << " [def] "; - - if (isGlobalVariable()) - DIGlobalVariable(DbgNode).print(OS); - OS << "]\n"; -} - -static void printDebugLoc(DebugLoc DL, raw_ostream &CommentOS, - const LLVMContext &Ctx) { - if (!DL.isUnknown()) { // Print source line info. - DIScope Scope(DL.getScope(Ctx)); - // Omit the directory, because it's likely to be long and uninteresting. 
- if (Scope.Verify()) - CommentOS << Scope.getFilename(); - else - CommentOS << "<unknown>"; - CommentOS << ':' << DL.getLine(); - if (DL.getCol() != 0) - CommentOS << ':' << DL.getCol(); - DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(DL.getInlinedAt(Ctx)); - if (!InlinedAtDL.isUnknown()) { - CommentOS << " @[ "; - printDebugLoc(InlinedAtDL, CommentOS, Ctx); - CommentOS << " ]"; - } - } -} - -void DIVariable::printExtendedName(raw_ostream &OS) const { - const LLVMContext &Ctx = DbgNode->getContext(); - StringRef Res = getName(); - if (!Res.empty()) - OS << Res << "," << getLineNumber(); - if (MDNode *InlinedAt = getInlinedAt()) { - DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(InlinedAt); - if (!InlinedAtDL.isUnknown()) { - OS << " @["; - printDebugLoc(InlinedAtDL, OS, Ctx); - OS << "]"; - } - } -} - -/// print - Print variable. -void DIVariable::print(raw_ostream &OS) const { - StringRef Res = getName(); - if (!Res.empty()) - OS << " [" << Res << "] "; - - OS << " [" << getLineNumber() << "] "; - getType().print(OS); - OS << "\n"; - - // FIXME: Dump complex addresses -} - -/// dump - Print descriptor to dbgs() with a newline. -void DIDescriptor::dump() const { - print(dbgs()); dbgs() << '\n'; -} - -/// dump - Print compile unit to dbgs() with a newline. -void DICompileUnit::dump() const { - print(dbgs()); dbgs() << '\n'; -} - -/// dump - Print type to dbgs() with a newline. -void DIType::dump() const { - print(dbgs()); dbgs() << '\n'; -} - -/// dump - Print basic type to dbgs() with a newline. -void DIBasicType::dump() const { - print(dbgs()); dbgs() << '\n'; -} - -/// dump - Print derived type to dbgs() with a newline. -void DIDerivedType::dump() const { - print(dbgs()); dbgs() << '\n'; -} - -/// dump - Print composite type to dbgs() with a newline. -void DICompositeType::dump() const { - print(dbgs()); dbgs() << '\n'; -} - -/// dump - Print subprogram to dbgs() with a newline. -void DISubprogram::dump() const { - print(dbgs()); dbgs() << '\n'; -} - -/// dump - Print global variable. -void DIGlobalVariable::dump() const { - print(dbgs()); dbgs() << '\n'; -} - -/// dump - Print variable. -void DIVariable::dump() const { - print(dbgs()); dbgs() << '\n'; -} - /// fixupObjcLikeName - Replace contains special characters used /// in a typical Objective-C names with '.' in a given string. static void fixupObjcLikeName(StringRef Str, SmallVectorImpl<char> &Out) { @@ -981,11 +734,50 @@ DIVariable llvm::cleanseInlinedVariable(MDNode *DV, LLVMContext &VMContext) { // Insert inlined scope as 7th element. for (unsigned i = 0, e = DV->getNumOperands(); i != e; ++i) i == 7 ? - Elts.push_back(llvm::Constant::getNullValue(Type::getInt32Ty(VMContext))): + Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext))): Elts.push_back(DV->getOperand(i)); return DIVariable(MDNode::get(VMContext, Elts)); } +/// getDISubprogram - Find subprogram that is enclosing this scope. +DISubprogram llvm::getDISubprogram(const MDNode *Scope) { + DIDescriptor D(Scope); + if (D.isSubprogram()) + return DISubprogram(Scope); + + if (D.isLexicalBlockFile()) + return getDISubprogram(DILexicalBlockFile(Scope).getContext()); + + if (D.isLexicalBlock()) + return getDISubprogram(DILexicalBlock(Scope).getContext()); + + return DISubprogram(); +} + +/// getDICompositeType - Find underlying composite type. 
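
getDISubprogram above walks a lexical-scope chain up to its owning subprogram. A short usage sketch (hypothetical; assumes a DebugLoc DL and an LLVMContext Ctx are in scope):

if (MDNode *Scope = DL.getScope(Ctx)) {
  DISubprogram SP = getDISubprogram(Scope);
  if (SP.Verify()) // A scope chain may not terminate in a subprogram.
    errs() << "scope belongs to '" << SP.getName() << "'\n";
}
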
+DICompositeType llvm::getDICompositeType(DIType T) { + if (T.isCompositeType()) + return DICompositeType(T); + + if (T.isDerivedType()) + return getDICompositeType(DIDerivedType(T).getTypeDerivedFrom()); + + return DICompositeType(); +} + +/// isSubprogramContext - Return true if Context is either a subprogram +/// or another context nested inside a subprogram. +bool llvm::isSubprogramContext(const MDNode *Context) { + if (!Context) + return false; + DIDescriptor D(Context); + if (D.isSubprogram()) + return true; + if (D.isType()) + return isSubprogramContext(DIType(Context).getContext()); + return false; +} + //===----------------------------------------------------------------------===// // DebugInfoFinder implementations. //===----------------------------------------------------------------------===// @@ -1188,42 +980,189 @@ bool DebugInfoFinder::addSubprogram(DISubprogram SP) { return true; } -/// getDISubprogram - Find subprogram that is enclosing this scope. -DISubprogram llvm::getDISubprogram(const MDNode *Scope) { - DIDescriptor D(Scope); - if (D.isSubprogram()) - return DISubprogram(Scope); +//===----------------------------------------------------------------------===// +// DIDescriptor: dump routines for all descriptors. +//===----------------------------------------------------------------------===// - if (D.isLexicalBlockFile()) - return getDISubprogram(DILexicalBlockFile(Scope).getContext()); - - if (D.isLexicalBlock()) - return getDISubprogram(DILexicalBlock(Scope).getContext()); +/// dump - Print descriptor to dbgs() with a newline. +void DIDescriptor::dump() const { + print(dbgs()); dbgs() << '\n'; +} - return DISubprogram(); +/// print - Print descriptor. +void DIDescriptor::print(raw_ostream &OS) const { + if (!DbgNode) return; + + if (const char *Tag = dwarf::TagString(getTag())) + OS << "[ " << Tag << " ]"; + + if (this->isSubrange()) { + DISubrange(DbgNode).printInternal(OS); + } else if (this->isCompileUnit()) { + DICompileUnit(DbgNode).printInternal(OS); + } else if (this->isFile()) { + DIFile(DbgNode).printInternal(OS); + } else if (this->isEnumerator()) { + DIEnumerator(DbgNode).printInternal(OS); + } else if (this->isBasicType()) { + DIType(DbgNode).printInternal(OS); + } else if (this->isDerivedType()) { + DIDerivedType(DbgNode).printInternal(OS); + } else if (this->isCompositeType()) { + DICompositeType(DbgNode).printInternal(OS); + } else if (this->isSubprogram()) { + DISubprogram(DbgNode).printInternal(OS); + } else if (this->isGlobalVariable()) { + DIGlobalVariable(DbgNode).printInternal(OS); + } else if (this->isVariable()) { + DIVariable(DbgNode).printInternal(OS); + } else if (this->isObjCProperty()) { + DIObjCProperty(DbgNode).printInternal(OS); + } else if (this->isScope()) { + DIScope(DbgNode).printInternal(OS); + } } -/// getDICompositeType - Find underlying composite type. 
-DICompositeType llvm::getDICompositeType(DIType T) { - if (T.isCompositeType()) - return DICompositeType(T); +void DISubrange::printInternal(raw_ostream &OS) const { + OS << " [" << getLo() << ", " << getHi() << ']'; +} - if (T.isDerivedType()) - return getDICompositeType(DIDerivedType(T).getTypeDerivedFrom()); +void DIScope::printInternal(raw_ostream &OS) const { + OS << " [" << getDirectory() << "/" << getFilename() << ']'; +} - return DICompositeType(); +void DICompileUnit::printInternal(raw_ostream &OS) const { + DIScope::printInternal(OS); + if (unsigned Lang = getLanguage()) + OS << " [" << dwarf::LanguageString(Lang) << ']'; } -/// isSubprogramContext - Return true if Context is either a subprogram -/// or another context nested inside a subprogram. -bool llvm::isSubprogramContext(const MDNode *Context) { - if (!Context) - return false; - DIDescriptor D(Context); - if (D.isSubprogram()) - return true; - if (D.isType()) - return isSubprogramContext(DIType(Context).getContext()); - return false; +void DIEnumerator::printInternal(raw_ostream &OS) const { + OS << " [" << getName() << " :: " << getEnumValue() << ']'; +} + +void DIType::printInternal(raw_ostream &OS) const { + if (!DbgNode) return; + + StringRef Res = getName(); + if (!Res.empty()) + OS << " [" << Res << "]"; + + // TODO: Print context? + + OS << " [line " << getLineNumber() + << ", size " << getSizeInBits() + << ", align " << getAlignInBits() + << ", offset " << getOffsetInBits(); + if (isBasicType()) + if (const char *Enc = + dwarf::AttributeEncodingString(DIBasicType(DbgNode).getEncoding())) + OS << ", enc " << Enc; + OS << "]"; + + if (isPrivate()) + OS << " [private]"; + else if (isProtected()) + OS << " [protected]"; + + if (isForwardDecl()) + OS << " [fwd]"; +} + +void DIDerivedType::printInternal(raw_ostream &OS) const { + DIType::printInternal(OS); + OS << " [from " << getTypeDerivedFrom().getName() << ']'; +} + +void DICompositeType::printInternal(raw_ostream &OS) const { + DIType::printInternal(OS); + DIArray A = getTypeArray(); + OS << " [" << A.getNumElements() << " elements]"; +} + +void DISubprogram::printInternal(raw_ostream &OS) const { + // TODO : Print context + OS << " [line " << getLineNumber() << ']'; + + if (isLocalToUnit()) + OS << " [local]"; + + if (isDefinition()) + OS << " [def]"; + + if (getScopeLineNumber() != getLineNumber()) + OS << " [scope " << getScopeLineNumber() << "]"; + + StringRef Res = getName(); + if (!Res.empty()) + OS << " [" << Res << ']'; +} + +void DIGlobalVariable::printInternal(raw_ostream &OS) const { + StringRef Res = getName(); + if (!Res.empty()) + OS << " [" << Res << ']'; + + OS << " [line " << getLineNumber() << ']'; + + // TODO : Print context + + if (isLocalToUnit()) + OS << " [local]"; + + if (isDefinition()) + OS << " [def]"; +} + +void DIVariable::printInternal(raw_ostream &OS) const { + StringRef Res = getName(); + if (!Res.empty()) + OS << " [" << Res << ']'; + + OS << " [line " << getLineNumber() << ']'; +} + +void DIObjCProperty::printInternal(raw_ostream &OS) const { + StringRef Name = getObjCPropertyName(); + if (!Name.empty()) + OS << " [" << Name << ']'; + + OS << " [line " << getLineNumber() + << ", properties " << getUnsignedField(6) << ']'; } +static void printDebugLoc(DebugLoc DL, raw_ostream &CommentOS, + const LLVMContext &Ctx) { + if (!DL.isUnknown()) { // Print source line info. + DIScope Scope(DL.getScope(Ctx)); + // Omit the directory, because it's likely to be long and uninteresting. 
+ if (Scope.Verify()) + CommentOS << Scope.getFilename(); + else + CommentOS << "<unknown>"; + CommentOS << ':' << DL.getLine(); + if (DL.getCol() != 0) + CommentOS << ':' << DL.getCol(); + DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(DL.getInlinedAt(Ctx)); + if (!InlinedAtDL.isUnknown()) { + CommentOS << " @[ "; + printDebugLoc(InlinedAtDL, CommentOS, Ctx); + CommentOS << " ]"; + } + } +} + +void DIVariable::printExtendedName(raw_ostream &OS) const { + const LLVMContext &Ctx = DbgNode->getContext(); + StringRef Res = getName(); + if (!Res.empty()) + OS << Res << "," << getLineNumber(); + if (MDNode *InlinedAt = getInlinedAt()) { + DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(InlinedAt); + if (!InlinedAtDL.isUnknown()) { + OS << " @["; + printDebugLoc(InlinedAtDL, OS, Ctx); + OS << "]"; + } + } +} diff --git a/lib/VMCore/DebugLoc.cpp b/lib/VMCore/DebugLoc.cpp index 9013d28..c6a3053 100644 --- a/lib/VMCore/DebugLoc.cpp +++ b/lib/VMCore/DebugLoc.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/DebugLoc.h" +#include "llvm/DebugInfo.h" #include "llvm/ADT/DenseMapInfo.h" #include "LLVMContextImpl.h" using namespace llvm; @@ -114,34 +115,19 @@ MDNode *DebugLoc::getAsMDNode(const LLVMContext &Ctx) const { /// getFromDILocation - Translate the DILocation quad into a DebugLoc. DebugLoc DebugLoc::getFromDILocation(MDNode *N) { - if (N == 0 || N->getNumOperands() != 4) return DebugLoc(); - - MDNode *Scope = dyn_cast_or_null<MDNode>(N->getOperand(2)); + DILocation Loc(N); + MDNode *Scope = Loc.getScope(); if (Scope == 0) return DebugLoc(); - - unsigned LineNo = 0, ColNo = 0; - if (ConstantInt *Line = dyn_cast_or_null<ConstantInt>(N->getOperand(0))) - LineNo = Line->getZExtValue(); - if (ConstantInt *Col = dyn_cast_or_null<ConstantInt>(N->getOperand(1))) - ColNo = Col->getZExtValue(); - - return get(LineNo, ColNo, Scope, dyn_cast_or_null<MDNode>(N->getOperand(3))); + return get(Loc.getLineNumber(), Loc.getColumnNumber(), Scope, + Loc.getOrigLocation()); } /// getFromDILexicalBlock - Translate the DILexicalBlock into a DebugLoc. 
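
The rewrite above routes the raw operand poking through the DILocation wrapper instead. A round-trip sketch (illustrative; ScopeNode is an assumed scope MDNode and Ctx the owning LLVMContext):

DebugLoc DL = DebugLoc::get(/*Line=*/42, /*Col=*/7, ScopeNode);
MDNode *Quad = DL.getAsMDNode(Ctx);          // the line/col/scope/origLoc quad
DebugLoc Back = DebugLoc::getFromDILocation(Quad);
assert(Back.getLine() == 42 && Back.getCol() == 7);
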
DebugLoc DebugLoc::getFromDILexicalBlock(MDNode *N) { - if (N == 0 || N->getNumOperands() < 3) return DebugLoc(); - - MDNode *Scope = dyn_cast_or_null<MDNode>(N->getOperand(1)); + DILexicalBlock LexBlock(N); + MDNode *Scope = LexBlock.getContext(); if (Scope == 0) return DebugLoc(); - - unsigned LineNo = 0, ColNo = 0; - if (ConstantInt *Line = dyn_cast_or_null<ConstantInt>(N->getOperand(2))) - LineNo = Line->getZExtValue(); - if (ConstantInt *Col = dyn_cast_or_null<ConstantInt>(N->getOperand(3))) - ColNo = Col->getZExtValue(); - - return get(LineNo, ColNo, Scope, NULL); + return get(LexBlock.getLineNumber(), LexBlock.getColumnNumber(), Scope, NULL); } void DebugLoc::dump(const LLVMContext &Ctx) const { @@ -164,22 +150,10 @@ void DebugLoc::dump(const LLVMContext &Ctx) const { // DenseMap specialization //===----------------------------------------------------------------------===// -DebugLoc DenseMapInfo<DebugLoc>::getEmptyKey() { - return DebugLoc::getEmptyKey(); -} - -DebugLoc DenseMapInfo<DebugLoc>::getTombstoneKey() { - return DebugLoc::getTombstoneKey(); -} - unsigned DenseMapInfo<DebugLoc>::getHashValue(const DebugLoc &Key) { return static_cast<unsigned>(hash_combine(Key.LineCol, Key.ScopeIdx)); } -bool DenseMapInfo<DebugLoc>::isEqual(const DebugLoc &LHS, const DebugLoc &RHS) { - return LHS == RHS; -} - //===----------------------------------------------------------------------===// // LLVMContextImpl Implementation //===----------------------------------------------------------------------===// diff --git a/lib/VMCore/Function.cpp b/lib/VMCore/Function.cpp index af6344e..2e0b316 100644 --- a/lib/VMCore/Function.cpp +++ b/lib/VMCore/Function.cpp @@ -29,7 +29,6 @@ #include "llvm/ADT/StringExtras.h" using namespace llvm; - // Explicit instantiations of SymbolTableListTraits since some of the methods // are not in the public header file... template class llvm::SymbolTableListTraits<Argument, Function>; @@ -358,17 +357,239 @@ std::string Intrinsic::getName(ID id, ArrayRef<Type*> Tys) { return Result; } -FunctionType *Intrinsic::getType(LLVMContext &Context, - ID id, ArrayRef<Type*> Tys) { - Type *ResultTy = NULL; - SmallVector<Type*, 8> ArgTys; - bool IsVarArg = false; + +/// IIT_Info - These are enumerators that describe the entries returned by the +/// getIntrinsicInfoTableEntries function. +/// +/// NOTE: This must be kept in synch with the copy in TblGen/IntrinsicEmitter! +enum IIT_Info { + // Common values should be encoded with 0-15. + IIT_Done = 0, + IIT_I1 = 1, + IIT_I8 = 2, + IIT_I16 = 3, + IIT_I32 = 4, + IIT_I64 = 5, + IIT_F32 = 6, + IIT_F64 = 7, + IIT_V2 = 8, + IIT_V4 = 9, + IIT_V8 = 10, + IIT_V16 = 11, + IIT_V32 = 12, + IIT_MMX = 13, + IIT_PTR = 14, + IIT_ARG = 15, -#define GET_INTRINSIC_GENERATOR + // Values from 16+ are only encodable with the inefficient encoding. 
+ IIT_METADATA = 16, + IIT_EMPTYSTRUCT = 17, + IIT_STRUCT2 = 18, + IIT_STRUCT3 = 19, + IIT_STRUCT4 = 20, + IIT_STRUCT5 = 21, + IIT_EXTEND_VEC_ARG = 22, + IIT_TRUNC_VEC_ARG = 23, + IIT_ANYPTR = 24 +}; + + +static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos, + SmallVectorImpl<Intrinsic::IITDescriptor> &OutputTable) { + IIT_Info Info = IIT_Info(Infos[NextElt++]); + unsigned StructElts = 2; + using namespace Intrinsic; + + switch (Info) { + case IIT_Done: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Void, 0)); + return; + case IIT_MMX: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::MMX, 0)); + return; + case IIT_METADATA: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Metadata, 0)); + return; + case IIT_F32: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Float, 0)); + return; + case IIT_F64: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Double, 0)); + return; + case IIT_I1: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 1)); + return; + case IIT_I8: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 8)); + return; + case IIT_I16: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer,16)); + return; + case IIT_I32: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 32)); + return; + case IIT_I64: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 64)); + return; + case IIT_V2: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 2)); + DecodeIITType(NextElt, Infos, OutputTable); + return; + case IIT_V4: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 4)); + DecodeIITType(NextElt, Infos, OutputTable); + return; + case IIT_V8: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 8)); + DecodeIITType(NextElt, Infos, OutputTable); + return; + case IIT_V16: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 16)); + DecodeIITType(NextElt, Infos, OutputTable); + return; + case IIT_V32: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 32)); + DecodeIITType(NextElt, Infos, OutputTable); + return; + case IIT_PTR: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Pointer, 0)); + DecodeIITType(NextElt, Infos, OutputTable); + return; + case IIT_ANYPTR: { // [ANYPTR addrspace, subtype] + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Pointer, + Infos[NextElt++])); + DecodeIITType(NextElt, Infos, OutputTable); + return; + } + case IIT_ARG: { + unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]); + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Argument, ArgInfo)); + return; + } + case IIT_EXTEND_VEC_ARG: { + unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]); + OutputTable.push_back(IITDescriptor::get(IITDescriptor::ExtendVecArgument, + ArgInfo)); + return; + } + case IIT_TRUNC_VEC_ARG: { + unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]); + OutputTable.push_back(IITDescriptor::get(IITDescriptor::TruncVecArgument, + ArgInfo)); + return; + } + case IIT_EMPTYSTRUCT: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Struct, 0)); + return; + case IIT_STRUCT5: ++StructElts; // FALL THROUGH. + case IIT_STRUCT4: ++StructElts; // FALL THROUGH. + case IIT_STRUCT3: ++StructElts; // FALL THROUGH. 
+ case IIT_STRUCT2: { + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Struct,StructElts)); + + for (unsigned i = 0; i != StructElts; ++i) + DecodeIITType(NextElt, Infos, OutputTable); + return; + } + } + llvm_unreachable("unhandled"); +} + + +#define GET_INTRINSIC_GENERATOR_GLOBAL #include "llvm/Intrinsics.gen" -#undef GET_INTRINSIC_GENERATOR +#undef GET_INTRINSIC_GENERATOR_GLOBAL + +void Intrinsic::getIntrinsicInfoTableEntries(ID id, + SmallVectorImpl<IITDescriptor> &T){ + // Check to see if the intrinsic's type was expressible by the table. + unsigned TableVal = IIT_Table[id-1]; + + // Decode the TableVal into an array of IITValues. + SmallVector<unsigned char, 8> IITValues; + ArrayRef<unsigned char> IITEntries; + unsigned NextElt = 0; + if ((TableVal >> 31) != 0) { + // This is an offset into the IIT_LongEncodingTable. + IITEntries = IIT_LongEncodingTable; + + // Strip sentinel bit. + NextElt = (TableVal << 1) >> 1; + } else { + // Decode the TableVal into an array of IITValues. If the entry was encoded + // into a single word in the table itself, decode it now. + do { + IITValues.push_back(TableVal & 0xF); + TableVal >>= 4; + } while (TableVal); + + IITEntries = IITValues; + NextElt = 0; + } - return FunctionType::get(ResultTy, ArgTys, IsVarArg); + // Okay, decode the table into the output vector of IITDescriptors. + DecodeIITType(NextElt, IITEntries, T); + while (NextElt != IITEntries.size() && IITEntries[NextElt] != 0) + DecodeIITType(NextElt, IITEntries, T); +} + + +static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos, + ArrayRef<Type*> Tys, LLVMContext &Context) { + using namespace Intrinsic; + IITDescriptor D = Infos.front(); + Infos = Infos.slice(1); + + switch (D.Kind) { + case IITDescriptor::Void: return Type::getVoidTy(Context); + case IITDescriptor::MMX: return Type::getX86_MMXTy(Context); + case IITDescriptor::Metadata: return Type::getMetadataTy(Context); + case IITDescriptor::Float: return Type::getFloatTy(Context); + case IITDescriptor::Double: return Type::getDoubleTy(Context); + + case IITDescriptor::Integer: + return IntegerType::get(Context, D.Integer_Width); + case IITDescriptor::Vector: + return VectorType::get(DecodeFixedType(Infos, Tys, Context),D.Vector_Width); + case IITDescriptor::Pointer: + return PointerType::get(DecodeFixedType(Infos, Tys, Context), + D.Pointer_AddressSpace); + case IITDescriptor::Struct: { + Type *Elts[5]; + assert(D.Struct_NumElements <= 5 && "Can't handle this yet"); + for (unsigned i = 0, e = D.Struct_NumElements; i != e; ++i) + Elts[i] = DecodeFixedType(Infos, Tys, Context); + return StructType::get(Context, ArrayRef<Type*>(Elts,D.Struct_NumElements)); + } + + case IITDescriptor::Argument: + return Tys[D.getArgumentNumber()]; + case IITDescriptor::ExtendVecArgument: + return VectorType::getExtendedElementVectorType(cast<VectorType>( + Tys[D.getArgumentNumber()])); + + case IITDescriptor::TruncVecArgument: + return VectorType::getTruncatedElementVectorType(cast<VectorType>( + Tys[D.getArgumentNumber()])); + } + llvm_unreachable("unhandled"); +} + + + +FunctionType *Intrinsic::getType(LLVMContext &Context, + ID id, ArrayRef<Type*> Tys) { + SmallVector<IITDescriptor, 8> Table; + getIntrinsicInfoTableEntries(id, Table); + + ArrayRef<IITDescriptor> TableRef = Table; + Type *ResultTy = DecodeFixedType(TableRef, Tys, Context); + + SmallVector<Type*, 8> ArgTys; + while (!TableRef.empty()) + ArgTys.push_back(DecodeFixedType(TableRef, Tys, Context)); + + return FunctionType::get(ResultTy, ArgTys, false); } bool 
Intrinsic::isOverloaded(ID id) { @@ -400,7 +621,8 @@ Function *Intrinsic::getDeclaration(Module *M, ID id, ArrayRef<Type*> Tys) { bool Function::hasAddressTaken(const User* *PutOffender) const { for (Value::const_use_iterator I = use_begin(), E = use_end(); I != E; ++I) { const User *U = *I; - // FIXME: Check for blockaddress, which does not take the address. + if (isa<BlockAddress>(U)) + continue; if (!isa<CallInst>(U) && !isa<InvokeInst>(U)) return PutOffender ? (*PutOffender = U, true) : true; ImmutableCallSite CS(cast<Instruction>(U)); @@ -439,4 +661,3 @@ bool Function::callsFunctionThatReturnsTwice() const { return false; } -// vim: sw=2 ai diff --git a/lib/VMCore/GCOV.cpp b/lib/VMCore/GCOV.cpp index 595c452..003a5d4 100644 --- a/lib/VMCore/GCOV.cpp +++ b/lib/VMCore/GCOV.cpp @@ -64,7 +64,7 @@ bool GCOVFile::read(GCOVBuffer &Buffer) { /// dump - Dump GCOVFile content on standard out for debugging purposes. void GCOVFile::dump() { for (SmallVector<GCOVFunction *, 16>::iterator I = Functions.begin(), - E = Functions.end(); I != E; ++I) + E = Functions.end(); I != E; ++I) (*I)->dump(); } @@ -72,7 +72,7 @@ void GCOVFile::dump() { /// reading .gcno and .gcda files. void GCOVFile::collectLineCounts(FileInfo &FI) { for (SmallVector<GCOVFunction *, 16>::iterator I = Functions.begin(), - E = Functions.end(); I != E; ++I) + E = Functions.end(); I != E; ++I) (*I)->collectLineCounts(FI); FI.print(); } @@ -143,7 +143,7 @@ bool GCOVFunction::read(GCOVBuffer &Buff, GCOVFormat Format) { StringRef Filename = Buff.readString(); if (Buff.getCursor() == (Size - 4)) break; while (uint32_t L = Buff.readInt()) - Block->addLine(Filename, L); + Block->addLine(Filename, L); } Buff.readInt(); // flag } @@ -154,7 +154,7 @@ bool GCOVFunction::read(GCOVBuffer &Buff, GCOVFormat Format) { void GCOVFunction::dump() { outs() << "===== " << Name << " @ " << Filename << ":" << LineNumber << "\n"; for (SmallVector<GCOVBlock *, 16>::iterator I = Blocks.begin(), - E = Blocks.end(); I != E; ++I) + E = Blocks.end(); I != E; ++I) (*I)->dump(); } @@ -162,7 +162,7 @@ void GCOVFunction::dump() { /// reading .gcno and .gcda files. void GCOVFunction::collectLineCounts(FileInfo &FI) { for (SmallVector<GCOVBlock *, 16>::iterator I = Blocks.begin(), - E = Blocks.end(); I != E; ++I) + E = Blocks.end(); I != E; ++I) (*I)->collectLineCounts(FI); } @@ -186,7 +186,7 @@ void GCOVBlock::addLine(StringRef Filename, uint32_t LineNo) { /// reading .gcno and .gcda files. void GCOVBlock::collectLineCounts(FileInfo &FI) { for (StringMap<GCOVLines *>::iterator I = Lines.begin(), - E = Lines.end(); I != E; ++I) + E = Lines.end(); I != E; ++I) I->second->collectLineCounts(FI, I->first(), Counter); } @@ -196,14 +196,14 @@ void GCOVBlock::dump() { if (!Edges.empty()) { outs() << "\tEdges : "; for (SmallVector<uint32_t, 16>::iterator I = Edges.begin(), E = Edges.end(); - I != E; ++I) + I != E; ++I) outs() << (*I) << ","; outs() << "\n"; } if (!Lines.empty()) { outs() << "\tLines : "; for (StringMap<GCOVLines *>::iterator LI = Lines.begin(), - LE = Lines.end(); LI != LE; ++LI) { + LE = Lines.end(); LI != LE; ++LI) { outs() << LI->first() << " -> "; LI->second->dump(); outs() << "\n"; @@ -217,16 +217,16 @@ void GCOVBlock::dump() { /// collectLineCounts - Collect line counts. This must be used after /// reading .gcno and .gcda files. 
void GCOVLines::collectLineCounts(FileInfo &FI, StringRef Filename, - uint32_t Count) { + uint32_t Count) { for (SmallVector<uint32_t, 16>::iterator I = Lines.begin(), - E = Lines.end(); I != E; ++I) + E = Lines.end(); I != E; ++I) FI.addLineCount(Filename, *I, Count); } /// dump - Dump GCOVLines content on standard out for debugging purposes. void GCOVLines::dump() { for (SmallVector<uint32_t, 16>::iterator I = Lines.begin(), - E = Lines.end(); I != E; ++I) + E = Lines.end(); I != E; ++I) outs() << (*I) << ","; } @@ -266,12 +266,12 @@ void FileInfo::print() { StringRef AllLines = Buff.take()->getBuffer(); for (unsigned i = 0, e = L.size(); i != e; ++i) { if (L[i]) - outs() << L[i] << ":\t"; + outs() << L[i] << ":\t"; else - outs() << " :\t"; + outs() << " :\t"; std::pair<StringRef, StringRef> P = AllLines.split('\n'); if (AllLines != P.first) - outs() << P.first; + outs() << P.first; outs() << "\n"; AllLines = P.second; } diff --git a/lib/VMCore/Globals.cpp b/lib/VMCore/Globals.cpp index 4254fb2..c428b88 100644 --- a/lib/VMCore/Globals.cpp +++ b/lib/VMCore/Globals.cpp @@ -82,12 +82,12 @@ bool GlobalValue::isDeclaration() const { GlobalVariable::GlobalVariable(Type *Ty, bool constant, LinkageTypes Link, Constant *InitVal, const Twine &Name, - bool ThreadLocal, unsigned AddressSpace) - : GlobalValue(PointerType::get(Ty, AddressSpace), + ThreadLocalMode TLMode, unsigned AddressSpace) + : GlobalValue(PointerType::get(Ty, AddressSpace), Value::GlobalVariableVal, OperandTraits<GlobalVariable>::op_begin(this), InitVal != 0, Link, Name), - isConstantGlobal(constant), isThreadLocalSymbol(ThreadLocal) { + isConstantGlobal(constant), threadLocalMode(TLMode) { if (InitVal) { assert(InitVal->getType() == Ty && "Initializer should be the same type as the GlobalVariable!"); @@ -100,13 +100,13 @@ GlobalVariable::GlobalVariable(Type *Ty, bool constant, LinkageTypes Link, GlobalVariable::GlobalVariable(Module &M, Type *Ty, bool constant, LinkageTypes Link, Constant *InitVal, const Twine &Name, - GlobalVariable *Before, bool ThreadLocal, + GlobalVariable *Before, ThreadLocalMode TLMode, unsigned AddressSpace) - : GlobalValue(PointerType::get(Ty, AddressSpace), + : GlobalValue(PointerType::get(Ty, AddressSpace), Value::GlobalVariableVal, OperandTraits<GlobalVariable>::op_begin(this), InitVal != 0, Link, Name), - isConstantGlobal(constant), isThreadLocalSymbol(ThreadLocal) { + isConstantGlobal(constant), threadLocalMode(TLMode) { if (InitVal) { assert(InitVal->getType() == Ty && "Initializer should be the same type as the GlobalVariable!"); diff --git a/lib/VMCore/IRBuilder.cpp b/lib/VMCore/IRBuilder.cpp index b459234..5c4e6d9 100644 --- a/lib/VMCore/IRBuilder.cpp +++ b/lib/VMCore/IRBuilder.cpp @@ -12,9 +12,9 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Support/IRBuilder.h" -#include "llvm/GlobalVariable.h" #include "llvm/Function.h" +#include "llvm/GlobalVariable.h" +#include "llvm/IRBuilder.h" #include "llvm/Intrinsics.h" #include "llvm/LLVMContext.h" using namespace llvm; @@ -28,7 +28,7 @@ Value *IRBuilderBase::CreateGlobalString(StringRef Str, const Twine &Name) { Module &M = *BB->getParent()->getParent(); GlobalVariable *GV = new GlobalVariable(M, StrConstant->getType(), true, GlobalValue::PrivateLinkage, - StrConstant, "", 0, false); + StrConstant); GV->setName(Name); GV->setUnnamedAddr(true); return GV; @@ -120,13 +120,13 @@ CreateMemMove(Value *Dst, Value *Src, Value *Size, unsigned Align, CallInst *IRBuilderBase::CreateLifetimeStart(Value 
*Ptr, ConstantInt *Size) { assert(isa<PointerType>(Ptr->getType()) && - "lifetime.start only applies to pointers."); + "lifetime.start only applies to pointers."); Ptr = getCastedInt8PtrValue(Ptr); if (!Size) Size = getInt64(-1); else assert(Size->getType() == getInt64Ty() && - "lifetime.start requires the size to be an i64"); + "lifetime.start requires the size to be an i64"); Value *Ops[] = { Size, Ptr }; Module *M = BB->getParent()->getParent(); Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::lifetime_start); @@ -135,13 +135,13 @@ CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) { CallInst *IRBuilderBase::CreateLifetimeEnd(Value *Ptr, ConstantInt *Size) { assert(isa<PointerType>(Ptr->getType()) && - "lifetime.end only applies to pointers."); + "lifetime.end only applies to pointers."); Ptr = getCastedInt8PtrValue(Ptr); if (!Size) Size = getInt64(-1); else assert(Size->getType() == getInt64Ty() && - "lifetime.end requires the size to be an i64"); + "lifetime.end requires the size to be an i64"); Value *Ops[] = { Size, Ptr }; Module *M = BB->getParent()->getParent(); Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::lifetime_end); diff --git a/lib/VMCore/Instruction.cpp b/lib/VMCore/Instruction.cpp index 5449714..66379a0 100644 --- a/lib/VMCore/Instruction.cpp +++ b/lib/VMCore/Instruction.cpp @@ -226,34 +226,52 @@ bool Instruction::isIdenticalToWhenDefined(const Instruction *I) const { RMWI->isVolatile() == cast<AtomicRMWInst>(I)->isVolatile() && RMWI->getOrdering() == cast<AtomicRMWInst>(I)->getOrdering() && RMWI->getSynchScope() == cast<AtomicRMWInst>(I)->getSynchScope(); - + if (const PHINode *thisPHI = dyn_cast<PHINode>(this)) { + const PHINode *otherPHI = cast<PHINode>(I); + for (unsigned i = 0, e = thisPHI->getNumOperands(); i != e; ++i) { + if (thisPHI->getIncomingBlock(i) != otherPHI->getIncomingBlock(i)) + return false; + } + return true; + } return true; } // isSameOperationAs // This should be kept in sync with isEquivalentOperation in // lib/Transforms/IPO/MergeFunctions.cpp. -bool Instruction::isSameOperationAs(const Instruction *I) const { +bool Instruction::isSameOperationAs(const Instruction *I, + unsigned flags) const { + bool IgnoreAlignment = flags & CompareIgnoringAlignment; + bool UseScalarTypes = flags & CompareUsingScalarTypes; + if (getOpcode() != I->getOpcode() || getNumOperands() != I->getNumOperands() || - getType() != I->getType()) + (UseScalarTypes ? + getType()->getScalarType() != I->getType()->getScalarType() : + getType() != I->getType())) return false; // We have two instructions of identical opcode and #operands. Check to see // if all operands are the same type for (unsigned i = 0, e = getNumOperands(); i != e; ++i) - if (getOperand(i)->getType() != I->getOperand(i)->getType()) + if (UseScalarTypes ? + getOperand(i)->getType()->getScalarType() != + I->getOperand(i)->getType()->getScalarType() : + getOperand(i)->getType() != I->getOperand(i)->getType()) return false; // Check special state that is a part of some instructions. 
   if (const LoadInst *LI = dyn_cast<LoadInst>(this))
     return LI->isVolatile() == cast<LoadInst>(I)->isVolatile() &&
-           LI->getAlignment() == cast<LoadInst>(I)->getAlignment() &&
+           (LI->getAlignment() == cast<LoadInst>(I)->getAlignment() ||
+            IgnoreAlignment) &&
            LI->getOrdering() == cast<LoadInst>(I)->getOrdering() &&
            LI->getSynchScope() == cast<LoadInst>(I)->getSynchScope();
   if (const StoreInst *SI = dyn_cast<StoreInst>(this))
     return SI->isVolatile() == cast<StoreInst>(I)->isVolatile() &&
-           SI->getAlignment() == cast<StoreInst>(I)->getAlignment() &&
+           (SI->getAlignment() == cast<StoreInst>(I)->getAlignment() ||
+            IgnoreAlignment) &&
            SI->getOrdering() == cast<StoreInst>(I)->getOrdering() &&
            SI->getSynchScope() == cast<StoreInst>(I)->getSynchScope();
   if (const CmpInst *CI = dyn_cast<CmpInst>(this))
@@ -388,6 +406,29 @@ bool Instruction::isCommutative(unsigned op) {
   }
 }
 
+/// isIdempotent - Return true if the instruction is idempotent:
+///
+/// Idempotent operators satisfy: x op x === x
+///
+/// In LLVM, the And and Or operators are idempotent.
+///
+bool Instruction::isIdempotent(unsigned Opcode) {
+  return Opcode == And || Opcode == Or;
+}
+
+/// isNilpotent - Return true if the instruction is nilpotent:
+///
+/// Nilpotent operators satisfy: x op x === Id,
+///
+/// where Id is the identity for the operator, i.e. a constant such that
+/// x op Id === x and Id op x === x for all x.
+///
+/// In LLVM, the Xor operator is nilpotent.
+///
+bool Instruction::isNilpotent(unsigned Opcode) {
+  return Opcode == Xor;
+}
+
 Instruction *Instruction::clone() const {
   Instruction *New = clone_impl();
   New->SubclassOptionalData = SubclassOptionalData;
diff --git a/lib/VMCore/Instructions.cpp b/lib/VMCore/Instructions.cpp
index 6c5db32..9af98e8 100644
--- a/lib/VMCore/Instructions.cpp
+++ b/lib/VMCore/Instructions.cpp
@@ -161,8 +161,14 @@ Value *PHINode::hasConstantValue() const {
   // Exploit the fact that phi nodes always have at least one entry.
   Value *ConstantValue = getIncomingValue(0);
   for (unsigned i = 1, e = getNumIncomingValues(); i != e; ++i)
-    if (getIncomingValue(i) != ConstantValue)
-      return 0; // Incoming values not all the same.
+    if (getIncomingValue(i) != ConstantValue && getIncomingValue(i) != this) {
+      if (ConstantValue != this)
+        return 0; // Incoming values not all the same.
+      // The case where the first value is this PHI.
+      ConstantValue = getIncomingValue(i);
+    }
+  if (ConstantValue == this)
+    return UndefValue::get(getType());
   return ConstantValue;
 }
 
@@ -3158,6 +3164,7 @@ SwitchInst::SwitchInst(const SwitchInst &SI)
     OL[i] = InOL[i];
     OL[i+1] = InOL[i+1];
   }
+  TheSubsets = SI.TheSubsets;
   SubclassOptionalData = SI.SubclassOptionalData;
 }
 
@@ -3169,6 +3176,16 @@ SwitchInst::~SwitchInst() {
 /// addCase - Add an entry to the switch instruction...
 ///
 void SwitchInst::addCase(ConstantInt *OnVal, BasicBlock *Dest) {
+  IntegersSubsetToBB Mapping;
+
+  // FIXME: Currently we work with ConstantInt based cases.
+  // So initialize IntItem container directly from ConstantInt.
+  Mapping.add(IntItem::fromConstantInt(OnVal));
+  IntegersSubset CaseRanges = Mapping.getCase();
+  addCase(CaseRanges, Dest);
+}
+
+void SwitchInst::addCase(IntegersSubset& OnVal, BasicBlock *Dest) {
   unsigned NewCaseIdx = getNumCases();
   unsigned OpNo = NumOperands;
   if (OpNo+2 > ReservedSpace)
@@ -3176,14 +3193,17 @@ void SwitchInst::addCase(ConstantInt *OnVal, BasicBlock *Dest) {
   // Initialize some new operands.
assert(OpNo+1 < ReservedSpace && "Growing didn't work!"); NumOperands = OpNo+2; - CaseIt Case(this, NewCaseIdx); - Case.setValue(OnVal); + + SubsetsIt TheSubsetsIt = TheSubsets.insert(TheSubsets.end(), OnVal); + + CaseIt Case(this, NewCaseIdx, TheSubsetsIt); + Case.updateCaseValueOperand(OnVal); Case.setSuccessor(Dest); } /// removeCase - This method removes the specified case and its successor /// from the switch instruction. -void SwitchInst::removeCase(CaseIt i) { +void SwitchInst::removeCase(CaseIt& i) { unsigned idx = i.getCaseIndex(); assert(2 + idx*2 < getNumOperands() && "Case index out of range!!!"); @@ -3200,6 +3220,16 @@ void SwitchInst::removeCase(CaseIt i) { // Nuke the last value. OL[NumOps-2].set(0); OL[NumOps-2+1].set(0); + + // Do the same with TheCases collection: + if (i.SubsetIt != --TheSubsets.end()) { + *i.SubsetIt = TheSubsets.back(); + TheSubsets.pop_back(); + } else { + TheSubsets.pop_back(); + i.SubsetIt = TheSubsets.end(); + } + NumOperands = NumOps-2; } diff --git a/lib/VMCore/Metadata.cpp b/lib/VMCore/Metadata.cpp index 090b09a..ede4626 100644 --- a/lib/VMCore/Metadata.cpp +++ b/lib/VMCore/Metadata.cpp @@ -21,6 +21,7 @@ #include "llvm/ADT/SmallString.h" #include "llvm/ADT/STLExtras.h" #include "SymbolTableListTraitsImpl.h" +#include "llvm/Support/ConstantRange.h" #include "llvm/Support/LeakDetector.h" #include "llvm/Support/ValueHandle.h" using namespace llvm; @@ -66,7 +67,11 @@ public: MDNodeOperand(Value *V) : CallbackVH(V) {} ~MDNodeOperand() {} - void set(Value *V) { this->setValPtr(V); } + void set(Value *V) { + unsigned IsFirst = this->getValPtrInt(); + this->setValPtr(V); + this->setAsFirstOperand(IsFirst); + } /// setAsFirstOperand - Accessor method to mark the operand as the first in /// the list. @@ -95,7 +100,7 @@ void MDNodeOperand::allUsesReplacedWith(Value *NV) { static MDNodeOperand *getOperandPtr(MDNode *N, unsigned Op) { // Use <= instead of < to permit a one-past-the-end address. assert(Op <= N->getNumOperands() && "Invalid operand number"); - return reinterpret_cast<MDNodeOperand*>(N+1)+Op; + return reinterpret_cast<MDNodeOperand*>(N + 1) + Op; } void MDNode::replaceOperandWith(unsigned i, Value *Val) { @@ -122,7 +127,6 @@ MDNode::MDNode(LLVMContext &C, ArrayRef<Value*> Vals, bool isFunctionLocal) } } - /// ~MDNode - Destroy MDNode. MDNode::~MDNode() { assert((getSubclassDataFromValue() & DestroyFlag) != 0 && @@ -247,7 +251,7 @@ MDNode *MDNode::getMDNode(LLVMContext &Context, ArrayRef<Value*> Vals, } // Coallocate space for the node and Operands together, then placement new. - void *Ptr = malloc(sizeof(MDNode)+Vals.size()*sizeof(MDNodeOperand)); + void *Ptr = malloc(sizeof(MDNode) + Vals.size() * sizeof(MDNodeOperand)); N = new (Ptr) MDNode(Context, Vals, isFunctionLocal); // Cache the operand hash. 
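
[Editorial aside] The Metadata.cpp hunks above keep MDNode's operand array in the same malloc block as the node itself: one allocation of sizeof(MDNode) plus the operand array, placement-new for the header, and operand i found at reinterpret_cast<MDNodeOperand*>(N + 1) + i. Here is a minimal self-contained sketch of that co-allocation pattern, with a toy Node in place of MDNode and plain ints in place of operands; the real code must also guarantee the trailing array's alignment, which holds in this sketch because int is no more strictly aligned than the header.

#include <cstdio>
#include <cstdlib>
#include <new>

struct Node {
  unsigned NumOps;
  explicit Node(unsigned N) : NumOps(N) {}
  // The operand array lives immediately after the node header.
  int *op_begin() { return reinterpret_cast<int *>(this + 1); }
};

int main() {
  unsigned NumOps = 3;
  // Co-allocate space for the node and operands together, then placement-new.
  void *Mem = std::malloc(sizeof(Node) + NumOps * sizeof(int));
  Node *N = new (Mem) Node(NumOps);
  for (unsigned i = 0; i != NumOps; ++i)
    N->op_begin()[i] = int(i * 10);
  for (unsigned i = 0; i != NumOps; ++i)
    std::printf("op %u = %d\n", i, N->op_begin()[i]);
  N->~Node();           // destroy in place, then free the single block
  std::free(Mem);
}
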
@@ -275,7 +279,7 @@ MDNode *MDNode::getIfExists(LLVMContext &Context, ArrayRef<Value*> Vals) {
 
 MDNode *MDNode::getTemporary(LLVMContext &Context, ArrayRef<Value*> Vals) {
   MDNode *N =
-    (MDNode *)malloc(sizeof(MDNode)+Vals.size()*sizeof(MDNodeOperand));
+    (MDNode *)malloc(sizeof(MDNode) + Vals.size() * sizeof(MDNodeOperand));
   N = new (N) MDNode(Context, Vals, FL_No);
   N->setValueSubclassData(N->getSubclassDataFromValue() |
                           NotUniquedBit);
@@ -398,6 +402,155 @@ void MDNode::replaceOperand(MDNodeOperand *Op, Value *To) {
   }
 }
 
+MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) {
+  if (!A || !B)
+    return NULL;
+
+  if (A == B)
+    return A;
+
+  SmallVector<MDNode *, 4> PathA;
+  MDNode *T = A;
+  while (T) {
+    PathA.push_back(T);
+    T = T->getNumOperands() >= 2 ? cast_or_null<MDNode>(T->getOperand(1)) : 0;
+  }
+
+  SmallVector<MDNode *, 4> PathB;
+  T = B;
+  while (T) {
+    PathB.push_back(T);
+    T = T->getNumOperands() >= 2 ? cast_or_null<MDNode>(T->getOperand(1)) : 0;
+  }
+
+  int IA = PathA.size() - 1;
+  int IB = PathB.size() - 1;
+
+  MDNode *Ret = 0;
+  while (IA >= 0 && IB >= 0) {
+    if (PathA[IA] == PathB[IB])
+      Ret = PathA[IA];
+    else
+      break;
+    --IA;
+    --IB;
+  }
+  return Ret;
+}
+
+MDNode *MDNode::getMostGenericFPMath(MDNode *A, MDNode *B) {
+  if (!A || !B)
+    return NULL;
+
+  APFloat AVal = cast<ConstantFP>(A->getOperand(0))->getValueAPF();
+  APFloat BVal = cast<ConstantFP>(B->getOperand(0))->getValueAPF();
+  if (AVal.compare(BVal) == APFloat::cmpLessThan)
+    return A;
+  return B;
+}
+
+static bool isContiguous(const ConstantRange &A, const ConstantRange &B) {
+  return A.getUpper() == B.getLower() || A.getLower() == B.getUpper();
+}
+
+static bool canBeMerged(const ConstantRange &A, const ConstantRange &B) {
+  return !A.intersectWith(B).isEmptySet() || isContiguous(A, B);
+}
+
+static bool tryMergeRange(SmallVector<Value*, 4> &EndPoints, ConstantInt *Low,
+                          ConstantInt *High) {
+  ConstantRange NewRange(Low->getValue(), High->getValue());
+  unsigned Size = EndPoints.size();
+  APInt LB = cast<ConstantInt>(EndPoints[Size - 2])->getValue();
+  APInt LE = cast<ConstantInt>(EndPoints[Size - 1])->getValue();
+  ConstantRange LastRange(LB, LE);
+  if (canBeMerged(NewRange, LastRange)) {
+    ConstantRange Union = LastRange.unionWith(NewRange);
+    Type *Ty = High->getType();
+    EndPoints[Size - 2] = ConstantInt::get(Ty, Union.getLower());
+    EndPoints[Size - 1] = ConstantInt::get(Ty, Union.getUpper());
+    return true;
+  }
+  return false;
+}
+
+static void addRange(SmallVector<Value*, 4> &EndPoints, ConstantInt *Low,
+                     ConstantInt *High) {
+  if (!EndPoints.empty())
+    if (tryMergeRange(EndPoints, Low, High))
+      return;
+
+  EndPoints.push_back(Low);
+  EndPoints.push_back(High);
+}
+
+MDNode *MDNode::getMostGenericRange(MDNode *A, MDNode *B) {
+  // Given two ranges, we want to compute the union of the ranges. This
+  // is slightly complicated by having to combine the intervals and merge
+  // the ones that overlap.
+
+  if (!A || !B)
+    return NULL;
+
+  if (A == B)
+    return A;
+
+  // First, walk both lists in order of the lower boundary of each interval.
+  // At each step, try to merge the new interval to the last one we added.
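
[Editorial aside] The loop that follows merges the two operand lists the way a merge sort would, always consuming the interval with the smaller lower bound and folding it into the last interval emitted when the two overlap or touch. The same idea, reduced to plain signed ints over half-open [lo, hi) intervals, is sketched below; there is no APInt and no wrap-around range handling here, and Range and the local addRange are names invented for the sketch.

#include <algorithm>
#include <cstdio>
#include <vector>

struct Range { int Lo, Hi; }; // half-open [Lo, Hi)

static void addRange(std::vector<Range> &Out, Range R) {
  if (!Out.empty()) {
    Range &Last = Out.back();
    // Overlapping or contiguous with the last interval emitted: merge.
    if (R.Lo <= Last.Hi && Last.Lo <= R.Hi) {
      Last.Lo = std::min(Last.Lo, R.Lo);
      Last.Hi = std::max(Last.Hi, R.Hi);
      return;
    }
  }
  Out.push_back(R);
}

int main() {
  std::vector<Range> A = {{0, 4}, {10, 12}};
  std::vector<Range> B = {{4, 6}, {11, 15}};
  std::vector<Range> Out;
  size_t AI = 0, BI = 0;
  while (AI < A.size() && BI < B.size())   // consume the smaller lower bound
    addRange(Out, A[AI].Lo < B[BI].Lo ? A[AI++] : B[BI++]);
  while (AI < A.size()) addRange(Out, A[AI++]);
  while (BI < B.size()) addRange(Out, B[BI++]);
  for (auto R : Out)
    std::printf("[%d, %d)\n", R.Lo, R.Hi);  // prints [0, 6) and [10, 15)
}
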
+ SmallVector<Value*, 4> EndPoints; + int AI = 0; + int BI = 0; + int AN = A->getNumOperands() / 2; + int BN = B->getNumOperands() / 2; + while (AI < AN && BI < BN) { + ConstantInt *ALow = cast<ConstantInt>(A->getOperand(2 * AI)); + ConstantInt *BLow = cast<ConstantInt>(B->getOperand(2 * BI)); + + if (ALow->getValue().slt(BLow->getValue())) { + addRange(EndPoints, ALow, cast<ConstantInt>(A->getOperand(2 * AI + 1))); + ++AI; + } else { + addRange(EndPoints, BLow, cast<ConstantInt>(B->getOperand(2 * BI + 1))); + ++BI; + } + } + while (AI < AN) { + addRange(EndPoints, cast<ConstantInt>(A->getOperand(2 * AI)), + cast<ConstantInt>(A->getOperand(2 * AI + 1))); + ++AI; + } + while (BI < BN) { + addRange(EndPoints, cast<ConstantInt>(B->getOperand(2 * BI)), + cast<ConstantInt>(B->getOperand(2 * BI + 1))); + ++BI; + } + + // If we have more than 2 ranges (4 endpoints) we have to try to merge + // the last and first ones. + unsigned Size = EndPoints.size(); + if (Size > 4) { + ConstantInt *FB = cast<ConstantInt>(EndPoints[0]); + ConstantInt *FE = cast<ConstantInt>(EndPoints[1]); + if (tryMergeRange(EndPoints, FB, FE)) { + for (unsigned i = 0; i < Size - 2; ++i) { + EndPoints[i] = EndPoints[i + 2]; + } + EndPoints.resize(Size - 2); + } + } + + // If in the end we have a single range, it is possible that it is now the + // full range. Just drop the metadata in that case. + if (EndPoints.size() == 2) { + ConstantRange Range(cast<ConstantInt>(EndPoints[0])->getValue(), + cast<ConstantInt>(EndPoints[1])->getValue()); + if (Range.isFullSet()) + return NULL; + } + + return MDNode::get(A->getContext(), EndPoints); +} + //===----------------------------------------------------------------------===// // NamedMDNode implementation. // diff --git a/lib/VMCore/Module.cpp b/lib/VMCore/Module.cpp index 3c67191..8ea3665 100644 --- a/lib/VMCore/Module.cpp +++ b/lib/VMCore/Module.cpp @@ -65,20 +65,20 @@ Module::~Module() { Module::Endianness Module::getEndianness() const { StringRef temp = DataLayout; Module::Endianness ret = AnyEndianness; - + while (!temp.empty()) { std::pair<StringRef, StringRef> P = getToken(temp, "-"); - + StringRef token = P.first; temp = P.second; - + if (token[0] == 'e') { ret = LittleEndian; } else if (token[0] == 'E') { ret = BigEndian; } } - + return ret; } @@ -86,13 +86,13 @@ Module::Endianness Module::getEndianness() const { Module::PointerSize Module::getPointerSize() const { StringRef temp = DataLayout; Module::PointerSize ret = AnyPointerSize; - + while (!temp.empty()) { std::pair<StringRef, StringRef> TmpP = getToken(temp, "-"); temp = TmpP.second; TmpP = getToken(TmpP.first, ":"); StringRef token = TmpP.second, signalToken = TmpP.first; - + if (signalToken[0] == 'p') { int size = 0; getToken(token, ":").first.getAsInteger(10, size); @@ -102,7 +102,7 @@ Module::PointerSize Module::getPointerSize() const { ret = Pointer64; } } - + return ret; } @@ -164,9 +164,9 @@ Constant *Module::getOrInsertFunction(StringRef Name, // right type. if (F->getType() != PointerType::getUnqual(Ty)) return ConstantExpr::getBitCast(F, PointerType::getUnqual(Ty)); - + // Otherwise, we just found the existing function or a prototype. - return F; + return F; } Constant *Module::getOrInsertTargetIntrinsic(StringRef Name, @@ -183,13 +183,12 @@ Constant *Module::getOrInsertTargetIntrinsic(StringRef Name, } // Otherwise, we just found the existing function or a prototype. 
- return F; + return F; } Constant *Module::getOrInsertFunction(StringRef Name, FunctionType *Ty) { - AttrListPtr AttributeList = AttrListPtr::get((AttributeWithIndex *)0, 0); - return getOrInsertFunction(Name, Ty, AttributeList); + return getOrInsertFunction(Name, Ty, AttrListPtr()); } // getOrInsertFunction - Look up the specified function in the module symbol @@ -229,9 +228,9 @@ Constant *Module::getOrInsertFunction(StringRef Name, va_end(Args); // Build the function type and chain to the other getOrInsertFunction... - return getOrInsertFunction(Name, + return getOrInsertFunction(Name, FunctionType::get(RetTy, ArgTys, false), - AttrListPtr::get((AttributeWithIndex *)0, 0)); + AttrListPtr()); } // getFunction - Look up the specified function in the module symbol table. @@ -254,7 +253,7 @@ Function *Module::getFunction(StringRef Name) const { /// GlobalVariable *Module::getGlobalVariable(StringRef Name, bool AllowLocal) const { - if (GlobalVariable *Result = + if (GlobalVariable *Result = dyn_cast_or_null<GlobalVariable>(getNamedValue(Name))) if (AllowLocal || !Result->hasLocalLinkage()) return Result; @@ -282,7 +281,7 @@ Constant *Module::getOrInsertGlobal(StringRef Name, Type *Ty) { // right type. if (GV->getType() != PointerType::getUnqual(Ty)) return ConstantExpr::getBitCast(GV, PointerType::getUnqual(Ty)); - + // Otherwise, we just found the existing function or a prototype. return GV; } @@ -299,7 +298,7 @@ GlobalAlias *Module::getNamedAlias(StringRef Name) const { } /// getNamedMetadata - Return the first NamedMDNode in the module with the -/// specified name. This method returns null if a NamedMDNode with the +/// specified name. This method returns null if a NamedMDNode with the /// specified name is not found. NamedMDNode *Module::getNamedMetadata(const Twine &Name) const { SmallString<256> NameData; @@ -307,8 +306,8 @@ NamedMDNode *Module::getNamedMetadata(const Twine &Name) const { return static_cast<StringMap<NamedMDNode*> *>(NamedMDSymTab)->lookup(NameRef); } -/// getOrInsertNamedMetadata - Return the first named MDNode in the module -/// with the specified name. This method returns a new NamedMDNode if a +/// getOrInsertNamedMetadata - Return the first named MDNode in the module +/// with the specified name. This method returns a new NamedMDNode if a /// NamedMDNode with the specified name is not found. NamedMDNode *Module::getOrInsertNamedMetadata(StringRef Name) { NamedMDNode *&NMD = @@ -481,12 +480,13 @@ namespace { // objects, we keep several helper maps. DenseSet<const Value*> VisitedConstants; DenseSet<Type*> VisitedTypes; - + std::vector<StructType*> &StructTypes; + bool OnlyNamed; public: - TypeFinder(std::vector<StructType*> &structTypes) - : StructTypes(structTypes) {} - + TypeFinder(std::vector<StructType*> &structTypes, bool onlyNamed) + : StructTypes(structTypes), OnlyNamed(onlyNamed) {} + void run(const Module &M) { // Get types from global variables. for (Module::const_global_iterator I = M.global_begin(), @@ -495,7 +495,7 @@ namespace { if (I->hasInitializer()) incorporateValue(I->getInitializer()); } - + // Get types from aliases. for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end(); I != E; ++I) { @@ -503,24 +503,32 @@ namespace { if (const Value *Aliasee = I->getAliasee()) incorporateValue(Aliasee); } - - SmallVector<std::pair<unsigned, MDNode*>, 4> MDForInst; // Get types from functions. 
+ SmallVector<std::pair<unsigned, MDNode*>, 4> MDForInst; for (Module::const_iterator FI = M.begin(), E = M.end(); FI != E; ++FI) { incorporateType(FI->getType()); - + + // First incorporate the arguments. + for (Function::const_arg_iterator AI = FI->arg_begin(), + AE = FI->arg_end(); AI != AE; ++AI) + incorporateValue(AI); + for (Function::const_iterator BB = FI->begin(), E = FI->end(); BB != E;++BB) for (BasicBlock::const_iterator II = BB->begin(), E = BB->end(); II != E; ++II) { const Instruction &I = *II; - // Incorporate the type of the instruction and all its operands. + // Incorporate the type of the instruction. incorporateType(I.getType()); + + // Incorporate non-instruction operand types. (We are incorporating + // all instructions with this loop.) for (User::const_op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI) - incorporateValue(*OI); - + if (!isa<Instruction>(OI)) + incorporateValue(*OI); + // Incorporate types hiding in metadata. I.getAllMetadataOtherThanDebugLoc(MDForInst); for (unsigned i = 0, e = MDForInst.size(); i != e; ++i) @@ -528,7 +536,7 @@ namespace { MDForInst.clear(); } } - + for (Module::const_named_metadata_iterator I = M.named_metadata_begin(), E = M.named_metadata_end(); I != E; ++I) { const NamedMDNode *NMD = I; @@ -536,23 +544,24 @@ namespace { incorporateMDNode(NMD->getOperand(i)); } } - + private: void incorporateType(Type *Ty) { // Check to see if we're already visited this type. if (!VisitedTypes.insert(Ty).second) return; - + // If this is a structure or opaque type, add a name for the type. if (StructType *STy = dyn_cast<StructType>(Ty)) - StructTypes.push_back(STy); - + if (!OnlyNamed || STy->hasName()) + StructTypes.push_back(STy); + // Recursively walk all contained types. for (Type::subtype_iterator I = Ty->subtype_begin(), E = Ty->subtype_end(); I != E; ++I) incorporateType(*I); } - + /// incorporateValue - This method is used to walk operand lists finding /// types hiding in constant expressions and other operands that won't be /// walked in other ways. GlobalValues, basic blocks, instructions, and @@ -561,27 +570,31 @@ namespace { if (const MDNode *M = dyn_cast<MDNode>(V)) return incorporateMDNode(M); if (!isa<Constant>(V) || isa<GlobalValue>(V)) return; - + // Already visited? if (!VisitedConstants.insert(V).second) return; - + // Check this type. incorporateType(V->getType()); - + + // If this is an instruction, we incorporate it separately. + if (isa<Instruction>(V)) + return; + // Look in operands for types. const User *U = cast<User>(V); for (Constant::const_op_iterator I = U->op_begin(), E = U->op_end(); I != E;++I) incorporateValue(*I); } - + void incorporateMDNode(const MDNode *V) { - + // Already visited? if (!VisitedConstants.insert(V).second) return; - + // Look in operands for types. for (unsigned i = 0, e = V->getNumOperands(); i != e; ++i) if (Value *Op = V->getOperand(i)) @@ -590,6 +603,7 @@ namespace { }; } // end anonymous namespace -void Module::findUsedStructTypes(std::vector<StructType*> &StructTypes) const { - TypeFinder(StructTypes).run(*this); +void Module::findUsedStructTypes(std::vector<StructType*> &StructTypes, + bool OnlyNamed) const { + TypeFinder(StructTypes, OnlyNamed).run(*this); } diff --git a/lib/VMCore/PassManager.cpp b/lib/VMCore/PassManager.cpp index 28fbaa6..4530c04 100644 --- a/lib/VMCore/PassManager.cpp +++ b/lib/VMCore/PassManager.cpp @@ -478,8 +478,7 @@ PMTopLevelManager::PMTopLevelManager(PMDataManager *PMDM) { /// Set pass P as the last user of the given analysis passes. 
void -PMTopLevelManager::setLastUser(const SmallVectorImpl<Pass *> &AnalysisPasses, - Pass *P) { +PMTopLevelManager::setLastUser(ArrayRef<Pass*> AnalysisPasses, Pass *P) { unsigned PDepth = 0; if (P->getResolver()) PDepth = P->getResolver()->getPMDataManager().getDepth(); @@ -594,6 +593,26 @@ void PMTopLevelManager::schedulePass(Pass *P) { Pass *AnalysisPass = findAnalysisPass(*I); if (!AnalysisPass) { const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(*I); + + if (PI == NULL) { + // Pass P is not in the global PassRegistry + dbgs() << "Pass '" << P->getPassName() << "' is not initialized." << "\n"; + dbgs() << "Verify if there is a pass dependency cycle." << "\n"; + dbgs() << "Required Passes:" << "\n"; + for (AnalysisUsage::VectorType::const_iterator I2 = RequiredSet.begin(), + E = RequiredSet.end(); I2 != E && I2 != I; ++I2) { + Pass *AnalysisPass2 = findAnalysisPass(*I2); + if (AnalysisPass2) { + dbgs() << "\t" << AnalysisPass2->getPassName() << "\n"; + } + else { + dbgs() << "\t" << "Error: Required pass not found! Possible causes:" << "\n"; + dbgs() << "\t\t" << "- Pass misconfiguration (e.g.: missing macros)" << "\n"; + dbgs() << "\t\t" << "- Corruption of the global PassRegistry" << "\n"; + } + } + } + assert(PI && "Expected required passes to be initialized"); AnalysisPass = PI->createPass(); if (P->getPotentialPassManagerType () == diff --git a/lib/VMCore/Value.cpp b/lib/VMCore/Value.cpp index 4006b2c..d871108 100644 --- a/lib/VMCore/Value.cpp +++ b/lib/VMCore/Value.cpp @@ -686,6 +686,9 @@ void ValueHandleBase::ValueIsRAUWd(Value *Old, Value *New) { #endif } -/// ~CallbackVH. Empty, but defined here to avoid emitting the vtable -/// more than once. -CallbackVH::~CallbackVH() {} +// Default implementation for CallbackVH. +void CallbackVH::allUsesReplacedWith(Value *) {} + +void CallbackVH::deleted() { + setValPtr(NULL); +} diff --git a/lib/VMCore/Verifier.cpp b/lib/VMCore/Verifier.cpp index 47baef3..5d51f41 100644 --- a/lib/VMCore/Verifier.cpp +++ b/lib/VMCore/Verifier.cpp @@ -68,6 +68,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/Support/ConstantRange.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> @@ -293,8 +294,9 @@ namespace { void VerifyCallSite(CallSite CS); bool PerformTypeCheck(Intrinsic::ID ID, Function *F, Type *Ty, int VT, unsigned ArgNo, std::string &Suffix); - void VerifyIntrinsicPrototype(Intrinsic::ID ID, Function *F, - unsigned RetNum, unsigned ParamNum, ...); + bool VerifyIntrinsicType(Type *Ty, + ArrayRef<Intrinsic::IITDescriptor> &Infos, + SmallVectorImpl<Type*> &ArgTys); void VerifyParameterAttrs(Attributes Attrs, Type *Ty, bool isReturnValue, const Value *V); void VerifyFunctionAttrs(FunctionType *FT, const AttrListPtr &Attrs, @@ -804,14 +806,29 @@ void Verifier::visitSwitchInst(SwitchInst &SI) { // Check to make sure that all of the constants in the switch instruction // have the same type as the switched-on value. 
Type *SwitchTy = SI.getCondition()->getType(); - SmallPtrSet<ConstantInt*, 32> Constants; + IntegerType *IntTy = cast<IntegerType>(SwitchTy); + IntegersSubsetToBB Mapping; + std::map<IntegersSubset::Range, unsigned> RangeSetMap; for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end(); i != e; ++i) { - Assert1(i.getCaseValue()->getType() == SwitchTy, - "Switch constants must all be same type as switch value!", &SI); - Assert2(Constants.insert(i.getCaseValue()), - "Duplicate integer as switch case", &SI, i.getCaseValue()); + IntegersSubset CaseRanges = i.getCaseValueEx(); + for (unsigned ri = 0, rie = CaseRanges.getNumItems(); ri < rie; ++ri) { + IntegersSubset::Range r = CaseRanges.getItem(ri); + Assert1(((const APInt&)r.getLow()).getBitWidth() == IntTy->getBitWidth(), + "Switch constants must all be same type as switch value!", &SI); + Assert1(((const APInt&)r.getHigh()).getBitWidth() == IntTy->getBitWidth(), + "Switch constants must all be same type as switch value!", &SI); + Mapping.add(r); + RangeSetMap[r] = i.getCaseIndex(); + } } - + + IntegersSubsetToBB::RangeIterator errItem; + if (!Mapping.verify(errItem)) { + unsigned CaseIndex = RangeSetMap[errItem->first]; + SwitchInst::CaseIt i(&SI, CaseIndex); + Assert2(false, "Duplicate integer as switch case", &SI, i.getCaseValueEx()); + } + visitTerminatorInst(SI); } @@ -1346,6 +1363,10 @@ void Verifier::visitGetElementPtrInst(GetElementPtrInst &GEP) { visitInstruction(GEP); } +static bool isContiguous(const ConstantRange &A, const ConstantRange &B) { + return A.getUpper() == B.getLower() || A.getLower() == B.getUpper(); +} + void Verifier::visitLoadInst(LoadInst &LI) { PointerType *PTy = dyn_cast<PointerType>(LI.getOperand(0)->getType()); Assert1(PTy, "Load operand must be a pointer.", &LI); @@ -1367,6 +1388,8 @@ void Verifier::visitLoadInst(LoadInst &LI) { Assert1(NumOperands % 2 == 0, "Unfinished range!", Range); unsigned NumRanges = NumOperands / 2; Assert1(NumRanges >= 1, "It should have at least one range!", Range); + + ConstantRange LastRange(1); // Dummy initial value for (unsigned i = 0; i < NumRanges; ++i) { ConstantInt *Low = dyn_cast<ConstantInt>(Range->getOperand(2*i)); Assert1(Low, "The lower limit must be an integer!", Low); @@ -1375,9 +1398,35 @@ void Verifier::visitLoadInst(LoadInst &LI) { Assert1(High->getType() == Low->getType() && High->getType() == ElTy, "Range types must match load type!", &LI); - Assert1(High->getValue() != Low->getValue(), "Range must not be empty!", + + APInt HighV = High->getValue(); + APInt LowV = Low->getValue(); + ConstantRange CurRange(LowV, HighV); + Assert1(!CurRange.isEmptySet() && !CurRange.isFullSet(), + "Range must not be empty!", Range); + if (i != 0) { + Assert1(CurRange.intersectWith(LastRange).isEmptySet(), + "Intervals are overlapping", Range); + Assert1(LowV.sgt(LastRange.getLower()), "Intervals are not in order", + Range); + Assert1(!isContiguous(CurRange, LastRange), "Intervals are contiguous", + Range); + } + LastRange = ConstantRange(LowV, HighV); + } + if (NumRanges > 2) { + APInt FirstLow = + dyn_cast<ConstantInt>(Range->getOperand(0))->getValue(); + APInt FirstHigh = + dyn_cast<ConstantInt>(Range->getOperand(1))->getValue(); + ConstantRange FirstRange(FirstLow, FirstHigh); + Assert1(FirstRange.intersectWith(LastRange).isEmptySet(), + "Intervals are overlapping", Range); + Assert1(!isContiguous(FirstRange, LastRange), "Intervals are contiguous", Range); } + + } visitInstruction(LI); @@ -1526,53 +1575,9 @@ void Verifier::visitLandingPadInst(LandingPadInst &LPI) { void 
Verifier::verifyDominatesUse(Instruction &I, unsigned i) {
   Instruction *Op = cast<Instruction>(I.getOperand(i));
-  BasicBlock *BB = I.getParent();
-  BasicBlock *OpBlock = Op->getParent();
-  PHINode *PN = dyn_cast<PHINode>(&I);
-
-  // DT can handle non phi instructions for us.
-  if (!PN) {
-    // Definition must dominate use unless use is unreachable!
-    Assert2(InstsInThisBlock.count(Op) || !DT->isReachableFromEntry(BB) ||
-            DT->dominates(Op, &I),
-            "Instruction does not dominate all uses!", Op, &I);
-    return;
-  }
 
-  // Check that a definition dominates all of its uses.
-  if (InvokeInst *II = dyn_cast<InvokeInst>(Op)) {
-    // Invoke results are only usable in the normal destination, not in the
-    // exceptional destination.
-    BasicBlock *NormalDest = II->getNormalDest();
-
-
-    // PHI nodes differ from other nodes because they actually "use" the
-    // value in the predecessor basic blocks they correspond to.
-    BasicBlock *UseBlock = BB;
-    unsigned j = PHINode::getIncomingValueNumForOperand(i);
-    UseBlock = PN->getIncomingBlock(j);
-    Assert2(UseBlock, "Invoke operand is PHI node with bad incoming-BB",
-            Op, &I);
-
-    if (UseBlock == OpBlock) {
-      // Special case of a phi node in the normal destination or the unwind
-      // destination.
-      Assert2(BB == NormalDest || !DT->isReachableFromEntry(UseBlock),
-              "Invoke result not available in the unwind destination!",
-              Op, &I);
-    } else {
-      Assert2(DT->dominates(II, UseBlock) ||
-              !DT->isReachableFromEntry(UseBlock),
-              "Invoke result does not dominate all uses!", Op, &I);
-    }
-  }
-
-  // PHI nodes are more difficult than other nodes because they actually
-  // "use" the value in the predecessor basic blocks they correspond to.
-  unsigned j = PHINode::getIncomingValueNumForOperand(i);
-  BasicBlock *PredBB = PN->getIncomingBlock(j);
-  Assert2(PredBB && (DT->dominates(OpBlock, PredBB) ||
-                     !DT->isReachableFromEntry(PredBB)),
+  const Use &U = I.getOperandUse(i);
+  Assert2(InstsInThisBlock.count(Op) || DT->dominates(Op, U),
           "Instruction does not dominate all uses!", Op, &I);
 }
 
@@ -1631,8 +1636,11 @@ void Verifier::visitInstruction(Instruction &I) {
     if (Function *F = dyn_cast<Function>(I.getOperand(i))) {
       // Check to make sure that the "address of" an intrinsic function is never
       // taken.
-      Assert1(!F->isIntrinsic() || (i + 1 == e && isa<CallInst>(I)),
+      Assert1(!F->isIntrinsic() || i == (isa<CallInst>(I) ? e-1 : 0),
               "Cannot take the address of an intrinsic!", &I);
+      Assert1(!F->isIntrinsic() || isa<CallInst>(I) ||
+              F->getIntrinsicID() == Intrinsic::donothing,
+              "Cannot invoke an intrinsic other than donothing", &I);
       Assert1(F->getParent() == Mod, "Referencing function in another module!",
               &I);
     } else if (BasicBlock *OpBB = dyn_cast<BasicBlock>(I.getOperand(i))) {
@@ -1673,10 +1681,85 @@ void Verifier::visitInstruction(Instruction &I) {
   InstsInThisBlock.insert(&I);
 }
 
-// Flags used by TableGen to mark intrinsic parameters with the
-// LLVMExtendedElementVectorType and LLVMTruncatedElementVectorType classes.
-static const unsigned ExtendedElementVectorType = 0x40000000;
-static const unsigned TruncatedElementVectorType = 0x20000000;
+/// VerifyIntrinsicType - Verify that the specified type (which comes from an
+/// intrinsic argument or return value) matches the type constraints specified
+/// by the .td file (e.g. an "any integer" argument really is an integer).
+///
+/// This returns true on error but does not print a message.
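
[Editorial aside] Before the implementation that follows, here is a standalone model of the convention the comment above spells out: a recursive check consumes one descriptor per call from the front of the stream, recurses into element types, and reports a mismatch by returning true. Ty, Desc, and verifyType are toy stand-ins invented for the sketch, not llvm::Type or Intrinsic::IITDescriptor.

#include <cstdio>

enum Kind { K_Void, K_Int, K_Ptr, K_Vec };

struct Ty {                  // toy stand-in for llvm::Type
  Kind K;
  unsigned Width;            // integer bit width or vector lane count
  const Ty *Elem;            // pointee / element type, if any
};

struct Desc { Kind K; unsigned N; };  // toy stand-in for IITDescriptor

// Returns true on error, mirroring VerifyIntrinsicType's convention.
static bool verifyType(const Ty *T, const Desc *&D, const Desc *End) {
  if (D == End) return true;          // ran out of descriptors
  Desc Cur = *D++;                    // consume the front descriptor
  switch (Cur.K) {
  case K_Void: return T->K != K_Void;
  case K_Int:  return T->K != K_Int || T->Width != Cur.N;
  case K_Ptr:  return T->K != K_Ptr || verifyType(T->Elem, D, End);
  case K_Vec:  return T->K != K_Vec || T->Width != Cur.N ||
                      verifyType(T->Elem, D, End);
  }
  return true;                        // defensive: unknown descriptor kind
}

int main() {
  Ty I32 = {K_Int, 32, nullptr};
  Ty V4I32 = {K_Vec, 4, &I32};
  Desc Table[] = {{K_Vec, 4}, {K_Int, 32}};   // describes "<4 x i32>"
  const Desc *D = Table, *End = Table + 2;
  std::printf("mismatch: %s\n", verifyType(&V4I32, D, End) ? "yes" : "no");
}
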
+bool Verifier::VerifyIntrinsicType(Type *Ty, + ArrayRef<Intrinsic::IITDescriptor> &Infos, + SmallVectorImpl<Type*> &ArgTys) { + using namespace Intrinsic; + + // If we ran out of descriptors, there are too many arguments. + if (Infos.empty()) return true; + IITDescriptor D = Infos.front(); + Infos = Infos.slice(1); + + switch (D.Kind) { + case IITDescriptor::Void: return !Ty->isVoidTy(); + case IITDescriptor::MMX: return !Ty->isX86_MMXTy(); + case IITDescriptor::Metadata: return !Ty->isMetadataTy(); + case IITDescriptor::Float: return !Ty->isFloatTy(); + case IITDescriptor::Double: return !Ty->isDoubleTy(); + case IITDescriptor::Integer: return !Ty->isIntegerTy(D.Integer_Width); + case IITDescriptor::Vector: { + VectorType *VT = dyn_cast<VectorType>(Ty); + return VT == 0 || VT->getNumElements() != D.Vector_Width || + VerifyIntrinsicType(VT->getElementType(), Infos, ArgTys); + } + case IITDescriptor::Pointer: { + PointerType *PT = dyn_cast<PointerType>(Ty); + return PT == 0 || PT->getAddressSpace() != D.Pointer_AddressSpace || + VerifyIntrinsicType(PT->getElementType(), Infos, ArgTys); + } + + case IITDescriptor::Struct: { + StructType *ST = dyn_cast<StructType>(Ty); + if (ST == 0 || ST->getNumElements() != D.Struct_NumElements) + return true; + + for (unsigned i = 0, e = D.Struct_NumElements; i != e; ++i) + if (VerifyIntrinsicType(ST->getElementType(i), Infos, ArgTys)) + return true; + return false; + } + + case IITDescriptor::Argument: + // Two cases here - If this is the second occurrence of an argument, verify + // that the later instance matches the previous instance. + if (D.getArgumentNumber() < ArgTys.size()) + return Ty != ArgTys[D.getArgumentNumber()]; + + // Otherwise, if this is the first instance of an argument, record it and + // verify the "Any" kind. + assert(D.getArgumentNumber() == ArgTys.size() && "Table consistency error"); + ArgTys.push_back(Ty); + + switch (D.getArgumentKind()) { + case IITDescriptor::AK_AnyInteger: return !Ty->isIntOrIntVectorTy(); + case IITDescriptor::AK_AnyFloat: return !Ty->isFPOrFPVectorTy(); + case IITDescriptor::AK_AnyVector: return !isa<VectorType>(Ty); + case IITDescriptor::AK_AnyPointer: return !isa<PointerType>(Ty); + } + llvm_unreachable("all argument kinds not covered"); + + case IITDescriptor::ExtendVecArgument: + // This may only be used when referring to a previous vector argument. + return D.getArgumentNumber() >= ArgTys.size() || + !isa<VectorType>(ArgTys[D.getArgumentNumber()]) || + VectorType::getExtendedElementVectorType( + cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty; + + case IITDescriptor::TruncVecArgument: + // This may only be used when referring to a previous vector argument. + return D.getArgumentNumber() >= ArgTys.size() || + !isa<VectorType>(ArgTys[D.getArgumentNumber()]) || + VectorType::getTruncatedElementVectorType( + cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty; + } + llvm_unreachable("unhandled"); +} /// visitIntrinsicFunction - Allow intrinsics to be verified in different ways. /// @@ -1685,10 +1768,30 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) { Assert1(IF->isDeclaration(), "Intrinsic functions should never be defined!", IF); -#define GET_INTRINSIC_VERIFIER -#include "llvm/Intrinsics.gen" -#undef GET_INTRINSIC_VERIFIER - + // Verify that the intrinsic prototype lines up with what the .td files + // describe. 
+ FunctionType *IFTy = IF->getFunctionType(); + Assert1(!IFTy->isVarArg(), "Intrinsic prototypes are not varargs", IF); + + SmallVector<Intrinsic::IITDescriptor, 8> Table; + getIntrinsicInfoTableEntries(ID, Table); + ArrayRef<Intrinsic::IITDescriptor> TableRef = Table; + + SmallVector<Type *, 4> ArgTys; + Assert1(!VerifyIntrinsicType(IFTy->getReturnType(), TableRef, ArgTys), + "Intrinsic has incorrect return type!", IF); + for (unsigned i = 0, e = IFTy->getNumParams(); i != e; ++i) + Assert1(!VerifyIntrinsicType(IFTy->getParamType(i), TableRef, ArgTys), + "Intrinsic has incorrect argument type!", IF); + Assert1(TableRef.empty(), "Intrinsic has too few arguments!", IF); + + // Now that we have the intrinsic ID and the actual argument types (and we + // know they are legal for the intrinsic!) get the intrinsic name through the + // usual means. This allows us to verify the mangling of argument types into + // the name. + Assert1(Intrinsic::getName(ID, ArgTys) == IF->getName(), + "Intrinsic name not mangled correctly for type arguments!", IF); + // If the intrinsic takes MDNode arguments, verify that they are either global // or are local to *this* function. for (unsigned i = 0, e = CI.getNumArgOperands(); i != e; ++i) @@ -1772,261 +1875,6 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) { } } -/// Produce a string to identify an intrinsic parameter or return value. -/// The ArgNo value numbers the return values from 0 to NumRets-1 and the -/// parameters beginning with NumRets. -/// -static std::string IntrinsicParam(unsigned ArgNo, unsigned NumRets) { - if (ArgNo >= NumRets) - return "Intrinsic parameter #" + utostr(ArgNo - NumRets); - if (NumRets == 1) - return "Intrinsic result type"; - return "Intrinsic result type #" + utostr(ArgNo); -} - -bool Verifier::PerformTypeCheck(Intrinsic::ID ID, Function *F, Type *Ty, - int VT, unsigned ArgNo, std::string &Suffix) { - FunctionType *FTy = F->getFunctionType(); - - unsigned NumElts = 0; - Type *EltTy = Ty; - VectorType *VTy = dyn_cast<VectorType>(Ty); - if (VTy) { - EltTy = VTy->getElementType(); - NumElts = VTy->getNumElements(); - } - - Type *RetTy = FTy->getReturnType(); - StructType *ST = dyn_cast<StructType>(RetTy); - unsigned NumRetVals; - if (RetTy->isVoidTy()) - NumRetVals = 0; - else if (ST) - NumRetVals = ST->getNumElements(); - else - NumRetVals = 1; - - if (VT < 0) { - int Match = ~VT; - - // Check flags that indicate a type that is an integral vector type with - // elements that are larger or smaller than the elements of the matched - // type. - if ((Match & (ExtendedElementVectorType | - TruncatedElementVectorType)) != 0) { - IntegerType *IEltTy = dyn_cast<IntegerType>(EltTy); - if (!VTy || !IEltTy) { - CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is not " - "an integral vector type.", F); - return false; - } - // Adjust the current Ty (in the opposite direction) rather than - // the type being matched against. 
- if ((Match & ExtendedElementVectorType) != 0) { - if ((IEltTy->getBitWidth() & 1) != 0) { - CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " vector " - "element bit-width is odd.", F); - return false; - } - Ty = VectorType::getTruncatedElementVectorType(VTy); - } else - Ty = VectorType::getExtendedElementVectorType(VTy); - Match &= ~(ExtendedElementVectorType | TruncatedElementVectorType); - } - - if (Match <= static_cast<int>(NumRetVals - 1)) { - if (ST) - RetTy = ST->getElementType(Match); - - if (Ty != RetTy) { - CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " does not " - "match return type.", F); - return false; - } - } else { - if (Ty != FTy->getParamType(Match - NumRetVals)) { - CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " does not " - "match parameter %" + utostr(Match - NumRetVals) + ".", F); - return false; - } - } - } else if (VT == MVT::iAny) { - if (!EltTy->isIntegerTy()) { - CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is not " - "an integer type.", F); - return false; - } - - unsigned GotBits = cast<IntegerType>(EltTy)->getBitWidth(); - Suffix += "."; - - if (EltTy != Ty) - Suffix += "v" + utostr(NumElts); - - Suffix += "i" + utostr(GotBits); - - // Check some constraints on various intrinsics. - switch (ID) { - default: break; // Not everything needs to be checked. - case Intrinsic::bswap: - if (GotBits < 16 || GotBits % 16 != 0) { - CheckFailed("Intrinsic requires even byte width argument", F); - return false; - } - break; - } - } else if (VT == MVT::fAny) { - if (!EltTy->isFloatingPointTy()) { - CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is not " - "a floating-point type.", F); - return false; - } - - Suffix += "."; - - if (EltTy != Ty) - Suffix += "v" + utostr(NumElts); - - Suffix += EVT::getEVT(EltTy).getEVTString(); - } else if (VT == MVT::vAny) { - if (!VTy) { - CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is not a vector type.", - F); - return false; - } - Suffix += ".v" + utostr(NumElts) + EVT::getEVT(EltTy).getEVTString(); - } else if (VT == MVT::iPTR) { - if (!Ty->isPointerTy()) { - CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is not a " - "pointer and a pointer is required.", F); - return false; - } - } else if (VT == MVT::iPTRAny) { - // Outside of TableGen, we don't distinguish iPTRAny (to any address space) - // and iPTR. In the verifier, we can not distinguish which case we have so - // allow either case to be legal. - if (PointerType* PTyp = dyn_cast<PointerType>(Ty)) { - EVT PointeeVT = EVT::getEVT(PTyp->getElementType(), true); - if (PointeeVT == MVT::Other) { - CheckFailed("Intrinsic has pointer to complex type."); - return false; - } - Suffix += ".p" + utostr(PTyp->getAddressSpace()) + - PointeeVT.getEVTString(); - } else { - CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is not a " - "pointer and a pointer is required.", F); - return false; - } - } else if (EVT((MVT::SimpleValueType)VT).isVector()) { - EVT VVT = EVT((MVT::SimpleValueType)VT); - - // If this is a vector argument, verify the number and type of elements. 
- if (VVT.getVectorElementType() != EVT::getEVT(EltTy)) { - CheckFailed("Intrinsic prototype has incorrect vector element type!", F); - return false; - } - - if (VVT.getVectorNumElements() != NumElts) { - CheckFailed("Intrinsic prototype has incorrect number of " - "vector elements!", F); - return false; - } - } else if (EVT((MVT::SimpleValueType)VT).getTypeForEVT(Ty->getContext()) != - EltTy) { - CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is wrong!", F); - return false; - } else if (EltTy != Ty) { - CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is a vector " - "and a scalar is required.", F); - return false; - } - - return true; -} - -/// VerifyIntrinsicPrototype - TableGen emits calls to this function into -/// Intrinsics.gen. This implements a little state machine that verifies the -/// prototype of intrinsics. -void Verifier::VerifyIntrinsicPrototype(Intrinsic::ID ID, Function *F, - unsigned NumRetVals, - unsigned NumParams, ...) { - va_list VA; - va_start(VA, NumParams); - FunctionType *FTy = F->getFunctionType(); - - // For overloaded intrinsics, the Suffix of the function name must match the - // types of the arguments. This variable keeps track of the expected - // suffix, to be checked at the end. - std::string Suffix; - - if (FTy->getNumParams() + FTy->isVarArg() != NumParams) { - CheckFailed("Intrinsic prototype has incorrect number of arguments!", F); - return; - } - - Type *Ty = FTy->getReturnType(); - StructType *ST = dyn_cast<StructType>(Ty); - - if (NumRetVals == 0 && !Ty->isVoidTy()) { - CheckFailed("Intrinsic should return void", F); - return; - } - - // Verify the return types. - if (ST && ST->getNumElements() != NumRetVals) { - CheckFailed("Intrinsic prototype has incorrect number of return types!", F); - return; - } - - for (unsigned ArgNo = 0; ArgNo != NumRetVals; ++ArgNo) { - int VT = va_arg(VA, int); // An MVT::SimpleValueType when non-negative. - - if (ST) Ty = ST->getElementType(ArgNo); - if (!PerformTypeCheck(ID, F, Ty, VT, ArgNo, Suffix)) - break; - } - - // Verify the parameter types. - for (unsigned ArgNo = 0; ArgNo != NumParams; ++ArgNo) { - int VT = va_arg(VA, int); // An MVT::SimpleValueType when non-negative. - - if (VT == MVT::isVoid && ArgNo > 0) { - if (!FTy->isVarArg()) - CheckFailed("Intrinsic prototype has no '...'!", F); - break; - } - - if (!PerformTypeCheck(ID, F, FTy->getParamType(ArgNo), VT, - ArgNo + NumRetVals, Suffix)) - break; - } - - va_end(VA); - - // For intrinsics without pointer arguments, if we computed a Suffix then the - // intrinsic is overloaded and we need to make sure that the name of the - // function is correct. We add the suffix to the name of the intrinsic and - // compare against the given function name. If they are not the same, the - // function name is invalid. This ensures that overloading of intrinsics - // uses a sane and consistent naming convention. Note that intrinsics with - // pointer argument may or may not be overloaded so we will check assuming it - // has a suffix and not. - if (!Suffix.empty()) { - std::string Name(Intrinsic::getName(ID)); - if (Name + Suffix != F->getName()) { - CheckFailed("Overloaded intrinsic has incorrect suffix: '" + - F->getName().substr(Name.length()) + "'. It should be '" + - Suffix + "'", F); - } - } - - // Check parameter attributes. 
-  Assert1(F->getAttributes() == Intrinsic::getAttributes(ID),
-          "Intrinsic has wrong parameter attributes!", F);
-}
-
-
 //===----------------------------------------------------------------------===//
 // Implement the public interfaces to this file...
 //===----------------------------------------------------------------------===//
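
[Editorial aside] With the va_arg state machine above deleted, both the verifier and Intrinsic::getType now consume the same descriptor table that getIntrinsicInfoTableEntries decodes earlier in this diff. IIT_Table and IIT_LongEncodingTable are emitted by TableGen, but the packing scheme itself is simple: descriptor values 0-15 fit four bits each, so up to eight of them pack into one 32-bit word, and longer sequences spill into the long table with the word's top bit set as a sentinel. Below is a toy round-trip of the nibble half of that scheme; packNibbles and unpackNibbles are invented helpers, and, like the real encoding, the word form cannot represent trailing zero entries, which is harmless because 0 is the IIT_Done terminator.

#include <cstdint>
#include <cstdio>
#include <vector>

// Pack values 0..15 into nibbles, lowest nibble first; returns 0 on overflow.
static uint32_t packNibbles(const std::vector<uint8_t> &Vals) {
  if (Vals.size() > 8) return 0;           // more than 8 nibbles cannot fit
  uint32_t Word = 0;
  for (size_t i = Vals.size(); i-- > 0;)   // fill from the last value down
    Word = (Word << 4) | (Vals[i] & 0xF);
  return Word;
}

static std::vector<uint8_t> unpackNibbles(uint32_t Word) {
  std::vector<uint8_t> Vals;
  do {                                     // mirrors the do/while decode loop
    Vals.push_back(Word & 0xF);
    Word >>= 4;
  } while (Word);
  return Vals;
}

int main() {
  std::vector<uint8_t> Seq = {4, 14, 4};   // e.g. IIT_I32, IIT_PTR, IIT_I32
  uint32_t W = packNibbles(Seq);
  for (uint8_t V : unpackNibbles(W))
    std::printf("%u ", unsigned(V));       // prints: 4 14 4
  std::printf("\n");
}
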