Diffstat (limited to 'lib/Target')
-rw-r--r--lib/Target/AArch64/AArch64BranchFixupPass.cpp2
-rw-r--r--lib/Target/AArch64/AArch64FrameLowering.cpp25
-rw-r--r--lib/Target/AArch64/AArch64ISelDAGToDAG.cpp95
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.cpp183
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.h31
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.cpp2
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.td31
-rw-r--r--lib/Target/AArch64/AArch64MCInstLower.cpp12
-rw-r--r--lib/Target/AArch64/AArch64RegisterInfo.cpp7
-rw-r--r--lib/Target/AArch64/AArch64RegisterInfo.h7
-rw-r--r--lib/Target/AArch64/AArch64RegisterInfo.td12
-rw-r--r--lib/Target/AArch64/AArch64TargetMachine.cpp1
-rw-r--r--lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp2
-rw-r--r--lib/Target/AArch64/LLVMBuild.txt2
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h20
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp25
-rw-r--r--lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp4
-rw-r--r--lib/Target/AArch64/Utils/AArch64BaseInfo.cpp2
-rw-r--r--lib/Target/AArch64/Utils/AArch64BaseInfo.h9
-rw-r--r--lib/Target/AArch64/Utils/CMakeLists.txt2
-rw-r--r--lib/Target/ARM/ARM.td10
-rw-r--r--lib/Target/ARM/ARMBaseInstrInfo.cpp17
-rw-r--r--lib/Target/ARM/ARMBaseInstrInfo.h2
-rw-r--r--lib/Target/ARM/ARMBaseRegisterInfo.cpp16
-rw-r--r--lib/Target/ARM/ARMBaseRegisterInfo.h4
-rw-r--r--lib/Target/ARM/ARMCodeEmitter.cpp4
-rw-r--r--lib/Target/ARM/ARMConstantIslandPass.cpp3
-rw-r--r--lib/Target/ARM/ARMFastISel.cpp209
-rw-r--r--lib/Target/ARM/ARMFrameLowering.cpp6
-rw-r--r--lib/Target/ARM/ARMHazardRecognizer.cpp10
-rw-r--r--lib/Target/ARM/ARMHazardRecognizer.h13
-rw-r--r--lib/Target/ARM/ARMISelDAGToDAG.cpp84
-rw-r--r--lib/Target/ARM/ARMISelLowering.cpp542
-rw-r--r--lib/Target/ARM/ARMISelLowering.h26
-rw-r--r--lib/Target/ARM/ARMInstrFormats.td30
-rw-r--r--lib/Target/ARM/ARMInstrInfo.cpp2
-rw-r--r--lib/Target/ARM/ARMInstrInfo.td184
-rw-r--r--lib/Target/ARM/ARMInstrNEON.td91
-rw-r--r--lib/Target/ARM/ARMInstrThumb.td146
-rw-r--r--lib/Target/ARM/ARMInstrThumb2.td177
-rw-r--r--lib/Target/ARM/ARMInstrVFP.td46
-rw-r--r--lib/Target/ARM/ARMMachineFunctionInfo.h17
-rw-r--r--lib/Target/ARM/ARMRegisterInfo.cpp5
-rw-r--r--lib/Target/ARM/ARMRegisterInfo.h6
-rw-r--r--lib/Target/ARM/ARMRegisterInfo.td81
-rw-r--r--lib/Target/ARM/ARMSchedule.td18
-rw-r--r--lib/Target/ARM/ARMScheduleA9.td57
-rw-r--r--lib/Target/ARM/ARMScheduleSwift.td933
-rw-r--r--lib/Target/ARM/ARMSelectionDAGInfo.cpp4
-rw-r--r--lib/Target/ARM/ARMSelectionDAGInfo.h4
-rw-r--r--lib/Target/ARM/ARMSubtarget.cpp52
-rw-r--r--lib/Target/ARM/ARMSubtarget.h11
-rw-r--r--lib/Target/ARM/ARMTargetMachine.cpp2
-rw-r--r--lib/Target/ARM/AsmParser/ARMAsmParser.cpp383
-rw-r--r--lib/Target/ARM/Disassembler/ARMDisassembler.cpp314
-rw-r--r--lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp12
-rw-r--r--lib/Target/ARM/InstPrinter/ARMInstPrinter.h1
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h12
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp15
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h43
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp172
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp4
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp28
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h4
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp43
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp157
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h73
-rw-r--r--lib/Target/ARM/MCTargetDesc/CMakeLists.txt1
-rw-r--r--lib/Target/ARM/Thumb1FrameLowering.cpp6
-rw-r--r--lib/Target/ARM/Thumb1InstrInfo.cpp2
-rw-r--r--lib/Target/ARM/Thumb1RegisterInfo.cpp12
-rw-r--r--lib/Target/ARM/Thumb1RegisterInfo.h2
-rw-r--r--lib/Target/ARM/Thumb2ITBlockPass.cpp8
-rw-r--r--lib/Target/ARM/Thumb2InstrInfo.cpp8
-rw-r--r--lib/Target/ARM/Thumb2RegisterInfo.cpp6
-rw-r--r--lib/Target/ARM/Thumb2RegisterInfo.h6
-rw-r--r--lib/Target/Hexagon/CMakeLists.txt2
-rw-r--r--lib/Target/Hexagon/Hexagon.h23
-rw-r--r--lib/Target/Hexagon/Hexagon.td54
-rw-r--r--lib/Target/Hexagon/HexagonCFGOptimizer.cpp26
-rw-r--r--lib/Target/Hexagon/HexagonCallingConvLower.cpp6
-rw-r--r--lib/Target/Hexagon/HexagonCallingConvLower.h1
-rw-r--r--lib/Target/Hexagon/HexagonCopyToCombine.cpp677
-rw-r--r--lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp29
-rw-r--r--lib/Target/Hexagon/HexagonFrameLowering.cpp80
-rw-r--r--lib/Target/Hexagon/HexagonISelDAGToDAG.cpp74
-rw-r--r--lib/Target/Hexagon/HexagonISelLowering.cpp39
-rw-r--r--lib/Target/Hexagon/HexagonISelLowering.h8
-rw-r--r--lib/Target/Hexagon/HexagonInstrFormats.td3
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfo.cpp1121
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfo.h15
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfo.td105
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfoV4.td569
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfoV5.td23
-rw-r--r--lib/Target/Hexagon/HexagonMachineScheduler.cpp3
-rw-r--r--lib/Target/Hexagon/HexagonMachineScheduler.h3
-rw-r--r--lib/Target/Hexagon/HexagonNewValueJump.cpp88
-rw-r--r--lib/Target/Hexagon/HexagonPeephole.cpp16
-rw-r--r--lib/Target/Hexagon/HexagonRegisterInfo.cpp18
-rw-r--r--lib/Target/Hexagon/HexagonRegisterInfo.h4
-rw-r--r--lib/Target/Hexagon/HexagonRegisterInfo.td4
-rw-r--r--lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp18
-rw-r--r--lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonSelectionDAGInfo.h2
-rw-r--r--lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp176
-rw-r--r--lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp27
-rw-r--r--lib/Target/Hexagon/HexagonTargetMachine.cpp72
-rw-r--r--lib/Target/Hexagon/HexagonTargetObjectFile.cpp8
-rw-r--r--lib/Target/Hexagon/HexagonTargetObjectFile.h1
-rw-r--r--lib/Target/Hexagon/HexagonVLIWPacketizer.cpp1842
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h3
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp2
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h4
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp11
-rw-r--r--lib/Target/LLVMBuild.txt2
-rw-r--r--lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp3
-rw-r--r--lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp6
-rw-r--r--lib/Target/MBlaze/MBlazeISelDAGToDAG.cpp4
-rw-r--r--lib/Target/MBlaze/MBlazeISelLowering.cpp25
-rw-r--r--lib/Target/MBlaze/MBlazeISelLowering.h8
-rw-r--r--lib/Target/MBlaze/MBlazeInstrInfo.cpp2
-rw-r--r--lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp5
-rw-r--r--lib/Target/MBlaze/MBlazeRegisterInfo.cpp4
-rw-r--r--lib/Target/MBlaze/MBlazeRegisterInfo.h4
-rw-r--r--lib/Target/MBlaze/MBlazeTargetMachine.cpp1
-rw-r--r--lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp2
-rw-r--r--lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp2
-rw-r--r--lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h3
-rw-r--r--lib/Target/MSP430/MSP430ISelDAGToDAG.cpp8
-rw-r--r--lib/Target/MSP430/MSP430ISelLowering.cpp43
-rw-r--r--lib/Target/MSP430/MSP430ISelLowering.h10
-rw-r--r--lib/Target/MSP430/MSP430InstrInfo.cpp2
-rw-r--r--lib/Target/MSP430/MSP430RegisterInfo.cpp6
-rw-r--r--lib/Target/MSP430/MSP430RegisterInfo.h3
-rw-r--r--lib/Target/MSP430/MSP430RegisterInfo.td2
-rw-r--r--lib/Target/MSP430/MSP430TargetMachine.cpp4
-rw-r--r--lib/Target/Mangler.cpp3
-rw-r--r--lib/Target/Mips/AsmParser/MipsAsmParser.cpp35
-rw-r--r--lib/Target/Mips/CMakeLists.txt2
-rw-r--r--lib/Target/Mips/Disassembler/MipsDisassembler.cpp2
-rw-r--r--lib/Target/Mips/MCTargetDesc/CMakeLists.txt1
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.cpp81
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.h28
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp2
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h3
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp68
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp10
-rw-r--r--lib/Target/Mips/MicroMipsInstrInfo.td6
-rw-r--r--lib/Target/Mips/Mips.h2
-rw-r--r--lib/Target/Mips/Mips16FrameLowering.cpp22
-rw-r--r--lib/Target/Mips/Mips16HardFloat.cpp458
-rw-r--r--lib/Target/Mips/Mips16HardFloat.h54
-rw-r--r--lib/Target/Mips/Mips16ISelDAGToDAG.cpp10
-rw-r--r--lib/Target/Mips/Mips16ISelDAGToDAG.h2
-rw-r--r--lib/Target/Mips/Mips16ISelLowering.cpp41
-rw-r--r--lib/Target/Mips/Mips16ISelLowering.h2
-rw-r--r--lib/Target/Mips/Mips16InstrFormats.td4
-rw-r--r--lib/Target/Mips/Mips16InstrInfo.cpp6
-rw-r--r--lib/Target/Mips/Mips16InstrInfo.h4
-rw-r--r--lib/Target/Mips/Mips16RegisterInfo.cpp13
-rw-r--r--lib/Target/Mips/Mips16RegisterInfo.h4
-rw-r--r--lib/Target/Mips/Mips64InstrInfo.td29
-rw-r--r--lib/Target/Mips/MipsAnalyzeImmediate.cpp2
-rw-r--r--lib/Target/Mips/MipsAsmPrinter.cpp10
-rw-r--r--lib/Target/Mips/MipsCallingConv.td10
-rw-r--r--lib/Target/Mips/MipsCodeEmitter.cpp3
-rw-r--r--lib/Target/Mips/MipsConstantIslandPass.cpp5
-rw-r--r--lib/Target/Mips/MipsDSPInstrInfo.td389
-rw-r--r--lib/Target/Mips/MipsDelaySlotFiller.cpp5
-rw-r--r--lib/Target/Mips/MipsISelDAGToDAG.cpp2
-rw-r--r--lib/Target/Mips/MipsISelLowering.cpp237
-rw-r--r--lib/Target/Mips/MipsISelLowering.h35
-rw-r--r--lib/Target/Mips/MipsInstrFPU.td57
-rw-r--r--lib/Target/Mips/MipsInstrFormats.td14
-rw-r--r--lib/Target/Mips/MipsInstrInfo.cpp23
-rw-r--r--lib/Target/Mips/MipsInstrInfo.h10
-rw-r--r--lib/Target/Mips/MipsInstrInfo.td58
-rw-r--r--lib/Target/Mips/MipsLongBranch.cpp14
-rw-r--r--lib/Target/Mips/MipsOptimizeMathLibCalls.cpp175
-rw-r--r--lib/Target/Mips/MipsRegisterInfo.cpp10
-rw-r--r--lib/Target/Mips/MipsRegisterInfo.h1
-rw-r--r--lib/Target/Mips/MipsRegisterInfo.td32
-rw-r--r--lib/Target/Mips/MipsSEFrameLowering.cpp125
-rw-r--r--lib/Target/Mips/MipsSEISelDAGToDAG.cpp45
-rw-r--r--lib/Target/Mips/MipsSEISelDAGToDAG.h7
-rw-r--r--lib/Target/Mips/MipsSEISelLowering.cpp18
-rw-r--r--lib/Target/Mips/MipsSEInstrInfo.cpp152
-rw-r--r--lib/Target/Mips/MipsSEInstrInfo.h32
-rw-r--r--lib/Target/Mips/MipsSERegisterInfo.cpp9
-rw-r--r--lib/Target/Mips/MipsSERegisterInfo.h5
-rw-r--r--lib/Target/Mips/MipsSubtarget.cpp8
-rw-r--r--lib/Target/Mips/MipsSubtarget.h11
-rw-r--r--lib/Target/Mips/MipsTargetMachine.cpp5
-rw-r--r--lib/Target/NVPTX/CMakeLists.txt2
-rw-r--r--lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h1
-rw-r--r--lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp10
-rw-r--r--lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h2
-rw-r--r--lib/Target/NVPTX/NVPTX.h6
-rw-r--r--lib/Target/NVPTX/NVPTXAsmPrinter.cpp211
-rw-r--r--lib/Target/NVPTX/NVPTXAsmPrinter.h31
-rw-r--r--lib/Target/NVPTX/NVPTXGenericToNVVM.cpp436
-rw-r--r--lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp59
-rw-r--r--lib/Target/NVPTX/NVPTXISelDAGToDAG.h4
-rw-r--r--lib/Target/NVPTX/NVPTXISelLowering.cpp33
-rw-r--r--lib/Target/NVPTX/NVPTXISelLowering.h6
-rw-r--r--lib/Target/NVPTX/NVPTXInstrInfo.cpp46
-rw-r--r--lib/Target/NVPTX/NVPTXInstrInfo.td3
-rw-r--r--lib/Target/NVPTX/NVPTXIntrinsics.td40
-rw-r--r--lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp225
-rw-r--r--lib/Target/NVPTX/NVPTXRegisterInfo.cpp4
-rw-r--r--lib/Target/NVPTX/NVPTXTargetMachine.cpp48
-rw-r--r--lib/Target/NVPTX/NVVMReflect.cpp24
-rw-r--r--lib/Target/PowerPC/AsmParser/CMakeLists.txt8
-rw-r--r--lib/Target/PowerPC/AsmParser/LLVMBuild.txt23
-rw-r--r--lib/Target/PowerPC/AsmParser/Makefile15
-rw-r--r--lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp723
-rw-r--r--lib/Target/PowerPC/CMakeLists.txt3
-rw-r--r--lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp70
-rw-r--r--lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h6
-rw-r--r--lib/Target/PowerPC/LLVMBuild.txt2
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt1
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp44
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp75
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h14
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp1
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp33
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp108
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h78
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp9
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h3
-rw-r--r--lib/Target/PowerPC/Makefile4
-rw-r--r--lib/Target/PowerPC/PPC.h5
-rw-r--r--lib/Target/PowerPC/PPC.td5
-rw-r--r--lib/Target/PowerPC/PPCAsmPrinter.cpp23
-rw-r--r--lib/Target/PowerPC/PPCBranchSelector.cpp12
-rw-r--r--lib/Target/PowerPC/PPCCTRLoops.cpp1144
-rw-r--r--lib/Target/PowerPC/PPCCodeEmitter.cpp25
-rw-r--r--lib/Target/PowerPC/PPCFrameLowering.cpp57
-rw-r--r--lib/Target/PowerPC/PPCHazardRecognizers.cpp6
-rw-r--r--lib/Target/PowerPC/PPCHazardRecognizers.h4
-rw-r--r--lib/Target/PowerPC/PPCISelDAGToDAG.cpp43
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.cpp681
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.h108
-rw-r--r--lib/Target/PowerPC/PPCInstr64Bit.td49
-rw-r--r--lib/Target/PowerPC/PPCInstrFormats.td19
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.cpp60
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.td273
-rw-r--r--lib/Target/PowerPC/PPCMCInstLower.cpp67
-rw-r--r--lib/Target/PowerPC/PPCRegisterInfo.cpp51
-rw-r--r--lib/Target/PowerPC/PPCRegisterInfo.h3
-rw-r--r--lib/Target/PowerPC/PPCRegisterInfo.td10
-rw-r--r--lib/Target/PowerPC/PPCTargetMachine.cpp13
-rw-r--r--lib/Target/PowerPC/PPCTargetObjectFile.cpp57
-rw-r--r--lib/Target/PowerPC/PPCTargetObjectFile.h32
-rw-r--r--lib/Target/R600/AMDGPU.h55
-rw-r--r--lib/Target/R600/AMDGPU.td73
-rw-r--r--lib/Target/R600/AMDGPUAsmPrinter.cpp38
-rw-r--r--lib/Target/R600/AMDGPUCallingConv.td22
-rw-r--r--lib/Target/R600/AMDGPUFrameLowering.cpp23
-rw-r--r--lib/Target/R600/AMDGPUISelLowering.cpp19
-rw-r--r--lib/Target/R600/AMDGPUISelLowering.h11
-rw-r--r--lib/Target/R600/AMDGPUIndirectAddressing.cpp6
-rw-r--r--lib/Target/R600/AMDGPUInstrInfo.cpp24
-rw-r--r--lib/Target/R600/AMDGPUInstrInfo.td8
-rw-r--r--lib/Target/R600/AMDGPUInstructions.td29
-rw-r--r--lib/Target/R600/AMDGPURegisterInfo.cpp6
-rw-r--r--lib/Target/R600/AMDGPURegisterInfo.h3
-rw-r--r--lib/Target/R600/AMDGPURegisterInfo.td3
-rw-r--r--lib/Target/R600/AMDGPUStructurizeCFG.cpp6
-rw-r--r--lib/Target/R600/AMDGPUSubtarget.cpp65
-rw-r--r--lib/Target/R600/AMDGPUSubtarget.h24
-rw-r--r--lib/Target/R600/AMDGPUTargetMachine.cpp33
-rw-r--r--lib/Target/R600/AMDGPUTargetMachine.h58
-rw-r--r--lib/Target/R600/AMDIL.h121
-rw-r--r--lib/Target/R600/AMDIL7XXDevice.cpp115
-rw-r--r--lib/Target/R600/AMDIL7XXDevice.h72
-rw-r--r--lib/Target/R600/AMDILBase.td64
-rw-r--r--lib/Target/R600/AMDILCFGStructurizer.cpp81
-rw-r--r--lib/Target/R600/AMDILDevice.cpp132
-rw-r--r--lib/Target/R600/AMDILDevice.h117
-rw-r--r--lib/Target/R600/AMDILDeviceInfo.cpp96
-rw-r--r--lib/Target/R600/AMDILDeviceInfo.h88
-rw-r--r--lib/Target/R600/AMDILDevices.h19
-rw-r--r--lib/Target/R600/AMDILEvergreenDevice.cpp169
-rw-r--r--lib/Target/R600/AMDILEvergreenDevice.h93
-rw-r--r--lib/Target/R600/AMDILISelDAGToDAG.cpp241
-rw-r--r--lib/Target/R600/AMDILISelLowering.cpp45
-rw-r--r--lib/Target/R600/AMDILInstrInfo.td57
-rw-r--r--lib/Target/R600/AMDILIntrinsicInfo.cpp4
-rw-r--r--lib/Target/R600/AMDILNIDevice.cpp65
-rw-r--r--lib/Target/R600/AMDILNIDevice.h57
-rw-r--r--lib/Target/R600/AMDILPeepholeOptimizer.cpp1215
-rw-r--r--lib/Target/R600/AMDILSIDevice.cpp48
-rw-r--r--lib/Target/R600/AMDILSIDevice.h39
-rw-r--r--lib/Target/R600/CMakeLists.txt9
-rw-r--r--lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp92
-rw-r--r--lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h7
-rw-r--r--lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp4
-rw-r--r--lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp2
-rw-r--r--lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h3
-rw-r--r--lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp2
-rw-r--r--lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h3
-rw-r--r--lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp496
-rw-r--r--lib/Target/R600/Processors.td49
-rw-r--r--lib/Target/R600/R600ControlFlowFinalizer.cpp92
-rw-r--r--lib/Target/R600/R600Defines.h103
-rw-r--r--lib/Target/R600/R600EmitClauseMarkers.cpp67
-rw-r--r--lib/Target/R600/R600ExpandSpecialInstrs.cpp48
-rw-r--r--lib/Target/R600/R600ISelLowering.cpp566
-rw-r--r--lib/Target/R600/R600ISelLowering.h9
-rw-r--r--lib/Target/R600/R600InstrInfo.cpp328
-rw-r--r--lib/Target/R600/R600InstrInfo.h36
-rw-r--r--lib/Target/R600/R600Instructions.td341
-rw-r--r--lib/Target/R600/R600Intrinsics.td37
-rw-r--r--lib/Target/R600/R600MachineFunctionInfo.h2
-rw-r--r--lib/Target/R600/R600MachineScheduler.cpp268
-rw-r--r--lib/Target/R600/R600MachineScheduler.h42
-rw-r--r--lib/Target/R600/R600OptimizeVectorRegisters.cpp372
-rw-r--r--lib/Target/R600/R600Packetizer.cpp246
-rw-r--r--lib/Target/R600/R600RegisterInfo.cpp17
-rw-r--r--lib/Target/R600/R600RegisterInfo.h7
-rw-r--r--lib/Target/R600/R600RegisterInfo.td16
-rw-r--r--lib/Target/R600/R600TextureIntrinsicsReplacer.cpp301
-rw-r--r--lib/Target/R600/SIAnnotateControlFlow.cpp2
-rw-r--r--lib/Target/R600/SIISelLowering.cpp310
-rw-r--r--lib/Target/R600/SIISelLowering.h19
-rw-r--r--lib/Target/R600/SIInsertWaits.cpp16
-rw-r--r--lib/Target/R600/SIInstrFormats.td62
-rw-r--r--lib/Target/R600/SIInstrInfo.cpp2
-rw-r--r--lib/Target/R600/SIInstrInfo.td101
-rw-r--r--lib/Target/R600/SIInstructions.td182
-rw-r--r--lib/Target/R600/SIIntrinsics.td4
-rw-r--r--lib/Target/R600/SILowerControlFlow.cpp5
-rw-r--r--lib/Target/R600/SIRegisterInfo.cpp8
-rw-r--r--lib/Target/R600/SIRegisterInfo.h4
-rw-r--r--lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp2
-rw-r--r--lib/Target/Sparc/CMakeLists.txt1
-rw-r--r--lib/Target/Sparc/DelaySlotFiller.cpp259
-rw-r--r--lib/Target/Sparc/FPMover.cpp141
-rw-r--r--lib/Target/Sparc/LLVMBuild.txt3
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp2
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h3
-rw-r--r--lib/Target/Sparc/README.txt8
-rw-r--r--lib/Target/Sparc/Sparc.h5
-rw-r--r--lib/Target/Sparc/Sparc.td2
-rw-r--r--lib/Target/Sparc/SparcAsmPrinter.cpp25
-rw-r--r--lib/Target/Sparc/SparcCallingConv.td2
-rw-r--r--lib/Target/Sparc/SparcFrameLowering.cpp149
-rw-r--r--lib/Target/Sparc/SparcFrameLowering.h12
-rw-r--r--lib/Target/Sparc/SparcISelDAGToDAG.cpp10
-rw-r--r--lib/Target/Sparc/SparcISelLowering.cpp163
-rw-r--r--lib/Target/Sparc/SparcISelLowering.h13
-rw-r--r--lib/Target/Sparc/SparcInstr64Bit.td41
-rw-r--r--lib/Target/Sparc/SparcInstrFormats.td9
-rw-r--r--lib/Target/Sparc/SparcInstrInfo.cpp55
-rw-r--r--lib/Target/Sparc/SparcInstrInfo.h6
-rw-r--r--lib/Target/Sparc/SparcInstrInfo.td155
-rw-r--r--lib/Target/Sparc/SparcMachineFunctionInfo.h12
-rw-r--r--lib/Target/Sparc/SparcRegisterInfo.cpp41
-rw-r--r--lib/Target/Sparc/SparcRegisterInfo.h3
-rw-r--r--lib/Target/Sparc/SparcRegisterInfo.td57
-rw-r--r--lib/Target/Sparc/SparcSubtarget.cpp30
-rw-r--r--lib/Target/Sparc/SparcSubtarget.h14
-rw-r--r--lib/Target/Sparc/SparcTargetMachine.cpp2
-rw-r--r--lib/Target/SystemZ/AsmParser/CMakeLists.txt7
-rw-r--r--lib/Target/SystemZ/AsmParser/LLVMBuild.txt23
-rw-r--r--lib/Target/SystemZ/AsmParser/Makefile16
-rw-r--r--lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp758
-rw-r--r--lib/Target/SystemZ/CMakeLists.txt35
-rw-r--r--lib/Target/SystemZ/Disassembler/CMakeLists.txt7
-rw-r--r--lib/Target/SystemZ/Disassembler/LLVMBuild.txt23
-rw-r--r--lib/Target/SystemZ/Disassembler/Makefile16
-rw-r--r--lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp301
-rw-r--r--lib/Target/SystemZ/InstPrinter/CMakeLists.txt7
-rw-r--r--lib/Target/SystemZ/InstPrinter/LLVMBuild.txt23
-rw-r--r--lib/Target/SystemZ/InstPrinter/Makefile16
-rw-r--r--lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp166
-rw-r--r--lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h69
-rw-r--r--lib/Target/SystemZ/LLVMBuild.txt35
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/CMakeLists.txt9
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/LLVMBuild.txt23
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/Makefile16
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp150
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp38
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h30
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp183
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h31
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp140
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp204
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h75
-rw-r--r--lib/Target/SystemZ/Makefile29
-rw-r--r--lib/Target/SystemZ/README.txt216
-rw-r--r--lib/Target/SystemZ/SystemZ.h78
-rw-r--r--lib/Target/SystemZ/SystemZ.td75
-rw-r--r--lib/Target/SystemZ/SystemZAsmPrinter.cpp113
-rw-r--r--lib/Target/SystemZ/SystemZAsmPrinter.h52
-rw-r--r--lib/Target/SystemZ/SystemZCallingConv.cpp21
-rw-r--r--lib/Target/SystemZ/SystemZCallingConv.h23
-rw-r--r--lib/Target/SystemZ/SystemZCallingConv.td65
-rw-r--r--lib/Target/SystemZ/SystemZConstantPoolValue.cpp62
-rw-r--r--lib/Target/SystemZ/SystemZConstantPoolValue.h55
-rw-r--r--lib/Target/SystemZ/SystemZFrameLowering.cpp531
-rw-r--r--lib/Target/SystemZ/SystemZFrameLowering.h93
-rw-r--r--lib/Target/SystemZ/SystemZISelDAGToDAG.cpp616
-rw-r--r--lib/Target/SystemZ/SystemZISelLowering.cpp2292
-rw-r--r--lib/Target/SystemZ/SystemZISelLowering.h223
-rw-r--r--lib/Target/SystemZ/SystemZInstrBuilder.h48
-rw-r--r--lib/Target/SystemZ/SystemZInstrFP.td315
-rw-r--r--lib/Target/SystemZ/SystemZInstrFormats.td1002
-rw-r--r--lib/Target/SystemZ/SystemZInstrInfo.cpp478
-rw-r--r--lib/Target/SystemZ/SystemZInstrInfo.h158
-rw-r--r--lib/Target/SystemZ/SystemZInstrInfo.td1017
-rw-r--r--lib/Target/SystemZ/SystemZLongBranch.cpp413
-rw-r--r--lib/Target/SystemZ/SystemZMCInstLower.cpp111
-rw-r--r--lib/Target/SystemZ/SystemZMCInstLower.h47
-rw-r--r--lib/Target/SystemZ/SystemZMachineFunctionInfo.h74
-rw-r--r--lib/Target/SystemZ/SystemZOperands.td465
-rw-r--r--lib/Target/SystemZ/SystemZOperators.td213
-rw-r--r--lib/Target/SystemZ/SystemZPatterns.td71
-rw-r--r--lib/Target/SystemZ/SystemZRegisterInfo.cpp165
-rw-r--r--lib/Target/SystemZ/SystemZRegisterInfo.h69
-rw-r--r--lib/Target/SystemZ/SystemZRegisterInfo.td151
-rw-r--r--lib/Target/SystemZ/SystemZSubtarget.cpp56
-rw-r--r--lib/Target/SystemZ/SystemZSubtarget.h48
-rw-r--r--lib/Target/SystemZ/SystemZTargetMachine.cpp67
-rw-r--r--lib/Target/SystemZ/SystemZTargetMachine.h74
-rw-r--r--lib/Target/SystemZ/TargetInfo/CMakeLists.txt7
-rw-r--r--lib/Target/SystemZ/TargetInfo/LLVMBuild.txt23
-rw-r--r--lib/Target/SystemZ/TargetInfo/Makefile15
-rw-r--r--lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp20
-rw-r--r--lib/Target/TargetLibraryInfo.cpp3
-rw-r--r--lib/Target/X86/AsmParser/X86AsmParser.cpp57
-rw-r--r--lib/Target/X86/Disassembler/X86Disassembler.cpp100
-rw-r--r--lib/Target/X86/MCTargetDesc/CMakeLists.txt2
-rw-r--r--lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp135
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp33
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h7
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp116
-rw-r--r--lib/Target/X86/X86.td7
-rw-r--r--lib/Target/X86/X86CodeEmitter.cpp11
-rw-r--r--lib/Target/X86/X86FastISel.cpp77
-rw-r--r--lib/Target/X86/X86FixupLEAs.cpp2
-rw-r--r--lib/Target/X86/X86FloatingPoint.cpp10
-rw-r--r--lib/Target/X86/X86FrameLowering.cpp87
-rw-r--r--lib/Target/X86/X86ISelDAGToDAG.cpp144
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp582
-rw-r--r--lib/Target/X86/X86ISelLowering.h23
-rw-r--r--lib/Target/X86/X86InstrArithmetic.td148
-rw-r--r--lib/Target/X86/X86InstrCompiler.td99
-rw-r--r--lib/Target/X86/X86InstrExtension.td56
-rw-r--r--lib/Target/X86/X86InstrFormats.td30
-rw-r--r--lib/Target/X86/X86InstrInfo.cpp429
-rw-r--r--lib/Target/X86/X86InstrInfo.h13
-rw-r--r--lib/Target/X86/X86InstrInfo.td12
-rw-r--r--lib/Target/X86/X86InstrMMX.td14
-rw-r--r--lib/Target/X86/X86InstrSSE.td157
-rw-r--r--lib/Target/X86/X86InstrShiftRotate.td4
-rw-r--r--lib/Target/X86/X86JITInfo.cpp1
-rw-r--r--lib/Target/X86/X86MCInstLower.cpp47
-rw-r--r--lib/Target/X86/X86RegisterInfo.cpp28
-rw-r--r--lib/Target/X86/X86RegisterInfo.h3
-rw-r--r--lib/Target/X86/X86RegisterInfo.td10
-rw-r--r--lib/Target/X86/X86Schedule.td12
-rw-r--r--lib/Target/X86/X86ScheduleAtom.td13
-rw-r--r--lib/Target/X86/X86SelectionDAGInfo.cpp4
-rw-r--r--lib/Target/X86/X86SelectionDAGInfo.h4
-rw-r--r--lib/Target/X86/X86Subtarget.cpp24
-rw-r--r--lib/Target/X86/X86TargetMachine.cpp2
-rw-r--r--lib/Target/XCore/CMakeLists.txt1
-rw-r--r--lib/Target/XCore/Disassembler/XCoreDisassembler.cpp15
-rw-r--r--lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp11
-rw-r--r--lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp2
-rw-r--r--lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h2
-rw-r--r--lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp13
-rw-r--r--lib/Target/XCore/XCore.h4
-rw-r--r--lib/Target/XCore/XCoreAsmPrinter.cpp15
-rw-r--r--lib/Target/XCore/XCoreFrameLowering.cpp89
-rw-r--r--lib/Target/XCore/XCoreISelDAGToDAG.cpp56
-rw-r--r--lib/Target/XCore/XCoreISelLowering.cpp256
-rw-r--r--lib/Target/XCore/XCoreISelLowering.h13
-rw-r--r--lib/Target/XCore/XCoreInstrInfo.cpp2
-rw-r--r--lib/Target/XCore/XCoreInstrInfo.td83
-rw-r--r--lib/Target/XCore/XCoreLowerThreadLocal.cpp145
-rw-r--r--lib/Target/XCore/XCoreRegisterInfo.cpp6
-rw-r--r--lib/Target/XCore/XCoreRegisterInfo.h4
-rw-r--r--lib/Target/XCore/XCoreTargetMachine.cpp7
-rw-r--r--lib/Target/XCore/XCoreTargetObjectFile.cpp5
487 files changed, 28945 insertions(+), 12756 deletions(-)
diff --git a/lib/Target/AArch64/AArch64BranchFixupPass.cpp b/lib/Target/AArch64/AArch64BranchFixupPass.cpp
index 71233ba..11e7f41 100644
--- a/lib/Target/AArch64/AArch64BranchFixupPass.cpp
+++ b/lib/Target/AArch64/AArch64BranchFixupPass.cpp
@@ -87,7 +87,7 @@ namespace {
// If the block size isn't a multiple of the known bits, assume the
// worst case padding.
if (Size & ((1u << Bits) - 1))
- Bits = CountTrailingZeros_32(Size);
+ Bits = countTrailingZeros(Size);
return Bits;
}
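The hunk above only renames a helper (CountTrailingZeros_32 to the newer countTrailingZeros), but the logic it sits in is easy to miss in diff form: when a block's byte size is not a multiple of its known alignment, only the trailing zero bits of the size can be trusted. Below is a standalone sketch of that logic, using a portable trailing-zero loop as a stand-in for the LLVM helper; it is an illustration, not code from the patch.

#include <cstdint>
#include <cstdio>

// If Size is not a multiple of the claimed 2^Bits alignment, fall back to the
// alignment implied by Size's lowest set bit.
static unsigned knownAlignmentBits(uint32_t Size, unsigned Bits) {
  if (Size & ((1u << Bits) - 1)) {
    unsigned TZ = 0; // portable stand-in for llvm::countTrailingZeros(Size)
    while (TZ < 32 && !(Size & (1u << TZ)))
      ++TZ;
    Bits = TZ;
  }
  return Bits;
}

int main() {
  std::printf("%u\n", knownAlignmentBits(12, 4)); // 12 = 0b1100 -> 2 bits
  std::printf("%u\n", knownAlignmentBits(16, 4)); // multiple of 16 -> 4 bits
  return 0;
}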
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index daa7f1d..8b907b2 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -54,7 +54,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
MachineModuleInfo &MMI = MF.getMMI();
- std::vector<MachineMove> &Moves = MMI.getFrameMoves();
+ const MCRegisterInfo &MRI = MMI.getContext().getRegisterInfo();
bool NeedsFrameMoves = MMI.hasDebugInfo()
|| MF.getFunction()->needsUnwindTableEntry();
@@ -97,8 +97,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
.addSym(SPLabel);
MachineLocation Dst(MachineLocation::VirtualFP);
- MachineLocation Src(AArch64::XSP, NumInitialBytes);
- Moves.push_back(MachineMove(SPLabel, Dst, Src));
+ unsigned Reg = MRI.getDwarfRegNum(AArch64::XSP, true);
+ MMI.addFrameInst(
+ MCCFIInstruction::createDefCfa(SPLabel, Reg, -NumInitialBytes));
}
// Otherwise we need to set the frame pointer and/or add a second stack
@@ -131,9 +132,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
MCSymbol *FPLabel = MMI.getContext().CreateTempSymbol();
BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::PROLOG_LABEL))
.addSym(FPLabel);
- MachineLocation Dst(MachineLocation::VirtualFP);
- MachineLocation Src(AArch64::X29, -MFI->getObjectOffset(X29FrameIdx));
- Moves.push_back(MachineMove(FPLabel, Dst, Src));
+ unsigned Reg = MRI.getDwarfRegNum(AArch64::X29, true);
+ unsigned Offset = MFI->getObjectOffset(X29FrameIdx);
+ MMI.addFrameInst(MCCFIInstruction::createDefCfa(FPLabel, Reg, Offset));
}
FPNeedsSetting = false;
@@ -164,8 +165,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
.addSym(CSLabel);
MachineLocation Dst(MachineLocation::VirtualFP);
- MachineLocation Src(AArch64::XSP, NumResidualBytes + NumInitialBytes);
- Moves.push_back(MachineMove(CSLabel, Dst, Src));
+ unsigned Reg = MRI.getDwarfRegNum(AArch64::XSP, true);
+ unsigned Offset = NumResidualBytes + NumInitialBytes;
+ MMI.addFrameInst(MCCFIInstruction::createDefCfa(CSLabel, Reg, -Offset));
}
// And any callee-saved registers (it's fine to leave them to the end here,
@@ -180,10 +182,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
E = CSI.end(); I != E; ++I) {
- MachineLocation Dst(MachineLocation::VirtualFP,
- MFI->getObjectOffset(I->getFrameIdx()));
- MachineLocation Src(I->getReg());
- Moves.push_back(MachineMove(CSLabel, Dst, Src));
+ unsigned Offset = MFI->getObjectOffset(I->getFrameIdx());
+ unsigned Reg = MRI.getDwarfRegNum(I->getReg(), true);
+ MMI.addFrameInst(MCCFIInstruction::createOffset(CSLabel, Reg, Offset));
}
}
}
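Every hunk in this file makes the same substitution: instead of pushing MachineMove records onto MMI.getFrameMoves(), the prologue translates the register to its DWARF number and records an MCCFIInstruction directly. A condensed sketch of that pattern follows, assuming the headers of this era (MachineModuleInfo, MCRegisterInfo, MCDwarf); the helper name recordCalleeSave is illustrative and not part of the patch.

#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSymbol.h"

// Record "callee-saved register <LLVMReg> is saved at <FrameOffset> from the
// CFA" as a CFI instruction instead of a MachineMove.
static void recordCalleeSave(llvm::MachineModuleInfo &MMI,
                             llvm::MCSymbol *Label, unsigned LLVMReg,
                             int FrameOffset) {
  const llvm::MCRegisterInfo &MRI = MMI.getContext().getRegisterInfo();
  unsigned DwarfReg = MRI.getDwarfRegNum(LLVMReg, /*isEH=*/true);
  MMI.addFrameInst(
      llvm::MCCFIInstruction::createOffset(Label, DwarfReg, FrameOffset));
}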
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 468c561..2e37cb4 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -33,7 +33,6 @@ namespace {
class AArch64DAGToDAGISel : public SelectionDAGISel {
AArch64TargetMachine &TM;
- const AArch64InstrInfo *TII;
/// Keep a pointer to the AArch64Subtarget around so that we can
/// make the right decision when generating code for different targets.
@@ -43,7 +42,6 @@ public:
explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm,
CodeGenOpt::Level OptLevel)
: SelectionDAGISel(tm, OptLevel), TM(tm),
- TII(static_cast<const AArch64InstrInfo*>(TM.getInstrInfo())),
Subtarget(&TM.getSubtarget<AArch64Subtarget>()) {
}
@@ -70,6 +68,15 @@ public:
return SelectCVTFixedPosOperand(N, FixedPos, RegWidth);
}
+ /// Used for pre-lowered address-reference nodes, so we already know
+ /// the fields match. This operand's job is simply to add an
+ /// appropriate shift operand (i.e. 0) to the MOVZ/MOVK instruction.
+ bool SelectMOVWAddressRef(SDValue N, SDValue &Imm, SDValue &Shift) {
+ Imm = N;
+ Shift = CurDAG->getTargetConstant(0, MVT::i32);
+ return true;
+ }
+
bool SelectFPZeroOperand(SDValue N, SDValue &Dummy);
bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos,
@@ -88,7 +95,12 @@ public:
bool SelectTSTBOperand(SDValue N, SDValue &FixedPos, unsigned RegWidth);
- SDNode *SelectAtomic(SDNode *N, unsigned Op8, unsigned Op16, unsigned Op32, unsigned Op64);
+ SDNode *SelectAtomic(SDNode *N, unsigned Op8, unsigned Op16, unsigned Op32,
+ unsigned Op64);
+
+ /// Put the given constant into a pool and return a DAG which will give its
+ /// address.
+ SDValue getConstantPoolItemAddress(SDLoc DL, const Constant *CV);
SDNode *TrySelectToMoveImm(SDNode *N);
SDNode *LowerToFPLitPool(SDNode *Node);
@@ -177,7 +189,7 @@ bool AArch64DAGToDAGISel::SelectLogicalImm(SDValue N, SDValue &Imm) {
SDNode *AArch64DAGToDAGISel::TrySelectToMoveImm(SDNode *Node) {
SDNode *ResNode;
- DebugLoc dl = Node->getDebugLoc();
+ SDLoc dl(Node);
EVT DestType = Node->getValueType(0);
unsigned DestWidth = DestType.getSizeInBits();
@@ -226,12 +238,51 @@ SDNode *AArch64DAGToDAGISel::TrySelectToMoveImm(SDNode *Node) {
return ResNode;
}
+SDValue
+AArch64DAGToDAGISel::getConstantPoolItemAddress(SDLoc DL,
+ const Constant *CV) {
+ EVT PtrVT = getTargetLowering()->getPointerTy();
+
+ switch (getTargetLowering()->getTargetMachine().getCodeModel()) {
+ case CodeModel::Small: {
+ unsigned Alignment =
+ getTargetLowering()->getDataLayout()->getABITypeAlignment(CV->getType());
+ return CurDAG->getNode(
+ AArch64ISD::WrapperSmall, DL, PtrVT,
+ CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_NO_FLAG),
+ CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_LO12),
+ CurDAG->getConstant(Alignment, MVT::i32));
+ }
+ case CodeModel::Large: {
+ SDNode *LitAddr;
+ LitAddr = CurDAG->getMachineNode(
+ AArch64::MOVZxii, DL, PtrVT,
+ CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G3),
+ CurDAG->getTargetConstant(0, MVT::i32));
+ LitAddr = CurDAG->getMachineNode(
+ AArch64::MOVKxii, DL, PtrVT, SDValue(LitAddr, 0),
+ CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G2_NC),
+ CurDAG->getTargetConstant(0, MVT::i32));
+ LitAddr = CurDAG->getMachineNode(
+ AArch64::MOVKxii, DL, PtrVT, SDValue(LitAddr, 0),
+ CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G1_NC),
+ CurDAG->getTargetConstant(0, MVT::i32));
+ LitAddr = CurDAG->getMachineNode(
+ AArch64::MOVKxii, DL, PtrVT, SDValue(LitAddr, 0),
+ CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G0_NC),
+ CurDAG->getTargetConstant(0, MVT::i32));
+ return SDValue(LitAddr, 0);
+ }
+ default:
+ llvm_unreachable("Only small and large code models supported now");
+ }
+}
+
SDNode *AArch64DAGToDAGISel::SelectToLitPool(SDNode *Node) {
- DebugLoc DL = Node->getDebugLoc();
+ SDLoc DL(Node);
uint64_t UnsignedVal = cast<ConstantSDNode>(Node)->getZExtValue();
int64_t SignedVal = cast<ConstantSDNode>(Node)->getSExtValue();
EVT DestType = Node->getValueType(0);
- EVT PtrVT = TLI.getPointerTy();
// Since we may end up loading a 64-bit constant from a 32-bit entry the
// constant in the pool may have a different type to the eventual node.
@@ -258,14 +309,9 @@ SDNode *AArch64DAGToDAGISel::SelectToLitPool(SDNode *Node) {
Constant *CV = ConstantInt::get(Type::getIntNTy(*CurDAG->getContext(),
MemType.getSizeInBits()),
UnsignedVal);
- SDValue PoolAddr;
- unsigned Alignment = TLI.getDataLayout()->getABITypeAlignment(CV->getType());
- PoolAddr = CurDAG->getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
- CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0,
- AArch64II::MO_NO_FLAG),
- CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0,
- AArch64II::MO_LO12),
- CurDAG->getConstant(Alignment, MVT::i32));
+ SDValue PoolAddr = getConstantPoolItemAddress(DL, CV);
+ unsigned Alignment =
+ getTargetLowering()->getDataLayout()->getABITypeAlignment(CV->getType());
return CurDAG->getExtLoad(Extension, DL, DestType, CurDAG->getEntryNode(),
PoolAddr,
@@ -276,22 +322,13 @@ SDNode *AArch64DAGToDAGISel::SelectToLitPool(SDNode *Node) {
}
SDNode *AArch64DAGToDAGISel::LowerToFPLitPool(SDNode *Node) {
- DebugLoc DL = Node->getDebugLoc();
+ SDLoc DL(Node);
const ConstantFP *FV = cast<ConstantFPSDNode>(Node)->getConstantFPValue();
- EVT PtrVT = TLI.getPointerTy();
EVT DestType = Node->getValueType(0);
- unsigned Alignment = TLI.getDataLayout()->getABITypeAlignment(FV->getType());
- SDValue PoolAddr;
-
- assert(TM.getCodeModel() == CodeModel::Small &&
- "Only small code model supported");
- PoolAddr = CurDAG->getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
- CurDAG->getTargetConstantPool(FV, PtrVT, 0, 0,
- AArch64II::MO_NO_FLAG),
- CurDAG->getTargetConstantPool(FV, PtrVT, 0, 0,
- AArch64II::MO_LO12),
- CurDAG->getConstant(Alignment, MVT::i32));
+ unsigned Alignment =
+ getTargetLowering()->getDataLayout()->getABITypeAlignment(FV->getType());
+ SDValue PoolAddr = getConstantPoolItemAddress(DL, FV);
return CurDAG->getLoad(DestType, DL, CurDAG->getEntryNode(), PoolAddr,
MachinePointerInfo::getConstantPool(),
@@ -436,7 +473,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
AArch64::ATOMIC_CMP_SWAP_I64);
case ISD::FrameIndex: {
int FI = cast<FrameIndexSDNode>(Node)->getIndex();
- EVT PtrTy = TLI.getPointerTy();
+ EVT PtrTy = getTargetLowering()->getPointerTy();
SDValue TFI = CurDAG->getTargetFrameIndex(FI, PtrTy);
return CurDAG->SelectNodeTo(Node, AArch64::ADDxxi_lsl0_s, PtrTy,
TFI, CurDAG->getTargetConstant(0, PtrTy));
@@ -460,7 +497,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
assert((Ty == MVT::i32 || Ty == MVT::i64) && "unexpected type");
uint16_t Register = Ty == MVT::i32 ? AArch64::WZR : AArch64::XZR;
ResNode = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
- Node->getDebugLoc(),
+ SDLoc(Node),
Register, Ty).getNode();
}
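The new getConstantPoolItemAddress() path for CodeModel::Large materialises a 64-bit absolute address as one MOVZ plus three MOVKs, each carrying a 16-bit slice of the address (G3 down to G0, with the shift implied by the :abs_gN: relocations rather than written as a literal immediate). The standalone illustration below shows how an address decomposes into those four slices; the address value is made up and the printed assembly is for exposition only.

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t Addr = 0x0123456789abcdefULL; // made-up absolute address
  for (int Chunk = 3; Chunk >= 0; --Chunk) {
    unsigned Imm = (unsigned)((Addr >> (16 * Chunk)) & 0xffff);
    std::printf("%s x0, #0x%04x, lsl #%d    // slice :abs_g%d%s:\n",
                Chunk == 3 ? "movz" : "movk", Imm, 16 * Chunk, Chunk,
                Chunk == 3 ? "" : "_nc");
  }
  return 0;
}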
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 786b1ba..5a53339 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -39,12 +39,8 @@ static TargetLoweringObjectFile *createTLOF(AArch64TargetMachine &TM) {
llvm_unreachable("unknown subtarget type");
}
-
AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
- : TargetLowering(TM, createTLOF(TM)),
- Subtarget(&TM.getSubtarget<AArch64Subtarget>()),
- RegInfo(TM.getRegisterInfo()),
- Itins(TM.getInstrItineraryData()) {
+ : TargetLowering(TM, createTLOF(TM)), Itins(TM.getInstrItineraryData()) {
// SIMD compares set the entire lane's bits to 1
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
@@ -260,7 +256,7 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
setExceptionSelectorRegister(AArch64::X1);
}
-EVT AArch64TargetLowering::getSetCCResultType(EVT VT) const {
+EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
// It's reasonably important that this value matches the "natural" legal
// promotion from i1 for scalar types. Otherwise LegalizeTypes can get itself
// in a twist (e.g. inserting an any_extend which then becomes i64 -> i64).
@@ -781,6 +777,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
case AArch64ISD::TLSDESCCALL: return "AArch64ISD::TLSDESCCALL";
+ case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
case AArch64ISD::WrapperSmall: return "AArch64ISD::WrapperSmall";
default: return NULL;
@@ -825,7 +822,7 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {
void
AArch64TargetLowering::SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG,
- DebugLoc DL, SDValue &Chain) const {
+ SDLoc DL, SDValue &Chain) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
AArch64MachineFunctionInfo *FuncInfo
@@ -896,7 +893,7 @@ SDValue
AArch64TargetLowering::LowerFormalArguments(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
AArch64MachineFunctionInfo *FuncInfo
@@ -1011,7 +1008,7 @@ AArch64TargetLowering::LowerReturn(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc dl, SelectionDAG &DAG) const {
+ SDLoc dl, SelectionDAG &DAG) const {
// CCValAssign - represent the assignment of the return value to a location.
SmallVector<CCValAssign, 16> RVLocs;
@@ -1084,7 +1081,7 @@ SDValue
AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
- DebugLoc &dl = CLI.DL;
+ SDLoc &dl = CLI.DL;
SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
@@ -1150,7 +1147,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
}
if (!IsSibCall)
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
+ dl);
SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, AArch64::XSP,
getPointerTy());
@@ -1281,7 +1279,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// in the correct location.
if (IsTailCall && !IsSibCall) {
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
- DAG.getIntPtrConstant(0, true), InFlag);
+ DAG.getIntPtrConstant(0, true), InFlag, dl);
InFlag = Chain.getValue(1);
}
@@ -1335,7 +1333,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
DAG.getIntPtrConstant(CalleePopBytes, true),
- InFlag);
+ InFlag, dl);
InFlag = Chain.getValue(1);
}
@@ -1347,7 +1345,7 @@ SDValue
AArch64TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
@@ -1536,7 +1534,7 @@ SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
}
// Build a tokenfactor for all the chains.
- return DAG.getNode(ISD::TokenFactor, Chain.getDebugLoc(), MVT::Other,
+ return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other,
&ArgChains[0], ArgChains.size());
}
@@ -1569,7 +1567,7 @@ bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Val) const {
SDValue AArch64TargetLowering::getSelectableIntSetCC(SDValue LHS, SDValue RHS,
ISD::CondCode CC, SDValue &A64cc,
- SelectionDAG &DAG, DebugLoc &dl) const {
+ SelectionDAG &DAG, SDLoc &dl) const {
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
int64_t C = 0;
EVT VT = RHSC->getValueType(0);
@@ -1662,28 +1660,37 @@ static A64CC::CondCodes FPCCToA64CC(ISD::CondCode CC,
SDValue
AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
EVT PtrVT = getPointerTy();
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
- assert(getTargetMachine().getCodeModel() == CodeModel::Small
- && "Only small code model supported at the moment");
-
- // The most efficient code is PC-relative anyway for the small memory model,
- // so we don't need to worry about relocation model.
- return DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
- DAG.getTargetBlockAddress(BA, PtrVT, 0,
- AArch64II::MO_NO_FLAG),
- DAG.getTargetBlockAddress(BA, PtrVT, 0,
- AArch64II::MO_LO12),
- DAG.getConstant(/*Alignment=*/ 4, MVT::i32));
+ switch(getTargetMachine().getCodeModel()) {
+ case CodeModel::Small:
+ // The most efficient code is PC-relative anyway for the small memory model,
+ // so we don't need to worry about relocation model.
+ return DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
+ DAG.getTargetBlockAddress(BA, PtrVT, 0,
+ AArch64II::MO_NO_FLAG),
+ DAG.getTargetBlockAddress(BA, PtrVT, 0,
+ AArch64II::MO_LO12),
+ DAG.getConstant(/*Alignment=*/ 4, MVT::i32));
+ case CodeModel::Large:
+ return DAG.getNode(
+ AArch64ISD::WrapperLarge, DL, PtrVT,
+ DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G3),
+ DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G2_NC),
+ DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G1_NC),
+ DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G0_NC));
+ default:
+ llvm_unreachable("Only small and large code models supported now");
+ }
}
// (BRCOND chain, val, dest)
SDValue
AArch64TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue Chain = Op.getOperand(0);
SDValue TheBit = Op.getOperand(1);
SDValue DestBB = Op.getOperand(2);
@@ -1706,7 +1713,7 @@ AArch64TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
// (BR_CC chain, condcode, lhs, rhs, dest)
SDValue
AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue Chain = Op.getOperand(0);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
SDValue LHS = Op.getOperand(2);
@@ -1792,7 +1799,7 @@ AArch64TargetLowering::LowerF128ToCall(SDValue Op, SelectionDAG &DAG,
CallLoweringInfo CLI(InChain, RetTy, false, false, false, false,
0, getLibcallCallingConv(Call), isTailCall,
/*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
- Callee, Args, DAG, Op->getDebugLoc());
+ Callee, Args, DAG, SDLoc(Op));
std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
if (!CallInfo.second.getNode())
@@ -1814,7 +1821,7 @@ AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
SDValue SrcVal = Op.getOperand(0);
return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
- /*isSigned*/ false, Op.getDebugLoc());
+ /*isSigned*/ false, SDLoc(Op));
}
SDValue
@@ -1845,16 +1852,37 @@ AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
}
SDValue
-AArch64TargetLowering::LowerGlobalAddressELF(SDValue Op,
- SelectionDAG &DAG) const {
- // TableGen doesn't have easy access to the CodeModel or RelocationModel, so
- // we make that distinction here.
+AArch64TargetLowering::LowerGlobalAddressELFLarge(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(getTargetMachine().getCodeModel() == CodeModel::Large);
+ assert(getTargetMachine().getRelocationModel() == Reloc::Static);
+
+ EVT PtrVT = getPointerTy();
+ SDLoc dl(Op);
+ const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
+ const GlobalValue *GV = GN->getGlobal();
+
+ SDValue GlobalAddr = DAG.getNode(
+ AArch64ISD::WrapperLarge, dl, PtrVT,
+ DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G3),
+ DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G2_NC),
+ DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G1_NC),
+ DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G0_NC));
+
+ if (GN->getOffset() != 0)
+ return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalAddr,
+ DAG.getConstant(GN->getOffset(), PtrVT));
- // We support the small memory model for now.
+ return GlobalAddr;
+}
+
+SDValue
+AArch64TargetLowering::LowerGlobalAddressELFSmall(SDValue Op,
+ SelectionDAG &DAG) const {
assert(getTargetMachine().getCodeModel() == CodeModel::Small);
EVT PtrVT = getPointerTy();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = GN->getGlobal();
unsigned Alignment = GV->getAlignment();
@@ -1896,7 +1924,7 @@ AArch64TargetLowering::LowerGlobalAddressELF(SDValue Op,
}
unsigned char HiFixup, LoFixup;
- bool UseGOT = Subtarget->GVIsIndirectSymbol(GV, RelocM);
+ bool UseGOT = getSubtarget()->GVIsIndirectSymbol(GV, RelocM);
if (UseGOT) {
HiFixup = AArch64II::MO_GOT;
@@ -1929,9 +1957,25 @@ AArch64TargetLowering::LowerGlobalAddressELF(SDValue Op,
return GlobalRef;
}
+SDValue
+AArch64TargetLowering::LowerGlobalAddressELF(SDValue Op,
+ SelectionDAG &DAG) const {
+ // TableGen doesn't have easy access to the CodeModel or RelocationModel, so
+ // we make those distinctions here.
+
+ switch (getTargetMachine().getCodeModel()) {
+ case CodeModel::Small:
+ return LowerGlobalAddressELFSmall(Op, DAG);
+ case CodeModel::Large:
+ return LowerGlobalAddressELFLarge(Op, DAG);
+ default:
+ llvm_unreachable("Only small and large code models supported now");
+ }
+}
+
SDValue AArch64TargetLowering::LowerTLSDescCall(SDValue SymAddr,
SDValue DescAddr,
- DebugLoc DL,
+ SDLoc DL,
SelectionDAG &DAG) const {
EVT PtrVT = getPointerTy();
@@ -1976,15 +2020,17 @@ SDValue AArch64TargetLowering::LowerTLSDescCall(SDValue SymAddr,
SDValue
AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
- assert(Subtarget->isTargetELF() &&
+ assert(getSubtarget()->isTargetELF() &&
"TLS not implemented for non-ELF targets");
+ assert(getTargetMachine().getCodeModel() == CodeModel::Small
+ && "TLS only supported in small memory model");
const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
SDValue TPOff;
EVT PtrVT = getPointerTy();
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
const GlobalValue *GV = GA->getGlobal();
SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
@@ -2085,21 +2131,34 @@ AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG,
SDValue
AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
- DebugLoc dl = JT->getDebugLoc();
+ SDLoc dl(JT);
+ EVT PtrVT = getPointerTy();
// When compiling PIC, jump tables get put in the code section so a static
// relocation-style is acceptable for both cases.
- return DAG.getNode(AArch64ISD::WrapperSmall, dl, getPointerTy(),
- DAG.getTargetJumpTable(JT->getIndex(), getPointerTy()),
- DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
- AArch64II::MO_LO12),
- DAG.getConstant(1, MVT::i32));
+ switch (getTargetMachine().getCodeModel()) {
+ case CodeModel::Small:
+ return DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT,
+ DAG.getTargetJumpTable(JT->getIndex(), PtrVT),
+ DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
+ AArch64II::MO_LO12),
+ DAG.getConstant(1, MVT::i32));
+ case CodeModel::Large:
+ return DAG.getNode(
+ AArch64ISD::WrapperLarge, dl, PtrVT,
+ DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G3),
+ DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G2_NC),
+ DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G1_NC),
+ DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G0_NC));
+ default:
+ llvm_unreachable("Only small and large code models supported now");
+ }
}
// (SELECT_CC lhs, rhs, iftrue, iffalse, condcode)
SDValue
AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
SDValue IfTrue = Op.getOperand(2);
@@ -2155,7 +2214,7 @@ AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
// (SELECT testbit, iftrue, iffalse)
SDValue
AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue TheBit = Op.getOperand(0);
SDValue IfTrue = Op.getOperand(1);
SDValue IfFalse = Op.getOperand(2);
@@ -2177,7 +2236,7 @@ AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// (SETCC lhs, rhs, condcode)
SDValue
AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
@@ -2236,7 +2295,7 @@ AArch64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
// We have to make sure we copy the entire structure: 8+8+8+4+4 = 32 bytes
// rather than just 8.
- return DAG.getMemcpy(Op.getOperand(0), Op.getDebugLoc(),
+ return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op),
Op.getOperand(1), Op.getOperand(2),
DAG.getConstant(32, MVT::i32), 8, false, false,
MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
@@ -2249,7 +2308,7 @@ AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
AArch64MachineFunctionInfo *FuncInfo
= MF.getInfo<AArch64MachineFunctionInfo>();
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
SDValue Chain = Op.getOperand(0);
SDValue VAList = Op.getOperand(1);
@@ -2348,7 +2407,7 @@ static SDValue PerformANDCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
EVT VT = N->getValueType(0);
// We're looking for an SRA/SHL pair which form an SBFX.
@@ -2386,7 +2445,7 @@ static SDValue PerformANDCombine(SDNode *N,
/// a compatible SHL operation (unless they're already low). This function
/// checks that condition and returns the least-significant bit that's
/// intended. If the operation not a field preparation, -1 is returned.
-static int32_t getLSBForBFI(SelectionDAG &DAG, DebugLoc DL, EVT VT,
+static int32_t getLSBForBFI(SelectionDAG &DAG, SDLoc DL, EVT VT,
SDValue &MaskedVal, uint64_t Mask) {
if (!isShiftedMask_64(Mask))
return -1;
@@ -2402,7 +2461,7 @@ static int32_t getLSBForBFI(SelectionDAG &DAG, DebugLoc DL, EVT VT,
// cases (e.g. bitfield to bitfield copy) may still need a real shift before
// the BFI.
- uint64_t LSB = CountTrailingZeros_64(Mask);
+ uint64_t LSB = countTrailingZeros(Mask);
int64_t ShiftRightRequired = LSB;
if (MaskedVal.getOpcode() == ISD::SHL &&
isa<ConstantSDNode>(MaskedVal.getOperand(1))) {
@@ -2462,7 +2521,7 @@ static SDValue tryCombineToBFI(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
SelectionDAG &DAG = DCI.DAG;
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
EVT VT = N->getValueType(0);
assert(N->getOpcode() == ISD::OR && "Unexpected root");
@@ -2543,7 +2602,7 @@ static SDValue tryCombineToLargerBFI(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
SelectionDAG &DAG = DCI.DAG;
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
EVT VT = N->getValueType(0);
// First job is to hunt for a MaskedBFI on either the left or right. Swap
@@ -2625,7 +2684,7 @@ static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
static SDValue tryCombineToEXTR(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
EVT VT = N->getValueType(0);
assert(N->getOpcode() == ISD::OR && "Unexpected root");
@@ -2697,7 +2756,7 @@ static SDValue PerformSRACombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
EVT VT = N->getValueType(0);
// We're looking for an SRA/SHL pair which form an SBFX.
@@ -2736,7 +2795,7 @@ AArch64TargetLowering::PerformDAGCombine(SDNode *N,
switch (N->getOpcode()) {
default: break;
case ISD::AND: return PerformANDCombine(N, DCI);
- case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
+ case ISD::OR: return PerformORCombine(N, DCI, getSubtarget());
case ISD::SRA: return PerformSRACombine(N, DCI);
}
return SDValue();
@@ -2837,7 +2896,7 @@ AArch64TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
case 'S': {
// An absolute symbolic address or label reference.
if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
- Result = DAG.getTargetGlobalAddress(GA->getGlobal(), Op.getDebugLoc(),
+ Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
GA->getValueType(0));
} else if (const BlockAddressSDNode *BA
= dyn_cast<BlockAddressSDNode>(Op)) {
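Apart from the new large-code-model lowering, most of this file's churn is the DebugLoc-to-SDLoc migration: node locations are now built from the node or SDValue being rewritten rather than read back through getDebugLoc(). A minimal in-tree sketch of that idiom for a unary node, assuming this era's SelectionDAG.h; copyWithNewOperand is an illustrative helper, not part of the patch.

#include "llvm/CodeGen/SelectionDAG.h"

// Rebuild a unary node with a replacement operand, taking its location from
// the value itself rather than via the old getDebugLoc() accessor.
static llvm::SDValue copyWithNewOperand(llvm::SelectionDAG &DAG,
                                        llvm::SDValue Old,
                                        llvm::SDValue NewOp) {
  llvm::SDLoc DL(Old); // was: DebugLoc DL = Old.getDebugLoc();
  return DAG.getNode(Old.getOpcode(), DL, Old.getValueType(), NewOp);
}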
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index 4960d28..edef68b 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -103,7 +103,12 @@ namespace AArch64ISD {
UBFX,
// Wraps an address which the ISelLowering phase has decided should be
- // created using the small absolute memory model: i.e. adrp/add or
+ // created using the large memory model style: i.e. a sequence of four
+ // movz/movk instructions.
+ WrapperLarge,
+
+ // Wraps an address which the ISelLowering phase has decided should be
+ // created using the small memory model style: i.e. adrp/add or
// adrp/mem-op. This exists to prevent bare TargetAddresses which may never
// get selected.
WrapperSmall
@@ -125,14 +130,14 @@ public:
SDValue LowerFormalArguments(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
SDValue LowerReturn(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc dl, SelectionDAG &DAG) const;
+ SDLoc dl, SelectionDAG &DAG) const;
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const;
@@ -140,11 +145,11 @@ public:
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
void SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG,
- DebugLoc DL, SDValue &Chain) const;
+ SDLoc DL, SDValue &Chain) const;
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
@@ -166,7 +171,7 @@ public:
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG,
MachineFrameInfo *MFI, int ClobberedFI) const;
- EVT getSetCCResultType(EVT VT) const;
+ EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const;
@@ -176,7 +181,7 @@ public:
bool isLegalICmpImmediate(int64_t Val) const;
SDValue getSelectableIntSetCC(SDValue LHS, SDValue RHS, ISD::CondCode CC,
- SDValue &A64cc, SelectionDAG &DAG, DebugLoc &dl) const;
+ SDValue &A64cc, SelectionDAG &DAG, SDLoc &dl) const;
virtual MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const;
@@ -206,8 +211,12 @@ public:
SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, bool IsSigned) const;
+
+ SDValue LowerGlobalAddressELFSmall(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddressELFLarge(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerTLSDescCall(SDValue SymAddr, SDValue DescAddr, DebugLoc DL,
+
+ SDValue LowerTLSDescCall(SDValue SymAddr, SDValue DescAddr, SDLoc DL,
SelectionDAG &DAG) const;
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool IsSigned) const;
@@ -238,9 +247,11 @@ public:
std::pair<unsigned, const TargetRegisterClass*>
getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const;
private:
- const AArch64Subtarget *Subtarget;
- const TargetRegisterInfo *RegInfo;
const InstrItineraryData *Itins;
+
+ const AArch64Subtarget *getSubtarget() const {
+ return &getTargetMachine().getSubtarget<AArch64Subtarget>();
+ }
};
} // namespace llvm
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index cf3a2c3..f90bcef 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -36,7 +36,7 @@ using namespace llvm;
AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
: AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP),
- RI(*this, STI), Subtarget(STI) {}
+ Subtarget(STI) {}
void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, DebugLoc DL,
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index e3b39ce..d2cfc7d 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -70,12 +70,20 @@ def A64cmn : PatFrag<(ops node:$lhs, node:$rhs),
// made for a variable/address at ISelLowering.
// + The output of ISelLowering should be selectable (hence the Wrapper,
// rather than a bare target opcode)
-def SDTAArch64Wrapper : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
- SDTCisSameAs<1, 2>,
- SDTCisVT<3, i32>,
- SDTCisPtrTy<0>]>;
+def SDTAArch64WrapperLarge : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisSameAs<0, 4>,
+ SDTCisPtrTy<0>]>;
-def A64WrapperSmall : SDNode<"AArch64ISD::WrapperSmall", SDTAArch64Wrapper>;
+def A64WrapperLarge :SDNode<"AArch64ISD::WrapperLarge", SDTAArch64WrapperLarge>;
+
+def SDTAArch64WrapperSmall : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<1, 2>,
+ SDTCisVT<3, i32>,
+ SDTCisPtrTy<0>]>;
+
+def A64WrapperSmall :SDNode<"AArch64ISD::WrapperSmall", SDTAArch64WrapperSmall>;
def SDTAArch64GOTLoad : SDTypeProfile<1, 1, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
@@ -3871,7 +3879,7 @@ multiclass movw_operands<string prefix, string instname, int width> {
let DiagnosticType = "MOVWUImm16";
}
- def _imm : Operand<i32> {
+ def _imm : Operand<i64> {
let ParserMatchClass = !cast<AsmOperandClass>(prefix # "_imm_asmoperand");
let PrintMethod = "printMoveWideImmOperand";
let EncoderMethod = "getMoveWideImmOpValue";
@@ -3942,7 +3950,7 @@ multiclass movalias_operand<string prefix, string basename,
# "A64Imms::" # immpredicate # ">";
}
- def _movimm : Operand<i32> {
+ def _movimm : Operand<i64> {
let ParserMatchClass = !cast<AsmOperandClass>(prefix # "_asmoperand");
let MIOperandInfo = (ops uimm16:$UImm16, imm:$Shift);
@@ -3966,6 +3974,15 @@ def : movalias<MOVZxii, GPR64, movz64_movimm>;
def : movalias<MOVNwii, GPR32, movn32_movimm>;
def : movalias<MOVNxii, GPR64, movn64_movimm>;
+def movw_addressref : ComplexPattern<i64, 2, "SelectMOVWAddressRef">;
+
+def : Pat<(A64WrapperLarge movw_addressref:$G3, movw_addressref:$G2,
+ movw_addressref:$G1, movw_addressref:$G0),
+ (MOVKxii (MOVKxii (MOVKxii (MOVZxii movw_addressref:$G3),
+ movw_addressref:$G2),
+ movw_addressref:$G1),
+ movw_addressref:$G0)>;
+
//===----------------------------------------------------------------------===//
// PC-relative addressing instructions
//===----------------------------------------------------------------------===//
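As context for the A64WrapperLarge selection pattern above (not part of the patch): a minimal standalone C++ sketch of how a 64-bit absolute address decomposes into the four 16-bit granules carried by the movz/movk sequence; the address value is hypothetical.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Addr = 0x0011223344556677ULL;   // hypothetical absolute address
  uint16_t G3 = (Addr >> 48) & 0xFFFF;     // movz xD, #:abs_g3:sym
  uint16_t G2 = (Addr >> 32) & 0xFFFF;     // movk xD, #:abs_g2_nc:sym
  uint16_t G1 = (Addr >> 16) & 0xFFFF;     // movk xD, #:abs_g1_nc:sym
  uint16_t G0 = (Addr >>  0) & 0xFFFF;     // movk xD, #:abs_g0_nc:sym
  uint64_t Rebuilt = ((uint64_t)G3 << 48) | ((uint64_t)G2 << 32) |
                     ((uint64_t)G1 << 16) | (uint64_t)G0;
  assert(Rebuilt == Addr && "the four granules cover the whole address");
  return 0;
}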
diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp
index c96bf85..3d22330 100644
--- a/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -68,6 +68,18 @@ AArch64AsmPrinter::lowerSymbolOperand(const MachineOperand &MO,
case AArch64II::MO_TPREL_G0_NC:
Expr = AArch64MCExpr::CreateTPREL_G0_NC(Expr, OutContext);
break;
+ case AArch64II::MO_ABS_G3:
+ Expr = AArch64MCExpr::CreateABS_G3(Expr, OutContext);
+ break;
+ case AArch64II::MO_ABS_G2_NC:
+ Expr = AArch64MCExpr::CreateABS_G2_NC(Expr, OutContext);
+ break;
+ case AArch64II::MO_ABS_G1_NC:
+ Expr = AArch64MCExpr::CreateABS_G1_NC(Expr, OutContext);
+ break;
+ case AArch64II::MO_ABS_G0_NC:
+ Expr = AArch64MCExpr::CreateABS_G0_NC(Expr, OutContext);
+ break;
case AArch64II::MO_NO_FLAG:
// Expr is already correct
break;
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 20b0dcf..75ec44f 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -29,9 +29,8 @@
using namespace llvm;
-AArch64RegisterInfo::AArch64RegisterInfo(const AArch64InstrInfo &tii,
- const AArch64Subtarget &sti)
- : AArch64GenRegisterInfo(AArch64::X30), TII(tii) {
+AArch64RegisterInfo::AArch64RegisterInfo()
+ : AArch64GenRegisterInfo(AArch64::X30) {
}
const uint16_t *
@@ -122,6 +121,8 @@ AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MBBI,
return;
}
+ const AArch64InstrInfo &TII =
+ *static_cast<const AArch64InstrInfo*>(MF.getTarget().getInstrInfo());
int MinOffset, MaxOffset, OffsetScale;
if (MI.getOpcode() == AArch64::ADDxxi_lsl0_s) {
MinOffset = 0;
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.h b/lib/Target/AArch64/AArch64RegisterInfo.h
index bb64fd5..4d67943 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -25,12 +25,7 @@ class AArch64InstrInfo;
class AArch64Subtarget;
struct AArch64RegisterInfo : public AArch64GenRegisterInfo {
-private:
- const AArch64InstrInfo &TII;
-
-public:
- AArch64RegisterInfo(const AArch64InstrInfo &tii,
- const AArch64Subtarget &sti);
+ AArch64RegisterInfo();
const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
const uint32_t *getCallPreservedMask(CallingConv::ID) const;
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td
index bd79546..cc2bb61 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -12,15 +12,15 @@
//===----------------------------------------------------------------------===//
let Namespace = "AArch64" in {
-def sub_128 : SubRegIndex;
-def sub_64 : SubRegIndex;
-def sub_32 : SubRegIndex;
-def sub_16 : SubRegIndex;
-def sub_8 : SubRegIndex;
+def sub_128 : SubRegIndex<128>;
+def sub_64 : SubRegIndex<64>;
+def sub_32 : SubRegIndex<32>;
+def sub_16 : SubRegIndex<16>;
+def sub_8 : SubRegIndex<8>;
// The VPR registers are handled as sub-registers of FPR equivalents, but
// they're really the same thing. We give this concept a special index.
-def sub_alias : SubRegIndex;
+def sub_alias : SubRegIndex<128>;
}
// Registers are identified with 5-bit ID numbers.
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index df599d5..f1695e2 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -38,6 +38,7 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, StringRef TT,
TLInfo(*this),
TSInfo(*this),
FrameLowering(Subtarget) {
+ initAsmInfo();
}
namespace {
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index 12c1b8f..1c397b5 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -208,7 +208,7 @@ DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size,
uint8_t bytes[4];
// We want to read exactly 4 bytes of data.
- if (Region.readBytes(Address, 4, (uint8_t*)bytes, NULL) == -1) {
+ if (Region.readBytes(Address, 4, bytes) == -1) {
Size = 0;
return MCDisassembler::Fail;
}
diff --git a/lib/Target/AArch64/LLVMBuild.txt b/lib/Target/AArch64/LLVMBuild.txt
index 3b296fd..6e4ce8b 100644
--- a/lib/Target/AArch64/LLVMBuild.txt
+++ b/lib/Target/AArch64/LLVMBuild.txt
@@ -25,7 +25,7 @@ parent = Target
has_asmparser = 1
has_asmprinter = 1
has_disassembler = 1
-;has_jit = 1
+has_jit = 1
[component_1]
type = Library
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
index c0e3b29..d9798ae 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -133,6 +133,26 @@ public:
return Create(VK_AARCH64_TPREL_G0_NC, Expr, Ctx);
}
+ static const AArch64MCExpr *CreateABS_G3(const MCExpr *Expr,
+ MCContext &Ctx) {
+ return Create(VK_AARCH64_ABS_G3, Expr, Ctx);
+ }
+
+ static const AArch64MCExpr *CreateABS_G2_NC(const MCExpr *Expr,
+ MCContext &Ctx) {
+ return Create(VK_AARCH64_ABS_G2_NC, Expr, Ctx);
+ }
+
+ static const AArch64MCExpr *CreateABS_G1_NC(const MCExpr *Expr,
+ MCContext &Ctx) {
+ return Create(VK_AARCH64_ABS_G1_NC, Expr, Ctx);
+ }
+
+ static const AArch64MCExpr *CreateABS_G0_NC(const MCExpr *Expr,
+ MCContext &Ctx) {
+ return Create(VK_AARCH64_ABS_G0_NC, Expr, Ctx);
+ }
+
/// @}
/// @name Accessors
/// @{
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index 7960db0..48d4819 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -57,13 +57,14 @@ static MCRegisterInfo *createAArch64MCRegisterInfo(StringRef Triple) {
return X;
}
-static MCAsmInfo *createAArch64MCAsmInfo(const Target &T, StringRef TT) {
+static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI,
+ StringRef TT) {
Triple TheTriple(TT);
MCAsmInfo *MAI = new AArch64ELFMCAsmInfo();
- MachineLocation Dst(MachineLocation::VirtualFP);
- MachineLocation Src(AArch64::XSP, 0);
- MAI->addInitialFrameState(0, Dst, Src);
+ unsigned Reg = MRI.getDwarfRegNum(AArch64::XSP, true);
+ MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(0, Reg, 0);
+ MAI->addInitialFrameState(Inst);
return MAI;
}
@@ -81,6 +82,12 @@ static MCCodeGenInfo *createAArch64MCCodeGenInfo(StringRef TT, Reloc::Model RM,
if (CM == CodeModel::Default)
CM = CodeModel::Small;
+ else if (CM == CodeModel::JITDefault) {
+ // The default MCJIT memory managers make no guarantees about where they can
+ // find an executable page; JITed code needs to be able to refer to globals
+ // no matter how far away they are.
+ CM = CodeModel::Large;
+ }
X->InitMCCodeGenInfo(RM, CM, OL);
return X;
@@ -129,17 +136,17 @@ public:
return MCInstrAnalysis::isConditionalBranch(Inst);
}
- uint64_t evaluateBranch(const MCInst &Inst, uint64_t Addr,
- uint64_t Size) const {
+ bool evaluateBranch(const MCInst &Inst, uint64_t Addr,
+ uint64_t Size, uint64_t &Target) const {
unsigned LblOperand = Inst.getOpcode() == AArch64::Bcc ? 1 : 0;
// FIXME: We only handle PCRel branches for now.
if (Info->get(Inst.getOpcode()).OpInfo[LblOperand].OperandType
!= MCOI::OPERAND_PCREL)
- return -1ULL;
+ return false;
int64_t Imm = Inst.getOperand(LblOperand).getImm();
-
- return Addr + Imm;
+ Target = Addr + Imm;
+ return true;
}
};
diff --git a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
index b8099cb..377b533 100644
--- a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
+++ b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
@@ -19,6 +19,6 @@ using namespace llvm;
Target llvm::TheAArch64Target;
extern "C" void LLVMInitializeAArch64TargetInfo() {
- RegisterTarget<Triple::aarch64>
- X(TheAArch64Target, "aarch64", "AArch64");
+ RegisterTarget<Triple::aarch64, /*HasJIT=*/true>
+ X(TheAArch64Target, "aarch64", "AArch64 (ARM 64-bit target)");
}
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
index bedccb5..79865f6 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -972,7 +972,7 @@ bool A64Imms::isLogicalImm(unsigned RegWidth, uint64_t Imm, uint32_t &Bits) {
// Now we have to work out the amount of rotation needed. The first part of
// this calculation is actually independent of RepeatWidth, but the complex
// case will depend on it.
- Rotation = CountTrailingZeros_64(Imm);
+ Rotation = countTrailingZeros(Imm);
if (Rotation == 0) {
// There were no leading zeros, which means it's either in place or there
// are 1s at each end (e.g. 0x8003 needs rotating).
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index 1b773d6..9a1ca61 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -1037,7 +1037,14 @@ namespace AArch64II {
// MO_LO12 - On a symbol operand, this represents a relocation containing
// lower 12 bits of the address. Used in add/sub/ldr/str.
- MO_LO12
+ MO_LO12,
+
+ // MO_ABS_G* - Represent the 16-bit granules of an absolute reference using
+ // movz/movk instructions.
+ MO_ABS_G3,
+ MO_ABS_G2_NC,
+ MO_ABS_G1_NC,
+ MO_ABS_G0_NC
};
}
diff --git a/lib/Target/AArch64/Utils/CMakeLists.txt b/lib/Target/AArch64/Utils/CMakeLists.txt
index 2c28348..2348e44 100644
--- a/lib/Target/AArch64/Utils/CMakeLists.txt
+++ b/lib/Target/AArch64/Utils/CMakeLists.txt
@@ -3,3 +3,5 @@ include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/
add_llvm_library(LLVMAArch64Utils
AArch64BaseInfo.cpp
)
+
+add_dependencies(LLVMAArch64Utils AArch64CommonTableGen)
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index 2d747091..1bc9d6b 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -38,7 +38,8 @@ def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
def FeatureThumb2 : SubtargetFeature<"thumb2", "HasThumb2", "true",
"Enable Thumb2 instructions">;
def FeatureNoARM : SubtargetFeature<"noarm", "NoARM", "true",
- "Does not support ARM mode execution">;
+ "Does not support ARM mode execution",
+ [ModeThumb]>;
def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true",
"Enable half-precision floating point">;
def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true",
@@ -59,6 +60,8 @@ def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true",
"FP compare + branch is slow">;
def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true",
"Floating point unit supports single precision only">;
+def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
+ "Enable support for Performance Monitor extensions">;
def FeatureTrustZone : SubtargetFeature<"trustzone", "HasTrustZone", "true",
"Enable support for TrustZone security extensions">;
@@ -134,7 +137,7 @@ def HasV6T2Ops : SubtargetFeature<"v6t2", "HasV6T2Ops", "true",
[HasV6Ops, FeatureThumb2]>;
def HasV7Ops : SubtargetFeature<"v7", "HasV7Ops", "true",
"Support ARM v7 instructions",
- [HasV6T2Ops]>;
+ [HasV6T2Ops, FeaturePerfMon]>;
//===----------------------------------------------------------------------===//
// ARM Processors supported.
@@ -175,7 +178,8 @@ def ProcA15 : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15",
FeatureTrustZone]>;
def ProcR5 : SubtargetFeature<"r5", "ARMProcFamily", "CortexR5",
"Cortex-R5 ARM processors",
- [FeatureSlowFPBrcc, FeatureHWDivARM,
+ [FeatureSlowFPBrcc,
+ FeatureHWDiv, FeatureHWDivARM,
FeatureHasSlowFPVMLx,
FeatureAvoidPartialCPSR,
FeatureT2XtPk]>;
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 0d1417d..ad14475 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -113,8 +113,7 @@ ScheduleHazardRecognizer *ARMBaseInstrInfo::
CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
const ScheduleDAG *DAG) const {
if (Subtarget.isThumb2() || Subtarget.hasVFP2())
- return (ScheduleHazardRecognizer *)
- new ARMHazardRecognizer(II, *this, getRegisterInfo(), Subtarget, DAG);
+ return (ScheduleHazardRecognizer *)new ARMHazardRecognizer(II, DAG);
return TargetInstrInfo::CreateTargetPostRAHazardRecognizer(II, DAG);
}
@@ -283,14 +282,20 @@ ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
return false;
--I;
}
- if (!isUnpredicatedTerminator(I))
- return false;
// Get the last instruction in the block.
MachineInstr *LastInst = I;
+ unsigned LastOpc = LastInst->getOpcode();
+
+ // Check if it's an indirect branch first; this should return 'unanalyzable'
+ // even if it's predicated.
+ if (isIndirectBranchOpcode(LastOpc))
+ return true;
+
+ if (!isUnpredicatedTerminator(I))
+ return false;
// If there is only one terminator instruction, process it.
- unsigned LastOpc = LastInst->getOpcode();
if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
if (isUncondBranchOpcode(LastOpc)) {
TBB = LastInst->getOperand(0).getMBB();
@@ -4146,6 +4151,8 @@ bool ARMBaseInstrInfo::hasNOP() const {
}
bool ARMBaseInstrInfo::isSwiftFastImmShift(const MachineInstr *MI) const {
+ if (MI->getNumOperands() < 4)
+ return true;
unsigned ShOpVal = MI->getOperand(3).getImm();
unsigned ShImm = ARM_AM::getSORegOffset(ShOpVal);
// Swift supports faster shifts for: lsl 2, lsl 1, and lsr 1.
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index 2ef659c..4ca3d7b 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -46,7 +46,7 @@ public:
MachineBasicBlock::iterator &MBBI,
LiveVariables *LV) const;
- virtual const ARMBaseRegisterInfo &getRegisterInfo() const =0;
+ virtual const ARMBaseRegisterInfo &getRegisterInfo() const = 0;
const ARMSubtarget &getSubtarget() const { return Subtarget; }
ScheduleHazardRecognizer *
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index b0d34a7..7c03055 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -43,9 +43,8 @@
using namespace llvm;
-ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii,
- const ARMSubtarget &sti)
- : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC), TII(tii), STI(sti),
+ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMSubtarget &sti)
+ : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC), STI(sti),
FramePtr((STI.isTargetDarwin() || STI.isThumb()) ? ARM::R7 : ARM::R11),
BasePtr(ARM::R6) {
}
@@ -94,6 +93,7 @@ getReservedRegs(const MachineFunction &MF) const {
Reserved.set(ARM::SP);
Reserved.set(ARM::PC);
Reserved.set(ARM::FPSCR);
+ Reserved.set(ARM::APSR_NZCV);
if (TFI->hasFP(MF))
Reserved.set(FramePtr);
if (hasBasePointer(MF))
@@ -375,6 +375,7 @@ emitLoadConstPool(MachineBasicBlock &MBB,
ARMCC::CondCodes Pred,
unsigned PredReg, unsigned MIFlags) const {
MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
MachineConstantPool *ConstantPool = MF.getConstantPool();
const Constant *C =
ConstantInt::get(Type::getInt32Ty(MF.getFunction()->getContext()), Val);
@@ -556,9 +557,10 @@ materializeFrameBaseRegister(MachineBasicBlock *MBB,
if (Ins != MBB->end())
DL = Ins->getDebugLoc();
- const MCInstrDesc &MCID = TII.get(ADDriOpc);
- MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
const MachineFunction &MF = *MBB->getParent();
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+ const MCInstrDesc &MCID = TII.get(ADDriOpc);
MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this, MF));
MachineInstrBuilder MIB = AddDefaultPred(BuildMI(*MBB, Ins, DL, MCID, BaseReg)
@@ -574,6 +576,8 @@ ARMBaseRegisterInfo::resolveFrameIndex(MachineBasicBlock::iterator I,
MachineInstr &MI = *I;
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
+ const ARMBaseInstrInfo &TII =
+ *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
int Off = Offset; // ARM doesn't need the general 64-bit offsets
unsigned i = 0;
@@ -671,6 +675,8 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MachineInstr &MI = *II;
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
+ const ARMBaseInstrInfo &TII =
+ *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
const ARMFrameLowering *TFI =
static_cast<const ARMFrameLowering*>(MF.getTarget().getFrameLowering());
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
index 0679919..03b3682 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -74,7 +74,6 @@ static inline bool isARMArea3Register(unsigned Reg, bool isIOS) {
class ARMBaseRegisterInfo : public ARMGenRegisterInfo {
protected:
- const ARMBaseInstrInfo &TII;
const ARMSubtarget &STI;
/// FramePtr - ARM physical register used as frame ptr.
@@ -86,8 +85,7 @@ protected:
unsigned BasePtr;
// Can be only subclassed.
- explicit ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii,
- const ARMSubtarget &STI);
+ explicit ARMBaseRegisterInfo(const ARMSubtarget &STI);
// Return the opcode that implements 'Op', or 0 if no opcode
unsigned getOpcode(int Op) const;
diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp
index 95decfe..4a157d7 100644
--- a/lib/Target/ARM/ARMCodeEmitter.cpp
+++ b/lib/Target/ARM/ARMCodeEmitter.cpp
@@ -1137,8 +1137,8 @@ void ARMCodeEmitter::emitDataProcessingInstruction(const MachineInstr &MI,
return;
} else if ((MCID.Opcode == ARM::BFC) || (MCID.Opcode == ARM::BFI)) {
uint32_t v = ~MI.getOperand(2).getImm();
- int32_t lsb = CountTrailingZeros_32(v);
- int32_t msb = (32 - CountLeadingZeros_32(v)) - 1;
+ int32_t lsb = countTrailingZeros(v);
+ int32_t msb = (32 - countLeadingZeros(v)) - 1;
// Instr{20-16} = msb, Instr{11-7} = lsb
Binary |= (msb & 0x1F) << 16;
Binary |= (lsb & 0x1F) << 7;
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index 4891609..cff5ce2 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -128,7 +128,7 @@ namespace {
// If the block size isn't a multiple of the known bits, assume the
// worst case padding.
if (Size & ((1u << Bits) - 1))
- Bits = CountTrailingZeros_32(Size);
+ Bits = countTrailingZeros(Size);
return Bits;
}
@@ -753,6 +753,7 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
Scale = 4;
break;
+ case ARM::LDRBi12:
case ARM::LDRi12:
case ARM::LDRcp:
case ARM::t2LDRpci:
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index 5d45f64..a4de941 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -20,6 +20,7 @@
#include "ARMSubtarget.h"
#include "ARMTargetMachine.h"
#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
@@ -41,6 +42,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
@@ -1025,7 +1027,7 @@ bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
useAM3 = true;
}
}
- RC = &ARM::GPRRegClass;
+ RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
break;
case MVT::i16:
if (Alignment && Alignment < 2 && !Subtarget->allowsUnalignedMem())
@@ -1040,7 +1042,7 @@ bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
Opc = isZExt ? ARM::LDRH : ARM::LDRSH;
useAM3 = true;
}
- RC = &ARM::GPRRegClass;
+ RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
break;
case MVT::i32:
if (Alignment && Alignment < 4 && !Subtarget->allowsUnalignedMem())
@@ -1054,7 +1056,7 @@ bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
} else {
Opc = ARM::LDRi12;
}
- RC = &ARM::GPRRegClass;
+ RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
break;
case MVT::f32:
if (!Subtarget->hasVFP2()) return false;
@@ -1063,7 +1065,7 @@ bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
needVMOV = true;
VT = MVT::i32;
Opc = isThumb2 ? ARM::t2LDRi12 : ARM::LDRi12;
- RC = &ARM::GPRRegClass;
+ RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
} else {
Opc = ARM::VLDRS;
RC = TLI.getRegClassFor(VT);
@@ -1802,7 +1804,7 @@ bool ARMFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
unsigned SrcReg2 = getRegForValue(I->getOperand(1));
if (SrcReg2 == 0) return false;
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(MVT::i32));
+ unsigned ResultReg = createResultReg(&ARM::GPRnopcRegClass);
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
TII.get(Opc), ResultReg)
.addReg(SrcReg1).addReg(SrcReg2));
@@ -1985,7 +1987,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
case CCValAssign::ZExt: {
MVT DestVT = VA.getLocVT();
Arg = ARMEmitIntExt(ArgVT, Arg, DestVT, /*isZExt*/true);
- assert (Arg != 0 && "Failed to emit a sext");
+ assert (Arg != 0 && "Failed to emit a zext");
ArgVT = DestVT;
break;
}
@@ -2602,47 +2604,112 @@ unsigned ARMFastISel::ARMEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
bool isZExt) {
if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8)
return 0;
+ if (SrcVT != MVT::i16 && SrcVT != MVT::i8 && SrcVT != MVT::i1)
+ return 0;
- unsigned Opc;
- bool isBoolZext = false;
- const TargetRegisterClass *RC;
- switch (SrcVT.SimpleTy) {
- default: return 0;
- case MVT::i16:
- if (!Subtarget->hasV6Ops()) return 0;
- RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
- if (isZExt)
- Opc = isThumb2 ? ARM::t2UXTH : ARM::UXTH;
- else
- Opc = isThumb2 ? ARM::t2SXTH : ARM::SXTH;
- break;
- case MVT::i8:
- if (!Subtarget->hasV6Ops()) return 0;
- RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
- if (isZExt)
- Opc = isThumb2 ? ARM::t2UXTB : ARM::UXTB;
- else
- Opc = isThumb2 ? ARM::t2SXTB : ARM::SXTB;
- break;
- case MVT::i1:
- if (isZExt) {
- RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass;
- Opc = isThumb2 ? ARM::t2ANDri : ARM::ANDri;
- isBoolZext = true;
- break;
+ // Table of which combinations can be emitted as a single instruction,
+ // and which will require two.
+ static const uint8_t isSingleInstrTbl[3][2][2][2] = {
+ // ARM Thumb
+ // !hasV6Ops hasV6Ops !hasV6Ops hasV6Ops
+ // ext: s z s z s z s z
+ /* 1 */ { { { 0, 1 }, { 0, 1 } }, { { 0, 0 }, { 0, 1 } } },
+ /* 8 */ { { { 0, 1 }, { 1, 1 } }, { { 0, 0 }, { 1, 1 } } },
+ /* 16 */ { { { 0, 0 }, { 1, 1 } }, { { 0, 0 }, { 1, 1 } } }
+ };
+
+ // Target register constraints:
+ // - For ARM, the destination can never be PC.
+ // - For 16-bit Thumb, it is restricted to the lower 8 registers.
+ // - For 32-bit Thumb, it is restricted to non-SP and non-PC.
+ static const TargetRegisterClass *RCTbl[2][2] = {
+ // Instructions: Two Single
+ /* ARM */ { &ARM::GPRnopcRegClass, &ARM::GPRnopcRegClass },
+ /* Thumb */ { &ARM::tGPRRegClass, &ARM::rGPRRegClass }
+ };
+
+ // Table governing the instruction(s) to be emitted.
+ static const struct {
+ // First entry for each of the following is sext, second zext.
+ uint16_t Opc[2];
+ uint8_t Imm[2]; // All instructions have either a shift or a mask.
+ uint8_t hasS[2]; // Some instructions have an S bit; always set it to 0.
+ } OpcTbl[2][2][3] = {
+ { // Two instructions (first is left shift, second is in this table).
+ { // ARM
+ /* 1 */ { { ARM::ASRi, ARM::LSRi }, { 31, 31 }, { 1, 1 } },
+ /* 8 */ { { ARM::ASRi, ARM::LSRi }, { 24, 24 }, { 1, 1 } },
+ /* 16 */ { { ARM::ASRi, ARM::LSRi }, { 16, 16 }, { 1, 1 } }
+ },
+ { // Thumb
+ /* 1 */ { { ARM::tASRri, ARM::tLSRri }, { 31, 31 }, { 0, 0 } },
+ /* 8 */ { { ARM::tASRri, ARM::tLSRri }, { 24, 24 }, { 0, 0 } },
+ /* 16 */ { { ARM::tASRri, ARM::tLSRri }, { 16, 16 }, { 0, 0 } }
+ }
+ },
+ { // Single instruction.
+ { // ARM
+ /* 1 */ { { ARM::KILL, ARM::ANDri }, { 0, 1 }, { 0, 1 } },
+ /* 8 */ { { ARM::SXTB, ARM::ANDri }, { 0, 255 }, { 0, 1 } },
+ /* 16 */ { { ARM::SXTH, ARM::UXTH }, { 0, 0 }, { 0, 0 } }
+ },
+ { // Thumb
+ /* 1 */ { { ARM::KILL, ARM::t2ANDri }, { 0, 1 }, { 0, 1 } },
+ /* 8 */ { { ARM::t2SXTB, ARM::t2ANDri }, { 0, 255 }, { 0, 1 } },
+ /* 16 */ { { ARM::t2SXTH, ARM::t2UXTH }, { 0, 0 }, { 0, 0 } }
+ }
}
- return 0;
+ };
+
+ unsigned SrcBits = SrcVT.getSizeInBits();
+ unsigned DestBits = DestVT.getSizeInBits();
+ (void) DestBits;
+ assert((SrcBits < DestBits) && "can only extend to larger types");
+ assert((DestBits == 32 || DestBits == 16 || DestBits == 8) &&
+ "other sizes unimplemented");
+ assert((SrcBits == 16 || SrcBits == 8 || SrcBits == 1) &&
+ "other sizes unimplemented");
+
+ bool hasV6Ops = Subtarget->hasV6Ops();
+ unsigned Bitness = countTrailingZeros(SrcBits) >> 1; // {1,8,16}=>{0,1,2}
+ assert((Bitness < 3) && "sanity-check table bounds");
+
+ bool isSingleInstr = isSingleInstrTbl[Bitness][isThumb2][hasV6Ops][isZExt];
+ const TargetRegisterClass *RC = RCTbl[isThumb2][isSingleInstr];
+ unsigned Opc = OpcTbl[isSingleInstr][isThumb2][Bitness].Opc[isZExt];
+ assert(ARM::KILL != Opc && "Invalid table entry");
+ unsigned Imm = OpcTbl[isSingleInstr][isThumb2][Bitness].Imm[isZExt];
+ unsigned hasS = OpcTbl[isSingleInstr][isThumb2][Bitness].hasS[isZExt];
+
+ // 16-bit Thumb instructions always set CPSR (unless they're in an IT block).
+ bool setsCPSR = &ARM::tGPRRegClass == RC;
+ unsigned LSLOpc = isThumb2 ? ARM::tLSLri : ARM::LSLi;
+ unsigned ResultReg;
+
+ // Either one or two instructions are emitted.
+ // They're always of the form:
+ // dst = in OP imm
+ // CPSR is set only by 16-bit Thumb instructions.
+ // Predicate, if any, is AL.
+ // S bit, if available, is always 0.
+ // When two are emitted, the first's result feeds the second's input;
+ // that intermediate value is then dead.
+ unsigned NumInstrsEmitted = isSingleInstr ? 1 : 2;
+ for (unsigned Instr = 0; Instr != NumInstrsEmitted; ++Instr) {
+ ResultReg = createResultReg(RC);
+ unsigned Opcode = ((0 == Instr) && !isSingleInstr) ? LSLOpc : Opc;
+ bool isKill = 1 == Instr;
+ MachineInstrBuilder MIB = BuildMI(
+ *FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opcode), ResultReg);
+ if (setsCPSR)
+ MIB.addReg(ARM::CPSR, RegState::Define);
+ AddDefaultPred(MIB.addReg(SrcReg, isKill * RegState::Kill).addImm(Imm));
+ if (hasS)
+ AddDefaultCC(MIB);
+ // Second instruction consumes the first's result.
+ SrcReg = ResultReg;
}
- unsigned ResultReg = createResultReg(RC);
- MachineInstrBuilder MIB;
- MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
- .addReg(SrcReg);
- if (isBoolZext)
- MIB.addImm(1);
- else
- MIB.addImm(0);
- AddOptionalDefs(MIB);
return ResultReg;
}
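A minimal standalone sketch (not part of the patch) of the index arithmetic the new ARMEmitIntExt tables rely on: countTrailingZeros maps the source width {1, 8, 16} to {0, 3, 4}, and shifting right by one yields the table row {0, 1, 2}.

#include <cassert>

// Stand-in for llvm::countTrailingZeros on a nonzero value.
static unsigned ctz32(unsigned V) {
  unsigned N = 0;
  while (!(V & 1)) { V >>= 1; ++N; }
  return N;
}

int main() {
  const unsigned Widths[] = {1, 8, 16};
  const unsigned Rows[]   = {0, 1, 2};
  for (int i = 0; i != 3; ++i) {
    unsigned Bitness = ctz32(Widths[i]) >> 1; // same formula as the patch
    assert(Bitness == Rows[i]);
  }
  return 0;
}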
@@ -2707,7 +2774,7 @@ bool ARMFastISel::SelectShift(const Instruction *I,
if (Reg2 == 0) return false;
}
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(MVT::i32));
+ unsigned ResultReg = createResultReg(&ARM::GPRnopcRegClass);
if(ResultReg == 0) return false;
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
@@ -2797,6 +2864,25 @@ bool ARMFastISel::TargetSelectInstruction(const Instruction *I) {
return false;
}
+namespace {
+// This table describes sign- and zero-extend instructions which can be
+// folded into a preceding load. All of these extends have an immediate
+// (sometimes a mask and sometimes a shift) that's applied after
+// extension.
+const struct FoldableLoadExtendsStruct {
+ uint16_t Opc[2]; // ARM, Thumb.
+ uint8_t ExpectedImm;
+ uint8_t isZExt : 1;
+ uint8_t ExpectedVT : 7;
+} FoldableLoadExtends[] = {
+ { { ARM::SXTH, ARM::t2SXTH }, 0, 0, MVT::i16 },
+ { { ARM::UXTH, ARM::t2UXTH }, 0, 1, MVT::i16 },
+ { { ARM::ANDri, ARM::t2ANDri }, 255, 1, MVT::i8 },
+ { { ARM::SXTB, ARM::t2SXTB }, 0, 0, MVT::i8 },
+ { { ARM::UXTB, ARM::t2UXTB }, 0, 1, MVT::i8 }
+};
+}
+
/// \brief The specified machine instr operand is a vreg, and that
/// vreg is being provided by the specified load instruction. If possible,
/// try to fold the load as an operand to the instruction, returning true if
@@ -2812,26 +2898,23 @@ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
// ldrb r1, [r0] ldrb r1, [r0]
// uxtb r2, r1 =>
// mov r3, r2 mov r3, r1
- bool isZExt = true;
- switch(MI->getOpcode()) {
- default: return false;
- case ARM::SXTH:
- case ARM::t2SXTH:
- isZExt = false;
- case ARM::UXTH:
- case ARM::t2UXTH:
- if (VT != MVT::i16)
- return false;
- break;
- case ARM::SXTB:
- case ARM::t2SXTB:
- isZExt = false;
- case ARM::UXTB:
- case ARM::t2UXTB:
- if (VT != MVT::i8)
- return false;
- break;
+ if (MI->getNumOperands() < 3 || !MI->getOperand(2).isImm())
+ return false;
+ const uint64_t Imm = MI->getOperand(2).getImm();
+
+ bool Found = false;
+ bool isZExt;
+ for (unsigned i = 0, e = array_lengthof(FoldableLoadExtends);
+ i != e; ++i) {
+ if (FoldableLoadExtends[i].Opc[isThumb2] == MI->getOpcode() &&
+ (uint64_t)FoldableLoadExtends[i].ExpectedImm == Imm &&
+ MVT((MVT::SimpleValueType)FoldableLoadExtends[i].ExpectedVT) == VT) {
+ Found = true;
+ isZExt = FoldableLoadExtends[i].isZExt;
+ }
}
+ if (!Found) return false;
+
// See if we can handle this address.
Address Addr;
if (!ARMComputeAddress(LI->getOperand(0), Addr)) return false;
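A minimal standalone sketch (not part of the patch) of the matching criteria the FoldableLoadExtends table encodes: an extend is foldable only when its opcode (picked by the ARM/Thumb2 column), its immediate operand, and the loaded value type all agree with one table row. The opcode numbers and the candidate instruction below are hypothetical stand-ins, not real LLVM opcode values.

#include <cassert>
#include <cstdint>

struct FoldableExt {
  uint16_t Opc[2];     // [0] = ARM opcode, [1] = Thumb2 opcode (hypothetical ids)
  uint8_t ExpectedImm; // mask or shift the extend must carry
  bool isZExt;
  unsigned Bits;       // width of the loaded value
};

static const FoldableExt Table[] = {
  {{10, 20}, 0,   false, 16}, // stands in for SXTH  / t2SXTH
  {{11, 21}, 0,   true,  16}, // stands in for UXTH  / t2UXTH
  {{12, 22}, 255, true,   8}, // stands in for ANDri / t2ANDri (zext i8 as a mask)
  {{13, 23}, 0,   false,  8}, // stands in for SXTB  / t2SXTB
  {{14, 24}, 0,   true,   8}, // stands in for UXTB  / t2UXTB
};

int main() {
  bool IsThumb2 = true;
  unsigned Opcode = 22, Imm = 255, Bits = 8; // hypothetical candidate: t2ANDri #255 on an i8 load
  bool Found = false, isZExt = false;
  for (const FoldableExt &E : Table)
    if (E.Opc[IsThumb2] == Opcode && E.ExpectedImm == Imm && E.Bits == Bits) {
      Found = true;
      isZExt = E.isZExt;
    }
  assert(Found && isZExt);
  return 0;
}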
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index 483802b..c8637be 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -141,7 +141,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
assert(!AFI->isThumb1OnlyFunction() &&
"This emitPrologue does not support Thumb1!");
bool isARM = !AFI->isThumbFunction();
- unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
+ unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
+ unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align);
unsigned NumBytes = MFI->getStackSize();
const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
@@ -357,7 +358,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
"This emitEpilogue does not support Thumb1!");
bool isARM = !AFI->isThumbFunction();
- unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
+ unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
+ unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align);
int NumBytes = (int)MFI->getStackSize();
unsigned FramePtr = RegInfo->getFrameRegister(MF);
diff --git a/lib/Target/ARM/ARMHazardRecognizer.cpp b/lib/Target/ARM/ARMHazardRecognizer.cpp
index 1240169..c69d313 100644
--- a/lib/Target/ARM/ARMHazardRecognizer.cpp
+++ b/lib/Target/ARM/ARMHazardRecognizer.cpp
@@ -44,10 +44,16 @@ ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
if (LastMI && (MCID.TSFlags & ARMII::DomainMask) != ARMII::DomainGeneral) {
MachineInstr *DefMI = LastMI;
const MCInstrDesc &LastMCID = LastMI->getDesc();
+ const TargetMachine &TM =
+ MI->getParent()->getParent()->getTarget();
+ const ARMBaseInstrInfo &TII =
+ *static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo());
+
// Skip over one non-VFP / NEON instruction.
if (!LastMI->isBarrier() &&
// On A9, AGU and NEON/FPU are muxed.
- !(STI.isLikeA9() && (LastMI->mayLoad() || LastMI->mayStore())) &&
+ !(TII.getSubtarget().isLikeA9() &&
+ (LastMI->mayLoad() || LastMI->mayStore())) &&
(LastMCID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) {
MachineBasicBlock::iterator I = LastMI;
if (I != LastMI->getParent()->begin()) {
@@ -58,7 +64,7 @@ ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
if (TII.isFpMLxInstruction(DefMI->getOpcode()) &&
(TII.canCauseFpMLxStall(MI->getOpcode()) ||
- hasRAWHazard(DefMI, MI, TRI))) {
+ hasRAWHazard(DefMI, MI, TII.getRegisterInfo()))) {
// Try to schedule another instruction for the next 4 cycles.
if (FpMLxStalls == 0)
FpMLxStalls = 4;
diff --git a/lib/Target/ARM/ARMHazardRecognizer.h b/lib/Target/ARM/ARMHazardRecognizer.h
index 98bfc4c..e1dcec3 100644
--- a/lib/Target/ARM/ARMHazardRecognizer.h
+++ b/lib/Target/ARM/ARMHazardRecognizer.h
@@ -28,21 +28,14 @@ class MachineInstr;
/// ARM preRA scheduler uses an unspecialized instance of the
/// ScoreboardHazardRecognizer.
class ARMHazardRecognizer : public ScoreboardHazardRecognizer {
- const ARMBaseInstrInfo &TII;
- const ARMBaseRegisterInfo &TRI;
- const ARMSubtarget &STI;
-
MachineInstr *LastMI;
unsigned FpMLxStalls;
public:
ARMHazardRecognizer(const InstrItineraryData *ItinData,
- const ARMBaseInstrInfo &tii,
- const ARMBaseRegisterInfo &tri,
- const ARMSubtarget &sti,
- const ScheduleDAG *DAG) :
- ScoreboardHazardRecognizer(ItinData, DAG, "post-RA-sched"), TII(tii),
- TRI(tri), STI(sti), LastMI(0) {}
+ const ScheduleDAG *DAG)
+ : ScoreboardHazardRecognizer(ItinData, DAG, "post-RA-sched"),
+ LastMI(0) {}
virtual HazardType getHazardType(SUnit *SU, int Stalls);
virtual void Reset();
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 9e1782e..962368d 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -364,7 +364,7 @@ void ARMDAGToDAGISel::PreprocessISelDAG() {
continue;
// Check if the AND mask is an immediate of the form: 000.....1111111100
- unsigned TZ = CountTrailingZeros_32(And_imm);
+ unsigned TZ = countTrailingZeros(And_imm);
if (TZ != 1 && TZ != 2)
// Be conservative here. Shifter operands aren't always free. e.g. On
// Swift, left shifter operand of 1 / 2 for free but others are not.
@@ -402,12 +402,12 @@ void ARMDAGToDAGISel::PreprocessISelDAG() {
}
// Now make the transformation.
- Srl = CurDAG->getNode(ISD::SRL, Srl.getDebugLoc(), MVT::i32,
+ Srl = CurDAG->getNode(ISD::SRL, SDLoc(Srl), MVT::i32,
Srl.getOperand(0),
CurDAG->getConstant(Srl_imm+TZ, MVT::i32));
- N1 = CurDAG->getNode(ISD::AND, N1.getDebugLoc(), MVT::i32,
+ N1 = CurDAG->getNode(ISD::AND, SDLoc(N1), MVT::i32,
Srl, CurDAG->getConstant(And_imm, MVT::i32));
- N1 = CurDAG->getNode(ISD::SHL, N1.getDebugLoc(), MVT::i32,
+ N1 = CurDAG->getNode(ISD::SHL, SDLoc(N1), MVT::i32,
N1, CurDAG->getConstant(TZ, MVT::i32));
CurDAG->UpdateNodeOperands(N, N0, N1);
}
@@ -533,7 +533,7 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N,
if (N.getOpcode() == ISD::FrameIndex) {
// Match frame index.
int FI = cast<FrameIndexSDNode>(N)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
OffImm = CurDAG->getTargetConstant(0, MVT::i32);
return true;
}
@@ -557,7 +557,7 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N,
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
}
OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
return true;
@@ -703,7 +703,7 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
Base = N;
if (N.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(N)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
} else if (N.getOpcode() == ARMISD::Wrapper &&
!(Subtarget->useMovt() &&
N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) {
@@ -724,7 +724,7 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
}
Offset = CurDAG->getRegister(0, MVT::i32);
@@ -901,7 +901,7 @@ bool ARMDAGToDAGISel::SelectAddrMode3(SDValue N,
Base = N;
if (N.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(N)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
}
Offset = CurDAG->getRegister(0, MVT::i32);
Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0),MVT::i32);
@@ -915,7 +915,7 @@ bool ARMDAGToDAGISel::SelectAddrMode3(SDValue N,
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
}
Offset = CurDAG->getRegister(0, MVT::i32);
@@ -960,7 +960,7 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
Base = N;
if (N.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(N)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
} else if (N.getOpcode() == ARMISD::Wrapper &&
!(Subtarget->useMovt() &&
N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) {
@@ -978,7 +978,7 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
}
ARM_AM::AddrOpc AddSub = ARM_AM::add;
@@ -1202,7 +1202,7 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N,
SDValue &Base, SDValue &OffImm) {
if (N.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(N)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
OffImm = CurDAG->getTargetConstant(0, MVT::i32);
return true;
}
@@ -1219,7 +1219,7 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N,
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
}
OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
return true;
@@ -1267,7 +1267,7 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N,
if (N.getOpcode() == ISD::FrameIndex) {
// Match frame index.
int FI = cast<FrameIndexSDNode>(N)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
OffImm = CurDAG->getTargetConstant(0, MVT::i32);
return true;
}
@@ -1297,7 +1297,7 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N,
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
}
OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
return true;
@@ -1326,7 +1326,7 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N,
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
- Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
}
OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
return true;
@@ -1468,14 +1468,14 @@ SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDNode *N) {
SDValue Base = LD->getBasePtr();
SDValue Ops[]= { Base, AMOpc, getAL(CurDAG),
CurDAG->getRegister(0, MVT::i32), Chain };
- return CurDAG->getMachineNode(Opcode, N->getDebugLoc(), MVT::i32,
+ return CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32,
MVT::i32, MVT::Other, Ops);
} else {
SDValue Chain = LD->getChain();
SDValue Base = LD->getBasePtr();
SDValue Ops[]= { Base, Offset, AMOpc, getAL(CurDAG),
CurDAG->getRegister(0, MVT::i32), Chain };
- return CurDAG->getMachineNode(Opcode, N->getDebugLoc(), MVT::i32,
+ return CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32,
MVT::i32, MVT::Other, Ops);
}
}
@@ -1524,7 +1524,7 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) {
SDValue Base = LD->getBasePtr();
SDValue Ops[]= { Base, Offset, getAL(CurDAG),
CurDAG->getRegister(0, MVT::i32), Chain };
- return CurDAG->getMachineNode(Opcode, N->getDebugLoc(), MVT::i32, MVT::i32,
+ return CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32, MVT::i32,
MVT::Other, Ops);
}
@@ -1533,7 +1533,7 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) {
/// \brief Form a GPRPair pseudo register from a pair of GPR regs.
SDNode *ARMDAGToDAGISel::createGPRPairNode(EVT VT, SDValue V0, SDValue V1) {
- DebugLoc dl = V0.getNode()->getDebugLoc();
+ SDLoc dl(V0.getNode());
SDValue RegClass =
CurDAG->getTargetConstant(ARM::GPRPairRegClassID, MVT::i32);
SDValue SubReg0 = CurDAG->getTargetConstant(ARM::gsub_0, MVT::i32);
@@ -1544,7 +1544,7 @@ SDNode *ARMDAGToDAGISel::createGPRPairNode(EVT VT, SDValue V0, SDValue V1) {
/// \brief Form a D register from a pair of S registers.
SDNode *ARMDAGToDAGISel::createSRegPairNode(EVT VT, SDValue V0, SDValue V1) {
- DebugLoc dl = V0.getNode()->getDebugLoc();
+ SDLoc dl(V0.getNode());
SDValue RegClass =
CurDAG->getTargetConstant(ARM::DPR_VFP2RegClassID, MVT::i32);
SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, MVT::i32);
@@ -1555,7 +1555,7 @@ SDNode *ARMDAGToDAGISel::createSRegPairNode(EVT VT, SDValue V0, SDValue V1) {
/// \brief Form a quad register from a pair of D registers.
SDNode *ARMDAGToDAGISel::createDRegPairNode(EVT VT, SDValue V0, SDValue V1) {
- DebugLoc dl = V0.getNode()->getDebugLoc();
+ SDLoc dl(V0.getNode());
SDValue RegClass = CurDAG->getTargetConstant(ARM::QPRRegClassID, MVT::i32);
SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32);
SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32);
@@ -1565,7 +1565,7 @@ SDNode *ARMDAGToDAGISel::createDRegPairNode(EVT VT, SDValue V0, SDValue V1) {
/// \brief Form 4 consecutive D registers from a pair of Q registers.
SDNode *ARMDAGToDAGISel::createQRegPairNode(EVT VT, SDValue V0, SDValue V1) {
- DebugLoc dl = V0.getNode()->getDebugLoc();
+ SDLoc dl(V0.getNode());
SDValue RegClass = CurDAG->getTargetConstant(ARM::QQPRRegClassID, MVT::i32);
SDValue SubReg0 = CurDAG->getTargetConstant(ARM::qsub_0, MVT::i32);
SDValue SubReg1 = CurDAG->getTargetConstant(ARM::qsub_1, MVT::i32);
@@ -1576,7 +1576,7 @@ SDNode *ARMDAGToDAGISel::createQRegPairNode(EVT VT, SDValue V0, SDValue V1) {
/// \brief Form 4 consecutive S registers.
SDNode *ARMDAGToDAGISel::createQuadSRegsNode(EVT VT, SDValue V0, SDValue V1,
SDValue V2, SDValue V3) {
- DebugLoc dl = V0.getNode()->getDebugLoc();
+ SDLoc dl(V0.getNode());
SDValue RegClass =
CurDAG->getTargetConstant(ARM::QPR_VFP2RegClassID, MVT::i32);
SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, MVT::i32);
@@ -1591,7 +1591,7 @@ SDNode *ARMDAGToDAGISel::createQuadSRegsNode(EVT VT, SDValue V0, SDValue V1,
/// \brief Form 4 consecutive D registers.
SDNode *ARMDAGToDAGISel::createQuadDRegsNode(EVT VT, SDValue V0, SDValue V1,
SDValue V2, SDValue V3) {
- DebugLoc dl = V0.getNode()->getDebugLoc();
+ SDLoc dl(V0.getNode());
SDValue RegClass = CurDAG->getTargetConstant(ARM::QQPRRegClassID, MVT::i32);
SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32);
SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32);
@@ -1605,7 +1605,7 @@ SDNode *ARMDAGToDAGISel::createQuadDRegsNode(EVT VT, SDValue V0, SDValue V1,
/// \brief Form 4 consecutive Q registers.
SDNode *ARMDAGToDAGISel::createQuadQRegsNode(EVT VT, SDValue V0, SDValue V1,
SDValue V2, SDValue V3) {
- DebugLoc dl = V0.getNode()->getDebugLoc();
+ SDLoc dl(V0.getNode());
SDValue RegClass = CurDAG->getTargetConstant(ARM::QQQQPRRegClassID, MVT::i32);
SDValue SubReg0 = CurDAG->getTargetConstant(ARM::qsub_0, MVT::i32);
SDValue SubReg1 = CurDAG->getTargetConstant(ARM::qsub_1, MVT::i32);
@@ -1689,7 +1689,7 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
const uint16_t *QOpcodes0,
const uint16_t *QOpcodes1) {
assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range");
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
SDValue MemAddr, Align;
unsigned AddrOpIdx = isUpdating ? 1 : 2;
@@ -1821,7 +1821,7 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
const uint16_t *QOpcodes0,
const uint16_t *QOpcodes1) {
assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range");
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
SDValue MemAddr, Align;
unsigned AddrOpIdx = isUpdating ? 1 : 2;
@@ -1966,7 +1966,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
const uint16_t *DOpcodes,
const uint16_t *QOpcodes) {
assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range");
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
SDValue MemAddr, Align;
unsigned AddrOpIdx = isUpdating ? 1 : 2;
@@ -2084,7 +2084,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
unsigned NumVecs,
const uint16_t *Opcodes) {
assert(NumVecs >=2 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
SDValue MemAddr, Align;
if (!SelectAddrMode6(N, N->getOperand(1), MemAddr, Align))
@@ -2166,7 +2166,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs,
unsigned Opc) {
assert(NumVecs >= 2 && NumVecs <= 4 && "VTBL NumVecs out-of-range");
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
EVT VT = N->getValueType(0);
unsigned FirstTblReg = IsExt ? 2 : 1;
@@ -2536,7 +2536,7 @@ SDNode *ARMDAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
Ops.push_back(Node->getOperand(0)); // Chain
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
- SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(),
+ SDNode *ResNode = CurDAG->getMachineNode(Opc, SDLoc(Node),
MVT::i32, MVT::i32, MVT::Other,
Ops);
cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1);
@@ -2544,7 +2544,7 @@ SDNode *ARMDAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
}
SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
if (N->isMachineOpcode())
return NULL; // Already selected.
@@ -2587,7 +2587,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
SDValue CPIdx =
CurDAG->getTargetConstantPool(ConstantInt::get(
Type::getInt32Ty(*CurDAG->getContext()), Val),
- TLI.getPointerTy());
+ TLI->getPointerTy());
SDNode *ResNode;
if (Subtarget->isThumb1Only()) {
@@ -2617,7 +2617,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
case ISD::FrameIndex: {
// Selects to ADDri FI, 0 which in turn will become ADDri SP, imm.
int FI = cast<FrameIndexSDNode>(N)->getIndex();
- SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+ SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
if (Subtarget->isThumb1Only()) {
SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32),
getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) };
@@ -3121,7 +3121,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
case Intrinsic::arm_ldrexd: {
SDValue MemAddr = N->getOperand(2);
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
SDValue Chain = N->getOperand(0);
bool isThumb = Subtarget->isThumb() && Subtarget->hasThumb2();
@@ -3179,7 +3179,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
}
case Intrinsic::arm_strexd: {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
SDValue Chain = N->getOperand(0);
SDValue Val0 = N->getOperand(2);
SDValue Val1 = N->getOperand(3);
@@ -3383,7 +3383,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
}
case ARMISD::VTBL1: {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
EVT VT = N->getValueType(0);
SmallVector<SDValue, 6> Ops;
@@ -3394,7 +3394,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
return CurDAG->getMachineNode(ARM::VTBL1, dl, VT, Ops);
}
case ARMISD::VTBL2: {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
EVT VT = N->getValueType(0);
// Form a REG_SEQUENCE to force register allocation.
@@ -3462,7 +3462,7 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){
if (AsmString.find(":H}") == StringRef::npos)
return NULL;
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
SDValue Glue = N->getOperand(NumOps-1);
// Glue node will be appended late.
@@ -3567,7 +3567,7 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){
if (!Changed)
return NULL;
- SDValue New = CurDAG->getNode(ISD::INLINEASM, N->getDebugLoc(),
+ SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N),
CurDAG->getVTList(MVT::Other, MVT::Glue), &AsmNodeOperands[0],
AsmNodeOperands.size());
New->setNodeId(-1);
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 9475f1b..ec0e9c2 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -681,6 +681,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i32 , Expand);
setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i32 , Expand);
+ setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
+
// Only ARMv6 has BSWAP.
if (!Subtarget->hasV6Ops())
setOperationAction(ISD::BSWAP, MVT::i32, Expand);
@@ -1069,7 +1071,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
}
}
-EVT ARMTargetLowering::getSetCCResultType(EVT VT) const {
+EVT ARMTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
if (!VT.isVector()) return getPointerTy();
return VT.changeVectorElementTypeToInteger();
}
@@ -1233,7 +1235,7 @@ SDValue
ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals,
bool isThisReturn, SDValue ThisVal) const {
@@ -1314,7 +1316,7 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
SDValue
ARMTargetLowering::LowerMemOpCallTo(SDValue Chain,
SDValue StackPtr, SDValue Arg,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
const CCValAssign &VA,
ISD::ArgFlagsTy Flags) const {
unsigned LocMemOffset = VA.getLocMemOffset();
@@ -1325,7 +1327,7 @@ ARMTargetLowering::LowerMemOpCallTo(SDValue Chain,
false, false, 0);
}
-void ARMTargetLowering::PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG,
+void ARMTargetLowering::PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG,
SDValue Chain, SDValue &Arg,
RegsToPassVector &RegsToPass,
CCValAssign &VA, CCValAssign &NextVA,
@@ -1357,7 +1359,7 @@ SDValue
ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
- DebugLoc &dl = CLI.DL;
+ SDLoc &dl = CLI.DL;
SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
@@ -1406,7 +1408,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!isSibCall)
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
+ dl);
SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
@@ -1481,10 +1484,17 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// True if this byval aggregate will be split between registers
// and memory.
- if (CCInfo.isFirstByValRegValid()) {
+ unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
+ unsigned CurByValIdx = CCInfo.getInRegsParamsProceed();
+
+ if (CurByValIdx < ByValArgsCount) {
+
+ unsigned RegBegin, RegEnd;
+ CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
+
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
unsigned int i, j;
- for (i = 0, j = CCInfo.getFirstByValReg(); j < ARM::R4; i++, j++) {
+ for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
SDValue Const = DAG.getConstant(4*i, MVT::i32);
SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
@@ -1493,11 +1503,15 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(j, Load));
}
- offset = ARM::R4 - CCInfo.getFirstByValReg();
- CCInfo.clearFirstByValReg();
+
+ // If the parameter size exceeds the register area, the "offset" value
+ // helps us calculate the stack slot for the remaining part properly.
+ offset = RegEnd - RegBegin;
+
+ CCInfo.nextInRegsParam();
}
- if (Flags.getByValSize() - 4*offset > 0) {
+ if (Flags.getByValSize() > 4*offset) {
unsigned LocMemOffset = VA.getLocMemOffset();
SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset);
SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr,
@@ -1718,7 +1732,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
InFlag = Chain.getValue(1);
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
- DAG.getIntPtrConstant(0, true), InFlag);
+ DAG.getIntPtrConstant(0, true), InFlag, dl);
if (!Ins.empty())
InFlag = Chain.getValue(1);
@@ -1740,9 +1754,24 @@ ARMTargetLowering::HandleByVal(
assert((State->getCallOrPrologue() == Prologue ||
State->getCallOrPrologue() == Call) &&
"unhandled ParmContext");
- if ((!State->isFirstByValRegValid()) &&
- (!Subtarget->isAAPCS_ABI() || State->getNextStackOffset() == 0) &&
- (ARM::R0 <= reg) && (reg <= ARM::R3)) {
+
+  // When handling parameters in the prologue, we also introduce a stack
+  // offset for byval registers: see CallingConvLower.cpp, CCState::HandleByVal.
+  // This behaviour lies outside the AAPCS rules (5.5 Parameter Passing) for
+  // how the NSAA ("next stacked argument address") should be evaluated.
+  // So: NextStackOffset = NSAAOffset + SizeOfByValParamsStoredInRegs.
+  // Then: NSAAOffset = NextStackOffset - SizeOfByValParamsStoredInRegs.
+ unsigned NSAAOffset = State->getNextStackOffset();
+ if (State->getCallOrPrologue() != Call) {
+ for (unsigned i = 0, e = State->getInRegsParamsCount(); i != e; ++i) {
+ unsigned RB, RE;
+ State->getInRegsParamInfo(i, RB, RE);
+ assert(NSAAOffset >= (RE-RB)*4 &&
+             "Stack offset for byval regs is no longer introduced?");
+ NSAAOffset -= (RE-RB)*4;
+ }
+ }
+ if ((ARM::R0 <= reg) && (reg <= ARM::R3)) {
if (Subtarget->isAAPCS_ABI() && Align > 4) {
unsigned AlignInRegs = Align / 4;
unsigned Waste = (ARM::R4 - reg) % AlignInRegs;
@@ -1750,22 +1779,45 @@ ARMTargetLowering::HandleByVal(
reg = State->AllocateReg(GPRArgRegs, 4);
}
if (reg != 0) {
- State->setFirstByValReg(reg);
+ unsigned excess = 4 * (ARM::R4 - reg);
+
+      // Special case: NSAA != SP and the parameter is larger than the space
+      // left in the remaining GPRs. In that case we can't split the parameter;
+      // it must go entirely onto the stack. We also have to set NCRN to R4,
+      // wasting all remaining registers.
+ if (Subtarget->isAAPCS_ABI() && NSAAOffset != 0 && size > excess) {
+ while (State->AllocateReg(GPRArgRegs, 4))
+ ;
+ return;
+ }
+
+      // The first register for the byval parameter is the first register that
+      // was not allocated before this call, i.e. "reg".
+      // If the parameter fits entirely in the range [reg, r4), the end
+      // (one past the last) register is reg + param-size-in-regs; otherwise
+      // the parameter is split between registers and stack, and the end
+      // register is r4.
+ unsigned ByValRegBegin = reg;
+ unsigned ByValRegEnd = (size < excess) ? reg + size/4 : (unsigned)ARM::R4;
+ State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
+      // Note: the first register was already allocated at the start of this
+      // function; allocate the remaining registers we need here.
+ for (unsigned i = reg+1; i != ByValRegEnd; ++i)
+ State->AllocateReg(GPRArgRegs, 4);
// At a call site, a byval parameter that is split between
// registers and memory needs its size truncated here. In a
// function prologue, such byval parameters are reassembled in
// memory, and are not truncated.
if (State->getCallOrPrologue() == Call) {
- unsigned excess = 4 * (ARM::R4 - reg);
- assert(size >= excess && "expected larger existing stack allocation");
- size -= excess;
+        // Make the remaining size zero when the whole structure can be
+        // stored in registers.
+ if (size < excess)
+ size = 0;
+ else
+ size -= excess;
}
}
}
- // Confiscate any remaining parameter registers to preclude their
- // assignment to subsequent parameters.
- while (State->AllocateReg(GPRArgRegs, 4))
- ;
}
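(Aside: a self-contained sketch, with invented values, of the register-range decision HandleByVal makes above; it is not the LLVM code itself. "reg" plays the role of the first unallocated GPR and "size" the byval size in bytes.)

```cpp
#include <cstdio>

int main() {
  const unsigned R4 = 4;          // register index one past r3
  unsigned reg = 1;               // first unallocated GPR (r1)
  unsigned size = 16;             // byval parameter size in bytes

  unsigned excess = 4 * (R4 - reg);                    // bytes r1-r3 can hold
  unsigned ByValRegBegin = reg;
  unsigned ByValRegEnd = (size < excess) ? reg + size / 4 : R4;
  unsigned StackBytes = (size > excess) ? size - excess : 0;

  std::printf("byval uses r%u..r%u and %u stack bytes\n",
              ByValRegBegin, ByValRegEnd - 1, StackBytes);
  return 0;
}
```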
/// MatchingStackOffset - Return true if the given stack call argument is
@@ -1970,7 +2022,7 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc dl, SelectionDAG &DAG) const {
+ SDLoc dl, SelectionDAG &DAG) const {
// CCValAssign - represent the assignment of the return value to a location.
SmallVector<CCValAssign, 16> RVLocs;
@@ -2098,7 +2150,7 @@ bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
Copy = *Copy->use_begin();
if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
return false;
- Chain = Copy->getOperand(0);
+ TCChain = Copy->getOperand(0);
} else {
return false;
}
@@ -2137,7 +2189,7 @@ bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
EVT PtrVT = Op.getValueType();
// FIXME there is no actual debug info here
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
SDValue Res;
if (CP->isMachineConstantPoolEntry())
@@ -2158,7 +2210,7 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
MachineFunction &MF = DAG.getMachineFunction();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
unsigned ARMPCLabelIndex = 0;
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
EVT PtrVT = getPointerTy();
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
Reloc::Model RelocM = getTargetMachine().getRelocationModel();
@@ -2187,7 +2239,7 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
SDValue
ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
SelectionDAG &DAG) const {
- DebugLoc dl = GA->getDebugLoc();
+ SDLoc dl(GA);
EVT PtrVT = getPointerTy();
unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
MachineFunction &MF = DAG.getMachineFunction();
@@ -2230,7 +2282,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
SelectionDAG &DAG,
TLSModel::Model model) const {
const GlobalValue *GV = GA->getGlobal();
- DebugLoc dl = GA->getDebugLoc();
+ SDLoc dl(GA);
SDValue Offset;
SDValue Chain = DAG.getEntryNode();
EVT PtrVT = getPointerTy();
@@ -2300,7 +2352,7 @@ ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
SelectionDAG &DAG) const {
EVT PtrVT = getPointerTy();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility();
@@ -2343,7 +2395,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
SelectionDAG &DAG) const {
EVT PtrVT = getPointerTy();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
Reloc::Model RelocM = getTargetMachine().getRelocationModel();
@@ -2408,7 +2460,7 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
EVT PtrVT = getPointerTy();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
ARMConstantPoolValue *CPV =
ARMConstantPoolSymbol::Create(*DAG.getContext(), "_GLOBAL_OFFSET_TABLE_",
@@ -2424,7 +2476,7 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
SDValue
ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue Val = DAG.getConstant(0, MVT::i32);
return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
@@ -2433,7 +2485,7 @@ ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
SDValue
ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
Op.getOperand(1), DAG.getConstant(0, MVT::i32));
}
@@ -2442,7 +2494,7 @@ SDValue
ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *Subtarget) const {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
case Intrinsic::arm_thread_pointer: {
@@ -2478,7 +2530,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
case Intrinsic::arm_neon_vmullu: {
unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
? ARMISD::VMULLs : ARMISD::VMULLu;
- return DAG.getNode(NewOpc, Op.getDebugLoc(), Op.getValueType(),
+ return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
}
}
@@ -2487,7 +2539,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *Subtarget) {
// FIXME: handle "fence singlethread" more efficiently.
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
if (!Subtarget->hasDataBarrier()) {
// Some ARMv6 cpus can support data barriers with an mcr instruction.
// Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
@@ -2510,7 +2562,7 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
// Just preserve the chain.
return Op.getOperand(0);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
if (!isRead &&
(!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
@@ -2535,7 +2587,7 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
@@ -2546,7 +2598,7 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
SDValue
ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
SDValue &Root, SelectionDAG &DAG,
- DebugLoc dl) const {
+ SDLoc dl) const {
MachineFunction &MF = DAG.getMachineFunction();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
@@ -2580,13 +2632,17 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
void
ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF,
+ unsigned InRegsParamRecordIdx,
+ unsigned ArgSize,
unsigned &ArgRegsSize,
unsigned &ArgRegsSaveSize)
const {
unsigned NumGPRs;
- if (CCInfo.isFirstByValRegValid())
- NumGPRs = ARM::R4 - CCInfo.getFirstByValReg();
- else {
+ if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
+ unsigned RBegin, REnd;
+ CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
+ NumGPRs = REnd - RBegin;
+ } else {
unsigned int firstUnalloced;
firstUnalloced = CCInfo.getFirstUnallocated(GPRArgRegs,
sizeof(GPRArgRegs) /
@@ -2596,7 +2652,29 @@ ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF,
unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
ArgRegsSize = NumGPRs * 4;
- ArgRegsSaveSize = (ArgRegsSize + Align - 1) & ~(Align - 1);
+
+  // If the parameter is split between the stack and GPRs...
+ if (NumGPRs && Align == 8 &&
+ (ArgRegsSize < ArgSize ||
+ InRegsParamRecordIdx >= CCInfo.getInRegsParamsCount())) {
+    // Pad the part of the parameter recovered from GPRs so that its last
+    // byte ends at an address of the form K*8 - 1. This is needed because
+    // the remaining (stack) part of the parameter is stack-aligned, and the
+    // "GPRs head" must attach to it without a gap:
+ // Stack:
+ // |---- 8 bytes block ----| |---- 8 bytes block ----| |---- 8 bytes...
+ // [ [padding] [GPRs head] ] [ Tail passed via stack ....
+ //
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ unsigned Padding =
+ ((ArgRegsSize + AFI->getArgRegsSaveSize() + Align - 1) & ~(Align-1)) -
+ (ArgRegsSize + AFI->getArgRegsSaveSize());
+ ArgRegsSaveSize = ArgRegsSize + Padding;
+ } else
+    // We don't need to extend the register save area for byval parameters
+    // that are passed entirely in GPRs.
+ ArgRegsSaveSize = ArgRegsSize;
}
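(Aside: a numeric illustration of the padding formula above, assuming 8-byte stack alignment and three GPRs already holding the head of a split byval parameter; the values are made up and this is not part of the patch.)

```cpp
#include <cstdio>

int main() {
  unsigned Align = 8;              // stack alignment when 8-byte types are involved
  unsigned ArgRegsSize = 12;       // 3 GPRs hold the head of the parameter
  unsigned AlreadySaved = 0;       // bytes of GPRs saved for earlier params

  unsigned Total = ArgRegsSize + AlreadySaved;
  unsigned Padding = ((Total + Align - 1) & ~(Align - 1)) - Total;
  unsigned ArgRegsSaveSize = ArgRegsSize + Padding;

  std::printf("padding = %u, save area = %u bytes\n", Padding, ArgRegsSaveSize);
  return 0;
}
```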
// The remaining GPRs hold either the beginning of variable-argument
@@ -2609,10 +2687,12 @@ ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF,
// Return: The frame index registers were stored into.
int
ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
- DebugLoc dl, SDValue &Chain,
+ SDLoc dl, SDValue &Chain,
const Value *OrigArg,
+ unsigned InRegsParamRecordIdx,
unsigned OffsetFromOrigArg,
unsigned ArgOffset,
+ unsigned ArgSize,
bool ForceMutable) const {
// Currently, two use-cases possible:
@@ -2629,33 +2709,45 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- unsigned firstRegToSaveIndex;
- if (CCInfo.isFirstByValRegValid())
- firstRegToSaveIndex = CCInfo.getFirstByValReg() - ARM::R0;
- else {
+ unsigned firstRegToSaveIndex, lastRegToSaveIndex;
+ unsigned RBegin, REnd;
+ if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
+ CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
+ firstRegToSaveIndex = RBegin - ARM::R0;
+ lastRegToSaveIndex = REnd - ARM::R0;
+ } else {
firstRegToSaveIndex = CCInfo.getFirstUnallocated
(GPRArgRegs, sizeof(GPRArgRegs) / sizeof(GPRArgRegs[0]));
+ lastRegToSaveIndex = 4;
}
unsigned ArgRegsSize, ArgRegsSaveSize;
- computeRegArea(CCInfo, MF, ArgRegsSize, ArgRegsSaveSize);
+ computeRegArea(CCInfo, MF, InRegsParamRecordIdx, ArgSize,
+ ArgRegsSize, ArgRegsSaveSize);
// Store any by-val regs to their spots on the stack so that they may be
  // loaded by dereferencing the result of formal parameter pointer or va_next.
// Note: once stack area for byval/varargs registers
// was initialized, it can't be initialized again.
- if (!AFI->getArgRegsSaveSize() && ArgRegsSaveSize) {
+ if (ArgRegsSaveSize) {
+
+ unsigned Padding = ArgRegsSaveSize - ArgRegsSize;
- AFI->setArgRegsSaveSize(ArgRegsSaveSize);
+ if (Padding) {
+ assert(AFI->getStoredByValParamsPadding() == 0 &&
+             "Only a single parameter may be padded.");
+ AFI->setStoredByValParamsPadding(Padding);
+ }
int FrameIndex = MFI->CreateFixedObject(
ArgRegsSaveSize,
- ArgOffset + ArgRegsSaveSize - ArgRegsSize,
+ Padding + ArgOffset,
false);
SDValue FIN = DAG.getFrameIndex(FrameIndex, getPointerTy());
SmallVector<SDValue, 4> MemOps;
- for (unsigned i = 0; firstRegToSaveIndex < 4; ++firstRegToSaveIndex, ++i) {
+ for (unsigned i = 0; firstRegToSaveIndex < lastRegToSaveIndex;
+ ++firstRegToSaveIndex, ++i) {
const TargetRegisterClass *RC;
if (AFI->isThumb1OnlyFunction())
RC = &ARM::tGPRRegClass;
@@ -2672,19 +2764,23 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
DAG.getConstant(4, getPointerTy()));
}
+
+ AFI->setArgRegsSaveSize(ArgRegsSaveSize + AFI->getArgRegsSaveSize());
+
if (!MemOps.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
&MemOps[0], MemOps.size());
return FrameIndex;
} else
// This will point to the next argument passed via stack.
- return MFI->CreateFixedObject(4, ArgOffset, !ForceMutable);
+ return MFI->CreateFixedObject(
+ 4, AFI->getStoredByValParamsPadding() + ArgOffset, !ForceMutable);
}
// Setup stack frame, the va_list pointer will start from.
void
ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
- DebugLoc dl, SDValue &Chain,
+ SDLoc dl, SDValue &Chain,
unsigned ArgOffset,
bool ForceMutable) const {
MachineFunction &MF = DAG.getMachineFunction();
@@ -2696,7 +2792,8 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
  // If there are no regs to be stored, just point the address after the last
  // argument passed via the stack.
int FrameIndex =
- StoreByValRegs(CCInfo, DAG, dl, Chain, 0, 0, ArgOffset, ForceMutable);
+ StoreByValRegs(CCInfo, DAG, dl, Chain, 0, CCInfo.getInRegsParamsCount(),
+ 0, ArgOffset, 0, ForceMutable);
AFI->setVarArgsFrameIndex(FrameIndex);
}
@@ -2706,7 +2803,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg>
&Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals)
const {
MachineFunction &MF = DAG.getMachineFunction();
@@ -2727,6 +2824,12 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
SDValue ArgValue;
Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
unsigned CurArgIdx = 0;
+
+  // ArgRegsSaveSize starts at zero; it is increased each time we encounter a
+  // byval parameter, and also for varargs functions.
+ AFI->setArgRegsSaveSize(0);
+
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
std::advance(CurOrigArg, Ins[VA.getValNo()].OrigArgIndex - CurArgIdx);
@@ -2824,15 +2927,21 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
// Since they could be overwritten by lowering of arguments in case of
// a tail call.
if (Flags.isByVal()) {
+ unsigned CurByValIndex = CCInfo.getInRegsParamsProceed();
int FrameIndex = StoreByValRegs(
- CCInfo, DAG, dl, Chain, CurOrigArg,
- Ins[VA.getValNo()].PartOffset,
- VA.getLocMemOffset(),
- true /*force mutable frames*/);
+ CCInfo, DAG, dl, Chain, CurOrigArg,
+ CurByValIndex,
+ Ins[VA.getValNo()].PartOffset,
+ VA.getLocMemOffset(),
+ Flags.getByValSize(),
+ true /*force mutable frames*/);
InVals.push_back(DAG.getFrameIndex(FrameIndex, getPointerTy()));
+ CCInfo.nextInRegsParam();
} else {
+ unsigned FIOffset = VA.getLocMemOffset() +
+ AFI->getStoredByValParamsPadding();
int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
- VA.getLocMemOffset(), true);
+ FIOffset, true);
// Create load nodes to retrieve arguments from the stack.
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
@@ -2874,7 +2983,7 @@ static bool isFloatingPointZero(SDValue Op) {
SDValue
ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
SDValue &ARMcc, SelectionDAG &DAG,
- DebugLoc dl) const {
+ SDLoc dl) const {
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
unsigned C = RHSC->getZExtValue();
if (!isLegalICmpImmediate(C)) {
@@ -2932,7 +3041,7 @@ ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
SDValue
ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
- DebugLoc dl) const {
+ SDLoc dl) const {
SDValue Cmp;
if (!isFloatingPointZero(RHS))
Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS);
@@ -2946,7 +3055,7 @@ ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
SDValue
ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
unsigned Opc = Cmp.getOpcode();
- DebugLoc DL = Cmp.getDebugLoc();
+ SDLoc DL(Cmp);
if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
@@ -2966,7 +3075,7 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Cond = Op.getOperand(0);
SDValue SelectTrue = Op.getOperand(1);
SDValue SelectFalse = Op.getOperand(2);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
// Convert:
//
@@ -3021,7 +3130,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
SDValue TrueVal = Op.getOperand(2);
SDValue FalseVal = Op.getOperand(3);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
if (LHS.getValueType() == MVT::i32) {
SDValue ARMcc;
@@ -3076,7 +3185,7 @@ static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
return DAG.getConstant(0, MVT::i32);
if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
- return DAG.getLoad(MVT::i32, Op.getDebugLoc(),
+ return DAG.getLoad(MVT::i32, SDLoc(Op),
Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
Ld->isVolatile(), Ld->isNonTemporal(),
Ld->isInvariant(), Ld->getAlignment());
@@ -3094,7 +3203,7 @@ static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
SDValue Ptr = Ld->getBasePtr();
- RetVal1 = DAG.getLoad(MVT::i32, Op.getDebugLoc(),
+ RetVal1 = DAG.getLoad(MVT::i32, SDLoc(Op),
Ld->getChain(), Ptr,
Ld->getPointerInfo(),
Ld->isVolatile(), Ld->isNonTemporal(),
@@ -3102,9 +3211,9 @@ static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
EVT PtrType = Ptr.getValueType();
unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
- SDValue NewPtr = DAG.getNode(ISD::ADD, Op.getDebugLoc(),
+ SDValue NewPtr = DAG.getNode(ISD::ADD, SDLoc(Op),
PtrType, Ptr, DAG.getConstant(4, PtrType));
- RetVal2 = DAG.getLoad(MVT::i32, Op.getDebugLoc(),
+ RetVal2 = DAG.getLoad(MVT::i32, SDLoc(Op),
Ld->getChain(), NewPtr,
Ld->getPointerInfo().getWithOffset(4),
Ld->isVolatile(), Ld->isNonTemporal(),
@@ -3124,7 +3233,7 @@ ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
SDValue LHS = Op.getOperand(2);
SDValue RHS = Op.getOperand(3);
SDValue Dest = Op.getOperand(4);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
bool LHSSeenZero = false;
bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
@@ -3174,7 +3283,7 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue LHS = Op.getOperand(2);
SDValue RHS = Op.getOperand(3);
SDValue Dest = Op.getOperand(4);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
if (LHS.getValueType() == MVT::i32) {
SDValue ARMcc;
@@ -3215,7 +3324,7 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
SDValue Table = Op.getOperand(1);
SDValue Index = Op.getOperand(2);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
EVT PTy = getPointerTy();
JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
@@ -3251,7 +3360,7 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
if (Op.getValueType().getVectorElementType() == MVT::i32) {
if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
@@ -3273,7 +3382,7 @@ static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
if (VT.isVector())
return LowerVectorFP_TO_INT(Op, DAG);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
unsigned Opc;
switch (Op.getOpcode()) {
@@ -3291,7 +3400,7 @@ static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
if (VT.getVectorElementType() == MVT::f32)
@@ -3327,7 +3436,7 @@ static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
if (VT.isVector())
return LowerVectorINT_TO_FP(Op, DAG);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
unsigned Opc;
switch (Op.getOpcode()) {
@@ -3348,7 +3457,7 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
// Implement fcopysign with a fabs and a conditional fneg.
SDValue Tmp0 = Op.getOperand(0);
SDValue Tmp1 = Op.getOperand(1);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
EVT VT = Op.getValueType();
EVT SrcVT = Tmp1.getValueType();
bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
@@ -3432,7 +3541,7 @@ SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
MFI->setReturnAddressIsTaken(true);
EVT VT = Op.getValueType();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
if (Depth) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
@@ -3452,7 +3561,7 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
MFI->setFrameAddressIsTaken(true);
EVT VT = Op.getValueType();
- DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful
+ SDLoc dl(Op); // FIXME probably not meaningful
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
unsigned FrameReg = (Subtarget->isThumb() || Subtarget->isTargetDarwin())
? ARM::R7 : ARM::R11;
@@ -3481,7 +3590,7 @@ static SDValue ExpandVectorExtension(SDNode *N, SelectionDAG &DAG) {
SrcVT.getSizeInBits()*8 != DestVT.getSizeInBits())
return SDValue();
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits();
unsigned DestEltSize = DestVT.getVectorElementType().getSizeInBits();
unsigned NumElts = SrcVT.getVectorNumElements();
@@ -3512,7 +3621,7 @@ static SDValue ExpandVectorExtension(SDNode *N, SelectionDAG &DAG) {
/// vectors), since the legalizer won't know what to do with that.
static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
SDValue Op = N->getOperand(0);
// This function is only supposed to be called for i64 types, either as the
@@ -3549,7 +3658,7 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
/// not support i64 elements, so sometimes the zero vectors will need to be
/// explicitly constructed. Regardless, use a canonical VMOV to create the
/// zero vector.
-static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
+static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, SDLoc dl) {
assert(VT.isVector() && "Expected a vector type");
// The canonical modified immediate encoding of a zero vector is....0!
SDValue EncodedVal = DAG.getTargetConstant(0, MVT::i32);
@@ -3565,7 +3674,7 @@ SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
EVT VT = Op.getValueType();
unsigned VTBits = VT.getSizeInBits();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
SDValue ShAmt = Op.getOperand(2);
@@ -3601,7 +3710,7 @@ SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
EVT VT = Op.getValueType();
unsigned VTBits = VT.getSizeInBits();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
SDValue ShAmt = Op.getOperand(2);
@@ -3634,7 +3743,7 @@ SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
// The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
// The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
// so that the shift + and get folded into a bitfield extract.
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
DAG.getConstant(Intrinsic::arm_get_fpscr,
MVT::i32));
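(Aside: a standalone check, for illustration only, that the FLT_ROUNDS mapping described in the comment above — 0->1, 1->2, 2->3, 3->0 — falls out of the add/shift/mask trick. "Mode" stands in for FPSCR bits [23:22]; this is not LLVM code.)

```cpp
#include <cstdio>

int main() {
  for (unsigned Mode = 0; Mode < 4; ++Mode) {
    unsigned FPSCR = Mode << 22;                       // only RMode bits set
    unsigned FltRounds = ((FPSCR + (1u << 22)) >> 22) & 3;
    std::printf("FPSCR RMode %u -> FLT_ROUNDS %u\n", Mode, FltRounds);
  }
  return 0;
}
```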
@@ -3649,7 +3758,7 @@ SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *ST) {
EVT VT = N->getValueType(0);
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
if (!ST->hasV6T2Ops())
return SDValue();
@@ -3673,7 +3782,7 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
/// vuzp: = [k0 k1 k2 k3 k0 k1 k2 k3] each ki is 8-bits)
static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0));
@@ -3695,7 +3804,7 @@ static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) {
/// v4i16:Extracted = [k0 k1 k2 k3 ]
static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
SDValue BitCounts = getCTPOP16BitCounts(N, DAG);
if (VT.is64BitVector()) {
@@ -3730,7 +3839,7 @@ static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) {
///
static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
@@ -3769,7 +3878,7 @@ static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *ST) {
EVT VT = N->getValueType(0);
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
if (!VT.isVector())
return SDValue();
@@ -3804,7 +3913,7 @@ static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *ST) {
EVT VT = N->getValueType(0);
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
// We can get here for a node like i32 = ISD::SHL i32, i64
if (VT != MVT::i64)
@@ -3850,7 +3959,7 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
SDValue CC = Op.getOperand(2);
EVT VT = Op.getValueType();
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
if (Op.getOperand(1).getValueType().isFloatingPoint()) {
switch (SetCCOpcode) {
@@ -4119,7 +4228,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
APFloat FPVal = CFP->getValueAPF();
int ImmVal = ARM_AM::getFP32Imm(FPVal);
if (ImmVal != -1) {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
SDValue NewVal = DAG.getTargetConstant(ImmVal, MVT::i32);
SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
NewVal);
@@ -4133,7 +4242,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
SDValue NewVal = isNEONModifiedImm(iVal, 0, 32, DAG, VMovVT, false,
VMOVModImm);
if (NewVal != SDValue()) {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
NewVal);
SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
@@ -4146,7 +4255,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
NewVal = isNEONModifiedImm(~iVal & 0xffffffff, 0, 32, DAG, VMovVT, false,
VMVNModImm);
if (NewVal != SDValue()) {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
VecConstant);
@@ -4406,7 +4515,7 @@ static bool isReverseMask(ArrayRef<int> M, EVT VT) {
// instruction, return an SDValue of such a constant (will become a MOV
// instruction). Otherwise return null.
static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
- const ARMSubtarget *ST, DebugLoc dl) {
+ const ARMSubtarget *ST, SDLoc dl) {
uint64_t Val;
if (!isa<ConstantSDNode>(N))
return SDValue();
@@ -4427,7 +4536,7 @@ static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *ST) const {
BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
EVT VT = Op.getValueType();
APInt SplatBits, SplatUndef;
@@ -4617,7 +4726,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
// shuffle in combination with VEXTs.
SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
SelectionDAG &DAG) const {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
EVT VT = Op.getValueType();
unsigned NumElts = VT.getVectorNumElements();
@@ -4806,7 +4915,7 @@ ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
/// the specified operations to build the shuffle.
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
SDValue RHS, SelectionDAG &DAG,
- DebugLoc dl) {
+ SDLoc dl) {
unsigned OpNum = (PFEntry >> 26) & 0x0F;
unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
@@ -4886,7 +4995,7 @@ static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
// Check to see if we can use the VTBL instruction.
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
SmallVector<SDValue, 8> VTBLMask;
for (ArrayRef<int>::iterator
@@ -4905,7 +5014,7 @@ static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
SelectionDAG &DAG) {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
SDValue OpLHS = Op.getOperand(0);
EVT VT = OpLHS.getValueType();
@@ -4923,7 +5032,7 @@ static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
EVT VT = Op.getValueType();
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
@@ -5087,7 +5196,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
SDValue Vec = Op.getOperand(0);
if (Op.getValueType() == MVT::i32 &&
Vec.getValueType().getVectorElementType().getSizeInBits() < 32) {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
}
@@ -5099,7 +5208,7 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
// two 64-bit vectors are concatenated to a 128-bit vector.
assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
"unexpected CONCAT_VECTORS");
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue Val = DAG.getUNDEF(MVT::v2f64);
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
@@ -5188,6 +5297,23 @@ static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
return false;
}
+static EVT getExtensionTo64Bits(const EVT &OrigVT) {
+ if (OrigVT.getSizeInBits() >= 64)
+ return OrigVT;
+
+ assert(OrigVT.isSimple() && "Expecting a simple value type");
+
+ MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
+ switch (OrigSimpleTy) {
+ default: llvm_unreachable("Unexpected Vector Type");
+ case MVT::v2i8:
+ case MVT::v2i16:
+ return MVT::v2i32;
+ case MVT::v4i8:
+ return MVT::v4i16;
+ }
+}
+
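(Aside: the widening table in getExtensionTo64Bits keeps the element count and grows the element width until the vector reaches 64 bits, which is what VMULL needs as a D-register operand. A tiny sketch of that relationship, with the type names hard-coded for illustration.)

```cpp
#include <cstdio>

int main() {
  struct { unsigned Elts, EltBits; const char *Name; } Cases[] = {
    {2, 8, "v2i8"}, {2, 16, "v2i16"}, {4, 8, "v4i8"},
  };
  for (auto &C : Cases) {
    unsigned NewEltBits = 64 / C.Elts;   // v2i8/v2i16 -> v2i32, v4i8 -> v4i16
    std::printf("%s -> v%ui%u (%u bits total)\n",
                C.Name, C.Elts, NewEltBits, C.Elts * NewEltBits);
  }
  return 0;
}
```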
/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
/// We insert the required extension here to get the vector to fill a D register.
@@ -5203,19 +5329,9 @@ static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
return N;
// Must extend size to at least 64 bits to be used as an operand for VMULL.
- MVT::SimpleValueType OrigSimpleTy = OrigTy.getSimpleVT().SimpleTy;
- EVT NewVT;
- switch (OrigSimpleTy) {
- default: llvm_unreachable("Unexpected Orig Vector Type");
- case MVT::v2i8:
- case MVT::v2i16:
- NewVT = MVT::v2i32;
- break;
- case MVT::v4i8:
- NewVT = MVT::v4i16;
- break;
- }
- return DAG.getNode(ExtOpcode, N->getDebugLoc(), NewVT, N);
+ EVT NewVT = getExtensionTo64Bits(OrigTy);
+
+ return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
}
/// SkipLoadExtensionForVMULL - return a load of the original vector size that
@@ -5224,22 +5340,22 @@ static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
/// reach a total size of 64 bits. We have to add the extension separately
/// because ARM does not have a sign/zero extending load for vectors.
static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
- SDValue NonExtendingLoad =
- DAG.getLoad(LD->getMemoryVT(), LD->getDebugLoc(), LD->getChain(),
+ EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
+
+ // The load already has the right type.
+ if (ExtendedTy == LD->getMemoryVT())
+ return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(),
LD->isNonTemporal(), LD->isInvariant(),
LD->getAlignment());
- unsigned ExtOp = 0;
- switch (LD->getExtensionType()) {
- default: llvm_unreachable("Unexpected LoadExtType");
- case ISD::EXTLOAD:
- case ISD::SEXTLOAD: ExtOp = ISD::SIGN_EXTEND; break;
- case ISD::ZEXTLOAD: ExtOp = ISD::ZERO_EXTEND; break;
- }
- MVT::SimpleValueType MemType = LD->getMemoryVT().getSimpleVT().SimpleTy;
- MVT::SimpleValueType ExtType = LD->getValueType(0).getSimpleVT().SimpleTy;
- return AddRequiredExtensionForVMULL(NonExtendingLoad, DAG,
- MemType, ExtType, ExtOp);
+
+ // We need to create a zextload/sextload. We cannot just create a load
+ // followed by a zext/zext node because LowerMUL is also run during normal
+ // operation legalization where we can't create illegal types.
+ return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
+ LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
+ LD->getMemoryVT(), LD->isVolatile(),
+ LD->isNonTemporal(), LD->getAlignment());
}
/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
@@ -5265,7 +5381,7 @@ static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
unsigned LowElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0;
- return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), MVT::v2i32,
+ return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), MVT::v2i32,
BVN->getOperand(LowElt), BVN->getOperand(LowElt+2));
}
// Construct a new BUILD_VECTOR with elements truncated to half the size.
@@ -5282,7 +5398,7 @@ static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
// The values are implicitly truncated so sext vs. zext doesn't matter.
Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32));
}
- return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(),
+ return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N),
MVT::getVectorVT(TruncVT, NumElts), Ops.data(), NumElts);
}
@@ -5354,7 +5470,7 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
}
// Legalize to a VMULL instruction.
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
SDValue Op0;
SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
if (!isMLA) {
@@ -5384,7 +5500,7 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
}
static SDValue
-LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) {
+LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) {
// Convert to float
// float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
// float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
@@ -5413,7 +5529,7 @@ LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) {
}
static SDValue
-LowerSDIV_v4i16(SDValue N0, SDValue N1, DebugLoc dl, SelectionDAG &DAG) {
+LowerSDIV_v4i16(SDValue N0, SDValue N1, SDLoc dl, SelectionDAG &DAG) {
SDValue N2;
// Convert to float.
// float4 yf = vcvt_f32_s32(vmovl_s16(y));
@@ -5454,7 +5570,7 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
"unexpected type for custom-lowering ISD::SDIV");
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
SDValue N2, N3;
@@ -5489,7 +5605,7 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
"unexpected type for custom-lowering ISD::UDIV");
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
SDValue N2, N3;
@@ -5573,9 +5689,9 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
}
if (!ExtraOp)
- return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
+ return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
Op.getOperand(1));
- return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
+ return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
Op.getOperand(1), Op.getOperand(2));
}
@@ -5589,11 +5705,10 @@ static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
return SDValue();
}
-
static void
ReplaceATOMIC_OP_64(SDNode *Node, SmallVectorImpl<SDValue>& Results,
SelectionDAG &DAG, unsigned NewOp) {
- DebugLoc dl = Node->getDebugLoc();
+ SDLoc dl(Node);
assert (Node->getValueType(0) == MVT::i64 &&
"Only know how to expand i64 atomics");
@@ -5623,6 +5738,44 @@ ReplaceATOMIC_OP_64(SDNode *Node, SmallVectorImpl<SDValue>& Results,
Results.push_back(Result.getValue(2));
}
+static void ReplaceREADCYCLECOUNTER(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) {
+ SDLoc DL(N);
+ SDValue Cycles32, OutChain;
+
+ if (Subtarget->hasPerfMon()) {
+ // Under Power Management extensions, the cycle-count is:
+ // mrc p15, #0, <Rt>, c9, c13, #0
+ SDValue Ops[] = { N->getOperand(0), // Chain
+ DAG.getConstant(Intrinsic::arm_mrc, MVT::i32),
+ DAG.getConstant(15, MVT::i32),
+ DAG.getConstant(0, MVT::i32),
+ DAG.getConstant(9, MVT::i32),
+ DAG.getConstant(13, MVT::i32),
+ DAG.getConstant(0, MVT::i32)
+ };
+
+ Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
+ DAG.getVTList(MVT::i32, MVT::Other), &Ops[0],
+ array_lengthof(Ops));
+ OutChain = Cycles32.getValue(1);
+ } else {
+ // Intrinsic is defined to return 0 on unsupported platforms. Technically
+ // there are older ARM CPUs that have implementation-specific ways of
+ // obtaining this information (FIXME!).
+ Cycles32 = DAG.getConstant(0, MVT::i32);
+ OutChain = DAG.getEntryNode();
+ }
+
+ SDValue Cycles64 = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64,
+ Cycles32, DAG.getConstant(0, MVT::i32));
+ Results.push_back(Cycles64);
+ Results.push_back(OutChain);
+}
+
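(Aside: the 64-bit result above is just the 32-bit cycle count paired with a zero high word, i.e. a plain zero-extension. A trivial sketch with a made-up counter value, not LLVM code.)

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t Cycles32 = 0x89ABCDEFu;                       // value read via mrc
  uint64_t Cycles64 = ((uint64_t)0 << 32) | Cycles32;    // BUILD_PAIR(Cycles32, 0)
  std::printf("0x%016llx\n", (unsigned long long)Cycles64);
  return 0;
}
```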
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Don't know how to custom lower this!");
@@ -5700,6 +5853,9 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::SRA:
Res = Expand64BitShift(N, DAG, Subtarget);
break;
+ case ISD::READCYCLECOUNTER:
+ ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
+ return;
case ISD::ATOMIC_LOAD_ADD:
ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMADD64_DAG);
return;
@@ -7634,13 +7790,13 @@ SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
// Slct is now known to be the desired identity constant when CC is true.
SDValue TrueVal = OtherOp;
- SDValue FalseVal = DAG.getNode(N->getOpcode(), N->getDebugLoc(), VT,
+ SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
OtherOp, NonConstantVal);
// Unless SwapSelectOps says CC should be false.
if (SwapSelectOps)
std::swap(TrueVal, FalseVal);
- return DAG.getNode(ISD::SELECT, N->getDebugLoc(), VT,
+ return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
CCOp, TrueVal, FalseVal);
}
@@ -7747,9 +7903,9 @@ static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
llvm_unreachable("Invalid vector element type for padd optimization.");
}
- SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(),
+ SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N),
widenType, &Ops[0], Ops.size());
- return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, tmp);
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, tmp);
}
static SDValue findMUL_LOHI(SDValue V) {
@@ -7874,7 +8030,7 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
Ops.push_back(*LowAdd);
Ops.push_back(*HiAdd);
- SDValue MLALNode = DAG.getNode(FinalOpc, AddcNode->getDebugLoc(),
+ SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcNode),
DAG.getVTList(MVT::i32, MVT::i32),
&Ops[0], Ops.size());
@@ -7982,7 +8138,7 @@ static SDValue PerformVMULCombine(SDNode *N,
}
EVT VT = N->getValueType(0);
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
SDValue N00 = N0->getOperand(0);
SDValue N01 = N0->getOperand(1);
return DAG.getNode(Opcode, DL, VT,
@@ -8012,11 +8168,11 @@ static SDValue PerformMULCombine(SDNode *N,
return SDValue();
int64_t MulAmt = C->getSExtValue();
- unsigned ShiftAmt = CountTrailingZeros_64(MulAmt);
+ unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt);
ShiftAmt = ShiftAmt & (32 - 1);
SDValue V = N->getOperand(0);
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
SDValue Res;
MulAmt >>= ShiftAmt;
@@ -8080,7 +8236,7 @@ static SDValue PerformANDCombine(SDNode *N,
// Attempt to use immediate-form VBIC
BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
EVT VT = N->getValueType(0);
SelectionDAG &DAG = DCI.DAG;
@@ -8123,7 +8279,7 @@ static SDValue PerformORCombine(SDNode *N,
const ARMSubtarget *Subtarget) {
// Attempt to use immediate-form VORR
BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
EVT VT = N->getValueType(0);
SelectionDAG &DAG = DCI.DAG;
@@ -8198,7 +8354,7 @@ static SDValue PerformORCombine(SDNode *N,
if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
return SDValue();
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
// 1) or (and A, mask), val => ARMbfi A, val, mask
// iff (val & mask) == val
//
@@ -8233,7 +8389,7 @@ static SDValue PerformORCombine(SDNode *N,
return SDValue();
if (ARM::isBitFieldInvertedMask(Mask)) {
- Val >>= CountTrailingZeros_32(~Mask);
+ Val >>= countTrailingZeros(~Mask);
Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
DAG.getConstant(Val, MVT::i32),
@@ -8260,7 +8416,7 @@ static SDValue PerformORCombine(SDNode *N,
(Mask == 0xffff || Mask == 0xffff0000))
return SDValue();
// 2a
- unsigned amt = CountTrailingZeros_32(Mask2);
+ unsigned amt = countTrailingZeros(Mask2);
Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
DAG.getConstant(amt, MVT::i32));
Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
@@ -8276,7 +8432,7 @@ static SDValue PerformORCombine(SDNode *N,
(Mask2 == 0xffff || Mask2 == 0xffff0000))
return SDValue();
// 2b
- unsigned lsb = CountTrailingZeros_32(Mask);
+ unsigned lsb = countTrailingZeros(Mask);
Res = DAG.getNode(ISD::SRL, DL, VT, N00,
DAG.getConstant(lsb, MVT::i32));
Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
@@ -8294,7 +8450,7 @@ static SDValue PerformORCombine(SDNode *N,
// where lsb(mask) == #shamt and masked bits of B are known zero.
SDValue ShAmt = N00.getOperand(1);
unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
- unsigned LSB = CountTrailingZeros_32(Mask);
+ unsigned LSB = countTrailingZeros(Mask);
if (ShAmtC != LSB)
return SDValue();
@@ -8337,12 +8493,12 @@ static SDValue PerformBFICombine(SDNode *N,
if (!N11C)
return SDValue();
unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
- unsigned LSB = CountTrailingZeros_32(~InvMask);
- unsigned Width = (32 - CountLeadingZeros_32(~InvMask)) - LSB;
+ unsigned LSB = countTrailingZeros(~InvMask);
+ unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
unsigned Mask = (1 << Width)-1;
unsigned Mask2 = N11C->getZExtValue();
if ((Mask & (~Mask2)) == 0)
- return DCI.DAG.getNode(ARMISD::BFI, N->getDebugLoc(), N->getValueType(0),
+ return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
N->getOperand(0), N1.getOperand(0),
N->getOperand(2));
}
@@ -8368,7 +8524,7 @@ static SDValue PerformVMOVRRDCombine(SDNode *N,
LoadSDNode *LD = cast<LoadSDNode>(InNode);
SelectionDAG &DAG = DCI.DAG;
- DebugLoc DL = LD->getDebugLoc();
+ SDLoc DL(LD);
SDValue BasePtr = LD->getBasePtr();
SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr,
LD->getPointerInfo(), LD->isVolatile(),
@@ -8405,7 +8561,7 @@ static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
if (Op0.getOpcode() == ARMISD::VMOVRRD &&
Op0.getNode() == Op1.getNode() &&
Op0.getResNo() == 0 && Op1.getResNo() == 1)
- return DAG.getNode(ISD::BITCAST, N->getDebugLoc(),
+ return DAG.getNode(ISD::BITCAST, SDLoc(N),
N->getValueType(0), Op0.getOperand(0));
return SDValue();
}
@@ -8447,7 +8603,7 @@ static SDValue PerformSTORECombine(SDNode *N,
NumElems*SizeRatio);
assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
- DebugLoc DL = St->getDebugLoc();
+ SDLoc DL(St);
SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i < NumElems; ++i) ShuffleVec[i] = i * SizeRatio;
@@ -8508,7 +8664,7 @@ static SDValue PerformSTORECombine(SDNode *N,
if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
StVal.getNode()->hasOneUse()) {
SelectionDAG &DAG = DCI.DAG;
- DebugLoc DL = St->getDebugLoc();
+ SDLoc DL(St);
SDValue BasePtr = St->getBasePtr();
SDValue NewST1 = DAG.getStore(St->getChain(), DL,
StVal.getNode()->getOperand(0), BasePtr,
@@ -8530,14 +8686,14 @@ static SDValue PerformSTORECombine(SDNode *N,
// Bitcast an i64 store extracted from a vector to f64.
// Otherwise, the i64 value will be legalized to a pair of i32 values.
SelectionDAG &DAG = DCI.DAG;
- DebugLoc dl = StVal.getDebugLoc();
+ SDLoc dl(StVal);
SDValue IntVec = StVal.getOperand(0);
EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
IntVec.getValueType().getVectorNumElements());
SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
Vec, StVal.getOperand(1));
- dl = N->getDebugLoc();
+ dl = SDLoc(N);
SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
// Make the DAGCombiner fold the bitcasts.
DCI.AddToWorklist(Vec.getNode());
@@ -8583,7 +8739,7 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N,
EVT VT = N->getValueType(0);
if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
return SDValue();
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
SmallVector<SDValue, 8> Ops;
unsigned NumElts = VT.getVectorNumElements();
for (unsigned i = 0; i < NumElts; ++i) {
@@ -8610,7 +8766,7 @@ static SDValue PerformInsertEltCombine(SDNode *N,
return SDValue();
SelectionDAG &DAG = DCI.DAG;
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
VT.getVectorNumElements());
SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
@@ -8656,7 +8812,7 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
!TLI.isTypeLegal(Concat1Op1.getValueType()))
return SDValue();
- SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, N->getDebugLoc(), VT,
+ SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
Op0.getOperand(0), Op1.getOperand(0));
// Translate the shuffle mask.
SmallVector<int, 16> NewMask;
@@ -8672,7 +8828,7 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
NewElt = HalfElts + MaskElt - NumElts;
NewMask.push_back(NewElt);
}
- return DAG.getVectorShuffle(VT, N->getDebugLoc(), NewConcat,
+ return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
DAG.getUNDEF(VT), NewMask.data());
}
@@ -8789,7 +8945,7 @@ static SDValue CombineBaseUpdate(SDNode *N,
Ops.push_back(N->getOperand(i));
}
MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
- SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, N->getDebugLoc(), SDTys,
+ SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys,
Ops.data(), Ops.size(),
MemInt->getMemoryVT(),
MemInt->getMemOperand());
@@ -8863,7 +9019,7 @@ static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
SDVTList SDTys = DAG.getVTList(Tys, NumVecs+1);
SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
- SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, VLD->getDebugLoc(), SDTys,
+ SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
Ops, 2, VLDMemInt->getMemoryVT(),
VLDMemInt->getMemOperand());
@@ -8918,7 +9074,7 @@ static SDValue PerformVDUPLANECombine(SDNode *N,
if (EltSize > VT.getVectorElementType().getSizeInBits())
return SDValue();
- return DCI.DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op);
+ return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
}
// isConstVecPow2 - Return true if each vector element is a power of 2, all
@@ -8977,7 +9133,7 @@ static SDValue PerformVCVTCombine(SDNode *N,
unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
Intrinsic::arm_neon_vcvtfp2fxu;
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(),
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N),
N->getValueType(0),
DAG.getConstant(IntrinsicOpcode, MVT::i32), N0,
DAG.getConstant(Log2_64(C), MVT::i32));
@@ -9013,7 +9169,7 @@ static SDValue PerformVDIVCombine(SDNode *N,
unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
Intrinsic::arm_neon_vcvtfxu2fp;
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(),
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N),
Op.getValueType(),
DAG.getConstant(IntrinsicOpcode, MVT::i32),
Op.getOperand(0), DAG.getConstant(Log2_64(C), MVT::i32));
@@ -9197,7 +9353,7 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
VShiftOpc = ARMISD::VQRSHRNsu; break;
}
- return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0),
+ return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0),
N->getOperand(1), DAG.getConstant(Cnt, MVT::i32));
}
@@ -9214,7 +9370,7 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
}
- return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0),
+ return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2),
DAG.getConstant(Cnt, MVT::i32));
}
@@ -9245,7 +9401,7 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP &&
DAG.MaskedValueIsZero(N0.getOperand(0),
APInt::getHighBitsSet(32, 16)))
- return DAG.getNode(ISD::ROTR, N->getDebugLoc(), VT, N0, N1);
+ return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1);
}
}
@@ -9262,7 +9418,7 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
case ISD::SHL:
if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
- return DAG.getNode(ARMISD::VSHL, N->getDebugLoc(), VT, N->getOperand(0),
+ return DAG.getNode(ARMISD::VSHL, SDLoc(N), VT, N->getOperand(0),
DAG.getConstant(Cnt, MVT::i32));
break;
@@ -9271,7 +9427,7 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ?
ARMISD::VSHRs : ARMISD::VSHRu);
- return DAG.getNode(VShiftOpc, N->getDebugLoc(), VT, N->getOperand(0),
+ return DAG.getNode(VShiftOpc, SDLoc(N), VT, N->getOperand(0),
DAG.getConstant(Cnt, MVT::i32));
}
}
@@ -9311,7 +9467,7 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
Opc = ARMISD::VGETLANEu;
break;
}
- return DAG.getNode(Opc, N->getDebugLoc(), VT, Vec, Lane);
+ return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
}
}
@@ -9400,7 +9556,7 @@ static SDValue PerformSELECT_CCCombine(SDNode *N, SelectionDAG &DAG,
if (!Opcode)
return SDValue();
- return DAG.getNode(Opcode, N->getDebugLoc(), N->getValueType(0), LHS, RHS);
+ return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), LHS, RHS);
}
/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
@@ -9412,7 +9568,7 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
return SDValue();
EVT VT = N->getValueType(0);
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
SDValue LHS = Cmp.getOperand(0);
SDValue RHS = Cmp.getOperand(1);
SDValue FalseVal = N->getOperand(0);
@@ -10358,17 +10514,15 @@ ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
bool ARM::isBitFieldInvertedMask(unsigned v) {
if (v == 0xffffffff)
- return 0;
+ return false;
+
// there can be 1's on either or both "outsides", all the "inside"
// bits must be 0's
- unsigned int lsb = 0, msb = 31;
- while (v & (1 << msb)) --msb;
- while (v & (1 << lsb)) ++lsb;
- for (unsigned int i = lsb; i <= msb; ++i) {
- if (v & (1 << i))
- return 0;
- }
- return 1;
+ unsigned TO = CountTrailingOnes_32(v);
+ unsigned LO = CountLeadingOnes_32(v);
+ v = (v >> TO) << TO;
+ v = (v << LO) >> LO;
+ return v == 0;
}
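(Aside: a standalone sketch of the check above, using hand-rolled bit-counting helpers rather than LLVM's, to show which masks qualify; the values in main are examples only.)

```cpp
#include <cstdint>
#include <cstdio>

static unsigned countTrailingOnes(uint32_t v) {
  unsigned n = 0;
  while (v & 1u) { v >>= 1; ++n; }
  return n;
}

static unsigned countLeadingOnes(uint32_t v) {
  unsigned n = 0;
  while (v & 0x80000000u) { v <<= 1; ++n; }
  return n;
}

static bool isBitFieldInvertedMask(uint32_t v) {
  if (v == 0xffffffffu)
    return false;
  unsigned TO = countTrailingOnes(v);
  unsigned LO = countLeadingOnes(v);
  v = (v >> TO) << TO;   // clear the low run of ones
  v = (v << LO) >> LO;   // clear the high run of ones
  return v == 0;         // nothing may remain in the middle
}

int main() {
  std::printf("%d %d %d\n",
              isBitFieldInvertedMask(0xff0000ffu),   // 1: ones only on the outsides
              isBitFieldInvertedMask(0x0000ffffu),   // 1: a single low run
              isBitFieldInvertedMask(0x00ff00ffu));  // 0: a middle run remains
  return 0;
}
```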
/// isFPImmLegal - Returns true if the target can instruction select the
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 46b8438..2b65019 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -270,7 +270,7 @@ namespace llvm {
}
/// getSetCCResultType - Return the value type to use for ISD::SETCC.
- virtual EVT getSetCCResultType(EVT VT) const;
+ virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
virtual MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr *MI,
@@ -412,7 +412,7 @@ namespace llvm {
void addQRTypeForNEON(MVT VT);
typedef SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPassVector;
- void PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG,
+ void PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG,
SDValue Chain, SDValue &Arg,
RegsToPassVector &RegsToPass,
CCValAssign &VA, CCValAssign &NextVA,
@@ -421,12 +421,12 @@ namespace llvm {
ISD::ArgFlagsTy Flags) const;
SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
SDValue &Root, SelectionDAG &DAG,
- DebugLoc dl) const;
+ SDLoc dl) const;
CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return,
bool isVarArg) const;
SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
const CCValAssign &VA,
ISD::ArgFlagsTy Flags) const;
SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
@@ -463,7 +463,7 @@ namespace llvm {
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals,
bool isThisReturn, SDValue ThisVal) const;
@@ -471,22 +471,26 @@ namespace llvm {
LowerFormalArguments(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
int StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
- DebugLoc dl, SDValue &Chain,
+ SDLoc dl, SDValue &Chain,
const Value *OrigArg,
+ unsigned InRegsParamRecordIdx,
unsigned OffsetFromOrigArg,
unsigned ArgOffset,
+ unsigned ArgSize,
bool ForceMutable) const;
void VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
- DebugLoc dl, SDValue &Chain,
+ SDLoc dl, SDValue &Chain,
unsigned ArgOffset,
bool ForceMutable = false) const;
void computeRegArea(CCState &CCInfo, MachineFunction &MF,
+ unsigned InRegsParamRecordIdx,
+ unsigned ArgSize,
unsigned &ArgRegsSize,
unsigned &ArgRegsSaveSize) const;
@@ -520,16 +524,16 @@ namespace llvm {
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc dl, SelectionDAG &DAG) const;
+ SDLoc dl, SelectionDAG &DAG) const;
virtual bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const;
virtual bool mayBeEmittedAsTailCall(CallInst *CI) const;
SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
- SDValue &ARMcc, SelectionDAG &DAG, DebugLoc dl) const;
+ SDValue &ARMcc, SelectionDAG &DAG, SDLoc dl) const;
SDValue getVFPCmp(SDValue LHS, SDValue RHS,
- SelectionDAG &DAG, DebugLoc dl) const;
+ SelectionDAG &DAG, SDLoc dl) const;
SDValue duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const;
SDValue OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index 67a6820..bd9a212 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -1389,7 +1389,6 @@ class ADI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
let Inst{15-12} = Dd{3-0};
let Inst{7-0} = addr{7-0}; // imm8
- // TODO: Mark the instructions with the appropriate subtarget info.
let Inst{27-24} = opcod1;
let Inst{21-20} = opcod2;
let Inst{11-9} = 0b101;
@@ -1415,7 +1414,6 @@ class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
let Inst{15-12} = Sd{4-1};
let Inst{7-0} = addr{7-0}; // imm8
- // TODO: Mark the instructions with the appropriate subtarget info.
let Inst{27-24} = opcod1;
let Inst{21-20} = opcod2;
let Inst{11-9} = 0b101;
@@ -1437,6 +1435,28 @@ class PseudoVFPLdStM<dag oops, dag iops, InstrItinClass itin, string cstr,
}
// Load / store multiple
+
+// Unknown precision
+class AXXI4<dag oops, dag iops, IndexMode im,
+ string asm, string cstr, list<dag> pattern>
+ : VFPXI<oops, iops, AddrMode4, 4, im,
+ VFPLdStFrm, NoItinerary, asm, cstr, pattern> {
+ // Instruction operands.
+ bits<4> Rn;
+ bits<13> regs;
+
+ // Encode instruction operands.
+ let Inst{19-16} = Rn;
+ let Inst{22} = 0;
+ let Inst{15-12} = regs{11-8};
+ let Inst{7-1} = regs{7-1};
+
+ let Inst{27-25} = 0b110;
+ let Inst{11-8} = 0b1011;
+ let Inst{0} = 1;
+}
+
+// Double precision
class AXDI4<dag oops, dag iops, IndexMode im, InstrItinClass itin,
string asm, string cstr, list<dag> pattern>
: VFPXI<oops, iops, AddrMode4, 4, im,
@@ -1449,14 +1469,15 @@ class AXDI4<dag oops, dag iops, IndexMode im, InstrItinClass itin,
let Inst{19-16} = Rn;
let Inst{22} = regs{12};
let Inst{15-12} = regs{11-8};
- let Inst{7-0} = regs{7-0};
+ let Inst{7-1} = regs{7-1};
- // TODO: Mark the instructions with the appropriate subtarget info.
let Inst{27-25} = 0b110;
let Inst{11-9} = 0b101;
let Inst{8} = 1; // Double precision
+ let Inst{0} = 0;
}
+// Single Precision
class AXSI4<dag oops, dag iops, IndexMode im, InstrItinClass itin,
string asm, string cstr, list<dag> pattern>
: VFPXI<oops, iops, AddrMode4, 4, im,
@@ -1471,7 +1492,6 @@ class AXSI4<dag oops, dag iops, IndexMode im, InstrItinClass itin,
let Inst{15-12} = regs{12-9};
let Inst{7-0} = regs{7-0};
- // TODO: Mark the instructions with the appropriate subtarget info.
let Inst{27-25} = 0b110;
let Inst{11-9} = 0b101;
let Inst{8} = 0; // Single precision
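(Illustrative note, not part of the patch.) The new AXXI4 class above fills in the "unknown precision" variant of the VFP load/store multiple encodings: AXDI4 and AXSI4 keep bit 0 clear (an even imm8 of 2*regs for the double-precision form), while AXXI4 hard-wires Inst{0} = 1, which is presumably the deprecated FLDMX/FSTMX form whose imm8 is odd (2*regs + 1). That would also explain why AXDI4 now encodes only regs{7-1} and pins Inst{0} = 0.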
diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
index 80f0ec7..8062111 100644
--- a/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/lib/Target/ARM/ARMInstrInfo.cpp
@@ -29,7 +29,7 @@
using namespace llvm;
ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI)
- : ARMBaseInstrInfo(STI), RI(*this, STI) {
+ : ARMBaseInstrInfo(STI), RI(STI) {
}
/// getNoopForMachoTarget - Return the noop instruction to use for a noop.
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 1bd174e..da815d5 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -275,8 +275,8 @@ def HasSlowVDUP32 : Predicate<"Subtarget->isSwift()">;
def UseVMOVSR : Predicate<"Subtarget->isCortexA9() || !Subtarget->useNEONForSinglePrecisionFP()">;
def DontUseVMOVSR : Predicate<"!Subtarget->isCortexA9() && Subtarget->useNEONForSinglePrecisionFP()">;
-def IsLE : Predicate<"TLI.isLittleEndian()">;
-def IsBE : Predicate<"TLI.isBigEndian()">;
+def IsLE : Predicate<"TLI->isLittleEndian()">;
+def IsBE : Predicate<"TLI->isBigEndian()">;
//===----------------------------------------------------------------------===//
// ARM Flag Definitions.
@@ -1327,7 +1327,7 @@ class AI_ext_rrot<bits<8> opcod, string opc, PatFrag opnode>
: AExtI<opcod, (outs GPRnopc:$Rd), (ins GPRnopc:$Rm, rot_imm:$rot),
IIC_iEXTr, opc, "\t$Rd, $Rm$rot",
[(set GPRnopc:$Rd, (opnode (rotr GPRnopc:$Rm, rot_imm:$rot)))]>,
- Requires<[IsARM, HasV6]> {
+ Requires<[IsARM, HasV6]>, Sched<[WriteALUsi]> {
bits<4> Rd;
bits<4> Rm;
bits<2> rot;
@@ -1340,11 +1340,11 @@ class AI_ext_rrot<bits<8> opcod, string opc, PatFrag opnode>
class AI_ext_rrot_np<bits<8> opcod, string opc>
: AExtI<opcod, (outs GPRnopc:$Rd), (ins GPRnopc:$Rm, rot_imm:$rot),
IIC_iEXTr, opc, "\t$Rd, $Rm$rot", []>,
- Requires<[IsARM, HasV6]> {
+ Requires<[IsARM, HasV6]>, Sched<[WriteALUsi]> {
bits<2> rot;
let Inst{19-16} = 0b1111;
let Inst{11-10} = rot;
-}
+ }
/// AI_exta_rrot - A binary operation with two forms: one whose operand is a
/// register and one whose operand is a register rotated by 8/16/24.
@@ -1353,7 +1353,7 @@ class AI_exta_rrot<bits<8> opcod, string opc, PatFrag opnode>
IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm$rot",
[(set GPRnopc:$Rd, (opnode GPR:$Rn,
(rotr GPRnopc:$Rm, rot_imm:$rot)))]>,
- Requires<[IsARM, HasV6]> {
+ Requires<[IsARM, HasV6]>, Sched<[WriteALUsr]> {
bits<4> Rd;
bits<4> Rm;
bits<4> Rn;
@@ -1368,7 +1368,7 @@ class AI_exta_rrot<bits<8> opcod, string opc, PatFrag opnode>
class AI_exta_rrot_np<bits<8> opcod, string opc>
: AExtI<opcod, (outs GPRnopc:$Rd), (ins GPR:$Rn, GPRnopc:$Rm, rot_imm:$rot),
IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm$rot", []>,
- Requires<[IsARM, HasV6]> {
+ Requires<[IsARM, HasV6]>, Sched<[WriteALUsr]> {
bits<4> Rn;
bits<2> rot;
let Inst{19-16} = Rn;
@@ -1780,7 +1780,8 @@ multiclass APreLoad<bits<1> read, bits<1> data, string opc> {
def i12 : AXI<(outs), (ins addrmode_imm12:$addr), MiscFrm, IIC_Preload,
!strconcat(opc, "\t$addr"),
- [(ARMPreload addrmode_imm12:$addr, (i32 read), (i32 data))]> {
+ [(ARMPreload addrmode_imm12:$addr, (i32 read), (i32 data))]>,
+ Sched<[WritePreLd]> {
bits<4> Rt;
bits<17> addr;
let Inst{31-26} = 0b111101;
@@ -1796,7 +1797,8 @@ multiclass APreLoad<bits<1> read, bits<1> data, string opc> {
def rs : AXI<(outs), (ins ldst_so_reg:$shift), MiscFrm, IIC_Preload,
!strconcat(opc, "\t$shift"),
- [(ARMPreload ldst_so_reg:$shift, (i32 read), (i32 data))]> {
+ [(ARMPreload ldst_so_reg:$shift, (i32 read), (i32 data))]>,
+ Sched<[WritePreLd]> {
bits<17> shift;
let Inst{31-26} = 0b111101;
let Inst{25} = 1; // 1 for register form
@@ -1863,7 +1865,8 @@ def TRAP : AXI<(outs), (ins), MiscFrm, NoItinerary,
let isNotDuplicable = 1 in {
def PICADD : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$a, pclabel:$cp, pred:$p),
4, IIC_iALUr,
- [(set GPR:$dst, (ARMpic_add GPR:$a, imm:$cp))]>;
+ [(set GPR:$dst, (ARMpic_add GPR:$a, imm:$cp))]>,
+ Sched<[WriteALU, ReadALU]>;
let AddedComplexity = 10 in {
def PICLDR : ARMPseudoInst<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
@@ -1923,11 +1926,11 @@ def ADR : AI1<{0,?,?,0}, (outs GPR:$Rd), (ins adrlabel:$label),
let hasSideEffects = 1 in {
def LEApcrel : ARMPseudoInst<(outs GPR:$Rd), (ins i32imm:$label, pred:$p),
- 4, IIC_iALUi, []>;
+ 4, IIC_iALUi, []>, Sched<[WriteALU, ReadALU]>;
def LEApcrelJT : ARMPseudoInst<(outs GPR:$Rd),
(ins i32imm:$label, nohash_imm:$id, pred:$p),
- 4, IIC_iALUi, []>;
+ 4, IIC_iALUi, []>, Sched<[WriteALU, ReadALU]>;
}
//===----------------------------------------------------------------------===//
@@ -1938,14 +1941,14 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
// ARMV4T and above
def BX_RET : AI<(outs), (ins), BrMiscFrm, IIC_Br,
"bx", "\tlr", [(ARMretflag)]>,
- Requires<[IsARM, HasV4T]> {
+ Requires<[IsARM, HasV4T]>, Sched<[WriteBr]> {
let Inst{27-0} = 0b0001001011111111111100011110;
}
// ARMV4 only
def MOVPCLR : AI<(outs), (ins), BrMiscFrm, IIC_Br,
"mov", "\tpc, lr", [(ARMretflag)]>,
- Requires<[IsARM, NoV4T]> {
+ Requires<[IsARM, NoV4T]>, Sched<[WriteBr]> {
let Inst{27-0} = 0b0001101000001111000000001110;
}
}
@@ -1955,7 +1958,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
// ARMV4T and above
def BX : AXI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br, "bx\t$dst",
[(brind GPR:$dst)]>,
- Requires<[IsARM, HasV4T]> {
+ Requires<[IsARM, HasV4T]>, Sched<[WriteBr]> {
bits<4> dst;
let Inst{31-4} = 0b1110000100101111111111110001;
let Inst{3-0} = dst;
@@ -1963,7 +1966,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
def BX_pred : AI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br,
"bx", "\t$dst", [/* pattern left blank */]>,
- Requires<[IsARM, HasV4T]> {
+ Requires<[IsARM, HasV4T]>, Sched<[WriteBr]> {
bits<4> dst;
let Inst{27-4} = 0b000100101111111111110001;
let Inst{3-0} = dst;
@@ -1980,7 +1983,7 @@ let isCall = 1,
def BL : ABXI<0b1011, (outs), (ins bl_target:$func),
IIC_Br, "bl\t$func",
[(ARMcall tglobaladdr:$func)]>,
- Requires<[IsARM]> {
+ Requires<[IsARM]>, Sched<[WriteBrL]> {
let Inst{31-28} = 0b1110;
bits<24> func;
let Inst{23-0} = func;
@@ -1990,7 +1993,7 @@ let isCall = 1,
def BL_pred : ABI<0b1011, (outs), (ins bl_target:$func),
IIC_Br, "bl", "\t$func",
[(ARMcall_pred tglobaladdr:$func)]>,
- Requires<[IsARM]> {
+ Requires<[IsARM]>, Sched<[WriteBrL]> {
bits<24> func;
let Inst{23-0} = func;
let DecoderMethod = "DecodeBranchImmInstruction";
@@ -2000,7 +2003,7 @@ let isCall = 1,
def BLX : AXI<(outs), (ins GPR:$func), BrMiscFrm,
IIC_Br, "blx\t$func",
[(ARMcall GPR:$func)]>,
- Requires<[IsARM, HasV5T]> {
+ Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> {
bits<4> func;
let Inst{31-4} = 0b1110000100101111111111110011;
let Inst{3-0} = func;
@@ -2009,7 +2012,7 @@ let isCall = 1,
def BLX_pred : AI<(outs), (ins GPR:$func), BrMiscFrm,
IIC_Br, "blx", "\t$func",
[(ARMcall_pred GPR:$func)]>,
- Requires<[IsARM, HasV5T]> {
+ Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> {
bits<4> func;
let Inst{27-4} = 0b000100101111111111110011;
let Inst{3-0} = func;
@@ -2019,18 +2022,18 @@ let isCall = 1,
// Note: Restrict $func to the tGPR regclass to prevent it being in LR.
def BX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func),
8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>,
- Requires<[IsARM, HasV4T]>;
+ Requires<[IsARM, HasV4T]>, Sched<[WriteBr]>;
// ARMv4
def BMOVPCRX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func),
8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>,
- Requires<[IsARM, NoV4T]>;
+ Requires<[IsARM, NoV4T]>, Sched<[WriteBr]>;
// mov lr, pc; b if callee is marked noreturn to avoid confusing the
// return stack predictor.
def BMOVPCB_CALL : ARMPseudoInst<(outs), (ins bl_target:$func),
8, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>,
- Requires<[IsARM]>;
+ Requires<[IsARM]>, Sched<[WriteBr]>;
}
let isBranch = 1, isTerminator = 1 in {
@@ -2038,7 +2041,8 @@ let isBranch = 1, isTerminator = 1 in {
// a two-value operand where a dag node expects two operands. :(
def Bcc : ABI<0b1010, (outs), (ins br_target:$target),
IIC_Br, "b", "\t$target",
- [/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]> {
+ [/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]>,
+ Sched<[WriteBr]> {
bits<24> target;
let Inst{23-0} = target;
let DecoderMethod = "DecodeBranchImmInstruction";
@@ -2051,25 +2055,27 @@ let isBranch = 1, isTerminator = 1 in {
// should be sufficient.
// FIXME: Is B really a Barrier? That doesn't seem right.
def B : ARMPseudoExpand<(outs), (ins br_target:$target), 4, IIC_Br,
- [(br bb:$target)], (Bcc br_target:$target, (ops 14, zero_reg))>;
+ [(br bb:$target)], (Bcc br_target:$target, (ops 14, zero_reg))>,
+ Sched<[WriteBr]>;
let isNotDuplicable = 1, isIndirectBranch = 1 in {
def BR_JTr : ARMPseudoInst<(outs),
(ins GPR:$target, i32imm:$jt, i32imm:$id),
0, IIC_Br,
- [(ARMbrjt GPR:$target, tjumptable:$jt, imm:$id)]>;
+ [(ARMbrjt GPR:$target, tjumptable:$jt, imm:$id)]>,
+ Sched<[WriteBr]>;
// FIXME: This shouldn't use the generic "addrmode2," but rather be split
// into i12 and rs suffixed versions.
def BR_JTm : ARMPseudoInst<(outs),
(ins addrmode2:$target, i32imm:$jt, i32imm:$id),
0, IIC_Br,
[(ARMbrjt (i32 (load addrmode2:$target)), tjumptable:$jt,
- imm:$id)]>;
+ imm:$id)]>, Sched<[WriteBrTbl]>;
def BR_JTadd : ARMPseudoInst<(outs),
(ins GPR:$target, GPR:$idx, i32imm:$jt, i32imm:$id),
0, IIC_Br,
[(ARMbrjt (add GPR:$target, GPR:$idx), tjumptable:$jt,
- imm:$id)]>;
+ imm:$id)]>, Sched<[WriteBrTbl]>;
} // isNotDuplicable = 1, isIndirectBranch = 1
} // isBarrier = 1
@@ -2078,7 +2084,7 @@ let isBranch = 1, isTerminator = 1 in {
// BLX (immediate)
def BLXi : AXI<(outs), (ins blx_target:$target), BrMiscFrm, NoItinerary,
"blx\t$target", []>,
- Requires<[IsARM, HasV5T]> {
+ Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> {
let Inst{31-25} = 0b1111101;
bits<25> target;
let Inst{23-0} = target{24-1};
@@ -2087,7 +2093,7 @@ def BLXi : AXI<(outs), (ins blx_target:$target), BrMiscFrm, NoItinerary,
// Branch and Exchange Jazelle
def BXJ : ABI<0b0001, (outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func",
- [/* pattern left blank */]> {
+ [/* pattern left blank */]>, Sched<[WriteBr]> {
bits<4> func;
let Inst{23-20} = 0b0010;
let Inst{19-8} = 0xfff;
@@ -2098,18 +2104,20 @@ def BXJ : ABI<0b0001, (outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func",
// Tail calls.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
- def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst), IIC_Br, []>;
+ def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst), IIC_Br, []>,
+ Sched<[WriteBr]>;
- def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst), IIC_Br, []>;
+ def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst), IIC_Br, []>,
+ Sched<[WriteBr]>;
def TAILJMPd : ARMPseudoExpand<(outs), (ins br_target:$dst),
4, IIC_Br, [],
(Bcc br_target:$dst, (ops 14, zero_reg))>,
- Requires<[IsARM]>;
+ Requires<[IsARM]>, Sched<[WriteBr]>;
def TAILJMPr : ARMPseudoExpand<(outs), (ins tcGPR:$dst),
4, IIC_Br, [],
- (BX GPR:$dst)>,
+ (BX GPR:$dst)>, Sched<[WriteBr]>,
Requires<[IsARM]>;
}
@@ -2123,7 +2131,8 @@ def SMC : ABI<0b0001, (outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt",
// Supervisor Call (Software Interrupt)
let isCall = 1, Uses = [SP] in {
-def SVC : ABI<0b1111, (outs), (ins imm24b:$svc), IIC_Br, "svc", "\t$svc", []> {
+def SVC : ABI<0b1111, (outs), (ins imm24b:$svc), IIC_Br, "svc", "\t$svc", []>,
+ Sched<[WriteBr]> {
bits<24> svc;
let Inst{23-0} = svc;
}
@@ -2955,7 +2964,7 @@ defm sysSTM : arm_ldst_mult<"stm", " ^", 0, 1, LdStMulFrm, IIC_iStore_m,
let neverHasSideEffects = 1 in
def MOVr : AsI1<0b1101, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMOVr,
- "mov", "\t$Rd, $Rm", []>, UnaryDP {
+ "mov", "\t$Rd, $Rm", []>, UnaryDP, Sched<[WriteALU]> {
bits<4> Rd;
bits<4> Rm;
@@ -2969,7 +2978,7 @@ def MOVr : AsI1<0b1101, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMOVr,
// A version for the smaller set of tail call registers.
let neverHasSideEffects = 1 in
def MOVr_TC : AsI1<0b1101, (outs tcGPR:$Rd), (ins tcGPR:$Rm), DPFrm,
- IIC_iMOVr, "mov", "\t$Rd, $Rm", []>, UnaryDP {
+ IIC_iMOVr, "mov", "\t$Rd, $Rm", []>, UnaryDP, Sched<[WriteALU]> {
bits<4> Rd;
bits<4> Rm;
@@ -2982,7 +2991,8 @@ def MOVr_TC : AsI1<0b1101, (outs tcGPR:$Rd), (ins tcGPR:$Rm), DPFrm,
def MOVsr : AsI1<0b1101, (outs GPRnopc:$Rd), (ins shift_so_reg_reg:$src),
DPSoRegRegFrm, IIC_iMOVsr,
"mov", "\t$Rd, $src",
- [(set GPRnopc:$Rd, shift_so_reg_reg:$src)]>, UnaryDP {
+ [(set GPRnopc:$Rd, shift_so_reg_reg:$src)]>, UnaryDP,
+ Sched<[WriteALU]> {
bits<4> Rd;
bits<12> src;
let Inst{15-12} = Rd;
@@ -2998,7 +3008,7 @@ def MOVsr : AsI1<0b1101, (outs GPRnopc:$Rd), (ins shift_so_reg_reg:$src),
def MOVsi : AsI1<0b1101, (outs GPR:$Rd), (ins shift_so_reg_imm:$src),
DPSoRegImmFrm, IIC_iMOVsr,
"mov", "\t$Rd, $src", [(set GPR:$Rd, shift_so_reg_imm:$src)]>,
- UnaryDP {
+ UnaryDP, Sched<[WriteALU]> {
bits<4> Rd;
bits<12> src;
let Inst{15-12} = Rd;
@@ -3011,7 +3021,8 @@ def MOVsi : AsI1<0b1101, (outs GPR:$Rd), (ins shift_so_reg_imm:$src),
let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
def MOVi : AsI1<0b1101, (outs GPR:$Rd), (ins so_imm:$imm), DPFrm, IIC_iMOVi,
- "mov", "\t$Rd, $imm", [(set GPR:$Rd, so_imm:$imm)]>, UnaryDP {
+ "mov", "\t$Rd, $imm", [(set GPR:$Rd, so_imm:$imm)]>, UnaryDP,
+ Sched<[WriteALU]> {
bits<4> Rd;
bits<12> imm;
let Inst{25} = 1;
@@ -3025,7 +3036,7 @@ def MOVi16 : AI1<0b1000, (outs GPR:$Rd), (ins imm0_65535_expr:$imm),
DPFrm, IIC_iMOVi,
"movw", "\t$Rd, $imm",
[(set GPR:$Rd, imm0_65535:$imm)]>,
- Requires<[IsARM, HasV6T2]>, UnaryDP {
+ Requires<[IsARM, HasV6T2]>, UnaryDP, Sched<[WriteALU]> {
bits<4> Rd;
bits<16> imm;
let Inst{15-12} = Rd;
@@ -3041,7 +3052,8 @@ def : InstAlias<"mov${p} $Rd, $imm",
Requires<[IsARM]>;
def MOVi16_ga_pcrel : PseudoInst<(outs GPR:$Rd),
- (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>;
+ (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>,
+ Sched<[WriteALU]>;
let Constraints = "$src = $Rd" in {
def MOVTi16 : AI1<0b1010, (outs GPRnopc:$Rd),
@@ -3051,7 +3063,7 @@ def MOVTi16 : AI1<0b1010, (outs GPRnopc:$Rd),
[(set GPRnopc:$Rd,
(or (and GPR:$src, 0xffff),
lo16AllZero:$imm))]>, UnaryDP,
- Requires<[IsARM, HasV6T2]> {
+ Requires<[IsARM, HasV6T2]>, Sched<[WriteALU]> {
bits<4> Rd;
bits<16> imm;
let Inst{15-12} = Rd;
@@ -3063,7 +3075,8 @@ def MOVTi16 : AI1<0b1010, (outs GPRnopc:$Rd),
}
def MOVTi16_ga_pcrel : PseudoInst<(outs GPR:$Rd),
- (ins GPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>;
+ (ins GPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>,
+ Sched<[WriteALU]>;
} // Constraints
@@ -3073,7 +3086,7 @@ def : ARMPat<(or GPR:$src, 0xffff0000), (MOVTi16 GPR:$src, 0xffff)>,
let Uses = [CPSR] in
def RRX: PseudoInst<(outs GPR:$Rd), (ins GPR:$Rm), IIC_iMOVsi,
[(set GPR:$Rd, (ARMrrx GPR:$Rm))]>, UnaryDP,
- Requires<[IsARM]>;
+ Requires<[IsARM]>, Sched<[WriteALU]>;
// These aren't really mov instructions, but we have to define them this way
// due to flag operands.
@@ -3081,10 +3094,10 @@ def RRX: PseudoInst<(outs GPR:$Rd), (ins GPR:$Rm), IIC_iMOVsi,
let Defs = [CPSR] in {
def MOVsrl_flag : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
[(set GPR:$dst, (ARMsrl_flag GPR:$src))]>, UnaryDP,
- Requires<[IsARM]>;
+ Sched<[WriteALU]>, Requires<[IsARM]>;
def MOVsra_flag : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
[(set GPR:$dst, (ARMsra_flag GPR:$src))]>, UnaryDP,
- Requires<[IsARM]>;
+ Sched<[WriteALU]>, Requires<[IsARM]>;
}
//===----------------------------------------------------------------------===//
@@ -3250,7 +3263,8 @@ class AAI<bits<8> op27_20, bits<8> op11_4, string opc,
list<dag> pattern = [],
dag iops = (ins GPRnopc:$Rn, GPRnopc:$Rm),
string asm = "\t$Rd, $Rn, $Rm">
- : AI<(outs GPRnopc:$Rd), iops, DPFrm, IIC_iALUr, opc, asm, pattern> {
+ : AI<(outs GPRnopc:$Rd), iops, DPFrm, IIC_iALUr, opc, asm, pattern>,
+ Sched<[WriteALU, ReadALU, ReadALU]> {
bits<4> Rn;
bits<4> Rd;
bits<4> Rm;
@@ -3265,9 +3279,11 @@ class AAI<bits<8> op27_20, bits<8> op11_4, string opc,
// Saturating add/subtract
+let DecoderMethod = "DecodeQADDInstruction" in
def QADD : AAI<0b00010000, 0b00000101, "qadd",
[(set GPRnopc:$Rd, (int_arm_qadd GPRnopc:$Rm, GPRnopc:$Rn))],
(ins GPRnopc:$Rm, GPRnopc:$Rn), "\t$Rd, $Rm, $Rn">;
+
def QSUB : AAI<0b00010010, 0b00000101, "qsub",
[(set GPRnopc:$Rd, (int_arm_qsub GPRnopc:$Rm, GPRnopc:$Rn))],
(ins GPRnopc:$Rm, GPRnopc:$Rn), "\t$Rd, $Rm, $Rn">;
@@ -3326,7 +3342,7 @@ def UHSUB8 : AAI<0b01100111, 0b11111111, "uhsub8">;
def USAD8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
MulFrm /* for convenience */, NoItinerary, "usad8",
"\t$Rd, $Rn, $Rm", []>,
- Requires<[IsARM, HasV6]> {
+ Requires<[IsARM, HasV6]>, Sched<[WriteALU, ReadALU, ReadALU]> {
bits<4> Rd;
bits<4> Rn;
bits<4> Rm;
@@ -3340,7 +3356,7 @@ def USAD8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
def USADA8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
MulFrm /* for convenience */, NoItinerary, "usada8",
"\t$Rd, $Rn, $Rm, $Ra", []>,
- Requires<[IsARM, HasV6]> {
+ Requires<[IsARM, HasV6]>, Sched<[WriteALU, ReadALU, ReadALU]>{
bits<4> Rd;
bits<4> Rn;
bits<4> Rm;
@@ -3473,7 +3489,7 @@ def BFI:I<(outs GPRnopc:$Rd), (ins GPRnopc:$src, GPR:$Rn, bf_inv_mask_imm:$imm),
def MVNr : AsI1<0b1111, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMVNr,
"mvn", "\t$Rd, $Rm",
- [(set GPR:$Rd, (not GPR:$Rm))]>, UnaryDP {
+ [(set GPR:$Rd, (not GPR:$Rm))]>, UnaryDP, Sched<[WriteALU]> {
bits<4> Rd;
bits<4> Rm;
let Inst{25} = 0;
@@ -3484,7 +3500,8 @@ def MVNr : AsI1<0b1111, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMVNr,
}
def MVNsi : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_imm:$shift),
DPSoRegImmFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift",
- [(set GPR:$Rd, (not so_reg_imm:$shift))]>, UnaryDP {
+ [(set GPR:$Rd, (not so_reg_imm:$shift))]>, UnaryDP,
+ Sched<[WriteALU]> {
bits<4> Rd;
bits<12> shift;
let Inst{25} = 0;
@@ -3496,7 +3513,8 @@ def MVNsi : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_imm:$shift),
}
def MVNsr : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_reg:$shift),
DPSoRegRegFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift",
- [(set GPR:$Rd, (not so_reg_reg:$shift))]>, UnaryDP {
+ [(set GPR:$Rd, (not so_reg_reg:$shift))]>, UnaryDP,
+ Sched<[WriteALU]> {
bits<4> Rd;
bits<12> shift;
let Inst{25} = 0;
@@ -3511,7 +3529,7 @@ def MVNsr : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_reg:$shift),
let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
def MVNi : AsI1<0b1111, (outs GPR:$Rd), (ins so_imm:$imm), DPFrm,
IIC_iMVNi, "mvn", "\t$Rd, $imm",
- [(set GPR:$Rd, so_imm_not:$imm)]>,UnaryDP {
+ [(set GPR:$Rd, so_imm_not:$imm)]>,UnaryDP, Sched<[WriteALU]> {
bits<4> Rd;
bits<12> imm;
let Inst{25} = 1;
@@ -4022,7 +4040,8 @@ def : ARMPat<(ARMcmpZ GPR:$src, so_reg_reg:$rhs),
let isCompare = 1, Defs = [CPSR] in {
def CMNri : AI1<0b1011, (outs), (ins GPR:$Rn, so_imm:$imm), DPFrm, IIC_iCMPi,
"cmn", "\t$Rn, $imm",
- [(ARMcmn GPR:$Rn, so_imm:$imm)]> {
+ [(ARMcmn GPR:$Rn, so_imm:$imm)]>,
+ Sched<[WriteCMP, ReadALU]> {
bits<4> Rn;
bits<12> imm;
let Inst{25} = 1;
@@ -4038,7 +4057,7 @@ def CMNri : AI1<0b1011, (outs), (ins GPR:$Rn, so_imm:$imm), DPFrm, IIC_iCMPi,
def CMNzrr : AI1<0b1011, (outs), (ins GPR:$Rn, GPR:$Rm), DPFrm, IIC_iCMPr,
"cmn", "\t$Rn, $Rm",
[(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
- GPR:$Rn, GPR:$Rm)]> {
+ GPR:$Rn, GPR:$Rm)]>, Sched<[WriteCMP, ReadALU, ReadALU]> {
bits<4> Rn;
bits<4> Rm;
let isCommutable = 1;
@@ -4056,7 +4075,8 @@ def CMNzrsi : AI1<0b1011, (outs),
(ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, IIC_iCMPsr,
"cmn", "\t$Rn, $shift",
[(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
- GPR:$Rn, so_reg_imm:$shift)]> {
+ GPR:$Rn, so_reg_imm:$shift)]>,
+ Sched<[WriteCMPsi, ReadALU]> {
bits<4> Rn;
bits<12> shift;
let Inst{25} = 0;
@@ -4074,7 +4094,8 @@ def CMNzrsr : AI1<0b1011, (outs),
(ins GPRnopc:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, IIC_iCMPsr,
"cmn", "\t$Rn, $shift",
[(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
- GPRnopc:$Rn, so_reg_reg:$shift)]> {
+ GPRnopc:$Rn, so_reg_reg:$shift)]>,
+ Sched<[WriteCMPsr, ReadALU]> {
bits<4> Rn;
bits<12> shift;
let Inst{25} = 0;
@@ -4112,11 +4133,13 @@ let usesCustomInserter = 1, isBranch = 1, isTerminator = 1,
def BCCi64 : PseudoInst<(outs),
(ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, brtarget:$dst),
IIC_Br,
- [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, bb:$dst)]>;
+ [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, bb:$dst)]>,
+ Sched<[WriteBr]>;
def BCCZi64 : PseudoInst<(outs),
(ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, brtarget:$dst), IIC_Br,
- [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, 0, 0, bb:$dst)]>;
+ [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, 0, 0, bb:$dst)]>,
+ Sched<[WriteBr]>;
} // usesCustomInserter
@@ -4129,20 +4152,20 @@ let isCommutable = 1, isSelect = 1 in
def MOVCCr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$false, GPR:$Rm, pred:$p),
4, IIC_iCMOVr,
[/*(set GPR:$Rd, (ARMcmov GPR:$false, GPR:$Rm, imm:$cc, CCR:$ccr))*/]>,
- RegConstraint<"$false = $Rd">;
+ RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
def MOVCCsi : ARMPseudoInst<(outs GPR:$Rd),
(ins GPR:$false, so_reg_imm:$shift, pred:$p),
4, IIC_iCMOVsr,
[/*(set GPR:$Rd, (ARMcmov GPR:$false, so_reg_imm:$shift,
imm:$cc, CCR:$ccr))*/]>,
- RegConstraint<"$false = $Rd">;
+ RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
def MOVCCsr : ARMPseudoInst<(outs GPR:$Rd),
(ins GPR:$false, so_reg_reg:$shift, pred:$p),
4, IIC_iCMOVsr,
[/*(set GPR:$Rd, (ARMcmov GPR:$false, so_reg_reg:$shift,
imm:$cc, CCR:$ccr))*/]>,
- RegConstraint<"$false = $Rd">;
+ RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
let isMoveImm = 1 in
@@ -4150,14 +4173,15 @@ def MOVCCi16 : ARMPseudoInst<(outs GPR:$Rd),
(ins GPR:$false, imm0_65535_expr:$imm, pred:$p),
4, IIC_iMOVi,
[]>,
- RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>;
+ RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>,
+ Sched<[WriteALU]>;
let isMoveImm = 1 in
def MOVCCi : ARMPseudoInst<(outs GPR:$Rd),
(ins GPR:$false, so_imm:$imm, pred:$p),
4, IIC_iCMOVi,
[/*(set GPR:$Rd, (ARMcmov GPR:$false, so_imm:$imm, imm:$cc, CCR:$ccr))*/]>,
- RegConstraint<"$false = $Rd">;
+ RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
// Two instruction predicate mov immediate.
let isMoveImm = 1 in
@@ -4170,7 +4194,7 @@ def MVNCCi : ARMPseudoInst<(outs GPR:$Rd),
(ins GPR:$false, so_imm:$imm, pred:$p),
4, IIC_iCMOVi,
[/*(set GPR:$Rd, (ARMcmov GPR:$false, so_imm_not:$imm, imm:$cc, CCR:$ccr))*/]>,
- RegConstraint<"$false = $Rd">;
+ RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
} // neverHasSideEffects
@@ -4189,6 +4213,16 @@ def memb_opt : Operand<i32> {
let DecoderMethod = "DecodeMemBarrierOption";
}
+def InstSyncBarrierOptOperand : AsmOperandClass {
+ let Name = "InstSyncBarrierOpt";
+ let ParserMethod = "parseInstSyncBarrierOptOperand";
+}
+def instsyncb_opt : Operand<i32> {
+ let PrintMethod = "printInstSyncBOption";
+ let ParserMatchClass = InstSyncBarrierOptOperand;
+ let DecoderMethod = "DecodeInstSyncBarrierOption";
+}
+
// memory barriers protect the atomic sequences
let hasSideEffects = 1 in {
def DMB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
@@ -4209,7 +4243,7 @@ def DSB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
}
// ISB has only full system option
-def ISB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
+def ISB : AInoP<(outs), (ins instsyncb_opt:$opt), MiscFrm, NoItinerary,
"isb", "\t$opt", []>,
Requires<[IsARM, HasDB]> {
bits<4> opt;
@@ -4636,11 +4670,11 @@ def : ARMInstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm",
(MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
c_imm:$CRm, 0, pred:$p)>;
def MRC : MovRCopro<"mrc", 1 /* from coprocessor to ARM core register */,
- (outs GPR:$Rt),
+ (outs GPRwithAPSR:$Rt),
(ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm,
imm0_7:$opc2), []>;
def : ARMInstAlias<"mrc${p} $cop, $opc1, $Rt, $CRn, $CRm",
- (MRC GPR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+ (MRC GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
c_imm:$CRm, 0, pred:$p)>;
def : ARMPat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
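(Illustrative note, not part of the patch.) Switching the MRC/MRC2 destination operand from GPR to GPRwithAPSR matches the architecture: with Rt = 0b1111 the transferred value goes to the APSR.{N,Z,C,V} flags rather than to the PC, so the operand class presumably needs an entry for APSR_nzcv alongside the ordinary core registers (GPRwithAPSR appears to come from the ARMRegisterInfo.td changes elsewhere in this patch).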
@@ -4650,7 +4684,7 @@ class MovRCopro2<string opc, bit direction, dag oops, dag iops,
list<dag> pattern>
: ABXI<0b1110, oops, iops, NoItinerary,
!strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), pattern> {
- let Inst{31-28} = 0b1111;
+ let Inst{31-24} = 0b11111110;
let Inst{20} = direction;
let Inst{4} = 1;
@@ -4679,11 +4713,11 @@ def : ARMInstAlias<"mcr2$ $cop, $opc1, $Rt, $CRn, $CRm",
(MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
c_imm:$CRm, 0)>;
def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */,
- (outs GPR:$Rt),
+ (outs GPRwithAPSR:$Rt),
(ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm,
imm0_7:$opc2), []>;
def : ARMInstAlias<"mrc2$ $cop, $opc1, $Rt, $CRn, $CRm",
- (MRC2 GPR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+ (MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
c_imm:$CRm, 0)>;
def : ARMV5TPat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn,
@@ -4820,7 +4854,7 @@ def MSRi : ABI<0b0011, (outs), (ins msr_mask:$mask, so_imm:$a), NoItinerary,
let isCall = 1,
Defs = [R0, R12, LR, CPSR], Uses = [SP] in {
def TPsoft : PseudoInst<(outs), (ins), IIC_Br,
- [(set R0, ARMthread_pointer)]>;
+ [(set R0, ARMthread_pointer)]>, Sched<[WriteBr]>;
}
//===----------------------------------------------------------------------===//
@@ -4884,7 +4918,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in
def MOVPCRX : ARMPseudoExpand<(outs), (ins GPR:$dst),
4, IIC_Br, [(brind GPR:$dst)],
(MOVr PC, GPR:$dst, (ops 14, zero_reg), zero_reg)>,
- Requires<[IsARM, NoV4T]>;
+ Requires<[IsARM, NoV4T]>, Sched<[WriteBr]>;
// Large immediate handling.
@@ -5233,7 +5267,7 @@ def RORi : ARMAsmPseudo<"ror${s}${p} $Rd, $Rm, $imm",
cc_out:$s)>;
}
def RRXi : ARMAsmPseudo<"rrx${s}${p} $Rd, $Rm",
- (ins GPRnopc:$Rd, GPRnopc:$Rm, pred:$p, cc_out:$s)>;
+ (ins GPR:$Rd, GPR:$Rm, pred:$p, cc_out:$s)>;
let TwoOperandAliasConstraint = "$Rn = $Rd" in {
def ASRr : ARMAsmPseudo<"asr${s}${p} $Rd, $Rn, $Rm",
(ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p,
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 896fd0f..9d1a8ea 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -626,7 +626,7 @@ class VLD1D<bits<4> op7_4, string Dt>
"vld1", Dt, "$Vd, $Rn", "", []> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
- let DecoderMethod = "DecodeVLDInstruction";
+ let DecoderMethod = "DecodeVLDST1Instruction";
}
class VLD1Q<bits<4> op7_4, string Dt>
: NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd),
@@ -634,7 +634,7 @@ class VLD1Q<bits<4> op7_4, string Dt>
"vld1", Dt, "$Vd, $Rn", "", []> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
- let DecoderMethod = "DecodeVLDInstruction";
+ let DecoderMethod = "DecodeVLDST1Instruction";
}
def VLD1d8 : VLD1D<{0,0,0,?}, "8">;
@@ -655,7 +655,7 @@ multiclass VLD1DWB<bits<4> op7_4, string Dt> {
"$Rn.addr = $wb", []> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{4} = Rn{4};
- let DecoderMethod = "DecodeVLDInstruction";
+ let DecoderMethod = "DecodeVLDST1Instruction";
let AsmMatchConverter = "cvtVLDwbFixed";
}
def _register : NLdSt<0,0b10,0b0111,op7_4, (outs VecListOneD:$Vd, GPR:$wb),
@@ -663,7 +663,7 @@ multiclass VLD1DWB<bits<4> op7_4, string Dt> {
"vld1", Dt, "$Vd, $Rn, $Rm",
"$Rn.addr = $wb", []> {
let Inst{4} = Rn{4};
- let DecoderMethod = "DecodeVLDInstruction";
+ let DecoderMethod = "DecodeVLDST1Instruction";
let AsmMatchConverter = "cvtVLDwbRegister";
}
}
@@ -674,7 +674,7 @@ multiclass VLD1QWB<bits<4> op7_4, string Dt> {
"$Rn.addr = $wb", []> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{5-4} = Rn{5-4};
- let DecoderMethod = "DecodeVLDInstruction";
+ let DecoderMethod = "DecodeVLDST1Instruction";
let AsmMatchConverter = "cvtVLDwbFixed";
}
def _register : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd, GPR:$wb),
@@ -682,7 +682,7 @@ multiclass VLD1QWB<bits<4> op7_4, string Dt> {
"vld1", Dt, "$Vd, $Rn, $Rm",
"$Rn.addr = $wb", []> {
let Inst{5-4} = Rn{5-4};
- let DecoderMethod = "DecodeVLDInstruction";
+ let DecoderMethod = "DecodeVLDST1Instruction";
let AsmMatchConverter = "cvtVLDwbRegister";
}
}
@@ -703,7 +703,7 @@ class VLD1D3<bits<4> op7_4, string Dt>
"$Vd, $Rn", "", []> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
- let DecoderMethod = "DecodeVLDInstruction";
+ let DecoderMethod = "DecodeVLDST1Instruction";
}
multiclass VLD1D3WB<bits<4> op7_4, string Dt> {
def _fixed : NLdSt<0,0b10,0b0110, op7_4, (outs VecListThreeD:$Vd, GPR:$wb),
@@ -712,7 +712,7 @@ multiclass VLD1D3WB<bits<4> op7_4, string Dt> {
"$Rn.addr = $wb", []> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{4} = Rn{4};
- let DecoderMethod = "DecodeVLDInstruction";
+ let DecoderMethod = "DecodeVLDST1Instruction";
let AsmMatchConverter = "cvtVLDwbFixed";
}
def _register : NLdSt<0,0b10,0b0110,op7_4, (outs VecListThreeD:$Vd, GPR:$wb),
@@ -720,7 +720,7 @@ multiclass VLD1D3WB<bits<4> op7_4, string Dt> {
"vld1", Dt, "$Vd, $Rn, $Rm",
"$Rn.addr = $wb", []> {
let Inst{4} = Rn{4};
- let DecoderMethod = "DecodeVLDInstruction";
+ let DecoderMethod = "DecodeVLDST1Instruction";
let AsmMatchConverter = "cvtVLDwbRegister";
}
}
@@ -744,7 +744,7 @@ class VLD1D4<bits<4> op7_4, string Dt>
"$Vd, $Rn", "", []> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
- let DecoderMethod = "DecodeVLDInstruction";
+ let DecoderMethod = "DecodeVLDST1Instruction";
}
multiclass VLD1D4WB<bits<4> op7_4, string Dt> {
def _fixed : NLdSt<0,0b10,0b0010, op7_4, (outs VecListFourD:$Vd, GPR:$wb),
@@ -753,7 +753,7 @@ multiclass VLD1D4WB<bits<4> op7_4, string Dt> {
"$Rn.addr = $wb", []> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{5-4} = Rn{5-4};
- let DecoderMethod = "DecodeVLDInstruction";
+ let DecoderMethod = "DecodeVLDST1Instruction";
let AsmMatchConverter = "cvtVLDwbFixed";
}
def _register : NLdSt<0,0b10,0b0010,op7_4, (outs VecListFourD:$Vd, GPR:$wb),
@@ -761,7 +761,7 @@ multiclass VLD1D4WB<bits<4> op7_4, string Dt> {
"vld1", Dt, "$Vd, $Rn, $Rm",
"$Rn.addr = $wb", []> {
let Inst{5-4} = Rn{5-4};
- let DecoderMethod = "DecodeVLDInstruction";
+ let DecoderMethod = "DecodeVLDST1Instruction";
let AsmMatchConverter = "cvtVLDwbRegister";
}
}
@@ -786,7 +786,7 @@ class VLD2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,
"vld2", Dt, "$Vd, $Rn", "", []> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
- let DecoderMethod = "DecodeVLDInstruction";
+ let DecoderMethod = "DecodeVLDST2Instruction";
}
def VLD2d8 : VLD2<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2>;
@@ -810,7 +810,7 @@ multiclass VLD2WB<bits<4> op11_8, bits<4> op7_4, string Dt,
"$Rn.addr = $wb", []> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{5-4} = Rn{5-4};
- let DecoderMethod = "DecodeVLDInstruction";
+ let DecoderMethod = "DecodeVLDST2Instruction";
let AsmMatchConverter = "cvtVLDwbFixed";
}
def _register : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd, GPR:$wb),
@@ -818,7 +818,7 @@ multiclass VLD2WB<bits<4> op11_8, bits<4> op7_4, string Dt,
"vld2", Dt, "$Vd, $Rn, $Rm",
"$Rn.addr = $wb", []> {
let Inst{5-4} = Rn{5-4};
- let DecoderMethod = "DecodeVLDInstruction";
+ let DecoderMethod = "DecodeVLDST2Instruction";
let AsmMatchConverter = "cvtVLDwbRegister";
}
}
@@ -853,7 +853,7 @@ class VLD3D<bits<4> op11_8, bits<4> op7_4, string Dt>
"vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn", "", []> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
- let DecoderMethod = "DecodeVLDInstruction";
+ let DecoderMethod = "DecodeVLDST3Instruction";
}
def VLD3d8 : VLD3D<0b0100, {0,0,0,?}, "8">;
@@ -872,7 +872,7 @@ class VLD3DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
"vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn$Rm",
"$Rn.addr = $wb", []> {
let Inst{4} = Rn{4};
- let DecoderMethod = "DecodeVLDInstruction";
+ let DecoderMethod = "DecodeVLDST3Instruction";
}
def VLD3d8_UPD : VLD3DWB<0b0100, {0,0,0,?}, "8">;
@@ -912,7 +912,7 @@ class VLD4D<bits<4> op11_8, bits<4> op7_4, string Dt>
"vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
- let DecoderMethod = "DecodeVLDInstruction";
+ let DecoderMethod = "DecodeVLDST4Instruction";
}
def VLD4d8 : VLD4D<0b0000, {0,0,?,?}, "8">;
@@ -931,7 +931,7 @@ class VLD4DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
"vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn$Rm",
"$Rn.addr = $wb", []> {
let Inst{5-4} = Rn{5-4};
- let DecoderMethod = "DecodeVLDInstruction";
+ let DecoderMethod = "DecodeVLDST4Instruction";
}
def VLD4d8_UPD : VLD4DWB<0b0000, {0,0,?,?}, "8">;
@@ -1580,14 +1580,14 @@ class VST1D<bits<4> op7_4, string Dt>
IIC_VST1, "vst1", Dt, "$Vd, $Rn", "", []> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
- let DecoderMethod = "DecodeVSTInstruction";
+ let DecoderMethod = "DecodeVLDST1Instruction";
}
class VST1Q<bits<4> op7_4, string Dt>
: NLdSt<0,0b00,0b1010,op7_4, (outs), (ins addrmode6:$Rn, VecListDPair:$Vd),
IIC_VST1x2, "vst1", Dt, "$Vd, $Rn", "", []> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
- let DecoderMethod = "DecodeVSTInstruction";
+ let DecoderMethod = "DecodeVLDST1Instruction";
}
def VST1d8 : VST1D<{0,0,0,?}, "8">;
@@ -1608,7 +1608,7 @@ multiclass VST1DWB<bits<4> op7_4, string Dt> {
"$Rn.addr = $wb", []> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{4} = Rn{4};
- let DecoderMethod = "DecodeVSTInstruction";
+ let DecoderMethod = "DecodeVLDST1Instruction";
let AsmMatchConverter = "cvtVSTwbFixed";
}
def _register : NLdSt<0,0b00,0b0111,op7_4, (outs GPR:$wb),
@@ -1617,7 +1617,7 @@ multiclass VST1DWB<bits<4> op7_4, string Dt> {
"vst1", Dt, "$Vd, $Rn, $Rm",
"$Rn.addr = $wb", []> {
let Inst{4} = Rn{4};
- let DecoderMethod = "DecodeVSTInstruction";
+ let DecoderMethod = "DecodeVLDST1Instruction";
let AsmMatchConverter = "cvtVSTwbRegister";
}
}
@@ -1628,7 +1628,7 @@ multiclass VST1QWB<bits<4> op7_4, string Dt> {
"$Rn.addr = $wb", []> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{5-4} = Rn{5-4};
- let DecoderMethod = "DecodeVSTInstruction";
+ let DecoderMethod = "DecodeVLDST1Instruction";
let AsmMatchConverter = "cvtVSTwbFixed";
}
def _register : NLdSt<0,0b00,0b1010,op7_4, (outs GPR:$wb),
@@ -1637,7 +1637,7 @@ multiclass VST1QWB<bits<4> op7_4, string Dt> {
"vst1", Dt, "$Vd, $Rn, $Rm",
"$Rn.addr = $wb", []> {
let Inst{5-4} = Rn{5-4};
- let DecoderMethod = "DecodeVSTInstruction";
+ let DecoderMethod = "DecodeVLDST1Instruction";
let AsmMatchConverter = "cvtVSTwbRegister";
}
}
@@ -1659,7 +1659,7 @@ class VST1D3<bits<4> op7_4, string Dt>
IIC_VST1x3, "vst1", Dt, "$Vd, $Rn", "", []> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
- let DecoderMethod = "DecodeVSTInstruction";
+ let DecoderMethod = "DecodeVLDST1Instruction";
}
multiclass VST1D3WB<bits<4> op7_4, string Dt> {
def _fixed : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb),
@@ -1668,7 +1668,7 @@ multiclass VST1D3WB<bits<4> op7_4, string Dt> {
"$Rn.addr = $wb", []> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{5-4} = Rn{5-4};
- let DecoderMethod = "DecodeVSTInstruction";
+ let DecoderMethod = "DecodeVLDST1Instruction";
let AsmMatchConverter = "cvtVSTwbFixed";
}
def _register : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb),
@@ -1677,7 +1677,7 @@ multiclass VST1D3WB<bits<4> op7_4, string Dt> {
"vst1", Dt, "$Vd, $Rn, $Rm",
"$Rn.addr = $wb", []> {
let Inst{5-4} = Rn{5-4};
- let DecoderMethod = "DecodeVSTInstruction";
+ let DecoderMethod = "DecodeVLDST1Instruction";
let AsmMatchConverter = "cvtVSTwbRegister";
}
}
@@ -1704,7 +1704,7 @@ class VST1D4<bits<4> op7_4, string Dt>
[]> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
- let DecoderMethod = "DecodeVSTInstruction";
+ let DecoderMethod = "DecodeVLDST1Instruction";
}
multiclass VST1D4WB<bits<4> op7_4, string Dt> {
def _fixed : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb),
@@ -1713,7 +1713,7 @@ multiclass VST1D4WB<bits<4> op7_4, string Dt> {
"$Rn.addr = $wb", []> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{5-4} = Rn{5-4};
- let DecoderMethod = "DecodeVSTInstruction";
+ let DecoderMethod = "DecodeVLDST1Instruction";
let AsmMatchConverter = "cvtVSTwbFixed";
}
def _register : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb),
@@ -1722,7 +1722,7 @@ multiclass VST1D4WB<bits<4> op7_4, string Dt> {
"vst1", Dt, "$Vd, $Rn, $Rm",
"$Rn.addr = $wb", []> {
let Inst{5-4} = Rn{5-4};
- let DecoderMethod = "DecodeVSTInstruction";
+ let DecoderMethod = "DecodeVLDST1Instruction";
let AsmMatchConverter = "cvtVSTwbRegister";
}
}
@@ -1748,7 +1748,7 @@ class VST2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,
itin, "vst2", Dt, "$Vd, $Rn", "", []> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
- let DecoderMethod = "DecodeVSTInstruction";
+ let DecoderMethod = "DecodeVLDST2Instruction";
}
def VST2d8 : VST2<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VST2>;
@@ -1772,7 +1772,7 @@ multiclass VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt,
"$Rn.addr = $wb", []> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{5-4} = Rn{5-4};
- let DecoderMethod = "DecodeVSTInstruction";
+ let DecoderMethod = "DecodeVLDST2Instruction";
let AsmMatchConverter = "cvtVSTwbFixed";
}
def _register : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb),
@@ -1780,7 +1780,7 @@ multiclass VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt,
"vst2", Dt, "$Vd, $Rn, $Rm",
"$Rn.addr = $wb", []> {
let Inst{5-4} = Rn{5-4};
- let DecoderMethod = "DecodeVSTInstruction";
+ let DecoderMethod = "DecodeVLDST2Instruction";
let AsmMatchConverter = "cvtVSTwbRegister";
}
}
@@ -1791,7 +1791,7 @@ multiclass VST2QWB<bits<4> op7_4, string Dt> {
"$Rn.addr = $wb", []> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{5-4} = Rn{5-4};
- let DecoderMethod = "DecodeVSTInstruction";
+ let DecoderMethod = "DecodeVLDST2Instruction";
let AsmMatchConverter = "cvtVSTwbFixed";
}
def _register : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb),
@@ -1800,7 +1800,7 @@ multiclass VST2QWB<bits<4> op7_4, string Dt> {
"vst2", Dt, "$Vd, $Rn, $Rm",
"$Rn.addr = $wb", []> {
let Inst{5-4} = Rn{5-4};
- let DecoderMethod = "DecodeVSTInstruction";
+ let DecoderMethod = "DecodeVLDST2Instruction";
let AsmMatchConverter = "cvtVSTwbRegister";
}
}
@@ -1835,7 +1835,7 @@ class VST3D<bits<4> op11_8, bits<4> op7_4, string Dt>
"vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn", "", []> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
- let DecoderMethod = "DecodeVSTInstruction";
+ let DecoderMethod = "DecodeVLDST3Instruction";
}
def VST3d8 : VST3D<0b0100, {0,0,0,?}, "8">;
@@ -1854,7 +1854,7 @@ class VST3DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
"vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn$Rm",
"$Rn.addr = $wb", []> {
let Inst{4} = Rn{4};
- let DecoderMethod = "DecodeVSTInstruction";
+ let DecoderMethod = "DecodeVLDST3Instruction";
}
def VST3d8_UPD : VST3DWB<0b0100, {0,0,0,?}, "8">;
@@ -1894,7 +1894,7 @@ class VST4D<bits<4> op11_8, bits<4> op7_4, string Dt>
"", []> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
- let DecoderMethod = "DecodeVSTInstruction";
+ let DecoderMethod = "DecodeVLDST4Instruction";
}
def VST4d8 : VST4D<0b0000, {0,0,?,?}, "8">;
@@ -1913,7 +1913,7 @@ class VST4DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
"vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm",
"$Rn.addr = $wb", []> {
let Inst{5-4} = Rn{5-4};
- let DecoderMethod = "DecodeVSTInstruction";
+ let DecoderMethod = "DecodeVLDST4Instruction";
}
def VST4d8_UPD : VST4DWB<0b0000, {0,0,?,?}, "8">;
@@ -5509,8 +5509,9 @@ class VEXTd<string OpcodeStr, string Dt, ValueType Ty, Operand immTy>
IIC_VEXTD, OpcodeStr, Dt, "$Vd, $Vn, $Vm, $index", "",
[(set DPR:$Vd, (Ty (NEONvext (Ty DPR:$Vn),
(Ty DPR:$Vm), imm:$index)))]> {
- bits<4> index;
- let Inst{11-8} = index{3-0};
+ bits<3> index;
+ let Inst{11} = 0b0;
+ let Inst{10-8} = index{2-0};
}
class VEXTq<string OpcodeStr, string Dt, ValueType Ty, Operand immTy>
@@ -5525,14 +5526,14 @@ class VEXTq<string OpcodeStr, string Dt, ValueType Ty, Operand immTy>
}
def VEXTd8 : VEXTd<"vext", "8", v8i8, imm0_7> {
- let Inst{11-8} = index{3-0};
+ let Inst{10-8} = index{2-0};
}
def VEXTd16 : VEXTd<"vext", "16", v4i16, imm0_3> {
- let Inst{11-9} = index{2-0};
+ let Inst{10-9} = index{1-0};
let Inst{8} = 0b0;
}
def VEXTd32 : VEXTd<"vext", "32", v2i32, imm0_1> {
- let Inst{11-10} = index{1-0};
+ let Inst{10} = index{0};
let Inst{9-8} = 0b00;
}
def : Pat<(v2f32 (NEONvext (v2f32 DPR:$Vn),
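(Illustrative note, not part of the patch.) Narrowing the VEXTd index from bits<4> to bits<3> reflects the 64-bit form of VEXT: a D register holds at most 8 byte lanes, so the byte index is 0-7 and fits in Inst{10-8}, with Inst{11} forced to 0 (the architecture treats imm4<3> = 1 in the 64-bit form as an invalid encoding). The per-type defs then scale the element index as before, e.g. VEXTd16 uses two bits with Inst{8} = 0 and VEXTd32 one bit with Inst{9-8} = 0b00.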
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index ae7a5c0..1fff41d 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -310,7 +310,7 @@ def tCPS : T1I<(outs), (ins imod_op:$imod, iflags_op:$iflags),
let isNotDuplicable = 1, isCodeGenOnly = 1 in
def tPICADD : TIt<(outs GPR:$dst), (ins GPR:$lhs, pclabel:$cp), IIC_iALUr, "",
[(set GPR:$dst, (ARMpic_add GPR:$lhs, imm:$cp))]>,
- T1Special<{0,0,?,?}> {
+ T1Special<{0,0,?,?}>, Sched<[WriteALU]> {
// A8.6.6
bits<3> dst;
let Inst{6-3} = 0b1111; // Rm = pc
@@ -323,7 +323,7 @@ def tPICADD : TIt<(outs GPR:$dst), (ins GPR:$lhs, pclabel:$cp), IIC_iALUr, "",
// probably because the instruction can be moved around.
def tADDrSPi : T1pI<(outs tGPR:$dst), (ins GPRsp:$sp, t_imm0_1020s4:$imm),
IIC_iALUi, "add", "\t$dst, $sp, $imm", []>,
- T1Encoding<{1,0,1,0,1,?}> {
+ T1Encoding<{1,0,1,0,1,?}>, Sched<[WriteALU]> {
// A6.2 & A8.6.8
bits<3> dst;
bits<8> imm;
@@ -335,7 +335,7 @@ def tADDrSPi : T1pI<(outs tGPR:$dst), (ins GPRsp:$sp, t_imm0_1020s4:$imm),
// ADD sp, sp, #<imm7>
def tADDspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm),
IIC_iALUi, "add", "\t$Rdn, $imm", []>,
- T1Misc<{0,0,0,0,0,?,?}> {
+ T1Misc<{0,0,0,0,0,?,?}>, Sched<[WriteALU]> {
// A6.2.5 & A8.6.8
bits<7> imm;
let Inst{6-0} = imm;
@@ -346,7 +346,7 @@ def tADDspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm),
// FIXME: The encoding and the ASM string don't match up.
def tSUBspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm),
IIC_iALUi, "sub", "\t$Rdn, $imm", []>,
- T1Misc<{0,0,0,0,1,?,?}> {
+ T1Misc<{0,0,0,0,1,?,?}>, Sched<[WriteALU]> {
// A6.2.5 & A8.6.214
bits<7> imm;
let Inst{6-0} = imm;
@@ -367,7 +367,7 @@ def : tInstAlias<"sub${p} sp, sp, $imm",
// ADD <Rm>, sp
def tADDrSP : T1pI<(outs GPR:$Rdn), (ins GPRsp:$sp, GPR:$Rn), IIC_iALUr,
"add", "\t$Rdn, $sp, $Rn", []>,
- T1Special<{0,0,?,?}> {
+ T1Special<{0,0,?,?}>, Sched<[WriteALU]> {
// A8.6.9 Encoding T1
bits<4> Rdn;
let Inst{7} = Rdn{3};
@@ -379,7 +379,7 @@ def tADDrSP : T1pI<(outs GPR:$Rdn), (ins GPRsp:$sp, GPR:$Rn), IIC_iALUr,
// ADD sp, <Rm>
def tADDspr : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, GPR:$Rm), IIC_iALUr,
"add", "\t$Rdn, $Rm", []>,
- T1Special<{0,0,?,?}> {
+ T1Special<{0,0,?,?}>, Sched<[WriteALU]> {
// A8.6.9 Encoding T2
bits<4> Rm;
let Inst{7} = 1;
@@ -395,7 +395,7 @@ def tADDspr : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, GPR:$Rm), IIC_iALUr,
// Indirect branches
let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
def tBX : TI<(outs), (ins GPR:$Rm, pred:$p), IIC_Br, "bx${p}\t$Rm", []>,
- T1Special<{1,1,0,?}> {
+ T1Special<{1,1,0,?}>, Sched<[WriteBr]> {
// A6.2.3 & A8.6.25
bits<4> Rm;
let Inst{6-3} = Rm;
@@ -406,12 +406,12 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
def tBX_RET : tPseudoExpand<(outs), (ins pred:$p), 2, IIC_Br,
- [(ARMretflag)], (tBX LR, pred:$p)>;
+ [(ARMretflag)], (tBX LR, pred:$p)>, Sched<[WriteBr]>;
// Alternative return instruction used by vararg functions.
def tBX_RET_vararg : tPseudoExpand<(outs), (ins tGPR:$Rm, pred:$p),
2, IIC_Br, [],
- (tBX GPR:$Rm, pred:$p)>;
+ (tBX GPR:$Rm, pred:$p)>, Sched<[WriteBr]>;
}
// All calls clobber the non-callee saved registers. SP is marked as a use to
@@ -424,7 +424,7 @@ let isCall = 1,
(outs), (ins pred:$p, t_bltarget:$func), IIC_Br,
"bl${p}\t$func",
[(ARMtcall tglobaladdr:$func)]>,
- Requires<[IsThumb]> {
+ Requires<[IsThumb]>, Sched<[WriteBrL]> {
bits<24> func;
let Inst{26} = func{23};
let Inst{25-16} = func{20-11};
@@ -438,7 +438,7 @@ let isCall = 1,
(outs), (ins pred:$p, t_blxtarget:$func), IIC_Br,
"blx${p}\t$func",
[(ARMcall tglobaladdr:$func)]>,
- Requires<[IsThumb, HasV5T]> {
+ Requires<[IsThumb, HasV5T]>, Sched<[WriteBrL]> {
bits<24> func;
let Inst{26} = func{23};
let Inst{25-16} = func{20-11};
@@ -453,7 +453,7 @@ let isCall = 1,
"blx${p}\t$func",
[(ARMtcall GPR:$func)]>,
Requires<[IsThumb, HasV5T]>,
- T1Special<{1,1,1,?}> { // A6.2.3 & A8.6.24;
+ T1Special<{1,1,1,?}>, Sched<[WriteBrL]> { // A6.2.3 & A8.6.24;
bits<4> func;
let Inst{6-3} = func;
let Inst{2-0} = 0b000;
@@ -463,14 +463,14 @@ let isCall = 1,
def tBX_CALL : tPseudoInst<(outs), (ins tGPR:$func),
4, IIC_Br,
[(ARMcall_nolink tGPR:$func)]>,
- Requires<[IsThumb, IsThumb1Only]>;
+ Requires<[IsThumb, IsThumb1Only]>, Sched<[WriteBr]>;
}
let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
let isPredicable = 1 in
def tB : T1pI<(outs), (ins t_brtarget:$target), IIC_Br,
"b", "\t$target", [(br bb:$target)]>,
- T1Encoding<{1,1,1,0,0,?}> {
+ T1Encoding<{1,1,1,0,0,?}>, Sched<[WriteBr]> {
bits<11> target;
let Inst{10-0} = target;
}
@@ -480,12 +480,14 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
// the clobber of LR.
let Defs = [LR] in
def tBfar : tPseudoExpand<(outs), (ins t_bltarget:$target, pred:$p),
- 4, IIC_Br, [], (tBL pred:$p, t_bltarget:$target)>;
+ 4, IIC_Br, [], (tBL pred:$p, t_bltarget:$target)>,
+ Sched<[WriteBrTbl]>;
def tBR_JTr : tPseudoInst<(outs),
(ins tGPR:$target, i32imm:$jt, i32imm:$id),
0, IIC_Br,
- [(ARMbrjt tGPR:$target, tjumptable:$jt, imm:$id)]> {
+ [(ARMbrjt tGPR:$target, tjumptable:$jt, imm:$id)]>,
+ Sched<[WriteBrTbl]> {
list<Predicate> Predicates = [IsThumb, IsThumb1Only];
}
}
@@ -496,7 +498,7 @@ let isBranch = 1, isTerminator = 1 in
def tBcc : T1I<(outs), (ins t_bcctarget:$target, pred:$p), IIC_Br,
"b${p}\t$target",
[/*(ARMbrcond bb:$target, imm:$cc)*/]>,
- T1BranchCond<{1,1,0,1}> {
+ T1BranchCond<{1,1,0,1}>, Sched<[WriteBr]> {
bits<4> p;
bits<8> target;
let Inst{11-8} = p;
@@ -510,7 +512,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
def tTAILJMPr : tPseudoExpand<(outs), (ins tcGPR:$dst),
4, IIC_Br, [],
(tBX GPR:$dst, (ops 14, zero_reg))>,
- Requires<[IsThumb]>;
+ Requires<[IsThumb]>, Sched<[WriteBr]>;
}
// tTAILJMPd: IOS version uses a Thumb2 branch (no Thumb1 tail calls
// on IOS), so it's in ARMInstrThumb2.td.
@@ -520,7 +522,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
(ins t_brtarget:$dst, pred:$p),
4, IIC_Br, [],
(tB t_brtarget:$dst, pred:$p)>,
- Requires<[IsThumb, IsNotIOS]>;
+ Requires<[IsThumb, IsNotIOS]>, Sched<[WriteBr]>;
}
}
@@ -530,7 +532,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
// If Inst{11-8} == 0b1111 then SEE SVC
let isCall = 1, Uses = [SP] in
def tSVC : T1pI<(outs), (ins imm0_255:$imm), IIC_Br,
- "svc", "\t$imm", []>, Encoding16 {
+ "svc", "\t$imm", []>, Encoding16, Sched<[WriteBr]> {
bits<8> imm;
let Inst{15-12} = 0b1101;
let Inst{11-8} = 0b1111;
@@ -540,7 +542,7 @@ def tSVC : T1pI<(outs), (ins imm0_255:$imm), IIC_Br,
// The assembler uses 0xDEFE for a trap instruction.
let isBarrier = 1, isTerminator = 1 in
def tTRAP : TI<(outs), (ins), IIC_Br,
- "trap", [(trap)]>, Encoding16 {
+ "trap", [(trap)]>, Encoding16, Sched<[WriteBr]> {
let Inst = 0xdefe;
}
@@ -833,14 +835,15 @@ let isCommutable = 1, Uses = [CPSR] in
def tADC : // A8.6.2
T1sItDPEncode<0b0101, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), IIC_iALUr,
"adc", "\t$Rdn, $Rm",
- [(set tGPR:$Rdn, (adde tGPR:$Rn, tGPR:$Rm))]>;
+ [(set tGPR:$Rdn, (adde tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
// Add immediate
def tADDi3 : // A8.6.4 T1
T1sIGenEncodeImm<0b01110, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
IIC_iALUi,
"add", "\t$Rd, $Rm, $imm3",
- [(set tGPR:$Rd, (add tGPR:$Rm, imm0_7:$imm3))]> {
+ [(set tGPR:$Rd, (add tGPR:$Rm, imm0_7:$imm3))]>,
+ Sched<[WriteALU]> {
bits<3> imm3;
let Inst{8-6} = imm3;
}
@@ -849,7 +852,8 @@ def tADDi8 : // A8.6.4 T2
T1sItGenEncodeImm<{1,1,0,?,?}, (outs tGPR:$Rdn),
(ins tGPR:$Rn, imm0_255:$imm8), IIC_iALUi,
"add", "\t$Rdn, $imm8",
- [(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255:$imm8))]>;
+ [(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255:$imm8))]>,
+ Sched<[WriteALU]>;
// Add register
let isCommutable = 1 in
@@ -857,12 +861,12 @@ def tADDrr : // A8.6.6 T1
T1sIGenEncode<0b01100, (outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
IIC_iALUr,
"add", "\t$Rd, $Rn, $Rm",
- [(set tGPR:$Rd, (add tGPR:$Rn, tGPR:$Rm))]>;
+ [(set tGPR:$Rd, (add tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
let neverHasSideEffects = 1 in
def tADDhirr : T1pIt<(outs GPR:$Rdn), (ins GPR:$Rn, GPR:$Rm), IIC_iALUr,
"add", "\t$Rdn, $Rm", []>,
- T1Special<{0,0,?,?}> {
+ T1Special<{0,0,?,?}>, Sched<[WriteALU]> {
// A8.6.6 T2
bits<4> Rdn;
bits<4> Rm;
@@ -877,14 +881,15 @@ def tAND : // A8.6.12
T1sItDPEncode<0b0000, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
IIC_iBITr,
"and", "\t$Rdn, $Rm",
- [(set tGPR:$Rdn, (and tGPR:$Rn, tGPR:$Rm))]>;
+ [(set tGPR:$Rdn, (and tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
// ASR immediate
def tASRri : // A8.6.14
T1sIGenEncodeImm<{0,1,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm_sr:$imm5),
IIC_iMOVsi,
"asr", "\t$Rd, $Rm, $imm5",
- [(set tGPR:$Rd, (sra tGPR:$Rm, (i32 imm_sr:$imm5)))]> {
+ [(set tGPR:$Rd, (sra tGPR:$Rm, (i32 imm_sr:$imm5)))]>,
+ Sched<[WriteALU]> {
bits<5> imm5;
let Inst{10-6} = imm5;
}
@@ -894,14 +899,15 @@ def tASRrr : // A8.6.15
T1sItDPEncode<0b0100, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
IIC_iMOVsr,
"asr", "\t$Rdn, $Rm",
- [(set tGPR:$Rdn, (sra tGPR:$Rn, tGPR:$Rm))]>;
+ [(set tGPR:$Rdn, (sra tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
// BIC register
def tBIC : // A8.6.20
T1sItDPEncode<0b1110, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
IIC_iBITr,
"bic", "\t$Rdn, $Rm",
- [(set tGPR:$Rdn, (and tGPR:$Rn, (not tGPR:$Rm)))]>;
+ [(set tGPR:$Rdn, (and tGPR:$Rn, (not tGPR:$Rm)))]>,
+ Sched<[WriteALU]>;
// CMN register
let isCompare = 1, Defs = [CPSR] in {
@@ -917,7 +923,7 @@ def tCMNz : // A8.6.33
T1pIDPEncode<0b1011, (outs), (ins tGPR:$Rn, tGPR:$Rm),
IIC_iCMPr,
"cmn", "\t$Rn, $Rm",
- [(ARMcmpZ tGPR:$Rn, (ineg tGPR:$Rm))]>;
+ [(ARMcmpZ tGPR:$Rn, (ineg tGPR:$Rm))]>, Sched<[WriteCMP]>;
} // isCompare = 1, Defs = [CPSR]
@@ -926,7 +932,7 @@ let isCompare = 1, Defs = [CPSR] in {
def tCMPi8 : T1pI<(outs), (ins tGPR:$Rn, imm0_255:$imm8), IIC_iCMPi,
"cmp", "\t$Rn, $imm8",
[(ARMcmp tGPR:$Rn, imm0_255:$imm8)]>,
- T1General<{1,0,1,?,?}> {
+ T1General<{1,0,1,?,?}>, Sched<[WriteCMP]> {
// A8.6.35
bits<3> Rn;
bits<8> imm8;
@@ -939,11 +945,11 @@ def tCMPr : // A8.6.36 T1
T1pIDPEncode<0b1010, (outs), (ins tGPR:$Rn, tGPR:$Rm),
IIC_iCMPr,
"cmp", "\t$Rn, $Rm",
- [(ARMcmp tGPR:$Rn, tGPR:$Rm)]>;
+ [(ARMcmp tGPR:$Rn, tGPR:$Rm)]>, Sched<[WriteCMP]>;
def tCMPhir : T1pI<(outs), (ins GPR:$Rn, GPR:$Rm), IIC_iCMPr,
"cmp", "\t$Rn, $Rm", []>,
- T1Special<{0,1,?,?}> {
+ T1Special<{0,1,?,?}>, Sched<[WriteCMP]> {
// A8.6.36 T2
bits<4> Rm;
bits<4> Rn;
@@ -960,14 +966,15 @@ def tEOR : // A8.6.45
T1sItDPEncode<0b0001, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
IIC_iBITr,
"eor", "\t$Rdn, $Rm",
- [(set tGPR:$Rdn, (xor tGPR:$Rn, tGPR:$Rm))]>;
+ [(set tGPR:$Rdn, (xor tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
// LSL immediate
def tLSLri : // A8.6.88
T1sIGenEncodeImm<{0,0,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_31:$imm5),
IIC_iMOVsi,
"lsl", "\t$Rd, $Rm, $imm5",
- [(set tGPR:$Rd, (shl tGPR:$Rm, (i32 imm:$imm5)))]> {
+ [(set tGPR:$Rd, (shl tGPR:$Rm, (i32 imm:$imm5)))]>,
+ Sched<[WriteALU]> {
bits<5> imm5;
let Inst{10-6} = imm5;
}
@@ -977,14 +984,15 @@ def tLSLrr : // A8.6.89
T1sItDPEncode<0b0010, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
IIC_iMOVsr,
"lsl", "\t$Rdn, $Rm",
- [(set tGPR:$Rdn, (shl tGPR:$Rn, tGPR:$Rm))]>;
+ [(set tGPR:$Rdn, (shl tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
// LSR immediate
def tLSRri : // A8.6.90
T1sIGenEncodeImm<{0,0,1,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm_sr:$imm5),
IIC_iMOVsi,
"lsr", "\t$Rd, $Rm, $imm5",
- [(set tGPR:$Rd, (srl tGPR:$Rm, (i32 imm_sr:$imm5)))]> {
+ [(set tGPR:$Rd, (srl tGPR:$Rm, (i32 imm_sr:$imm5)))]>,
+ Sched<[WriteALU]> {
bits<5> imm5;
let Inst{10-6} = imm5;
}
@@ -994,14 +1002,14 @@ def tLSRrr : // A8.6.91
T1sItDPEncode<0b0011, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
IIC_iMOVsr,
"lsr", "\t$Rdn, $Rm",
- [(set tGPR:$Rdn, (srl tGPR:$Rn, tGPR:$Rm))]>;
+ [(set tGPR:$Rdn, (srl tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
// Move register
let isMoveImm = 1 in
def tMOVi8 : T1sI<(outs tGPR:$Rd), (ins imm0_255:$imm8), IIC_iMOVi,
"mov", "\t$Rd, $imm8",
[(set tGPR:$Rd, imm0_255:$imm8)]>,
- T1General<{1,0,0,?,?}> {
+ T1General<{1,0,0,?,?}>, Sched<[WriteALU]> {
// A8.6.96
bits<3> Rd;
bits<8> imm8;
@@ -1019,7 +1027,7 @@ let neverHasSideEffects = 1 in {
def tMOVr : Thumb1pI<(outs GPR:$Rd), (ins GPR:$Rm), AddrModeNone,
2, IIC_iMOVr,
"mov", "\t$Rd, $Rm", "", []>,
- T1Special<{1,0,?,?}> {
+ T1Special<{1,0,?,?}>, Sched<[WriteALU]> {
// A8.6.97
bits<4> Rd;
bits<4> Rm;
@@ -1029,7 +1037,7 @@ def tMOVr : Thumb1pI<(outs GPR:$Rd), (ins GPR:$Rm), AddrModeNone,
}
let Defs = [CPSR] in
def tMOVSr : T1I<(outs tGPR:$Rd), (ins tGPR:$Rm), IIC_iMOVr,
- "movs\t$Rd, $Rm", []>, Encoding16 {
+ "movs\t$Rd, $Rm", []>, Encoding16, Sched<[WriteALU]> {
// A8.6.97
bits<3> Rd;
bits<3> Rm;
@@ -1060,7 +1068,7 @@ def :tInstAlias<"mul${s}${p} $Rdm, $Rn", (tMUL tGPR:$Rdm, s_cc_out:$s, tGPR:$Rn,
def tMVN : // A8.6.107
T1sIDPEncode<0b1111, (outs tGPR:$Rd), (ins tGPR:$Rn), IIC_iMVNr,
"mvn", "\t$Rd, $Rn",
- [(set tGPR:$Rd, (not tGPR:$Rn))]>;
+ [(set tGPR:$Rd, (not tGPR:$Rn))]>, Sched<[WriteALU]>;
// Bitwise or register
let isCommutable = 1 in
@@ -1068,7 +1076,7 @@ def tORR : // A8.6.114
T1sItDPEncode<0b1100, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
IIC_iBITr,
"orr", "\t$Rdn, $Rm",
- [(set tGPR:$Rdn, (or tGPR:$Rn, tGPR:$Rm))]>;
+ [(set tGPR:$Rdn, (or tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
// Swaps
def tREV : // A8.6.134
@@ -1076,35 +1084,36 @@ def tREV : // A8.6.134
IIC_iUNAr,
"rev", "\t$Rd, $Rm",
[(set tGPR:$Rd, (bswap tGPR:$Rm))]>,
- Requires<[IsThumb, IsThumb1Only, HasV6]>;
+ Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>;
def tREV16 : // A8.6.135
T1pIMiscEncode<{1,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
IIC_iUNAr,
"rev16", "\t$Rd, $Rm",
[(set tGPR:$Rd, (rotr (bswap tGPR:$Rm), (i32 16)))]>,
- Requires<[IsThumb, IsThumb1Only, HasV6]>;
+ Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>;
def tREVSH : // A8.6.136
T1pIMiscEncode<{1,0,1,0,1,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
IIC_iUNAr,
"revsh", "\t$Rd, $Rm",
[(set tGPR:$Rd, (sra (bswap tGPR:$Rm), (i32 16)))]>,
- Requires<[IsThumb, IsThumb1Only, HasV6]>;
+ Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>;
// Rotate right register
def tROR : // A8.6.139
T1sItDPEncode<0b0111, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
IIC_iMOVsr,
"ror", "\t$Rdn, $Rm",
- [(set tGPR:$Rdn, (rotr tGPR:$Rn, tGPR:$Rm))]>;
+ [(set tGPR:$Rdn, (rotr tGPR:$Rn, tGPR:$Rm))]>,
+ Sched<[WriteALU]>;
// Negate register
def tRSB : // A8.6.141
T1sIDPEncode<0b1001, (outs tGPR:$Rd), (ins tGPR:$Rn),
IIC_iALUi,
"rsb", "\t$Rd, $Rn, #0",
- [(set tGPR:$Rd, (ineg tGPR:$Rn))]>;
+ [(set tGPR:$Rd, (ineg tGPR:$Rn))]>, Sched<[WriteALU]>;
// Subtract with carry register
let Uses = [CPSR] in
@@ -1112,14 +1121,16 @@ def tSBC : // A8.6.151
T1sItDPEncode<0b0110, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
IIC_iALUr,
"sbc", "\t$Rdn, $Rm",
- [(set tGPR:$Rdn, (sube tGPR:$Rn, tGPR:$Rm))]>;
+ [(set tGPR:$Rdn, (sube tGPR:$Rn, tGPR:$Rm))]>,
+ Sched<[WriteALU]>;
// Subtract immediate
def tSUBi3 : // A8.6.210 T1
T1sIGenEncodeImm<0b01111, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
IIC_iALUi,
"sub", "\t$Rd, $Rm, $imm3",
- [(set tGPR:$Rd, (add tGPR:$Rm, imm0_7_neg:$imm3))]> {
+ [(set tGPR:$Rd, (add tGPR:$Rm, imm0_7_neg:$imm3))]>,
+ Sched<[WriteALU]> {
bits<3> imm3;
let Inst{8-6} = imm3;
}
@@ -1128,14 +1139,16 @@ def tSUBi8 : // A8.6.210 T2
T1sItGenEncodeImm<{1,1,1,?,?}, (outs tGPR:$Rdn),
(ins tGPR:$Rn, imm0_255:$imm8), IIC_iALUi,
"sub", "\t$Rdn, $imm8",
- [(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255_neg:$imm8))]>;
+ [(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255_neg:$imm8))]>,
+ Sched<[WriteALU]>;
// Subtract register
def tSUBrr : // A8.6.212
T1sIGenEncode<0b01101, (outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
IIC_iALUr,
"sub", "\t$Rd, $Rn, $Rm",
- [(set tGPR:$Rd, (sub tGPR:$Rn, tGPR:$Rm))]>;
+ [(set tGPR:$Rd, (sub tGPR:$Rn, tGPR:$Rm))]>,
+ Sched<[WriteALU]>;
// Sign-extend byte
def tSXTB : // A8.6.222
@@ -1143,7 +1156,8 @@ def tSXTB : // A8.6.222
IIC_iUNAr,
"sxtb", "\t$Rd, $Rm",
[(set tGPR:$Rd, (sext_inreg tGPR:$Rm, i8))]>,
- Requires<[IsThumb, IsThumb1Only, HasV6]>;
+ Requires<[IsThumb, IsThumb1Only, HasV6]>,
+ Sched<[WriteALU]>;
// Sign-extend short
def tSXTH : // A8.6.224
@@ -1151,14 +1165,16 @@ def tSXTH : // A8.6.224
IIC_iUNAr,
"sxth", "\t$Rd, $Rm",
[(set tGPR:$Rd, (sext_inreg tGPR:$Rm, i16))]>,
- Requires<[IsThumb, IsThumb1Only, HasV6]>;
+ Requires<[IsThumb, IsThumb1Only, HasV6]>,
+ Sched<[WriteALU]>;
// Test
let isCompare = 1, isCommutable = 1, Defs = [CPSR] in
def tTST : // A8.6.230
T1pIDPEncode<0b1000, (outs), (ins tGPR:$Rn, tGPR:$Rm), IIC_iTSTr,
"tst", "\t$Rn, $Rm",
- [(ARMcmpZ (and_su tGPR:$Rn, tGPR:$Rm), 0)]>;
+ [(ARMcmpZ (and_su tGPR:$Rn, tGPR:$Rm), 0)]>,
+ Sched<[WriteALU]>;
// Zero-extend byte
def tUXTB : // A8.6.262
@@ -1166,7 +1182,8 @@ def tUXTB : // A8.6.262
IIC_iUNAr,
"uxtb", "\t$Rd, $Rm",
[(set tGPR:$Rd, (and tGPR:$Rm, 0xFF))]>,
- Requires<[IsThumb, IsThumb1Only, HasV6]>;
+ Requires<[IsThumb, IsThumb1Only, HasV6]>,
+ Sched<[WriteALU]>;
// Zero-extend short
def tUXTH : // A8.6.264
@@ -1174,7 +1191,7 @@ def tUXTH : // A8.6.264
IIC_iUNAr,
"uxth", "\t$Rd, $Rm",
[(set tGPR:$Rd, (and tGPR:$Rm, 0xFFFF))]>,
- Requires<[IsThumb, IsThumb1Only, HasV6]>;
+ Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>;
// Conditional move tMOVCCr - Used to implement the Thumb SELECT_CC operation.
// Expanded after instruction selection into a branch sequence.
@@ -1189,7 +1206,7 @@ let usesCustomInserter = 1 in // Expanded after instruction selection.
def tADR : T1I<(outs tGPR:$Rd), (ins t_adrlabel:$addr, pred:$p),
IIC_iALUi, "adr{$p}\t$Rd, $addr", []>,
- T1Encoding<{1,0,1,0,0,?}> {
+ T1Encoding<{1,0,1,0,0,?}>, Sched<[WriteALU]> {
bits<3> Rd;
bits<8> addr;
let Inst{10-8} = Rd;
@@ -1199,12 +1216,12 @@ def tADR : T1I<(outs tGPR:$Rd), (ins t_adrlabel:$addr, pred:$p),
let neverHasSideEffects = 1, isReMaterializable = 1 in
def tLEApcrel : tPseudoInst<(outs tGPR:$Rd), (ins i32imm:$label, pred:$p),
- 2, IIC_iALUi, []>;
+ 2, IIC_iALUi, []>, Sched<[WriteALU]>;
let hasSideEffects = 1 in
def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd),
(ins i32imm:$label, nohash_imm:$id, pred:$p),
- 2, IIC_iALUi, []>;
+ 2, IIC_iALUi, []>, Sched<[WriteALU]>;
//===----------------------------------------------------------------------===//
// TLS Instructions
@@ -1215,7 +1232,8 @@ def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd),
// complete with fixup for the aeabi_read_tp function.
let isCall = 1, Defs = [R0, R12, LR, CPSR], Uses = [SP] in
def tTPsoft : tPseudoInst<(outs), (ins), 4, IIC_Br,
- [(set R0, ARMthread_pointer)]>;
+ [(set R0, ARMthread_pointer)]>,
+ Sched<[WriteBr]>;
//===----------------------------------------------------------------------===//
// SJLJ Exception handling intrinsics
@@ -1381,13 +1399,13 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
hasExtraDefRegAllocReq = 1 in
def tPOP_RET : tPseudoExpand<(outs), (ins pred:$p, reglist:$regs, variable_ops),
2, IIC_iPop_Br, [],
- (tPOP pred:$p, reglist:$regs)>;
+ (tPOP pred:$p, reglist:$regs)>, Sched<[WriteBrL]>;
// Indirect branch using "mov pc, $Rm"
let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
def tBRIND : tPseudoExpand<(outs), (ins GPR:$Rm, pred:$p),
2, IIC_Br, [(brind GPR:$Rm)],
- (tMOVr PC, GPR:$Rm, pred:$p)>;
+ (tMOVr PC, GPR:$Rm, pred:$p)>, Sched<[WriteBr]>;
}
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 4dacb86..ff21bf7 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -554,7 +554,8 @@ multiclass T2I_bin_irs<bits<4> opcod, string opc,
def ri : T2sTwoRegImm<
(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), iii,
opc, "\t$Rd, $Rn, $imm",
- [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_imm:$imm))]> {
+ [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_imm:$imm))]>,
+ Sched<[WriteALU, ReadALU]> {
let Inst{31-27} = 0b11110;
let Inst{25} = 0;
let Inst{24-21} = opcod;
@@ -563,7 +564,8 @@ multiclass T2I_bin_irs<bits<4> opcod, string opc,
// register
def rr : T2sThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), iir,
opc, !strconcat(wide, "\t$Rd, $Rn, $Rm"),
- [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]> {
+ [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]>,
+ Sched<[WriteALU, ReadALU, ReadALU]> {
let isCommutable = Commutable;
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
@@ -576,7 +578,8 @@ multiclass T2I_bin_irs<bits<4> opcod, string opc,
def rs : T2sTwoRegShiftedReg<
(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm), iis,
opc, !strconcat(wide, "\t$Rd, $Rn, $ShiftedRm"),
- [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm))]> {
+ [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm))]>,
+ Sched<[WriteALUsi, ReadALU]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
let Inst{24-21} = opcod;
@@ -635,7 +638,8 @@ multiclass T2I_rbin_irs<bits<4> opcod, string opc, PatFrag opnode> {
def ri : T2sTwoRegImm<
(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), IIC_iALUi,
opc, ".w\t$Rd, $Rn, $imm",
- [(set rGPR:$Rd, (opnode t2_so_imm:$imm, rGPR:$Rn))]> {
+ [(set rGPR:$Rd, (opnode t2_so_imm:$imm, rGPR:$Rn))]>,
+ Sched<[WriteALU, ReadALU]> {
let Inst{31-27} = 0b11110;
let Inst{25} = 0;
let Inst{24-21} = opcod;
@@ -645,7 +649,8 @@ multiclass T2I_rbin_irs<bits<4> opcod, string opc, PatFrag opnode> {
def rr : T2sThreeReg<
(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUr,
opc, "\t$Rd, $Rn, $Rm",
- [/* For disassembly only; pattern left blank */]> {
+ [/* For disassembly only; pattern left blank */]>,
+ Sched<[WriteALU, ReadALU, ReadALU]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
let Inst{24-21} = opcod;
@@ -657,7 +662,8 @@ multiclass T2I_rbin_irs<bits<4> opcod, string opc, PatFrag opnode> {
def rs : T2sTwoRegShiftedReg<
(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm),
IIC_iALUsir, opc, "\t$Rd, $Rn, $ShiftedRm",
- [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm, rGPR:$Rn))]> {
+ [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm, rGPR:$Rn))]>,
+ Sched<[WriteALUsi, ReadALU]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
let Inst{24-21} = opcod;
@@ -678,12 +684,14 @@ multiclass T2I_bin_s_irs<InstrItinClass iii, InstrItinClass iir,
(ins GPRnopc:$Rn, t2_so_imm:$imm, pred:$p),
4, iii,
[(set rGPR:$Rd, CPSR, (opnode GPRnopc:$Rn,
- t2_so_imm:$imm))]>;
+ t2_so_imm:$imm))]>,
+ Sched<[WriteALU, ReadALU]>;
// register
def rr : t2PseudoInst<(outs rGPR:$Rd), (ins GPRnopc:$Rn, rGPR:$Rm, pred:$p),
4, iir,
[(set rGPR:$Rd, CPSR, (opnode GPRnopc:$Rn,
- rGPR:$Rm))]> {
+ rGPR:$Rm))]>,
+ Sched<[WriteALU, ReadALU, ReadALU]> {
let isCommutable = Commutable;
}
// shifted register
@@ -691,7 +699,8 @@ multiclass T2I_bin_s_irs<InstrItinClass iii, InstrItinClass iir,
(ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm, pred:$p),
4, iis,
[(set rGPR:$Rd, CPSR, (opnode GPRnopc:$Rn,
- t2_so_reg:$ShiftedRm))]>;
+ t2_so_reg:$ShiftedRm))]>,
+ Sched<[WriteALUsi, ReadALUsr]>;
}
}
@@ -704,13 +713,15 @@ multiclass T2I_rbin_s_is<PatFrag opnode> {
(ins rGPR:$Rn, t2_so_imm:$imm, pred:$p),
4, IIC_iALUi,
[(set rGPR:$Rd, CPSR, (opnode t2_so_imm:$imm,
- rGPR:$Rn))]>;
+ rGPR:$Rn))]>,
+ Sched<[WriteALU, ReadALU]>;
// shifted register
def rs : t2PseudoInst<(outs rGPR:$Rd),
(ins rGPR:$Rn, t2_so_reg:$ShiftedRm, pred:$p),
4, IIC_iALUsi,
[(set rGPR:$Rd, CPSR, (opnode t2_so_reg:$ShiftedRm,
- rGPR:$Rn))]>;
+ rGPR:$Rn))]>,
+ Sched<[WriteALUsi, ReadALU]>;
}
}
@@ -725,7 +736,8 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode,
def ri : T2sTwoRegImm<
(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, t2_so_imm:$imm), IIC_iALUi,
opc, ".w\t$Rd, $Rn, $imm",
- [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, t2_so_imm:$imm))]> {
+ [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, t2_so_imm:$imm))]>,
+ Sched<[WriteALU, ReadALU]> {
let Inst{31-27} = 0b11110;
let Inst{25} = 0;
let Inst{24} = 1;
@@ -737,7 +749,8 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode,
def ri12 : T2I<
(outs GPRnopc:$Rd), (ins GPR:$Rn, imm0_4095:$imm), IIC_iALUi,
!strconcat(opc, "w"), "\t$Rd, $Rn, $imm",
- [(set GPRnopc:$Rd, (opnode GPR:$Rn, imm0_4095:$imm))]> {
+ [(set GPRnopc:$Rd, (opnode GPR:$Rn, imm0_4095:$imm))]>,
+ Sched<[WriteALU, ReadALU]> {
bits<4> Rd;
bits<4> Rn;
bits<12> imm;
@@ -755,7 +768,8 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode,
// register
def rr : T2sThreeReg<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, rGPR:$Rm),
IIC_iALUr, opc, ".w\t$Rd, $Rn, $Rm",
- [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, rGPR:$Rm))]> {
+ [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, rGPR:$Rm))]>,
+ Sched<[WriteALU, ReadALU, ReadALU]> {
let isCommutable = Commutable;
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
@@ -769,7 +783,8 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode,
def rs : T2sTwoRegShiftedReg<
(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm),
IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm",
- [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, t2_so_reg:$ShiftedRm))]> {
+ [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, t2_so_reg:$ShiftedRm))]>,
+ Sched<[WriteALUsi, ReadALU]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
let Inst{24} = 1;
@@ -787,7 +802,7 @@ multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
def ri : T2sTwoRegImm<(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm),
IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
[(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, t2_so_imm:$imm, CPSR))]>,
- Requires<[IsThumb2]> {
+ Requires<[IsThumb2]>, Sched<[WriteALU, ReadALU]> {
let Inst{31-27} = 0b11110;
let Inst{25} = 0;
let Inst{24-21} = opcod;
@@ -797,7 +812,7 @@ multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
def rr : T2sThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUr,
opc, ".w\t$Rd, $Rn, $Rm",
[(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, rGPR:$Rm, CPSR))]>,
- Requires<[IsThumb2]> {
+ Requires<[IsThumb2]>, Sched<[WriteALU, ReadALU, ReadALU]> {
let isCommutable = Commutable;
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
@@ -811,7 +826,7 @@ multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm),
IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm",
[(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm, CPSR))]>,
- Requires<[IsThumb2]> {
+ Requires<[IsThumb2]>, Sched<[WriteALUsi, ReadALU]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
let Inst{24-21} = opcod;
@@ -826,7 +841,8 @@ multiclass T2I_sh_ir<bits<2> opcod, string opc, Operand ty, PatFrag opnode> {
def ri : T2sTwoRegShiftImm<
(outs rGPR:$Rd), (ins rGPR:$Rm, ty:$imm), IIC_iMOVsi,
opc, ".w\t$Rd, $Rm, $imm",
- [(set rGPR:$Rd, (opnode rGPR:$Rm, (i32 ty:$imm)))]> {
+ [(set rGPR:$Rd, (opnode rGPR:$Rm, (i32 ty:$imm)))]>,
+ Sched<[WriteALU]> {
let Inst{31-27} = 0b11101;
let Inst{26-21} = 0b010010;
let Inst{19-16} = 0b1111; // Rn
@@ -836,7 +852,8 @@ multiclass T2I_sh_ir<bits<2> opcod, string opc, Operand ty, PatFrag opnode> {
def rr : T2sThreeReg<
(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMOVsr,
opc, ".w\t$Rd, $Rn, $Rm",
- [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]> {
+ [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]>,
+ Sched<[WriteALU]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0100;
let Inst{22-21} = opcod;
@@ -880,7 +897,7 @@ let isCompare = 1, Defs = [CPSR] in {
def ri : T2OneRegCmpImm<
(outs), (ins GPRnopc:$Rn, t2_so_imm:$imm), iii,
opc, ".w\t$Rn, $imm",
- [(opnode GPRnopc:$Rn, t2_so_imm:$imm)]> {
+ [(opnode GPRnopc:$Rn, t2_so_imm:$imm)]>, Sched<[WriteCMP]> {
let Inst{31-27} = 0b11110;
let Inst{25} = 0;
let Inst{24-21} = opcod;
@@ -892,7 +909,7 @@ let isCompare = 1, Defs = [CPSR] in {
def rr : T2TwoRegCmp<
(outs), (ins GPRnopc:$Rn, rGPR:$Rm), iir,
opc, ".w\t$Rn, $Rm",
- [(opnode GPRnopc:$Rn, rGPR:$Rm)]> {
+ [(opnode GPRnopc:$Rn, rGPR:$Rm)]>, Sched<[WriteCMP]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
let Inst{24-21} = opcod;
@@ -906,7 +923,8 @@ let isCompare = 1, Defs = [CPSR] in {
def rs : T2OneRegCmpShiftedReg<
(outs), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm), iis,
opc, ".w\t$Rn, $ShiftedRm",
- [(opnode GPRnopc:$Rn, t2_so_reg:$ShiftedRm)]> {
+ [(opnode GPRnopc:$Rn, t2_so_reg:$ShiftedRm)]>,
+ Sched<[WriteCMPsi]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
let Inst{24-21} = opcod;
@@ -1167,7 +1185,8 @@ class T2PCOneRegImm<dag oops, dag iops, InstrItinClass itin,
// assembler.
def t2ADR : T2PCOneRegImm<(outs rGPR:$Rd),
(ins t2adrlabel:$addr, pred:$p),
- IIC_iALUi, "adr{$p}.w\t$Rd, $addr", []> {
+ IIC_iALUi, "adr{$p}.w\t$Rd, $addr", []>,
+ Sched<[WriteALU, ReadALU]> {
let Inst{31-27} = 0b11110;
let Inst{25-24} = 0b10;
// Inst{23:21} = '11' (add = FALSE) or '00' (add = TRUE)
@@ -1190,12 +1209,12 @@ def t2ADR : T2PCOneRegImm<(outs rGPR:$Rd),
let neverHasSideEffects = 1, isReMaterializable = 1 in
def t2LEApcrel : t2PseudoInst<(outs rGPR:$Rd), (ins i32imm:$label, pred:$p),
- 4, IIC_iALUi, []>;
+ 4, IIC_iALUi, []>, Sched<[WriteALU, ReadALU]>;
let hasSideEffects = 1 in
def t2LEApcrelJT : t2PseudoInst<(outs rGPR:$Rd),
(ins i32imm:$label, nohash_imm:$id, pred:$p),
4, IIC_iALUi,
- []>;
+ []>, Sched<[WriteALU, ReadALU]>;
//===----------------------------------------------------------------------===//
@@ -1520,7 +1539,8 @@ multiclass T2Ipl<bits<1> write, bits<1> instr, string opc> {
def i12 : T2Ii12<(outs), (ins t2addrmode_imm12:$addr), IIC_Preload, opc,
"\t$addr",
- [(ARMPreload t2addrmode_imm12:$addr, (i32 write), (i32 instr))]> {
+ [(ARMPreload t2addrmode_imm12:$addr, (i32 write), (i32 instr))]>,
+ Sched<[WritePreLd]> {
let Inst{31-25} = 0b1111100;
let Inst{24} = instr;
let Inst{22} = 0;
@@ -1537,7 +1557,8 @@ multiclass T2Ipl<bits<1> write, bits<1> instr, string opc> {
def i8 : T2Ii8<(outs), (ins t2addrmode_negimm8:$addr), IIC_Preload, opc,
"\t$addr",
- [(ARMPreload t2addrmode_negimm8:$addr, (i32 write), (i32 instr))]> {
+ [(ARMPreload t2addrmode_negimm8:$addr, (i32 write), (i32 instr))]>,
+ Sched<[WritePreLd]> {
let Inst{31-25} = 0b1111100;
let Inst{24} = instr;
let Inst{23} = 0; // U = 0
@@ -1554,7 +1575,8 @@ multiclass T2Ipl<bits<1> write, bits<1> instr, string opc> {
def s : T2Iso<(outs), (ins t2addrmode_so_reg:$addr), IIC_Preload, opc,
"\t$addr",
- [(ARMPreload t2addrmode_so_reg:$addr, (i32 write), (i32 instr))]> {
+ [(ARMPreload t2addrmode_so_reg:$addr, (i32 write), (i32 instr))]>,
+ Sched<[WritePreLd]> {
let Inst{31-25} = 0b1111100;
let Inst{24} = instr;
let Inst{23} = 0; // add = TRUE for T1
@@ -1743,7 +1765,7 @@ defm t2STM : thumb2_st_mult<"stm", IIC_iStore_m, IIC_iStore_mu, 0>;
let neverHasSideEffects = 1 in
def t2MOVr : T2sTwoReg<(outs GPRnopc:$Rd), (ins GPR:$Rm), IIC_iMOVr,
- "mov", ".w\t$Rd, $Rm", []> {
+ "mov", ".w\t$Rd, $Rm", []>, Sched<[WriteALU]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
let Inst{24-21} = 0b0010;
@@ -1763,7 +1785,7 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1,
AddedComplexity = 1 in
def t2MOVi : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), IIC_iMOVi,
"mov", ".w\t$Rd, $imm",
- [(set rGPR:$Rd, t2_so_imm:$imm)]> {
+ [(set rGPR:$Rd, t2_so_imm:$imm)]>, Sched<[WriteALU]> {
let Inst{31-27} = 0b11110;
let Inst{25} = 0;
let Inst{24-21} = 0b0010;
@@ -1786,7 +1808,7 @@ def : t2InstAlias<"mov${p} $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins imm0_65535_expr:$imm), IIC_iMOVi,
"movw", "\t$Rd, $imm",
- [(set rGPR:$Rd, imm0_65535:$imm)]> {
+ [(set rGPR:$Rd, imm0_65535:$imm)]>, Sched<[WriteALU]> {
let Inst{31-27} = 0b11110;
let Inst{25} = 1;
let Inst{24-21} = 0b0010;
@@ -1812,7 +1834,8 @@ def t2MOVTi16 : T2I<(outs rGPR:$Rd),
(ins rGPR:$src, imm0_65535_expr:$imm), IIC_iMOVi,
"movt", "\t$Rd, $imm",
[(set rGPR:$Rd,
- (or (and rGPR:$src, 0xffff), lo16AllZero:$imm))]> {
+ (or (and rGPR:$src, 0xffff), lo16AllZero:$imm))]>,
+ Sched<[WriteALU]> {
let Inst{31-27} = 0b11110;
let Inst{25} = 1;
let Inst{24-21} = 0b0110;
@@ -1831,7 +1854,8 @@ def t2MOVTi16 : T2I<(outs rGPR:$Rd),
}
def t2MOVTi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
- (ins rGPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>;
+ (ins rGPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>,
+ Sched<[WriteALU]>;
} // Constraints
def : T2Pat<(or rGPR:$src, 0xffff0000), (t2MOVTi16 rGPR:$src, 0xffff)>;
@@ -2171,7 +2195,7 @@ def : T2Pat<(rotr rGPR:$lhs, (and rGPR:$rhs, lo5AllOne)),
let Uses = [CPSR] in {
def t2RRX : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
"rrx", "\t$Rd, $Rm",
- [(set rGPR:$Rd, (ARMrrx rGPR:$Rm))]> {
+ [(set rGPR:$Rd, (ARMrrx rGPR:$Rm))]>, Sched<[WriteALU]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
let Inst{24-21} = 0b0010;
@@ -2185,7 +2209,8 @@ let isCodeGenOnly = 1, Defs = [CPSR] in {
def t2MOVsrl_flag : T2TwoRegShiftImm<
(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
"lsrs", ".w\t$Rd, $Rm, #1",
- [(set rGPR:$Rd, (ARMsrl_flag rGPR:$Rm))]> {
+ [(set rGPR:$Rd, (ARMsrl_flag rGPR:$Rm))]>,
+ Sched<[WriteALU]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
let Inst{24-21} = 0b0010;
@@ -2199,7 +2224,8 @@ def t2MOVsrl_flag : T2TwoRegShiftImm<
def t2MOVsra_flag : T2TwoRegShiftImm<
(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
"asrs", ".w\t$Rd, $Rm, #1",
- [(set rGPR:$Rd, (ARMsra_flag rGPR:$Rm))]> {
+ [(set rGPR:$Rd, (ARMsra_flag rGPR:$Rm))]>,
+ Sched<[WriteALU]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
let Inst{24-21} = 0b0010;
@@ -2320,7 +2346,7 @@ multiclass T2I_un_irs<bits<4> opcod, string opc,
// shifted imm
def i : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), iii,
opc, "\t$Rd, $imm",
- [(set rGPR:$Rd, (opnode t2_so_imm:$imm))]> {
+ [(set rGPR:$Rd, (opnode t2_so_imm:$imm))]>, Sched<[WriteALU]> {
let isAsCheapAsAMove = Cheap;
let isReMaterializable = ReMat;
let isMoveImm = MoveImm;
@@ -2333,7 +2359,7 @@ multiclass T2I_un_irs<bits<4> opcod, string opc,
// register
def r : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), iir,
opc, ".w\t$Rd, $Rm",
- [(set rGPR:$Rd, (opnode rGPR:$Rm))]> {
+ [(set rGPR:$Rd, (opnode rGPR:$Rm))]>, Sched<[WriteALU]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
let Inst{24-21} = opcod;
@@ -2345,7 +2371,8 @@ multiclass T2I_un_irs<bits<4> opcod, string opc,
// shifted register
def s : T2sOneRegShiftedReg<(outs rGPR:$Rd), (ins t2_so_reg:$ShiftedRm), iis,
opc, ".w\t$Rd, $ShiftedRm",
- [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm))]> {
+ [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm))]>,
+ Sched<[WriteALU]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
let Inst{24-21} = opcod;
@@ -2804,22 +2831,27 @@ class T2I_misc<bits<2> op1, bits<2> op2, dag oops, dag iops,
}
def t2CLZ : T2I_misc<0b11, 0b00, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
- "clz", "\t$Rd, $Rm", [(set rGPR:$Rd, (ctlz rGPR:$Rm))]>;
+ "clz", "\t$Rd, $Rm", [(set rGPR:$Rd, (ctlz rGPR:$Rm))]>,
+ Sched<[WriteALU]>;
def t2RBIT : T2I_misc<0b01, 0b10, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
"rbit", "\t$Rd, $Rm",
- [(set rGPR:$Rd, (ARMrbit rGPR:$Rm))]>;
+ [(set rGPR:$Rd, (ARMrbit rGPR:$Rm))]>,
+ Sched<[WriteALU]>;
def t2REV : T2I_misc<0b01, 0b00, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
- "rev", ".w\t$Rd, $Rm", [(set rGPR:$Rd, (bswap rGPR:$Rm))]>;
+ "rev", ".w\t$Rd, $Rm", [(set rGPR:$Rd, (bswap rGPR:$Rm))]>,
+ Sched<[WriteALU]>;
def t2REV16 : T2I_misc<0b01, 0b01, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
"rev16", ".w\t$Rd, $Rm",
- [(set rGPR:$Rd, (rotr (bswap rGPR:$Rm), (i32 16)))]>;
+ [(set rGPR:$Rd, (rotr (bswap rGPR:$Rm), (i32 16)))]>,
+ Sched<[WriteALU]>;
def t2REVSH : T2I_misc<0b01, 0b11, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
"revsh", ".w\t$Rd, $Rm",
- [(set rGPR:$Rd, (sra (bswap rGPR:$Rm), (i32 16)))]>;
+ [(set rGPR:$Rd, (sra (bswap rGPR:$Rm), (i32 16)))]>,
+ Sched<[WriteALU]>;
def : T2Pat<(or (sra (shl rGPR:$Rm, (i32 24)), (i32 16)),
(and (srl rGPR:$Rm, (i32 8)), 0xFF)),
@@ -2831,7 +2863,8 @@ def t2PKHBT : T2ThreeReg<
[(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF),
(and (shl rGPR:$Rm, pkh_lsl_amt:$sh),
0xFFFF0000)))]>,
- Requires<[HasT2ExtractPack, IsThumb2]> {
+ Requires<[HasT2ExtractPack, IsThumb2]>,
+ Sched<[WriteALUsi, ReadALU]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
let Inst{24-20} = 0b01100;
@@ -2859,7 +2892,8 @@ def t2PKHTB : T2ThreeReg<
[(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF0000),
(and (sra rGPR:$Rm, pkh_asr_amt:$sh),
0xFFFF)))]>,
- Requires<[HasT2ExtractPack, IsThumb2]> {
+ Requires<[HasT2ExtractPack, IsThumb2]>,
+ Sched<[WriteALUsi, ReadALU]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
let Inst{24-20} = 0b01100;
@@ -2900,7 +2934,8 @@ let isCompare = 1, Defs = [CPSR] in {
def t2CMNri : T2OneRegCmpImm<
(outs), (ins GPRnopc:$Rn, t2_so_imm:$imm), IIC_iCMPi,
"cmn", ".w\t$Rn, $imm",
- [(ARMcmn GPRnopc:$Rn, (ineg t2_so_imm:$imm))]> {
+ [(ARMcmn GPRnopc:$Rn, (ineg t2_so_imm:$imm))]>,
+ Sched<[WriteCMP, ReadALU]> {
let Inst{31-27} = 0b11110;
let Inst{25} = 0;
let Inst{24-21} = 0b1000;
@@ -2913,7 +2948,7 @@ let isCompare = 1, Defs = [CPSR] in {
(outs), (ins GPRnopc:$Rn, rGPR:$Rm), IIC_iCMPr,
"cmn", ".w\t$Rn, $Rm",
[(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
- GPRnopc:$Rn, rGPR:$Rm)]> {
+ GPRnopc:$Rn, rGPR:$Rm)]>, Sched<[WriteCMP, ReadALU, ReadALU]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
let Inst{24-21} = 0b1000;
@@ -2928,7 +2963,8 @@ let isCompare = 1, Defs = [CPSR] in {
(outs), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm), IIC_iCMPsi,
"cmn", ".w\t$Rn, $ShiftedRm",
[(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
- GPRnopc:$Rn, t2_so_reg:$ShiftedRm)]> {
+ GPRnopc:$Rn, t2_so_reg:$ShiftedRm)]>,
+ Sched<[WriteCMPsi, ReadALU, ReadALU]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
let Inst{24-21} = 0b1000;
@@ -2968,14 +3004,15 @@ def t2MOVCCr : t2PseudoInst<(outs rGPR:$Rd),
(ins rGPR:$false, rGPR:$Rm, pred:$p),
4, IIC_iCMOVr,
[/*(set rGPR:$Rd, (ARMcmov rGPR:$false, rGPR:$Rm, imm:$cc, CCR:$ccr))*/]>,
- RegConstraint<"$false = $Rd">;
+ RegConstraint<"$false = $Rd">,
+ Sched<[WriteALU]>;
let isMoveImm = 1 in
def t2MOVCCi : t2PseudoInst<(outs rGPR:$Rd),
(ins rGPR:$false, t2_so_imm:$imm, pred:$p),
4, IIC_iCMOVi,
[/*(set rGPR:$Rd,(ARMcmov rGPR:$false,t2_so_imm:$imm, imm:$cc, CCR:$ccr))*/]>,
- RegConstraint<"$false = $Rd">;
+ RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
// FIXME: Pseudo-ize these. For now, just mark codegen only.
let isCodeGenOnly = 1 in {
@@ -2983,7 +3020,7 @@ let isMoveImm = 1 in
def t2MOVCCi16 : T2I<(outs rGPR:$Rd), (ins rGPR:$false, imm0_65535_expr:$imm),
IIC_iCMOVi,
"movw", "\t$Rd, $imm", []>,
- RegConstraint<"$false = $Rd"> {
+ RegConstraint<"$false = $Rd">, Sched<[WriteALU]> {
let Inst{31-27} = 0b11110;
let Inst{25} = 1;
let Inst{24-21} = 0b0010;
@@ -3010,7 +3047,7 @@ def t2MVNCCi : T2OneRegImm<(outs rGPR:$Rd), (ins rGPR:$false, t2_so_imm:$imm),
IIC_iCMOVi, "mvn", "\t$Rd, $imm",
[/*(set rGPR:$Rd,(ARMcmov rGPR:$false,t2_so_imm_not:$imm,
imm:$cc, CCR:$ccr))*/]>,
- RegConstraint<"$false = $Rd"> {
+ RegConstraint<"$false = $Rd">, Sched<[WriteALU]> {
let Inst{31-27} = 0b11110;
let Inst{25} = 0;
let Inst{24-21} = 0b0011;
@@ -3021,7 +3058,7 @@ def t2MVNCCi : T2OneRegImm<(outs rGPR:$Rd), (ins rGPR:$false, t2_so_imm:$imm),
class T2I_movcc_sh<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
- : T2TwoRegShiftImm<oops, iops, itin, opc, asm, pattern> {
+ : T2TwoRegShiftImm<oops, iops, itin, opc, asm, pattern>, Sched<[WriteALU]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
let Inst{24-21} = 0b0010;
@@ -3072,7 +3109,7 @@ def t2DSB : AInoP<(outs), (ins memb_opt:$opt), ThumbFrm, NoItinerary,
let Inst{3-0} = opt;
}
-def t2ISB : AInoP<(outs), (ins memb_opt:$opt), ThumbFrm, NoItinerary,
+def t2ISB : AInoP<(outs), (ins instsyncb_opt:$opt), ThumbFrm, NoItinerary,
"isb", "\t$opt",
[]>, Requires<[IsThumb, HasDB]> {
bits<4> opt;
@@ -3243,7 +3280,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
let isPredicable = 1 in
def t2B : T2I<(outs), (ins uncondbrtarget:$target), IIC_Br,
"b", ".w\t$target",
- [(br bb:$target)]> {
+ [(br bb:$target)]>, Sched<[WriteBr]> {
let Inst{31-27} = 0b11110;
let Inst{15-14} = 0b10;
let Inst{12} = 1;
@@ -3261,17 +3298,20 @@ let isNotDuplicable = 1, isIndirectBranch = 1 in {
def t2BR_JT : t2PseudoInst<(outs),
(ins GPR:$target, GPR:$index, i32imm:$jt, i32imm:$id),
0, IIC_Br,
- [(ARMbr2jt GPR:$target, GPR:$index, tjumptable:$jt, imm:$id)]>;
+ [(ARMbr2jt GPR:$target, GPR:$index, tjumptable:$jt, imm:$id)]>,
+ Sched<[WriteBr]>;
// FIXME: Add a non-pc based case that can be predicated.
def t2TBB_JT : t2PseudoInst<(outs),
- (ins GPR:$index, i32imm:$jt, i32imm:$id), 0, IIC_Br, []>;
+ (ins GPR:$index, i32imm:$jt, i32imm:$id), 0, IIC_Br, []>,
+ Sched<[WriteBr]>;
def t2TBH_JT : t2PseudoInst<(outs),
- (ins GPR:$index, i32imm:$jt, i32imm:$id), 0, IIC_Br, []>;
+ (ins GPR:$index, i32imm:$jt, i32imm:$id), 0, IIC_Br, []>,
+ Sched<[WriteBr]>;
def t2TBB : T2I<(outs), (ins addrmode_tbb:$addr), IIC_Br,
- "tbb", "\t$addr", []> {
+ "tbb", "\t$addr", []>, Sched<[WriteBrTbl]> {
bits<4> Rn;
bits<4> Rm;
let Inst{31-20} = 0b111010001101;
@@ -3284,7 +3324,7 @@ def t2TBB : T2I<(outs), (ins addrmode_tbb:$addr), IIC_Br,
}
def t2TBH : T2I<(outs), (ins addrmode_tbh:$addr), IIC_Br,
- "tbh", "\t$addr", []> {
+ "tbh", "\t$addr", []>, Sched<[WriteBrTbl]> {
bits<4> Rn;
bits<4> Rm;
let Inst{31-20} = 0b111010001101;
@@ -3304,7 +3344,7 @@ def t2TBH : T2I<(outs), (ins addrmode_tbh:$addr), IIC_Br,
let isBranch = 1, isTerminator = 1 in
def t2Bcc : T2I<(outs), (ins brtarget:$target), IIC_Br,
"b", ".w\t$target",
- [/*(ARMbrcond bb:$target, imm:$cc)*/]> {
+ [/*(ARMbrcond bb:$target, imm:$cc)*/]>, Sched<[WriteBr]> {
let Inst{31-27} = 0b11110;
let Inst{15-14} = 0b10;
let Inst{12} = 0;
@@ -3331,7 +3371,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
(ins uncondbrtarget:$dst, pred:$p),
4, IIC_Br, [],
(t2B uncondbrtarget:$dst, pred:$p)>,
- Requires<[IsThumb2, IsIOS]>;
+ Requires<[IsThumb2, IsIOS]>, Sched<[WriteBr]>;
}
// IT block
@@ -3353,7 +3393,8 @@ def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask),
// Branch and Exchange Jazelle -- for disassembly only
// Rm = Inst{19-16}
-def t2BXJ : T2I<(outs), (ins rGPR:$func), NoItinerary, "bxj", "\t$func", []> {
+def t2BXJ : T2I<(outs), (ins rGPR:$func), NoItinerary, "bxj", "\t$func", []>,
+ Sched<[WriteBr]> {
bits<4> func;
let Inst{31-27} = 0b11110;
let Inst{26} = 0;
@@ -3367,7 +3408,7 @@ let isBranch = 1, isTerminator = 1 in {
def tCBZ : T1I<(outs), (ins tGPR:$Rn, t_cbtarget:$target), IIC_Br,
"cbz\t$Rn, $target", []>,
T1Misc<{0,0,?,1,?,?,?}>,
- Requires<[IsThumb2]> {
+ Requires<[IsThumb2]>, Sched<[WriteBr]> {
// A8.6.27
bits<6> target;
bits<3> Rn;
@@ -3379,7 +3420,7 @@ let isBranch = 1, isTerminator = 1 in {
def tCBNZ : T1I<(outs), (ins tGPR:$Rn, t_cbtarget:$target), IIC_Br,
"cbnz\t$Rn, $target", []>,
T1Misc<{1,0,?,1,?,?,?}>,
- Requires<[IsThumb2]> {
+ Requires<[IsThumb2]>, Sched<[WriteBr]> {
// A8.6.27
bits<6> target;
bits<3> Rn;
@@ -3981,7 +4022,7 @@ def : t2InstAlias<"sbc${s}${p} $Rd, $Rn, $ShiftedRm",
// Aliases for ADD without the ".w" optional width specifier.
def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
- (t2ADDri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+ (t2ADDri rGPR:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
def : t2InstAlias<"add${p} $Rd, $Rn, $imm",
(t2ADDri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>;
def : t2InstAlias<"add${s}${p} $Rd, $Rn, $Rm",
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index b5a896c..597b74a 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -224,7 +224,36 @@ defm : VFPDTAnyInstAlias<"vpop${p}", "$r",
defm : VFPDTAnyInstAlias<"vpop${p}", "$r",
(VLDMDIA_UPD SP, pred:$p, dpr_reglist:$r)>;
-// FLDMX, FSTMX - mixing S/D registers for pre-armv6 cores
+// FLDMX, FSTMX - Load and store multiple unknown precision registers for
+// pre-armv6 cores.
+// These instructions are deprecated so we don't want them to get selected.
+multiclass vfp_ldstx_mult<string asm, bit L_bit> {
+ // Unknown precision
+ def XIA :
+ AXXI4<(outs), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
+ IndexModeNone, !strconcat(asm, "iax${p}\t$Rn, $regs"), "", []> {
+ let Inst{24-23} = 0b01; // Increment After
+ let Inst{21} = 0; // No writeback
+ let Inst{20} = L_bit;
+ }
+ def XIA_UPD :
+ AXXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
+ IndexModeUpd, !strconcat(asm, "iax${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+ let Inst{24-23} = 0b01; // Increment After
+ let Inst{21} = 1; // Writeback
+ let Inst{20} = L_bit;
+ }
+ def XDB_UPD :
+ AXXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
+ IndexModeUpd, !strconcat(asm, "dbx${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+ let Inst{24-23} = 0b10; // Decrement Before
+ let Inst{21} = 1;
+ let Inst{20} = L_bit;
+ }
+}
+
+defm FLDM : vfp_ldstx_mult<"fldm", 1>;
+defm FSTM : vfp_ldstx_mult<"fstm", 0>;
//===----------------------------------------------------------------------===//
// FP Binary Operations.
@@ -841,7 +870,8 @@ let Constraints = "$a = $dst" in {
class AVConv1XInsS_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
bit op5, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
- : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern> {
+ : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern>,
+ Sched<[WriteCvtFP]> {
bits<5> dst;
// if dp_operation then UInt(D:Vd) else UInt(Vd:D);
let Inst{22} = dst{0};
@@ -852,7 +882,8 @@ class AVConv1XInsS_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
class AVConv1XInsD_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
bit op5, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
- : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern> {
+ : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern>,
+ Sched<[WriteCvtFP]> {
bits<5> dst;
// if dp_operation then UInt(D:Vd) else UInt(Vd:D);
let Inst{22} = dst{4};
@@ -1300,6 +1331,10 @@ let Uses = [FPSCR] in {
"vmrs", "\t$Rt, mvfr0", []>;
def VMRS_MVFR1 : MovFromVFP<0b0110 /* mvfr1 */, (outs GPR:$Rt), (ins),
"vmrs", "\t$Rt, mvfr1", []>;
+ def VMRS_FPINST : MovFromVFP<0b1001 /* fpinst */, (outs GPR:$Rt), (ins),
+ "vmrs", "\t$Rt, fpinst", []>;
+ def VMRS_FPINST2 : MovFromVFP<0b1010 /* fpinst2 */, (outs GPR:$Rt), (ins),
+ "vmrs", "\t$Rt, fpinst2", []>;
}
//===----------------------------------------------------------------------===//
@@ -1333,6 +1368,11 @@ let Defs = [FPSCR] in {
// System level GPR -> FPSID
def VMSR_FPSID : MovToVFP<0b0000 /* fpsid */, (outs), (ins GPR:$src),
"vmsr", "\tfpsid, $src", []>;
+
+ def VMSR_FPINST : MovToVFP<0b1001 /* fpinst */, (outs), (ins GPR:$src),
+ "vmsr", "\tfpinst, $src", []>;
+ def VMSR_FPINST2 : MovToVFP<0b1010 /* fpinst2 */, (outs), (ins GPR:$src),
+ "vmsr", "\tfpinst2, $src", []>;
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h
index f4248fc..d9ec4fd 100644
--- a/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -36,6 +36,13 @@ class ARMFunctionInfo : public MachineFunctionInfo {
/// 'isThumb'.
bool hasThumb2;
+ /// StByValParamsPadding - For a parameter that is split between
+ /// GPRs and memory: while recovering the GPR part, when
+ /// StackAlignment == 8 and the size of the GPR part mod 8 != 0,
+ /// we need to insert a gap before the parameter's start address so that
+ /// the GPR part can be "attached" to the part that was passed via the stack.
+ unsigned StByValParamsPadding;
+
/// VarArgsRegSaveSize - Size of the register save area for vararg functions.
///
unsigned ArgRegsSaveSize;
@@ -129,6 +136,7 @@ public:
explicit ARMFunctionInfo(MachineFunction &MF) :
isThumb(MF.getTarget().getSubtarget<ARMSubtarget>().isThumb()),
hasThumb2(MF.getTarget().getSubtarget<ARMSubtarget>().hasThumb2()),
+ StByValParamsPadding(0),
ArgRegsSaveSize(0), HasStackFrame(false), RestoreSPFromFP(false),
LRSpilledForFarJump(false),
FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
@@ -141,7 +149,14 @@ public:
bool isThumb1OnlyFunction() const { return isThumb && !hasThumb2; }
bool isThumb2Function() const { return isThumb && hasThumb2; }
- unsigned getArgRegsSaveSize() const { return ArgRegsSaveSize; }
+ unsigned getStoredByValParamsPadding() const { return StByValParamsPadding; }
+ void setStoredByValParamsPadding(unsigned p) { StByValParamsPadding = p; }
+
+ unsigned getArgRegsSaveSize(unsigned Align = 0) const {
+ if (!Align)
+ return ArgRegsSaveSize;
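+ // Otherwise round up to the requested alignment; Align is assumed to be a power of two.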
+ return (ArgRegsSaveSize + Align - 1) & ~(Align - 1);
+ }
void setArgRegsSaveSize(unsigned s) { ArgRegsSaveSize = s; }
bool hasStackFrame() const { return HasStackFrame; }
diff --git a/lib/Target/ARM/ARMRegisterInfo.cpp b/lib/Target/ARM/ARMRegisterInfo.cpp
index 6f3819a..a788036 100644
--- a/lib/Target/ARM/ARMRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMRegisterInfo.cpp
@@ -18,7 +18,6 @@ using namespace llvm;
void ARMRegisterInfo::anchor() { }
-ARMRegisterInfo::ARMRegisterInfo(const ARMBaseInstrInfo &tii,
- const ARMSubtarget &sti)
- : ARMBaseRegisterInfo(tii, sti) {
+ARMRegisterInfo::ARMRegisterInfo(const ARMSubtarget &sti)
+ : ARMBaseRegisterInfo(sti) {
}
diff --git a/lib/Target/ARM/ARMRegisterInfo.h b/lib/Target/ARM/ARMRegisterInfo.h
index 8a24842..fb1537c 100644
--- a/lib/Target/ARM/ARMRegisterInfo.h
+++ b/lib/Target/ARM/ARMRegisterInfo.h
@@ -19,13 +19,13 @@
#include "llvm/Target/TargetRegisterInfo.h"
namespace llvm {
- class ARMSubtarget;
- class ARMBaseInstrInfo;
+
+class ARMSubtarget;
struct ARMRegisterInfo : public ARMBaseRegisterInfo {
virtual void anchor();
public:
- ARMRegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI);
+ ARMRegisterInfo(const ARMSubtarget &STI);
};
} // end namespace llvm
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index b0f576b..0459d64 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -27,31 +27,31 @@ class ARMFReg<bits<16> Enc, string n> : Register<n> {
// Subregister indices.
let Namespace = "ARM" in {
-def qqsub_0 : SubRegIndex;
-def qqsub_1 : SubRegIndex;
+def qqsub_0 : SubRegIndex<256>;
+def qqsub_1 : SubRegIndex<256, 256>;
// Note: Code depends on these having consecutive numbers.
-def qsub_0 : SubRegIndex;
-def qsub_1 : SubRegIndex;
-def qsub_2 : SubRegIndex<[qqsub_1, qsub_0]>;
-def qsub_3 : SubRegIndex<[qqsub_1, qsub_1]>;
-
-def dsub_0 : SubRegIndex;
-def dsub_1 : SubRegIndex;
-def dsub_2 : SubRegIndex<[qsub_1, dsub_0]>;
-def dsub_3 : SubRegIndex<[qsub_1, dsub_1]>;
-def dsub_4 : SubRegIndex<[qsub_2, dsub_0]>;
-def dsub_5 : SubRegIndex<[qsub_2, dsub_1]>;
-def dsub_6 : SubRegIndex<[qsub_3, dsub_0]>;
-def dsub_7 : SubRegIndex<[qsub_3, dsub_1]>;
-
-def ssub_0 : SubRegIndex;
-def ssub_1 : SubRegIndex;
-def ssub_2 : SubRegIndex<[dsub_1, ssub_0]>;
-def ssub_3 : SubRegIndex<[dsub_1, ssub_1]>;
-
-def gsub_0 : SubRegIndex;
-def gsub_1 : SubRegIndex;
+def qsub_0 : SubRegIndex<128>;
+def qsub_1 : SubRegIndex<128, 128>;
+def qsub_2 : ComposedSubRegIndex<qqsub_1, qsub_0>;
+def qsub_3 : ComposedSubRegIndex<qqsub_1, qsub_1>;
+
+def dsub_0 : SubRegIndex<64>;
+def dsub_1 : SubRegIndex<64, 64>;
+def dsub_2 : ComposedSubRegIndex<qsub_1, dsub_0>;
+def dsub_3 : ComposedSubRegIndex<qsub_1, dsub_1>;
+def dsub_4 : ComposedSubRegIndex<qsub_2, dsub_0>;
+def dsub_5 : ComposedSubRegIndex<qsub_2, dsub_1>;
+def dsub_6 : ComposedSubRegIndex<qsub_3, dsub_0>;
+def dsub_7 : ComposedSubRegIndex<qsub_3, dsub_1>;
+
+def ssub_0 : SubRegIndex<32>;
+def ssub_1 : SubRegIndex<32, 32>;
+def ssub_2 : ComposedSubRegIndex<dsub_1, ssub_0>;
+def ssub_3 : ComposedSubRegIndex<dsub_1, ssub_1>;
+
+def gsub_0 : SubRegIndex<32>;
+def gsub_1 : SubRegIndex<32, 32>;
// Let TableGen synthesize the remaining 12 ssub_* indices.
// We don't need to name them.
}
@@ -157,21 +157,26 @@ def Q15 : ARMReg<15, "q15", [D30, D31]>;
// Current Program Status Register.
// We model fpscr with two registers: FPSCR models the control bits and will be
-// reserved. FPSCR_NZCV models the flag bits and will be unreserved.
-def CPSR : ARMReg<0, "cpsr">;
-def APSR : ARMReg<1, "apsr">;
-def SPSR : ARMReg<2, "spsr">;
-def FPSCR : ARMReg<3, "fpscr">;
-def FPSCR_NZCV : ARMReg<3, "fpscr_nzcv"> {
+// reserved. FPSCR_NZCV models the flag bits and will be unreserved. APSR_NZCV
+// models the APSR when it's accessed by some special instructions. In such cases
+// it has the same encoding as PC.
+def CPSR : ARMReg<0, "cpsr">;
+def APSR : ARMReg<1, "apsr">;
+def APSR_NZCV : ARMReg<15, "apsr_nzcv">;
+def SPSR : ARMReg<2, "spsr">;
+def FPSCR : ARMReg<3, "fpscr">;
+def FPSCR_NZCV : ARMReg<3, "fpscr_nzcv"> {
let Aliases = [FPSCR];
}
def ITSTATE : ARMReg<4, "itstate">;
// Special Registers - only available in privileged mode.
-def FPSID : ARMReg<0, "fpsid">;
-def MVFR1 : ARMReg<6, "mvfr1">;
-def MVFR0 : ARMReg<7, "mvfr0">;
-def FPEXC : ARMReg<8, "fpexc">;
+def FPSID : ARMReg<0, "fpsid">;
+def MVFR1 : ARMReg<6, "mvfr1">;
+def MVFR0 : ARMReg<7, "mvfr0">;
+def FPEXC : ARMReg<8, "fpexc">;
+def FPINST : ARMReg<9, "fpinst">;
+def FPINST2 : ARMReg<10, "fpinst2">;
// Register classes.
//
@@ -207,6 +212,16 @@ def GPRnopc : RegisterClass<"ARM", [i32], 32, (sub GPR, PC)> {
}];
}
+// GPRs without the PC but with APSR. Some instructions allow accessing the
+// APSR, while actually encoding PC in the register field. This is useful
+// for assembly and disassembly only.
+def GPRwithAPSR : RegisterClass<"ARM", [i32], 32, (add GPR, APSR_NZCV)> {
+ let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8)];
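+ // The selected index picks an allocation order: 0 is the default order,
+ // 1 and 2 select the AltOrders entries above.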
+ let AltOrderSelect = [{
+ return 1 + MF.getTarget().getSubtarget<ARMSubtarget>().isThumb1Only();
+ }];
+}
+
// GPRsp - Only the SP is legal. Used by Thumb1 instructions that want the
// implied SP argument list.
// FIXME: It would be better to not use this at all and refactor the
diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td
index 2d088de..528c4ec 100644
--- a/lib/Target/ARM/ARMSchedule.td
+++ b/lib/Target/ARM/ARMSchedule.td
@@ -69,6 +69,24 @@ def WriteCMP : SchedWrite;
def WriteCMPsi : SchedWrite;
def WriteCMPsr : SchedWrite;
+// Division.
+def WriteDiv : SchedWrite;
+
+// Loads.
+def WriteLd : SchedWrite;
+def WritePreLd : SchedWrite;
+
+// Branches.
+def WriteBr : SchedWrite;
+def WriteBrL : SchedWrite;
+def WriteBrTbl : SchedWrite;
+
+// Fixed-point conversions.
+def WriteCvtFP : SchedWrite;
+
+// Noop.
+def WriteNoop : SchedWrite;
+
// Define TII for use in SchedVariant Predicates.
def : PredicateProlog<[{
const ARMBaseInstrInfo *TII =
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index 9739ed2..d06ad7d 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -2275,10 +2275,10 @@ def A9Read4 : SchedReadAdvance<3>;
// This table follows the ARM Cortex-A9 Technical Reference Manuals,
// mostly in order.
-def :ItinRW<[A9WriteI], [IIC_iMOVi,IIC_iMOVr,IIC_iMOVsi,
+def :ItinRW<[WriteALU], [IIC_iMOVi,IIC_iMOVr,IIC_iMOVsi,
IIC_iMVNi,IIC_iMVNsi,
IIC_iCMOVi,IIC_iCMOVr,IIC_iCMOVsi]>;
-def :ItinRW<[A9WriteI,A9ReadALU],[IIC_iMVNr]>;
+def :ItinRW<[WriteALU, A9ReadALU],[IIC_iMVNr]>;
def :ItinRW<[A9WriteIsr], [IIC_iMOVsr,IIC_iMVNsr,IIC_iCMOVsr]>;
def :ItinRW<[A9WriteI2], [IIC_iMOVix2,IIC_iCMOVix2]>;
@@ -2487,10 +2487,59 @@ def : SchedAlias<WriteALUsr, A9WriteALUsr>;
def : SchedAlias<WriteALUSsr, A9WriteALUsr>;
def : SchedAlias<ReadALU, A9ReadALU>;
def : SchedAlias<ReadALUsr, A9ReadALU>;
-// FIXME: need to special case AND, ORR, EOR, BIC because they don't read
-// advance. But our instrinfo claims it does.
+def : InstRW< [WriteALU],
+ (instregex "ANDri", "ORRri", "EORri", "BICri", "ANDrr", "ORRrr", "EORrr",
+ "BICrr")>;
+def : InstRW< [WriteALUsi], (instregex "ANDrsi", "ORRrsi", "EORrsi", "BICrsi")>;
+def : InstRW< [WriteALUsr], (instregex "ANDrsr", "ORRrsr", "EORrsr", "BICrsr")>;
+
def : SchedAlias<WriteCMP, A9WriteALU>;
def : SchedAlias<WriteCMPsi, A9WriteALU>;
def : SchedAlias<WriteCMPsr, A9WriteALU>;
+
+def : InstRW< [A9WriteIsr], (instregex "MOVsr", "MOVsi", "MVNsr", "MOVCCsi",
+ "MOVCCsr")>;
+def : InstRW< [WriteALU, A9ReadALU], (instregex "MVNr")>;
+def : InstRW< [A9WriteI2], (instregex "MOVCCi32imm", "MOVi32imm",
+ "MOV_ga_dyn")>;
+def : InstRW< [A9WriteI2pc], (instregex "MOV_ga_pcrel")>;
+def : InstRW< [A9WriteI2ld], (instregex "MOV_ga_pcrel_ldr")>;
+
+def : InstRW< [WriteALU], (instregex "SEL")>;
+
+def : InstRW< [WriteALUsi], (instregex "BFC", "BFI", "UBFX", "SBFX")>;
+
+def : InstRW< [A9WriteM],
+ (instregex "MUL", "MULv5", "SMMUL", "SMMULR", "MLA", "MLAv5", "MLS",
+ "SMMLA", "SMMLAR", "SMMLS", "SMMLSR")>;
+def : InstRW< [A9WriteM, A9WriteMHi],
+ (instregex "SMULL", "SMULLv5", "UMULL", "UMULLv5", "SMLAL$", "UMLAL",
+ "UMAAL", "SMLALv5", "UMLALv5", "UMAALv5", "SMLALBB", "SMLALBT", "SMLALTB",
+ "SMLALTT")>;
+// FIXME: These instructions used to have NoItinerary. Just copied the one from above.
+def : InstRW< [A9WriteM, A9WriteMHi],
+ (instregex "SMLAD", "SMLADX", "SMLALD", "SMLALDX", "SMLSD", "SMLSDX",
+ "SMLSLD", "SMLLDX", "SMUAD", "SMUADX", "SMUSD", "SMUSDX")>;
+
+def : InstRW<[A9WriteM16, A9WriteM16Hi],
+ (instregex "SMULBB", "SMULBT", "SMULTB", "SMULTT", "SMULWB", "SMULWT")>;
+def : InstRW<[A9WriteM16, A9WriteM16Hi],
+ (instregex "SMLABB", "SMLABT", "SMLATB", "SMLATT", "SMLAWB", "SMLAWT")>;
+
+def : InstRW<[A9WriteL], (instregex "LDRi12", "PICLDR$")>;
+def : InstRW<[A9WriteLsi], (instregex "LDRrs")>;
+def : InstRW<[A9WriteLb],
+ (instregex "LDRBi12", "PICLDRH", "PICLDRB", "PICLDRSH", "PICLDRSB",
+ "LDRH", "LDRSH", "LDRSB")>;
+def : InstRW<[A9WriteLbsi], (instregex "LDRrs")>;
+
+def : WriteRes<WriteDiv, []> { let Latency = 0; }
+
+def : WriteRes<WriteBr, [A9UnitB]>;
+def : WriteRes<WriteBrL, [A9UnitB]>;
+def : WriteRes<WriteBrTbl, [A9UnitB]>;
+def : WriteRes<WritePreLd, []>;
+def : SchedAlias<WriteCvtFP, A9WriteF>;
+def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
} // SchedModel = CortexA9Model
diff --git a/lib/Target/ARM/ARMScheduleSwift.td b/lib/Target/ARM/ARMScheduleSwift.td
index 7c6df41..b5cf251 100644
--- a/lib/Target/ARM/ARMScheduleSwift.td
+++ b/lib/Target/ARM/ARMScheduleSwift.td
@@ -1096,9 +1096,27 @@ let SchedModel = SwiftModel in {
def SwiftUnitDiv : ProcResource<1>;
// Generic resource requirements.
+ def SwiftWriteP0OneCycle : SchedWriteRes<[SwiftUnitP0]>;
+ def SwiftWriteP0TwoCycle : SchedWriteRes<[SwiftUnitP0]> { let Latency = 2; }
+ def SwiftWriteP0FourCycle : SchedWriteRes<[SwiftUnitP0]> { let Latency = 4; }
+ def SwiftWriteP0SixCycle : SchedWriteRes<[SwiftUnitP0]> { let Latency = 6; }
+ def SwiftWriteP0P1FourCycle : SchedWriteRes<[SwiftUnitP0, SwiftUnitP1]> {
+ let Latency = 4;
+ }
+ def SwiftWriteP0P1SixCycle : SchedWriteRes<[SwiftUnitP0, SwiftUnitP1]> {
+ let Latency = 6;
+ }
+ def SwiftWriteP01OneCycle : SchedWriteRes<[SwiftUnitP01]>;
+ def SwiftWriteP1TwoCycle : SchedWriteRes<[SwiftUnitP1]> { let Latency = 2; }
+ def SwiftWriteP1FourCycle : SchedWriteRes<[SwiftUnitP1]> { let Latency = 4; }
+ def SwiftWriteP1SixCycle : SchedWriteRes<[SwiftUnitP1]> { let Latency = 6; }
+ def SwiftWriteP1EightCycle : SchedWriteRes<[SwiftUnitP1]> { let Latency = 8; }
+ def SwiftWriteP1TwelveCyc : SchedWriteRes<[SwiftUnitP1]> { let Latency = 12; }
+ def SwiftWriteP01OneCycle2x : WriteSequence<[SwiftWriteP01OneCycle], 2>;
+ def SwiftWriteP01OneCycle3x : WriteSequence<[SwiftWriteP01OneCycle], 3>;
def SwiftWriteP01TwoCycle : SchedWriteRes<[SwiftUnitP01]> { let Latency = 2; }
- def SwiftWriteP01ThreeCycleTwoUops :
- SchedWriteRes<[SwiftUnitP01, SwiftUnitP01]> {
+ def SwiftWriteP01ThreeCycleTwoUops : SchedWriteRes<[SwiftUnitP01,
+ SwiftUnitP01]> {
let Latency = 3;
let NumMicroOps = 2;
}
@@ -1107,7 +1125,23 @@ let SchedModel = SwiftModel in {
let NumMicroOps = 3;
let ResourceCycles = [3];
}
-
+ // Plain load without writeback.
+ def SwiftWriteP2ThreeCycle : SchedWriteRes<[SwiftUnitP2]> {
+ let Latency = 3;
+ }
+ def SwiftWriteP2FourCycle : SchedWriteRes<[SwiftUnitP2]> {
+ let Latency = 4;
+ }
+ // A store does not write to a register.
+ def SwiftWriteP2 : SchedWriteRes<[SwiftUnitP2]> {
+ let Latency = 0;
+ }
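+  // SwiftWrite1xP2 .. SwiftWrite4xP2: sequences of one to four P2 store micro-ops.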
+ foreach Num = 1-4 in {
+ def SwiftWrite#Num#xP2 : WriteSequence<[SwiftWriteP2], Num>;
+ }
+ def SwiftWriteP01OneCycle2x_load : WriteSequence<[SwiftWriteP01OneCycle,
+ SwiftWriteP01OneCycle,
+ SwiftWriteP2ThreeCycle]>;
// 4.2.4 Arithmetic and Logical.
// ALU operation register shifted by immediate variant.
def SwiftWriteALUsi : SchedWriteVariant<[
@@ -1137,8 +1171,897 @@ let SchedModel = SwiftModel in {
def : ReadAdvance<ReadALU, 0>;
def : SchedAlias<ReadALUsr, SwiftReadAdvanceALUsr>;
+
+ def SwiftChooseShiftKindP01OneOrTwoCycle : SchedWriteVariant<[
+ SchedVar<IsFastImmShiftSwiftPred, [SwiftWriteP01OneCycle]>,
+ SchedVar<NoSchedPred, [SwiftWriteP01TwoCycle]>
+ ]>;
+
// 4.2.5 Integer comparison
def : WriteRes<WriteCMP, [SwiftUnitP01]>;
- def : WriteRes<WriteCMPsi, [SwiftUnitP01]>;
- def : WriteRes<WriteCMPsr, [SwiftUnitP01]>;
+ def : SchedAlias<WriteCMPsi, SwiftChooseShiftKindP01OneOrTwoCycle>;
+ def : SchedAlias<WriteCMPsr, SwiftWriteP01TwoCycle>;
+
+ // 4.2.6 Shift, Move
+ // Shift
+ // ASR,LSL,ROR,RRX
+ // MOV(register-shiftedregister) MVN(register-shiftedregister)
+ // Move
+ // MOV,MVN
+ // MOVT
+ // Sign/Zero extension
+ def : InstRW<[SwiftWriteP01OneCycle],
+ (instregex "SXTB", "SXTH", "SXTB16", "UXTB", "UXTH", "UXTB16",
+ "t2SXTB", "t2SXTH", "t2SXTB16", "t2UXTB", "t2UXTH",
+ "t2UXTB16")>;
+ // Pseudo instructions.
+ def : InstRW<[SwiftWriteP01OneCycle2x],
+ (instregex "MOVCCi32imm", "MOVi32imm", "MOV_ga_dyn", "t2MOVCCi32imm",
+ "t2MOVi32imm", "t2MOV_ga_dyn")>;
+ def : InstRW<[SwiftWriteP01OneCycle3x],
+ (instregex "MOV_ga_pcrel", "t2MOV_ga_pcrel", "t2MOVi16_ga_pcrel")>;
+ def : InstRW<[SwiftWriteP01OneCycle2x_load],
+ (instregex "MOV_ga_pcrel_ldr", "t2MOV_ga_pcrel_ldr")>;
+
+ def SwiftWriteP0TwoCyleTwoUops : WriteSequence<[SwiftWriteP0OneCycle], 2>;
+
+ def SwiftPredP0OneOrTwoCycle : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [ SwiftWriteP0TwoCyleTwoUops ]>,
+ SchedVar<NoSchedPred, [ SwiftWriteP0OneCycle ]>
+ ]>;
+
+ // 4.2.7 Select
+ // SEL
+ def : InstRW<[SwiftPredP0OneOrTwoCycle], (instregex "SEL", "t2SEL")>;
+
+ // 4.2.8 Bitfield
+ // BFI,BFC, SBFX,UBFX
+ def : InstRW< [SwiftWriteP01TwoCycle],
+ (instregex "BFC", "BFI", "UBFX", "SBFX", "(t|t2)BFC", "(t|t2)BFI",
+ "(t|t2)UBFX", "(t|t2)SBFX")>;
+
+ // 4.2.9 Saturating arithmetic
+ def : InstRW< [SwiftWriteP01TwoCycle],
+ (instregex "QADD", "QSUB", "QDADD", "QDSUB", "SSAT", "SSAT16", "USAT",
+ "USAT16", "QADD8", "QADD16", "QSUB8", "QSUB16", "QASX", "QSAX",
+ "UQADD8", "UQADD16","UQSUB8","UQSUB16","UQASX","UQSAX", "t2QADD",
+ "t2QSUB", "t2QDADD", "t2QDSUB", "t2SSAT", "t2SSAT16", "t2USAT",
+ "t2QADD8", "t2QADD16", "t2QSUB8", "t2QSUB16", "t2QASX", "t2QSAX",
+ "t2UQADD8", "t2UQADD16","t2UQSUB8","t2UQSUB16","t2UQASX","t2UQSAX")>;
+
+ // 4.2.10 Parallel Arithmetic
+ // Not flag setting.
+ def : InstRW< [SwiftWriteALUsr],
+ (instregex "SADD8", "SADD16", "SSUB8", "SSUB16", "SASX", "SSAX",
+ "UADD8", "UADD16", "USUB8", "USUB16", "UASX", "USAX", "t2SADD8",
+ "t2SADD16", "t2SSUB8", "t2SSUB16", "t2SASX", "t2SSAX", "t2UADD8",
+ "t2UADD16", "t2USUB8", "t2USUB16", "t2UASX", "t2USAX")>;
+ // Flag setting.
+ def : InstRW< [SwiftWriteP01TwoCycle],
+ (instregex "SHADD8", "SHADD16", "SHSUB8", "SHSUB16", "SHASX", "SHSAX",
+ "SXTAB", "SXTAB16", "SXTAH", "UHADD8", "UHADD16", "UHSUB8", "UHSUB16",
+ "UHASX", "UHSAX", "UXTAB", "UXTAB16", "UXTAH", "t2SHADD8", "t2SHADD16",
+ "t2SHSUB8", "t2SHSUB16", "t2SHASX", "t2SHSAX", "t2SXTAB", "t2SXTAB16",
+ "t2SXTAH", "t2UHADD8", "t2UHADD16", "t2UHSUB8", "t2UHSUB16", "t2UHASX",
+ "t2UHSAX", "t2UXTAB", "t2UXTAB16", "t2UXTAH")>;
+
+ // 4.2.11 Sum of Absolute Difference
+ def : InstRW< [SwiftWriteP0P1FourCycle], (instregex "USAD8") >;
+ def : InstRW<[SwiftWriteP0P1FourCycle, ReadALU, ReadALU, SchedReadAdvance<2>],
+ (instregex "USADA8")>;
+
+ // 4.2.12 Integer Multiply (32-bit result)
+ // Two sources.
+ def : InstRW< [SwiftWriteP0FourCycle],
+ (instregex "MULS", "MUL", "SMMUL", "SMMULR", "SMULBB", "SMULBT",
+ "SMULTB", "SMULTT", "SMULWB", "SMULWT", "SMUSD", "SMUSDXi", "t2MUL",
+ "t2SMMUL", "t2SMMULR", "t2SMULBB", "t2SMULBT", "t2SMULTB", "t2SMULTT",
+ "t2SMULWB", "t2SMULWT", "t2SMUSD")>;
+
+ def SwiftWriteP0P01FiveCycleTwoUops :
+ SchedWriteRes<[SwiftUnitP0, SwiftUnitP01]> {
+ let Latency = 5;
+ }
+
+ def SwiftPredP0P01FourFiveCycle : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [ SwiftWriteP0P01FiveCycleTwoUops ]>,
+ SchedVar<NoSchedPred, [ SwiftWriteP0FourCycle ]>
+ ]>;
+
+ def SwiftReadAdvanceFourCyclesPred : SchedReadVariant<[
+ SchedVar<IsPredicatedPred, [SchedReadAdvance<4>]>,
+ SchedVar<NoSchedPred, [ReadALU]>
+ ]>;
+
+ // Multiply accumulate, three sources
+ def : InstRW< [SwiftPredP0P01FourFiveCycle, ReadALU, ReadALU,
+ SwiftReadAdvanceFourCyclesPred],
+ (instregex "MLAS", "MLA", "MLS", "SMMLA", "SMMLAR", "SMMLS", "SMMLSR",
+ "t2MLA", "t2MLS", "t2MLAS", "t2SMMLA", "t2SMMLAR", "t2SMMLS",
+ "t2SMMLSR")>;
+
+ // 4.2.13 Integer Multiply (32-bit result, Q flag)
+ def : InstRW< [SwiftWriteP0FourCycle],
+ (instregex "SMUAD", "SMUADX", "t2SMUAD", "t2SMUADX")>;
+ def : InstRW< [SwiftPredP0P01FourFiveCycle, ReadALU, ReadALU,
+ SwiftReadAdvanceFourCyclesPred],
+ (instregex "SMLABB", "SMLABT", "SMLATB", "SMLATT", "SMLSD", "SMLSDX",
+ "SMLAWB", "SMLAWT", "t2SMLABB", "t2SMLABT", "t2SMLATB", "t2SMLATT",
+ "t2SMLSD", "t2SMLSDX", "t2SMLAWB", "t2SMLAWT")>;
+ def : InstRW< [SwiftPredP0P01FourFiveCycle],
+ (instregex "SMLAD", "SMLADX", "t2SMLAD", "t2SMLADX")>;
+
+ def SwiftP0P0P01FiveCycle : SchedWriteRes<[SwiftUnitP0, SwiftUnitP01]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2, 1];
+ }
+ def SwiftWrite1Cycle : SchedWriteRes<[]> {
+ let Latency = 1;
+ let NumMicroOps = 0;
+ }
+ def SwiftWrite5Cycle : SchedWriteRes<[]> {
+ let Latency = 5;
+ let NumMicroOps = 0;
+ }
+ def SwiftWrite6Cycle : SchedWriteRes<[]> {
+ let Latency = 6;
+ let NumMicroOps = 0;
+ }
+
+ // 4.2.14 Integer Multiply, Long
+ def : InstRW< [SwiftP0P0P01FiveCycle, SwiftWrite5Cycle],
+ (instregex "SMULL$", "UMULL$", "t2SMULL$", "t2UMULL$")>;
+
+ def Swift2P03P01FiveCycle : SchedWriteRes<[SwiftUnitP0, SwiftUnitP01]> {
+ let Latency = 7;
+ let NumMicroOps = 5;
+ let ResourceCycles = [2, 3];
+ }
+
+ // 4.2.15 Integer Multiply Accumulate, Long
+ // 4.2.16 Integer Multiply Accumulate, Dual
+ // 4.2.17 Integer Multiply Accumulate Accumulate, Long
+ // We are being a bit inaccurate here.
+ def : InstRW< [SwiftWrite5Cycle, Swift2P03P01FiveCycle, ReadALU, ReadALU,
+ SchedReadAdvance<4>, SchedReadAdvance<3>],
+ (instregex "SMLALS", "UMLALS", "SMLAL", "UMLAL", "MLALBB", "SMLALBT",
+ "SMLALTB", "SMLALTT", "SMLALD", "SMLALDX", "SMLSLD", "SMLSLDX",
+ "UMAAL", "t2SMLALS", "t2UMLALS", "t2SMLAL", "t2UMLAL", "t2MLALBB", "t2SMLALBT",
+ "t2SMLALTB", "t2SMLALTT", "t2SMLALD", "t2SMLALDX", "t2SMLSLD", "t2SMLSLDX",
+ "t2UMAAL")>;
+
+ def SwiftDiv : SchedWriteRes<[SwiftUnitP0, SwiftUnitDiv]> {
+ let NumMicroOps = 1;
+ let Latency = 14;
+ let ResourceCycles = [1, 14];
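+    // P0 is busy for one cycle; the divide unit stays occupied for all 14 cycles.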
+ }
+ // 4.2.18 Integer Divide
+ def : WriteRes<WriteDiv, [SwiftUnitDiv]>; // Workaround.
+ def : InstRW <[SwiftDiv],
+ (instregex "SDIV", "UDIV", "t2SDIV", "t2UDIV")>;
+
+ // 4.2.19 Integer Load Single Element
+ // 4.2.20 Integer Load Signextended
+ def SwiftWriteP2P01ThreeCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> {
+ let Latency = 3;
+ }
+ def SwiftWriteP2P01FourCyle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> {
+ let Latency = 4;
+ }
+ def SwiftWriteP2P01P01FourCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01,
+ SwiftUnitP01]> {
+ let Latency = 4;
+ }
+ def SwiftWriteP2P2ThreeCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP2]> {
+ let Latency = 3;
+ }
+ def SwiftWriteP2P2P01ThreeCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP2,
+ SwiftUnitP01]> {
+ let Latency = 3;
+ }
+ def SwiftWrBackOne : SchedWriteRes<[]> {
+ let Latency = 1;
+ let NumMicroOps = 0;
+ }
+ def SwiftWriteLdFour : SchedWriteRes<[]> {
+ let Latency = 4;
+ let NumMicroOps = 0;
+ }
+ // Not accurate.
+ def : InstRW<[SwiftWriteP2ThreeCycle],
+ (instregex "LDR(i12|rs)$", "LDRB(i12|rs)$", "t2LDR(i8|i12|s|pci)",
+ "t2LDR(H|B)(i8|i12|s|pci)", "LDREX", "tLDR[BH](r|i|spi|pci|pciASM)",
+ "tLDR(r|i|spi|pci|pciASM)")>;
+ def : InstRW<[SwiftWriteP2ThreeCycle],
+ (instregex "LDRH$", "PICLDR$", "PICLDR(H|B)$", "LDRcp$")>;
+ def : InstRW<[SwiftWriteP2P01FourCyle],
+ (instregex "PICLDRS(H|B)$", "t2LDRS(H|B)(i|r|p|s)", "LDRS(H|B)$",
+ "t2LDRpci_pic", "tLDRS(B|H)")>;
+ def : InstRW<[SwiftWriteP2P01ThreeCycle, SwiftWrBackOne],
+ (instregex "LD(RB|R)(_|T_)(POST|PRE)_(IMM|REG)", "LDRH(_PRE|_POST)",
+ "LDR(T|BT)_POST_(REG|IMM)", "LDRHT(i|r)",
+ "t2LD(R|RB|RH)_(PRE|POST)", "t2LD(R|RB|RH)T")>;
+ def : InstRW<[SwiftWriteP2P01P01FourCycle, SwiftWrBackOne],
+ (instregex "LDR(SH|SB)(_POST|_PRE)", "t2LDR(SH|SB)(_POST|_PRE)",
+ "LDRS(B|H)T(i|r)", "t2LDRS(B|H)T(i|r)", "t2LDRS(B|H)T")>;
+
+ // 4.2.21 Integer Dual Load
+ // Not accurate.
+ def : InstRW<[SwiftWriteP2P2ThreeCycle, SwiftWriteLdFour],
+ (instregex "t2LDRDi8", "LDRD$")>;
+ def : InstRW<[SwiftWriteP2P2P01ThreeCycle, SwiftWriteLdFour, SwiftWrBackOne],
+ (instregex "LDRD_(POST|PRE)", "t2LDRD_(POST|PRE)")>;
+
+ // 4.2.22 Integer Load, Multiple
+ // NumReg = 1 .. 16
+ foreach Lat = 3-25 in {
+ def SwiftWriteLM#Lat#Cy : SchedWriteRes<[SwiftUnitP2]> {
+ let Latency = Lat;
+ }
+ def SwiftWriteLM#Lat#CyNo : SchedWriteRes<[]> { let Latency = Lat; }
+ }
+ // Predicate.
+ foreach NumAddr = 1-16 in {
+ def SwiftLMAddr#NumAddr#Pred : SchedPredicate<"TII->getNumLDMAddresses(MI) == "#NumAddr>;
+ }
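+ // Illustrative note: each SwiftLMAddr<N>Pred matches when the LDM/STM being
+ // scheduled transfers exactly N registers (via TII->getNumLDMAddresses) and
+ // selects the corresponding alternative in the SchedWriteVariants below.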
+ def SwiftWriteLDMAddrNoWB : SchedWriteRes<[SwiftUnitP01]> { let Latency = 0; }
+ def SwiftWriteLDMAddrWB : SchedWriteRes<[SwiftUnitP01, SwiftUnitP01]>;
+ def SwiftWriteLM : SchedWriteVariant<[
+ SchedVar<SwiftLMAddr2Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy]>,
+ SchedVar<SwiftLMAddr3Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5Cy]>,
+ SchedVar<SwiftLMAddr4Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5Cy, SwiftWriteLM6Cy]>,
+ SchedVar<SwiftLMAddr5Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+ SwiftWriteLM7Cy]>,
+ SchedVar<SwiftLMAddr6Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+ SwiftWriteLM7Cy, SwiftWriteLM8Cy]>,
+ SchedVar<SwiftLMAddr7Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+ SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM9Cy]>,
+ SchedVar<SwiftLMAddr8Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+ SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM9Cy, SwiftWriteLM10Cy]>,
+ SchedVar<SwiftLMAddr9Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+ SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM11Cy]>,
+ SchedVar<SwiftLMAddr10Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+ SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM11Cy, SwiftWriteLM12Cy]>,
+ SchedVar<SwiftLMAddr11Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+ SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+ SwiftWriteLM13Cy]>,
+ SchedVar<SwiftLMAddr12Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+ SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+ SwiftWriteLM13Cy, SwiftWriteLM14Cy]>,
+ SchedVar<SwiftLMAddr13Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+ SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+ SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+ SwiftWriteLM15Cy]>,
+ SchedVar<SwiftLMAddr14Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+ SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+ SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+ SwiftWriteLM15Cy, SwiftWriteLM16Cy]>,
+ SchedVar<SwiftLMAddr15Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+ SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+ SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+ SwiftWriteLM15Cy, SwiftWriteLM16Cy,
+ SwiftWriteLM17Cy]>,
+ SchedVar<SwiftLMAddr16Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+ SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+ SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+ SwiftWriteLM15Cy, SwiftWriteLM16Cy,
+ SwiftWriteLM17Cy, SwiftWriteLM18Cy]>,
+ // Unknown number of registers; just use the resources for two registers.
+ SchedVar<NoSchedPred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+ SwiftWriteLM5CyNo, SwiftWriteLM6CyNo,
+ SwiftWriteLM7CyNo, SwiftWriteLM8CyNo,
+ SwiftWriteLM9CyNo, SwiftWriteLM10CyNo,
+ SwiftWriteLM11CyNo, SwiftWriteLM12CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM14CyNo,
+ SwiftWriteLM15CyNo, SwiftWriteLM16CyNo,
+ SwiftWriteLM17CyNo, SwiftWriteLM18CyNo]>
+
+ ]> { let Variadic=1; }
+
+ def : InstRW<[SwiftWriteLM, SwiftWriteLDMAddrNoWB],
+ (instregex "LDM(IA|DA|DB|IB)$", "t2LDM(IA|DA|DB|IB)$",
+ "(t|sys)LDM(IA|DA|DB|IB)$")>;
+ def : InstRW<[SwiftWriteLDMAddrWB, SwiftWriteLM],
+ (instregex /*"t2LDMIA_RET", "tLDMIA_RET", "LDMIA_RET",*/
+ "LDM(IA|DA|DB|IB)_UPD", "(t2|sys|t)LDM(IA|DA|DB|IB)_UPD")>;
+ def : InstRW<[SwiftWriteLDMAddrWB, SwiftWriteLM, SwiftWriteP1TwoCycle],
+ (instregex "LDMIA_RET", "(t|t2)LDMIA_RET", "POP", "tPOP")>;
+ // 4.2.23 Integer Store, Single Element
+ def : InstRW<[SwiftWriteP2],
+ (instregex "PICSTR", "STR(i12|rs)", "STRB(i12|rs)", "STRH$", "STREX",
+ "t2STR(i12|i8|s)$", "t2STR[BH](i12|i8|s)$", "tSTR[BH](i|r)", "tSTR(i|r)", "tSTRspi")>;
+
+ def : InstRW<[SwiftWriteP01OneCycle, SwiftWriteP2],
+ (instregex "STR(B_|_|BT_|T_)(PRE_IMM|PRE_REG|POST_REG|POST_IMM)",
+ "STR(i|r)_preidx", "STRB(i|r)_preidx", "STRH_preidx", "STR(H_|HT_)(PRE|POST)",
+ "STR(BT|HT|T)", "t2STR_(PRE|POST)", "t2STR[BH]_(PRE|POST)",
+ "t2STR_preidx", "t2STR[BH]_preidx", "t2ST(RB|RH|R)T")>;
+
+ // 4.2.24 Integer Store, Dual
+ def : InstRW<[SwiftWriteP2, SwiftWriteP2, SwiftWriteP01OneCycle],
+ (instregex "STRD$", "t2STRDi8")>;
+ def : InstRW<[SwiftWriteP01OneCycle, SwiftWriteP2, SwiftWriteP2,
+ SwiftWriteP01OneCycle],
+ (instregex "(t2|t)STRD_(POST|PRE)", "STRD_(POST|PRE)")>;
+
+ // 4.2.25 Integer Store, Multiple
+ def SwiftWriteStIncAddr : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> {
+ let Latency = 0;
+ }
+ foreach NumAddr = 1-16 in {
+ def SwiftWriteSTM#NumAddr : WriteSequence<[SwiftWriteStIncAddr], NumAddr>;
+ }
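+ // SwiftWriteSTM<N> above expands to N repetitions of SwiftWriteStIncAddr,
+ // i.e. an N-register store-multiple is modelled as N store micro-ops.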
+ def SwiftWriteSTM : SchedWriteVariant<[
+ SchedVar<SwiftLMAddr2Pred, [SwiftWriteSTM2]>,
+ SchedVar<SwiftLMAddr3Pred, [SwiftWriteSTM3]>,
+ SchedVar<SwiftLMAddr4Pred, [SwiftWriteSTM4]>,
+ SchedVar<SwiftLMAddr5Pred, [SwiftWriteSTM5]>,
+ SchedVar<SwiftLMAddr6Pred, [SwiftWriteSTM6]>,
+ SchedVar<SwiftLMAddr7Pred, [SwiftWriteSTM7]>,
+ SchedVar<SwiftLMAddr8Pred, [SwiftWriteSTM8]>,
+ SchedVar<SwiftLMAddr9Pred, [SwiftWriteSTM9]>,
+ SchedVar<SwiftLMAddr10Pred,[SwiftWriteSTM10]>,
+ SchedVar<SwiftLMAddr11Pred,[SwiftWriteSTM11]>,
+ SchedVar<SwiftLMAddr12Pred,[SwiftWriteSTM12]>,
+ SchedVar<SwiftLMAddr13Pred,[SwiftWriteSTM13]>,
+ SchedVar<SwiftLMAddr14Pred,[SwiftWriteSTM14]>,
+ SchedVar<SwiftLMAddr15Pred,[SwiftWriteSTM15]>,
+ SchedVar<SwiftLMAddr16Pred,[SwiftWriteSTM16]>,
+ // Unknown number of registers; just use the resources for two registers.
+ SchedVar<NoSchedPred, [SwiftWriteSTM2]>
+ ]>;
+ def : InstRW<[SwiftWriteSTM],
+ (instregex "STM(IB|IA|DB|DA)$", "(t2|sys|t)STM(IB|IA|DB|DA)$")>;
+ def : InstRW<[SwiftWriteP01OneCycle, SwiftWriteSTM],
+ (instregex "STM(IB|IA|DB|DA)_UPD", "(t2|sys|t)STM(IB|IA|DB|DA)_UPD",
+ "PUSH", "tPUSH")>;
+
+ // 4.2.26 Branch
+ def : WriteRes<WriteBr, [SwiftUnitP1]> { let Latency = 0; }
+ def : WriteRes<WriteBrL, [SwiftUnitP1]> { let Latency = 2; }
+ def : WriteRes<WriteBrTbl, [SwiftUnitP1, SwiftUnitP2]> { let Latency = 0; }
+
+ // 4.2.27 Not issued
+ def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
+ def : InstRW<[WriteNoop], (instregex "t2IT", "IT", "NOP")>;
+
+ // 4.2.28 Advanced SIMD, Integer, 2 cycle
+ def : InstRW<[SwiftWriteP0TwoCycle],
+ (instregex "VADDv", "VSUBv", "VNEG(s|f|v)", "VADDL", "VSUBL",
+ "VADDW", "VSUBW", "VHADD", "VHSUB", "VRHADD", "VPADDi",
+ "VPADDL", "VAND", "VBIC", "VEOR", "VORN", "VORR", "VTST",
+ "VSHL", "VSHR(s|u)", "VSHLL", "VQSHL", "VQSHLU", "VBIF",
+ "VBIT", "VBSL", "VSLI", "VSRI", "VCLS", "VCLZ", "VCNT")>;
+
+ def : InstRW<[SwiftWriteP1TwoCycle],
+ (instregex "VEXT", "VREV16", "VREV32", "VREV64")>;
+
+ // 4.2.29 Advanced SIMD, Integer, 4 cycle
+ // 4.2.30 Advanced SIMD, Integer with Accumulate
+ def : InstRW<[SwiftWriteP0FourCycle],
+ (instregex "VABA", "VABAL", "VPADAL", "VRSRA", "VSRA", "VACGE", "VACGT",
+ "VACLE", "VACLT", "VCEQ", "VCGE", "VCGT", "VCLE", "VCLT", "VRSHL",
+ "VQRSHL", "VRSHR(u|s)", "VABS(f|v)", "VQABS", "VQNEG", "VQADD",
+ "VQSUB")>;
+ def : InstRW<[SwiftWriteP1FourCycle],
+ (instregex "VRECPE", "VRSQRTE")>;
+
+ // 4.2.31 Advanced SIMD, Add and Shift with Narrow
+ def : InstRW<[SwiftWriteP0P1FourCycle],
+ (instregex "VADDHN", "VSUBHN", "VSHRN")>;
+ def : InstRW<[SwiftWriteP0P1SixCycle],
+ (instregex "VRADDHN", "VRSUBHN", "VRSHRN", "VQSHRN", "VQSHRUN",
+ "VQRSHRN", "VQRSHRUN")>;
+
+ // 4.2.32 Advanced SIMD, Vector Table Lookup
+ foreach Num = 1-4 in {
+ def SwiftWrite#Num#xP1TwoCycle : WriteSequence<[SwiftWriteP1TwoCycle], Num>;
+ }
+ def : InstRW<[SwiftWrite1xP1TwoCycle],
+ (instregex "VTB(L|X)1")>;
+ def : InstRW<[SwiftWrite2xP1TwoCycle],
+ (instregex "VTB(L|X)2")>;
+ def : InstRW<[SwiftWrite3xP1TwoCycle],
+ (instregex "VTB(L|X)3")>;
+ def : InstRW<[SwiftWrite4xP1TwoCycle],
+ (instregex "VTB(L|X)4")>;
+
+ // 4.2.33 Advanced SIMD, Transpose
+ def : InstRW<[SwiftWriteP1FourCycle, SwiftWriteP1FourCycle,
+ SwiftWriteP1TwoCycle/*RsrcOnly*/, SchedReadAdvance<2>],
+ (instregex "VSWP", "VTRN", "VUZP", "VZIP")>;
+
+ // 4.2.34 Advanced SIMD and VFP, Floating Point
+ def : InstRW<[SwiftWriteP0TwoCycle], (instregex "VABS(S|D)$", "VNEG(S|D)$")>;
+ def : InstRW<[SwiftWriteP0FourCycle],
+ (instregex "VCMP(D|S|ZD|ZS)$", "VCMPE(D|S|ZD|ZS)")>;
+ def : InstRW<[SwiftWriteP0FourCycle],
+ (instregex "VADD(S|f)", "VSUB(S|f)", "VABD", "VPADDf", "VMAX", "VMIN", "VPMAX",
+ "VPMIN")>;
+ def : InstRW<[SwiftWriteP0SixCycle], (instregex "VADDD$", "VSUBD$")>;
+ def : InstRW<[SwiftWriteP1EightCycle], (instregex "VRECPS", "VRSQRTS")>;
+
+ // 4.2.35 Advanced SIMD and VFP, Multiply
+ def : InstRW<[SwiftWriteP1FourCycle],
+ (instregex "VMUL(S|v|p|f|s)", "VNMULS", "VQDMULH", "VQRDMULH",
+ "VMULL", "VQDMULL")>;
+ def : InstRW<[SwiftWriteP1SixCycle],
+ (instregex "VMULD", "VNMULD")>;
+ def : InstRW<[SwiftWriteP1FourCycle],
+ (instregex "VMLA", "VMLS", "VNMLA", "VNMLS", "VFMA(S|D)", "VFMS(S|D)",
+ "VFNMA", "VFNMS", "VMLAL", "VMLSL","VQDMLAL", "VQDMLSL")>;
+ def : InstRW<[SwiftWriteP1EightCycle], (instregex "VFMAfd", "VFMSfd")>;
+ def : InstRW<[SwiftWriteP1TwelveCyc], (instregex "VFMAfq", "VFMSfq")>;
+
+ // 4.2.36 Advanced SIMD and VFP, Convert
+ def : InstRW<[SwiftWriteP1FourCycle], (instregex "VCVT", "V(S|U)IT", "VTO(S|U)")>;
+ // Fixed-point conversions.
+ def : WriteRes<WriteCvtFP, [SwiftUnitP1]> { let Latency = 4; }
+
+ // 4.2.37 Advanced SIMD and VFP, Move
+ def : InstRW<[SwiftWriteP0TwoCycle],
+ (instregex "VMOVv", "VMOV(S|D)$", "VMOV(S|D)cc",
+ "VMVNv", "VMVN(d|q)", "VMVN(S|D)cc",
+ "FCONST(D|S)")>;
+ def : InstRW<[SwiftWriteP1TwoCycle], (instregex "VMOVN", "VMOVL")>;
+ def : InstRW<[WriteSequence<[SwiftWriteP0FourCycle, SwiftWriteP1TwoCycle]>],
+ (instregex "VQMOVN")>;
+ def : InstRW<[SwiftWriteP1TwoCycle], (instregex "VDUPLN", "VDUPf")>;
+ def : InstRW<[WriteSequence<[SwiftWriteP2FourCycle, SwiftWriteP1TwoCycle]>],
+ (instregex "VDUP(8|16|32)")>;
+ def : InstRW<[SwiftWriteP2ThreeCycle], (instregex "VMOVRS$")>;
+ def : InstRW<[WriteSequence<[SwiftWriteP2FourCycle, SwiftWriteP0TwoCycle]>],
+ (instregex "VMOVSR$", "VSETLN")>;
+ def : InstRW<[SwiftWriteP2ThreeCycle, SwiftWriteP2FourCycle],
+ (instregex "VMOVRR(D|S)$")>;
+ def : InstRW<[SwiftWriteP2FourCycle], (instregex "VMOVDRR$")>;
+ def : InstRW<[WriteSequence<[SwiftWriteP2FourCycle, SwiftWriteP1TwoCycle]>,
+ WriteSequence<[SwiftWrite1Cycle, SwiftWriteP2FourCycle,
+ SwiftWriteP1TwoCycle]>],
+ (instregex "VMOVSRR$")>;
+ def : InstRW<[WriteSequence<[SwiftWriteP1TwoCycle, SwiftWriteP2ThreeCycle]>],
+ (instregex "VGETLN(u|i)")>;
+ def : InstRW<[WriteSequence<[SwiftWriteP1TwoCycle, SwiftWriteP2ThreeCycle,
+ SwiftWriteP01OneCycle]>],
+ (instregex "VGETLNs")>;
+
+ // 4.2.38 Advanced SIMD and VFP, Move FPSCR
+ // Serializing instructions.
+ def SwiftWaitP0For15Cy : SchedWriteRes<[SwiftUnitP0]> {
+ let Latency = 15;
+ let ResourceCycles = [15];
+ }
+ def SwiftWaitP1For15Cy : SchedWriteRes<[SwiftUnitP1]> {
+ let Latency = 15;
+ let ResourceCycles = [15];
+ }
+ def SwiftWaitP2For15Cy : SchedWriteRes<[SwiftUnitP2]> {
+ let Latency = 15;
+ let ResourceCycles = [15];
+ }
+ def : InstRW<[SwiftWaitP0For15Cy, SwiftWaitP1For15Cy, SwiftWaitP2For15Cy],
+ (instregex "VMRS")>;
+ def : InstRW<[SwiftWaitP0For15Cy, SwiftWaitP1For15Cy, SwiftWaitP2For15Cy],
+ (instregex "VMSR")>;
+ // Not serializing.
+ def : InstRW<[SwiftWriteP0TwoCycle], (instregex "FMSTAT")>;
+
+ // 4.2.39 Advanced SIMD and VFP, Load Single Element
+ def : InstRW<[SwiftWriteLM4Cy], (instregex "VLDRD$", "VLDRS$")>;
+
+ // 4.2.40 Advanced SIMD and VFP, Store Single Element
+ def : InstRW<[SwiftWriteLM4Cy], (instregex "VSTRD$", "VSTRS$")>;
+
+ // 4.2.41 Advanced SIMD and VFP, Load Multiple
+ // 4.2.42 Advanced SIMD and VFP, Store Multiple
+
+ // Resource requirement for permuting, just reserves the resources.
+ foreach Num = 1-28 in {
+ def SwiftVLDMPerm#Num : SchedWriteRes<[SwiftUnitP1]> {
+ let Latency = 0;
+ let NumMicroOps = Num;
+ let ResourceCycles = [Num];
+ }
+ }
+
+ // Pre-RA pseudos - load/store to a Q register as a D register pair.
+ def : InstRW<[SwiftWriteLM4Cy], (instregex "VLDMQIA$", "VSTMQIA$")>;
+
+ // Post-RA code is not modelled accurately. We assume that a 64-bit-wide
+ // register use maps to a D register and a 128-bit-wide use maps to a Q
+ // register. Not all of the different variants are represented accurately.
+ def SwiftWriteVLDM : SchedWriteVariant<[
+ // Load of one S register.
+ SchedVar<SwiftLMAddr1Pred, [SwiftWriteLM4Cy]>,
+ // Load of one D register.
+ SchedVar<SwiftLMAddr2Pred, [SwiftWriteLM4Cy, SwiftWriteLM4CyNo]>,
+ // Load of 3 S register.
+ SchedVar<SwiftLMAddr3Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM13CyNo, SwiftWriteP01OneCycle,
+ SwiftVLDMPerm3]>,
+ // Load of a Q register (not necessarily true). We should not be mapping to
+ // 4 S registers, either.
+ SchedVar<SwiftLMAddr4Pred, [SwiftWriteLM4Cy, SwiftWriteLM4CyNo,
+ SwiftWriteLM4CyNo, SwiftWriteLM4CyNo]>,
+ // Load of 5 S registers.
+ SchedVar<SwiftLMAddr5Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM13CyNo, SwiftWriteLM14CyNo,
+ SwiftWriteLM17CyNo, SwiftWriteP01OneCycle,
+ SwiftVLDMPerm5]>,
+ // Load of 3 D registers. (Must also be able to handle an S register list,
+ // though this is not accurate.)
+ SchedVar<SwiftLMAddr6Pred, [SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM10Cy, SwiftWriteLM14CyNo,
+ SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+ SwiftWriteP01OneCycle, SwiftVLDMPerm5]>,
+ // Load of 7 S registers.
+ SchedVar<SwiftLMAddr7Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM13Cy, SwiftWriteLM14CyNo,
+ SwiftWriteLM17CyNo, SwiftWriteLM18CyNo,
+ SwiftWriteLM21CyNo, SwiftWriteP01OneCycle,
+ SwiftVLDMPerm7]>,
+ // Load of two Q registers.
+ SchedVar<SwiftLMAddr8Pred, [SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM13Cy, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteP01OneCycle, SwiftVLDMPerm2]>,
+ // Load of 9 S registers.
+ SchedVar<SwiftLMAddr9Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM13Cy, SwiftWriteLM14CyNo,
+ SwiftWriteLM17CyNo, SwiftWriteLM18CyNo,
+ SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteLM25CyNo, SwiftWriteP01OneCycle,
+ SwiftVLDMPerm9]>,
+ // Load of 5 D registers.
+ SchedVar<SwiftLMAddr10Pred,[SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM10Cy, SwiftWriteLM14Cy,
+ SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+ SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+ SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+ SwiftWriteP01OneCycle, SwiftVLDMPerm5]>,
+ // Inaccurate: reuse the description for 9 S registers.
+ SchedVar<SwiftLMAddr11Pred,[SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM13Cy, SwiftWriteLM14CyNo,
+ SwiftWriteLM17CyNo, SwiftWriteLM18CyNo,
+ SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteLM25CyNo, SwiftWriteP01OneCycle,
+ SwiftVLDMPerm9]>,
+ // Load of three Q registers.
+ SchedVar<SwiftLMAddr12Pred,[SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM11Cy, SwiftWriteLM11Cy,
+ SwiftWriteLM11CyNo, SwiftWriteLM11CyNo,
+ SwiftWriteLM11CyNo, SwiftWriteLM11CyNo,
+ SwiftWriteLM11CyNo, SwiftWriteLM11CyNo,
+ SwiftWriteLM11CyNo, SwiftWriteLM11CyNo,
+ SwiftWriteP01OneCycle, SwiftVLDMPerm3]>,
+ // Inaccurate: reuse the description for 9 S registers.
+ SchedVar<SwiftLMAddr13Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM13Cy, SwiftWriteLM14CyNo,
+ SwiftWriteLM17CyNo, SwiftWriteLM18CyNo,
+ SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteLM25CyNo, SwiftWriteP01OneCycle,
+ SwiftVLDMPerm9]>,
+ // Load of 7 D registers (inaccurate).
+ SchedVar<SwiftLMAddr14Pred,[SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM10Cy, SwiftWriteLM14Cy,
+ SwiftWriteLM14Cy, SwiftWriteLM14CyNo,
+ SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+ SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+ SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+ SwiftWriteP01OneCycle, SwiftVLDMPerm7]>,
+ SchedVar<SwiftLMAddr15Pred,[SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+ SwiftWriteLM17Cy, SwiftWriteLM18CyNo,
+ SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteLM25CyNo, SwiftWriteP01OneCycle,
+ SwiftVLDMPerm9]>,
+ // Load of 4 Q registers.
+ SchedVar<SwiftLMAddr16Pred,[SwiftWriteLM7Cy, SwiftWriteLM10Cy,
+ SwiftWriteLM11Cy, SwiftWriteLM14Cy,
+ SwiftWriteLM15Cy, SwiftWriteLM18CyNo,
+ SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+ SwiftWriteP01OneCycle, SwiftVLDMPerm4]>,
+ // Unknown number of registers; just use the resources for two registers.
+ SchedVar<NoSchedPred, [SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+ SwiftWriteLM13Cy, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+ SwiftWriteP01OneCycle, SwiftVLDMPerm2]>
+ ]> { let Variadic = 1; }
+
+ def : InstRW<[SwiftWriteVLDM], (instregex "VLDM[SD](IA|DB)$")>;
+
+ def : InstRW<[SwiftWriteP01OneCycle2x, SwiftWriteVLDM],
+ (instregex "VLDM[SD](IA|DB)_UPD$")>;
+
+ def SwiftWriteVSTM : SchedWriteVariant<[
+ // One S register.
+ SchedVar<SwiftLMAddr1Pred, [SwiftWriteSTM1]>,
+ // One D register.
+ SchedVar<SwiftLMAddr2Pred, [SwiftWriteSTM1]>,
+ // Three S registers.
+ SchedVar<SwiftLMAddr3Pred, [SwiftWriteSTM4]>,
+ // Assume one Q register.
+ SchedVar<SwiftLMAddr4Pred, [SwiftWriteSTM1]>,
+ SchedVar<SwiftLMAddr5Pred, [SwiftWriteSTM6]>,
+ // Assume three D registers.
+ SchedVar<SwiftLMAddr6Pred, [SwiftWriteSTM4]>,
+ SchedVar<SwiftLMAddr7Pred, [SwiftWriteSTM8]>,
+ // Assume two Q registers.
+ SchedVar<SwiftLMAddr8Pred, [SwiftWriteSTM3]>,
+ SchedVar<SwiftLMAddr9Pred, [SwiftWriteSTM10]>,
+ // Assume 5 D registers.
+ SchedVar<SwiftLMAddr10Pred, [SwiftWriteSTM6]>,
+ SchedVar<SwiftLMAddr11Pred, [SwiftWriteSTM12]>,
+ // Assume three Q registers.
+ SchedVar<SwiftLMAddr12Pred, [SwiftWriteSTM4]>,
+ SchedVar<SwiftLMAddr13Pred, [SwiftWriteSTM14]>,
+ // Assume 7 D registers.
+ SchedVar<SwiftLMAddr14Pred, [SwiftWriteSTM8]>,
+ SchedVar<SwiftLMAddr15Pred, [SwiftWriteSTM16]>,
+ // Assume four Q registers.
+ SchedVar<SwiftLMAddr16Pred, [SwiftWriteSTM5]>,
+ // Assume two Q registers.
+ SchedVar<NoSchedPred, [SwiftWriteSTM3]>
+ ]> { let Variadic = 1; }
+
+ def : InstRW<[SwiftWriteVSTM], (instregex "VSTM[SD](IA|DB)$")>;
+
+ def : InstRW<[SwiftWriteP01OneCycle2x, SwiftWriteVSTM],
+ (instregex "VSTM[SD](IA|DB)_UPD")>;
+
+ // 4.2.43 Advanced SIMD, Element or Structure Load and Store
+ def SwiftWrite2xP2FourCy : SchedWriteRes<[SwiftUnitP2]> {
+ let Latency = 4;
+ let ResourceCycles = [2];
+ }
+ def SwiftWrite3xP2FourCy : SchedWriteRes<[SwiftUnitP2]> {
+ let Latency = 4;
+ let ResourceCycles = [3];
+ }
+ foreach Num = 1-2 in {
+ def SwiftExt#Num#xP0 : SchedWriteRes<[SwiftUnitP0]> {
+ let Latency = 0;
+ let NumMicroOps = Num;
+ let ResourceCycles = [Num];
+ }
+ }
+ // VLDx
+ // Multiple structures.
+ // Single element structure loads.
+ // We assume aligned.
+ // Single/two register.
+ def : InstRW<[SwiftWriteLM4Cy], (instregex "VLD1(d|q)(8|16|32|64)$")>;
+ def : InstRW<[SwiftWriteLM4Cy, SwiftWriteP01OneCycle],
+ (instregex "VLD1(d|q)(8|16|32|64)wb")>;
+ // Three register.
+ def : InstRW<[SwiftWrite3xP2FourCy],
+ (instregex "VLD1(d|q)(8|16|32|64)T$", "VLD1d64TPseudo")>;
+ def : InstRW<[SwiftWrite3xP2FourCy, SwiftWriteP01OneCycle],
+ (instregex "VLD1(d|q)(8|16|32|64)Twb")>;
+ // Four register.
+ def : InstRW<[SwiftWrite2xP2FourCy],
+ (instregex "VLD1(d|q)(8|16|32|64)Q$", "VLD1d64QPseudo")>;
+ def : InstRW<[SwiftWrite2xP2FourCy, SwiftWriteP01OneCycle],
+ (instregex "VLD1(d|q)(8|16|32|64)Qwb")>;
+ // Two element structure loads.
+ // Two/four register.
+ def : InstRW<[SwiftWriteLM9Cy, SwiftExt2xP0, SwiftVLDMPerm2],
+ (instregex "VLD2(d|q|b)(8|16|32)$", "VLD2q(8|16|32)Pseudo$")>;
+ def : InstRW<[SwiftWriteLM9Cy, SwiftWriteP01OneCycle, SwiftExt2xP0,
+ SwiftVLDMPerm2],
+ (instregex "VLD2(d|q|b)(8|16|32)wb", "VLD2q(8|16|32)PseudoWB")>;
+ // Three element structure.
+ def : InstRW<[SwiftWriteLM9Cy, SwiftWriteLM9CyNo, SwiftWriteLM9CyNo,
+ SwiftVLDMPerm3, SwiftWrite3xP2FourCy],
+ (instregex "VLD3(d|q)(8|16|32)$")>;
+ def : InstRW<[SwiftWriteLM9Cy, SwiftVLDMPerm3, SwiftWrite3xP2FourCy],
+ (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo$")>;
+
+ def : InstRW<[SwiftWriteLM9Cy, SwiftWriteLM9CyNo, SwiftWriteLM9CyNo,
+ SwiftWriteP01OneCycle, SwiftVLDMPerm3, SwiftWrite3xP2FourCy],
+ (instregex "VLD3(d|q)(8|16|32)_UPD$")>;
+ def : InstRW<[SwiftWriteLM9Cy, SwiftWriteP01OneCycle, SwiftVLDMPerm3,
+ SwiftWrite3xP2FourCy],
+ (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
+ // Four element structure loads.
+ def : InstRW<[SwiftWriteLM11Cy, SwiftWriteLM11Cy, SwiftWriteLM11Cy,
+ SwiftWriteLM11Cy, SwiftExt2xP0, SwiftVLDMPerm4,
+ SwiftWrite3xP2FourCy],
+ (instregex "VLD4(d|q)(8|16|32)$")>;
+ def : InstRW<[SwiftWriteLM11Cy, SwiftExt2xP0, SwiftVLDMPerm4,
+ SwiftWrite3xP2FourCy],
+ (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo$")>;
+ def : InstRW<[SwiftWriteLM11Cy, SwiftWriteLM11Cy, SwiftWriteLM11Cy,
+ SwiftWriteLM11Cy, SwiftWriteP01OneCycle, SwiftExt2xP0,
+ SwiftVLDMPerm4, SwiftWrite3xP2FourCy],
+ (instregex "VLD4(d|q)(8|16|32)_UPD")>;
+ def : InstRW<[SwiftWriteLM11Cy, SwiftWriteP01OneCycle, SwiftExt2xP0,
+ SwiftVLDMPerm4, SwiftWrite3xP2FourCy],
+ (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
+
+ // Single-lane and all-lane loads.
+ // One element structure.
+ def : InstRW<[SwiftWriteLM6Cy, SwiftVLDMPerm2],
+ (instregex "VLD1(LN|DUP)(d|q)(8|16|32)$", "VLD1(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
+ def : InstRW<[SwiftWriteLM6Cy, SwiftWriteP01OneCycle, SwiftVLDMPerm2],
+ (instregex "VLD1(LN|DUP)(d|q)(8|16|32)(wb|_UPD)",
+ "VLD1LNq(8|16|32)Pseudo_UPD")>;
+ // Two element structure.
+ def : InstRW<[SwiftWriteLM6Cy, SwiftWriteLM6Cy, SwiftExt1xP0, SwiftVLDMPerm2],
+ (instregex "VLD2(DUP|LN)(d|q)(8|16|32|8x2|16x2|32x2)$",
+ "VLD2LN(d|q)(8|16|32)Pseudo$")>;
+ def : InstRW<[SwiftWriteLM6Cy, SwiftWriteLM6Cy, SwiftWriteP01OneCycle,
+ SwiftExt1xP0, SwiftVLDMPerm2],
+ (instregex "VLD2LN(d|q)(8|16|32)_UPD$")>;
+ def : InstRW<[SwiftWriteLM6Cy, SwiftWriteP01OneCycle, SwiftWriteLM6Cy,
+ SwiftExt1xP0, SwiftVLDMPerm2],
+ (instregex "VLD2DUPd(8|16|32|8x2|16x2|32x2)wb")>;
+ def : InstRW<[SwiftWriteLM6Cy, SwiftWriteP01OneCycle, SwiftWriteLM6Cy,
+ SwiftExt1xP0, SwiftVLDMPerm2],
+ (instregex "VLD2LN(d|q)(8|16|32)Pseudo_UPD")>;
+ // Three element structure.
+ def : InstRW<[SwiftWriteLM7Cy, SwiftWriteLM8Cy, SwiftWriteLM8Cy, SwiftExt1xP0,
+ SwiftVLDMPerm3],
+ (instregex "VLD3(DUP|LN)(d|q)(8|16|32)$",
+ "VLD3(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
+ def : InstRW<[SwiftWriteLM7Cy, SwiftWriteLM8Cy, SwiftWriteLM8Cy,
+ SwiftWriteP01OneCycle, SwiftExt1xP0, SwiftVLDMPerm3],
+ (instregex "VLD3(LN|DUP)(d|q)(8|16|32)_UPD")>;
+ def : InstRW<[SwiftWriteLM7Cy, SwiftWriteP01OneCycle, SwiftWriteLM8Cy,
+ SwiftWriteLM8Cy, SwiftExt1xP0, SwiftVLDMPerm3],
+ (instregex "VLD3(LN|DUP)(d|q)(8|16|32)Pseudo_UPD")>;
+ // Four element structure.
+ def : InstRW<[SwiftWriteLM8Cy, SwiftWriteLM9Cy, SwiftWriteLM10CyNo,
+ SwiftWriteLM10CyNo, SwiftExt1xP0, SwiftVLDMPerm5],
+ (instregex "VLD4(LN|DUP)(d|q)(8|16|32)$",
+ "VLD4(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
+ def : InstRW<[SwiftWriteLM8Cy, SwiftWriteLM9Cy, SwiftWriteLM10CyNo,
+ SwiftWriteLM10CyNo, SwiftWriteP01OneCycle, SwiftExt1xP0,
+ SwiftVLDMPerm5],
+ (instregex "VLD4(DUP|LN)(d|q)(8|16|32)_UPD")>;
+ def : InstRW<[SwiftWriteLM8Cy, SwiftWriteP01OneCycle, SwiftWriteLM9Cy,
+ SwiftWriteLM10CyNo, SwiftWriteLM10CyNo, SwiftExt1xP0,
+ SwiftVLDMPerm5],
+ (instregex "VLD4(DUP|LN)(d|q)(8|16|32)Pseudo_UPD")>;
+ // VSTx
+ // Multiple structures.
+ // Single element structure store.
+ def : InstRW<[SwiftWrite1xP2], (instregex "VST1d(8|16|32|64)$")>;
+ def : InstRW<[SwiftWrite2xP2], (instregex "VST1q(8|16|32|64)$")>;
+ def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite1xP2],
+ (instregex "VST1d(8|16|32|64)wb")>;
+ def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite2xP2],
+ (instregex "VST1q(8|16|32|64)wb")>;
+ def : InstRW<[SwiftWrite3xP2],
+ (instregex "VST1d(8|16|32|64)T$", "VST1d64TPseudo$")>;
+ def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite3xP2],
+ (instregex "VST1d(8|16|32|64)Twb", "VST1d64TPseudoWB")>;
+ def : InstRW<[SwiftWrite4xP2],
+ (instregex "VST1d(8|16|32|64)(Q|QPseudo)$")>;
+ def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite4xP2],
+ (instregex "VST1d(8|16|32|64)(Qwb|QPseudoWB)")>;
+ // Two element structure store.
+ def : InstRW<[SwiftWrite1xP2, SwiftVLDMPerm1],
+ (instregex "VST2(d|b)(8|16|32)$")>;
+ def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite1xP2, SwiftVLDMPerm1],
+ (instregex "VST2(b|d)(8|16|32)wb")>;
+ def : InstRW<[SwiftWrite2xP2, SwiftVLDMPerm2],
+ (instregex "VST2q(8|16|32)$", "VST2q(8|16|32)Pseudo$")>;
+ def : InstRW<[SwiftWrite2xP2, SwiftVLDMPerm2],
+ (instregex "VST2q(8|16|32)wb", "VST2q(8|16|32)PseudoWB")>;
+ // Three element structure store.
+ def : InstRW<[SwiftWrite4xP2, SwiftVLDMPerm2],
+ (instregex "VST3(d|q)(8|16|32)$", "VST3(d|q)(8|16|32)(oddP|P)seudo$")>;
+ def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite4xP2, SwiftVLDMPerm2],
+ (instregex "VST3(d|q)(8|16|32)_UPD",
+ "VST3(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;
+ // Four element structure store.
+ def : InstRW<[SwiftWrite4xP2, SwiftVLDMPerm2],
+ (instregex "VST4(d|q)(8|16|32)$", "VST4(d|q)(8|16|32)(oddP|P)seudo$")>;
+ def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite4xP2, SwiftVLDMPerm4],
+ (instregex "VST4(d|q)(8|16|32)_UPD",
+ "VST4(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;
+ // Single-lane and all-lane stores.
+ // One element structure.
+ def : InstRW<[SwiftWrite1xP2, SwiftVLDMPerm1],
+ (instregex "VST1LNd(8|16|32)$", "VST1LNq(8|16|32)Pseudo$")>;
+ def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite1xP2, SwiftVLDMPerm1],
+ (instregex "VST1LNd(8|16|32)_UPD", "VST1LNq(8|16|32)Pseudo_UPD")>;
+ // Two element structure.
+ def : InstRW<[SwiftWrite1xP2, SwiftVLDMPerm2],
+ (instregex "VST2LN(d|q)(8|16|32)$", "VST2LN(d|q)(8|16|32)Pseudo$")>;
+ def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite1xP2, SwiftVLDMPerm2],
+ (instregex "VST2LN(d|q)(8|16|32)_UPD",
+ "VST2LN(d|q)(8|16|32)Pseudo_UPD")>;
+ // Three element structure.
+ def : InstRW<[SwiftWrite4xP2, SwiftVLDMPerm2],
+ (instregex "VST3LN(d|q)(8|16|32)$", "VST3LN(d|q)(8|16|32)Pseudo$")>;
+ def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite4xP2, SwiftVLDMPerm2],
+ (instregex "VST3LN(d|q)(8|16|32)_UPD",
+ "VST3LN(d|q)(8|16|32)Pseudo_UPD")>;
+ // Four element structure.
+ def : InstRW<[SwiftWrite2xP2, SwiftVLDMPerm2],
+ (instregex "VST4LN(d|q)(8|16|32)$", "VST4LN(d|q)(8|16|32)Pseudo$")>;
+ def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite2xP2, SwiftVLDMPerm2],
+ (instregex "VST4LN(d|q)(8|16|32)_UPD",
+ "VST4LN(d|q)(8|16|32)Pseudo_UPD")>;
+
+ // 4.2.44 VFP, Divide and Square Root
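+ // The divide/square-root unit is modelled as non-pipelined: the
+ // ResourceCycles below keep SwiftUnitDiv busy for most of the latency.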
+ def SwiftDiv17 : SchedWriteRes<[SwiftUnitP0, SwiftUnitDiv]> {
+ let NumMicroOps = 1;
+ let Latency = 17;
+ let ResourceCycles = [1, 15];
+ }
+ def SwiftDiv32 : SchedWriteRes<[SwiftUnitP0, SwiftUnitDiv]> {
+ let NumMicroOps = 1;
+ let Latency = 32;
+ let ResourceCycles = [1, 30];
+ }
+ def : InstRW<[SwiftDiv17], (instregex "VDIVS", "VSQRTS")>;
+ def : InstRW<[SwiftDiv32], (instregex "VDIVD", "VSQRTD")>;
+
+ // Not specified.
+ def : InstRW<[SwiftWriteP01OneCycle2x], (instregex "ABS")>;
+ // Preload.
+ def : WriteRes<WritePreLd, [SwiftUnitP2]> { let Latency = 0;
+ let ResourceCycles = [0];
+ }
+
}
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index 41a7e0c..93add6e 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -26,7 +26,7 @@ ARMSelectionDAGInfo::~ARMSelectionDAGInfo() {
}
SDValue
-ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
SDValue Chain,
SDValue Dst, SDValue Src,
SDValue Size, unsigned Align,
@@ -140,7 +140,7 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
// GNU library uses (ptr, value, size)
// See RTABI section 4.3.4
SDValue ARMSelectionDAGInfo::
-EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
SDValue Chain, SDValue Dst,
SDValue Src, SDValue Size,
unsigned Align, bool isVolatile,
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.h b/lib/Target/ARM/ARMSelectionDAGInfo.h
index 6419a73..56c9375 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.h
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.h
@@ -45,7 +45,7 @@ public:
~ARMSelectionDAGInfo();
virtual
- SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
SDValue Chain,
SDValue Dst, SDValue Src,
SDValue Size, unsigned Align,
@@ -55,7 +55,7 @@ public:
// Adjust parameters for memset, see RTABI section 4.3.4
virtual
- SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+ SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
SDValue Chain,
SDValue Op1, SDValue Op2,
SDValue Op3, unsigned Align,
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 3b8e56f..4d204ce 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -38,9 +38,24 @@ static cl::opt<bool>
UseFusedMulOps("arm-use-mulops",
cl::init(true), cl::Hidden);
-static cl::opt<bool>
-StrictAlign("arm-strict-align", cl::Hidden,
- cl::desc("Disallow all unaligned memory accesses"));
+enum AlignMode {
+ DefaultAlign,
+ StrictAlign,
+ NoStrictAlign
+};
+
+static cl::opt<AlignMode>
+Align(cl::desc("Load/store alignment support"),
+ cl::Hidden, cl::init(DefaultAlign),
+ cl::values(
+ clEnumValN(DefaultAlign, "arm-default-align",
+ "Generate unaligned accesses only on hardware/OS "
+ "combinations that are known to support them"),
+ clEnumValN(StrictAlign, "arm-strict-align",
+ "Disallow all unaligned memory accesses"),
+ clEnumValN(NoStrictAlign, "arm-no-strict-align",
+ "Allow unaligned memory accesses"),
+ clEnumValEnd));
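+// For example (illustrative), the alignment mode can be forced from the llc
+// command line with -arm-strict-align or -arm-no-strict-align.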
ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU,
const std::string &FS, const TargetOptions &Options)
@@ -91,6 +106,7 @@ void ARMSubtarget::initializeEnvironment() {
HasRAS = false;
HasMPExtension = false;
FPOnlySP = false;
+ HasPerfMon = false;
HasTrustZone = false;
AllowsUnalignedMem = false;
Thumb2DSP = false;
@@ -162,10 +178,32 @@ void ARMSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
if (!isThumb() || hasThumb2())
PostRAScheduler = true;
- // v6+ may or may not support unaligned mem access depending on the system
- // configuration.
- if (!StrictAlign && hasV6Ops() && isTargetDarwin())
- AllowsUnalignedMem = true;
+ switch (Align) {
+ case DefaultAlign:
+ // Assume pre-ARMv6 doesn't support unaligned accesses.
+ //
+ // ARMv6 may or may not support unaligned accesses depending on the
+ // SCTLR.U bit, which is architecture-specific. We assume ARMv6
+ // Darwin targets support unaligned accesses, and others don't.
+ //
+ // ARMv7 always has SCTLR.U set to 1, but it has a new SCTLR.A bit
+ // which raises an alignment fault on unaligned accesses. Linux
+ // defaults this bit to 0 and handles it as a system-wide (not
+ // per-process) setting. It is therefore safe to assume that ARMv7+
+ // Linux targets support unaligned accesses. The same goes for NaCl.
+ //
+ // The above behavior is consistent with GCC.
+ AllowsUnalignedMem = (
+ (hasV7Ops() && (isTargetLinux() || isTargetNaCl())) ||
+ (hasV6Ops() && isTargetDarwin()));
+ break;
+ case StrictAlign:
+ AllowsUnalignedMem = false;
+ break;
+ case NoStrictAlign:
+ AllowsUnalignedMem = true;
+ break;
+ }
// NEON f32 ops are non-IEEE 754 compliant. Darwin is ok with it by default.
uint64_t Bits = getFeatureBits();
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index 038eb76..bc5af96 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -148,6 +148,11 @@ protected:
/// precision.
bool FPOnlySP;
+ /// If true, the processor supports the Performance Monitor Extensions. These
+ /// include a generic cycle-counter as well as more fine-grained (often
+ /// implementation-specific) events.
+ bool HasPerfMon;
+
/// HasTrustZone - if true, processor supports TrustZone security extensions
bool HasTrustZone;
@@ -254,6 +259,7 @@ public:
bool hasVMLxForwarding() const { return HasVMLxForwarding; }
bool isFPBrccSlow() const { return SlowFPBrcc; }
bool isFPOnlySP() const { return FPOnlySP; }
+ bool hasPerfMon() const { return HasPerfMon; }
bool hasTrustZone() const { return HasTrustZone; }
bool prefers32BitThumb() const { return Pref32BitThumb; }
bool avoidCPSRPartialUpdate() const { return AvoidCPSRPartialUpdate; }
@@ -270,9 +276,8 @@ public:
bool isTargetIOS() const { return TargetTriple.getOS() == Triple::IOS; }
bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
- bool isTargetNaCl() const {
- return TargetTriple.getOS() == Triple::NaCl;
- }
+ bool isTargetNaCl() const { return TargetTriple.getOS() == Triple::NaCl; }
+ bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; }
bool isTargetELF() const { return !isTargetDarwin(); }
bool isAPCS_ABI() const { return TargetABI == ARM_ABI_APCS; }
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 42c7d2c..17c52c9 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -85,6 +85,7 @@ ARMTargetMachine::ARMTargetMachine(const Target &T, StringRef TT,
TLInfo(*this),
TSInfo(*this),
FrameLowering(Subtarget) {
+ initAsmInfo();
if (!Subtarget.hasARMOps())
report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not "
"support ARM mode execution!");
@@ -117,6 +118,7 @@ ThumbTargetMachine::ThumbTargetMachine(const Target &T, StringRef TT,
FrameLowering(Subtarget.hasThumb2()
? new ARMFrameLowering(Subtarget)
: (ARMFrameLowering*)new Thumb1FrameLowering(Subtarget)) {
+ initAsmInfo();
}
namespace {
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 114cc9e..c59ca64 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -49,6 +49,20 @@ class ARMAsmParser : public MCTargetAsmParser {
MCAsmParser &Parser;
const MCRegisterInfo *MRI;
+ // Unwind directives state
+ SMLoc FnStartLoc;
+ SMLoc CantUnwindLoc;
+ SMLoc PersonalityLoc;
+ SMLoc HandlerDataLoc;
+ int FPReg;
+ void resetUnwindDirectiveParserState() {
+ FnStartLoc = SMLoc();
+ CantUnwindLoc = SMLoc();
+ PersonalityLoc = SMLoc();
+ HandlerDataLoc = SMLoc();
+ FPReg = -1;
+ }
+
// Map of register aliases registers via the .req directive.
StringMap<unsigned> RegisterReqs;
@@ -76,7 +90,7 @@ class ARMAsmParser : public MCTargetAsmParser {
if (!inITBlock()) return;
// Move to the next instruction in the IT block, if there is one. If not,
// mark the block as done.
- unsigned TZ = CountTrailingZeros_32(ITState.Mask);
+ unsigned TZ = countTrailingZeros(ITState.Mask);
if (++ITState.CurPosition == 5 - TZ)
ITState.CurPosition = ~0U; // Done with the IT block after this.
}
@@ -86,11 +100,11 @@ class ARMAsmParser : public MCTargetAsmParser {
MCAsmLexer &getLexer() const { return Parser.getLexer(); }
bool Warning(SMLoc L, const Twine &Msg,
- ArrayRef<SMRange> Ranges = ArrayRef<SMRange>()) {
+ ArrayRef<SMRange> Ranges = None) {
return Parser.Warning(L, Msg, Ranges);
}
bool Error(SMLoc L, const Twine &Msg,
- ArrayRef<SMRange> Ranges = ArrayRef<SMRange>()) {
+ ArrayRef<SMRange> Ranges = None) {
return Parser.Error(L, Msg, Ranges);
}
@@ -113,6 +127,14 @@ class ARMAsmParser : public MCTargetAsmParser {
bool parseDirectiveUnreq(SMLoc L);
bool parseDirectiveArch(SMLoc L);
bool parseDirectiveEabiAttr(SMLoc L);
+ bool parseDirectiveFnStart(SMLoc L);
+ bool parseDirectiveFnEnd(SMLoc L);
+ bool parseDirectiveCantUnwind(SMLoc L);
+ bool parseDirectivePersonality(SMLoc L);
+ bool parseDirectiveHandlerData(SMLoc L);
+ bool parseDirectiveSetFP(SMLoc L);
+ bool parseDirectivePad(SMLoc L);
+ bool parseDirectiveRegSave(SMLoc L, bool IsVector);
StringRef splitMnemonic(StringRef Mnemonic, unsigned &PredicationCode,
bool &CarrySetting, unsigned &ProcessorIMod,
@@ -130,12 +152,19 @@ class ARMAsmParser : public MCTargetAsmParser {
bool isThumbTwo() const {
return isThumb() && (STI.getFeatureBits() & ARM::FeatureThumb2);
}
+ bool hasThumb() const {
+ return STI.getFeatureBits() & ARM::HasV4TOps;
+ }
bool hasV6Ops() const {
return STI.getFeatureBits() & ARM::HasV6Ops;
}
bool hasV7Ops() const {
return STI.getFeatureBits() & ARM::HasV7Ops;
}
+ bool hasARM() const {
+ return !(STI.getFeatureBits() & ARM::FeatureNoARM);
+ }
+
void SwitchMode() {
unsigned FB = ComputeAvailableFeatures(STI.ToggleFeature(ARM::ModeThumb));
setAvailableFeatures(FB);
@@ -161,6 +190,8 @@ class ARMAsmParser : public MCTargetAsmParser {
SmallVectorImpl<MCParsedAsmOperand*>&);
OperandMatchResultTy parseMemBarrierOptOperand(
SmallVectorImpl<MCParsedAsmOperand*>&);
+ OperandMatchResultTy parseInstSyncBarrierOptOperand(
+ SmallVectorImpl<MCParsedAsmOperand*>&);
OperandMatchResultTy parseProcIFlagsOperand(
SmallVectorImpl<MCParsedAsmOperand*>&);
OperandMatchResultTy parseMSRMaskOperand(
@@ -242,7 +273,7 @@ public:
};
ARMAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser)
- : MCTargetAsmParser(), STI(_STI), Parser(_Parser) {
+ : MCTargetAsmParser(), STI(_STI), Parser(_Parser), FPReg(-1) {
MCAsmParserExtension::Initialize(_Parser);
// Cache the MCRegisterInfo.
@@ -293,6 +324,7 @@ class ARMOperand : public MCParsedAsmOperand {
k_CoprocOption,
k_Immediate,
k_MemBarrierOpt,
+ k_InstSyncBarrierOpt,
k_Memory,
k_PostIndexRegister,
k_MSRMask,
@@ -336,6 +368,10 @@ class ARMOperand : public MCParsedAsmOperand {
ARM_MB::MemBOpt Val;
};
+ struct ISBOptOp {
+ ARM_ISB::InstSyncBOpt Val;
+ };
+
struct IFlagsOp {
ARM_PROC::IFlags Val;
};
@@ -422,6 +458,7 @@ class ARMOperand : public MCParsedAsmOperand {
struct CopOp Cop;
struct CoprocOptionOp CoprocOption;
struct MBOptOp MBOpt;
+ struct ISBOptOp ISBOpt;
struct ITMaskOp ITMask;
struct IFlagsOp IFlags;
struct MMaskOp MMask;
@@ -482,6 +519,8 @@ public:
case k_MemBarrierOpt:
MBOpt = o.MBOpt;
break;
+ case k_InstSyncBarrierOpt:
+ ISBOpt = o.ISBOpt;
+ break;
case k_Memory:
Memory = o.Memory;
break;
@@ -564,6 +603,11 @@ public:
return MBOpt.Val;
}
+ ARM_ISB::InstSyncBOpt getInstSyncBarrierOpt() const {
+ assert(Kind == k_InstSyncBarrierOpt && "Invalid access!");
+ return ISBOpt.Val;
+ }
+
ARM_PROC::IFlags getProcIFlags() const {
assert(Kind == k_ProcIFlags && "Invalid access!");
return IFlags.Val;
@@ -903,6 +947,7 @@ public:
bool isSPRRegList() const { return Kind == k_SPRRegisterList; }
bool isToken() const { return Kind == k_Token; }
bool isMemBarrierOpt() const { return Kind == k_MemBarrierOpt; }
+ bool isInstSyncBarrierOpt() const { return Kind == k_InstSyncBarrierOpt; }
bool isMem() const { return Kind == k_Memory; }
bool isShifterImm() const { return Kind == k_ShifterImmediate; }
bool isRegShiftedReg() const { return Kind == k_ShiftedRegister; }
@@ -949,7 +994,7 @@ public:
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
if (!CE) return false;
int64_t Val = CE->getValue();
- return Val > -4096 && Val < 4096;
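+ // The parser represents a "#-0" offset as INT32_MIN, so accept it here too.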
+ return (Val == INT32_MIN) || (Val > -4096 && Val < 4096);
}
bool isAddrMode3() const {
// If we have an immediate that's not a constant, treat it as a label
@@ -1680,6 +1725,11 @@ public:
Inst.addOperand(MCOperand::CreateImm(unsigned(getMemBarrierOpt())));
}
+ void addInstSyncBarrierOptOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(unsigned(getInstSyncBarrierOpt())));
+ }
+
void addMemNoOffsetOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
@@ -2345,6 +2395,15 @@ public:
return Op;
}
+ static ARMOperand *CreateInstSyncBarrierOpt(ARM_ISB::InstSyncBOpt Opt,
+ SMLoc S) {
+ ARMOperand *Op = new ARMOperand(k_InstSyncBarrierOpt);
+ Op->ISBOpt.Val = Opt;
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ return Op;
+ }
+
static ARMOperand *CreateProcIFlags(ARM_PROC::IFlags IFlags, SMLoc S) {
ARMOperand *Op = new ARMOperand(k_ProcIFlags);
Op->IFlags.Val = IFlags;
@@ -2399,6 +2458,9 @@ void ARMOperand::print(raw_ostream &OS) const {
case k_MemBarrierOpt:
OS << "<ARM_MB::" << MemBOptToString(getMemBarrierOpt()) << ">";
break;
+ case k_InstSyncBarrierOpt:
+ OS << "<ARM_ISB::" << InstSyncBOptToString(getInstSyncBarrierOpt()) << ">";
+ break;
case k_Memory:
OS << "<memory "
<< " base:" << Memory.BaseRegNum;
@@ -3036,7 +3098,7 @@ parseVectorLane(VectorLaneTy &LaneKind, unsigned &Index, SMLoc &EndLoc) {
// There's an optional '#' token here. Normally there wouldn't be, but
// inline assemble puts one in, and it's friendly to accept that.
if (Parser.getTok().is(AsmToken::Hash))
- Parser.Lex(); // Eat the '#'
+ Parser.Lex(); // Eat '#' or '$'.
const MCExpr *LaneIndex;
SMLoc Loc = Parser.getTok().getLoc();
@@ -3354,7 +3416,7 @@ parseMemBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
Tok.is(AsmToken::Dollar) ||
Tok.is(AsmToken::Integer)) {
if (Parser.getTok().isNot(AsmToken::Integer))
- Parser.Lex(); // Eat the '#'.
+ Parser.Lex(); // Eat '#' or '$'.
SMLoc Loc = Parser.getTok().getLoc();
const MCExpr *MemBarrierID;
@@ -3383,6 +3445,57 @@ parseMemBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
return MatchOperand_Success;
}
+/// parseInstSyncBarrierOptOperand - Try to parse ISB inst sync barrier options.
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseInstSyncBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ SMLoc S = Parser.getTok().getLoc();
+ const AsmToken &Tok = Parser.getTok();
+ unsigned Opt;
+
+ if (Tok.is(AsmToken::Identifier)) {
+ StringRef OptStr = Tok.getString();
+
+ if (OptStr.lower() == "sy")
+ Opt = ARM_ISB::SY;
+ else
+ return MatchOperand_NoMatch;
+
+ Parser.Lex(); // Eat identifier token.
+ } else if (Tok.is(AsmToken::Hash) ||
+ Tok.is(AsmToken::Dollar) ||
+ Tok.is(AsmToken::Integer)) {
+ if (Parser.getTok().isNot(AsmToken::Integer))
+ Parser.Lex(); // Eat '#' or '$'.
+ SMLoc Loc = Parser.getTok().getLoc();
+
+ const MCExpr *ISBarrierID;
+ if (getParser().parseExpression(ISBarrierID)) {
+ Error(Loc, "illegal expression");
+ return MatchOperand_ParseFail;
+ }
+
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ISBarrierID);
+ if (!CE) {
+ Error(Loc, "constant expression expected");
+ return MatchOperand_ParseFail;
+ }
+
+ int Val = CE->getValue();
+ if (Val & ~0xf) {
+ Error(Loc, "immediate value out of range");
+ return MatchOperand_ParseFail;
+ }
+
+ Opt = ARM_ISB::RESERVED_0 + Val;
+ } else
+ return MatchOperand_ParseFail;
+
+ Operands.push_back(ARMOperand::CreateInstSyncBarrierOpt(
+ (ARM_ISB::InstSyncBOpt)Opt, S));
+ return MatchOperand_Success;
+}
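+// This accepts, e.g., "isb sy" or an explicit 4-bit immediate such as
+// "isb #15" (15 being the SY encoding); other values map onto the
+// reserved encodings.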
+
+
/// parseProcIFlagsOperand - Try to parse iflags from CPS instruction.
ARMAsmParser::OperandMatchResultTy ARMAsmParser::
parseProcIFlagsOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
@@ -3602,7 +3715,7 @@ parseSetEndImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
Error(S, "'be' or 'le' operand expected");
return MatchOperand_ParseFail;
}
- int Val = StringSwitch<int>(Tok.getString())
+ int Val = StringSwitch<int>(Tok.getString().lower())
.Case("be", 1)
.Case("le", 0)
.Default(-1);
@@ -3875,7 +3988,7 @@ parseAM3Offset(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Do immediates first, as we always parse those if we have a '#'.
if (Parser.getTok().is(AsmToken::Hash) ||
Parser.getTok().is(AsmToken::Dollar)) {
- Parser.Lex(); // Eat the '#'.
+ Parser.Lex(); // Eat '#' or '$'.
// Explicitly look for a '-', as we need to encode negative zero
// differently.
bool isNegative = Parser.getTok().is(AsmToken::Minus);
@@ -4354,7 +4467,7 @@ parseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
Parser.getTok().is(AsmToken::Dollar) ||
Parser.getTok().is(AsmToken::Integer)) {
if (Parser.getTok().isNot(AsmToken::Integer))
- Parser.Lex(); // Eat the '#'.
+ Parser.Lex(); // Eat '#' or '$'.
E = Parser.getTok().getLoc();
bool isNegative = getParser().getTok().is(AsmToken::Minus);
@@ -4536,7 +4649,7 @@ parseFPImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
TyOp->getToken() != ".f64"))
return MatchOperand_NoMatch;
- Parser.Lex(); // Eat the '#'.
+ Parser.Lex(); // Eat '#' or '$'.
// Handle negation, as that still comes through as a separate token.
bool isNegative = false;
@@ -7398,11 +7511,10 @@ processInstruction(MCInst &Inst,
MCOperand &MO = Inst.getOperand(1);
unsigned Mask = MO.getImm();
unsigned OrigMask = Mask;
- unsigned TZ = CountTrailingZeros_32(Mask);
+ unsigned TZ = countTrailingZeros(Mask);
if ((Inst.getOperand(0).getImm() & 1) == 0) {
assert(Mask && TZ <= 3 && "illegal IT mask value!");
- for (unsigned i = 3; i != TZ; --i)
- Mask ^= 1 << i;
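+ // Flip the mask bits above the lowest set bit; e.g. for TZ == 1,
+ // (0xE << 1) & 0xF == 0xC, the same bits the removed loop toggled.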
+ Mask ^= (0xE << TZ) & 0xF;
}
MO.setImm(Mask);
@@ -7658,6 +7770,24 @@ bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) {
return parseDirectiveArch(DirectiveID.getLoc());
else if (IDVal == ".eabi_attribute")
return parseDirectiveEabiAttr(DirectiveID.getLoc());
+ else if (IDVal == ".fnstart")
+ return parseDirectiveFnStart(DirectiveID.getLoc());
+ else if (IDVal == ".fnend")
+ return parseDirectiveFnEnd(DirectiveID.getLoc());
+ else if (IDVal == ".cantunwind")
+ return parseDirectiveCantUnwind(DirectiveID.getLoc());
+ else if (IDVal == ".personality")
+ return parseDirectivePersonality(DirectiveID.getLoc());
+ else if (IDVal == ".handlerdata")
+ return parseDirectiveHandlerData(DirectiveID.getLoc());
+ else if (IDVal == ".setfp")
+ return parseDirectiveSetFP(DirectiveID.getLoc());
+ else if (IDVal == ".pad")
+ return parseDirectivePad(DirectiveID.getLoc());
+ else if (IDVal == ".save")
+ return parseDirectiveRegSave(DirectiveID.getLoc(), false);
+ else if (IDVal == ".vsave")
+ return parseDirectiveRegSave(DirectiveID.getLoc(), true);
return true;
}
@@ -7693,6 +7823,9 @@ bool ARMAsmParser::parseDirectiveThumb(SMLoc L) {
return Error(L, "unexpected token in directive");
Parser.Lex();
+ if (!hasThumb())
+ return Error(L, "target does not support Thumb mode");
+
if (!isThumb())
SwitchMode();
getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
@@ -7706,6 +7839,9 @@ bool ARMAsmParser::parseDirectiveARM(SMLoc L) {
return Error(L, "unexpected token in directive");
Parser.Lex();
+ if (!hasARM())
+ return Error(L, "target does not support ARM mode");
+
if (isThumb())
SwitchMode();
getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
@@ -7795,10 +7931,16 @@ bool ARMAsmParser::parseDirectiveCode(SMLoc L) {
Parser.Lex();
if (Val == 16) {
+ if (!hasThumb())
+ return Error(L, "target does not support Thumb mode");
+
if (!isThumb())
SwitchMode();
getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
} else {
+ if (!hasARM())
+ return Error(L, "target does not support ARM mode");
+
if (isThumb())
SwitchMode();
getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
@@ -7858,6 +8000,219 @@ bool ARMAsmParser::parseDirectiveEabiAttr(SMLoc L) {
return true;
}
+/// parseDirectiveFnStart
+/// ::= .fnstart
+bool ARMAsmParser::parseDirectiveFnStart(SMLoc L) {
+ if (FnStartLoc.isValid()) {
+ Error(L, ".fnstart starts before the end of previous one");
+ Error(FnStartLoc, "previous .fnstart starts here");
+ return true;
+ }
+
+ FnStartLoc = L;
+ getParser().getStreamer().EmitFnStart();
+ return false;
+}
+
+/// parseDirectiveFnEnd
+/// ::= .fnend
+bool ARMAsmParser::parseDirectiveFnEnd(SMLoc L) {
+ // Check the ordering of unwind directives
+ if (!FnStartLoc.isValid())
+ return Error(L, ".fnstart must precede .fnend directive");
+
+ // Reset the unwind directives parser state
+ resetUnwindDirectiveParserState();
+
+ getParser().getStreamer().EmitFnEnd();
+ return false;
+}
+
+/// parseDirectiveCantUnwind
+/// ::= .cantunwind
+bool ARMAsmParser::parseDirectiveCantUnwind(SMLoc L) {
+ // Check the ordering of unwind directives
+ CantUnwindLoc = L;
+ if (!FnStartLoc.isValid())
+ return Error(L, ".fnstart must precede .cantunwind directive");
+ if (HandlerDataLoc.isValid()) {
+ Error(L, ".cantunwind can't be used with .handlerdata directive");
+ Error(HandlerDataLoc, ".handlerdata was specified here");
+ return true;
+ }
+ if (PersonalityLoc.isValid()) {
+ Error(L, ".cantunwind can't be used with .personality directive");
+ Error(PersonalityLoc, ".personality was specified here");
+ return true;
+ }
+
+ getParser().getStreamer().EmitCantUnwind();
+ return false;
+}
+
+/// parseDirectivePersonality
+/// ::= .personality name
+bool ARMAsmParser::parseDirectivePersonality(SMLoc L) {
+ // Check the ordering of unwind directives
+ PersonalityLoc = L;
+ if (!FnStartLoc.isValid())
+ return Error(L, ".fnstart must precede .personality directive");
+ if (CantUnwindLoc.isValid()) {
+ Error(L, ".personality can't be used with .cantunwind directive");
+ Error(CantUnwindLoc, ".cantunwind was specified here");
+ return true;
+ }
+ if (HandlerDataLoc.isValid()) {
+ Error(L, ".personality must precede .handlerdata directive");
+ Error(HandlerDataLoc, ".handlerdata was specified here");
+ return true;
+ }
+
+ // Parse the name of the personality routine
+ if (Parser.getTok().isNot(AsmToken::Identifier)) {
+ Parser.eatToEndOfStatement();
+ return Error(L, "unexpected input in .personality directive.");
+ }
+ StringRef Name(Parser.getTok().getIdentifier());
+ Parser.Lex();
+
+ MCSymbol *PR = getParser().getContext().GetOrCreateSymbol(Name);
+ getParser().getStreamer().EmitPersonality(PR);
+ return false;
+}
+
+/// parseDirectiveHandlerData
+/// ::= .handlerdata
+bool ARMAsmParser::parseDirectiveHandlerData(SMLoc L) {
+ // Check the ordering of unwind directives
+ HandlerDataLoc = L;
+ if (!FnStartLoc.isValid())
+ return Error(L, ".fnstart must precede .personality directive");
+ if (CantUnwindLoc.isValid()) {
+ Error(L, ".handlerdata can't be used with .cantunwind directive");
+ Error(CantUnwindLoc, ".cantunwind was specified here");
+ return true;
+ }
+
+ getParser().getStreamer().EmitHandlerData();
+ return false;
+}
+
+/// parseDirectiveSetFP
+/// ::= .setfp fpreg, spreg [, offset]
+bool ARMAsmParser::parseDirectiveSetFP(SMLoc L) {
+ // Check the ordering of unwind directives
+ if (!FnStartLoc.isValid())
+ return Error(L, ".fnstart must precede .setfp directive");
+ if (HandlerDataLoc.isValid())
+ return Error(L, ".setfp must precede .handlerdata directive");
+
+ // Parse fpreg
+ SMLoc NewFPRegLoc = Parser.getTok().getLoc();
+ int NewFPReg = tryParseRegister();
+ if (NewFPReg == -1)
+ return Error(NewFPRegLoc, "frame pointer register expected");
+
+ // Consume comma
+ if (!Parser.getTok().is(AsmToken::Comma))
+ return Error(Parser.getTok().getLoc(), "comma expected");
+ Parser.Lex(); // skip comma
+
+ // Parse spreg
+ SMLoc NewSPRegLoc = Parser.getTok().getLoc();
+ int NewSPReg = tryParseRegister();
+ if (NewSPReg == -1)
+ return Error(NewSPRegLoc, "stack pointer register expected");
+
+ if (NewSPReg != ARM::SP && NewSPReg != FPReg)
+ return Error(NewSPRegLoc,
+ "register should be either $sp or the latest fp register");
+
+ // Update the frame pointer register
+ FPReg = NewFPReg;
+
+ // Parse offset
+ int64_t Offset = 0;
+ if (Parser.getTok().is(AsmToken::Comma)) {
+ Parser.Lex(); // skip comma
+
+ if (Parser.getTok().isNot(AsmToken::Hash) &&
+ Parser.getTok().isNot(AsmToken::Dollar)) {
+ return Error(Parser.getTok().getLoc(), "'#' expected");
+ }
+ Parser.Lex(); // skip hash token.
+
+ const MCExpr *OffsetExpr;
+ SMLoc ExLoc = Parser.getTok().getLoc();
+ SMLoc EndLoc;
+ if (getParser().parseExpression(OffsetExpr, EndLoc))
+ return Error(ExLoc, "malformed setfp offset");
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(OffsetExpr);
+ if (!CE)
+ return Error(ExLoc, "setfp offset must be an immediate");
+
+ Offset = CE->getValue();
+ }
+
+ getParser().getStreamer().EmitSetFP(static_cast<unsigned>(NewFPReg),
+ static_cast<unsigned>(NewSPReg),
+ Offset);
+ return false;
+}
+
+/// parseDirectivePad
+/// ::= .pad offset
+bool ARMAsmParser::parseDirectivePad(SMLoc L) {
+ // Check the ordering of unwind directives
+ if (!FnStartLoc.isValid())
+ return Error(L, ".fnstart must precede .pad directive");
+ if (HandlerDataLoc.isValid())
+ return Error(L, ".pad must precede .handlerdata directive");
+
+ // Parse the offset
+ if (Parser.getTok().isNot(AsmToken::Hash) &&
+ Parser.getTok().isNot(AsmToken::Dollar)) {
+ return Error(Parser.getTok().getLoc(), "'#' expected");
+ }
+ Parser.Lex(); // skip hash token.
+
+ const MCExpr *OffsetExpr;
+ SMLoc ExLoc = Parser.getTok().getLoc();
+ SMLoc EndLoc;
+ if (getParser().parseExpression(OffsetExpr, EndLoc))
+ return Error(ExLoc, "malformed pad offset");
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(OffsetExpr);
+ if (!CE)
+ return Error(ExLoc, "pad offset must be an immediate");
+
+ getParser().getStreamer().EmitPad(CE->getValue());
+ return false;
+}
+
+/// parseDirectiveRegSave
+/// ::= .save { registers }
+/// ::= .vsave { registers }
+bool ARMAsmParser::parseDirectiveRegSave(SMLoc L, bool IsVector) {
+ // Check the ordering of unwind directives
+ if (!FnStartLoc.isValid())
+ return Error(L, ".fnstart must precede .save or .vsave directives");
+ if (HandlerDataLoc.isValid())
+ return Error(L, ".save or .vsave must precede .handlerdata directive");
+
+ // Parse the register list
+ SmallVector<MCParsedAsmOperand*, 1> Operands;
+ if (parseRegisterList(Operands))
+ return true;
+ ARMOperand *Op = (ARMOperand*)Operands[0];
+ if (!IsVector && !Op->isRegList())
+ return Error(L, ".save expects GPR registers");
+ if (IsVector && !Op->isDPRRegList())
+ return Error(L, ".vsave expects DPR registers");
+
+ getParser().getStreamer().EmitRegSave(Op->getRegList(), IsVector);
+ return false;
+}
+
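+// Illustrative sequence of unwind directives accepted by the parsers above:
+//     .fnstart
+//     .save  {r4, r5, lr}
+//     .setfp r7, sp, #8
+//     .pad   #16
+//     .personality __gxx_personality_v0
+//     .handlerdata
+//     .fnend
+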
/// Force static initialization.
extern "C" void LLVMInitializeARMAsmParser() {
RegisterMCAsmParser<ARMAsmParser> X(TheARMTarget);
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index ac937f3..a6eab33 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -65,7 +65,7 @@ namespace {
void setITState(char Firstcond, char Mask) {
// (3 - the number of trailing zeros) is the number of then / else.
unsigned CondBit0 = Firstcond & 1;
- unsigned NumTZ = CountTrailingZeros_32(Mask);
+ unsigned NumTZ = countTrailingZeros<uint8_t>(Mask);
unsigned char CCBits = static_cast<unsigned char>(Firstcond & 0xf);
assert(NumTZ <= 3 && "Invalid IT mask!");
// push condition codes onto the stack the correct order for the pops
@@ -156,12 +156,17 @@ static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeGPRnopcRegisterClass(MCInst &Inst,
unsigned RegNo, uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeGPRwithAPSRRegisterClass(MCInst &Inst,
+ unsigned RegNo, uint64_t Address,
+ const void *Decoder);
static DecodeStatus DecodetGPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo,
@@ -236,6 +241,14 @@ static DecodeStatus DecodeBranchImmInstruction(MCInst &Inst,unsigned Insn,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeAddrMode6Operand(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLDST1Instruction(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLDST2Instruction(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLDST3Instruction(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLDST4Instruction(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeVSTInstruction(MCInst &Inst, unsigned Val,
@@ -268,6 +281,8 @@ static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeInstSyncBarrierOption(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeDoubleRegLoad(MCInst &Inst, unsigned Insn,
@@ -348,6 +363,8 @@ static DecodeStatus DecodeThumbAddSPReg(MCInst &Inst, uint16_t Insn,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeThumbCPS(MCInst &Inst, uint16_t Insn,
uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeQADDInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeThumbBLXOffset(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val,
@@ -402,7 +419,7 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
"Asked to disassemble an ARM instruction but Subtarget is in Thumb mode!");
// We want to read exactly 4 bytes of data.
- if (Region.readBytes(Address, 4, (uint8_t*)bytes, NULL) == -1) {
+ if (Region.readBytes(Address, 4, bytes) == -1) {
Size = 0;
return MCDisassembler::Fail;
}
@@ -492,102 +509,9 @@ static bool tryAddingSymbolicOperand(uint64_t Address, int32_t Value,
bool isBranch, uint64_t InstSize,
MCInst &MI, const void *Decoder) {
const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
- LLVMOpInfoCallback getOpInfo = Dis->getLLVMOpInfoCallback();
- struct LLVMOpInfo1 SymbolicOp;
- memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
- SymbolicOp.Value = Value;
- void *DisInfo = Dis->getDisInfoBlock();
-
- if (!getOpInfo ||
- !getOpInfo(DisInfo, Address, 0 /* Offset */, InstSize, 1, &SymbolicOp)) {
- // Clear SymbolicOp.Value from above and also all other fields.
- memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
- LLVMSymbolLookupCallback SymbolLookUp = Dis->getLLVMSymbolLookupCallback();
- if (!SymbolLookUp)
- return false;
- uint64_t ReferenceType;
- if (isBranch)
- ReferenceType = LLVMDisassembler_ReferenceType_In_Branch;
- else
- ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
- const char *ReferenceName;
- uint64_t SymbolValue = 0x00000000ffffffffULL & Value;
- const char *Name = SymbolLookUp(DisInfo, SymbolValue, &ReferenceType,
- Address, &ReferenceName);
- if (Name) {
- SymbolicOp.AddSymbol.Name = Name;
- SymbolicOp.AddSymbol.Present = true;
- }
- // For branches always create an MCExpr so it gets printed as hex address.
- else if (isBranch) {
- SymbolicOp.Value = Value;
- }
- if(ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub)
- (*Dis->CommentStream) << "symbol stub for: " << ReferenceName;
- if (!Name && !isBranch)
- return false;
- }
-
- MCContext *Ctx = Dis->getMCContext();
- const MCExpr *Add = NULL;
- if (SymbolicOp.AddSymbol.Present) {
- if (SymbolicOp.AddSymbol.Name) {
- StringRef Name(SymbolicOp.AddSymbol.Name);
- MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
- Add = MCSymbolRefExpr::Create(Sym, *Ctx);
- } else {
- Add = MCConstantExpr::Create(SymbolicOp.AddSymbol.Value, *Ctx);
- }
- }
-
- const MCExpr *Sub = NULL;
- if (SymbolicOp.SubtractSymbol.Present) {
- if (SymbolicOp.SubtractSymbol.Name) {
- StringRef Name(SymbolicOp.SubtractSymbol.Name);
- MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
- Sub = MCSymbolRefExpr::Create(Sym, *Ctx);
- } else {
- Sub = MCConstantExpr::Create(SymbolicOp.SubtractSymbol.Value, *Ctx);
- }
- }
-
- const MCExpr *Off = NULL;
- if (SymbolicOp.Value != 0)
- Off = MCConstantExpr::Create(SymbolicOp.Value, *Ctx);
-
- const MCExpr *Expr;
- if (Sub) {
- const MCExpr *LHS;
- if (Add)
- LHS = MCBinaryExpr::CreateSub(Add, Sub, *Ctx);
- else
- LHS = MCUnaryExpr::CreateMinus(Sub, *Ctx);
- if (Off != 0)
- Expr = MCBinaryExpr::CreateAdd(LHS, Off, *Ctx);
- else
- Expr = LHS;
- } else if (Add) {
- if (Off != 0)
- Expr = MCBinaryExpr::CreateAdd(Add, Off, *Ctx);
- else
- Expr = Add;
- } else {
- if (Off != 0)
- Expr = Off;
- else
- Expr = MCConstantExpr::Create(0, *Ctx);
- }
-
- if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_ARM_HI16)
- MI.addOperand(MCOperand::CreateExpr(ARMMCExpr::CreateUpper16(Expr, *Ctx)));
- else if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_ARM_LO16)
- MI.addOperand(MCOperand::CreateExpr(ARMMCExpr::CreateLower16(Expr, *Ctx)));
- else if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_None)
- MI.addOperand(MCOperand::CreateExpr(Expr));
- else
- llvm_unreachable("bad SymbolicOp.VariantKind");
-
- return true;
+ // FIXME: Does it make sense for value to be negative?
+ return Dis->tryAddingSymbolicOperand(MI, (uint32_t)Value, Address, isBranch,
+ /* Offset */ 0, InstSize);
}
/// tryAddingPcLoadReferenceComment - tries to add a comment as to what is being
@@ -602,17 +526,7 @@ static bool tryAddingSymbolicOperand(uint64_t Address, int32_t Value,
static void tryAddingPcLoadReferenceComment(uint64_t Address, int Value,
const void *Decoder) {
const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
- LLVMSymbolLookupCallback SymbolLookUp = Dis->getLLVMSymbolLookupCallback();
- if (SymbolLookUp) {
- void *DisInfo = Dis->getDisInfoBlock();
- uint64_t ReferenceType;
- ReferenceType = LLVMDisassembler_ReferenceType_In_PCrel_Load;
- const char *ReferenceName;
- (void)SymbolLookUp(DisInfo, Value, &ReferenceType, Address, &ReferenceName);
- if(ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr ||
- ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr)
- (*Dis->CommentStream) << "literal pool for: " << ReferenceName;
- }
+ Dis->tryAddingPcLoadReferenceComment(Value, Address);
}
// Thumb1 instructions don't have explicit S bits. Rather, they
@@ -751,7 +665,7 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
"Asked to disassemble in Thumb mode but Subtarget is in ARM mode!");
// We want to read exactly 2 bytes of data.
- if (Region.readBytes(Address, 2, (uint8_t*)bytes, NULL) == -1) {
+ if (Region.readBytes(Address, 2, bytes) == -1) {
Size = 0;
return MCDisassembler::Fail;
}
@@ -803,7 +717,7 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
// We want to read exactly 4 bytes of data.
- if (Region.readBytes(Address, 4, (uint8_t*)bytes, NULL) == -1) {
+ if (Region.readBytes(Address, 4, bytes) == -1) {
Size = 0;
return MCDisassembler::Fail;
}
@@ -920,6 +834,21 @@ DecodeGPRnopcRegisterClass(MCInst &Inst, unsigned RegNo,
return S;
}
+static DecodeStatus
+DecodeGPRwithAPSRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ if (RegNo == 15)
+ {
+ Inst.addOperand(MCOperand::CreateReg(ARM::APSR_NZCV));
+ return MCDisassembler::Success;
+ }
+
+ Check(S, DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder));
+ return S;
+}
+
static DecodeStatus DecodetGPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder) {
if (RegNo > 7)
@@ -927,6 +856,26 @@ static DecodeStatus DecodetGPRRegisterClass(MCInst &Inst, unsigned RegNo,
return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder);
}
+static const uint16_t GPRPairDecoderTable[] = {
+ ARM::R0_R1, ARM::R2_R3, ARM::R4_R5, ARM::R6_R7,
+ ARM::R8_R9, ARM::R10_R11, ARM::R12_SP
+};
+
+static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ if (RegNo > 13)
+ return MCDisassembler::Fail;
+
+ if ((RegNo & 1) || RegNo == 0xe)
+ S = MCDisassembler::SoftFail;
+
+ unsigned RegisterPair = GPRPairDecoderTable[RegNo/2];
+ Inst.addOperand(MCOperand::CreateReg(RegisterPair));
+ return S;
+}
+
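A minimal sketch of the pairing rule the new DecodeGPRPairRegisterClass enforces; PairNames is an illustrative stand-in for the ARM::R0_R1 ... ARM::R12_SP enums, not LLVM's own table:

#include <cstdio>

static const char *const PairNames[] = {
  "R0_R1", "R2_R3", "R4_R5", "R6_R7", "R8_R9", "R10_R11", "R12_SP"
};

int main() {
  for (unsigned RegNo = 0; RegNo <= 13; ++RegNo) {
    // An odd base register is UNPREDICTABLE, hence SoftFail in the decoder.
    bool SoftFail = (RegNo & 1) != 0;
    printf("RegNo %2u -> %-7s%s\n", RegNo, PairNames[RegNo / 2],
           SoftFail ? " (SoftFail)" : "");
  }
  return 0;
}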
static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder) {
unsigned Register = 0;
@@ -1030,7 +979,7 @@ static const uint16_t QPRDecoderTable[] = {
static DecodeStatus DecodeQPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder) {
- if (RegNo > 31)
+ if (RegNo > 31 || (RegNo & 1) != 0)
return MCDisassembler::Fail;
RegNo >>= 1;
@@ -1206,7 +1155,7 @@ static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val,
}
// Empty register lists are not allowed.
- if (CountPopulation_32(Val) == 0) return MCDisassembler::Fail;
+ if (Val == 0) return MCDisassembler::Fail;
for (unsigned i = 0; i < 16; ++i) {
if (Val & (1 << i)) {
if (!Check(S, DecodeGPRRegisterClass(Inst, i, Address, Decoder)))
@@ -1227,6 +1176,13 @@ static DecodeStatus DecodeSPRRegListOperand(MCInst &Inst, unsigned Val,
unsigned Vd = fieldFromInstruction(Val, 8, 5);
unsigned regs = fieldFromInstruction(Val, 0, 8);
+ // In case of unpredictable encoding, tweak the operands.
+ if (regs == 0 || (Vd + regs) > 32) {
+ regs = Vd + regs > 32 ? 32 - Vd : regs;
+ regs = std::max( 1u, regs);
+ S = MCDisassembler::SoftFail;
+ }
+
if (!Check(S, DecodeSPRRegisterClass(Inst, Vd, Address, Decoder)))
return MCDisassembler::Fail;
for (unsigned i = 0; i < (regs - 1); ++i) {
@@ -1242,9 +1198,15 @@ static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val,
DecodeStatus S = MCDisassembler::Success;
unsigned Vd = fieldFromInstruction(Val, 8, 5);
- unsigned regs = fieldFromInstruction(Val, 0, 8);
+ unsigned regs = fieldFromInstruction(Val, 1, 7);
- regs = regs >> 1;
+ // In case of unpredictable encoding, tweak the operands.
+ if (regs == 0 || regs > 16 || (Vd + regs) > 32) {
+ regs = Vd + regs > 32 ? 32 - Vd : regs;
+ regs = std::max( 1u, regs);
+ regs = std::min(16u, regs);
+ S = MCDisassembler::SoftFail;
+ }
if (!Check(S, DecodeDPRRegisterClass(Inst, Vd, Address, Decoder)))
return MCDisassembler::Fail;
@@ -1797,6 +1759,29 @@ static DecodeStatus DecodeRFEInstruction(MCInst &Inst, unsigned Insn,
return S;
}
+static DecodeStatus DecodeQADDInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+ unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+ unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+ unsigned pred = fieldFromInstruction(Insn, 28, 4);
+
+ if (pred == 0xF)
+ return DecodeCPSInstruction(Inst, Insn, Address, Decoder);
+
+ if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+ return MCDisassembler::Fail;
+ return S;
+}
+
static DecodeStatus DecodeMemMultipleWritebackInstruction(MCInst &Inst,
unsigned Insn,
uint64_t Address, const void *Decoder) {
@@ -1807,6 +1792,7 @@ static DecodeStatus DecodeMemMultipleWritebackInstruction(MCInst &Inst,
unsigned reglist = fieldFromInstruction(Insn, 0, 16);
if (pred == 0xF) {
+ // Ambiguous with RFE and SRS
switch (Inst.getOpcode()) {
case ARM::LDMDA:
Inst.setOpcode(ARM::RFEDA);
@@ -1857,11 +1843,16 @@ static DecodeStatus DecodeMemMultipleWritebackInstruction(MCInst &Inst,
Inst.setOpcode(ARM::SRSIB_UPD);
break;
default:
- if (!Check(S, MCDisassembler::Fail)) return MCDisassembler::Fail;
+ return MCDisassembler::Fail;
}
// For stores (which become SRS's), the only operand is the mode.
if (fieldFromInstruction(Insn, 20, 1) == 0) {
+ // Check SRS encoding constraints
+ if (!(fieldFromInstruction(Insn, 22, 1) == 1 &&
+ fieldFromInstruction(Insn, 20, 1) == 0))
+ return MCDisassembler::Fail;
+
Inst.addOperand(
MCOperand::CreateImm(fieldFromInstruction(Insn, 0, 4)));
return S;
@@ -1891,6 +1882,13 @@ static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn,
DecodeStatus S = MCDisassembler::Success;
+ // This decoder is called from multiple locations that do not check that
+ // the full encoding is valid before calling it.
+ if (fieldFromInstruction(Insn, 5, 1) != 0 ||
+ fieldFromInstruction(Insn, 16, 1) != 0 ||
+ fieldFromInstruction(Insn, 20, 8) != 0x10)
+ return MCDisassembler::Fail;
+
// imod == '01' --> UNPREDICTABLE
// NOTE: Even though this is technically UNPREDICTABLE, we choose to
// return failure here. The '01' imod value is unprintable, so there's
@@ -2432,6 +2430,57 @@ static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Insn,
return S;
}
+static DecodeStatus DecodeVLDST1Instruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ unsigned type = fieldFromInstruction(Insn, 8, 4);
+ unsigned align = fieldFromInstruction(Insn, 4, 2);
+ if (type == 6 && (align & 2)) return MCDisassembler::Fail;
+ if (type == 7 && (align & 2)) return MCDisassembler::Fail;
+ if (type == 10 && align == 3) return MCDisassembler::Fail;
+
+ unsigned load = fieldFromInstruction(Insn, 21, 1);
+ return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
+ : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
+}
+
+static DecodeStatus DecodeVLDST2Instruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ unsigned size = fieldFromInstruction(Insn, 6, 2);
+ if (size == 3) return MCDisassembler::Fail;
+
+ unsigned type = fieldFromInstruction(Insn, 8, 4);
+ unsigned align = fieldFromInstruction(Insn, 4, 2);
+ if (type == 8 && align == 3) return MCDisassembler::Fail;
+ if (type == 9 && align == 3) return MCDisassembler::Fail;
+
+ unsigned load = fieldFromInstruction(Insn, 21, 1);
+ return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
+ : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
+}
+
+static DecodeStatus DecodeVLDST3Instruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ unsigned size = fieldFromInstruction(Insn, 6, 2);
+ if (size == 3) return MCDisassembler::Fail;
+
+ unsigned align = fieldFromInstruction(Insn, 4, 2);
+ if (align & 2) return MCDisassembler::Fail;
+
+ unsigned load = fieldFromInstruction(Insn, 21, 1);
+ return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
+ : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
+}
+
+static DecodeStatus DecodeVLDST4Instruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ unsigned size = fieldFromInstruction(Insn, 6, 2);
+ if (size == 3) return MCDisassembler::Fail;
+
+ unsigned load = fieldFromInstruction(Insn, 21, 1);
+ return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
+ : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
+}
+
static DecodeStatus DecodeVSTInstruction(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
@@ -3536,6 +3585,15 @@ static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Val,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeInstSyncBarrierOption(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ if (Val & ~0xf)
+ return MCDisassembler::Fail;
+
+ Inst.addOperand(MCOperand::CreateImm(Val));
+ return MCDisassembler::Success;
+}
+
static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder) {
if (!Val) return MCDisassembler::Fail;
@@ -3551,11 +3609,10 @@ static DecodeStatus DecodeDoubleRegLoad(MCInst &Inst, unsigned Insn,
unsigned Rn = fieldFromInstruction(Insn, 16, 4);
unsigned pred = fieldFromInstruction(Insn, 28, 4);
- if ((Rt & 1) || Rt == 0xE || Rn == 0xF) return MCDisassembler::Fail;
+ if (Rn == 0xF)
+ S = MCDisassembler::SoftFail;
- if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
- return MCDisassembler::Fail;
- if (!Check(S, DecodeGPRRegisterClass(Inst, Rt+1, Address, Decoder)))
+ if (!Check(S, DecodeGPRPairRegisterClass(Inst, Rt, Address, Decoder)))
return MCDisassembler::Fail;
if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
return MCDisassembler::Fail;
@@ -3565,7 +3622,6 @@ static DecodeStatus DecodeDoubleRegLoad(MCInst &Inst, unsigned Insn,
return S;
}
-
static DecodeStatus DecodeDoubleRegStore(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder){
DecodeStatus S = MCDisassembler::Success;
@@ -3578,12 +3634,10 @@ static DecodeStatus DecodeDoubleRegStore(MCInst &Inst, unsigned Insn,
if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rd, Address, Decoder)))
return MCDisassembler::Fail;
- if ((Rt & 1) || Rt == 0xE || Rn == 0xF) return MCDisassembler::Fail;
- if (Rd == Rn || Rd == Rt || Rd == Rt+1) return MCDisassembler::Fail;
+ if (Rn == 0xF || Rd == Rn || Rd == Rt || Rd == Rt+1)
+ S = MCDisassembler::SoftFail;
- if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
- return MCDisassembler::Fail;
- if (!Check(S, DecodeGPRRegisterClass(Inst, Rt+1, Address, Decoder)))
+ if (!Check(S, DecodeGPRPairRegisterClass(Inst, Rt, Address, Decoder)))
return MCDisassembler::Fail;
if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
return MCDisassembler::Fail;
@@ -4453,16 +4507,18 @@ static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn,
Vm |= (fieldFromInstruction(Insn, 5, 1) << 4);
unsigned imm = fieldFromInstruction(Insn, 16, 6);
unsigned cmode = fieldFromInstruction(Insn, 8, 4);
+ unsigned op = fieldFromInstruction(Insn, 5, 1);
DecodeStatus S = MCDisassembler::Success;
// VMOVv2f32 is ambiguous with these decodings.
if (!(imm & 0x38) && cmode == 0xF) {
+ if (op == 1) return MCDisassembler::Fail;
Inst.setOpcode(ARM::VMOVv2f32);
return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder);
}
- if (!(imm & 0x20)) Check(S, MCDisassembler::SoftFail);
+ if (!(imm & 0x20)) return MCDisassembler::Fail;
if (!Check(S, DecodeDPRRegisterClass(Inst, Vd, Address, Decoder)))
return MCDisassembler::Fail;
@@ -4481,16 +4537,18 @@ static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn,
Vm |= (fieldFromInstruction(Insn, 5, 1) << 4);
unsigned imm = fieldFromInstruction(Insn, 16, 6);
unsigned cmode = fieldFromInstruction(Insn, 8, 4);
+ unsigned op = fieldFromInstruction(Insn, 5, 1);
DecodeStatus S = MCDisassembler::Success;
// VMOVv4f32 is ambiguous with these decodings.
if (!(imm & 0x38) && cmode == 0xF) {
+ if (op == 1) return MCDisassembler::Fail;
Inst.setOpcode(ARM::VMOVv4f32);
return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder);
}
- if (!(imm & 0x20)) Check(S, MCDisassembler::SoftFail);
+ if (!(imm & 0x20)) return MCDisassembler::Fail;
if (!Check(S, DecodeQPRRegisterClass(Inst, Vd, Address, Decoder)))
return MCDisassembler::Fail;
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
index 3bcd083..7fef795 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -660,8 +660,8 @@ void ARMInstPrinter::printBitfieldInvMaskImmOperand(const MCInst *MI,
raw_ostream &O) {
const MCOperand &MO = MI->getOperand(OpNum);
uint32_t v = ~MO.getImm();
- int32_t lsb = CountTrailingZeros_32(v);
- int32_t width = (32 - CountLeadingZeros_32 (v)) - lsb;
+ int32_t lsb = countTrailingZeros(v);
+ int32_t width = (32 - countLeadingZeros (v)) - lsb;
assert(MO.isImm() && "Not a valid bf_inv_mask_imm value!");
O << markup("<imm:") << '#' << lsb << markup(">")
<< ", "
@@ -674,6 +674,12 @@ void ARMInstPrinter::printMemBOption(const MCInst *MI, unsigned OpNum,
O << ARM_MB::MemBOptToString(val);
}
+void ARMInstPrinter::printInstSyncBOption(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ unsigned val = MI->getOperand(OpNum).getImm();
+ O << ARM_ISB::InstSyncBOptToString(val);
+}
+
void ARMInstPrinter::printShiftImmOperand(const MCInst *MI, unsigned OpNum,
raw_ostream &O) {
unsigned ShiftOp = MI->getOperand(OpNum).getImm();
@@ -931,7 +937,7 @@ void ARMInstPrinter::printThumbITMask(const MCInst *MI, unsigned OpNum,
unsigned Mask = MI->getOperand(OpNum).getImm();
unsigned Firstcond = MI->getOperand(OpNum-1).getImm();
unsigned CondBit0 = Firstcond & 1;
- unsigned NumTZ = CountTrailingZeros_32(Mask);
+ unsigned NumTZ = countTrailingZeros(Mask);
assert(NumTZ <= 3 && "Invalid IT mask!");
for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) {
bool T = ((Mask >> Pos) & 1) == CondBit0;
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
index 344104e..5a64348 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
@@ -71,6 +71,7 @@ public:
void printBitfieldInvMaskImmOperand(const MCInst *MI, unsigned OpNum,
raw_ostream &O);
void printMemBOption(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printInstSyncBOption(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printShiftImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printPKHLSLShiftImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printPKHASRShiftImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
index 62473b2..b6c85c2 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
@@ -140,7 +140,7 @@ namespace ARM_AM {
if ((Imm & ~255U) == 0) return 0;
// Use CTZ to compute the rotate amount.
- unsigned TZ = CountTrailingZeros_32(Imm);
+ unsigned TZ = countTrailingZeros(Imm);
// Rotate amount must be even. Something like 0x200 must be rotated 8 bits,
// not 9.
@@ -153,7 +153,7 @@ namespace ARM_AM {
// For values like 0xF000000F, we should ignore the low 6 bits, then
// retry the hunt.
if (Imm & 63U) {
- unsigned TZ2 = CountTrailingZeros_32(Imm & ~63U);
+ unsigned TZ2 = countTrailingZeros(Imm & ~63U);
unsigned RotAmt2 = TZ2 & ~1;
if ((rotr32(Imm, RotAmt2) & ~255U) == 0)
return (32-RotAmt2)&31; // HW rotates right, not left.
@@ -221,7 +221,7 @@ namespace ARM_AM {
if ((Imm & ~255U) == 0) return 0;
// Use CTZ to compute the shift amount.
- return CountTrailingZeros_32(Imm);
+ return countTrailingZeros(Imm);
}
/// isThumbImmShiftedVal - Return true if the specified value can be obtained
@@ -240,7 +240,7 @@ namespace ARM_AM {
if ((Imm & ~65535U) == 0) return 0;
// Use CTZ to compute the shift amount.
- return CountTrailingZeros_32(Imm);
+ return countTrailingZeros(Imm);
}
/// isThumbImm16ShiftedVal - Return true if the specified value can be
@@ -296,7 +296,7 @@ namespace ARM_AM {
/// encoding is possible.
/// See ARM Reference Manual A6.3.2.
static inline int getT2SOImmValRotateVal(unsigned V) {
- unsigned RotAmt = CountLeadingZeros_32(V);
+ unsigned RotAmt = countLeadingZeros(V);
if (RotAmt >= 24)
return -1;
@@ -328,7 +328,7 @@ namespace ARM_AM {
static inline unsigned getT2SOImmValRotate(unsigned V) {
if ((V & ~255U) == 0) return 0;
// Use CTZ to compute the rotate amount.
- unsigned RotAmt = CountTrailingZeros_32(V);
+ unsigned RotAmt = countTrailingZeros(V);
return (32 - RotAmt) & 31;
}
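A worked example of the CTZ-based rotate computation these hunks switch to countTrailingZeros for; a simplified sketch under the same "rotate amount must be even" rule, not the full getSOImmValRotate:

#include <cstdint>
#include <cstdio>

static unsigned ctz32(uint32_t v) {            // assumes v != 0
  unsigned n = 0;
  while (!(v & 1)) { v >>= 1; ++n; }
  return n;
}

static uint32_t rotr32(uint32_t v, unsigned amt) {
  return amt == 0 ? v : (v >> amt) | (v << (32 - amt));
}

int main() {
  uint32_t Imm = 0x200;
  unsigned TZ = ctz32(Imm);        // 9
  unsigned RotAmt = TZ & ~1u;      // rounded down to even -> 8
  printf("rotr32(0x%x, %u) = 0x%x\n", (unsigned)Imm, RotAmt,
         (unsigned)rotr32(Imm, RotAmt));        // 0x2, which fits in 8 bits
  return 0;
}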
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index e66e985..8baa3a6 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -419,7 +419,7 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
uint32_t J2Bit = (I2Bit ^ 0x1) ^ signBit;
uint32_t imm10Bits = (offset & 0x1FF800) >> 11;
uint32_t imm11Bits = (offset & 0x000007FF);
-
+
uint32_t Binary = 0;
uint32_t firstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10Bits);
uint32_t secondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) |
@@ -434,8 +434,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
// four (see fixup_arm_thumb_cp). The 32-bit immediate value is encoded as
// imm32 = SignExtend(S:I1:I2:imm10H:imm10L:00)
// where I1 = NOT(J1 ^ S) and I2 = NOT(J2 ^ S).
- // The value is encoded into disjoint bit positions in the destination
- // opcode. x = unchanged, I = immediate value bit, S = sign extension bit,
+ // The value is encoded into disjoint bit positions in the destination
+ // opcode. x = unchanged, I = immediate value bit, S = sign extension bit,
// J = either J1 or J2 bit, 0 = zero.
//
// BLX: xxxxxSIIIIIIIIII xxJxJIIIIIIIIII0
@@ -450,10 +450,10 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
uint32_t J2Bit = (I2Bit ^ 0x1) ^ signBit;
uint32_t imm10HBits = (offset & 0xFFC00) >> 10;
uint32_t imm10LBits = (offset & 0x3FF);
-
+
uint32_t Binary = 0;
uint32_t firstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10HBits);
- uint32_t secondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) |
+ uint32_t secondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) |
((uint16_t)imm10LBits) << 1);
Binary |= secondHalf << 16;
Binary |= firstHalf;
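The J1/J2 computation above follows from the relation in the comment (I1 = NOT(J1 ^ S), I2 = NOT(J2 ^ S)); a one-bit sanity check of the (I ^ 1) ^ S form used in the code, with example bit values:

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t S = 1, I1 = 0, I2 = 1;     // example sign and offset bits
  uint32_t J1 = (I1 ^ 0x1) ^ S;       // NOT(I1) ^ S = 0
  uint32_t J2 = (I2 ^ 0x1) ^ S;       // NOT(I2) ^ S = 1
  printf("J1=%u J2=%u\n", J1, J2);
  return 0;
}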
@@ -680,8 +680,11 @@ MCAsmBackend *llvm::createARMAsmBackend(const Target &T, StringRef TT, StringRef
return new DarwinARMAsmBackend(T, TT, CS);
}
- if (TheTriple.isOSWindows())
+#if 0
+ // FIXME: Introduce yet another checker but assert(0).
+ if (TheTriple.isOSBinFormatCOFF())
assert(0 && "Windows not supported on ARM");
+#endif
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(Triple(TT).getOS());
return new ELFARMAsmBackend(T, TT, OSABI);
diff --git a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
index de48a0e..ff9917d 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -161,6 +161,49 @@ namespace ARM_MB {
}
} // namespace ARM_MB
+namespace ARM_ISB {
+ enum InstSyncBOpt {
+ RESERVED_0 = 0,
+ RESERVED_1 = 1,
+ RESERVED_2 = 2,
+ RESERVED_3 = 3,
+ RESERVED_4 = 4,
+ RESERVED_5 = 5,
+ RESERVED_6 = 6,
+ RESERVED_7 = 7,
+ RESERVED_8 = 8,
+ RESERVED_9 = 9,
+ RESERVED_10 = 10,
+ RESERVED_11 = 11,
+ RESERVED_12 = 12,
+ RESERVED_13 = 13,
+ RESERVED_14 = 14,
+ SY = 15
+ };
+
+ inline static const char *InstSyncBOptToString(unsigned val) {
+ switch (val) {
+ default: llvm_unreachable("Unknown memory operation");
+ case RESERVED_0: return "#0x0";
+ case RESERVED_1: return "#0x1";
+ case RESERVED_2: return "#0x2";
+ case RESERVED_3: return "#0x3";
+ case RESERVED_4: return "#0x4";
+ case RESERVED_5: return "#0x5";
+ case RESERVED_6: return "#0x6";
+ case RESERVED_7: return "#0x7";
+ case RESERVED_8: return "#0x8";
+ case RESERVED_9: return "#0x9";
+ case RESERVED_10: return "#0xa";
+ case RESERVED_11: return "#0xb";
+ case RESERVED_12: return "#0xc";
+ case RESERVED_13: return "#0xd";
+ case RESERVED_14: return "#0xe";
+ case SY: return "sy";
+ }
+ }
+} // namespace ARM_ISB
+
/// isARMLowRegister - Returns true if the register is a low register (r0-r7).
///
static inline bool isARMLowRegister(unsigned Reg) {
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 6c3d247..679d3c4 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -203,7 +203,8 @@ private:
void Reset();
void EmitPersonalityFixup(StringRef Name);
- void CollectUnwindOpcodes();
+ void FlushPendingOffset();
+ void FlushUnwindOpcodes(bool AllowCompactModel0);
void SwitchToEHSection(const char *Prefix, unsigned Type, unsigned Flags,
SectionKind Kind, const MCSymbol &Fn);
@@ -220,13 +221,14 @@ private:
MCSymbol *ExTab;
MCSymbol *FnStart;
const MCSymbol *Personality;
- uint32_t VFPRegSave; // Register mask for {d31-d0}
- uint32_t RegSave; // Register mask for {r15-r0}
- int64_t SPOffset;
- uint16_t FPReg;
- int64_t FPOffset;
+ unsigned PersonalityIndex;
+ unsigned FPReg; // Frame pointer register
+ int64_t FPOffset; // Offset: (final frame pointer) - (initial $sp)
+ int64_t SPOffset; // Offset: (final $sp) - (initial $sp)
+ int64_t PendingOffset; // Offset: (final $sp) - (emitted $sp)
bool UsedFP;
bool CantUnwind;
+ SmallVector<uint8_t, 64> Opcodes;
UnwindOpcodeAssembler UnwindOpAsm;
};
} // end anonymous namespace
@@ -279,19 +281,18 @@ inline void ARMELFStreamer::SwitchToExIdxSection(const MCSymbol &FnStart) {
}
void ARMELFStreamer::Reset() {
- const MCRegisterInfo &MRI = getContext().getRegisterInfo();
-
ExTab = NULL;
FnStart = NULL;
Personality = NULL;
- VFPRegSave = 0;
- RegSave = 0;
- FPReg = MRI.getEncodingValue(ARM::SP);
+ PersonalityIndex = NUM_PERSONALITY_INDEX;
+ FPReg = ARM::SP;
FPOffset = 0;
SPOffset = 0;
+ PendingOffset = 0;
UsedFP = false;
CantUnwind = false;
+ Opcodes.clear();
UnwindOpAsm.Reset();
}
@@ -311,18 +312,6 @@ void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) {
MCFixup::getKindForSize(4, false)));
}
-void ARMELFStreamer::CollectUnwindOpcodes() {
- if (UsedFP) {
- UnwindOpAsm.EmitSetFP(FPReg);
- UnwindOpAsm.EmitSPOffset(-FPOffset);
- } else {
- UnwindOpAsm.EmitSPOffset(SPOffset);
- }
- UnwindOpAsm.EmitVFPRegSave(VFPRegSave);
- UnwindOpAsm.EmitRegSave(RegSave);
- UnwindOpAsm.Finalize();
-}
-
void ARMELFStreamer::EmitFnStart() {
assert(FnStart == 0);
FnStart = getContext().CreateTempSymbol();
@@ -333,27 +322,12 @@ void ARMELFStreamer::EmitFnEnd() {
assert(FnStart && ".fnstart must precede .fnend");
// Emit unwind opcodes if there is no .handlerdata directive
- if (!ExTab && !CantUnwind) {
- CollectUnwindOpcodes();
-
- unsigned PersonalityIndex = UnwindOpAsm.getPersonalityIndex();
- if (PersonalityIndex == AEABI_UNWIND_CPP_PR1 ||
- PersonalityIndex == AEABI_UNWIND_CPP_PR2) {
- // For the __aeabi_unwind_cpp_pr1 and __aeabi_unwind_cpp_pr2, we have to
- // emit the unwind opcodes in the corresponding ".ARM.extab" section, and
- // then emit a reference to these unwind opcodes in the second word of
- // the exception index table entry.
- SwitchToExTabSection(*FnStart);
- ExTab = getContext().CreateTempSymbol();
- EmitLabel(ExTab);
- EmitBytes(UnwindOpAsm.data(), 0);
- }
- }
+ if (!ExTab && !CantUnwind)
+ FlushUnwindOpcodes(true);
// Emit the exception index table entry
SwitchToExIdxSection(*FnStart);
- unsigned PersonalityIndex = UnwindOpAsm.getPersonalityIndex();
if (PersonalityIndex < NUM_PERSONALITY_INDEX)
EmitPersonalityFixup(GetAEABIUnwindPersonalityName(PersonalityIndex));
@@ -379,11 +353,15 @@ void ARMELFStreamer::EmitFnEnd() {
// opcodes should always be 4 bytes.
assert(PersonalityIndex == AEABI_UNWIND_CPP_PR0 &&
"Compact model must use __aeabi_cpp_unwind_pr0 as personality");
- assert(UnwindOpAsm.size() == 4u &&
+ assert(Opcodes.size() == 4u &&
"Unwind opcode size for __aeabi_cpp_unwind_pr0 must be equal to 4");
- EmitBytes(UnwindOpAsm.data(), 0);
+ EmitBytes(StringRef(reinterpret_cast<const char*>(Opcodes.data()),
+ Opcodes.size()), 0);
}
+ // Switch to the section containing FnStart
+ SwitchSection(&FnStart->getSection());
+
// Clean exception handling frame information
Reset();
}
@@ -392,7 +370,34 @@ void ARMELFStreamer::EmitCantUnwind() {
CantUnwind = true;
}
-void ARMELFStreamer::EmitHandlerData() {
+void ARMELFStreamer::FlushPendingOffset() {
+ if (PendingOffset != 0) {
+ UnwindOpAsm.EmitSPOffset(-PendingOffset);
+ PendingOffset = 0;
+ }
+}
+
+void ARMELFStreamer::FlushUnwindOpcodes(bool AllowCompactModel0) {
+ // Emit the unwind opcode to restore $sp.
+ if (UsedFP) {
+ const MCRegisterInfo &MRI = getContext().getRegisterInfo();
+ int64_t LastRegSaveSPOffset = SPOffset - PendingOffset;
+ UnwindOpAsm.EmitSPOffset(LastRegSaveSPOffset - FPOffset);
+ UnwindOpAsm.EmitSetSP(MRI.getEncodingValue(FPReg));
+ } else {
+ FlushPendingOffset();
+ }
+
+ // Finalize the unwind opcode sequence
+ UnwindOpAsm.Finalize(PersonalityIndex, Opcodes);
+
+ // For compact model 0, we have to emit the unwind opcodes in the .ARM.exidx
+ // section. Thus, we don't have to create an entry in the .ARM.extab
+ // section.
+ if (AllowCompactModel0 && PersonalityIndex == AEABI_UNWIND_CPP_PR0)
+ return;
+
+ // Switch to .ARM.extab section.
SwitchToExTabSection(*FnStart);
// Create .ARM.extab label for offset in .ARM.exidx
@@ -400,19 +405,23 @@ void ARMELFStreamer::EmitHandlerData() {
ExTab = getContext().CreateTempSymbol();
EmitLabel(ExTab);
- // Emit Personality
- assert(Personality && ".personality directive must preceed .handlerdata");
-
- const MCSymbolRefExpr *PersonalityRef =
- MCSymbolRefExpr::Create(Personality,
- MCSymbolRefExpr::VK_ARM_PREL31,
- getContext());
+ // Emit personality
+ if (Personality) {
+ const MCSymbolRefExpr *PersonalityRef =
+ MCSymbolRefExpr::Create(Personality,
+ MCSymbolRefExpr::VK_ARM_PREL31,
+ getContext());
- EmitValue(PersonalityRef, 4, 0);
+ EmitValue(PersonalityRef, 4, 0);
+ }
// Emit unwind opcodes
- CollectUnwindOpcodes();
- EmitBytes(UnwindOpAsm.data(), 0);
+ EmitBytes(StringRef(reinterpret_cast<const char *>(Opcodes.data()),
+ Opcodes.size()), 0);
+}
+
+void ARMELFStreamer::EmitHandlerData() {
+ FlushUnwindOpcodes(false);
}
void ARMELFStreamer::EmitPersonality(const MCSymbol *Per) {
@@ -423,42 +432,55 @@ void ARMELFStreamer::EmitPersonality(const MCSymbol *Per) {
void ARMELFStreamer::EmitSetFP(unsigned NewFPReg,
unsigned NewSPReg,
int64_t Offset) {
- assert(SPOffset == 0 &&
- "Current implementation assumes .setfp precedes .pad");
-
- const MCRegisterInfo &MRI = getContext().getRegisterInfo();
-
- uint16_t NewFPRegEncVal = MRI.getEncodingValue(NewFPReg);
-#ifndef NDEBUG
- uint16_t NewSPRegEncVal = MRI.getEncodingValue(NewSPReg);
-#endif
-
- assert((NewSPReg == ARM::SP || NewSPRegEncVal == FPReg) &&
+ assert((NewSPReg == ARM::SP || NewSPReg == FPReg) &&
"the operand of .setfp directive should be either $sp or $fp");
UsedFP = true;
- FPReg = NewFPRegEncVal;
- FPOffset = Offset;
+ FPReg = NewFPReg;
+
+ if (NewSPReg == ARM::SP)
+ FPOffset = SPOffset + Offset;
+ else
+ FPOffset += Offset;
}
void ARMELFStreamer::EmitPad(int64_t Offset) {
- SPOffset += Offset;
+ // Track the change of the $sp offset
+ SPOffset -= Offset;
+
+ // To squash multiple .pad directives, we should delay the unwind opcode
+ // until the .save, .vsave, .handlerdata, or .fnend directives.
+ PendingOffset -= Offset;
}
void ARMELFStreamer::EmitRegSave(const SmallVectorImpl<unsigned> &RegList,
bool IsVector) {
+ // Collect the registers in the register list
+ unsigned Count = 0;
+ uint32_t Mask = 0;
const MCRegisterInfo &MRI = getContext().getRegisterInfo();
-
-#ifndef NDEBUG
- unsigned Max = IsVector ? 32 : 16;
-#endif
- uint32_t &RegMask = IsVector ? VFPRegSave : RegSave;
-
for (size_t i = 0; i < RegList.size(); ++i) {
unsigned Reg = MRI.getEncodingValue(RegList[i]);
- assert(Reg < Max && "Register encoded value out of range");
- RegMask |= 1u << Reg;
+ assert(Reg < (IsVector ? 32U : 16U) && "Register out of range");
+ unsigned Bit = (1u << Reg);
+ if ((Mask & Bit) == 0) {
+ Mask |= Bit;
+ ++Count;
+ }
}
+
+ // Track the change of the $sp offset: For the .save directive, the
+ // corresponding push instruction will decrease the $sp by (4 * Count).
+ // For the .vsave directive, the corresponding vpush instruction will
+ // decrease $sp by (8 * Count).
+ SPOffset -= Count * (IsVector ? 8 : 4);
+
+ // Emit the opcode
+ FlushPendingOffset();
+ if (IsVector)
+ UnwindOpAsm.EmitVFPRegSave(Mask);
+ else
+ UnwindOpAsm.EmitRegSave(Mask);
}
namespace llvm {
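To make the SPOffset/PendingOffset bookkeeping concrete, a small sketch of a hypothetical ".save {r4,r5,lr}" followed by ".pad #16" and ".pad #8", worked out from the rules stated in the comments (not taken from a real test case):

#include <cstdio>

int main() {
  long long SPOffset = 0, PendingOffset = 0;

  unsigned SaveCount = 3;               // .save {r4, r5, lr}: push drops sp by 4 * Count
  SPOffset -= SaveCount * 4;            // -12; the reg-save opcode is emitted immediately

  SPOffset -= 16; PendingOffset -= 16;  // .pad #16 (opcode delayed)
  SPOffset -= 8;  PendingOffset -= 8;   // .pad #8  (squashed with the previous pad)

  // At .handlerdata/.fnend, FlushPendingOffset emits one "vsp += 24" adjustment.
  printf("SPOffset = %lld, flushed adjustment = %lld\n", SPOffset, -PendingOffset);
  return 0;
}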
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index 7a59a7d..2aa1010 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -1359,8 +1359,8 @@ getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op,
// msb of the mask.
const MCOperand &MO = MI.getOperand(Op);
uint32_t v = ~MO.getImm();
- uint32_t lsb = CountTrailingZeros_32(v);
- uint32_t msb = (32 - CountLeadingZeros_32 (v)) - 1;
+ uint32_t lsb = countTrailingZeros(v);
+ uint32_t msb = (32 - countLeadingZeros (v)) - 1;
assert (v != 0 && lsb < 32 && msb < 32 && "Illegal bitfield mask!");
return lsb | (msb << 5);
}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index f09fb5a..14fd03f 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -61,6 +61,7 @@ std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) {
unsigned SubVer = TT[Idx];
if (SubVer >= '7' && SubVer <= '9') {
if (Len >= Idx+2 && TT[Idx+1] == 'm') {
+ isThumb = true;
if (NoCPU)
// v7m: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureMClass
ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+mclass";
@@ -99,6 +100,7 @@ std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) {
if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == '2')
ARMArchFeature = "+v6t2";
else if (Len >= Idx+2 && TT[Idx+1] == 'm') {
+ isThumb = true;
if (NoCPU)
// v6m: FeatureNoARM, FeatureMClass
ARMArchFeature = "+v6,+noarm,+mclass";
@@ -159,7 +161,7 @@ static MCRegisterInfo *createARMMCRegisterInfo(StringRef Triple) {
return X;
}
-static MCAsmInfo *createARMMCAsmInfo(const Target &T, StringRef TT) {
+static MCAsmInfo *createARMMCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) {
Triple TheTriple(TT);
if (TheTriple.isOSDarwin())
@@ -212,6 +214,15 @@ static MCInstPrinter *createARMMCInstPrinter(const Target &T,
return 0;
}
+static MCRelocationInfo *createARMMCRelocationInfo(StringRef TT,
+ MCContext &Ctx) {
+ Triple TheTriple(TT);
+ if (TheTriple.isEnvironmentMachO())
+ return createARMMachORelocationInfo(Ctx);
+ // Default to the stock relocation info.
+ return llvm::createMCRelocationInfo(TT, Ctx);
+}
+
namespace {
class ARMMCInstrAnalysis : public MCInstrAnalysis {
@@ -232,15 +243,16 @@ public:
return MCInstrAnalysis::isConditionalBranch(Inst);
}
- uint64_t evaluateBranch(const MCInst &Inst, uint64_t Addr,
- uint64_t Size) const {
+ bool evaluateBranch(const MCInst &Inst, uint64_t Addr,
+ uint64_t Size, uint64_t &Target) const {
// We only handle PCRel branches for now.
if (Info->get(Inst.getOpcode()).OpInfo[0].OperandType!=MCOI::OPERAND_PCREL)
- return -1ULL;
+ return false;
int64_t Imm = Inst.getOperand(0).getImm();
// FIXME: This is not right for thumb.
- return Addr+Imm+8; // In ARM mode the PC is always off by 8 bytes.
+ Target = Addr+Imm+8; // In ARM mode the PC is always off by 8 bytes.
+ return true;
}
};
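A quick check of the ARM-mode PC bias used in evaluateBranch (the PC reads 8 bytes ahead of the branch instruction); the addresses are made up for illustration:

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t Addr = 0x1000;   // address of the branch instruction
  int64_t  Imm  = 0x20;     // PC-relative immediate decoded from it
  printf("target = 0x%llx\n", (unsigned long long)(Addr + Imm + 8)); // 0x1028
  return 0;
}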
@@ -295,4 +307,10 @@ extern "C" void LLVMInitializeARMTargetMC() {
// Register the MCInstPrinter.
TargetRegistry::RegisterMCInstPrinter(TheARMTarget, createARMMCInstPrinter);
TargetRegistry::RegisterMCInstPrinter(TheThumbTarget, createARMMCInstPrinter);
+
+ // Register the MC relocation info.
+ TargetRegistry::RegisterMCRelocationInfo(TheARMTarget,
+ createARMMCRelocationInfo);
+ TargetRegistry::RegisterMCRelocationInfo(TheThumbTarget,
+ createARMMCRelocationInfo);
}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
index a89981e..4e94c53 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -25,6 +25,7 @@ class MCInstrInfo;
class MCObjectWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
+class MCRelocationInfo;
class StringRef;
class Target;
class raw_ostream;
@@ -58,6 +59,9 @@ MCObjectWriter *createARMMachObjectWriter(raw_ostream &OS,
uint32_t CPUType,
uint32_t CPUSubtype);
+
+/// createARMMachORelocationInfo - Construct ARM Mach-O relocation info.
+MCRelocationInfo *createARMMachORelocationInfo(MCContext &Ctx);
} // End llvm namespace
// Defines symbolic names for ARM registers. This defines a mapping from
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
new file mode 100644
index 0000000..807c948
--- /dev/null
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
@@ -0,0 +1,43 @@
+//===-- ARMMachORelocationInfo.cpp ----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "ARMMCExpr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCRelocationInfo.h"
+#include "llvm-c/Disassembler.h"
+
+using namespace llvm;
+using namespace object;
+
+namespace {
+class ARMMachORelocationInfo : public MCRelocationInfo {
+public:
+ ARMMachORelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {}
+
+ const MCExpr *createExprForCAPIVariantKind(const MCExpr *SubExpr,
+ unsigned VariantKind) {
+ switch(VariantKind) {
+ case LLVMDisassembler_VariantKind_ARM_HI16:
+ return ARMMCExpr::CreateUpper16(SubExpr, Ctx);
+ case LLVMDisassembler_VariantKind_ARM_LO16:
+ return ARMMCExpr::CreateLower16(SubExpr, Ctx);
+ default:
+ return MCRelocationInfo::createExprForCAPIVariantKind(SubExpr,
+ VariantKind);
+ }
+ }
+};
+} // End unnamed namespace
+
+/// createARMMachORelocationInfo - Construct an ARM Mach-O RelocationInfo.
+MCRelocationInfo *llvm::createARMMachORelocationInfo(MCContext &Ctx) {
+ return new ARMMachORelocationInfo(Ctx);
+}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
index 191db69..c943370 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
@@ -20,6 +20,48 @@
using namespace llvm;
+namespace {
+ /// UnwindOpcodeStreamer - A simple wrapper over SmallVector to emit bytes
+ /// with MSB to LSB per uint32_t ordering. For example, the first byte will
+ /// be placed in Vec[3], and the following bytes will be placed in 2, 1, 0,
+ /// 7, 6, 5, 4, 11, 10, 9, 8, and so on.
+ class UnwindOpcodeStreamer {
+ private:
+ SmallVectorImpl<uint8_t> &Vec;
+ size_t Pos;
+
+ public:
+ UnwindOpcodeStreamer(SmallVectorImpl<uint8_t> &V) : Vec(V), Pos(3) {
+ }
+
+ /// Emit the byte in MSB to LSB per uint32_t order.
+ inline void EmitByte(uint8_t elem) {
+ Vec[Pos] = elem;
+ Pos = (((Pos ^ 0x3u) + 1) ^ 0x3u);
+ }
+
+ /// Emit the size prefix.
+ inline void EmitSize(size_t Size) {
+ size_t SizeInWords = (Size + 3) / 4;
+ assert(SizeInWords <= 0x100u &&
+ "Only 256 additional words are allowed for unwind opcodes");
+ EmitByte(static_cast<uint8_t>(SizeInWords - 1));
+ }
+
+ /// Emit the personality index prefix.
+ inline void EmitPersonalityIndex(unsigned PI) {
+ assert(PI < NUM_PERSONALITY_INDEX && "Invalid personality prefix");
+ EmitByte(EHT_COMPACT | PI);
+ }
+
+ /// Fill the rest of the bytes with the FINISH opcode.
+ inline void FillFinishOpcode() {
+ while (Pos < Vec.size())
+ EmitByte(UNWIND_OPCODE_FINISH);
+ }
+ };
+}
+
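The Pos update in EmitByte walks each 4-byte word from its most significant byte downward; a standalone sketch of just that index sequence:

#include <cstdio>

int main() {
  unsigned Pos = 3;
  for (int i = 0; i < 12; ++i) {
    printf("%u ", Pos);
    Pos = ((Pos ^ 0x3u) + 1) ^ 0x3u;
  }
  printf("\n");   // prints: 3 2 1 0 7 6 5 4 11 10 9 8
  return 0;
}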
void UnwindOpcodeAssembler::EmitRegSave(uint32_t RegSave) {
if (RegSave == 0u)
return;
@@ -43,28 +85,22 @@ void UnwindOpcodeAssembler::EmitRegSave(uint32_t RegSave) {
uint32_t UnmaskedReg = RegSave & 0xfff0u & (~Mask);
if (UnmaskedReg == 0u) {
// Pop r[4 : (4 + n)]
- Ops.push_back(UNWIND_OPCODE_POP_REG_RANGE_R4 | Range);
+ EmitInt8(UNWIND_OPCODE_POP_REG_RANGE_R4 | Range);
RegSave &= 0x000fu;
} else if (UnmaskedReg == (1u << 14)) {
// Pop r[14] + r[4 : (4 + n)]
- Ops.push_back(UNWIND_OPCODE_POP_REG_RANGE_R4_R14 | Range);
+ EmitInt8(UNWIND_OPCODE_POP_REG_RANGE_R4_R14 | Range);
RegSave &= 0x000fu;
}
}
// Two bytes opcode to save register r15-r4
- if ((RegSave & 0xfff0u) != 0) {
- uint32_t Op = UNWIND_OPCODE_POP_REG_MASK_R4 | (RegSave >> 4);
- Ops.push_back(static_cast<uint8_t>(Op >> 8));
- Ops.push_back(static_cast<uint8_t>(Op & 0xff));
- }
+ if ((RegSave & 0xfff0u) != 0)
+ EmitInt16(UNWIND_OPCODE_POP_REG_MASK_R4 | (RegSave >> 4));
// Opcode to save register r3-r0
- if ((RegSave & 0x000fu) != 0) {
- uint32_t Op = UNWIND_OPCODE_POP_REG_MASK | (RegSave & 0x000fu);
- Ops.push_back(static_cast<uint8_t>(Op >> 8));
- Ops.push_back(static_cast<uint8_t>(Op & 0xff));
- }
+ if ((RegSave & 0x000fu) != 0)
+ EmitInt16(UNWIND_OPCODE_POP_REG_MASK | (RegSave & 0x000fu));
}
/// Emit unwind opcodes for .vsave directives
@@ -89,10 +125,8 @@ void UnwindOpcodeAssembler::EmitVFPRegSave(uint32_t VFPRegSave) {
Bit >>= 1;
}
- uint32_t Op =
- UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD_D16 | ((i - 16) << 4) | Range;
- Ops.push_back(static_cast<uint8_t>(Op >> 8));
- Ops.push_back(static_cast<uint8_t>(Op & 0xff));
+ EmitInt16(UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD_D16 |
+ ((i - 16) << 4) | Range);
}
while (i > 0) {
@@ -113,86 +147,75 @@ void UnwindOpcodeAssembler::EmitVFPRegSave(uint32_t VFPRegSave) {
Bit >>= 1;
}
- uint32_t Op = UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD | (i << 4) | Range;
- Ops.push_back(static_cast<uint8_t>(Op >> 8));
- Ops.push_back(static_cast<uint8_t>(Op & 0xff));
+ EmitInt16(UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD | (i << 4) | Range);
}
}
-/// Emit unwind opcodes for .setfp directives
-void UnwindOpcodeAssembler::EmitSetFP(uint16_t FPReg) {
- Ops.push_back(UNWIND_OPCODE_SET_VSP | FPReg);
+/// Emit unwind opcodes to copy the address from the source register to $sp.
+void UnwindOpcodeAssembler::EmitSetSP(uint16_t Reg) {
+ EmitInt8(UNWIND_OPCODE_SET_VSP | Reg);
}
-/// Emit unwind opcodes to update stack pointer
+/// Emit unwind opcodes to add an offset to $sp.
void UnwindOpcodeAssembler::EmitSPOffset(int64_t Offset) {
if (Offset > 0x200) {
- uint8_t Buff[10];
- size_t Size = encodeULEB128((Offset - 0x204) >> 2, Buff);
- Ops.push_back(UNWIND_OPCODE_INC_VSP_ULEB128);
- Ops.append(Buff, Buff + Size);
+ uint8_t Buff[16];
+ Buff[0] = UNWIND_OPCODE_INC_VSP_ULEB128;
+ size_t ULEBSize = encodeULEB128((Offset - 0x204) >> 2, Buff + 1);
+ EmitBytes(Buff, ULEBSize + 1);
} else if (Offset > 0) {
if (Offset > 0x100) {
- Ops.push_back(UNWIND_OPCODE_INC_VSP | 0x3fu);
+ EmitInt8(UNWIND_OPCODE_INC_VSP | 0x3fu);
Offset -= 0x100;
}
- Ops.push_back(UNWIND_OPCODE_INC_VSP |
- static_cast<uint8_t>((Offset - 4) >> 2));
+ EmitInt8(UNWIND_OPCODE_INC_VSP | static_cast<uint8_t>((Offset - 4) >> 2));
} else if (Offset < 0) {
while (Offset < -0x100) {
- Ops.push_back(UNWIND_OPCODE_DEC_VSP | 0x3fu);
+ EmitInt8(UNWIND_OPCODE_DEC_VSP | 0x3fu);
Offset += 0x100;
}
- Ops.push_back(UNWIND_OPCODE_DEC_VSP |
- static_cast<uint8_t>(((-Offset) - 4) >> 2));
+ EmitInt8(UNWIND_OPCODE_DEC_VSP |
+ static_cast<uint8_t>(((-Offset) - 4) >> 2));
}
}
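For reference, how two representative offsets map onto opcode operands under the branches above (a sketch of the arithmetic only; the actual opcode bytes come from the UNWIND_OPCODE_INC_VSP constants used in the code):

#include <cstdint>
#include <cstdio>

int main() {
  // Small positive offset: a single INC_VSP byte whose low bits are (Offset - 4) >> 2.
  int64_t Offset = 16;
  printf("sp += %lld -> inc_vsp operand %lld\n",
         (long long)Offset, (long long)((Offset - 4) >> 2));      // 3, i.e. vsp += 16

  // Large positive offset: INC_VSP_ULEB128 followed by ULEB128((Offset - 0x204) >> 2).
  Offset = 0x300;
  printf("sp += %lld -> uleb128 operand %lld\n",
         (long long)Offset, (long long)((Offset - 0x204) >> 2));  // 63
  return 0;
}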
-void UnwindOpcodeAssembler::AddOpcodeSizePrefix(size_t Pos) {
- size_t SizeInWords = (size() + 3) / 4;
- assert(SizeInWords <= 0x100u &&
- "Only 256 additional words are allowed for unwind opcodes");
- Ops[Pos] = static_cast<uint8_t>(SizeInWords - 1);
-}
+void UnwindOpcodeAssembler::Finalize(unsigned &PersonalityIndex,
+ SmallVectorImpl<uint8_t> &Result) {
-void UnwindOpcodeAssembler::AddPersonalityIndexPrefix(size_t Pos, unsigned PI) {
- assert(PI < NUM_PERSONALITY_INDEX && "Invalid personality prefix");
- Ops[Pos] = EHT_COMPACT | PI;
-}
+ UnwindOpcodeStreamer OpStreamer(Result);
-void UnwindOpcodeAssembler::EmitFinishOpcodes() {
- for (size_t i = (0x4u - (size() & 0x3u)) & 0x3u; i > 0; --i)
- Ops.push_back(UNWIND_OPCODE_FINISH);
-}
-
-void UnwindOpcodeAssembler::Finalize() {
if (HasPersonality) {
- // Personality specified by .personality directive
- Offset = 1;
- AddOpcodeSizePrefix(1);
+ // User-specified personality routine: [ SIZE , OP1 , OP2 , ... ]
+ PersonalityIndex = NUM_PERSONALITY_INDEX;
+ size_t TotalSize = Ops.size() + 1;
+ size_t RoundUpSize = (TotalSize + 3) / 4 * 4;
+ Result.resize(RoundUpSize);
+ OpStreamer.EmitSize(RoundUpSize);
} else {
- if (getOpcodeSize() <= 3) {
+ if (Ops.size() <= 3) {
// __aeabi_unwind_cpp_pr0: [ 0x80 , OP1 , OP2 , OP3 ]
- Offset = 1;
PersonalityIndex = AEABI_UNWIND_CPP_PR0;
- AddPersonalityIndexPrefix(Offset, PersonalityIndex);
+ Result.resize(4);
+ OpStreamer.EmitPersonalityIndex(PersonalityIndex);
} else {
// __aeabi_unwind_cpp_pr1: [ 0x81 , SIZE , OP1 , OP2 , ... ]
- Offset = 0;
PersonalityIndex = AEABI_UNWIND_CPP_PR1;
- AddPersonalityIndexPrefix(Offset, PersonalityIndex);
- AddOpcodeSizePrefix(1);
+ size_t TotalSize = Ops.size() + 2;
+ size_t RoundUpSize = (TotalSize + 3) / 4 * 4;
+ Result.resize(RoundUpSize);
+ OpStreamer.EmitPersonalityIndex(PersonalityIndex);
+ OpStreamer.EmitSize(RoundUpSize);
}
}
- // Emit the padding finish opcodes if the size() is not multiple of 4.
- EmitFinishOpcodes();
+ // Copy the unwind opcodes
+ for (size_t i = OpBegins.size() - 1; i > 0; --i)
+ for (size_t j = OpBegins[i - 1], end = OpBegins[i]; j < end; ++j)
+ OpStreamer.EmitByte(Ops[j]);
- // Swap the byte order
- uint8_t *Ptr = Ops.begin() + Offset;
- assert(size() % 4 == 0 && "Final unwind opcodes should align to 4");
- for (size_t i = 0, n = size(); i < n; i += 4) {
- std::swap(Ptr[i], Ptr[i + 3]);
- std::swap(Ptr[i + 1], Ptr[i + 2]);
- }
+ // Emit the padding finish opcodes if the size is not a multiple of 4.
+ OpStreamer.FillFinishOpcode();
+
+ // Reset the assembler state
+ Reset();
}
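A rough size model of what Finalize produces in its three branches, assuming the layouts spelled out in the comments (a sketch for illustration, not the streamer itself):

#include <cstddef>
#include <cstdio>

static size_t FinalizedSize(bool HasPersonality, size_t NumOpcodeBytes) {
  size_t Total;
  if (HasPersonality)
    Total = NumOpcodeBytes + 1;   // [ SIZE, OP1, OP2, ... ]
  else if (NumOpcodeBytes <= 3)
    Total = 4;                    // pr0: [ 0x80 | PI, OP1, OP2, OP3 ]
  else
    Total = NumOpcodeBytes + 2;   // pr1: [ 0x81, SIZE, OP1, OP2, ... ]
  return (Total + 3) / 4 * 4;     // padded to a word boundary with FINISH opcodes
}

int main() {
  printf("%zu %zu %zu\n",
         FinalizedSize(false, 3),   // 4
         FinalizedSize(false, 6),   // 8
         FinalizedSize(true, 5));   // 8
  return 0;
}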
diff --git a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
index f6ecaeb..ac67c6e 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
@@ -27,86 +27,61 @@ class MCSymbol;
class UnwindOpcodeAssembler {
private:
- llvm::SmallVector<uint8_t, 8> Ops;
-
- unsigned Offset;
- unsigned PersonalityIndex;
+ llvm::SmallVector<uint8_t, 32> Ops;
+ llvm::SmallVector<unsigned, 8> OpBegins;
bool HasPersonality;
- enum {
- // The number of bytes to be preserved for the size and personality index
- // prefix of unwind opcodes.
- NUM_PRESERVED_PREFIX_BUF = 2
- };
-
public:
UnwindOpcodeAssembler()
- : Ops(NUM_PRESERVED_PREFIX_BUF), Offset(NUM_PRESERVED_PREFIX_BUF),
- PersonalityIndex(NUM_PERSONALITY_INDEX), HasPersonality(0) {
+ : HasPersonality(0) {
+ OpBegins.push_back(0);
}
/// Reset the unwind opcode assembler.
void Reset() {
- Ops.resize(NUM_PRESERVED_PREFIX_BUF);
- Offset = NUM_PRESERVED_PREFIX_BUF;
- PersonalityIndex = NUM_PERSONALITY_INDEX;
+ Ops.clear();
+ OpBegins.clear();
+ OpBegins.push_back(0);
HasPersonality = 0;
}
- /// Get the size of the payload (including the size byte)
- size_t size() const {
- return Ops.size() - Offset;
- }
-
- /// Get the beginning of the payload
- const uint8_t *begin() const {
- return Ops.begin() + Offset;
- }
-
- /// Get the payload
- StringRef data() const {
- return StringRef(reinterpret_cast<const char *>(begin()), size());
- }
-
/// Set the personality index
void setPersonality(const MCSymbol *Per) {
HasPersonality = 1;
}
- /// Get the personality index
- unsigned getPersonalityIndex() const {
- return PersonalityIndex;
- }
-
/// Emit unwind opcodes for .save directives
void EmitRegSave(uint32_t RegSave);
/// Emit unwind opcodes for .vsave directives
void EmitVFPRegSave(uint32_t VFPRegSave);
- /// Emit unwind opcodes for .setfp directives
- void EmitSetFP(uint16_t FPReg);
+ /// Emit unwind opcodes to copy the address from the source register to $sp.
+ void EmitSetSP(uint16_t Reg);
- /// Emit unwind opcodes to update stack pointer
+ /// Emit unwind opcodes to add an offset to $sp.
void EmitSPOffset(int64_t Offset);
/// Finalize the unwind opcode sequence for EmitBytes()
- void Finalize();
+ void Finalize(unsigned &PersonalityIndex,
+ SmallVectorImpl<uint8_t> &Result);
private:
- /// Get the size of the opcodes in bytes.
- size_t getOpcodeSize() const {
- return Ops.size() - NUM_PRESERVED_PREFIX_BUF;
+ void EmitInt8(unsigned Opcode) {
+ Ops.push_back(Opcode & 0xff);
+ OpBegins.push_back(OpBegins.back() + 1);
}
- /// Add the length prefix to the payload
- void AddOpcodeSizePrefix(size_t Pos);
-
- /// Add personality index prefix in some compact format
- void AddPersonalityIndexPrefix(size_t Pos, unsigned PersonalityIndex);
+ void EmitInt16(unsigned Opcode) {
+ Ops.push_back((Opcode >> 8) & 0xff);
+ Ops.push_back(Opcode & 0xff);
+ OpBegins.push_back(OpBegins.back() + 2);
+ }
- /// Fill the words with finish opcode if it is not aligned
- void EmitFinishOpcodes();
+ void EmitBytes(const uint8_t *Opcode, size_t Size) {
+ Ops.insert(Ops.end(), Opcode, Opcode + Size);
+ OpBegins.push_back(OpBegins.back() + Size);
+ }
};
} // namespace llvm
diff --git a/lib/Target/ARM/MCTargetDesc/CMakeLists.txt b/lib/Target/ARM/MCTargetDesc/CMakeLists.txt
index a7ac5ca..bab59f4 100644
--- a/lib/Target/ARM/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/ARM/MCTargetDesc/CMakeLists.txt
@@ -9,6 +9,7 @@ add_llvm_library(LLVMARMDesc
ARMMachObjectWriter.cpp
ARMELFObjectWriter.cpp
ARMUnwindOpAsm.cpp
+ ARMMachORelocationInfo.cpp
)
add_dependencies(LLVMARMDesc ARMCommonTableGen)
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index 1e2a8b0..db49db8 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -88,7 +88,8 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
const Thumb1InstrInfo &TII =
*static_cast<const Thumb1InstrInfo*>(MF.getTarget().getInstrInfo());
- unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
+ unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
+ unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align);
unsigned NumBytes = MFI->getStackSize();
const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
@@ -249,7 +250,8 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
const Thumb1InstrInfo &TII =
*static_cast<const Thumb1InstrInfo*>(MF.getTarget().getInstrInfo());
- unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
+ unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
+ unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align);
int NumBytes = (int)MFI->getStackSize();
const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs();
unsigned FramePtr = RegInfo->getFrameRegister(MF);
diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp
index 095736d..22a925e 100644
--- a/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -22,7 +22,7 @@
using namespace llvm;
Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI)
- : ARMBaseInstrInfo(STI), RI(*this, STI) {
+ : ARMBaseInstrInfo(STI), RI(STI) {
}
/// getNoopForMachoTarget - Return the noop instruction to use for a noop.
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp
index 7452fb7..6722614 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.cpp
+++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp
@@ -40,9 +40,8 @@ extern cl::opt<bool> ReuseFrameIndexVals;
using namespace llvm;
-Thumb1RegisterInfo::Thumb1RegisterInfo(const ARMBaseInstrInfo &tii,
- const ARMSubtarget &sti)
- : ARMBaseRegisterInfo(tii, sti) {
+Thumb1RegisterInfo::Thumb1RegisterInfo(const ARMSubtarget &sti)
+ : ARMBaseRegisterInfo(sti) {
}
const TargetRegisterClass*
@@ -70,6 +69,7 @@ Thumb1RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB,
ARMCC::CondCodes Pred, unsigned PredReg,
unsigned MIFlags) const {
MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
MachineConstantPool *ConstantPool = MF.getConstantPool();
const Constant *C = ConstantInt::get(
Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val);
@@ -488,6 +488,9 @@ void
Thumb1RegisterInfo::resolveFrameIndex(MachineBasicBlock::iterator I,
unsigned BaseReg, int64_t Offset) const {
MachineInstr &MI = *I;
+ const ARMBaseInstrInfo &TII =
+ *static_cast<const ARMBaseInstrInfo*>(
+ MI.getParent()->getParent()->getTarget().getInstrInfo());
int Off = Offset; // ARM doesn't need the general 64-bit offsets
unsigned i = 0;
@@ -513,6 +516,7 @@ Thumb1RegisterInfo::saveScavengerRegister(MachineBasicBlock &MBB,
// off the frame pointer (if, for example, there are alloca() calls in
// the function, the offset will be negative. Use R12 instead since that's
// a call clobbered register that we know won't be used in Thumb1 mode.
+ const TargetInstrInfo &TII = *MBB.getParent()->getTarget().getInstrInfo();
DebugLoc DL;
AddDefaultPred(BuildMI(MBB, I, DL, TII.get(ARM::tMOVr))
.addReg(ARM::R12, RegState::Define)
@@ -558,6 +562,8 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MachineInstr &MI = *II;
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
+ const ARMBaseInstrInfo &TII =
+ *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
DebugLoc dl = MI.getDebugLoc();
MachineInstrBuilder MIB(*MBB.getParent(), &MI);
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.h b/lib/Target/ARM/Thumb1RegisterInfo.h
index ebbab36..9689b23 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.h
+++ b/lib/Target/ARM/Thumb1RegisterInfo.h
@@ -25,7 +25,7 @@ namespace llvm {
struct Thumb1RegisterInfo : public ARMBaseRegisterInfo {
public:
- Thumb1RegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI);
+ Thumb1RegisterInfo(const ARMSubtarget &STI);
const TargetRegisterClass*
getLargestLegalSuperClass(const TargetRegisterClass *RC) const;
diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp
index 97c254c..d8596d7 100644
--- a/lib/Target/ARM/Thumb2ITBlockPass.cpp
+++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -73,15 +73,15 @@ static void TrackDefUses(MachineInstr *MI,
for (unsigned i = 0, e = LocalUses.size(); i != e; ++i) {
unsigned Reg = LocalUses[i];
- Uses.insert(Reg);
- for (MCSubRegIterator Subreg(Reg, TRI); Subreg.isValid(); ++Subreg)
+ for (MCSubRegIterator Subreg(Reg, TRI, /*IncludeSelf=*/true);
+ Subreg.isValid(); ++Subreg)
Uses.insert(*Subreg);
}
for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) {
unsigned Reg = LocalDefs[i];
- Defs.insert(Reg);
- for (MCSubRegIterator Subreg(Reg, TRI); Subreg.isValid(); ++Subreg)
+ for (MCSubRegIterator Subreg(Reg, TRI, /*IncludeSelf=*/true);
+ Subreg.isValid(); ++Subreg)
Defs.insert(*Subreg);
if (Reg == ARM::CPSR)
continue;
diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp
index a1b48c2..286eaa0 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -31,7 +31,7 @@ OldT2IfCvt("old-thumb2-ifcvt", cl::Hidden,
cl::init(false));
Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI)
- : ARMBaseInstrInfo(STI), RI(*this, STI) {
+ : ARMBaseInstrInfo(STI), RI(STI) {
}
/// getNoopForMachoTarget - Return the noop instruction to use for a noop.
@@ -285,7 +285,7 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
NumBytes = 0;
} else {
// FIXME: Move this to ARMAddressingModes.h?
- unsigned RotAmt = CountLeadingZeros_32(ThisVal);
+ unsigned RotAmt = countLeadingZeros(ThisVal);
ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt);
NumBytes &= ~ThisVal;
assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 &&
@@ -302,7 +302,7 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
NumBytes = 0;
} else {
// FIXME: Move this to ARMAddressingModes.h?
- unsigned RotAmt = CountLeadingZeros_32(ThisVal);
+ unsigned RotAmt = countLeadingZeros(ThisVal);
ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt);
NumBytes &= ~ThisVal;
assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 &&
@@ -484,7 +484,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
// Otherwise, extract 8 adjacent bits from the immediate into this
// t2ADDri/t2SUBri.
- unsigned RotAmt = CountLeadingZeros_32(Offset);
+ unsigned RotAmt = countLeadingZeros<unsigned>(Offset);
unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xff000000U, RotAmt);
// We will handle these bits from offset, clear them.
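
The switch to countLeadingZeros keeps the same peel-off loop: rotate a 0xff000000 mask down so it lines up with the top set bit, take those eight bits as one Thumb-2 modified immediate, and clear them from the remaining offset. A standalone sketch of that computation; rotr32 and clz32 below are local stand-ins for ARM_AM::rotr32 and llvm::countLeadingZeros:

    #include <cstdint>
    #include <cstdio>

    // Local stand-in for ARM_AM::rotr32: rotate right by Amt bits.
    static uint32_t rotr32(uint32_t Val, unsigned Amt) {
      Amt &= 31;
      return Amt ? (Val >> Amt) | (Val << (32 - Amt)) : Val;
    }

    // Local stand-in for llvm::countLeadingZeros on a 32-bit value.
    static unsigned clz32(uint32_t Val) { return Val ? __builtin_clz(Val) : 32; }

    int main() {
      uint32_t Offset = 0x12345;          // bytes still to materialize
      while (Offset) {
        unsigned RotAmt = clz32(Offset);  // align the mask with the top set bit
        uint32_t ThisVal = Offset & rotr32(0xff000000U, RotAmt);
        std::printf("chunk: 0x%x\n", ThisVal);
        Offset &= ~ThisVal;               // these bits are handled, clear them
      }
      return 0;
    }
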
diff --git a/lib/Target/ARM/Thumb2RegisterInfo.cpp b/lib/Target/ARM/Thumb2RegisterInfo.cpp
index 1a7a4d4..4cb827f 100644
--- a/lib/Target/ARM/Thumb2RegisterInfo.cpp
+++ b/lib/Target/ARM/Thumb2RegisterInfo.cpp
@@ -24,9 +24,8 @@
#include "llvm/IR/Function.h"
using namespace llvm;
-Thumb2RegisterInfo::Thumb2RegisterInfo(const ARMBaseInstrInfo &tii,
- const ARMSubtarget &sti)
- : ARMBaseRegisterInfo(tii, sti) {
+Thumb2RegisterInfo::Thumb2RegisterInfo(const ARMSubtarget &sti)
+ : ARMBaseRegisterInfo(sti) {
}
/// emitLoadConstPool - Emits a load from constpool to materialize the
@@ -40,6 +39,7 @@ Thumb2RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB,
ARMCC::CondCodes Pred, unsigned PredReg,
unsigned MIFlags) const {
MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
MachineConstantPool *ConstantPool = MF.getConstantPool();
const Constant *C = ConstantInt::get(
Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val);
diff --git a/lib/Target/ARM/Thumb2RegisterInfo.h b/lib/Target/ARM/Thumb2RegisterInfo.h
index 6b397e8..b1d63fa 100644
--- a/lib/Target/ARM/Thumb2RegisterInfo.h
+++ b/lib/Target/ARM/Thumb2RegisterInfo.h
@@ -20,12 +20,12 @@
#include "llvm/Target/TargetRegisterInfo.h"
namespace llvm {
- class ARMSubtarget;
- class ARMBaseInstrInfo;
+
+class ARMSubtarget;
struct Thumb2RegisterInfo : public ARMBaseRegisterInfo {
public:
- Thumb2RegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI);
+ Thumb2RegisterInfo(const ARMSubtarget &STI);
/// emitLoadConstPool - Emits a load from constpool to materialize the
/// specified immediate.
diff --git a/lib/Target/Hexagon/CMakeLists.txt b/lib/Target/Hexagon/CMakeLists.txt
index b5b887e..57044b2 100644
--- a/lib/Target/Hexagon/CMakeLists.txt
+++ b/lib/Target/Hexagon/CMakeLists.txt
@@ -28,12 +28,14 @@ add_llvm_target(HexagonCodeGen
HexagonRegisterInfo.cpp
HexagonRemoveSZExtArgs.cpp
HexagonSelectionDAGInfo.cpp
+ HexagonSplitConst32AndConst64.cpp
HexagonSplitTFRCondSets.cpp
HexagonSubtarget.cpp
HexagonTargetMachine.cpp
HexagonTargetObjectFile.cpp
HexagonVLIWPacketizer.cpp
HexagonNewValueJump.cpp
+ HexagonCopyToCombine.cpp
)
add_subdirectory(TargetInfo)
diff --git a/lib/Target/Hexagon/Hexagon.h b/lib/Target/Hexagon/Hexagon.h
index dfbefc8..b88637a 100644
--- a/lib/Target/Hexagon/Hexagon.h
+++ b/lib/Target/Hexagon/Hexagon.h
@@ -29,26 +29,29 @@ namespace llvm {
class HexagonTargetMachine;
class raw_ostream;
- FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM,
+ FunctionPass *createHexagonISelDag(const HexagonTargetMachine &TM,
CodeGenOpt::Level OptLevel);
- FunctionPass *createHexagonDelaySlotFillerPass(TargetMachine &TM);
- FunctionPass *createHexagonFPMoverPass(TargetMachine &TM);
- FunctionPass *createHexagonRemoveExtendOps(HexagonTargetMachine &TM);
- FunctionPass *createHexagonCFGOptimizer(HexagonTargetMachine &TM);
-
- FunctionPass *createHexagonSplitTFRCondSets(HexagonTargetMachine &TM);
- FunctionPass *createHexagonExpandPredSpillCode(HexagonTargetMachine &TM);
+ FunctionPass *createHexagonDelaySlotFillerPass(const TargetMachine &TM);
+ FunctionPass *createHexagonFPMoverPass(const TargetMachine &TM);
+ FunctionPass *createHexagonRemoveExtendArgs(const HexagonTargetMachine &TM);
+ FunctionPass *createHexagonCFGOptimizer(const HexagonTargetMachine &TM);
+ FunctionPass *createHexagonSplitTFRCondSets(const HexagonTargetMachine &TM);
+ FunctionPass *createHexagonSplitConst32AndConst64(
+ const HexagonTargetMachine &TM);
+ FunctionPass *createHexagonExpandPredSpillCode(
+ const HexagonTargetMachine &TM);
FunctionPass *createHexagonHardwareLoops();
FunctionPass *createHexagonPeephole();
FunctionPass *createHexagonFixupHwLoops();
+ FunctionPass *createHexagonNewValueJump();
+ FunctionPass *createHexagonCopyToCombine();
FunctionPass *createHexagonPacketizer();
FunctionPass *createHexagonNewValueJump();
-
/* TODO: object output.
MCCodeEmitter *createHexagonMCCodeEmitter(const Target &,
- TargetMachine &TM,
+ const TargetMachine &TM,
MCContext &Ctx);
*/
/* TODO: assembler input.
diff --git a/lib/Target/Hexagon/Hexagon.td b/lib/Target/Hexagon/Hexagon.td
index af1c56b..568798c 100644
--- a/lib/Target/Hexagon/Hexagon.td
+++ b/lib/Target/Hexagon/Hexagon.td
@@ -84,6 +84,30 @@ def getPredOpcode : InstrMapping {
}
//===----------------------------------------------------------------------===//
+// Generate mapping table to relate predicate-true instructions with their
+// predicate-false forms
+//
+def getFalsePredOpcode : InstrMapping {
+ let FilterClass = "PredRel";
+ let RowFields = ["BaseOpcode", "PNewValue", "isNVStore", "isBrTaken"];
+ let ColFields = ["PredSense"];
+ let KeyCol = ["true"];
+ let ValueCols = [["false"]];
+}
+
+//===----------------------------------------------------------------------===//
+// Generate mapping table to relate predicate-false instructions with their
+// predicate-true forms
+//
+def getTruePredOpcode : InstrMapping {
+ let FilterClass = "PredRel";
+ let RowFields = ["BaseOpcode", "PNewValue", "isNVStore", "isBrTaken"];
+ let ColFields = ["PredSense"];
+ let KeyCol = ["false"];
+ let ValueCols = [["true"]];
+}
+
+//===----------------------------------------------------------------------===//
// Generate mapping table to relate predicated instructions with their .new
// format.
//
@@ -96,15 +120,39 @@ def getPredNewOpcode : InstrMapping {
}
//===----------------------------------------------------------------------===//
+// Generate mapping table to relate .new predicated instructions with their old
+// format.
+//
+def getPredOldOpcode : InstrMapping {
+ let FilterClass = "PredNewRel";
+ let RowFields = ["BaseOpcode", "PredSense", "isNVStore"];
+ let ColFields = ["PNewValue"];
+ let KeyCol = ["new"];
+ let ValueCols = [[""]];
+}
+
+//===----------------------------------------------------------------------===//
// Generate mapping table to relate store instructions with their new-value
// format.
//
def getNewValueOpcode : InstrMapping {
let FilterClass = "NewValueRel";
let RowFields = ["BaseOpcode", "PredSense", "PNewValue"];
- let ColFields = ["isNVStore"];
- let KeyCol = ["0"];
- let ValueCols = [["1"]];
+ let ColFields = ["NValueST"];
+ let KeyCol = ["false"];
+ let ValueCols = [["true"]];
+}
+
+//===----------------------------------------------------------------------===//
+// Generate mapping table to relate new-value store instructions with their old
+// format.
+//
+def getNonNVStore : InstrMapping {
+ let FilterClass = "NewValueRel";
+ let RowFields = ["BaseOpcode", "PredSense", "PNewValue"];
+ let ColFields = ["NValueST"];
+ let KeyCol = ["true"];
+ let ValueCols = [["false"]];
}
def getBasedWithImmOffset : InstrMapping {
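
Each InstrMapping def above asks TableGen to emit a relation table keyed by the RowFields, with the named column (PredSense, PNewValue, NValueST) selecting the related opcode. A toy, self-contained analogue of the lookup such a table provides; the opcode names below are invented for illustration, and the real accessors are generated into the Hexagon backend rather than written by hand:

    #include <cstdint>
    #include <cstdio>
    #include <unordered_map>

    // Invented opcodes standing in for a true-predicated form and its
    // false-predicated counterpart.
    enum ToyOpcode : uint16_t { ADD_pt = 1, ADD_pf = 2, SUB_pt = 3, SUB_pf = 4 };

    // Analogue of the getFalsePredOpcode relation: map the "true" column key
    // to the "false" column value; -1 means no related opcode exists.
    static int getFalsePredOpcode(uint16_t Opc) {
      static const std::unordered_map<uint16_t, uint16_t> Rel = {
        {ADD_pt, ADD_pf}, {SUB_pt, SUB_pf},
      };
      auto It = Rel.find(Opc);
      return It == Rel.end() ? -1 : It->second;
    }

    int main() {
      std::printf("%d\n", getFalsePredOpcode(ADD_pt)); // 2 (ADD_pf)
      std::printf("%d\n", getFalsePredOpcode(ADD_pf)); // -1 (no mapping from the false form)
      return 0;
    }
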
diff --git a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
index b6022ca..8597f11 100644
--- a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
+++ b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
@@ -26,21 +26,27 @@
using namespace llvm;
+namespace llvm {
+ void initializeHexagonCFGOptimizerPass(PassRegistry&);
+}
+
+
namespace {
class HexagonCFGOptimizer : public MachineFunctionPass {
private:
- HexagonTargetMachine& QTM;
+ const HexagonTargetMachine& QTM;
const HexagonSubtarget &QST;
void InvertAndChangeJumpTarget(MachineInstr*, MachineBasicBlock*);
public:
static char ID;
- HexagonCFGOptimizer(HexagonTargetMachine& TM) : MachineFunctionPass(ID),
- QTM(TM),
- QST(*TM.getSubtargetImpl()) {}
+ HexagonCFGOptimizer(const HexagonTargetMachine& TM)
+ : MachineFunctionPass(ID), QTM(TM), QST(*TM.getSubtargetImpl()) {
+ initializeHexagonCFGOptimizerPass(*PassRegistry::getPassRegistry());
+ }
const char *getPassName() const {
return "Hexagon CFG Optimizer";
@@ -231,6 +237,16 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) {
// Public Constructor Functions
//===----------------------------------------------------------------------===//
-FunctionPass *llvm::createHexagonCFGOptimizer(HexagonTargetMachine &TM) {
+static void initializePassOnce(PassRegistry &Registry) {
+ PassInfo *PI = new PassInfo("Hexagon CFG Optimizer", "hexagon-cfg",
+ &HexagonCFGOptimizer::ID, 0, false, false);
+ Registry.registerPass(*PI, true);
+}
+
+void llvm::initializeHexagonCFGOptimizerPass(PassRegistry &Registry) {
+ CALL_ONCE_INITIALIZATION(initializePassOnce)
+}
+
+FunctionPass *llvm::createHexagonCFGOptimizer(const HexagonTargetMachine &TM) {
return new HexagonCFGOptimizer(TM);
}
diff --git a/lib/Target/Hexagon/HexagonCallingConvLower.cpp b/lib/Target/Hexagon/HexagonCallingConvLower.cpp
index 2c93d04..fc5503a 100644
--- a/lib/Target/Hexagon/HexagonCallingConvLower.cpp
+++ b/lib/Target/Hexagon/HexagonCallingConvLower.cpp
@@ -27,12 +27,11 @@ Hexagon_CCState::Hexagon_CCState(CallingConv::ID CC, bool isVarArg,
const TargetMachine &tm,
SmallVector<CCValAssign, 16> &locs,
LLVMContext &c)
- : CallingConv(CC), IsVarArg(isVarArg), TM(tm),
- TRI(*TM.getRegisterInfo()), Locs(locs), Context(c) {
+ : CallingConv(CC), IsVarArg(isVarArg), TM(tm), Locs(locs), Context(c) {
// No stack is used.
StackOffset = 0;
- UsedRegs.resize((TRI.getNumRegs()+31)/32);
+ UsedRegs.resize((TM.getRegisterInfo()->getNumRegs()+31)/32);
}
// HandleByVal - Allocate a stack slot large enough to pass an argument by
@@ -56,6 +55,7 @@ void Hexagon_CCState::HandleByVal(unsigned ValNo, EVT ValVT,
/// MarkAllocated - Mark a register and all of its aliases as allocated.
void Hexagon_CCState::MarkAllocated(unsigned Reg) {
+ const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI)
UsedRegs[*AI/32] |= 1 << (*AI&31);
}
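
MarkAllocated now looks TRI up on demand and walks MCRegAliasIterator so that allocating a register also marks everything it overlaps. A self-contained sketch of that bookkeeping with a plain bit vector; the register numbers and alias sets below are invented, whereas the real alias walk comes from MCRegisterInfo:

    #include <cstdint>
    #include <vector>

    // Invented overlap sets: index 2 models a double register overlapping 0 and 1.
    static const std::vector<unsigned> &aliasesOf(unsigned Reg) {
      static const std::vector<unsigned> Sets[] = { {0}, {1}, {0, 1, 2} };
      return Sets[Reg];
    }

    // One bit per register, 32 registers per word, as in the pass.
    static void markAllocated(std::vector<uint32_t> &UsedRegs, unsigned Reg) {
      for (unsigned Alias : aliasesOf(Reg))
        UsedRegs[Alias / 32] |= 1u << (Alias % 32);
    }

    int main() {
      std::vector<uint32_t> UsedRegs((3 + 31) / 32);
      markAllocated(UsedRegs, 2); // marking the wide register also marks 0 and 1
      return 0;
    }
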
diff --git a/lib/Target/Hexagon/HexagonCallingConvLower.h b/lib/Target/Hexagon/HexagonCallingConvLower.h
index 489b3a3..eed99f4 100644
--- a/lib/Target/Hexagon/HexagonCallingConvLower.h
+++ b/lib/Target/Hexagon/HexagonCallingConvLower.h
@@ -48,7 +48,6 @@ class Hexagon_CCState {
CallingConv::ID CallingConv;
bool IsVarArg;
const TargetMachine &TM;
- const TargetRegisterInfo &TRI;
SmallVector<CCValAssign, 16> &Locs;
LLVMContext &Context;
diff --git a/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/lib/Target/Hexagon/HexagonCopyToCombine.cpp
new file mode 100644
index 0000000..dc440cb
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonCopyToCombine.cpp
@@ -0,0 +1,677 @@
+//===------- HexagonCopyToCombine.cpp - Hexagon Copy-To-Combine Pass ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass replaces transfer instructions with combine instructions.
+// We walk along a basic block and look for two combinable instructions, trying
+// to move them together. If we can move them next to each other, we do so and
+// replace them with a combine instruction.
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "hexagon-copy-combine"
+
+#include "llvm/PassSupport.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "Hexagon.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonMachineFunctionInfo.h"
+
+using namespace llvm;
+
+static
+cl::opt<bool> IsCombinesDisabled("disable-merge-into-combines",
+ cl::Hidden, cl::ZeroOrMore,
+ cl::init(false),
+ cl::desc("Disable merging into combines"));
+static
+cl::opt<unsigned>
+MaxNumOfInstsBetweenNewValueStoreAndTFR("max-num-inst-between-tfr-and-nv-store",
+ cl::Hidden, cl::init(4),
+ cl::desc("Maximum distance between a tfr feeding a store for "
+ "which we still consider the store newifiable"));
+
+namespace llvm {
+ void initializeHexagonCopyToCombinePass(PassRegistry&);
+}
+
+
+namespace {
+
+class HexagonCopyToCombine : public MachineFunctionPass {
+ const HexagonInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ bool ShouldCombineAggressively;
+
+ DenseSet<MachineInstr *> PotentiallyNewifiableTFR;
+public:
+ static char ID;
+
+ HexagonCopyToCombine() : MachineFunctionPass(ID) {
+ initializeHexagonCopyToCombinePass(*PassRegistry::getPassRegistry());
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ const char *getPassName() const {
+ return "Hexagon Copy-To-Combine Pass";
+ }
+
+ virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+private:
+ MachineInstr *findPairable(MachineInstr *I1, bool &DoInsertAtI1);
+
+ void findPotentialNewifiableTFRs(MachineBasicBlock &);
+
+ void combine(MachineInstr *I1, MachineInstr *I2,
+ MachineBasicBlock::iterator &MI, bool DoInsertAtI1);
+
+ bool isSafeToMoveTogether(MachineInstr *I1, MachineInstr *I2,
+ unsigned I1DestReg, unsigned I2DestReg,
+ bool &DoInsertAtI1);
+
+ void emitCombineRR(MachineBasicBlock::iterator &Before, unsigned DestReg,
+ MachineOperand &HiOperand, MachineOperand &LoOperand);
+
+ void emitCombineRI(MachineBasicBlock::iterator &Before, unsigned DestReg,
+ MachineOperand &HiOperand, MachineOperand &LoOperand);
+
+ void emitCombineIR(MachineBasicBlock::iterator &Before, unsigned DestReg,
+ MachineOperand &HiOperand, MachineOperand &LoOperand);
+
+ void emitCombineII(MachineBasicBlock::iterator &Before, unsigned DestReg,
+ MachineOperand &HiOperand, MachineOperand &LoOperand);
+};
+
+} // End anonymous namespace.
+
+char HexagonCopyToCombine::ID = 0;
+
+INITIALIZE_PASS(HexagonCopyToCombine, "hexagon-copy-combine",
+ "Hexagon Copy-To-Combine Pass", false, false)
+
+static bool isCombinableInstType(MachineInstr *MI,
+ const HexagonInstrInfo *TII,
+ bool ShouldCombineAggressively) {
+ switch(MI->getOpcode()) {
+ case Hexagon::TFR: {
+ // A COPY instruction can be combined if its arguments are IntRegs (32bit).
+ assert(MI->getOperand(0).isReg() && MI->getOperand(1).isReg());
+
+ unsigned DestReg = MI->getOperand(0).getReg();
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ return Hexagon::IntRegsRegClass.contains(DestReg) &&
+ Hexagon::IntRegsRegClass.contains(SrcReg);
+ }
+
+ case Hexagon::TFRI: {
+ // A transfer-immediate can be combined if its argument is a signed 8bit
+ // value.
+ assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
+ unsigned DestReg = MI->getOperand(0).getReg();
+
+ // Only combine constant extended TFRI if we are in aggressive mode.
+ return Hexagon::IntRegsRegClass.contains(DestReg) &&
+ (ShouldCombineAggressively || isInt<8>(MI->getOperand(1).getImm()));
+ }
+
+ case Hexagon::TFRI_V4: {
+ if (!ShouldCombineAggressively)
+ return false;
+ assert(MI->getOperand(0).isReg() && MI->getOperand(1).isGlobal());
+
+ // Ensure that TargetFlags are MO_NO_FLAG for a global. This is a
+ // workaround for an ABI bug that prevents GOT relocations on combine
+ // instructions
+ if (MI->getOperand(1).getTargetFlags() != HexagonII::MO_NO_FLAG)
+ return false;
+
+ unsigned DestReg = MI->getOperand(0).getReg();
+ return Hexagon::IntRegsRegClass.contains(DestReg);
+ }
+
+ default:
+ break;
+ }
+
+ return false;
+}
+
+static bool isGreaterThan8BitTFRI(MachineInstr *I) {
+ return I->getOpcode() == Hexagon::TFRI &&
+ !isInt<8>(I->getOperand(1).getImm());
+}
+static bool isGreaterThan6BitTFRI(MachineInstr *I) {
+ return I->getOpcode() == Hexagon::TFRI &&
+ !isUInt<6>(I->getOperand(1).getImm());
+}
+
+/// areCombinableOperations - Returns true if the two instructions can be merged
+/// into a combine (ignoring register constraints).
+static bool areCombinableOperations(const TargetRegisterInfo *TRI,
+ MachineInstr *HighRegInst,
+ MachineInstr *LowRegInst) {
+ assert((HighRegInst->getOpcode() == Hexagon::TFR ||
+ HighRegInst->getOpcode() == Hexagon::TFRI ||
+ HighRegInst->getOpcode() == Hexagon::TFRI_V4) &&
+ (LowRegInst->getOpcode() == Hexagon::TFR ||
+ LowRegInst->getOpcode() == Hexagon::TFRI ||
+ LowRegInst->getOpcode() == Hexagon::TFRI_V4) &&
+ "Assume individual instructions are of a combinable type");
+
+ const HexagonRegisterInfo *QRI =
+ static_cast<const HexagonRegisterInfo *>(TRI);
+
+ // V4 added some combine variations (mixed immediate and register source
+ // operands). If we are on < V4 we can only combine 2 register-to-register
+ // moves and 2 immediate-to-register moves. We also don't have
+ // constant-extenders.
+ if (!QRI->Subtarget.hasV4TOps())
+ return HighRegInst->getOpcode() == LowRegInst->getOpcode() &&
+ !isGreaterThan8BitTFRI(HighRegInst) &&
+ !isGreaterThan6BitTFRI(LowRegInst);
+
+ // There is no combine of two constant extended values.
+ if ((HighRegInst->getOpcode() == Hexagon::TFRI_V4 ||
+ isGreaterThan8BitTFRI(HighRegInst)) &&
+ (LowRegInst->getOpcode() == Hexagon::TFRI_V4 ||
+ isGreaterThan6BitTFRI(LowRegInst)))
+ return false;
+
+ return true;
+}
+
+static bool isEvenReg(unsigned Reg) {
+ assert(TargetRegisterInfo::isPhysicalRegister(Reg) &&
+ Hexagon::IntRegsRegClass.contains(Reg));
+ return (Reg - Hexagon::R0) % 2 == 0;
+}
+
+static void removeKillInfo(MachineInstr *MI, unsigned RegNotKilled) {
+ for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
+ MachineOperand &Op = MI->getOperand(I);
+ if (!Op.isReg() || Op.getReg() != RegNotKilled || !Op.isKill())
+ continue;
+ Op.setIsKill(false);
+ }
+}
+
+/// isUnsafeToMoveAcross - Returns true if it is unsafe to move a copy
+/// instruction from \p UseReg to \p DestReg over the instruction \p I.
+static bool isUnsafeToMoveAcross(MachineInstr *I, unsigned UseReg,
+ unsigned DestReg,
+ const TargetRegisterInfo *TRI) {
+ return (UseReg && (I->modifiesRegister(UseReg, TRI))) ||
+ I->modifiesRegister(DestReg, TRI) ||
+ I->readsRegister(DestReg, TRI) ||
+ I->hasUnmodeledSideEffects() ||
+ I->isInlineAsm() || I->isDebugValue();
+}
+
+/// isSafeToMoveTogether - Returns true if it is safe to move I1 next to I2 such
+/// that the two instructions can be paired in a combine.
+bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr *I1,
+ MachineInstr *I2,
+ unsigned I1DestReg,
+ unsigned I2DestReg,
+ bool &DoInsertAtI1) {
+
+ bool IsImmUseReg = I2->getOperand(1).isImm() || I2->getOperand(1).isGlobal();
+ unsigned I2UseReg = IsImmUseReg ? 0 : I2->getOperand(1).getReg();
+
+ // It is not safe to move I1 and I2 into one combine if I2 has a true
+ // dependence on I1.
+ if (I2UseReg && I1->modifiesRegister(I2UseReg, TRI))
+ return false;
+
+ bool isSafe = true;
+
+ // First try to move I2 towards I1.
+ {
+ // A reverse_iterator instantiated like below starts before I2 and I1,
+ // respectively.
+ // Look at instructions I in between I2 and (excluding) I1.
+ MachineBasicBlock::reverse_iterator I(I2),
+ End = --(MachineBasicBlock::reverse_iterator(I1));
+ // At O3 we got better results (dhrystone) by being more conservative.
+ if (!ShouldCombineAggressively)
+ End = MachineBasicBlock::reverse_iterator(I1);
+ // If I2 kills its operand and we move I2 over an instruction that also
+ // uses I2's use reg we need to modify that (first) instruction to now kill
+ // this reg.
+ unsigned KilledOperand = 0;
+ if (I2->killsRegister(I2UseReg))
+ KilledOperand = I2UseReg;
+ MachineInstr *KillingInstr = 0;
+
+ for (; I != End; ++I) {
+ // If the intervening instruction I:
+ // * modifies I2's use reg
+ // * modifies I2's def reg
+ // * reads I2's def reg
+ // * or has unmodelled side effects
+ // we can't move I2 across it.
+ if (isUnsafeToMoveAcross(&*I, I2UseReg, I2DestReg, TRI)) {
+ isSafe = false;
+ break;
+ }
+
+ // Update first use of the killed operand.
+ if (!KillingInstr && KilledOperand &&
+ I->readsRegister(KilledOperand, TRI))
+ KillingInstr = &*I;
+ }
+ if (isSafe) {
+ // Update the intermediate instruction with the kill flag.
+ if (KillingInstr) {
+ bool Added = KillingInstr->addRegisterKilled(KilledOperand, TRI, true);
+ (void)Added; // suppress compiler warning
+ assert(Added && "Must successfully update kill flag");
+ removeKillInfo(I2, KilledOperand);
+ }
+ DoInsertAtI1 = true;
+ return true;
+ }
+ }
+
+ // Try to move I1 towards I2.
+ {
+ // Look at instructions I in between I1 and (excluding) I2.
+ MachineBasicBlock::iterator I(I1), End(I2);
+ // At O3 we got better results (dhrystone) by being more conservative here.
+ if (!ShouldCombineAggressively)
+ End = llvm::next(MachineBasicBlock::iterator(I2));
+ IsImmUseReg = I1->getOperand(1).isImm() || I1->getOperand(1).isGlobal();
+ unsigned I1UseReg = IsImmUseReg ? 0 : I1->getOperand(1).getReg();
+ // Track killed operands. If we move across an instruction that kills our
+ // operand, we need to update the kill information on the moved I1. It kills
+ // the operand now.
+ MachineInstr *KillingInstr = 0;
+ unsigned KilledOperand = 0;
+
+ while(++I != End) {
+ // If the intervening instruction I:
+ // * modifies I1's use reg
+ // * modifies I1's def reg
+ // * reads I1's def reg
+ // * or has unmodelled side effects
+ // We introduce this special case because LLVM has no API to remove a
+ // kill flag for a register (a removeRegisterKilled() analogous to
+ // addRegisterKilled) that handles aliased registers correctly.
+ // * or has a killed aliased register use of I1's use reg
+ // %D4<def> = TFRI64 16
+ // %R6<def> = TFR %R9
+ // %R8<def> = KILL %R8, %D4<imp-use,kill>
+ // If we want to move R6 = across the KILL instruction we would have
+ // to remove the %D4<imp-use,kill> operand. For now, we are
+ // conservative and disallow the move.
+ // we can't move I1 across it.
+ if (isUnsafeToMoveAcross(I, I1UseReg, I1DestReg, TRI) ||
+ // Check for an aliased register kill. Bail out if we see one.
+ (!I->killsRegister(I1UseReg) && I->killsRegister(I1UseReg, TRI)))
+ return false;
+
+ // Check for an exact kill (registers match).
+ if (I1UseReg && I->killsRegister(I1UseReg)) {
+ assert(KillingInstr == 0 && "Should only see one killing instruction");
+ KilledOperand = I1UseReg;
+ KillingInstr = &*I;
+ }
+ }
+ if (KillingInstr) {
+ removeKillInfo(KillingInstr, KilledOperand);
+ // Update I1 to set the kill flag. This flag will later be picked up by
+ // the new COMBINE instruction.
+ bool Added = I1->addRegisterKilled(KilledOperand, TRI);
+ (void)Added; // suppress compiler warning
+ assert(Added && "Must successfully update kill flag");
+ }
+ DoInsertAtI1 = false;
+ }
+
+ return true;
+}
+
+/// findPotentialNewifiableTFRs - Finds transfers that feed stores that could be
+/// newified. (A use of a 64-bit register define cannot be newified.)
+void
+HexagonCopyToCombine::findPotentialNewifiableTFRs(MachineBasicBlock &BB) {
+ DenseMap<unsigned, MachineInstr *> LastDef;
+ for (MachineBasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I) {
+ MachineInstr *MI = I;
+ // Mark TFRs that feed a potential new value store as such.
+ if(TII->mayBeNewStore(MI)) {
+ // Look for uses of TFR instructions.
+ for (unsigned OpdIdx = 0, OpdE = MI->getNumOperands(); OpdIdx != OpdE;
+ ++OpdIdx) {
+ MachineOperand &Op = MI->getOperand(OpdIdx);
+
+ // Skip over anything except register uses.
+ if (!Op.isReg() || !Op.isUse() || !Op.getReg())
+ continue;
+
+ // Look for the defining instruction.
+ unsigned Reg = Op.getReg();
+ MachineInstr *DefInst = LastDef[Reg];
+ if (!DefInst)
+ continue;
+ if (!isCombinableInstType(DefInst, TII, ShouldCombineAggressively))
+ continue;
+
+ // Only newifiable stores that are close enough should influence the decision.
+ MachineBasicBlock::iterator It(DefInst);
+ unsigned NumInstsToDef = 0;
+ while (&*It++ != MI)
+ ++NumInstsToDef;
+
+ if (NumInstsToDef > MaxNumOfInstsBetweenNewValueStoreAndTFR)
+ continue;
+
+ PotentiallyNewifiableTFR.insert(DefInst);
+ }
+ // Skip to next instruction.
+ continue;
+ }
+
+ // Put instructions that last defined integer or double registers into the
+ // map.
+ for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
+ MachineOperand &Op = MI->getOperand(I);
+ if (!Op.isReg() || !Op.isDef() || !Op.getReg())
+ continue;
+ unsigned Reg = Op.getReg();
+ if (Hexagon::DoubleRegsRegClass.contains(Reg)) {
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+ LastDef[*SubRegs] = MI;
+ }
+ } else if (Hexagon::IntRegsRegClass.contains(Reg))
+ LastDef[Reg] = MI;
+ }
+ }
+}
+
+bool HexagonCopyToCombine::runOnMachineFunction(MachineFunction &MF) {
+
+ if (IsCombinesDisabled) return false;
+
+ bool HasChanged = false;
+
+ // Get target info.
+ TRI = MF.getTarget().getRegisterInfo();
+ TII = static_cast<const HexagonInstrInfo *>(MF.getTarget().getInstrInfo());
+
+ // Combine aggressively (for code size)
+ ShouldCombineAggressively =
+ MF.getTarget().getOptLevel() <= CodeGenOpt::Default;
+
+ // Traverse basic blocks.
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
+ ++BI) {
+ PotentiallyNewifiableTFR.clear();
+ findPotentialNewifiableTFRs(*BI);
+
+ // Traverse instructions in basic block.
+ for(MachineBasicBlock::iterator MI = BI->begin(), End = BI->end();
+ MI != End;) {
+ MachineInstr *I1 = MI++;
+ // Don't combine a TFR whose user could be newified (instructions that
+ // define double registers cannot be newified - Programmer's Ref Manual
+ // 5.4.2 New-value stores).
+ if (ShouldCombineAggressively && PotentiallyNewifiableTFR.count(I1))
+ continue;
+
+ // Ignore instructions that are not combinable.
+ if (!isCombinableInstType(I1, TII, ShouldCombineAggressively))
+ continue;
+
+ // Find a second instruction that can be merged into a combine
+ // instruction.
+ bool DoInsertAtI1 = false;
+ MachineInstr *I2 = findPairable(I1, DoInsertAtI1);
+ if (I2) {
+ HasChanged = true;
+ combine(I1, I2, MI, DoInsertAtI1);
+ }
+ }
+ }
+
+ return HasChanged;
+}
+
+/// findPairable - Returns an instruction that can be merged with \p I1 into a
+/// COMBINE instruction or 0 if no such instruction can be found. Returns true
+/// in \p DoInsertAtI1 if the combine must be inserted at instruction \p I1,
+/// or false if the combine must be inserted at the returned instruction.
+MachineInstr *HexagonCopyToCombine::findPairable(MachineInstr *I1,
+ bool &DoInsertAtI1) {
+ MachineBasicBlock::iterator I2 = llvm::next(MachineBasicBlock::iterator(I1));
+ unsigned I1DestReg = I1->getOperand(0).getReg();
+
+ for (MachineBasicBlock::iterator End = I1->getParent()->end(); I2 != End;
+ ++I2) {
+ // Bail out early if we see a second definition of I1DestReg.
+ if (I2->modifiesRegister(I1DestReg, TRI))
+ break;
+
+ // Ignore non-combinable instructions.
+ if (!isCombinableInstType(I2, TII, ShouldCombineAggressively))
+ continue;
+
+ // Don't combine a TFR whose user could be newified.
+ if (ShouldCombineAggressively && PotentiallyNewifiableTFR.count(I2))
+ continue;
+
+ unsigned I2DestReg = I2->getOperand(0).getReg();
+
+ // Check that registers are adjacent and that the first destination register
+ // is even.
+ bool IsI1LowReg = (I2DestReg - I1DestReg) == 1;
+ bool IsI2LowReg = (I1DestReg - I2DestReg) == 1;
+ unsigned FirstRegIndex = IsI1LowReg ? I1DestReg : I2DestReg;
+ if ((!IsI1LowReg && !IsI2LowReg) || !isEvenReg(FirstRegIndex))
+ continue;
+
+ // Check that the two instructions are combinable. V4 allows more
+ // instructions to be merged into a combine.
+ // The order matters because in a TFRI we can encode an int8 as the
+ // hi reg operand but only a uint6 as the low reg operand.
+ if ((IsI2LowReg && !areCombinableOperations(TRI, I1, I2)) ||
+ (IsI1LowReg && !areCombinableOperations(TRI, I2, I1)))
+ break;
+
+ if (isSafeToMoveTogether(I1, I2, I1DestReg, I2DestReg,
+ DoInsertAtI1))
+ return I2;
+
+ // Not safe. Stop searching.
+ break;
+ }
+ return 0;
+}
+
+void HexagonCopyToCombine::combine(MachineInstr *I1, MachineInstr *I2,
+ MachineBasicBlock::iterator &MI,
+ bool DoInsertAtI1) {
+ // We are going to delete I2. If MI points to I2 advance it to the next
+ // instruction.
+ if ((MachineInstr *)MI == I2) ++MI;
+
+ // Figure out whether I1 or I2 goes into the lowreg part.
+ unsigned I1DestReg = I1->getOperand(0).getReg();
+ unsigned I2DestReg = I2->getOperand(0).getReg();
+ bool IsI1Loreg = (I2DestReg - I1DestReg) == 1;
+ unsigned LoRegDef = IsI1Loreg ? I1DestReg : I2DestReg;
+
+ // Get the double word register.
+ unsigned DoubleRegDest =
+ TRI->getMatchingSuperReg(LoRegDef, Hexagon::subreg_loreg,
+ &Hexagon::DoubleRegsRegClass);
+ assert(DoubleRegDest != 0 && "Expect a valid register");
+
+
+ // Setup source operands.
+ MachineOperand &LoOperand = IsI1Loreg ? I1->getOperand(1) :
+ I2->getOperand(1);
+ MachineOperand &HiOperand = IsI1Loreg ? I2->getOperand(1) :
+ I1->getOperand(1);
+
+ // Figure out which source is a register and which a constant.
+ bool IsHiReg = HiOperand.isReg();
+ bool IsLoReg = LoOperand.isReg();
+
+ MachineBasicBlock::iterator InsertPt(DoInsertAtI1 ? I1 : I2);
+ // Emit combine.
+ if (IsHiReg && IsLoReg)
+ emitCombineRR(InsertPt, DoubleRegDest, HiOperand, LoOperand);
+ else if (IsHiReg)
+ emitCombineRI(InsertPt, DoubleRegDest, HiOperand, LoOperand);
+ else if (IsLoReg)
+ emitCombineIR(InsertPt, DoubleRegDest, HiOperand, LoOperand);
+ else
+ emitCombineII(InsertPt, DoubleRegDest, HiOperand, LoOperand);
+
+ I1->eraseFromParent();
+ I2->eraseFromParent();
+}
+
+void HexagonCopyToCombine::emitCombineII(MachineBasicBlock::iterator &InsertPt,
+ unsigned DoubleDestReg,
+ MachineOperand &HiOperand,
+ MachineOperand &LoOperand) {
+ DebugLoc DL = InsertPt->getDebugLoc();
+ MachineBasicBlock *BB = InsertPt->getParent();
+
+ // Handle globals.
+ if (HiOperand.isGlobal()) {
+ BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_Ii), DoubleDestReg)
+ .addGlobalAddress(HiOperand.getGlobal(), HiOperand.getOffset(),
+ HiOperand.getTargetFlags())
+ .addImm(LoOperand.getImm());
+ return;
+ }
+ if (LoOperand.isGlobal()) {
+ BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_iI_V4), DoubleDestReg)
+ .addImm(HiOperand.getImm())
+ .addGlobalAddress(LoOperand.getGlobal(), LoOperand.getOffset(),
+ LoOperand.getTargetFlags());
+ return;
+ }
+
+ // Handle constant extended immediates.
+ if (!isInt<8>(HiOperand.getImm())) {
+ assert(isInt<8>(LoOperand.getImm()));
+ BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_Ii), DoubleDestReg)
+ .addImm(HiOperand.getImm())
+ .addImm(LoOperand.getImm());
+ return;
+ }
+
+ if (!isUInt<6>(LoOperand.getImm())) {
+ assert(isInt<8>(HiOperand.getImm()));
+ BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_iI_V4), DoubleDestReg)
+ .addImm(HiOperand.getImm())
+ .addImm(LoOperand.getImm());
+ return;
+ }
+
+ // Insert new combine instruction.
+ // DoubleRegDest = combine #HiImm, #LoImm
+ BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_Ii), DoubleDestReg)
+ .addImm(HiOperand.getImm())
+ .addImm(LoOperand.getImm());
+}
+
+void HexagonCopyToCombine::emitCombineIR(MachineBasicBlock::iterator &InsertPt,
+ unsigned DoubleDestReg,
+ MachineOperand &HiOperand,
+ MachineOperand &LoOperand) {
+ unsigned LoReg = LoOperand.getReg();
+ unsigned LoRegKillFlag = getKillRegState(LoOperand.isKill());
+
+ DebugLoc DL = InsertPt->getDebugLoc();
+ MachineBasicBlock *BB = InsertPt->getParent();
+
+ // Handle global.
+ if (HiOperand.isGlobal()) {
+ BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_Ir_V4), DoubleDestReg)
+ .addGlobalAddress(HiOperand.getGlobal(), HiOperand.getOffset(),
+ HiOperand.getTargetFlags())
+ .addReg(LoReg, LoRegKillFlag);
+ return;
+ }
+ // Insert new combine instruction.
+ // DoubleRegDest = combine #HiImm, LoReg
+ BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_Ir_V4), DoubleDestReg)
+ .addImm(HiOperand.getImm())
+ .addReg(LoReg, LoRegKillFlag);
+}
+
+void HexagonCopyToCombine::emitCombineRI(MachineBasicBlock::iterator &InsertPt,
+ unsigned DoubleDestReg,
+ MachineOperand &HiOperand,
+ MachineOperand &LoOperand) {
+ unsigned HiRegKillFlag = getKillRegState(HiOperand.isKill());
+ unsigned HiReg = HiOperand.getReg();
+
+ DebugLoc DL = InsertPt->getDebugLoc();
+ MachineBasicBlock *BB = InsertPt->getParent();
+
+ // Handle global.
+ if (LoOperand.isGlobal()) {
+ BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_rI_V4), DoubleDestReg)
+ .addReg(HiReg, HiRegKillFlag)
+ .addGlobalAddress(LoOperand.getGlobal(), LoOperand.getOffset(),
+ LoOperand.getTargetFlags());
+ return;
+ }
+
+ // Insert new combine instruction.
+ // DoubleRegDest = combine HiReg, #LoImm
+ BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_rI_V4), DoubleDestReg)
+ .addReg(HiReg, HiRegKillFlag)
+ .addImm(LoOperand.getImm());
+}
+
+void HexagonCopyToCombine::emitCombineRR(MachineBasicBlock::iterator &InsertPt,
+ unsigned DoubleDestReg,
+ MachineOperand &HiOperand,
+ MachineOperand &LoOperand) {
+ unsigned LoRegKillFlag = getKillRegState(LoOperand.isKill());
+ unsigned HiRegKillFlag = getKillRegState(HiOperand.isKill());
+ unsigned LoReg = LoOperand.getReg();
+ unsigned HiReg = HiOperand.getReg();
+
+ DebugLoc DL = InsertPt->getDebugLoc();
+ MachineBasicBlock *BB = InsertPt->getParent();
+
+ // Insert new combine instruction.
+ // DoubleRegDest = combine HiReg, LoReg
+ BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_rr), DoubleDestReg)
+ .addReg(HiReg, HiRegKillFlag)
+ .addReg(LoReg, LoRegKillFlag);
+}
+
+FunctionPass *llvm::createHexagonCopyToCombine() {
+ return new HexagonCopyToCombine();
+}
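
The pairing rule at the heart of findPairable is that the two transfer destinations must be consecutive IntRegs and the lower-numbered one must be even, so that together they name a DoubleReg (r1:r0 is d0, r3:r2 is d1, and so on). A standalone sketch of just that check, with small integers standing in for Hexagon::R0 + n; the real pass additionally checks operand encodings (areCombinableOperations) and that the two instructions can safely be moved next to each other:

    #include <cstdio>

    // Decide whether destinations r<A> and r<B> can fuse into one double register,
    // and if so which of them supplies the low word (sketch of findPairable's test).
    static bool canPair(unsigned RegA, unsigned RegB, unsigned &LoReg) {
      bool AIsLow = (RegB == RegA + 1); // rB:rA with rA as the low word
      bool BIsLow = (RegA == RegB + 1); // rA:rB with rB as the low word
      if (!AIsLow && !BIsLow)
        return false;                   // registers are not adjacent
      LoReg = AIsLow ? RegA : RegB;
      return LoReg % 2 == 0;            // the low half of a d-register is even
    }

    int main() {
      unsigned Lo;
      std::printf("%d\n", canPair(0, 1, Lo)); // 1: r1:r0 forms d0
      std::printf("%d\n", canPair(1, 2, Lo)); // 0: low register r1 is odd
      std::printf("%d\n", canPair(4, 6, Lo)); // 0: not adjacent
      return 0;
    }
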
diff --git a/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp b/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp
index 0814421..8a5991f 100644
--- a/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp
+++ b/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp
@@ -41,16 +41,24 @@
using namespace llvm;
+namespace llvm {
+ void initializeHexagonExpandPredSpillCodePass(PassRegistry&);
+}
+
+
namespace {
class HexagonExpandPredSpillCode : public MachineFunctionPass {
- HexagonTargetMachine& QTM;
+ const HexagonTargetMachine& QTM;
const HexagonSubtarget &QST;
public:
static char ID;
- HexagonExpandPredSpillCode(HexagonTargetMachine& TM) :
- MachineFunctionPass(ID), QTM(TM), QST(*TM.getSubtargetImpl()) {}
+ HexagonExpandPredSpillCode(const HexagonTargetMachine& TM) :
+ MachineFunctionPass(ID), QTM(TM), QST(*TM.getSubtargetImpl()) {
+ PassRegistry &Registry = *PassRegistry::getPassRegistry();
+ initializeHexagonExpandPredSpillCodePass(Registry);
+ }
const char *getPassName() const {
return "Hexagon Expand Predicate Spill Code";
@@ -175,6 +183,19 @@ bool HexagonExpandPredSpillCode::runOnMachineFunction(MachineFunction &Fn) {
// Public Constructor Functions
//===----------------------------------------------------------------------===//
-FunctionPass *llvm::createHexagonExpandPredSpillCode(HexagonTargetMachine &TM) {
+static void initializePassOnce(PassRegistry &Registry) {
+ const char *Name = "Hexagon Expand Predicate Spill Code";
+ PassInfo *PI = new PassInfo(Name, "hexagon-spill-pred",
+ &HexagonExpandPredSpillCode::ID,
+ 0, false, false);
+ Registry.registerPass(*PI, true);
+}
+
+void llvm::initializeHexagonExpandPredSpillCodePass(PassRegistry &Registry) {
+ CALL_ONCE_INITIALIZATION(initializePassOnce)
+}
+
+FunctionPass*
+llvm::createHexagonExpandPredSpillCode(const HexagonTargetMachine &TM) {
return new HexagonExpandPredSpillCode(TM);
}
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp
index de993ee8..2b04f25 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -76,17 +76,12 @@ void HexagonFrameLowering::determineFrameLayout(MachineFunction &MF) const {
void HexagonFrameLowering::emitPrologue(MachineFunction &MF) const {
MachineBasicBlock &MBB = MF.front();
MachineFrameInfo *MFI = MF.getFrameInfo();
- MachineModuleInfo &MMI = MF.getMMI();
MachineBasicBlock::iterator MBBI = MBB.begin();
const HexagonRegisterInfo *QRI =
static_cast<const HexagonRegisterInfo *>(MF.getTarget().getRegisterInfo());
DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
determineFrameLayout(MF);
- // Check if frame moves are needed for EH.
- bool needsFrameMoves = MMI.hasDebugInfo() ||
- !MF.getFunction()->needsUnwindTableEntry();
-
// Get the number of bytes to allocate from the FrameInfo.
int NumBytes = (int) MFI->getStackSize();
@@ -113,28 +108,6 @@ void HexagonFrameLowering::emitPrologue(MachineFunction &MF) const {
MO.setImm(MFI->getMaxCallFrameSize());
}
- std::vector<MachineMove> &Moves = MMI.getFrameMoves();
-
- if (needsFrameMoves) {
- // Advance CFA. DW_CFA_def_cfa
- unsigned FPReg = QRI->getFrameRegister();
- unsigned RAReg = QRI->getRARegister();
-
- MachineLocation Dst(MachineLocation::VirtualFP);
- MachineLocation Src(FPReg, -8);
- Moves.push_back(MachineMove(0, Dst, Src));
-
- // R31 = (R31 - #4)
- MachineLocation LRDst(RAReg, -4);
- MachineLocation LRSrc(RAReg);
- Moves.push_back(MachineMove(0, LRDst, LRSrc));
-
- // R30 = (R30 - #8)
- MachineLocation SPDst(FPReg, -8);
- MachineLocation SPSrc(FPReg);
- Moves.push_back(MachineMove(0, SPDst, SPSrc));
- }
-
//
// Only insert ALLOCFRAME if we need to.
//
@@ -174,30 +147,55 @@ void HexagonFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock::iterator MBBI = prior(MBB.end());
DebugLoc dl = MBBI->getDebugLoc();
//
- // Only insert deallocframe if we need to.
+ // Only insert deallocframe if we need to. Also at -O0. See comment
+ // in emitPrologue above.
//
- if (hasFP(MF)) {
+ if (hasFP(MF) || MF.getTarget().getOptLevel() == CodeGenOpt::None) {
MachineBasicBlock::iterator MBBI = prior(MBB.end());
MachineBasicBlock::iterator MBBI_end = MBB.end();
- //
- // For Hexagon, we don't need the frame size.
- //
- MachineFrameInfo *MFI = MF.getFrameInfo();
- int NumBytes = (int) MFI->getStackSize();
const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
-
+ // Handle EH_RETURN.
+ if (MBBI->getOpcode() == Hexagon::EH_RETURN_JMPR) {
+ assert(MBBI->getOperand(0).isReg() && "Offset should be in register!");
+ BuildMI(MBB, MBBI, dl, TII.get(Hexagon::DEALLOCFRAME));
+ BuildMI(MBB, MBBI, dl, TII.get(Hexagon::ADD_rr),
+ Hexagon::R29).addReg(Hexagon::R29).addReg(Hexagon::R28);
+ return;
+ }
// Replace 'jumpr r31' instruction with dealloc_return for V4 and higher
// versions.
if (STI.hasV4TOps() && MBBI->getOpcode() == Hexagon::JMPret
&& !DisableDeallocRet) {
- // Remove jumpr node.
- MBB.erase(MBBI);
+ // Check for RESTORE_DEALLOC_RET_JMP_V4 call. Don't emit an extra DEALLOC
+ // instruction if we encounter it.
+ MachineBasicBlock::iterator BeforeJMPR =
+ MBB.begin() == MBBI ? MBBI : prior(MBBI);
+ if (BeforeJMPR != MBBI &&
+ BeforeJMPR->getOpcode() == Hexagon::RESTORE_DEALLOC_RET_JMP_V4) {
+ // Remove the JMPR node.
+ MBB.erase(MBBI);
+ return;
+ }
+
// Add dealloc_return.
- BuildMI(MBB, MBBI_end, dl, TII.get(Hexagon::DEALLOC_RET_V4))
- .addImm(NumBytes);
- } else { // Add deallocframe for V2 and V3.
- BuildMI(MBB, MBBI, dl, TII.get(Hexagon::DEALLOCFRAME)).addImm(NumBytes);
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI_end, dl, TII.get(Hexagon::DEALLOC_RET_V4));
+ // Transfer the function live-out registers.
+ MIB->copyImplicitOps(*MBB.getParent(), &*MBBI);
+ // Remove the JUMPR node.
+ MBB.erase(MBBI);
+ } else { // Add deallocframe for V2 and V3, and V4 tail calls.
+ // Check for RESTORE_DEALLOC_BEFORE_TAILCALL_V4. We don't need an extra
+ // DEALLOCFRAME instruction after it.
+ MachineBasicBlock::iterator Term = MBB.getFirstTerminator();
+ MachineBasicBlock::iterator I =
+ Term == MBB.begin() ? MBB.end() : prior(Term);
+ if (I != MBB.end() &&
+ I->getOpcode() == Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4)
+ return;
+
+ BuildMI(MBB, MBBI, dl, TII.get(Hexagon::DEALLOCFRAME));
}
}
}
diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index ba6c100..22740b7 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -49,16 +49,14 @@ class HexagonDAGToDAGISel : public SelectionDAGISel {
const HexagonSubtarget &Subtarget;
// Keep a reference to HexagonTargetMachine.
- HexagonTargetMachine& TM;
- const HexagonInstrInfo *TII;
+ const HexagonTargetMachine& TM;
DenseMap<const GlobalValue *, unsigned> GlobalAddressUseCountMap;
public:
- explicit HexagonDAGToDAGISel(HexagonTargetMachine &targetmachine,
+ explicit HexagonDAGToDAGISel(const HexagonTargetMachine &targetmachine,
CodeGenOpt::Level OptLevel)
: SelectionDAGISel(targetmachine, OptLevel),
Subtarget(targetmachine.getSubtarget<HexagonSubtarget>()),
- TM(targetmachine),
- TII(static_cast<const HexagonInstrInfo*>(TM.getInstrInfo())) {
+ TM(targetmachine) {
initializeHexagonDAGToDAGISelPass(*PassRegistry::getPassRegistry());
}
bool hasNumUsesBelowThresGA(SDNode *N) const;
@@ -92,14 +90,14 @@ public:
bool SelectAddr(SDNode *Op, SDValue Addr, SDValue &Base, SDValue &Offset);
SDNode *SelectLoad(SDNode *N);
- SDNode *SelectBaseOffsetLoad(LoadSDNode *LD, DebugLoc dl);
- SDNode *SelectIndexedLoad(LoadSDNode *LD, DebugLoc dl);
+ SDNode *SelectBaseOffsetLoad(LoadSDNode *LD, SDLoc dl);
+ SDNode *SelectIndexedLoad(LoadSDNode *LD, SDLoc dl);
SDNode *SelectIndexedLoadZeroExtend64(LoadSDNode *LD, unsigned Opcode,
- DebugLoc dl);
+ SDLoc dl);
SDNode *SelectIndexedLoadSignExtend64(LoadSDNode *LD, unsigned Opcode,
- DebugLoc dl);
- SDNode *SelectBaseOffsetStore(StoreSDNode *ST, DebugLoc dl);
- SDNode *SelectIndexedStore(StoreSDNode *ST, DebugLoc dl);
+ SDLoc dl);
+ SDNode *SelectBaseOffsetStore(StoreSDNode *ST, SDLoc dl);
+ SDNode *SelectIndexedStore(StoreSDNode *ST, SDLoc dl);
SDNode *SelectStore(SDNode *N);
SDNode *SelectSHL(SDNode *N);
SDNode *SelectSelect(SDNode *N);
@@ -180,7 +178,7 @@ inline SDValue XformUToUM1Imm(unsigned Imm) {
/// createHexagonISelDag - This pass converts a legalized DAG into a
/// Hexagon-specific DAG, ready for instruction scheduling.
///
-FunctionPass *llvm::createHexagonISelDag(HexagonTargetMachine &TM,
+FunctionPass *llvm::createHexagonISelDag(const HexagonTargetMachine &TM,
CodeGenOpt::Level OptLevel) {
return new HexagonDAGToDAGISel(TM, OptLevel);
}
@@ -385,7 +383,7 @@ static bool OffsetFitsS11(EVT MemType, int64_t Offset) {
// lowering for GlobalAddress nodes has already turned it into a
// CONST32.
//
-SDNode *HexagonDAGToDAGISel::SelectBaseOffsetLoad(LoadSDNode *LD, DebugLoc dl) {
+SDNode *HexagonDAGToDAGISel::SelectBaseOffsetLoad(LoadSDNode *LD, SDLoc dl) {
SDValue Chain = LD->getChain();
SDNode* Const32 = LD->getBasePtr().getNode();
unsigned Opcode = 0;
@@ -396,7 +394,7 @@ SDNode *HexagonDAGToDAGISel::SelectBaseOffsetLoad(LoadSDNode *LD, DebugLoc dl) {
EVT LoadedVT = LD->getMemoryVT();
int64_t Offset = cast<GlobalAddressSDNode>(Base)->getOffset();
if (Offset != 0 && OffsetFitsS11(LoadedVT, Offset)) {
- MVT PointerTy = TLI.getPointerTy();
+ MVT PointerTy = TLI->getPointerTy();
const GlobalValue* GV =
cast<GlobalAddressSDNode>(Base)->getGlobal();
SDValue TargAddr =
@@ -433,7 +431,7 @@ SDNode *HexagonDAGToDAGISel::SelectBaseOffsetLoad(LoadSDNode *LD, DebugLoc dl) {
SDNode *HexagonDAGToDAGISel::SelectIndexedLoadSignExtend64(LoadSDNode *LD,
unsigned Opcode,
- DebugLoc dl)
+ SDLoc dl)
{
SDValue Chain = LD->getChain();
EVT LoadedVT = LD->getMemoryVT();
@@ -444,6 +442,9 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoadSignExtend64(LoadSDNode *LD,
SDValue N1 = LD->getOperand(1);
SDValue CPTmpN1_0;
SDValue CPTmpN1_1;
+
+ const HexagonInstrInfo *TII =
+ static_cast<const HexagonInstrInfo*>(TM.getInstrInfo());
if (SelectADDRriS11_2(N1, CPTmpN1_0, CPTmpN1_1) &&
N1.getNode()->getValueType(0) == MVT::i32) {
if (TII->isValidAutoIncImm(LoadedVT, Val)) {
@@ -497,7 +498,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoadSignExtend64(LoadSDNode *LD,
SDNode *HexagonDAGToDAGISel::SelectIndexedLoadZeroExtend64(LoadSDNode *LD,
unsigned Opcode,
- DebugLoc dl)
+ SDLoc dl)
{
SDValue Chain = LD->getChain();
EVT LoadedVT = LD->getMemoryVT();
@@ -508,6 +509,9 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoadZeroExtend64(LoadSDNode *LD,
SDValue N1 = LD->getOperand(1);
SDValue CPTmpN1_0;
SDValue CPTmpN1_1;
+
+ const HexagonInstrInfo *TII =
+ static_cast<const HexagonInstrInfo*>(TM.getInstrInfo());
if (SelectADDRriS11_2(N1, CPTmpN1_0, CPTmpN1_1) &&
N1.getNode()->getValueType(0) == MVT::i32) {
if (TII->isValidAutoIncImm(LoadedVT, Val)) {
@@ -572,7 +576,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoadZeroExtend64(LoadSDNode *LD,
}
-SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, DebugLoc dl) {
+SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, SDLoc dl) {
SDValue Chain = LD->getChain();
SDValue Base = LD->getBasePtr();
SDValue Offset = LD->getOffset();
@@ -586,6 +590,8 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, DebugLoc dl) {
bool zextval = (LD->getExtensionType() == ISD::ZEXTLOAD);
// Figure out the opcode.
+ const HexagonInstrInfo *TII =
+ static_cast<const HexagonInstrInfo*>(TM.getInstrInfo());
if (LoadedVT == MVT::i64) {
if (TII->isValidAutoIncImm(LoadedVT, Val))
Opcode = Hexagon::POST_LDrid;
@@ -667,7 +673,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, DebugLoc dl) {
SDNode *HexagonDAGToDAGISel::SelectLoad(SDNode *N) {
SDNode *result;
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
LoadSDNode *LD = cast<LoadSDNode>(N);
ISD::MemIndexedMode AM = LD->getAddressingMode();
@@ -682,7 +688,7 @@ SDNode *HexagonDAGToDAGISel::SelectLoad(SDNode *N) {
}
-SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, DebugLoc dl) {
+SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, SDLoc dl) {
SDValue Chain = ST->getChain();
SDValue Base = ST->getBasePtr();
SDValue Offset = ST->getOffset();
@@ -694,6 +700,8 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, DebugLoc dl) {
// Offset value must be within representable range
// and must have correct alignment properties.
+ const HexagonInstrInfo *TII =
+ static_cast<const HexagonInstrInfo*>(TM.getInstrInfo());
if (TII->isValidAutoIncImm(StoredVT, Val)) {
SDValue Ops[] = {Base, CurDAG->getTargetConstant(Val, MVT::i32), Value,
Chain};
@@ -751,7 +759,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, DebugLoc dl) {
SDNode *HexagonDAGToDAGISel::SelectBaseOffsetStore(StoreSDNode *ST,
- DebugLoc dl) {
+ SDLoc dl) {
SDValue Chain = ST->getChain();
SDNode* Const32 = ST->getBasePtr().getNode();
SDValue Value = ST->getValue();
@@ -769,7 +777,7 @@ SDNode *HexagonDAGToDAGISel::SelectBaseOffsetStore(StoreSDNode *ST,
EVT StoredVT = ST->getMemoryVT();
int64_t Offset = cast<GlobalAddressSDNode>(Base)->getOffset();
if (Offset != 0 && OffsetFitsS11(StoredVT, Offset)) {
- MVT PointerTy = TLI.getPointerTy();
+ MVT PointerTy = TLI->getPointerTy();
const GlobalValue* GV =
cast<GlobalAddressSDNode>(Base)->getGlobal();
SDValue TargAddr =
@@ -805,7 +813,7 @@ SDNode *HexagonDAGToDAGISel::SelectBaseOffsetStore(StoreSDNode *ST,
SDNode *HexagonDAGToDAGISel::SelectStore(SDNode *N) {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
StoreSDNode *ST = cast<StoreSDNode>(N);
ISD::MemIndexedMode AM = ST->getAddressingMode();
@@ -818,7 +826,7 @@ SDNode *HexagonDAGToDAGISel::SelectStore(SDNode *N) {
}
SDNode *HexagonDAGToDAGISel::SelectMul(SDNode *N) {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
//
// %conv.i = sext i32 %tmp1 to i64
@@ -902,7 +910,7 @@ SDNode *HexagonDAGToDAGISel::SelectMul(SDNode *N) {
SDNode *HexagonDAGToDAGISel::SelectSelect(SDNode *N) {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
SDValue N0 = N->getOperand(0);
if (N0.getOpcode() == ISD::SETCC) {
SDValue N00 = N0.getOperand(0);
@@ -969,7 +977,7 @@ SDNode *HexagonDAGToDAGISel::SelectSelect(SDNode *N) {
SDNode *HexagonDAGToDAGISel::SelectTruncate(SDNode *N) {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
SDValue Shift = N->getOperand(0);
//
@@ -1082,7 +1090,7 @@ SDNode *HexagonDAGToDAGISel::SelectTruncate(SDNode *N) {
SDNode *HexagonDAGToDAGISel::SelectSHL(SDNode *N) {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
if (N->getValueType(0) == MVT::i32) {
SDValue Shl_0 = N->getOperand(0);
SDValue Shl_1 = N->getOperand(1);
@@ -1158,7 +1166,7 @@ SDNode *HexagonDAGToDAGISel::SelectSHL(SDNode *N) {
// We want to preserve all the lower 8-bits and, not just 1 LSB bit.
//
SDNode *HexagonDAGToDAGISel::SelectZeroExtend(SDNode *N) {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
SDNode *IsIntrinsic = N->getOperand(0).getNode();
if ((IsIntrinsic->getOpcode() == ISD::INTRINSIC_WO_CHAIN)) {
unsigned ID =
@@ -1201,12 +1209,14 @@ SDNode *HexagonDAGToDAGISel::SelectZeroExtend(SDNode *N) {
// and lowering to the actual intrinsic.
//
SDNode *HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
unsigned ID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
unsigned IntrinsicWithPred = doesIntrinsicContainPredicate(ID);
// We are concerned with only those intrinsics that have predicate registers
// as at least one of the operands.
+ const HexagonInstrInfo *TII =
+ static_cast<const HexagonInstrInfo*>(TM.getInstrInfo());
if (IntrinsicWithPred) {
SmallVector<SDValue, 8> Ops;
const MCInstrDesc &MCID = TII->get(IntrinsicWithPred);
@@ -1251,7 +1261,7 @@ SDNode *HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) {
// Map floating point constant values.
//
SDNode *HexagonDAGToDAGISel::SelectConstantFP(SDNode *N) {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N);
APFloat APF = CN->getValueAPF();
if (N->getValueType(0) == MVT::f32) {
@@ -1271,7 +1281,7 @@ SDNode *HexagonDAGToDAGISel::SelectConstantFP(SDNode *N) {
// Map predicate true (encoded as -1 in LLVM) to a XOR.
//
SDNode *HexagonDAGToDAGISel::SelectConstant(SDNode *N) {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
if (N->getValueType(0) == MVT::i1) {
SDNode* Result;
int32_t Val = cast<ConstantSDNode>(N)->getSExtValue();
@@ -1310,7 +1320,7 @@ SDNode *HexagonDAGToDAGISel::SelectConstant(SDNode *N) {
// Map add followed by a asr -> asr +=.
//
SDNode *HexagonDAGToDAGISel::SelectAdd(SDNode *N) {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
if (N->getValueType(0) != MVT::i32) {
return SelectCode(N);
}
@@ -1660,7 +1670,7 @@ bool HexagonDAGToDAGISel::foldGlobalAddressImpl(SDValue &N, SDValue &R,
!hasNumUsesBelowThresGA(GA))
return false;
R = CurDAG->getTargetGlobalAddress(GA->getGlobal(),
- Const->getDebugLoc(),
+ SDLoc(Const),
N.getValueType(),
GA->getOffset() +
(uint64_t)Const->getSExtValue());
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index 0e5b8dc..2b0fa5e 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -285,7 +285,7 @@ const {
static SDValue
CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
- DebugLoc dl) {
+ SDLoc dl) {
SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
@@ -302,7 +302,7 @@ HexagonTargetLowering::LowerReturn(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc dl, SelectionDAG &DAG) const {
+ SDLoc dl, SelectionDAG &DAG) const {
// CCValAssign - represent the assignment of the return value to locations.
SmallVector<CCValAssign, 16> RVLocs;
@@ -351,7 +351,7 @@ HexagonTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const
SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals,
const SmallVectorImpl<SDValue> &OutVals,
SDValue Callee) const {
@@ -382,7 +382,7 @@ SDValue
HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
- DebugLoc &dl = CLI.DL;
+ SDLoc &dl = CLI.DL;
SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
@@ -513,7 +513,8 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (!isTailCall)
Chain = DAG.getCALLSEQ_START(Chain, DAG.getConstant(NumBytes,
- getPointerTy(), true));
+ getPointerTy(), true),
+ dl);
// Build a sequence of copy-to-reg nodes chained together with token
// chain and flag operands which copy the outgoing args into registers.
@@ -588,7 +589,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Create the CALLSEQ_END node.
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
- DAG.getIntPtrConstant(0, true), InFlag);
+ DAG.getIntPtrConstant(0, true), InFlag, dl);
InFlag = Chain.getValue(1);
// Handle result values, copying them out of physregs into vregs that we
@@ -730,7 +731,7 @@ LowerBR_JT(SDValue Op, SelectionDAG &DAG) const
SDValue Chain = Op.getOperand(0);
SDValue Table = Op.getOperand(1);
SDValue Index = Op.getOperand(2);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
unsigned JTI = JT->getIndex();
MachineFunction &MF = DAG.getMachineFunction();
@@ -766,7 +767,7 @@ HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
unsigned SPReg = getStackPointerRegisterToSaveRestore();
@@ -812,7 +813,7 @@ HexagonTargetLowering::LowerFormalArguments(SDValue Chain,
bool isVarArg,
const
SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals)
const {
@@ -925,7 +926,7 @@ HexagonTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
HexagonMachineFunctionInfo *QFI = MF.getInfo<HexagonMachineFunctionInfo>();
SDValue Addr = DAG.getFrameIndex(QFI->getVarArgsFrameIndex(), MVT::i32);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
- return DAG.getStore(Op.getOperand(0), Op.getDebugLoc(), Addr,
+ return DAG.getStore(Op.getOperand(0), SDLoc(Op), Addr,
Op.getOperand(1), MachinePointerInfo(SV), false,
false, 0);
}
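
Note: the hunks in this file all make the same mechanical substitution: the lowering helpers stop receiving or extracting a DebugLoc and instead construct an SDLoc directly from the node being lowered. A minimal standalone analogue of that wrapper pattern, using invented demo types rather than LLVM API:

// Invented DemoNode/DemoLoc stand in for SDNode/SDLoc; the point is only the
// construct-from-node pattern that replaces copying a DebugLoc around.
#include <cassert>

struct DemoNode { int DebugLine; };

struct DemoLoc {
  explicit DemoLoc(const DemoNode &N) : Line(N.DebugLine) {}
  int Line;
};

static int lowerSomething(const DemoNode &Op) {
  DemoLoc dl(Op);   // was: int dl = Op.DebugLine;
  return dl.Line;   // every node created while lowering Op reuses dl
}

int main() {
  DemoNode N{42};
  assert(lowerSomething(N) == 42);
  return 0;
}
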
@@ -937,7 +938,7 @@ HexagonTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue CC = Op.getOperand(4);
SDValue TrueVal = Op.getOperand(2);
SDValue FalseVal = Op.getOperand(3);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDNode* OpNode = Op.getNode();
EVT SVT = OpNode->getValueType(0);
@@ -948,8 +949,7 @@ HexagonTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue
HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
EVT ValTy = Op.getValueType();
-
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
SDValue Res;
if (CP->isMachineConstantPoolEntry())
@@ -969,7 +969,7 @@ HexagonTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
MFI->setReturnAddressIsTaken(true);
EVT VT = Op.getValueType();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
if (Depth) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
@@ -991,7 +991,7 @@ HexagonTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
MFI->setFrameAddressIsTaken(true);
EVT VT = Op.getValueType();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
TRI->getFrameRegister(), VT);
@@ -1004,7 +1004,7 @@ HexagonTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
SDValue HexagonTargetLowering::LowerATOMIC_FENCE(SDValue Op,
SelectionDAG& DAG) const {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
return DAG.getNode(HexagonISD::BARRIER, dl, MVT::Other, Op.getOperand(0));
}
@@ -1014,7 +1014,7 @@ SDValue HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op,
SDValue Result;
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
const HexagonTargetObjectFile &TLOF =
@@ -1030,7 +1030,7 @@ SDValue
HexagonTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
SDValue BA_SD = DAG.getTargetBlockAddress(BA, MVT::i32);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
return DAG.getNode(HexagonISD::CONST32_GP, dl, getPointerTy(), BA_SD);
}
@@ -1361,7 +1361,6 @@ HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine
// Increase jump tables cutover to 5, was 4.
setMinimumJumpTableEntries(5);
- setOperationAction(ISD::BR_CC, MVT::Other, Expand);
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
setOperationAction(ISD::BR_CC, MVT::f64, Expand);
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
@@ -1515,7 +1514,7 @@ HexagonTargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
SDValue Offset = Op.getOperand(1);
SDValue Handler = Op.getOperand(2);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
// Mark function as containing a call to EH_RETURN.
HexagonMachineFunctionInfo *FuncInfo =
diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h
index bb1acc1..70642e6 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/lib/Target/Hexagon/HexagonISelLowering.h
@@ -106,7 +106,7 @@ namespace llvm {
SDValue LowerFormalArguments(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
SDValue LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
@@ -117,7 +117,7 @@ namespace llvm {
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals,
const SmallVectorImpl<SDValue> &OutVals,
SDValue Callee) const;
@@ -131,7 +131,7 @@ namespace llvm {
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc dl, SelectionDAG &DAG) const;
+ SDLoc dl, SelectionDAG &DAG) const;
virtual MachineBasicBlock
*EmitInstrWithCustomInserter(MachineInstr *MI,
@@ -139,7 +139,7 @@ namespace llvm {
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
- virtual EVT getSetCCResultType(EVT VT) const {
+ virtual EVT getSetCCResultType(LLVMContext &, EVT) const {
return MVT::i1;
}
diff --git a/lib/Target/Hexagon/HexagonInstrFormats.td b/lib/Target/Hexagon/HexagonInstrFormats.td
index 587fa7d..e71386a 100644
--- a/lib/Target/Hexagon/HexagonInstrFormats.td
+++ b/lib/Target/Hexagon/HexagonInstrFormats.td
@@ -54,6 +54,7 @@ def AbsoluteSet : AddrModeType<2>; // Absolute set addressing mode
def BaseImmOffset : AddrModeType<3>; // Indirect with offset
def BaseLongOffset : AddrModeType<4>; // Indirect with long offset
def BaseRegOffset : AddrModeType<5>; // Indirect with register offset
+def PostInc : AddrModeType<6>; // Post increment addressing mode
class MemAccessSize<bits<3> value> {
bits<3> Value = value;
@@ -157,6 +158,7 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
string CextOpcode = "";
string PredSense = "";
string PNewValue = "";
+ string NValueST = ""; // Set to "true" for new-value stores.
string InputType = ""; // Input is "imm" or "reg" type.
string isMEMri = "false"; // Set to "true" for load/store with MEMri operand.
string isFloat = "false"; // Set to "true" for the floating-point load/store.
@@ -165,6 +167,7 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
let PredSense = !if(isPredicated, !if(isPredicatedFalse, "false", "true"),
"");
let PNewValue = !if(isPredicatedNew, "new", "");
+ let NValueST = !if(isNVStore, "true", "false");
// *** Must match MCTargetDesc/HexagonBaseInfo.h ***
}
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index e0beab0..3218134 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -25,6 +25,7 @@
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
#define GET_INSTRINFO_CTOR
#define GET_INSTRMAP_INFO
#include "HexagonGenInstrInfo.inc"
@@ -57,7 +58,7 @@ const int Hexagon_MEMB_AUTOINC_MIN = -8;
HexagonInstrInfo::HexagonInstrInfo(HexagonSubtarget &ST)
: HexagonGenInstrInfo(Hexagon::ADJCALLSTACKDOWN, Hexagon::ADJCALLSTACKUP),
- RI(ST, *this), Subtarget(ST) {
+ RI(ST), Subtarget(ST) {
}
@@ -625,292 +626,8 @@ bool HexagonInstrInfo::isExtended(const MachineInstr *MI) const {
return false;
}
-bool HexagonInstrInfo::isNewValueJump(const MachineInstr *MI) const {
- switch (MI->getOpcode()) {
- default: return false;
- // JMP_EQri
- case Hexagon::JMP_EQriPt_nv_V4:
- case Hexagon::JMP_EQriPnt_nv_V4:
- case Hexagon::JMP_EQriNotPt_nv_V4:
- case Hexagon::JMP_EQriNotPnt_nv_V4:
- case Hexagon::JMP_EQriPt_ie_nv_V4:
- case Hexagon::JMP_EQriPnt_ie_nv_V4:
- case Hexagon::JMP_EQriNotPt_ie_nv_V4:
- case Hexagon::JMP_EQriNotPnt_ie_nv_V4:
-
- // JMP_EQri - with -1
- case Hexagon::JMP_EQriPtneg_nv_V4:
- case Hexagon::JMP_EQriPntneg_nv_V4:
- case Hexagon::JMP_EQriNotPtneg_nv_V4:
- case Hexagon::JMP_EQriNotPntneg_nv_V4:
- case Hexagon::JMP_EQriPtneg_ie_nv_V4:
- case Hexagon::JMP_EQriPntneg_ie_nv_V4:
- case Hexagon::JMP_EQriNotPtneg_ie_nv_V4:
- case Hexagon::JMP_EQriNotPntneg_ie_nv_V4:
-
- // JMP_EQrr
- case Hexagon::JMP_EQrrPt_nv_V4:
- case Hexagon::JMP_EQrrPnt_nv_V4:
- case Hexagon::JMP_EQrrNotPt_nv_V4:
- case Hexagon::JMP_EQrrNotPnt_nv_V4:
- case Hexagon::JMP_EQrrPt_ie_nv_V4:
- case Hexagon::JMP_EQrrPnt_ie_nv_V4:
- case Hexagon::JMP_EQrrNotPt_ie_nv_V4:
- case Hexagon::JMP_EQrrNotPnt_ie_nv_V4:
-
- // JMP_GTri
- case Hexagon::JMP_GTriPt_nv_V4:
- case Hexagon::JMP_GTriPnt_nv_V4:
- case Hexagon::JMP_GTriNotPt_nv_V4:
- case Hexagon::JMP_GTriNotPnt_nv_V4:
- case Hexagon::JMP_GTriPt_ie_nv_V4:
- case Hexagon::JMP_GTriPnt_ie_nv_V4:
- case Hexagon::JMP_GTriNotPt_ie_nv_V4:
- case Hexagon::JMP_GTriNotPnt_ie_nv_V4:
-
- // JMP_GTri - with -1
- case Hexagon::JMP_GTriPtneg_nv_V4:
- case Hexagon::JMP_GTriPntneg_nv_V4:
- case Hexagon::JMP_GTriNotPtneg_nv_V4:
- case Hexagon::JMP_GTriNotPntneg_nv_V4:
- case Hexagon::JMP_GTriPtneg_ie_nv_V4:
- case Hexagon::JMP_GTriPntneg_ie_nv_V4:
- case Hexagon::JMP_GTriNotPtneg_ie_nv_V4:
- case Hexagon::JMP_GTriNotPntneg_ie_nv_V4:
-
- // JMP_GTrr
- case Hexagon::JMP_GTrrPt_nv_V4:
- case Hexagon::JMP_GTrrPnt_nv_V4:
- case Hexagon::JMP_GTrrNotPt_nv_V4:
- case Hexagon::JMP_GTrrNotPnt_nv_V4:
- case Hexagon::JMP_GTrrPt_ie_nv_V4:
- case Hexagon::JMP_GTrrPnt_ie_nv_V4:
- case Hexagon::JMP_GTrrNotPt_ie_nv_V4:
- case Hexagon::JMP_GTrrNotPnt_ie_nv_V4:
-
- // JMP_GTrrdn
- case Hexagon::JMP_GTrrdnPt_nv_V4:
- case Hexagon::JMP_GTrrdnPnt_nv_V4:
- case Hexagon::JMP_GTrrdnNotPt_nv_V4:
- case Hexagon::JMP_GTrrdnNotPnt_nv_V4:
- case Hexagon::JMP_GTrrdnPt_ie_nv_V4:
- case Hexagon::JMP_GTrrdnPnt_ie_nv_V4:
- case Hexagon::JMP_GTrrdnNotPt_ie_nv_V4:
- case Hexagon::JMP_GTrrdnNotPnt_ie_nv_V4:
-
- // JMP_GTUri
- case Hexagon::JMP_GTUriPt_nv_V4:
- case Hexagon::JMP_GTUriPnt_nv_V4:
- case Hexagon::JMP_GTUriNotPt_nv_V4:
- case Hexagon::JMP_GTUriNotPnt_nv_V4:
- case Hexagon::JMP_GTUriPt_ie_nv_V4:
- case Hexagon::JMP_GTUriPnt_ie_nv_V4:
- case Hexagon::JMP_GTUriNotPt_ie_nv_V4:
- case Hexagon::JMP_GTUriNotPnt_ie_nv_V4:
-
- // JMP_GTUrr
- case Hexagon::JMP_GTUrrPt_nv_V4:
- case Hexagon::JMP_GTUrrPnt_nv_V4:
- case Hexagon::JMP_GTUrrNotPt_nv_V4:
- case Hexagon::JMP_GTUrrNotPnt_nv_V4:
- case Hexagon::JMP_GTUrrPt_ie_nv_V4:
- case Hexagon::JMP_GTUrrPnt_ie_nv_V4:
- case Hexagon::JMP_GTUrrNotPt_ie_nv_V4:
- case Hexagon::JMP_GTUrrNotPnt_ie_nv_V4:
-
- // JMP_GTUrrdn
- case Hexagon::JMP_GTUrrdnPt_nv_V4:
- case Hexagon::JMP_GTUrrdnPnt_nv_V4:
- case Hexagon::JMP_GTUrrdnNotPt_nv_V4:
- case Hexagon::JMP_GTUrrdnNotPnt_nv_V4:
- case Hexagon::JMP_GTUrrdnPt_ie_nv_V4:
- case Hexagon::JMP_GTUrrdnPnt_ie_nv_V4:
- case Hexagon::JMP_GTUrrdnNotPt_ie_nv_V4:
- case Hexagon::JMP_GTUrrdnNotPnt_ie_nv_V4:
- return true;
- }
-}
-
-bool HexagonInstrInfo::isNewValueStore(const MachineInstr *MI) const {
- switch (MI->getOpcode()) {
- default: return false;
- // Store Byte
- case Hexagon::STrib_nv_V4:
- case Hexagon::STrib_indexed_nv_V4:
- case Hexagon::STrib_indexed_shl_nv_V4:
- case Hexagon::STrib_shl_nv_V4:
- case Hexagon::STb_GP_nv_V4:
- case Hexagon::POST_STbri_nv_V4:
- case Hexagon::STrib_cPt_nv_V4:
- case Hexagon::STrib_cdnPt_nv_V4:
- case Hexagon::STrib_cNotPt_nv_V4:
- case Hexagon::STrib_cdnNotPt_nv_V4:
- case Hexagon::STrib_indexed_cPt_nv_V4:
- case Hexagon::STrib_indexed_cdnPt_nv_V4:
- case Hexagon::STrib_indexed_cNotPt_nv_V4:
- case Hexagon::STrib_indexed_cdnNotPt_nv_V4:
- case Hexagon::STrib_indexed_shl_cPt_nv_V4:
- case Hexagon::STrib_indexed_shl_cdnPt_nv_V4:
- case Hexagon::STrib_indexed_shl_cNotPt_nv_V4:
- case Hexagon::STrib_indexed_shl_cdnNotPt_nv_V4:
- case Hexagon::POST_STbri_cPt_nv_V4:
- case Hexagon::POST_STbri_cdnPt_nv_V4:
- case Hexagon::POST_STbri_cNotPt_nv_V4:
- case Hexagon::POST_STbri_cdnNotPt_nv_V4:
- case Hexagon::STb_GP_cPt_nv_V4:
- case Hexagon::STb_GP_cNotPt_nv_V4:
- case Hexagon::STb_GP_cdnPt_nv_V4:
- case Hexagon::STb_GP_cdnNotPt_nv_V4:
- case Hexagon::STrib_abs_nv_V4:
- case Hexagon::STrib_abs_cPt_nv_V4:
- case Hexagon::STrib_abs_cdnPt_nv_V4:
- case Hexagon::STrib_abs_cNotPt_nv_V4:
- case Hexagon::STrib_abs_cdnNotPt_nv_V4:
-
- // Store Halfword
- case Hexagon::STrih_nv_V4:
- case Hexagon::STrih_indexed_nv_V4:
- case Hexagon::STrih_indexed_shl_nv_V4:
- case Hexagon::STrih_shl_nv_V4:
- case Hexagon::STh_GP_nv_V4:
- case Hexagon::POST_SThri_nv_V4:
- case Hexagon::STrih_cPt_nv_V4:
- case Hexagon::STrih_cdnPt_nv_V4:
- case Hexagon::STrih_cNotPt_nv_V4:
- case Hexagon::STrih_cdnNotPt_nv_V4:
- case Hexagon::STrih_indexed_cPt_nv_V4:
- case Hexagon::STrih_indexed_cdnPt_nv_V4:
- case Hexagon::STrih_indexed_cNotPt_nv_V4:
- case Hexagon::STrih_indexed_cdnNotPt_nv_V4:
- case Hexagon::STrih_indexed_shl_cPt_nv_V4:
- case Hexagon::STrih_indexed_shl_cdnPt_nv_V4:
- case Hexagon::STrih_indexed_shl_cNotPt_nv_V4:
- case Hexagon::STrih_indexed_shl_cdnNotPt_nv_V4:
- case Hexagon::POST_SThri_cPt_nv_V4:
- case Hexagon::POST_SThri_cdnPt_nv_V4:
- case Hexagon::POST_SThri_cNotPt_nv_V4:
- case Hexagon::POST_SThri_cdnNotPt_nv_V4:
- case Hexagon::STh_GP_cPt_nv_V4:
- case Hexagon::STh_GP_cNotPt_nv_V4:
- case Hexagon::STh_GP_cdnPt_nv_V4:
- case Hexagon::STh_GP_cdnNotPt_nv_V4:
- case Hexagon::STrih_abs_nv_V4:
- case Hexagon::STrih_abs_cPt_nv_V4:
- case Hexagon::STrih_abs_cdnPt_nv_V4:
- case Hexagon::STrih_abs_cNotPt_nv_V4:
- case Hexagon::STrih_abs_cdnNotPt_nv_V4:
-
- // Store Word
- case Hexagon::STriw_nv_V4:
- case Hexagon::STriw_indexed_nv_V4:
- case Hexagon::STriw_indexed_shl_nv_V4:
- case Hexagon::STriw_shl_nv_V4:
- case Hexagon::STw_GP_nv_V4:
- case Hexagon::POST_STwri_nv_V4:
- case Hexagon::STriw_cPt_nv_V4:
- case Hexagon::STriw_cdnPt_nv_V4:
- case Hexagon::STriw_cNotPt_nv_V4:
- case Hexagon::STriw_cdnNotPt_nv_V4:
- case Hexagon::STriw_indexed_cPt_nv_V4:
- case Hexagon::STriw_indexed_cdnPt_nv_V4:
- case Hexagon::STriw_indexed_cNotPt_nv_V4:
- case Hexagon::STriw_indexed_cdnNotPt_nv_V4:
- case Hexagon::STriw_indexed_shl_cPt_nv_V4:
- case Hexagon::STriw_indexed_shl_cdnPt_nv_V4:
- case Hexagon::STriw_indexed_shl_cNotPt_nv_V4:
- case Hexagon::STriw_indexed_shl_cdnNotPt_nv_V4:
- case Hexagon::POST_STwri_cPt_nv_V4:
- case Hexagon::POST_STwri_cdnPt_nv_V4:
- case Hexagon::POST_STwri_cNotPt_nv_V4:
- case Hexagon::POST_STwri_cdnNotPt_nv_V4:
- case Hexagon::STw_GP_cPt_nv_V4:
- case Hexagon::STw_GP_cNotPt_nv_V4:
- case Hexagon::STw_GP_cdnPt_nv_V4:
- case Hexagon::STw_GP_cdnNotPt_nv_V4:
- case Hexagon::STriw_abs_nv_V4:
- case Hexagon::STriw_abs_cPt_nv_V4:
- case Hexagon::STriw_abs_cdnPt_nv_V4:
- case Hexagon::STriw_abs_cNotPt_nv_V4:
- case Hexagon::STriw_abs_cdnNotPt_nv_V4:
- return true;
- }
-}
-
-bool HexagonInstrInfo::isPostIncrement (const MachineInstr* MI) const {
- switch (MI->getOpcode())
- {
- default: return false;
- // Load Byte
- case Hexagon::POST_LDrib:
- case Hexagon::POST_LDrib_cPt:
- case Hexagon::POST_LDrib_cNotPt:
- case Hexagon::POST_LDrib_cdnPt_V4:
- case Hexagon::POST_LDrib_cdnNotPt_V4:
-
- // Load unsigned byte
- case Hexagon::POST_LDriub:
- case Hexagon::POST_LDriub_cPt:
- case Hexagon::POST_LDriub_cNotPt:
- case Hexagon::POST_LDriub_cdnPt_V4:
- case Hexagon::POST_LDriub_cdnNotPt_V4:
-
- // Load halfword
- case Hexagon::POST_LDrih:
- case Hexagon::POST_LDrih_cPt:
- case Hexagon::POST_LDrih_cNotPt:
- case Hexagon::POST_LDrih_cdnPt_V4:
- case Hexagon::POST_LDrih_cdnNotPt_V4:
-
- // Load unsigned halfword
- case Hexagon::POST_LDriuh:
- case Hexagon::POST_LDriuh_cPt:
- case Hexagon::POST_LDriuh_cNotPt:
- case Hexagon::POST_LDriuh_cdnPt_V4:
- case Hexagon::POST_LDriuh_cdnNotPt_V4:
-
- // Load word
- case Hexagon::POST_LDriw:
- case Hexagon::POST_LDriw_cPt:
- case Hexagon::POST_LDriw_cNotPt:
- case Hexagon::POST_LDriw_cdnPt_V4:
- case Hexagon::POST_LDriw_cdnNotPt_V4:
-
- // Load double word
- case Hexagon::POST_LDrid:
- case Hexagon::POST_LDrid_cPt:
- case Hexagon::POST_LDrid_cNotPt:
- case Hexagon::POST_LDrid_cdnPt_V4:
- case Hexagon::POST_LDrid_cdnNotPt_V4:
-
- // Store byte
- case Hexagon::POST_STbri:
- case Hexagon::POST_STbri_cPt:
- case Hexagon::POST_STbri_cNotPt:
- case Hexagon::POST_STbri_cdnPt_V4:
- case Hexagon::POST_STbri_cdnNotPt_V4:
-
- // Store halfword
- case Hexagon::POST_SThri:
- case Hexagon::POST_SThri_cPt:
- case Hexagon::POST_SThri_cNotPt:
- case Hexagon::POST_SThri_cdnPt_V4:
- case Hexagon::POST_SThri_cdnNotPt_V4:
-
- // Store word
- case Hexagon::POST_STwri:
- case Hexagon::POST_STwri_cPt:
- case Hexagon::POST_STwri_cNotPt:
- case Hexagon::POST_STwri_cdnPt_V4:
- case Hexagon::POST_STwri_cdnNotPt_V4:
-
- // Store double word
- case Hexagon::POST_STdri:
- case Hexagon::POST_STdri_cPt:
- case Hexagon::POST_STdri_cNotPt:
- case Hexagon::POST_STdri_cdnPt_V4:
- case Hexagon::POST_STdri_cdnNotPt_V4:
- return true;
- }
+bool HexagonInstrInfo::isBranch (const MachineInstr *MI) const {
+ return MI->getDesc().isBranch();
}
bool HexagonInstrInfo::isNewValueInst(const MachineInstr *MI) const {
@@ -1018,466 +735,40 @@ bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const {
// cPt ---> cNotPt
// cNotPt ---> cPt
//
-// however, these inversiones are NOT included:
-//
-// cdnPt -X-> cdnNotPt
-// cdnNotPt -X-> cdnPt
-// cPt_nv -X-> cNotPt_nv (new value stores)
-// cNotPt_nv -X-> cPt_nv (new value stores)
-//
-// because only the following transformations are allowed:
-//
-// cNotPt ---> cdnNotPt
-// cPt ---> cdnPt
-// cNotPt ---> cNotPt_nv
-// cPt ---> cPt_nv
unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const {
+ int InvPredOpcode;
+ InvPredOpcode = isPredicatedTrue(Opc) ? Hexagon::getFalsePredOpcode(Opc)
+ : Hexagon::getTruePredOpcode(Opc);
+ if (InvPredOpcode >= 0) // Valid instruction with the inverted predicate.
+ return InvPredOpcode;
+
switch(Opc) {
default: llvm_unreachable("Unexpected predicated instruction");
- case Hexagon::TFR_cPt:
- return Hexagon::TFR_cNotPt;
- case Hexagon::TFR_cNotPt:
- return Hexagon::TFR_cPt;
-
- case Hexagon::TFRI_cPt:
- return Hexagon::TFRI_cNotPt;
- case Hexagon::TFRI_cNotPt:
- return Hexagon::TFRI_cPt;
-
- case Hexagon::JMP_t:
- return Hexagon::JMP_f;
- case Hexagon::JMP_f:
- return Hexagon::JMP_t;
-
- case Hexagon::ADD_ri_cPt:
- return Hexagon::ADD_ri_cNotPt;
- case Hexagon::ADD_ri_cNotPt:
- return Hexagon::ADD_ri_cPt;
-
- case Hexagon::ADD_rr_cPt:
- return Hexagon::ADD_rr_cNotPt;
- case Hexagon::ADD_rr_cNotPt:
- return Hexagon::ADD_rr_cPt;
-
- case Hexagon::XOR_rr_cPt:
- return Hexagon::XOR_rr_cNotPt;
- case Hexagon::XOR_rr_cNotPt:
- return Hexagon::XOR_rr_cPt;
-
- case Hexagon::AND_rr_cPt:
- return Hexagon::AND_rr_cNotPt;
- case Hexagon::AND_rr_cNotPt:
- return Hexagon::AND_rr_cPt;
-
- case Hexagon::OR_rr_cPt:
- return Hexagon::OR_rr_cNotPt;
- case Hexagon::OR_rr_cNotPt:
- return Hexagon::OR_rr_cPt;
-
- case Hexagon::SUB_rr_cPt:
- return Hexagon::SUB_rr_cNotPt;
- case Hexagon::SUB_rr_cNotPt:
- return Hexagon::SUB_rr_cPt;
-
case Hexagon::COMBINE_rr_cPt:
return Hexagon::COMBINE_rr_cNotPt;
case Hexagon::COMBINE_rr_cNotPt:
return Hexagon::COMBINE_rr_cPt;
- case Hexagon::ASLH_cPt_V4:
- return Hexagon::ASLH_cNotPt_V4;
- case Hexagon::ASLH_cNotPt_V4:
- return Hexagon::ASLH_cPt_V4;
-
- case Hexagon::ASRH_cPt_V4:
- return Hexagon::ASRH_cNotPt_V4;
- case Hexagon::ASRH_cNotPt_V4:
- return Hexagon::ASRH_cPt_V4;
-
- case Hexagon::SXTB_cPt_V4:
- return Hexagon::SXTB_cNotPt_V4;
- case Hexagon::SXTB_cNotPt_V4:
- return Hexagon::SXTB_cPt_V4;
-
- case Hexagon::SXTH_cPt_V4:
- return Hexagon::SXTH_cNotPt_V4;
- case Hexagon::SXTH_cNotPt_V4:
- return Hexagon::SXTH_cPt_V4;
-
- case Hexagon::ZXTB_cPt_V4:
- return Hexagon::ZXTB_cNotPt_V4;
- case Hexagon::ZXTB_cNotPt_V4:
- return Hexagon::ZXTB_cPt_V4;
-
- case Hexagon::ZXTH_cPt_V4:
- return Hexagon::ZXTH_cNotPt_V4;
- case Hexagon::ZXTH_cNotPt_V4:
- return Hexagon::ZXTH_cPt_V4;
-
-
- case Hexagon::JMPR_t:
- return Hexagon::JMPR_f;
- case Hexagon::JMPR_f:
- return Hexagon::JMPR_t;
-
- // V4 indexed+scaled load.
- case Hexagon::LDrid_indexed_shl_cPt_V4:
- return Hexagon::LDrid_indexed_shl_cNotPt_V4;
- case Hexagon::LDrid_indexed_shl_cNotPt_V4:
- return Hexagon::LDrid_indexed_shl_cPt_V4;
-
- case Hexagon::LDrib_indexed_shl_cPt_V4:
- return Hexagon::LDrib_indexed_shl_cNotPt_V4;
- case Hexagon::LDrib_indexed_shl_cNotPt_V4:
- return Hexagon::LDrib_indexed_shl_cPt_V4;
-
- case Hexagon::LDriub_indexed_shl_cPt_V4:
- return Hexagon::LDriub_indexed_shl_cNotPt_V4;
- case Hexagon::LDriub_indexed_shl_cNotPt_V4:
- return Hexagon::LDriub_indexed_shl_cPt_V4;
-
- case Hexagon::LDrih_indexed_shl_cPt_V4:
- return Hexagon::LDrih_indexed_shl_cNotPt_V4;
- case Hexagon::LDrih_indexed_shl_cNotPt_V4:
- return Hexagon::LDrih_indexed_shl_cPt_V4;
-
- case Hexagon::LDriuh_indexed_shl_cPt_V4:
- return Hexagon::LDriuh_indexed_shl_cNotPt_V4;
- case Hexagon::LDriuh_indexed_shl_cNotPt_V4:
- return Hexagon::LDriuh_indexed_shl_cPt_V4;
-
- case Hexagon::LDriw_indexed_shl_cPt_V4:
- return Hexagon::LDriw_indexed_shl_cNotPt_V4;
- case Hexagon::LDriw_indexed_shl_cNotPt_V4:
- return Hexagon::LDriw_indexed_shl_cPt_V4;
-
- // Byte.
- case Hexagon::POST_STbri_cPt:
- return Hexagon::POST_STbri_cNotPt;
- case Hexagon::POST_STbri_cNotPt:
- return Hexagon::POST_STbri_cPt;
-
- case Hexagon::STrib_cPt:
- return Hexagon::STrib_cNotPt;
- case Hexagon::STrib_cNotPt:
- return Hexagon::STrib_cPt;
-
- case Hexagon::STrib_indexed_cPt:
- return Hexagon::STrib_indexed_cNotPt;
- case Hexagon::STrib_indexed_cNotPt:
- return Hexagon::STrib_indexed_cPt;
-
- case Hexagon::STrib_imm_cPt_V4:
- return Hexagon::STrib_imm_cNotPt_V4;
- case Hexagon::STrib_imm_cNotPt_V4:
- return Hexagon::STrib_imm_cPt_V4;
-
- case Hexagon::STrib_indexed_shl_cPt_V4:
- return Hexagon::STrib_indexed_shl_cNotPt_V4;
- case Hexagon::STrib_indexed_shl_cNotPt_V4:
- return Hexagon::STrib_indexed_shl_cPt_V4;
-
- // Halfword.
- case Hexagon::POST_SThri_cPt:
- return Hexagon::POST_SThri_cNotPt;
- case Hexagon::POST_SThri_cNotPt:
- return Hexagon::POST_SThri_cPt;
-
- case Hexagon::STrih_cPt:
- return Hexagon::STrih_cNotPt;
- case Hexagon::STrih_cNotPt:
- return Hexagon::STrih_cPt;
-
- case Hexagon::STrih_indexed_cPt:
- return Hexagon::STrih_indexed_cNotPt;
- case Hexagon::STrih_indexed_cNotPt:
- return Hexagon::STrih_indexed_cPt;
-
- case Hexagon::STrih_imm_cPt_V4:
- return Hexagon::STrih_imm_cNotPt_V4;
- case Hexagon::STrih_imm_cNotPt_V4:
- return Hexagon::STrih_imm_cPt_V4;
-
- case Hexagon::STrih_indexed_shl_cPt_V4:
- return Hexagon::STrih_indexed_shl_cNotPt_V4;
- case Hexagon::STrih_indexed_shl_cNotPt_V4:
- return Hexagon::STrih_indexed_shl_cPt_V4;
-
- // Word.
- case Hexagon::POST_STwri_cPt:
- return Hexagon::POST_STwri_cNotPt;
- case Hexagon::POST_STwri_cNotPt:
- return Hexagon::POST_STwri_cPt;
-
- case Hexagon::STriw_cPt:
- return Hexagon::STriw_cNotPt;
- case Hexagon::STriw_cNotPt:
- return Hexagon::STriw_cPt;
-
- case Hexagon::STriw_indexed_cPt:
- return Hexagon::STriw_indexed_cNotPt;
- case Hexagon::STriw_indexed_cNotPt:
- return Hexagon::STriw_indexed_cPt;
-
- case Hexagon::STriw_indexed_shl_cPt_V4:
- return Hexagon::STriw_indexed_shl_cNotPt_V4;
- case Hexagon::STriw_indexed_shl_cNotPt_V4:
- return Hexagon::STriw_indexed_shl_cPt_V4;
-
- case Hexagon::STriw_imm_cPt_V4:
- return Hexagon::STriw_imm_cNotPt_V4;
- case Hexagon::STriw_imm_cNotPt_V4:
- return Hexagon::STriw_imm_cPt_V4;
-
- // Double word.
- case Hexagon::POST_STdri_cPt:
- return Hexagon::POST_STdri_cNotPt;
- case Hexagon::POST_STdri_cNotPt:
- return Hexagon::POST_STdri_cPt;
-
- case Hexagon::STrid_cPt:
- return Hexagon::STrid_cNotPt;
- case Hexagon::STrid_cNotPt:
- return Hexagon::STrid_cPt;
-
- case Hexagon::STrid_indexed_cPt:
- return Hexagon::STrid_indexed_cNotPt;
- case Hexagon::STrid_indexed_cNotPt:
- return Hexagon::STrid_indexed_cPt;
-
- case Hexagon::STrid_indexed_shl_cPt_V4:
- return Hexagon::STrid_indexed_shl_cNotPt_V4;
- case Hexagon::STrid_indexed_shl_cNotPt_V4:
- return Hexagon::STrid_indexed_shl_cPt_V4;
-
- // V4 Store to global address.
- case Hexagon::STd_GP_cPt_V4:
- return Hexagon::STd_GP_cNotPt_V4;
- case Hexagon::STd_GP_cNotPt_V4:
- return Hexagon::STd_GP_cPt_V4;
-
- case Hexagon::STb_GP_cPt_V4:
- return Hexagon::STb_GP_cNotPt_V4;
- case Hexagon::STb_GP_cNotPt_V4:
- return Hexagon::STb_GP_cPt_V4;
-
- case Hexagon::STh_GP_cPt_V4:
- return Hexagon::STh_GP_cNotPt_V4;
- case Hexagon::STh_GP_cNotPt_V4:
- return Hexagon::STh_GP_cPt_V4;
-
- case Hexagon::STw_GP_cPt_V4:
- return Hexagon::STw_GP_cNotPt_V4;
- case Hexagon::STw_GP_cNotPt_V4:
- return Hexagon::STw_GP_cPt_V4;
-
- // Load.
- case Hexagon::LDrid_cPt:
- return Hexagon::LDrid_cNotPt;
- case Hexagon::LDrid_cNotPt:
- return Hexagon::LDrid_cPt;
-
- case Hexagon::LDriw_cPt:
- return Hexagon::LDriw_cNotPt;
- case Hexagon::LDriw_cNotPt:
- return Hexagon::LDriw_cPt;
-
- case Hexagon::LDrih_cPt:
- return Hexagon::LDrih_cNotPt;
- case Hexagon::LDrih_cNotPt:
- return Hexagon::LDrih_cPt;
-
- case Hexagon::LDriuh_cPt:
- return Hexagon::LDriuh_cNotPt;
- case Hexagon::LDriuh_cNotPt:
- return Hexagon::LDriuh_cPt;
-
- case Hexagon::LDrib_cPt:
- return Hexagon::LDrib_cNotPt;
- case Hexagon::LDrib_cNotPt:
- return Hexagon::LDrib_cPt;
-
- case Hexagon::LDriub_cPt:
- return Hexagon::LDriub_cNotPt;
- case Hexagon::LDriub_cNotPt:
- return Hexagon::LDriub_cPt;
-
- // Load Indexed.
- case Hexagon::LDrid_indexed_cPt:
- return Hexagon::LDrid_indexed_cNotPt;
- case Hexagon::LDrid_indexed_cNotPt:
- return Hexagon::LDrid_indexed_cPt;
-
- case Hexagon::LDriw_indexed_cPt:
- return Hexagon::LDriw_indexed_cNotPt;
- case Hexagon::LDriw_indexed_cNotPt:
- return Hexagon::LDriw_indexed_cPt;
-
- case Hexagon::LDrih_indexed_cPt:
- return Hexagon::LDrih_indexed_cNotPt;
- case Hexagon::LDrih_indexed_cNotPt:
- return Hexagon::LDrih_indexed_cPt;
-
- case Hexagon::LDriuh_indexed_cPt:
- return Hexagon::LDriuh_indexed_cNotPt;
- case Hexagon::LDriuh_indexed_cNotPt:
- return Hexagon::LDriuh_indexed_cPt;
-
- case Hexagon::LDrib_indexed_cPt:
- return Hexagon::LDrib_indexed_cNotPt;
- case Hexagon::LDrib_indexed_cNotPt:
- return Hexagon::LDrib_indexed_cPt;
-
- case Hexagon::LDriub_indexed_cPt:
- return Hexagon::LDriub_indexed_cNotPt;
- case Hexagon::LDriub_indexed_cNotPt:
- return Hexagon::LDriub_indexed_cPt;
-
- // Post Inc Load.
- case Hexagon::POST_LDrid_cPt:
- return Hexagon::POST_LDrid_cNotPt;
- case Hexagon::POST_LDriw_cNotPt:
- return Hexagon::POST_LDriw_cPt;
-
- case Hexagon::POST_LDrih_cPt:
- return Hexagon::POST_LDrih_cNotPt;
- case Hexagon::POST_LDrih_cNotPt:
- return Hexagon::POST_LDrih_cPt;
-
- case Hexagon::POST_LDriuh_cPt:
- return Hexagon::POST_LDriuh_cNotPt;
- case Hexagon::POST_LDriuh_cNotPt:
- return Hexagon::POST_LDriuh_cPt;
-
- case Hexagon::POST_LDrib_cPt:
- return Hexagon::POST_LDrib_cNotPt;
- case Hexagon::POST_LDrib_cNotPt:
- return Hexagon::POST_LDrib_cPt;
-
- case Hexagon::POST_LDriub_cPt:
- return Hexagon::POST_LDriub_cNotPt;
- case Hexagon::POST_LDriub_cNotPt:
- return Hexagon::POST_LDriub_cPt;
-
- // Dealloc_return.
+ // Dealloc_return.
case Hexagon::DEALLOC_RET_cPt_V4:
return Hexagon::DEALLOC_RET_cNotPt_V4;
case Hexagon::DEALLOC_RET_cNotPt_V4:
return Hexagon::DEALLOC_RET_cPt_V4;
-
- // New Value Jump.
- // JMPEQ_ri - with -1.
- case Hexagon::JMP_EQriPtneg_nv_V4:
- return Hexagon::JMP_EQriNotPtneg_nv_V4;
- case Hexagon::JMP_EQriNotPtneg_nv_V4:
- return Hexagon::JMP_EQriPtneg_nv_V4;
-
- case Hexagon::JMP_EQriPntneg_nv_V4:
- return Hexagon::JMP_EQriNotPntneg_nv_V4;
- case Hexagon::JMP_EQriNotPntneg_nv_V4:
- return Hexagon::JMP_EQriPntneg_nv_V4;
-
- // JMPEQ_ri.
- case Hexagon::JMP_EQriPt_nv_V4:
- return Hexagon::JMP_EQriNotPt_nv_V4;
- case Hexagon::JMP_EQriNotPt_nv_V4:
- return Hexagon::JMP_EQriPt_nv_V4;
-
- case Hexagon::JMP_EQriPnt_nv_V4:
- return Hexagon::JMP_EQriNotPnt_nv_V4;
- case Hexagon::JMP_EQriNotPnt_nv_V4:
- return Hexagon::JMP_EQriPnt_nv_V4;
-
- // JMPEQ_rr.
- case Hexagon::JMP_EQrrPt_nv_V4:
- return Hexagon::JMP_EQrrNotPt_nv_V4;
- case Hexagon::JMP_EQrrNotPt_nv_V4:
- return Hexagon::JMP_EQrrPt_nv_V4;
-
- case Hexagon::JMP_EQrrPnt_nv_V4:
- return Hexagon::JMP_EQrrNotPnt_nv_V4;
- case Hexagon::JMP_EQrrNotPnt_nv_V4:
- return Hexagon::JMP_EQrrPnt_nv_V4;
-
- // JMPGT_ri - with -1.
- case Hexagon::JMP_GTriPtneg_nv_V4:
- return Hexagon::JMP_GTriNotPtneg_nv_V4;
- case Hexagon::JMP_GTriNotPtneg_nv_V4:
- return Hexagon::JMP_GTriPtneg_nv_V4;
-
- case Hexagon::JMP_GTriPntneg_nv_V4:
- return Hexagon::JMP_GTriNotPntneg_nv_V4;
- case Hexagon::JMP_GTriNotPntneg_nv_V4:
- return Hexagon::JMP_GTriPntneg_nv_V4;
-
- // JMPGT_ri.
- case Hexagon::JMP_GTriPt_nv_V4:
- return Hexagon::JMP_GTriNotPt_nv_V4;
- case Hexagon::JMP_GTriNotPt_nv_V4:
- return Hexagon::JMP_GTriPt_nv_V4;
-
- case Hexagon::JMP_GTriPnt_nv_V4:
- return Hexagon::JMP_GTriNotPnt_nv_V4;
- case Hexagon::JMP_GTriNotPnt_nv_V4:
- return Hexagon::JMP_GTriPnt_nv_V4;
-
- // JMPGT_rr.
- case Hexagon::JMP_GTrrPt_nv_V4:
- return Hexagon::JMP_GTrrNotPt_nv_V4;
- case Hexagon::JMP_GTrrNotPt_nv_V4:
- return Hexagon::JMP_GTrrPt_nv_V4;
-
- case Hexagon::JMP_GTrrPnt_nv_V4:
- return Hexagon::JMP_GTrrNotPnt_nv_V4;
- case Hexagon::JMP_GTrrNotPnt_nv_V4:
- return Hexagon::JMP_GTrrPnt_nv_V4;
-
- // JMPGT_rrdn.
- case Hexagon::JMP_GTrrdnPt_nv_V4:
- return Hexagon::JMP_GTrrdnNotPt_nv_V4;
- case Hexagon::JMP_GTrrdnNotPt_nv_V4:
- return Hexagon::JMP_GTrrdnPt_nv_V4;
-
- case Hexagon::JMP_GTrrdnPnt_nv_V4:
- return Hexagon::JMP_GTrrdnNotPnt_nv_V4;
- case Hexagon::JMP_GTrrdnNotPnt_nv_V4:
- return Hexagon::JMP_GTrrdnPnt_nv_V4;
-
- // JMPGTU_ri.
- case Hexagon::JMP_GTUriPt_nv_V4:
- return Hexagon::JMP_GTUriNotPt_nv_V4;
- case Hexagon::JMP_GTUriNotPt_nv_V4:
- return Hexagon::JMP_GTUriPt_nv_V4;
-
- case Hexagon::JMP_GTUriPnt_nv_V4:
- return Hexagon::JMP_GTUriNotPnt_nv_V4;
- case Hexagon::JMP_GTUriNotPnt_nv_V4:
- return Hexagon::JMP_GTUriPnt_nv_V4;
-
- // JMPGTU_rr.
- case Hexagon::JMP_GTUrrPt_nv_V4:
- return Hexagon::JMP_GTUrrNotPt_nv_V4;
- case Hexagon::JMP_GTUrrNotPt_nv_V4:
- return Hexagon::JMP_GTUrrPt_nv_V4;
-
- case Hexagon::JMP_GTUrrPnt_nv_V4:
- return Hexagon::JMP_GTUrrNotPnt_nv_V4;
- case Hexagon::JMP_GTUrrNotPnt_nv_V4:
- return Hexagon::JMP_GTUrrPnt_nv_V4;
-
- // JMPGTU_rrdn.
- case Hexagon::JMP_GTUrrdnPt_nv_V4:
- return Hexagon::JMP_GTUrrdnNotPt_nv_V4;
- case Hexagon::JMP_GTUrrdnNotPt_nv_V4:
- return Hexagon::JMP_GTUrrdnPt_nv_V4;
-
- case Hexagon::JMP_GTUrrdnPnt_nv_V4:
- return Hexagon::JMP_GTUrrdnNotPnt_nv_V4;
- case Hexagon::JMP_GTUrrdnNotPnt_nv_V4:
- return Hexagon::JMP_GTUrrdnPnt_nv_V4;
}
}
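
Note: the rewritten getInvertedPredicatedOpcode above consults the TableGen-generated relation maps (Hexagon::getTruePredOpcode / Hexagon::getFalsePredOpcode) first and keeps only a small residual switch. A rough standalone analogue of that table-driven inversion, with invented opcode numbers:

// Two generated tables map each predicated opcode to its opposite-sense twin;
// -1 means the instruction has not been migrated to the relation maps yet.
#include <cassert>
#include <map>

static const std::map<int, int> FalsePredOpcode = {{100, 101}, {102, 103}};
static const std::map<int, int> TruePredOpcode  = {{101, 100}, {103, 102}};

static int lookup(const std::map<int, int> &M, int Opc) {
  auto It = M.find(Opc);
  return It == M.end() ? -1 : It->second;
}

static int invertPredicatedOpcode(int Opc, bool PredSenseIsTrue) {
  int Inv = PredSenseIsTrue ? lookup(FalsePredOpcode, Opc)
                            : lookup(TruePredOpcode, Opc);
  return Inv;  // caller falls back to a residual switch when Inv < 0
}

int main() {
  assert(invertPredicatedOpcode(100, /*PredSenseIsTrue=*/true) == 101);
  assert(invertPredicatedOpcode(999, true) == -1);  // not in the maps
  return 0;
}
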
+// New Value Store instructions.
+bool HexagonInstrInfo::isNewValueStore(const MachineInstr *MI) const {
+ const uint64_t F = MI->getDesc().TSFlags;
+
+ return ((F >> HexagonII::NVStorePos) & HexagonII::NVStoreMask);
+}
+
+bool HexagonInstrInfo::isNewValueStore(unsigned Opcode) const {
+ const uint64_t F = get(Opcode).TSFlags;
+
+ return ((F >> HexagonII::NVStorePos) & HexagonII::NVStoreMask);
+}
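
Note: isNewValueStore now reads a single bit out of the instruction's TSFlags instead of enumerating every opcode. A standalone sketch of that bit-field query pattern, with made-up positions and masks (the real layout must match MCTargetDesc/HexagonBaseInfo.h):

#include <cassert>
#include <cstdint>

// Made-up layout: bit 6 = predicated, bit 8 = new-value store.
enum { PredicatedPos = 6, PredicatedMask = 0x1,
       NVStorePos    = 8, NVStoreMask    = 0x1 };

static bool isPredicated(uint64_t TSFlags) {
  return (TSFlags >> PredicatedPos) & PredicatedMask;
}

static bool isNewValueStore(uint64_t TSFlags) {
  return (TSFlags >> NVStorePos) & NVStoreMask;
}

int main() {
  uint64_t Flags = 1ull << NVStorePos;   // new-value store, not predicated
  assert(isNewValueStore(Flags) && !isPredicated(Flags));
  return 0;
}
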
int HexagonInstrInfo::
getMatchingCondBranchOpcode(int Opc, bool invertPredicate) const {
@@ -1491,223 +782,21 @@ getMatchingCondBranchOpcode(int Opc, bool invertPredicate) const {
// This switch case will be removed once all the instructions have been
// modified to use relation maps.
switch(Opc) {
- case Hexagon::TFR:
- return !invertPredicate ? Hexagon::TFR_cPt :
- Hexagon::TFR_cNotPt;
case Hexagon::TFRI_f:
return !invertPredicate ? Hexagon::TFRI_cPt_f :
Hexagon::TFRI_cNotPt_f;
- case Hexagon::TFRI:
- return !invertPredicate ? Hexagon::TFRI_cPt :
- Hexagon::TFRI_cNotPt;
- case Hexagon::JMP:
- return !invertPredicate ? Hexagon::JMP_t :
- Hexagon::JMP_f;
- case Hexagon::JMP_EQrrPt_nv_V4:
- return !invertPredicate ? Hexagon::JMP_EQrrPt_nv_V4 :
- Hexagon::JMP_EQrrNotPt_nv_V4;
- case Hexagon::JMP_EQriPt_nv_V4:
- return !invertPredicate ? Hexagon::JMP_EQriPt_nv_V4 :
- Hexagon::JMP_EQriNotPt_nv_V4;
case Hexagon::COMBINE_rr:
return !invertPredicate ? Hexagon::COMBINE_rr_cPt :
Hexagon::COMBINE_rr_cNotPt;
- case Hexagon::ASLH:
- return !invertPredicate ? Hexagon::ASLH_cPt_V4 :
- Hexagon::ASLH_cNotPt_V4;
- case Hexagon::ASRH:
- return !invertPredicate ? Hexagon::ASRH_cPt_V4 :
- Hexagon::ASRH_cNotPt_V4;
- case Hexagon::SXTB:
- return !invertPredicate ? Hexagon::SXTB_cPt_V4 :
- Hexagon::SXTB_cNotPt_V4;
- case Hexagon::SXTH:
- return !invertPredicate ? Hexagon::SXTH_cPt_V4 :
- Hexagon::SXTH_cNotPt_V4;
- case Hexagon::ZXTB:
- return !invertPredicate ? Hexagon::ZXTB_cPt_V4 :
- Hexagon::ZXTB_cNotPt_V4;
- case Hexagon::ZXTH:
- return !invertPredicate ? Hexagon::ZXTH_cPt_V4 :
- Hexagon::ZXTH_cNotPt_V4;
-
- case Hexagon::JMPR:
- return !invertPredicate ? Hexagon::JMPR_t :
- Hexagon::JMPR_f;
-
- // V4 indexed+scaled load.
- case Hexagon::LDrid_indexed_shl_V4:
- return !invertPredicate ? Hexagon::LDrid_indexed_shl_cPt_V4 :
- Hexagon::LDrid_indexed_shl_cNotPt_V4;
- case Hexagon::LDrib_indexed_shl_V4:
- return !invertPredicate ? Hexagon::LDrib_indexed_shl_cPt_V4 :
- Hexagon::LDrib_indexed_shl_cNotPt_V4;
- case Hexagon::LDriub_indexed_shl_V4:
- return !invertPredicate ? Hexagon::LDriub_indexed_shl_cPt_V4 :
- Hexagon::LDriub_indexed_shl_cNotPt_V4;
- case Hexagon::LDrih_indexed_shl_V4:
- return !invertPredicate ? Hexagon::LDrih_indexed_shl_cPt_V4 :
- Hexagon::LDrih_indexed_shl_cNotPt_V4;
- case Hexagon::LDriuh_indexed_shl_V4:
- return !invertPredicate ? Hexagon::LDriuh_indexed_shl_cPt_V4 :
- Hexagon::LDriuh_indexed_shl_cNotPt_V4;
- case Hexagon::LDriw_indexed_shl_V4:
- return !invertPredicate ? Hexagon::LDriw_indexed_shl_cPt_V4 :
- Hexagon::LDriw_indexed_shl_cNotPt_V4;
-
- // V4 Load from global address
- case Hexagon::LDd_GP_V4:
- return !invertPredicate ? Hexagon::LDd_GP_cPt_V4 :
- Hexagon::LDd_GP_cNotPt_V4;
- case Hexagon::LDb_GP_V4:
- return !invertPredicate ? Hexagon::LDb_GP_cPt_V4 :
- Hexagon::LDb_GP_cNotPt_V4;
- case Hexagon::LDub_GP_V4:
- return !invertPredicate ? Hexagon::LDub_GP_cPt_V4 :
- Hexagon::LDub_GP_cNotPt_V4;
- case Hexagon::LDh_GP_V4:
- return !invertPredicate ? Hexagon::LDh_GP_cPt_V4 :
- Hexagon::LDh_GP_cNotPt_V4;
- case Hexagon::LDuh_GP_V4:
- return !invertPredicate ? Hexagon::LDuh_GP_cPt_V4 :
- Hexagon::LDuh_GP_cNotPt_V4;
- case Hexagon::LDw_GP_V4:
- return !invertPredicate ? Hexagon::LDw_GP_cPt_V4 :
- Hexagon::LDw_GP_cNotPt_V4;
-
- // Byte.
- case Hexagon::POST_STbri:
- return !invertPredicate ? Hexagon::POST_STbri_cPt :
- Hexagon::POST_STbri_cNotPt;
- case Hexagon::STrib:
- return !invertPredicate ? Hexagon::STrib_cPt :
- Hexagon::STrib_cNotPt;
- case Hexagon::STrib_indexed:
- return !invertPredicate ? Hexagon::STrib_indexed_cPt :
- Hexagon::STrib_indexed_cNotPt;
- case Hexagon::STrib_imm_V4:
- return !invertPredicate ? Hexagon::STrib_imm_cPt_V4 :
- Hexagon::STrib_imm_cNotPt_V4;
- case Hexagon::STrib_indexed_shl_V4:
- return !invertPredicate ? Hexagon::STrib_indexed_shl_cPt_V4 :
- Hexagon::STrib_indexed_shl_cNotPt_V4;
- // Halfword.
- case Hexagon::POST_SThri:
- return !invertPredicate ? Hexagon::POST_SThri_cPt :
- Hexagon::POST_SThri_cNotPt;
- case Hexagon::STrih:
- return !invertPredicate ? Hexagon::STrih_cPt :
- Hexagon::STrih_cNotPt;
- case Hexagon::STrih_indexed:
- return !invertPredicate ? Hexagon::STrih_indexed_cPt :
- Hexagon::STrih_indexed_cNotPt;
- case Hexagon::STrih_imm_V4:
- return !invertPredicate ? Hexagon::STrih_imm_cPt_V4 :
- Hexagon::STrih_imm_cNotPt_V4;
- case Hexagon::STrih_indexed_shl_V4:
- return !invertPredicate ? Hexagon::STrih_indexed_shl_cPt_V4 :
- Hexagon::STrih_indexed_shl_cNotPt_V4;
+
// Word.
- case Hexagon::POST_STwri:
- return !invertPredicate ? Hexagon::POST_STwri_cPt :
- Hexagon::POST_STwri_cNotPt;
- case Hexagon::STriw:
+ case Hexagon::STriw_f:
return !invertPredicate ? Hexagon::STriw_cPt :
Hexagon::STriw_cNotPt;
- case Hexagon::STriw_indexed:
+ case Hexagon::STriw_indexed_f:
return !invertPredicate ? Hexagon::STriw_indexed_cPt :
Hexagon::STriw_indexed_cNotPt;
- case Hexagon::STriw_indexed_shl_V4:
- return !invertPredicate ? Hexagon::STriw_indexed_shl_cPt_V4 :
- Hexagon::STriw_indexed_shl_cNotPt_V4;
- case Hexagon::STriw_imm_V4:
- return !invertPredicate ? Hexagon::STriw_imm_cPt_V4 :
- Hexagon::STriw_imm_cNotPt_V4;
- // Double word.
- case Hexagon::POST_STdri:
- return !invertPredicate ? Hexagon::POST_STdri_cPt :
- Hexagon::POST_STdri_cNotPt;
- case Hexagon::STrid:
- return !invertPredicate ? Hexagon::STrid_cPt :
- Hexagon::STrid_cNotPt;
- case Hexagon::STrid_indexed:
- return !invertPredicate ? Hexagon::STrid_indexed_cPt :
- Hexagon::STrid_indexed_cNotPt;
- case Hexagon::STrid_indexed_shl_V4:
- return !invertPredicate ? Hexagon::STrid_indexed_shl_cPt_V4 :
- Hexagon::STrid_indexed_shl_cNotPt_V4;
-
- // V4 Store to global address
- case Hexagon::STd_GP_V4:
- return !invertPredicate ? Hexagon::STd_GP_cPt_V4 :
- Hexagon::STd_GP_cNotPt_V4;
- case Hexagon::STb_GP_V4:
- return !invertPredicate ? Hexagon::STb_GP_cPt_V4 :
- Hexagon::STb_GP_cNotPt_V4;
- case Hexagon::STh_GP_V4:
- return !invertPredicate ? Hexagon::STh_GP_cPt_V4 :
- Hexagon::STh_GP_cNotPt_V4;
- case Hexagon::STw_GP_V4:
- return !invertPredicate ? Hexagon::STw_GP_cPt_V4 :
- Hexagon::STw_GP_cNotPt_V4;
-
- // Load.
- case Hexagon::LDrid:
- return !invertPredicate ? Hexagon::LDrid_cPt :
- Hexagon::LDrid_cNotPt;
- case Hexagon::LDriw:
- return !invertPredicate ? Hexagon::LDriw_cPt :
- Hexagon::LDriw_cNotPt;
- case Hexagon::LDrih:
- return !invertPredicate ? Hexagon::LDrih_cPt :
- Hexagon::LDrih_cNotPt;
- case Hexagon::LDriuh:
- return !invertPredicate ? Hexagon::LDriuh_cPt :
- Hexagon::LDriuh_cNotPt;
- case Hexagon::LDrib:
- return !invertPredicate ? Hexagon::LDrib_cPt :
- Hexagon::LDrib_cNotPt;
- case Hexagon::LDriub:
- return !invertPredicate ? Hexagon::LDriub_cPt :
- Hexagon::LDriub_cNotPt;
- // Load Indexed.
- case Hexagon::LDrid_indexed:
- return !invertPredicate ? Hexagon::LDrid_indexed_cPt :
- Hexagon::LDrid_indexed_cNotPt;
- case Hexagon::LDriw_indexed:
- return !invertPredicate ? Hexagon::LDriw_indexed_cPt :
- Hexagon::LDriw_indexed_cNotPt;
- case Hexagon::LDrih_indexed:
- return !invertPredicate ? Hexagon::LDrih_indexed_cPt :
- Hexagon::LDrih_indexed_cNotPt;
- case Hexagon::LDriuh_indexed:
- return !invertPredicate ? Hexagon::LDriuh_indexed_cPt :
- Hexagon::LDriuh_indexed_cNotPt;
- case Hexagon::LDrib_indexed:
- return !invertPredicate ? Hexagon::LDrib_indexed_cPt :
- Hexagon::LDrib_indexed_cNotPt;
- case Hexagon::LDriub_indexed:
- return !invertPredicate ? Hexagon::LDriub_indexed_cPt :
- Hexagon::LDriub_indexed_cNotPt;
- // Post Increment Load.
- case Hexagon::POST_LDrid:
- return !invertPredicate ? Hexagon::POST_LDrid_cPt :
- Hexagon::POST_LDrid_cNotPt;
- case Hexagon::POST_LDriw:
- return !invertPredicate ? Hexagon::POST_LDriw_cPt :
- Hexagon::POST_LDriw_cNotPt;
- case Hexagon::POST_LDrih:
- return !invertPredicate ? Hexagon::POST_LDrih_cPt :
- Hexagon::POST_LDrih_cNotPt;
- case Hexagon::POST_LDriuh:
- return !invertPredicate ? Hexagon::POST_LDriuh_cPt :
- Hexagon::POST_LDriuh_cNotPt;
- case Hexagon::POST_LDrib:
- return !invertPredicate ? Hexagon::POST_LDrib_cPt :
- Hexagon::POST_LDrib_cNotPt;
- case Hexagon::POST_LDriub:
- return !invertPredicate ? Hexagon::POST_LDriub_cPt :
- Hexagon::POST_LDriub_cNotPt;
+
// DEALLOC_RETURN.
case Hexagon::DEALLOC_RET_V4:
return !invertPredicate ? Hexagon::DEALLOC_RET_cPt_V4 :
@@ -1889,13 +978,41 @@ isProfitableToIfCvt(MachineBasicBlock &TMBB,
return true;
}
-
+// Returns true if an instruction is predicated irrespective of the predicate
+// sense. For example, all of the following will return true.
+// if (p0) R1 = add(R2, R3)
+// if (!p0) R1 = add(R2, R3)
+// if (p0.new) R1 = add(R2, R3)
+// if (!p0.new) R1 = add(R2, R3)
bool HexagonInstrInfo::isPredicated(const MachineInstr *MI) const {
const uint64_t F = MI->getDesc().TSFlags;
return ((F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask);
}
+bool HexagonInstrInfo::isPredicated(unsigned Opcode) const {
+ const uint64_t F = get(Opcode).TSFlags;
+
+ return ((F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask);
+}
+
+bool HexagonInstrInfo::isPredicatedTrue(const MachineInstr *MI) const {
+ const uint64_t F = MI->getDesc().TSFlags;
+
+ assert(isPredicated(MI));
+ return (!((F >> HexagonII::PredicatedFalsePos) &
+ HexagonII::PredicatedFalseMask));
+}
+
+bool HexagonInstrInfo::isPredicatedTrue(unsigned Opcode) const {
+ const uint64_t F = get(Opcode).TSFlags;
+
+ // Make sure that the instruction is predicated.
+  assert((F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask);
+ return (!((F >> HexagonII::PredicatedFalsePos) &
+ HexagonII::PredicatedFalseMask));
+}
+
bool HexagonInstrInfo::isPredicatedNew(const MachineInstr *MI) const {
const uint64_t F = MI->getDesc().TSFlags;
@@ -1903,6 +1020,23 @@ bool HexagonInstrInfo::isPredicatedNew(const MachineInstr *MI) const {
return ((F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask);
}
+bool HexagonInstrInfo::isPredicatedNew(unsigned Opcode) const {
+ const uint64_t F = get(Opcode).TSFlags;
+
+ assert(isPredicated(Opcode));
+ return ((F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask);
+}
+
+// Returns true if a store instruction can be promoted to a new-value store.
+bool HexagonInstrInfo::mayBeNewStore(const MachineInstr *MI) const {
+ const HexagonRegisterInfo& QRI = getRegisterInfo();
+ const uint64_t F = MI->getDesc().TSFlags;
+
+ return ((F >> HexagonII::mayNVStorePos) &
+ HexagonII::mayNVStoreMask &
+ QRI.Subtarget.hasV4TOps());
+}
+
bool
HexagonInstrInfo::DefinesPredicate(MachineInstr *MI,
std::vector<MachineOperand> &Pred) const {
@@ -2087,6 +1221,8 @@ isValidAutoIncImm(const EVT VT, const int Offset) const {
bool HexagonInstrInfo::
isMemOp(const MachineInstr *MI) const {
+// return MI->getDesc().mayLoad() && MI->getDesc().mayStore();
+
switch (MI->getOpcode())
{
default: return false;
@@ -2371,6 +1507,22 @@ isConditionalStore (const MachineInstr* MI) const {
}
}
+
+bool HexagonInstrInfo::isNewValueJump(const MachineInstr *MI) const {
+ if (isNewValue(MI) && isBranch(MI))
+ return true;
+ return false;
+}
+
+bool HexagonInstrInfo::isPostIncrement (const MachineInstr* MI) const {
+ return (getAddrMode(MI) == HexagonII::PostInc);
+}
+
+bool HexagonInstrInfo::isNewValue(const MachineInstr* MI) const {
+ const uint64_t F = MI->getDesc().TSFlags;
+ return ((F >> HexagonII::NewValuePos) & HexagonII::NewValueMask);
+}
+
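
Note: isPostIncrement and isNewValueJump are likewise reduced to flag queries: the addressing mode is a small multi-bit field in TSFlags (compare the new PostInc AddrModeType added in HexagonInstrFormats.td), and a new-value jump is simply an instruction that is both new-value and a branch. A standalone sketch with invented field positions:

#include <cassert>
#include <cstdint>

enum DemoAddrMode { NoAddrMode = 0, Absolute, AbsoluteSet, BaseImmOffset,
                    BaseLongOffset, BaseRegOffset, PostInc };
enum { AddrModePos = 3, AddrModeMask = 0x7, NewValuePos = 1, NewValueMask = 0x1 };

static unsigned getAddrMode(uint64_t F) { return (F >> AddrModePos) & AddrModeMask; }
static bool isPostIncrement(uint64_t F) { return getAddrMode(F) == (unsigned)PostInc; }
static bool isNewValue(uint64_t F)      { return (F >> NewValuePos) & NewValueMask; }
static bool isNewValueJump(uint64_t F, bool IsBranch) {
  return isNewValue(F) && IsBranch;
}

int main() {
  uint64_t F = (uint64_t(PostInc) << AddrModePos) | (1ull << NewValuePos);
  assert(isPostIncrement(F) && isNewValueJump(F, /*IsBranch=*/true));
  return 0;
}
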
// Returns true, if any one of the operands is a dot new
// insn, whether it is predicated dot new or register dot new.
bool HexagonInstrInfo::isDotNewInst (const MachineInstr* MI) const {
@@ -2378,6 +1530,97 @@ bool HexagonInstrInfo::isDotNewInst (const MachineInstr* MI) const {
(isPredicated(MI) && isPredicatedNew(MI)));
}
+// Returns the most basic instruction for the .new predicated instructions and
+// new-value stores.
+// For example, all of the following instructions will be converted back to the
+// same instruction:
+// 1) if (p0.new) memw(R0+#0) = R1.new --->
+// 2) if (p0) memw(R0+#0)= R1.new -------> if (p0) memw(R0+#0) = R1
+// 3) if (p0.new) memw(R0+#0) = R1 --->
+//
+
+int HexagonInstrInfo::GetDotOldOp(const int opc) const {
+ int NewOp = opc;
+ if (isPredicated(NewOp) && isPredicatedNew(NewOp)) { // Get predicate old form
+ NewOp = Hexagon::getPredOldOpcode(NewOp);
+ if (NewOp < 0)
+ assert(0 && "Couldn't change predicate new instruction to its old form.");
+ }
+
+ if (isNewValueStore(NewOp)) { // Convert into non new-value format
+ NewOp = Hexagon::getNonNVStore(NewOp);
+ if (NewOp < 0)
+ assert(0 && "Couldn't change new-value store to its old form.");
+ }
+ return NewOp;
+}
+
+// Return the new value instruction for a given store.
+int HexagonInstrInfo::GetDotNewOp(const MachineInstr* MI) const {
+ int NVOpcode = Hexagon::getNewValueOpcode(MI->getOpcode());
+ if (NVOpcode >= 0) // Valid new-value store instruction.
+ return NVOpcode;
+
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Unknown .new type");
+ // store new value byte
+ case Hexagon::STrib_shl_V4:
+ return Hexagon::STrib_shl_nv_V4;
+
+ case Hexagon::STrih_shl_V4:
+ return Hexagon::STrih_shl_nv_V4;
+
+ case Hexagon::STriw_f:
+ return Hexagon::STriw_nv_V4;
+
+ case Hexagon::STriw_indexed_f:
+ return Hexagon::STriw_indexed_nv_V4;
+
+ case Hexagon::STriw_shl_V4:
+ return Hexagon::STriw_shl_nv_V4;
+
+ }
+ return 0;
+}
+
+// Return .new predicate version for an instruction.
+int HexagonInstrInfo::GetDotNewPredOp(MachineInstr *MI,
+ const MachineBranchProbabilityInfo
+ *MBPI) const {
+
+ int NewOpcode = Hexagon::getPredNewOpcode(MI->getOpcode());
+ if (NewOpcode >= 0) // Valid predicate new instruction
+ return NewOpcode;
+
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Unknown .new type");
+  // Conditional Jumps
+ case Hexagon::JMP_t:
+ case Hexagon::JMP_f:
+ return getDotNewPredJumpOp(MI, MBPI);
+
+ case Hexagon::JMPR_t:
+ return Hexagon::JMPR_tnew_tV3;
+
+ case Hexagon::JMPR_f:
+ return Hexagon::JMPR_fnew_tV3;
+
+ case Hexagon::JMPret_t:
+ return Hexagon::JMPret_tnew_tV3;
+
+ case Hexagon::JMPret_f:
+ return Hexagon::JMPret_fnew_tV3;
+
+
+ // Conditional combine
+ case Hexagon::COMBINE_rr_cPt :
+ return Hexagon::COMBINE_rr_cdnPt;
+ case Hexagon::COMBINE_rr_cNotPt :
+ return Hexagon::COMBINE_rr_cdnNotPt;
+ }
+}
+
+
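
Note: GetDotNewOp / GetDotNewPredOp and GetDotOldOp above are inverses built on the same generated relation maps: one direction promotes a store (or predicated instruction) to its .new form, the other demotes it back. A minimal round-trip sketch with invented opcodes standing in for the generated queries:

#include <cassert>

enum DemoOpcode { STriw = 0, STriw_nv, STriw_cPt, STriw_cdnPt };

// Stand-ins for the relation-map lookups; -1 means "no counterpart exists".
static int getNewValueOpcode(int Opc) { return Opc == STriw       ? STriw_nv    : -1; }
static int getNonNVStore(int Opc)     { return Opc == STriw_nv    ? STriw       : -1; }
static int getPredNewOpcode(int Opc)  { return Opc == STriw_cPt   ? STriw_cdnPt : -1; }
static int getPredOldOpcode(int Opc)  { return Opc == STriw_cdnPt ? STriw_cPt   : -1; }

int main() {
  // Promote a plain store to its new-value form and demote it back.
  assert(getNonNVStore(getNewValueOpcode(STriw)) == STriw);
  // Promote a predicated store to its predicate-.new form and back.
  assert(getPredOldOpcode(getPredNewOpcode(STriw_cPt)) == STriw_cPt);
  return 0;
}
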
unsigned HexagonInstrInfo::getAddrMode(const MachineInstr* MI) const {
const uint64_t F = MI->getDesc().TSFlags;
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h
index e0bec04..42ffb48 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -27,7 +27,7 @@ namespace llvm {
class HexagonInstrInfo : public HexagonGenInstrInfo {
const HexagonRegisterInfo RI;
- const HexagonSubtarget& Subtarget;
+ const HexagonSubtarget &Subtarget;
typedef unsigned Opcode_t;
public:
@@ -113,6 +113,7 @@ public:
unsigned createVR(MachineFunction* MF, MVT VT) const;
+ virtual bool isBranch(const MachineInstr *MI) const;
virtual bool isPredicable(MachineInstr *MI) const;
virtual bool
PredicateInstruction(MachineInstr *MI,
@@ -129,7 +130,11 @@ public:
const BranchProbability &Probability) const;
virtual bool isPredicated(const MachineInstr *MI) const;
+ virtual bool isPredicated(unsigned Opcode) const;
+ virtual bool isPredicatedTrue(const MachineInstr *MI) const;
+ virtual bool isPredicatedTrue(unsigned Opcode) const;
virtual bool isPredicatedNew(const MachineInstr *MI) const;
+ virtual bool isPredicatedNew(unsigned Opcode) const;
virtual bool DefinesPredicate(MachineInstr *MI,
std::vector<MachineOperand> &Pred) const;
virtual bool
@@ -178,13 +183,21 @@ public:
bool isConditionalLoad (const MachineInstr* MI) const;
bool isConditionalStore(const MachineInstr* MI) const;
bool isNewValueInst(const MachineInstr* MI) const;
+ bool isNewValue(const MachineInstr* MI) const;
bool isDotNewInst(const MachineInstr* MI) const;
+ int GetDotOldOp(const int opc) const;
+ int GetDotNewOp(const MachineInstr* MI) const;
+ int GetDotNewPredOp(MachineInstr *MI,
+ const MachineBranchProbabilityInfo
+ *MBPI) const;
+ bool mayBeNewStore(const MachineInstr* MI) const;
bool isDeallocRet(const MachineInstr *MI) const;
unsigned getInvertedPredicatedOpcode(const int Opc) const;
bool isExtendable(const MachineInstr* MI) const;
bool isExtended(const MachineInstr* MI) const;
bool isPostIncrement(const MachineInstr* MI) const;
bool isNewValueStore(const MachineInstr* MI) const;
+ bool isNewValueStore(unsigned Opcode) const;
bool isNewValueJump(const MachineInstr* MI) const;
bool isNewValueJumpCandidate(const MachineInstr *MI) const;
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.td b/lib/Target/Hexagon/HexagonInstrInfo.td
index 2a4b17b..c96aaca 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.td
+++ b/lib/Target/Hexagon/HexagonInstrInfo.td
@@ -384,6 +384,12 @@ def TFCR : CRInst<(outs CRRegs:$dst), (ins IntRegs:$src1),
// ALU32/PERM +
//===----------------------------------------------------------------------===//
+let neverHasSideEffects = 1 in
+def COMBINE_ii : ALU32_ii<(outs DoubleRegs:$dst),
+ (ins s8Imm:$src1, s8Imm:$src2),
+ "$dst = combine(#$src1, #$src2)",
+ []>;
+
// Mux.
def VMUX_prr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins PredRegs:$src1,
DoubleRegs:$src2,
@@ -932,12 +938,21 @@ multiclass LD_MEMri<string mnemonic, string CextOp, RegisterClass RC,
}
let addrMode = BaseImmOffset, isMEMri = "true" in {
- defm LDrib: LD_MEMri < "memb", "LDrib", IntRegs, 11, 6>, AddrModeRel;
- defm LDriub: LD_MEMri < "memub" , "LDriub", IntRegs, 11, 6>, AddrModeRel;
- defm LDrih: LD_MEMri < "memh", "LDrih", IntRegs, 12, 7>, AddrModeRel;
- defm LDriuh: LD_MEMri < "memuh", "LDriuh", IntRegs, 12, 7>, AddrModeRel;
- defm LDriw: LD_MEMri < "memw", "LDriw", IntRegs, 13, 8>, AddrModeRel;
- defm LDrid: LD_MEMri < "memd", "LDrid", DoubleRegs, 14, 9>, AddrModeRel;
+ let accessSize = ByteAccess in {
+ defm LDrib: LD_MEMri < "memb", "LDrib", IntRegs, 11, 6>, AddrModeRel;
+ defm LDriub: LD_MEMri < "memub" , "LDriub", IntRegs, 11, 6>, AddrModeRel;
+ }
+
+ let accessSize = HalfWordAccess in {
+ defm LDrih: LD_MEMri < "memh", "LDrih", IntRegs, 12, 7>, AddrModeRel;
+ defm LDriuh: LD_MEMri < "memuh", "LDriuh", IntRegs, 12, 7>, AddrModeRel;
+ }
+
+ let accessSize = WordAccess in
+ defm LDriw: LD_MEMri < "memw", "LDriw", IntRegs, 13, 8>, AddrModeRel;
+
+ let accessSize = DoubleWordAccess in
+ defm LDrid: LD_MEMri < "memd", "LDrid", DoubleRegs, 14, 9>, AddrModeRel;
}
def : Pat < (i32 (sextloadi8 ADDRriS11_0:$addr)),
@@ -1000,18 +1015,25 @@ multiclass LD_Idxd<string mnemonic, string CextOp, RegisterClass RC,
}
let addrMode = BaseImmOffset in {
- defm LDrib_indexed: LD_Idxd <"memb", "LDrib", IntRegs, s11_0Ext, u6_0Ext,
- 11, 6>, AddrModeRel;
- defm LDriub_indexed: LD_Idxd <"memub" , "LDriub", IntRegs, s11_0Ext, u6_0Ext,
- 11, 6>, AddrModeRel;
- defm LDrih_indexed: LD_Idxd <"memh", "LDrih", IntRegs, s11_1Ext, u6_1Ext,
- 12, 7>, AddrModeRel;
- defm LDriuh_indexed: LD_Idxd <"memuh", "LDriuh", IntRegs, s11_1Ext, u6_1Ext,
- 12, 7>, AddrModeRel;
- defm LDriw_indexed: LD_Idxd <"memw", "LDriw", IntRegs, s11_2Ext, u6_2Ext,
- 13, 8>, AddrModeRel;
- defm LDrid_indexed: LD_Idxd <"memd", "LDrid", DoubleRegs, s11_3Ext, u6_3Ext,
- 14, 9>, AddrModeRel;
+ let accessSize = ByteAccess in {
+ defm LDrib_indexed: LD_Idxd <"memb", "LDrib", IntRegs, s11_0Ext, u6_0Ext,
+ 11, 6>, AddrModeRel;
+ defm LDriub_indexed: LD_Idxd <"memub" , "LDriub", IntRegs, s11_0Ext, u6_0Ext,
+ 11, 6>, AddrModeRel;
+ }
+ let accessSize = HalfWordAccess in {
+ defm LDrih_indexed: LD_Idxd <"memh", "LDrih", IntRegs, s11_1Ext, u6_1Ext,
+ 12, 7>, AddrModeRel;
+ defm LDriuh_indexed: LD_Idxd <"memuh", "LDriuh", IntRegs, s11_1Ext, u6_1Ext,
+ 12, 7>, AddrModeRel;
+ }
+ let accessSize = WordAccess in
+ defm LDriw_indexed: LD_Idxd <"memw", "LDriw", IntRegs, s11_2Ext, u6_2Ext,
+ 13, 8>, AddrModeRel;
+
+ let accessSize = DoubleWordAccess in
+ defm LDrid_indexed: LD_Idxd <"memd", "LDrid", DoubleRegs, s11_3Ext, u6_3Ext,
+ 14, 9>, AddrModeRel;
}
let AddedComplexity = 20 in {
@@ -1036,8 +1058,6 @@ def : Pat < (i64 (load (add IntRegs:$src1, s11_3ExtPred:$offset))),
//===----------------------------------------------------------------------===//
// Post increment load
-// Make sure that in post increment load, the first operand is always the post
-// increment operand.
//===----------------------------------------------------------------------===//
multiclass LD_PostInc_Pbase<string mnemonic, RegisterClass RC, Operand ImmOp,
@@ -1079,7 +1099,7 @@ multiclass LD_PostInc<string mnemonic, string BaseOp, RegisterClass RC,
}
}
-let hasCtrlDep = 1, neverHasSideEffects = 1 in {
+let hasCtrlDep = 1, neverHasSideEffects = 1, addrMode = PostInc in {
defm POST_LDrib : LD_PostInc<"memb", "LDrib", IntRegs, s4_0Imm>,
PredNewRel;
defm POST_LDriub : LD_PostInc<"memub", "LDriub", IntRegs, s4_0Imm>,
@@ -1382,7 +1402,7 @@ multiclass ST_PostInc_Pbase<string mnemonic, RegisterClass RC, Operand ImmOp,
multiclass ST_PostInc_Pred<string mnemonic, RegisterClass RC,
Operand ImmOp, bit PredNot> {
let isPredicatedFalse = PredNot in {
- defm _c#NAME# : ST_PostInc_Pbase<mnemonic, RC, ImmOp, PredNot, 0>;
+ defm _c#NAME : ST_PostInc_Pbase<mnemonic, RC, ImmOp, PredNot, 0>;
// Predicate new
let Predicates = [HasV4T], validSubTargets = HasV4SubT in
defm _cdn#NAME#_V4 : ST_PostInc_Pbase<mnemonic, RC, ImmOp, PredNot, 1>;
@@ -1397,7 +1417,7 @@ multiclass ST_PostInc<string mnemonic, string BaseOp, RegisterClass RC,
let isPredicable = 1 in
def NAME : STInst2PI<(outs IntRegs:$dst),
(ins IntRegs:$src1, ImmOp:$offset, RC:$src2),
- #mnemonic#"($src1++#$offset) = $src2",
+ mnemonic#"($src1++#$offset) = $src2",
[],
"$src1 = $dst">;
@@ -1474,12 +1494,17 @@ multiclass ST_MEMri<string mnemonic, string CextOp, RegisterClass RC,
}
let addrMode = BaseImmOffset, isMEMri = "true" in {
- defm STrib: ST_MEMri < "memb", "STrib", IntRegs, 11, 6>, AddrModeRel;
- defm STrih: ST_MEMri < "memh", "STrih", IntRegs, 12, 7>, AddrModeRel;
- defm STriw: ST_MEMri < "memw", "STriw", IntRegs, 13, 8>, AddrModeRel;
+ let accessSize = ByteAccess in
+ defm STrib: ST_MEMri < "memb", "STrib", IntRegs, 11, 6>, AddrModeRel;
+
+ let accessSize = HalfWordAccess in
+ defm STrih: ST_MEMri < "memh", "STrih", IntRegs, 12, 7>, AddrModeRel;
- let isNVStorable = 0 in
- defm STrid: ST_MEMri < "memd", "STrid", DoubleRegs, 14, 9>, AddrModeRel;
+ let accessSize = WordAccess in
+ defm STriw: ST_MEMri < "memw", "STriw", IntRegs, 13, 8>, AddrModeRel;
+
+ let accessSize = DoubleWordAccess, isNVStorable = 0 in
+ defm STrid: ST_MEMri < "memd", "STrid", DoubleRegs, 14, 9>, AddrModeRel;
}
def : Pat<(truncstorei8 (i32 IntRegs:$src1), ADDRriS11_0:$addr),
@@ -1541,15 +1566,21 @@ multiclass ST_Idxd<string mnemonic, string CextOp, RegisterClass RC,
}
let addrMode = BaseImmOffset, InputType = "reg" in {
- defm STrib_indexed: ST_Idxd < "memb", "STrib", IntRegs, s11_0Ext,
- u6_0Ext, 11, 6>, AddrModeRel, ImmRegRel;
- defm STrih_indexed: ST_Idxd < "memh", "STrih", IntRegs, s11_1Ext,
- u6_1Ext, 12, 7>, AddrModeRel, ImmRegRel;
- defm STriw_indexed: ST_Idxd < "memw", "STriw", IntRegs, s11_2Ext,
- u6_2Ext, 13, 8>, AddrModeRel, ImmRegRel;
- let isNVStorable = 0 in
- defm STrid_indexed: ST_Idxd < "memd", "STrid", DoubleRegs, s11_3Ext,
- u6_3Ext, 14, 9>, AddrModeRel;
+ let accessSize = ByteAccess in
+ defm STrib_indexed: ST_Idxd < "memb", "STrib", IntRegs, s11_0Ext,
+ u6_0Ext, 11, 6>, AddrModeRel, ImmRegRel;
+
+ let accessSize = HalfWordAccess in
+ defm STrih_indexed: ST_Idxd < "memh", "STrih", IntRegs, s11_1Ext,
+ u6_1Ext, 12, 7>, AddrModeRel, ImmRegRel;
+
+ let accessSize = WordAccess in
+ defm STriw_indexed: ST_Idxd < "memw", "STriw", IntRegs, s11_2Ext,
+ u6_2Ext, 13, 8>, AddrModeRel, ImmRegRel;
+
+ let accessSize = DoubleWordAccess, isNVStorable = 0 in
+ defm STrid_indexed: ST_Idxd < "memd", "STrid", DoubleRegs, s11_3Ext,
+ u6_3Ext, 14, 9>, AddrModeRel;
}
let AddedComplexity = 10 in {
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV4.td b/lib/Target/Hexagon/HexagonInstrInfoV4.td
index 744efe8..fee83fb 100644
--- a/lib/Target/Hexagon/HexagonInstrInfoV4.td
+++ b/lib/Target/Hexagon/HexagonInstrInfoV4.td
@@ -213,7 +213,7 @@ def COMBINE_iI_V4 : ALU32_ii<(outs DoubleRegs:$dst),
// Template class for load instructions with Absolute set addressing mode.
//===----------------------------------------------------------------------===//
let isExtended = 1, opExtendable = 2, neverHasSideEffects = 1,
-validSubTargets = HasV4SubT in
+validSubTargets = HasV4SubT, addrMode = AbsoluteSet in
class T_LD_abs_set<string mnemonic, RegisterClass RC>:
LDInst2<(outs RC:$dst1, IntRegs:$dst2),
(ins u0AlwaysExt:$addr),
@@ -266,12 +266,23 @@ multiclass ld_idxd_shl<string mnemonic, string CextOp, RegisterClass RC> {
}
let addrMode = BaseRegOffset in {
- defm LDrib_indexed_shl: ld_idxd_shl<"memb", "LDrib", IntRegs>, AddrModeRel;
- defm LDriub_indexed_shl: ld_idxd_shl<"memub", "LDriub", IntRegs>, AddrModeRel;
- defm LDrih_indexed_shl: ld_idxd_shl<"memh", "LDrih", IntRegs>, AddrModeRel;
- defm LDriuh_indexed_shl: ld_idxd_shl<"memuh", "LDriuh", IntRegs>, AddrModeRel;
- defm LDriw_indexed_shl: ld_idxd_shl<"memw", "LDriw", IntRegs>, AddrModeRel;
- defm LDrid_indexed_shl: ld_idxd_shl<"memd", "LDrid", DoubleRegs>, AddrModeRel;
+ let accessSize = ByteAccess in {
+ defm LDrib_indexed_shl: ld_idxd_shl<"memb", "LDrib", IntRegs>,
+ AddrModeRel;
+ defm LDriub_indexed_shl: ld_idxd_shl<"memub", "LDriub", IntRegs>,
+ AddrModeRel;
+ }
+ let accessSize = HalfWordAccess in {
+ defm LDrih_indexed_shl: ld_idxd_shl<"memh", "LDrih", IntRegs>, AddrModeRel;
+ defm LDriuh_indexed_shl: ld_idxd_shl<"memuh", "LDriuh", IntRegs>,
+ AddrModeRel;
+ }
+ let accessSize = WordAccess in
+ defm LDriw_indexed_shl: ld_idxd_shl<"memw", "LDriw", IntRegs>, AddrModeRel;
+
+ let accessSize = DoubleWordAccess in
+ defm LDrid_indexed_shl: ld_idxd_shl<"memd", "LDrid", DoubleRegs>,
+ AddrModeRel;
}
// 'def pats' for load instructions with base + register offset and non-zero
@@ -456,7 +467,8 @@ def: Pat <(i64 (extloadi32 (i32 (add IntRegs:$src1, s11_2ExtPred:$offset)))),
//===----------------------------------------------------------------------===//
// Template class for store instructions with Absolute set addressing mode.
//===----------------------------------------------------------------------===//
-let isExtended = 1, opExtendable = 2, validSubTargets = HasV4SubT in
+let isExtended = 1, opExtendable = 2, validSubTargets = HasV4SubT,
+addrMode = AbsoluteSet in
class T_ST_abs_set<string mnemonic, RegisterClass RC>:
STInst2<(outs IntRegs:$dst1),
(ins RC:$src1, u0AlwaysExt:$src2),
@@ -551,17 +563,20 @@ multiclass ST_Idxd_shl_nv<string mnemonic, string CextOp, RegisterClass RC> {
let addrMode = BaseRegOffset, neverHasSideEffects = 1,
validSubTargets = HasV4SubT in {
- defm STrib_indexed_shl: ST_Idxd_shl<"memb", "STrib", IntRegs>,
- ST_Idxd_shl_nv<"memb", "STrib", IntRegs>, AddrModeRel;
+ let accessSize = ByteAccess in
+ defm STrib_indexed_shl: ST_Idxd_shl<"memb", "STrib", IntRegs>,
+ ST_Idxd_shl_nv<"memb", "STrib", IntRegs>, AddrModeRel;
- defm STrih_indexed_shl: ST_Idxd_shl<"memh", "STrih", IntRegs>,
- ST_Idxd_shl_nv<"memh", "STrih", IntRegs>, AddrModeRel;
+ let accessSize = HalfWordAccess in
+ defm STrih_indexed_shl: ST_Idxd_shl<"memh", "STrih", IntRegs>,
+ ST_Idxd_shl_nv<"memh", "STrih", IntRegs>, AddrModeRel;
- defm STriw_indexed_shl: ST_Idxd_shl<"memw", "STriw", IntRegs>,
- ST_Idxd_shl_nv<"memw", "STriw", IntRegs>, AddrModeRel;
+ let accessSize = WordAccess in
+ defm STriw_indexed_shl: ST_Idxd_shl<"memw", "STriw", IntRegs>,
+ ST_Idxd_shl_nv<"memw", "STriw", IntRegs>, AddrModeRel;
- let isNVStorable = 0 in
- defm STrid_indexed_shl: ST_Idxd_shl<"memd", "STrid", DoubleRegs>, AddrModeRel;
+ let isNVStorable = 0, accessSize = DoubleWordAccess in
+ defm STrid_indexed_shl: ST_Idxd_shl<"memd", "STrid", DoubleRegs>, AddrModeRel;
}
let Predicates = [HasV4T], AddedComplexity = 10 in {
@@ -695,10 +710,15 @@ multiclass ST_Imm<string mnemonic, string CextOp, Operand OffsetOp> {
}
let addrMode = BaseImmOffset, InputType = "imm",
- validSubTargets = HasV4SubT in {
- defm STrib_imm : ST_Imm<"memb", "STrib", u6_0Imm>, ImmRegRel, PredNewRel;
- defm STrih_imm : ST_Imm<"memh", "STrih", u6_1Imm>, ImmRegRel, PredNewRel;
- defm STriw_imm : ST_Imm<"memw", "STriw", u6_2Imm>, ImmRegRel, PredNewRel;
+validSubTargets = HasV4SubT in {
+ let accessSize = ByteAccess in
+ defm STrib_imm : ST_Imm<"memb", "STrib", u6_0Imm>, ImmRegRel, PredNewRel;
+
+ let accessSize = HalfWordAccess in
+ defm STrih_imm : ST_Imm<"memh", "STrih", u6_1Imm>, ImmRegRel, PredNewRel;
+
+ let accessSize = WordAccess in
+ defm STriw_imm : ST_Imm<"memw", "STriw", u6_2Imm>, ImmRegRel, PredNewRel;
}
let Predicates = [HasV4T], AddedComplexity = 10 in {
@@ -834,12 +854,17 @@ multiclass ST_Idxd_nv<string mnemonic, string CextOp, RegisterClass RC,
}
let addrMode = BaseImmOffset, validSubTargets = HasV4SubT in {
- defm STrib_indexed: ST_Idxd_nv<"memb", "STrib", IntRegs, s11_0Ext,
- u6_0Ext, 11, 6>, AddrModeRel;
- defm STrih_indexed: ST_Idxd_nv<"memh", "STrih", IntRegs, s11_1Ext,
- u6_1Ext, 12, 7>, AddrModeRel;
- defm STriw_indexed: ST_Idxd_nv<"memw", "STriw", IntRegs, s11_2Ext,
- u6_2Ext, 13, 8>, AddrModeRel;
+ let accessSize = ByteAccess in
+ defm STrib_indexed: ST_Idxd_nv<"memb", "STrib", IntRegs, s11_0Ext,
+ u6_0Ext, 11, 6>, AddrModeRel;
+
+ let accessSize = HalfWordAccess in
+ defm STrih_indexed: ST_Idxd_nv<"memh", "STrih", IntRegs, s11_1Ext,
+ u6_1Ext, 12, 7>, AddrModeRel;
+
+ let accessSize = WordAccess in
+ defm STriw_indexed: ST_Idxd_nv<"memw", "STriw", IntRegs, s11_2Ext,
+ u6_2Ext, 13, 8>, AddrModeRel;
}
// multiclass for new-value store instructions with base + immediate offset.
@@ -887,9 +912,14 @@ multiclass ST_MEMri_nv<string mnemonic, string CextOp, RegisterClass RC,
let addrMode = BaseImmOffset, isMEMri = "true", validSubTargets = HasV4SubT,
mayStore = 1 in {
- defm STrib: ST_MEMri_nv<"memb", "STrib", IntRegs, 11, 6>, AddrModeRel;
- defm STrih: ST_MEMri_nv<"memh", "STrih", IntRegs, 12, 7>, AddrModeRel;
- defm STriw: ST_MEMri_nv<"memw", "STriw", IntRegs, 13, 8>, AddrModeRel;
+ let accessSize = ByteAccess in
+ defm STrib: ST_MEMri_nv<"memb", "STrib", IntRegs, 11, 6>, AddrModeRel;
+
+ let accessSize = HalfWordAccess in
+ defm STrih: ST_MEMri_nv<"memh", "STrih", IntRegs, 12, 7>, AddrModeRel;
+
+ let accessSize = WordAccess in
+ defm STriw: ST_MEMri_nv<"memw", "STriw", IntRegs, 13, 8>, AddrModeRel;
}
//===----------------------------------------------------------------------===//
@@ -939,7 +969,7 @@ multiclass ST_PostInc_nv<string mnemonic, string BaseOp, RegisterClass RC,
}
}
-let validSubTargets = HasV4SubT in {
+let addrMode = PostInc, validSubTargets = HasV4SubT in {
defm POST_STbri: ST_PostInc_nv <"memb", "STrib", IntRegs, s4_0Imm>, AddrModeRel;
defm POST_SThri: ST_PostInc_nv <"memh", "STrih", IntRegs, s4_1Imm>, AddrModeRel;
defm POST_STwri: ST_PostInc_nv <"memw", "STriw", IntRegs, s4_2Imm>, AddrModeRel;
@@ -967,179 +997,193 @@ defm POST_STwri: ST_PostInc_nv <"memw", "STriw", IntRegs, s4_2Imm>, AddrModeRel;
// NV/J +
//===----------------------------------------------------------------------===//
-multiclass NVJ_type_basic_reg<string NotStr, string OpcStr, string TakenStr> {
- def _ie_nv_V4 : NVInst_V4<(outs),
- (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset),
- !strconcat("if (", !strconcat(NotStr, !strconcat(OpcStr,
- !strconcat("($src1.new, $src2)) jump:",
- !strconcat(TakenStr, " $offset"))))),
- []>,
- Requires<[HasV4T]>;
+//===----------------------------------------------------------------------===//
+// multiclass/template class for the new-value compare jumps with register
+// operands.
+//===----------------------------------------------------------------------===//
- def _nv_V4 : NVInst_V4<(outs),
- (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset),
- !strconcat("if (", !strconcat(NotStr, !strconcat(OpcStr,
- !strconcat("($src1.new, $src2)) jump:",
- !strconcat(TakenStr, " $offset"))))),
- []>,
- Requires<[HasV4T]>;
-}
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 11 in
+class NVJrr_template<string mnemonic, bits<3> majOp, bit NvOpNum,
+ bit isNegCond, bit isTaken>
+ : NVInst_V4<(outs),
+ (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset),
+ "if ("#!if(isNegCond, "!","")#mnemonic#
+ "($src1"#!if(!eq(NvOpNum, 0),".new, ",", ")#
+ "$src2"#!if(!eq(NvOpNum, 1),".new))","))")#" jump:"
+ #!if(isTaken, "t","nt")#" $offset",
+ []>, Requires<[HasV4T]> {
-multiclass NVJ_type_basic_2ndDotNew<string NotStr, string OpcStr,
- string TakenStr> {
- def _ie_nv_V4 : NVInst_V4<(outs),
- (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset),
- !strconcat("if (", !strconcat(NotStr, !strconcat(OpcStr,
- !strconcat("($src1, $src2.new)) jump:",
- !strconcat(TakenStr, " $offset"))))),
- []>,
- Requires<[HasV4T]>;
+ bits<5> src1;
+ bits<5> src2;
+ bits<3> Ns; // New-Value Operand
+ bits<5> RegOp; // Non New-Value Operand
+ bits<11> offset;
- def _nv_V4 : NVInst_V4<(outs),
- (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset),
- !strconcat("if (", !strconcat(NotStr, !strconcat(OpcStr,
- !strconcat("($src1, $src2.new)) jump:",
- !strconcat(TakenStr, " $offset"))))),
- []>,
- Requires<[HasV4T]>;
-}
+ let isBrTaken = !if(isTaken, "true", "false");
+ let isPredicatedFalse = isNegCond;
-multiclass NVJ_type_basic_imm<string NotStr, string OpcStr, string TakenStr> {
- def _ie_nv_V4 : NVInst_V4<(outs),
- (ins IntRegs:$src1, u5Imm:$src2, brtarget:$offset),
- !strconcat("if (", !strconcat(NotStr, !strconcat(OpcStr,
- !strconcat("($src1.new, #$src2)) jump:",
- !strconcat(TakenStr, " $offset"))))),
- []>,
- Requires<[HasV4T]>;
+ let Ns = !if(!eq(NvOpNum, 0), src1{2-0}, src2{2-0});
+ let RegOp = !if(!eq(NvOpNum, 0), src2, src1);
- def _nv_V4 : NVInst_V4<(outs),
- (ins IntRegs:$src1, u5Imm:$src2, brtarget:$offset),
- !strconcat("if (", !strconcat(NotStr, !strconcat(OpcStr,
- !strconcat("($src1.new, #$src2)) jump:",
- !strconcat(TakenStr, " $offset"))))),
- []>,
- Requires<[HasV4T]>;
+ let IClass = 0b0010;
+ let Inst{26} = 0b0;
+ let Inst{25-23} = majOp;
+ let Inst{22} = isNegCond;
+ let Inst{18-16} = Ns;
+ let Inst{13} = isTaken;
+ let Inst{12-8} = RegOp;
+ let Inst{21-20} = offset{10-9};
+ let Inst{7-1} = offset{8-2};
}
-multiclass NVJ_type_basic_neg<string NotStr, string OpcStr, string TakenStr> {
- def _ie_nv_V4 : NVInst_V4<(outs),
- (ins IntRegs:$src1, nOneImm:$src2, brtarget:$offset),
- !strconcat("if (", !strconcat(NotStr, !strconcat(OpcStr,
- !strconcat("($src1.new, #$src2)) jump:",
- !strconcat(TakenStr, " $offset"))))),
- []>,
- Requires<[HasV4T]>;
- def _nv_V4 : NVInst_V4<(outs),
- (ins IntRegs:$src1, nOneImm:$src2, brtarget:$offset),
- !strconcat("if (", !strconcat(NotStr, !strconcat(OpcStr,
- !strconcat("($src1.new, #$src2)) jump:",
- !strconcat(TakenStr, " $offset"))))),
- []>,
- Requires<[HasV4T]>;
+multiclass NVJrr_cond<string mnemonic, bits<3> majOp, bit NvOpNum,
+ bit isNegCond> {
+ // Branch not taken:
+ def _nt_V4: NVJrr_template<mnemonic, majOp, NvOpNum, isNegCond, 0>;
+ // Branch taken:
+ def _t_V4: NVJrr_template<mnemonic, majOp, NvOpNum, isNegCond, 1>;
}
-multiclass NVJ_type_basic_tstbit<string NotStr, string OpcStr,
- string TakenStr> {
- def _ie_nv_V4 : NVInst_V4<(outs),
- (ins IntRegs:$src1, u1Imm:$src2, brtarget:$offset),
- !strconcat("if (", !strconcat(NotStr, !strconcat(OpcStr,
- !strconcat("($src1.new, #$src2)) jump:",
- !strconcat(TakenStr, " $offset"))))),
- []>,
- Requires<[HasV4T]>;
+// NvOpNum = 0 -> First Operand is a new-value Register
+// NvOpNum = 1 -> Second Operand is a new-value Register
- def _nv_V4 : NVInst_V4<(outs),
- (ins IntRegs:$src1, u1Imm:$src2, brtarget:$offset),
- !strconcat("if (", !strconcat(NotStr, !strconcat(OpcStr,
- !strconcat("($src1.new, #$src2)) jump:",
- !strconcat(TakenStr, " $offset"))))),
- []>,
- Requires<[HasV4T]>;
+multiclass NVJrr_base<string mnemonic, string BaseOp, bits<3> majOp,
+ bit NvOpNum> {
+ let BaseOpcode = BaseOp#_NVJ in {
+ defm _t_Jumpnv : NVJrr_cond<mnemonic, majOp, NvOpNum, 0>; // True cond
+ defm _f_Jumpnv : NVJrr_cond<mnemonic, majOp, NvOpNum, 1>; // False cond
+ }
}
-// Multiclass for regular dot new of Ist operand register.
-multiclass NVJ_type_br_pred_reg<string NotStr, string OpcStr> {
- defm Pt : NVJ_type_basic_reg<NotStr, OpcStr, "t">;
- defm Pnt : NVJ_type_basic_reg<NotStr, OpcStr, "nt">;
-}
+// if ([!]cmp.eq(Ns.new,Rt)) jump:[n]t #r9:2
+// if ([!]cmp.gt(Ns.new,Rt)) jump:[n]t #r9:2
+// if ([!]cmp.gtu(Ns.new,Rt)) jump:[n]t #r9:2
+// if ([!]cmp.gt(Rt,Ns.new)) jump:[n]t #r9:2
+// if ([!]cmp.gtu(Rt,Ns.new)) jump:[n]t #r9:2
-// Multiclass for dot new of 2nd operand register.
-multiclass NVJ_type_br_pred_2ndDotNew<string NotStr, string OpcStr> {
- defm Pt : NVJ_type_basic_2ndDotNew<NotStr, OpcStr, "t">;
- defm Pnt : NVJ_type_basic_2ndDotNew<NotStr, OpcStr, "nt">;
+let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator = 1,
+ Defs = [PC], neverHasSideEffects = 1, validSubTargets = HasV4SubT in {
+ defm CMPEQrr : NVJrr_base<"cmp.eq", "CMPEQ", 0b000, 0>, PredRel;
+ defm CMPGTrr : NVJrr_base<"cmp.gt", "CMPGT", 0b001, 0>, PredRel;
+ defm CMPGTUrr : NVJrr_base<"cmp.gtu", "CMPGTU", 0b010, 0>, PredRel;
+ defm CMPLTrr : NVJrr_base<"cmp.gt", "CMPLT", 0b011, 1>, PredRel;
+ defm CMPLTUrr : NVJrr_base<"cmp.gtu", "CMPLTU", 0b100, 1>, PredRel;
}
-// Multiclass for 2nd operand immediate, including -1.
-multiclass NVJ_type_br_pred_imm<string NotStr, string OpcStr> {
- defm Pt : NVJ_type_basic_imm<NotStr, OpcStr, "t">;
- defm Pnt : NVJ_type_basic_imm<NotStr, OpcStr, "nt">;
- defm Ptneg : NVJ_type_basic_neg<NotStr, OpcStr, "t">;
- defm Pntneg : NVJ_type_basic_neg<NotStr, OpcStr, "nt">;
-}
+//===----------------------------------------------------------------------===//
+// multiclass/template class for the new-value compare jump instructions
+// with a register and an unsigned immediate (U5) operand.
+//===----------------------------------------------------------------------===//
-// Multiclass for 2nd operand immediate, excluding -1.
-multiclass NVJ_type_br_pred_imm_only<string NotStr, string OpcStr> {
- defm Pt : NVJ_type_basic_imm<NotStr, OpcStr, "t">;
- defm Pnt : NVJ_type_basic_imm<NotStr, OpcStr, "nt">;
-}
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 11 in
+class NVJri_template<string mnemonic, bits<3> majOp, bit isNegCond,
+ bit isTaken>
+ : NVInst_V4<(outs),
+ (ins IntRegs:$src1, u5Imm:$src2, brtarget:$offset),
+ "if ("#!if(isNegCond, "!","")#mnemonic#"($src1.new, #$src2)) jump:"
+ #!if(isTaken, "t","nt")#" $offset",
+ []>, Requires<[HasV4T]> {
-// Multiclass for tstbit, where 2nd operand is always #0.
-multiclass NVJ_type_br_pred_tstbit<string NotStr, string OpcStr> {
- defm Pt : NVJ_type_basic_tstbit<NotStr, OpcStr, "t">;
- defm Pnt : NVJ_type_basic_tstbit<NotStr, OpcStr, "nt">;
+ let isPredicatedFalse = isNegCond;
+ let isBrTaken = !if(isTaken, "true", "false");
+
+ bits<3> src1;
+ bits<5> src2;
+ bits<11> offset;
+
+ let IClass = 0b0010;
+ let Inst{26} = 0b1;
+ let Inst{25-23} = majOp;
+ let Inst{22} = isNegCond;
+ let Inst{18-16} = src1;
+ let Inst{13} = isTaken;
+ let Inst{12-8} = src2;
+ let Inst{21-20} = offset{10-9};
+ let Inst{7-1} = offset{8-2};
}
-// Multiclass for GT.
-multiclass NVJ_type_rr_ri<string OpcStr> {
- defm rrNot : NVJ_type_br_pred_reg<"!", OpcStr>;
- defm rr : NVJ_type_br_pred_reg<"", OpcStr>;
- defm rrdnNot : NVJ_type_br_pred_2ndDotNew<"!", OpcStr>;
- defm rrdn : NVJ_type_br_pred_2ndDotNew<"", OpcStr>;
- defm riNot : NVJ_type_br_pred_imm<"!", OpcStr>;
- defm ri : NVJ_type_br_pred_imm<"", OpcStr>;
+multiclass NVJri_cond<string mnemonic, bits<3> majOp, bit isNegCond> {
+ // Branch not taken:
+ def _nt_V4: NVJri_template<mnemonic, majOp, isNegCond, 0>;
+ // Branch taken:
+ def _t_V4: NVJri_template<mnemonic, majOp, isNegCond, 1>;
}
-// Multiclass for EQ.
-multiclass NVJ_type_rr_ri_no_2ndDotNew<string OpcStr> {
- defm rrNot : NVJ_type_br_pred_reg<"!", OpcStr>;
- defm rr : NVJ_type_br_pred_reg<"", OpcStr>;
- defm riNot : NVJ_type_br_pred_imm<"!", OpcStr>;
- defm ri : NVJ_type_br_pred_imm<"", OpcStr>;
+multiclass NVJri_base<string mnemonic, string BaseOp, bits<3> majOp> {
+ let BaseOpcode = BaseOp#_NVJri in {
+ defm _t_Jumpnv : NVJri_cond<mnemonic, majOp, 0>; // True Cond
+ defm _f_Jumpnv : NVJri_cond<mnemonic, majOp, 1>; // False cond
+ }
}
-// Multiclass for GTU.
-multiclass NVJ_type_rr_ri_no_nOne<string OpcStr> {
- defm rrNot : NVJ_type_br_pred_reg<"!", OpcStr>;
- defm rr : NVJ_type_br_pred_reg<"", OpcStr>;
- defm rrdnNot : NVJ_type_br_pred_2ndDotNew<"!", OpcStr>;
- defm rrdn : NVJ_type_br_pred_2ndDotNew<"", OpcStr>;
- defm riNot : NVJ_type_br_pred_imm_only<"!", OpcStr>;
- defm ri : NVJ_type_br_pred_imm_only<"", OpcStr>;
+// if ([!]cmp.eq(Ns.new,#U5)) jump:[n]t #r9:2
+// if ([!]cmp.gt(Ns.new,#U5)) jump:[n]t #r9:2
+// if ([!]cmp.gtu(Ns.new,#U5)) jump:[n]t #r9:2
+
+let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator = 1,
+ Defs = [PC], neverHasSideEffects = 1, validSubTargets = HasV4SubT in {
+ defm CMPEQri : NVJri_base<"cmp.eq", "CMPEQ", 0b000>, PredRel;
+ defm CMPGTri : NVJri_base<"cmp.gt", "CMPGT", 0b001>, PredRel;
+ defm CMPGTUri : NVJri_base<"cmp.gtu", "CMPGTU", 0b010>, PredRel;
}
-// Multiclass for tstbit.
-multiclass NVJ_type_r0<string OpcStr> {
- defm r0Not : NVJ_type_br_pred_tstbit<"!", OpcStr>;
- defm r0 : NVJ_type_br_pred_tstbit<"", OpcStr>;
- }
+//===----------------------------------------------------------------------===//
+// multiclass/template class for the new-value compare jump instructions
+// with a register and a hardcoded 0/-1 immediate value.
+//===----------------------------------------------------------------------===//
+
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 11 in
+class NVJ_ConstImm_template<string mnemonic, bits<3> majOp, string ImmVal,
+ bit isNegCond, bit isTaken>
+ : NVInst_V4<(outs),
+ (ins IntRegs:$src1, brtarget:$offset),
+ "if ("#!if(isNegCond, "!","")#mnemonic
+ #"($src1.new, #"#ImmVal#")) jump:"
+ #!if(isTaken, "t","nt")#" $offset",
+ []>, Requires<[HasV4T]> {
-// Base Multiclass for New Value Jump.
-multiclass NVJ_type {
- defm GT : NVJ_type_rr_ri<"cmp.gt">;
- defm EQ : NVJ_type_rr_ri_no_2ndDotNew<"cmp.eq">;
- defm GTU : NVJ_type_rr_ri_no_nOne<"cmp.gtu">;
- defm TSTBIT : NVJ_type_r0<"tstbit">;
+ let isPredicatedFalse = isNegCond;
+ let isBrTaken = !if(isTaken, "true", "false");
+
+ bits<3> src1;
+ bits<11> offset;
+ let IClass = 0b0010;
+ let Inst{26} = 0b1;
+ let Inst{25-23} = majOp;
+ let Inst{22} = isNegCond;
+ let Inst{18-16} = src1;
+ let Inst{13} = isTaken;
+ let Inst{21-20} = offset{10-9};
+ let Inst{7-1} = offset{8-2};
}
-let isBranch = 1, isTerminator=1, neverHasSideEffects = 1, Defs = [PC] in {
- defm JMP_ : NVJ_type;
+multiclass NVJ_ConstImm_cond<string mnemonic, bits<3> majOp, string ImmVal,
+ bit isNegCond> {
+ // Branch not taken:
+ def _nt_V4: NVJ_ConstImm_template<mnemonic, majOp, ImmVal, isNegCond, 0>;
+ // Branch taken:
+ def _t_V4: NVJ_ConstImm_template<mnemonic, majOp, ImmVal, isNegCond, 1>;
}
-//===----------------------------------------------------------------------===//
-// NV/J -
-//===----------------------------------------------------------------------===//
+multiclass NVJ_ConstImm_base<string mnemonic, string BaseOp, bits<3> majOp,
+ string ImmVal> {
+ let BaseOpcode = BaseOp#_NVJ_ConstImm in {
+ defm _t_Jumpnv : NVJ_ConstImm_cond<mnemonic, majOp, ImmVal, 0>; // True cond
+ defm _f_Jumpnv : NVJ_ConstImm_cond<mnemonic, majOp, ImmVal, 1>; // False Cond
+ }
+}
+
+// if ([!]tstbit(Ns.new,#0)) jump:[n]t #r9:2
+// if ([!]cmp.eq(Ns.new,#-1)) jump:[n]t #r9:2
+// if ([!]cmp.gt(Ns.new,#-1)) jump:[n]t #r9:2
+
+let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator=1,
+ Defs = [PC], neverHasSideEffects = 1 in {
+ defm TSTBIT0 : NVJ_ConstImm_base<"tstbit", "TSTBIT", 0b011, "0">, PredRel;
+ defm CMPEQn1 : NVJ_ConstImm_base<"cmp.eq", "CMPEQ", 0b100, "-1">, PredRel;
+ defm CMPGTn1 : NVJ_ConstImm_base<"cmp.gt", "CMPGT", 0b101, "-1">, PredRel;
+}
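A note on the names these nested defms produce: the outer defm name, the _t_/_f_ Jumpnv suffix added by NVJ*_base, and the _nt_V4/_t_V4 suffix added by NVJ*_cond are concatenated, which is where the opcodes referenced later in HexagonNewValueJump.cpp come from. A minimal C++ sketch, illustration only and not part of the patch (the _t_ names also appear in the .cpp hunk below; the _f_ names are inferred from the multiclasses above):

    // Opcodes produced by: defm CMPEQrr : NVJrr_base<"cmp.eq", "CMPEQ", 0b000, 0>
    static const char *const ExampleNVJOpcodes[] = {
      "CMPEQrr_t_Jumpnv_t_V4",   // true condition,    jump:t
      "CMPEQrr_t_Jumpnv_nt_V4",  // true condition,    jump:nt
      "CMPEQrr_f_Jumpnv_t_V4",   // negated condition, jump:t
      "CMPEQrr_f_Jumpnv_nt_V4",  // negated condition, jump:nt
    };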
//===----------------------------------------------------------------------===//
// XTYPE/ALU +
@@ -2520,8 +2564,9 @@ def NTSTBIT_ri : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
//Deallocate frame and return.
// dealloc_return
let isReturn = 1, isTerminator = 1, isBarrier = 1, isPredicable = 1,
- Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1 in {
- def DEALLOC_RET_V4 : NVInst_V4<(outs), (ins i32imm:$amt1),
+ Defs = [R29, R30, R31, PC], Uses = [R30], neverHasSideEffects = 1 in {
+let validSubTargets = HasV4SubT in
+ def DEALLOC_RET_V4 : LD0Inst<(outs), (ins),
"dealloc_return",
[]>,
Requires<[HasV4T]>;
@@ -2530,9 +2575,10 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1, isPredicable = 1,
// Restore registers and dealloc return function call.
let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1,
Defs = [R29, R30, R31, PC] in {
+let validSubTargets = HasV4SubT in
def RESTORE_DEALLOC_RET_JMP_V4 : JInst<(outs),
(ins calltarget:$dst),
- "jump $dst // Restore_and_dealloc_return",
+ "jump $dst",
[]>,
Requires<[HasV4T]>;
}
@@ -2540,9 +2586,10 @@ let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1,
// Restore registers and dealloc frame before a tail call.
let isCall = 1, isBarrier = 1,
Defs = [R29, R30, R31, PC] in {
+let validSubTargets = HasV4SubT in
def RESTORE_DEALLOC_BEFORE_TAILCALL_V4 : JInst<(outs),
(ins calltarget:$dst),
- "call $dst // Restore_and_dealloc_before_tailcall",
+ "call $dst",
[]>,
Requires<[HasV4T]>;
}
@@ -2559,10 +2606,11 @@ let isCall = 1, isBarrier = 1,
// if (Ps) dealloc_return
let isReturn = 1, isTerminator = 1,
- Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1,
+ Defs = [R29, R30, R31, PC], Uses = [R30], neverHasSideEffects = 1,
isPredicated = 1 in {
- def DEALLOC_RET_cPt_V4 : NVInst_V4<(outs),
- (ins PredRegs:$src1, i32imm:$amt1),
+let validSubTargets = HasV4SubT in
+ def DEALLOC_RET_cPt_V4 : LD0Inst<(outs),
+ (ins PredRegs:$src1),
"if ($src1) dealloc_return",
[]>,
Requires<[HasV4T]>;
@@ -2570,10 +2618,10 @@ let isReturn = 1, isTerminator = 1,
// if (!Ps) dealloc_return
let isReturn = 1, isTerminator = 1,
- Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1,
- isPredicated = 1 in {
- def DEALLOC_RET_cNotPt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1,
- i32imm:$amt1),
+ Defs = [R29, R30, R31, PC], Uses = [R30], neverHasSideEffects = 1,
+ isPredicated = 1, isPredicatedFalse = 1 in {
+let validSubTargets = HasV4SubT in
+ def DEALLOC_RET_cNotPt_V4 : LD0Inst<(outs), (ins PredRegs:$src1),
"if (!$src1) dealloc_return",
[]>,
Requires<[HasV4T]>;
@@ -2581,10 +2629,10 @@ let isReturn = 1, isTerminator = 1,
// if (Ps.new) dealloc_return:nt
let isReturn = 1, isTerminator = 1,
- Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1,
+ Defs = [R29, R30, R31, PC], Uses = [R30], neverHasSideEffects = 1,
isPredicated = 1 in {
- def DEALLOC_RET_cdnPnt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1,
- i32imm:$amt1),
+let validSubTargets = HasV4SubT in
+ def DEALLOC_RET_cdnPnt_V4 : LD0Inst<(outs), (ins PredRegs:$src1),
"if ($src1.new) dealloc_return:nt",
[]>,
Requires<[HasV4T]>;
@@ -2592,10 +2640,10 @@ let isReturn = 1, isTerminator = 1,
// if (!Ps.new) dealloc_return:nt
let isReturn = 1, isTerminator = 1,
- Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1,
- isPredicated = 1 in {
- def DEALLOC_RET_cNotdnPnt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1,
- i32imm:$amt1),
+ Defs = [R29, R30, R31, PC], Uses = [R30], neverHasSideEffects = 1,
+ isPredicated = 1, isPredicatedFalse = 1 in {
+let validSubTargets = HasV4SubT in
+ def DEALLOC_RET_cNotdnPnt_V4 : LD0Inst<(outs), (ins PredRegs:$src1),
"if (!$src1.new) dealloc_return:nt",
[]>,
Requires<[HasV4T]>;
@@ -2603,21 +2651,21 @@ let isReturn = 1, isTerminator = 1,
// if (Ps.new) dealloc_return:t
let isReturn = 1, isTerminator = 1,
- Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1,
+ Defs = [R29, R30, R31, PC], Uses = [R30], neverHasSideEffects = 1,
isPredicated = 1 in {
- def DEALLOC_RET_cdnPt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1,
- i32imm:$amt1),
+let validSubTargets = HasV4SubT in
+ def DEALLOC_RET_cdnPt_V4 : LD0Inst<(outs), (ins PredRegs:$src1),
"if ($src1.new) dealloc_return:t",
[]>,
Requires<[HasV4T]>;
}
-// if (!Ps.new) dealloc_return:nt
+// if (!Ps.new) dealloc_return:t
let isReturn = 1, isTerminator = 1,
- Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1,
- isPredicated = 1 in {
- def DEALLOC_RET_cNotdnPt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1,
- i32imm:$amt1),
+ Defs = [R29, R30, R31, PC], Uses = [R30], neverHasSideEffects = 1,
+ isPredicated = 1, isPredicatedFalse = 1 in {
+let validSubTargets = HasV4SubT in
+ def DEALLOC_RET_cNotdnPt_V4 : LD0Inst<(outs), (ins PredRegs:$src1),
"if (!$src1.new) dealloc_return:t",
[]>,
Requires<[HasV4T]>;
@@ -3007,9 +3055,10 @@ def : Pat <(i32 (load (HexagonCONST32_GP tglobaladdr:$global))),
// Transfer global address into a register
-let AddedComplexity=50, isMoveImm = 1, isReMaterializable = 1 in
-def TFRI_V4 : ALU32_ri<(outs IntRegs:$dst), (ins globaladdress:$src1),
- "$dst = ##$src1",
+let isExtended = 1, opExtendable = 1, AddedComplexity=50, isMoveImm = 1,
+isAsCheapAsAMove = 1, isReMaterializable = 1, validSubTargets = HasV4SubT in
+def TFRI_V4 : ALU32_ri<(outs IntRegs:$dst), (ins s16Ext:$src1),
+ "$dst = #$src1",
[(set IntRegs:$dst, (HexagonCONST32 tglobaladdr:$src1))]>,
Requires<[HasV4T]>;
@@ -3018,37 +3067,42 @@ def : Pat<(HexagonCONST32_GP tblockaddress:$src1),
(TFRI_V4 tblockaddress:$src1)>,
Requires<[HasV4T]>;
-let AddedComplexity=50, neverHasSideEffects = 1, isPredicated = 1 in
+let isExtended = 1, opExtendable = 2, AddedComplexity=50,
+neverHasSideEffects = 1, isPredicated = 1, validSubTargets = HasV4SubT in
def TFRI_cPt_V4 : ALU32_ri<(outs IntRegs:$dst),
- (ins PredRegs:$src1, globaladdress:$src2),
- "if($src1) $dst = ##$src2",
+ (ins PredRegs:$src1, s16Ext:$src2),
+ "if($src1) $dst = #$src2",
[]>,
Requires<[HasV4T]>;
-let AddedComplexity=50, neverHasSideEffects = 1, isPredicated = 1 in
+let isExtended = 1, opExtendable = 2, AddedComplexity=50, isPredicatedFalse = 1,
+neverHasSideEffects = 1, isPredicated = 1, validSubTargets = HasV4SubT in
def TFRI_cNotPt_V4 : ALU32_ri<(outs IntRegs:$dst),
- (ins PredRegs:$src1, globaladdress:$src2),
- "if(!$src1) $dst = ##$src2",
+ (ins PredRegs:$src1, s16Ext:$src2),
+ "if(!$src1) $dst = #$src2",
[]>,
Requires<[HasV4T]>;
-let AddedComplexity=50, neverHasSideEffects = 1, isPredicated = 1 in
+let isExtended = 1, opExtendable = 2, AddedComplexity=50,
+neverHasSideEffects = 1, isPredicated = 1, validSubTargets = HasV4SubT in
def TFRI_cdnPt_V4 : ALU32_ri<(outs IntRegs:$dst),
- (ins PredRegs:$src1, globaladdress:$src2),
- "if($src1.new) $dst = ##$src2",
+ (ins PredRegs:$src1, s16Ext:$src2),
+ "if($src1.new) $dst = #$src2",
[]>,
Requires<[HasV4T]>;
-let AddedComplexity=50, neverHasSideEffects = 1, isPredicated = 1 in
+let isExtended = 1, opExtendable = 2, AddedComplexity=50, isPredicatedFalse = 1,
+neverHasSideEffects = 1, isPredicated = 1, validSubTargets = HasV4SubT in
def TFRI_cdnNotPt_V4 : ALU32_ri<(outs IntRegs:$dst),
- (ins PredRegs:$src1, globaladdress:$src2),
- "if(!$src1.new) $dst = ##$src2",
+ (ins PredRegs:$src1, s16Ext:$src2),
+ "if(!$src1.new) $dst = #$src2",
[]>,
Requires<[HasV4T]>;
let AddedComplexity = 50, Predicates = [HasV4T] in
def : Pat<(HexagonCONST32_GP tglobaladdr:$src1),
- (TFRI_V4 tglobaladdr:$src1)>;
+ (TFRI_V4 tglobaladdr:$src1)>,
+ Requires<[HasV4T]>;
// Load - Indirect with long offset: These instructions take global address
@@ -3134,6 +3188,93 @@ def STriw_offset_ext_V4 : STInst<(outs),
(add IntRegs:$src1, u6_2ImmPred:$src2))]>,
Requires<[HasV4T]>;
+def : Pat<(i64 (ctlz (i64 DoubleRegs:$src1))),
+ (i64 (COMBINE_Ir_V4 (i32 0), (i32 (CTLZ64_rr DoubleRegs:$src1))))>,
+ Requires<[HasV4T]>;
+
+def : Pat<(i64 (cttz (i64 DoubleRegs:$src1))),
+ (i64 (COMBINE_Ir_V4 (i32 0), (i32 (CTTZ64_rr DoubleRegs:$src1))))>,
+ Requires<[HasV4T]>;
+
+
+// i8 -> i64 loads
+// We need a complexity of 120 here to override the preceding handling of
+// zextloadi8.
+let Predicates = [HasV4T], AddedComplexity = 120 in {
+def: Pat <(i64 (extloadi8 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
+ (i64 (COMBINE_Ir_V4 0, (LDrib_abs_V4 tglobaladdr:$addr)))>;
+
+def: Pat <(i64 (zextloadi8 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
+ (i64 (COMBINE_Ir_V4 0, (LDriub_abs_V4 tglobaladdr:$addr)))>;
+
+def: Pat <(i64 (sextloadi8 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
+ (i64 (SXTW (LDrib_abs_V4 tglobaladdr:$addr)))>;
+
+def: Pat <(i64 (extloadi8 FoldGlobalAddr:$addr)),
+ (i64 (COMBINE_Ir_V4 0, (LDrib_abs_V4 FoldGlobalAddr:$addr)))>;
+
+def: Pat <(i64 (zextloadi8 FoldGlobalAddr:$addr)),
+ (i64 (COMBINE_Ir_V4 0, (LDriub_abs_V4 FoldGlobalAddr:$addr)))>;
+
+def: Pat <(i64 (sextloadi8 FoldGlobalAddr:$addr)),
+ (i64 (SXTW (LDrib_abs_V4 FoldGlobalAddr:$addr)))>;
+}
+// i16 -> i64 loads
+// We need a complexity of 120 here to override the preceding handling of
+// zextloadi16.
+let AddedComplexity = 120 in {
+def: Pat <(i64 (extloadi16 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
+ (i64 (COMBINE_Ir_V4 0, (LDrih_abs_V4 tglobaladdr:$addr)))>,
+ Requires<[HasV4T]>;
+
+def: Pat <(i64 (zextloadi16 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
+ (i64 (COMBINE_Ir_V4 0, (LDriuh_abs_V4 tglobaladdr:$addr)))>,
+ Requires<[HasV4T]>;
+
+def: Pat <(i64 (sextloadi16 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
+ (i64 (SXTW (LDrih_abs_V4 tglobaladdr:$addr)))>,
+ Requires<[HasV4T]>;
+
+def: Pat <(i64 (extloadi16 FoldGlobalAddr:$addr)),
+ (i64 (COMBINE_Ir_V4 0, (LDrih_abs_V4 FoldGlobalAddr:$addr)))>,
+ Requires<[HasV4T]>;
+
+def: Pat <(i64 (zextloadi16 FoldGlobalAddr:$addr)),
+ (i64 (COMBINE_Ir_V4 0, (LDriuh_abs_V4 FoldGlobalAddr:$addr)))>,
+ Requires<[HasV4T]>;
+
+def: Pat <(i64 (sextloadi16 FoldGlobalAddr:$addr)),
+ (i64 (SXTW (LDrih_abs_V4 FoldGlobalAddr:$addr)))>,
+ Requires<[HasV4T]>;
+}
+// i32->i64 loads
+// We need a complexity of 120 here to override the preceding handling of
+// zextloadi32.
+let AddedComplexity = 120 in {
+def: Pat <(i64 (extloadi32 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
+ (i64 (COMBINE_Ir_V4 0, (LDriw_abs_V4 tglobaladdr:$addr)))>,
+ Requires<[HasV4T]>;
+
+def: Pat <(i64 (zextloadi32 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
+ (i64 (COMBINE_Ir_V4 0, (LDriw_abs_V4 tglobaladdr:$addr)))>,
+ Requires<[HasV4T]>;
+
+def: Pat <(i64 (sextloadi32 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
+ (i64 (SXTW (LDriw_abs_V4 tglobaladdr:$addr)))>,
+ Requires<[HasV4T]>;
+
+def: Pat <(i64 (extloadi32 FoldGlobalAddr:$addr)),
+ (i64 (COMBINE_Ir_V4 0, (LDriw_abs_V4 FoldGlobalAddr:$addr)))>,
+ Requires<[HasV4T]>;
+
+def: Pat <(i64 (zextloadi32 FoldGlobalAddr:$addr)),
+ (i64 (COMBINE_Ir_V4 0, (LDriw_abs_V4 FoldGlobalAddr:$addr)))>,
+ Requires<[HasV4T]>;
+
+def: Pat <(i64 (sextloadi32 FoldGlobalAddr:$addr)),
+ (i64 (SXTW (LDriw_abs_V4 FoldGlobalAddr:$addr)))>,
+ Requires<[HasV4T]>;
+}
// Indexed store double word - global address.
// memw(Rs+#u6:2)=#S8
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV5.td b/lib/Target/Hexagon/HexagonInstrInfoV5.td
index 92d098c..9da6074 100644
--- a/lib/Target/Hexagon/HexagonInstrInfoV5.td
+++ b/lib/Target/Hexagon/HexagonInstrInfoV5.td
@@ -26,22 +26,29 @@ def CONST32_Float_Real : LDInst<(outs IntRegs:$dst), (ins f32imm:$src1),
// Only works with single precision fp value.
// For double precision, use CONST64_float_real, as 64bit transfer
// can only hold 40-bit values - 32 from const ext + 8 bit immediate.
-let isMoveImm = 1, isReMaterializable = 1, isPredicable = 1 in
-def TFRI_f : ALU32_ri<(outs IntRegs:$dst), (ins f32imm:$src1),
- "$dst = ##$src1",
+// Make sure that the complexity is greater than that of the CONST32 pattern
+// in HexagonInstrInfo.td.
+let isExtended = 1, opExtendable = 1, isMoveImm = 1, isReMaterializable = 1,
+isPredicable = 1, AddedComplexity = 30, validSubTargets = HasV5SubT,
+isCodeGenOnly = 1 in
+def TFRI_f : ALU32_ri<(outs IntRegs:$dst), (ins f32Ext:$src1),
+ "$dst = #$src1",
[(set IntRegs:$dst, fpimm:$src1)]>,
Requires<[HasV5T]>;
+let isExtended = 1, opExtendable = 2, isPredicated = 1,
+neverHasSideEffects = 1, validSubTargets = HasV5SubT in
def TFRI_cPt_f : ALU32_ri<(outs IntRegs:$dst),
- (ins PredRegs:$src1, f32imm:$src2),
- "if ($src1) $dst = ##$src2",
+ (ins PredRegs:$src1, f32Ext:$src2),
+ "if ($src1) $dst = #$src2",
[]>,
Requires<[HasV5T]>;
-let isPredicated = 1 in
+let isExtended = 1, opExtendable = 2, isPredicated = 1, isPredicatedFalse = 1,
+neverHasSideEffects = 1, validSubTargets = HasV5SubT in
def TFRI_cNotPt_f : ALU32_ri<(outs IntRegs:$dst),
- (ins PredRegs:$src1, f32imm:$src2),
- "if (!$src1) $dst = ##$src2",
+ (ins PredRegs:$src1, f32Ext:$src2),
+                     "if (!$src1) $dst = #$src2",
[]>,
Requires<[HasV5T]>;
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
index 1388ad4..6e966ec 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -195,7 +195,6 @@ void VLIWMachineScheduler::schedule() {
void ConvergingVLIWScheduler::initialize(ScheduleDAGMI *dag) {
DAG = static_cast<VLIWMachineScheduler*>(dag);
SchedModel = DAG->getSchedModel();
- TRI = DAG->TRI;
Top.init(DAG, SchedModel);
Bot.init(DAG, SchedModel);
@@ -409,7 +408,7 @@ void ConvergingVLIWScheduler::traceCandidate(const char *Label,
SUnit *SU, PressureElement P) {
dbgs() << Label << " " << Q.getName() << " ";
if (P.isValid())
- dbgs() << TRI->getRegPressureSetName(P.PSetID) << ":" << P.UnitIncrease
+ dbgs() << DAG->TRI->getRegPressureSetName(P.PSetID) << ":" << P.UnitIncrease
<< " ";
else
dbgs() << " ";
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.h b/lib/Target/Hexagon/HexagonMachineScheduler.h
index f68dadf..171193e 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.h
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.h
@@ -190,7 +190,6 @@ class ConvergingVLIWScheduler : public MachineSchedStrategy {
VLIWMachineScheduler *DAG;
const TargetSchedModel *SchedModel;
- const TargetRegisterInfo *TRI;
// State of the top and bottom scheduled instruction boundaries.
SchedBoundary Top;
@@ -205,7 +204,7 @@ public:
};
ConvergingVLIWScheduler():
- DAG(0), SchedModel(0), TRI(0), Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {}
+ DAG(0), SchedModel(0), Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {}
virtual void initialize(ScheduleDAGMI *dag);
diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp
index 72af876..f7c4513 100644
--- a/lib/Target/Hexagon/HexagonNewValueJump.cpp
+++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp
@@ -22,29 +22,31 @@
//
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "hexagon-nvj"
-#include "Hexagon.h"
-#include "HexagonInstrInfo.h"
-#include "HexagonMachineFunctionInfo.h"
-#include "HexagonRegisterInfo.h"
-#include "HexagonSubtarget.h"
-#include "HexagonTargetMachine.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/LiveVariables.h"
-#include "llvm/CodeGen/MachineFunctionAnalysis.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
-#include "llvm/PassSupport.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonMachineFunctionInfo.h"
+
#include <map>
+
+#include "llvm/Support/CommandLine.h"
using namespace llvm;
STATISTIC(NumNVJGenerated, "Number of New Value Jump Instructions created");
@@ -57,6 +59,11 @@ static cl::opt<bool> DisableNewValueJumps("disable-nvjump", cl::Hidden,
cl::ZeroOrMore, cl::init(false),
cl::desc("Disable New Value Jumps"));
+namespace llvm {
+ void initializeHexagonNewValueJumpPass(PassRegistry&);
+}
+
+
namespace {
struct HexagonNewValueJump : public MachineFunctionPass {
const HexagonInstrInfo *QII;
@@ -65,7 +72,9 @@ namespace {
public:
static char ID;
- HexagonNewValueJump() : MachineFunctionPass(ID) { }
+ HexagonNewValueJump() : MachineFunctionPass(ID) {
+ initializeHexagonNewValueJumpPass(*PassRegistry::getPassRegistry());
+ }
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<MachineBranchProbabilityInfo>();
@@ -88,6 +97,13 @@ namespace {
char HexagonNewValueJump::ID = 0;
+INITIALIZE_PASS_BEGIN(HexagonNewValueJump, "hexagon-nvj",
+ "Hexagon NewValueJump", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_END(HexagonNewValueJump, "hexagon-nvj",
+ "Hexagon NewValueJump", false, false)
+
+
// We have identified this II could be feeder to NVJ,
// verify that it can be.
static bool canBeFeederToNewValueJump(const HexagonInstrInfo *QII,
@@ -219,7 +235,7 @@ static bool canCompareBeNewValueJump(const HexagonInstrInfo *QII,
return false;
}
- unsigned cmpReg1, cmpOp2;
+ unsigned cmpReg1, cmpOp2 = 0; // cmpOp2 assignment silences compiler warning.
cmpReg1 = MI->getOperand(1).getReg();
if (secondReg) {
@@ -285,43 +301,48 @@ static unsigned getNewValueJumpOpcode(MachineInstr *MI, int reg,
switch (MI->getOpcode()) {
case Hexagon::CMPEQrr:
- return taken ? Hexagon::JMP_EQrrPt_nv_V4 : Hexagon::JMP_EQrrPnt_nv_V4;
+ return taken ? Hexagon::CMPEQrr_t_Jumpnv_t_V4
+ : Hexagon::CMPEQrr_t_Jumpnv_nt_V4;
case Hexagon::CMPEQri: {
if (reg >= 0)
- return taken ? Hexagon::JMP_EQriPt_nv_V4 : Hexagon::JMP_EQriPnt_nv_V4;
+ return taken ? Hexagon::CMPEQri_t_Jumpnv_t_V4
+ : Hexagon::CMPEQri_t_Jumpnv_nt_V4;
else
- return taken ? Hexagon::JMP_EQriPtneg_nv_V4
- : Hexagon::JMP_EQriPntneg_nv_V4;
+ return taken ? Hexagon::CMPEQn1_t_Jumpnv_t_V4
+ : Hexagon::CMPEQn1_t_Jumpnv_nt_V4;
}
case Hexagon::CMPGTrr: {
if (secondRegNewified)
- return taken ? Hexagon::JMP_GTrrdnPt_nv_V4
- : Hexagon::JMP_GTrrdnPnt_nv_V4;
+ return taken ? Hexagon::CMPLTrr_t_Jumpnv_t_V4
+ : Hexagon::CMPLTrr_t_Jumpnv_nt_V4;
else
- return taken ? Hexagon::JMP_GTrrPt_nv_V4
- : Hexagon::JMP_GTrrPnt_nv_V4;
+ return taken ? Hexagon::CMPGTrr_t_Jumpnv_t_V4
+ : Hexagon::CMPGTrr_t_Jumpnv_nt_V4;
}
case Hexagon::CMPGTri: {
if (reg >= 0)
- return taken ? Hexagon::JMP_GTriPt_nv_V4 : Hexagon::JMP_GTriPnt_nv_V4;
+ return taken ? Hexagon::CMPGTri_t_Jumpnv_t_V4
+ : Hexagon::CMPGTri_t_Jumpnv_nt_V4;
else
- return taken ? Hexagon::JMP_GTriPtneg_nv_V4
- : Hexagon::JMP_GTriPntneg_nv_V4;
+ return taken ? Hexagon::CMPGTn1_t_Jumpnv_t_V4
+ : Hexagon::CMPGTn1_t_Jumpnv_nt_V4;
}
case Hexagon::CMPGTUrr: {
if (secondRegNewified)
- return taken ? Hexagon::JMP_GTUrrdnPt_nv_V4
- : Hexagon::JMP_GTUrrdnPnt_nv_V4;
+ return taken ? Hexagon::CMPLTUrr_t_Jumpnv_t_V4
+ : Hexagon::CMPLTUrr_t_Jumpnv_nt_V4;
else
- return taken ? Hexagon::JMP_GTUrrPt_nv_V4 : Hexagon::JMP_GTUrrPnt_nv_V4;
+ return taken ? Hexagon::CMPGTUrr_t_Jumpnv_t_V4
+ : Hexagon::CMPGTUrr_t_Jumpnv_nt_V4;
}
case Hexagon::CMPGTUri:
- return taken ? Hexagon::JMP_GTUriPt_nv_V4 : Hexagon::JMP_GTUriPnt_nv_V4;
+ return taken ? Hexagon::CMPGTUri_t_Jumpnv_t_V4
+ : Hexagon::CMPGTUri_t_Jumpnv_nt_V4;
default:
llvm_unreachable("Could not find matching New Value Jump instruction.");
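Why the swapped names above: when the newified register is the second compare operand, the CMPLT*/CMPLTU* opcodes are selected, because those are the NvOpNum = 1 definitions ("cmp.gt($src1, $src2.new)") from the .td changes earlier; there is no separate GT-with-second-operand-new opcode. A hedged sketch, assuming the generated Hexagon opcode enum is available:

    // Illustration only: taken-variant selection for a new-value cmp.gt jump.
    static unsigned selectCmpGtTakenNVJ(bool secondRegNewified) {
      // Second operand newified -> use the swapped-operand form cmp.gt(Rt, Ns.new).
      return secondRegNewified ? Hexagon::CMPLTrr_t_Jumpnv_t_V4
                               : Hexagon::CMPGTrr_t_Jumpnv_t_V4;
    }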
@@ -610,6 +631,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
.addMBB(jmpTarget);
assert(NewMI && "New Value Jump Instruction Not created!");
+ (void)NewMI;
if (cmpInstr->getOperand(0).isReg() &&
cmpInstr->getOperand(0).isKill())
cmpInstr->getOperand(0).setIsKill(false);
diff --git a/lib/Target/Hexagon/HexagonPeephole.cpp b/lib/Target/Hexagon/HexagonPeephole.cpp
index 6c4eb7e..89e3406 100644
--- a/lib/Target/Hexagon/HexagonPeephole.cpp
+++ b/lib/Target/Hexagon/HexagonPeephole.cpp
@@ -61,10 +61,6 @@ static cl::opt<bool> DisableHexagonPeephole("disable-hexagon-peephole",
cl::Hidden, cl::ZeroOrMore, cl::init(false),
cl::desc("Disable Peephole Optimization"));
-static cl::opt<int>
-DbgPNPCount("pnp-count", cl::init(-1), cl::Hidden,
- cl::desc("Maximum number of P=NOT(P) to be optimized"));
-
static cl::opt<bool> DisablePNotP("disable-hexagon-pnotp",
cl::Hidden, cl::ZeroOrMore, cl::init(false),
cl::desc("Disable Optimization of PNotP"));
@@ -77,6 +73,10 @@ static cl::opt<bool> DisableOptExtTo64("disable-hexagon-opt-ext-to-64",
cl::Hidden, cl::ZeroOrMore, cl::init(false),
cl::desc("Disable Optimization of extensions to i64."));
+namespace llvm {
+ void initializeHexagonPeepholePass(PassRegistry&);
+}
+
namespace {
struct HexagonPeephole : public MachineFunctionPass {
const HexagonInstrInfo *QII;
@@ -85,7 +85,9 @@ namespace {
public:
static char ID;
- HexagonPeephole() : MachineFunctionPass(ID) { }
+ HexagonPeephole() : MachineFunctionPass(ID) {
+ initializeHexagonPeepholePass(*PassRegistry::getPassRegistry());
+ }
bool runOnMachineFunction(MachineFunction &MF);
@@ -104,8 +106,10 @@ namespace {
char HexagonPeephole::ID = 0;
-bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
+INITIALIZE_PASS(HexagonPeephole, "hexagon-peephole", "Hexagon Peephole",
+ false, false)
+bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
QII = static_cast<const HexagonInstrInfo *>(MF.getTarget().
getInstrInfo());
QRI = static_cast<const HexagonRegisterInfo *>(MF.getTarget().
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index d8b4e2f..d5ca4d7 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -38,11 +38,9 @@
using namespace llvm;
-HexagonRegisterInfo::HexagonRegisterInfo(HexagonSubtarget &st,
- const HexagonInstrInfo &tii)
+HexagonRegisterInfo::HexagonRegisterInfo(HexagonSubtarget &st)
: HexagonGenRegisterInfo(Hexagon::R31),
- Subtarget(st),
- TII(tii) {
+ Subtarget(st) {
}
const uint16_t* HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction
@@ -130,6 +128,8 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// Addressable stack objects are accessed using neg. offsets from %fp.
MachineFunction &MF = *MI.getParent()->getParent();
+ const HexagonInstrInfo &TII =
+ *static_cast<const HexagonInstrInfo*>(MF.getTarget().getInstrInfo());
int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
MachineFrameInfo &MFI = *MF.getFrameInfo();
@@ -295,16 +295,6 @@ unsigned HexagonRegisterInfo::getStackRegister() const {
return Hexagon::R29;
}
-void HexagonRegisterInfo::getInitialFrameState(std::vector<MachineMove>
- &Moves) const
-{
- // VirtualFP = (R30 + #0).
- unsigned FPReg = getFrameRegister();
- MachineLocation Dst(MachineLocation::VirtualFP);
- MachineLocation Src(FPReg, 0);
- Moves.push_back(MachineMove(0, Dst, Src));
-}
-
unsigned HexagonRegisterInfo::getEHExceptionRegister() const {
llvm_unreachable("What is the exception register");
}
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.h b/lib/Target/Hexagon/HexagonRegisterInfo.h
index 8a3f94a..fec86df 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.h
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.h
@@ -44,9 +44,8 @@ class Type;
struct HexagonRegisterInfo : public HexagonGenRegisterInfo {
HexagonSubtarget &Subtarget;
- const HexagonInstrInfo &TII;
- HexagonRegisterInfo(HexagonSubtarget &st, const HexagonInstrInfo &tii);
+ HexagonRegisterInfo(HexagonSubtarget &st);
/// Code Generation virtual methods...
const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
@@ -78,7 +77,6 @@ struct HexagonRegisterInfo : public HexagonGenRegisterInfo {
unsigned getRARegister() const;
unsigned getFrameRegister(const MachineFunction &MF) const;
unsigned getFrameRegister() const;
- void getInitialFrameState(std::vector<MachineMove> &Moves) const;
unsigned getStackRegister() const;
// Exception handling queries.
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.td b/lib/Target/Hexagon/HexagonRegisterInfo.td
index fe41fc3..8ea1b7e 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.td
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.td
@@ -57,8 +57,8 @@ let Namespace = "Hexagon" in {
let Aliases = [R];
}
- def subreg_loreg : SubRegIndex;
- def subreg_hireg : SubRegIndex;
+ def subreg_loreg : SubRegIndex<32>;
+ def subreg_hireg : SubRegIndex<32, 32>;
// Integer registers.
def R0 : Ri< 0, "r0">, DwarfRegNum<[0]>;
diff --git a/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp b/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
index 34bf4ea..44234e8 100644
--- a/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
+++ b/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
@@ -21,11 +21,18 @@
#include "llvm/Transforms/Scalar.h"
using namespace llvm;
+
+namespace llvm {
+ void initializeHexagonRemoveExtendArgsPass(PassRegistry&);
+}
+
namespace {
struct HexagonRemoveExtendArgs : public FunctionPass {
public:
static char ID;
- HexagonRemoveExtendArgs() : FunctionPass(ID) {}
+ HexagonRemoveExtendArgs() : FunctionPass(ID) {
+ initializeHexagonRemoveExtendArgsPass(*PassRegistry::getPassRegistry());
+ }
virtual bool runOnFunction(Function &F);
const char *getPassName() const {
@@ -41,11 +48,9 @@ namespace {
}
char HexagonRemoveExtendArgs::ID = 0;
-RegisterPass<HexagonRemoveExtendArgs> X("reargs",
- "Remove Sign and Zero Extends for Args"
- );
-
+INITIALIZE_PASS(HexagonRemoveExtendArgs, "reargs",
+ "Remove Sign and Zero Extends for Args", false, false)
bool HexagonRemoveExtendArgs::runOnFunction(Function &F) {
unsigned Idx = 1;
@@ -78,6 +83,7 @@ bool HexagonRemoveExtendArgs::runOnFunction(Function &F) {
-FunctionPass *llvm::createHexagonRemoveExtendOps(HexagonTargetMachine &TM) {
+FunctionPass*
+llvm::createHexagonRemoveExtendArgs(const HexagonTargetMachine &TM) {
return new HexagonRemoveExtendArgs();
}
diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
index a52c604..c37bf9f 100644
--- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
+++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
@@ -27,7 +27,7 @@ HexagonSelectionDAGInfo::~HexagonSelectionDAGInfo() {
SDValue
HexagonSelectionDAGInfo::
-EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, SDValue Chain,
+EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
SDValue Dst, SDValue Src, SDValue Size, unsigned Align,
bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.h b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
index 0673e4d..31f278a 100644
--- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
+++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
@@ -26,7 +26,7 @@ public:
~HexagonSelectionDAGInfo();
virtual
- SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
SDValue Chain,
SDValue Dst, SDValue Src,
SDValue Size, unsigned Align,
diff --git a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
new file mode 100644
index 0000000..3bf2f20
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
@@ -0,0 +1,176 @@
+//=== HexagonSplitConst32AndConst64.cpp - split CONST32/Const64 into HI/LO ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// When the compiler is invoked with no small data, for instance, with the -G0
+// command line option, then all CONST32_* opcodes should be broken down into
+// appropriate LO and HI instructions. This splitting is done by this pass.
+// The only reason this is not done in the DAG lowering itself is that there
+// is no simple way of getting the register allocator to allot the same hard
+// register to the result of LO and HI instructions. This pass is always
+// scheduled after register allocation.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "xfer"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/LatencyPriorityQueue.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonSubtarget.h"
+#include "HexagonMachineFunctionInfo.h"
+#include <map>
+#include <iostream>
+
+#include "llvm/Support/CommandLine.h"
+#define DEBUG_TYPE "xfer"
+
+
+using namespace llvm;
+
+namespace {
+
+class HexagonSplitConst32AndConst64 : public MachineFunctionPass {
+ const HexagonTargetMachine& QTM;
+ const HexagonSubtarget &QST;
+
+ public:
+ static char ID;
+ HexagonSplitConst32AndConst64(const HexagonTargetMachine& TM)
+ : MachineFunctionPass(ID), QTM(TM), QST(*TM.getSubtargetImpl()) {}
+
+ const char *getPassName() const {
+ return "Hexagon Split Const32s and Const64s";
+ }
+ bool runOnMachineFunction(MachineFunction &Fn);
+};
+
+
+char HexagonSplitConst32AndConst64::ID = 0;
+
+
+bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) {
+
+ const TargetInstrInfo *TII = QTM.getInstrInfo();
+
+ // Loop over all of the basic blocks
+ for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end();
+ MBBb != MBBe; ++MBBb) {
+ MachineBasicBlock* MBB = MBBb;
+ // Traverse the basic block
+ MachineBasicBlock::iterator MII = MBB->begin();
+ MachineBasicBlock::iterator MIE = MBB->end ();
+ while (MII != MIE) {
+ MachineInstr *MI = MII;
+ int Opc = MI->getOpcode();
+ if (Opc == Hexagon::CONST32_set) {
+ int DestReg = MI->getOperand(0).getReg();
+ MachineOperand &Symbol = MI->getOperand (1);
+
+ BuildMI (*MBB, MII, MI->getDebugLoc(),
+ TII->get(Hexagon::LO), DestReg).addOperand(Symbol);
+ BuildMI (*MBB, MII, MI->getDebugLoc(),
+ TII->get(Hexagon::HI), DestReg).addOperand(Symbol);
+ // MBB->erase returns the iterator to the next instruction, which is the
+ // one we want to process next
+ MII = MBB->erase (MI);
+ continue;
+ }
+ else if (Opc == Hexagon::CONST32_set_jt) {
+ int DestReg = MI->getOperand(0).getReg();
+ MachineOperand &Symbol = MI->getOperand (1);
+
+ BuildMI (*MBB, MII, MI->getDebugLoc(),
+ TII->get(Hexagon::LO_jt), DestReg).addOperand(Symbol);
+ BuildMI (*MBB, MII, MI->getDebugLoc(),
+ TII->get(Hexagon::HI_jt), DestReg).addOperand(Symbol);
+ // MBB->erase returns the iterator to the next instruction, which is the
+ // one we want to process next
+ MII = MBB->erase (MI);
+ continue;
+ }
+ else if (Opc == Hexagon::CONST32_Label) {
+ int DestReg = MI->getOperand(0).getReg();
+ MachineOperand &Symbol = MI->getOperand (1);
+
+ BuildMI (*MBB, MII, MI->getDebugLoc(),
+ TII->get(Hexagon::LO_label), DestReg).addOperand(Symbol);
+ BuildMI (*MBB, MII, MI->getDebugLoc(),
+ TII->get(Hexagon::HI_label), DestReg).addOperand(Symbol);
+ // MBB->erase returns the iterator to the next instruction, which is the
+ // one we want to process next
+ MII = MBB->erase (MI);
+ continue;
+ }
+ else if (Opc == Hexagon::CONST32_Int_Real) {
+ int DestReg = MI->getOperand(0).getReg();
+ int64_t ImmValue = MI->getOperand(1).getImm ();
+
+ BuildMI (*MBB, MII, MI->getDebugLoc(),
+ TII->get(Hexagon::LOi), DestReg).addImm(ImmValue);
+ BuildMI (*MBB, MII, MI->getDebugLoc(),
+ TII->get(Hexagon::HIi), DestReg).addImm(ImmValue);
+ MII = MBB->erase (MI);
+ continue;
+ }
+ else if (Opc == Hexagon::CONST64_Int_Real) {
+ int DestReg = MI->getOperand(0).getReg();
+ int64_t ImmValue = MI->getOperand(1).getImm ();
+ unsigned DestLo =
+ QTM.getRegisterInfo()->getSubReg (DestReg, Hexagon::subreg_loreg);
+ unsigned DestHi =
+ QTM.getRegisterInfo()->getSubReg (DestReg, Hexagon::subreg_hireg);
+
+ int32_t LowWord = (ImmValue & 0xFFFFFFFF);
+ int32_t HighWord = (ImmValue >> 32) & 0xFFFFFFFF;
+
+ // Lower Registers Lower Half
+ BuildMI (*MBB, MII, MI->getDebugLoc(),
+ TII->get(Hexagon::LOi), DestLo).addImm(LowWord);
+ // Lower Registers Higher Half
+ BuildMI (*MBB, MII, MI->getDebugLoc(),
+ TII->get(Hexagon::HIi), DestLo).addImm(LowWord);
+ // Higher Registers Lower Half
+ BuildMI (*MBB, MII, MI->getDebugLoc(),
+ TII->get(Hexagon::LOi), DestHi).addImm(HighWord);
+ // Higher Registers Higher Half.
+ BuildMI (*MBB, MII, MI->getDebugLoc(),
+ TII->get(Hexagon::HIi), DestHi).addImm(HighWord);
+ MII = MBB->erase (MI);
+ continue;
+ }
+ ++MII;
+ }
+ }
+
+ return true;
+}
+
+}
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+FunctionPass *
+llvm::createHexagonSplitConst32AndConst64(const HexagonTargetMachine &TM) {
+ return new HexagonSplitConst32AndConst64(TM);
+}
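For reference, a standalone C++ sketch of the 64-bit immediate split performed in the CONST64_Int_Real case above (illustration only, not part of the patch; the example value is an assumption):

    #include <cstdint>
    #include <cstdio>

    int main() {
      int64_t ImmValue = 0x1122334455667788LL;           // example immediate (assumption)
      int32_t LowWord  = ImmValue & 0xFFFFFFFF;          // goes to subreg_loreg via LOi/HIi
      int32_t HighWord = (ImmValue >> 32) & 0xFFFFFFFF;  // goes to subreg_hireg via LOi/HIi
      std::printf("lo = 0x%08x, hi = 0x%08x\n", LowWord, HighWord);
      return 0;
    }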
diff --git a/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp b/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp
index 814249f..8608e08 100644
--- a/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp
+++ b/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp
@@ -49,16 +49,23 @@
using namespace llvm;
+namespace llvm {
+ void initializeHexagonSplitTFRCondSetsPass(PassRegistry&);
+}
+
+
namespace {
class HexagonSplitTFRCondSets : public MachineFunctionPass {
- HexagonTargetMachine& QTM;
+ const HexagonTargetMachine &QTM;
const HexagonSubtarget &QST;
public:
static char ID;
- HexagonSplitTFRCondSets(HexagonTargetMachine& TM) :
- MachineFunctionPass(ID), QTM(TM), QST(*TM.getSubtargetImpl()) {}
+ HexagonSplitTFRCondSets(const HexagonTargetMachine& TM) :
+ MachineFunctionPass(ID), QTM(TM), QST(*TM.getSubtargetImpl()) {
+ initializeHexagonSplitTFRCondSetsPass(*PassRegistry::getPassRegistry());
+ }
const char *getPassName() const {
return "Hexagon Split TFRCondSets";
@@ -211,6 +218,18 @@ bool HexagonSplitTFRCondSets::runOnMachineFunction(MachineFunction &Fn) {
// Public Constructor Functions
//===----------------------------------------------------------------------===//
-FunctionPass *llvm::createHexagonSplitTFRCondSets(HexagonTargetMachine &TM) {
+static void initializePassOnce(PassRegistry &Registry) {
+ const char *Name = "Hexagon Split TFRCondSets";
+ PassInfo *PI = new PassInfo(Name, "hexagon-split-tfr",
+ &HexagonSplitTFRCondSets::ID, 0, false, false);
+ Registry.registerPass(*PI, true);
+}
+
+void llvm::initializeHexagonSplitTFRCondSetsPass(PassRegistry &Registry) {
+ CALL_ONCE_INITIALIZATION(initializePassOnce)
+}
+
+FunctionPass*
+llvm::createHexagonSplitTFRCondSets(const HexagonTargetMachine &TM) {
return new HexagonSplitTFRCondSets(TM);
}
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index ce45c62..b113b35 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -15,6 +15,7 @@
#include "Hexagon.h"
#include "HexagonISelLowering.h"
#include "HexagonMachineScheduler.h"
+#include "HexagonTargetObjectFile.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Module.h"
#include "llvm/PassManager.h"
@@ -25,19 +26,17 @@
using namespace llvm;
-static cl::
-opt<bool> DisableHardwareLoops(
- "disable-hexagon-hwloops", cl::Hidden,
- cl::desc("Disable Hardware Loops for Hexagon target"));
+static cl:: opt<bool> DisableHardwareLoops("disable-hexagon-hwloops",
+ cl::Hidden, cl::desc("Disable Hardware Loops for Hexagon target"));
-static cl::
-opt<bool> DisableHexagonMISched("disable-hexagon-misched",
- cl::Hidden, cl::ZeroOrMore, cl::init(false),
- cl::desc("Disable Hexagon MI Scheduling"));
+static cl::opt<bool> DisableHexagonMISched("disable-hexagon-misched",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::desc("Disable Hexagon MI Scheduling"));
static cl::opt<bool> DisableHexagonCFGOpt("disable-hexagon-cfgopt",
- cl::Hidden, cl::ZeroOrMore, cl::init(false),
- cl::desc("Disable Hexagon CFG Optimization"));
+ cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::desc("Disable Hexagon CFG Optimization"));
+
/// HexagonTargetMachineModule - Note that this is used on hosts that
/// cannot link in a library unless there are references into the
@@ -80,6 +79,7 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, StringRef TT,
FrameLowering(Subtarget),
InstrItins(&Subtarget.getInstrItineraryData()) {
setMCUseCFI(false);
+ initAsmInfo();
}
// addPassesForOptimizations - Allow the backend (target) to add Target
@@ -126,55 +126,71 @@ TargetPassConfig *HexagonTargetMachine::createPassConfig(PassManagerBase &PM) {
}
bool HexagonPassConfig::addInstSelector() {
+ const HexagonTargetMachine &TM = getHexagonTargetMachine();
+ bool NoOpt = (getOptLevel() == CodeGenOpt::None);
- if (getOptLevel() != CodeGenOpt::None)
- addPass(createHexagonRemoveExtendOps(getHexagonTargetMachine()));
+ if (!NoOpt)
+ addPass(createHexagonRemoveExtendArgs(TM));
- addPass(createHexagonISelDag(getHexagonTargetMachine(), getOptLevel()));
+ addPass(createHexagonISelDag(TM, getOptLevel()));
- if (getOptLevel() != CodeGenOpt::None)
+ if (!NoOpt) {
addPass(createHexagonPeephole());
+ printAndVerify("After hexagon peephole pass");
+ }
return false;
}
-
bool HexagonPassConfig::addPreRegAlloc() {
- if (!DisableHardwareLoops && getOptLevel() != CodeGenOpt::None)
- addPass(createHexagonHardwareLoops());
+ if (getOptLevel() != CodeGenOpt::None)
+ if (!DisableHardwareLoops)
+ addPass(createHexagonHardwareLoops());
return false;
}
bool HexagonPassConfig::addPostRegAlloc() {
- if (!DisableHexagonCFGOpt && getOptLevel() != CodeGenOpt::None)
- addPass(createHexagonCFGOptimizer(getHexagonTargetMachine()));
- return true;
+ const HexagonTargetMachine &TM = getHexagonTargetMachine();
+ if (getOptLevel() != CodeGenOpt::None)
+ if (!DisableHexagonCFGOpt)
+ addPass(createHexagonCFGOptimizer(TM));
+ return false;
}
-
bool HexagonPassConfig::addPreSched2() {
+ const HexagonTargetMachine &TM = getHexagonTargetMachine();
+ const HexagonTargetObjectFile &TLOF =
+ (const HexagonTargetObjectFile &)getTargetLowering()->getObjFileLowering();
+
+ addPass(createHexagonCopyToCombine());
if (getOptLevel() != CodeGenOpt::None)
addPass(&IfConverterID);
+ if (!TLOF.IsSmallDataEnabled()) {
+ addPass(createHexagonSplitConst32AndConst64(TM));
+ printAndVerify("After hexagon split const32/64 pass");
+ }
return true;
}
bool HexagonPassConfig::addPreEmitPass() {
+ const HexagonTargetMachine &TM = getHexagonTargetMachine();
+ bool NoOpt = (getOptLevel() == CodeGenOpt::None);
- if (!DisableHardwareLoops && getOptLevel() != CodeGenOpt::None)
- addPass(createHexagonFixupHwLoops());
-
- if (getOptLevel() != CodeGenOpt::None)
+ if (!NoOpt)
addPass(createHexagonNewValueJump());
// Expand Spill code for predicate registers.
- addPass(createHexagonExpandPredSpillCode(getHexagonTargetMachine()));
+ addPass(createHexagonExpandPredSpillCode(TM));
// Split up TFRcondsets into conditional transfers.
- addPass(createHexagonSplitTFRCondSets(getHexagonTargetMachine()));
+ addPass(createHexagonSplitTFRCondSets(TM));
// Create Packets.
- if (getOptLevel() != CodeGenOpt::None)
+ if (!NoOpt) {
+ if (!DisableHardwareLoops)
+ addPass(createHexagonFixupHwLoops());
addPass(createHexagonPacketizer());
+ }
return false;
}
diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
index 993fcfa..7773cff 100644
--- a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
+++ b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -25,7 +25,8 @@
using namespace llvm;
static cl::opt<int> SmallDataThreshold("hexagon-small-data-threshold",
- cl::init(8), cl::Hidden);
+ cl::init(8), cl::Hidden,
+ cl::desc("The maximum size of an object in the sdata section"));
void HexagonTargetObjectFile::Initialize(MCContext &Ctx,
const TargetMachine &TM) {
@@ -46,6 +47,11 @@ void HexagonTargetObjectFile::Initialize(MCContext &Ctx,
static bool IsInSmallSection(uint64_t Size) {
return Size > 0 && Size <= (uint64_t)SmallDataThreshold;
}
+
+bool HexagonTargetObjectFile::IsSmallDataEnabled() const {
+ return SmallDataThreshold > 0;
+}
+
/// IsGlobalInSmallSection - Return true if this global value should be
/// placed into small data/bss section.
bool HexagonTargetObjectFile::IsGlobalInSmallSection(const GlobalValue *GV,
diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.h b/lib/Target/Hexagon/HexagonTargetObjectFile.h
index 6933450..41f6792 100644
--- a/lib/Target/Hexagon/HexagonTargetObjectFile.h
+++ b/lib/Target/Hexagon/HexagonTargetObjectFile.h
@@ -29,6 +29,7 @@ namespace llvm {
bool IsGlobalInSmallSection(const GlobalValue *GV,
const TargetMachine &TM) const;
+  bool IsSmallDataEnabled() const;
const MCSection* SelectSectionForGlobal(const GlobalValue *GV,
SectionKind Kind,
Mangler *Mang,
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index e592df9..41e382d 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -56,9 +56,6 @@ static cl::opt<bool> PacketizeVolatiles("hexagon-packetize-volatiles",
cl::ZeroOrMore, cl::Hidden, cl::init(true),
cl::desc("Allow non-solo packetization of volatile memory references"));
-extern cl::opt<bool> ScheduleInlineAsm;
-extern cl::opt<bool> CountDeadOutput;
-
namespace llvm {
void initializeHexagonPacketizerPass(PassRegistry&);
}
@@ -167,7 +164,6 @@ namespace {
unsigned, std::map <MachineInstr*, SUnit*>);
bool isNewifiable(MachineInstr* MI);
bool isCondInst(MachineInstr* MI);
- bool IsNewifyStore (MachineInstr* MI);
bool tryAllocateResourcesForConstExt(MachineInstr* MI);
bool canReserveResourcesForConstExt(MachineInstr *MI);
void reserveResourcesForConstExt(MachineInstr* MI);
@@ -180,6 +176,7 @@ INITIALIZE_PASS_BEGIN(HexagonPacketizer, "packets", "Hexagon Packetizer",
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_PASS_END(HexagonPacketizer, "packets", "Hexagon Packetizer",
false, false)
@@ -385,104 +382,6 @@ static bool IsControlFlow(MachineInstr* MI) {
return (MI->getDesc().isTerminator() || MI->getDesc().isCall());
}
-// Function returns true if an instruction can be promoted to the new-value
-// store. It will always return false for v2 and v3.
-// It lists all the conditional and unconditional stores that can be promoted
-// to the new-value stores.
-
-bool HexagonPacketizerList::IsNewifyStore (MachineInstr* MI) {
- const HexagonRegisterInfo* QRI =
- (const HexagonRegisterInfo *) TM.getRegisterInfo();
- switch (MI->getOpcode())
- {
- // store byte
- case Hexagon::STrib:
- case Hexagon::STrib_indexed:
- case Hexagon::STrib_indexed_shl_V4:
- case Hexagon::STrib_shl_V4:
- case Hexagon::STb_GP_V4:
- case Hexagon::POST_STbri:
- case Hexagon::STrib_cPt:
- case Hexagon::STrib_cdnPt_V4:
- case Hexagon::STrib_cNotPt:
- case Hexagon::STrib_cdnNotPt_V4:
- case Hexagon::STrib_indexed_cPt:
- case Hexagon::STrib_indexed_cdnPt_V4:
- case Hexagon::STrib_indexed_cNotPt:
- case Hexagon::STrib_indexed_cdnNotPt_V4:
- case Hexagon::STrib_indexed_shl_cPt_V4:
- case Hexagon::STrib_indexed_shl_cdnPt_V4:
- case Hexagon::STrib_indexed_shl_cNotPt_V4:
- case Hexagon::STrib_indexed_shl_cdnNotPt_V4:
- case Hexagon::POST_STbri_cPt:
- case Hexagon::POST_STbri_cdnPt_V4:
- case Hexagon::POST_STbri_cNotPt:
- case Hexagon::POST_STbri_cdnNotPt_V4:
- case Hexagon::STb_GP_cPt_V4:
- case Hexagon::STb_GP_cNotPt_V4:
- case Hexagon::STb_GP_cdnPt_V4:
- case Hexagon::STb_GP_cdnNotPt_V4:
-
- // store halfword
- case Hexagon::STrih:
- case Hexagon::STrih_indexed:
- case Hexagon::STrih_indexed_shl_V4:
- case Hexagon::STrih_shl_V4:
- case Hexagon::STh_GP_V4:
- case Hexagon::POST_SThri:
- case Hexagon::STrih_cPt:
- case Hexagon::STrih_cdnPt_V4:
- case Hexagon::STrih_cNotPt:
- case Hexagon::STrih_cdnNotPt_V4:
- case Hexagon::STrih_indexed_cPt:
- case Hexagon::STrih_indexed_cdnPt_V4:
- case Hexagon::STrih_indexed_cNotPt:
- case Hexagon::STrih_indexed_cdnNotPt_V4:
- case Hexagon::STrih_indexed_shl_cPt_V4:
- case Hexagon::STrih_indexed_shl_cdnPt_V4:
- case Hexagon::STrih_indexed_shl_cNotPt_V4:
- case Hexagon::STrih_indexed_shl_cdnNotPt_V4:
- case Hexagon::POST_SThri_cPt:
- case Hexagon::POST_SThri_cdnPt_V4:
- case Hexagon::POST_SThri_cNotPt:
- case Hexagon::POST_SThri_cdnNotPt_V4:
- case Hexagon::STh_GP_cPt_V4:
- case Hexagon::STh_GP_cNotPt_V4:
- case Hexagon::STh_GP_cdnPt_V4:
- case Hexagon::STh_GP_cdnNotPt_V4:
-
- // store word
- case Hexagon::STriw:
- case Hexagon::STriw_indexed:
- case Hexagon::STriw_indexed_shl_V4:
- case Hexagon::STriw_shl_V4:
- case Hexagon::STw_GP_V4:
- case Hexagon::POST_STwri:
- case Hexagon::STriw_cPt:
- case Hexagon::STriw_cdnPt_V4:
- case Hexagon::STriw_cNotPt:
- case Hexagon::STriw_cdnNotPt_V4:
- case Hexagon::STriw_indexed_cPt:
- case Hexagon::STriw_indexed_cdnPt_V4:
- case Hexagon::STriw_indexed_cNotPt:
- case Hexagon::STriw_indexed_cdnNotPt_V4:
- case Hexagon::STriw_indexed_shl_cPt_V4:
- case Hexagon::STriw_indexed_shl_cdnPt_V4:
- case Hexagon::STriw_indexed_shl_cNotPt_V4:
- case Hexagon::STriw_indexed_shl_cdnNotPt_V4:
- case Hexagon::POST_STwri_cPt:
- case Hexagon::POST_STwri_cdnPt_V4:
- case Hexagon::POST_STwri_cNotPt:
- case Hexagon::POST_STwri_cdnNotPt_V4:
- case Hexagon::STw_GP_cPt_V4:
- case Hexagon::STw_GP_cNotPt_V4:
- case Hexagon::STw_GP_cdnPt_V4:
- case Hexagon::STw_GP_cdnNotPt_V4:
- return QRI->Subtarget.hasV4TOps();
- }
- return false;
-}
-
static bool IsLoopN(MachineInstr *MI) {
return (MI->getOpcode() == Hexagon::LOOP0_i ||
MI->getOpcode() == Hexagon::LOOP0_r);
@@ -500,769 +399,11 @@ static bool DoesModifyCalleeSavedReg(MachineInstr *MI,
return false;
}
-// Return the new value instruction for a given store.
-static int GetDotNewOp(const int opc) {
- switch (opc) {
- default: llvm_unreachable("Unknown .new type");
- // store new value byte
- case Hexagon::STrib:
- return Hexagon::STrib_nv_V4;
-
- case Hexagon::STrib_indexed:
- return Hexagon::STrib_indexed_nv_V4;
-
- case Hexagon::STrib_indexed_shl_V4:
- return Hexagon::STrib_indexed_shl_nv_V4;
-
- case Hexagon::STrib_shl_V4:
- return Hexagon::STrib_shl_nv_V4;
-
- case Hexagon::STb_GP_V4:
- return Hexagon::STb_GP_nv_V4;
-
- case Hexagon::POST_STbri:
- return Hexagon::POST_STbri_nv_V4;
-
- case Hexagon::STrib_cPt:
- return Hexagon::STrib_cPt_nv_V4;
-
- case Hexagon::STrib_cdnPt_V4:
- return Hexagon::STrib_cdnPt_nv_V4;
-
- case Hexagon::STrib_cNotPt:
- return Hexagon::STrib_cNotPt_nv_V4;
-
- case Hexagon::STrib_cdnNotPt_V4:
- return Hexagon::STrib_cdnNotPt_nv_V4;
-
- case Hexagon::STrib_indexed_cPt:
- return Hexagon::STrib_indexed_cPt_nv_V4;
-
- case Hexagon::STrib_indexed_cdnPt_V4:
- return Hexagon::STrib_indexed_cdnPt_nv_V4;
-
- case Hexagon::STrib_indexed_cNotPt:
- return Hexagon::STrib_indexed_cNotPt_nv_V4;
-
- case Hexagon::STrib_indexed_cdnNotPt_V4:
- return Hexagon::STrib_indexed_cdnNotPt_nv_V4;
-
- case Hexagon::STrib_indexed_shl_cPt_V4:
- return Hexagon::STrib_indexed_shl_cPt_nv_V4;
-
- case Hexagon::STrib_indexed_shl_cdnPt_V4:
- return Hexagon::STrib_indexed_shl_cdnPt_nv_V4;
-
- case Hexagon::STrib_indexed_shl_cNotPt_V4:
- return Hexagon::STrib_indexed_shl_cNotPt_nv_V4;
-
- case Hexagon::STrib_indexed_shl_cdnNotPt_V4:
- return Hexagon::STrib_indexed_shl_cdnNotPt_nv_V4;
-
- case Hexagon::POST_STbri_cPt:
- return Hexagon::POST_STbri_cPt_nv_V4;
-
- case Hexagon::POST_STbri_cdnPt_V4:
- return Hexagon::POST_STbri_cdnPt_nv_V4;
-
- case Hexagon::POST_STbri_cNotPt:
- return Hexagon::POST_STbri_cNotPt_nv_V4;
-
- case Hexagon::POST_STbri_cdnNotPt_V4:
- return Hexagon::POST_STbri_cdnNotPt_nv_V4;
-
- case Hexagon::STb_GP_cPt_V4:
- return Hexagon::STb_GP_cPt_nv_V4;
-
- case Hexagon::STb_GP_cNotPt_V4:
- return Hexagon::STb_GP_cNotPt_nv_V4;
-
- case Hexagon::STb_GP_cdnPt_V4:
- return Hexagon::STb_GP_cdnPt_nv_V4;
-
- case Hexagon::STb_GP_cdnNotPt_V4:
- return Hexagon::STb_GP_cdnNotPt_nv_V4;
-
- // store new value halfword
- case Hexagon::STrih:
- return Hexagon::STrih_nv_V4;
-
- case Hexagon::STrih_indexed:
- return Hexagon::STrih_indexed_nv_V4;
-
- case Hexagon::STrih_indexed_shl_V4:
- return Hexagon::STrih_indexed_shl_nv_V4;
-
- case Hexagon::STrih_shl_V4:
- return Hexagon::STrih_shl_nv_V4;
-
- case Hexagon::STh_GP_V4:
- return Hexagon::STh_GP_nv_V4;
-
- case Hexagon::POST_SThri:
- return Hexagon::POST_SThri_nv_V4;
-
- case Hexagon::STrih_cPt:
- return Hexagon::STrih_cPt_nv_V4;
-
- case Hexagon::STrih_cdnPt_V4:
- return Hexagon::STrih_cdnPt_nv_V4;
-
- case Hexagon::STrih_cNotPt:
- return Hexagon::STrih_cNotPt_nv_V4;
-
- case Hexagon::STrih_cdnNotPt_V4:
- return Hexagon::STrih_cdnNotPt_nv_V4;
-
- case Hexagon::STrih_indexed_cPt:
- return Hexagon::STrih_indexed_cPt_nv_V4;
-
- case Hexagon::STrih_indexed_cdnPt_V4:
- return Hexagon::STrih_indexed_cdnPt_nv_V4;
-
- case Hexagon::STrih_indexed_cNotPt:
- return Hexagon::STrih_indexed_cNotPt_nv_V4;
-
- case Hexagon::STrih_indexed_cdnNotPt_V4:
- return Hexagon::STrih_indexed_cdnNotPt_nv_V4;
-
- case Hexagon::STrih_indexed_shl_cPt_V4:
- return Hexagon::STrih_indexed_shl_cPt_nv_V4;
-
- case Hexagon::STrih_indexed_shl_cdnPt_V4:
- return Hexagon::STrih_indexed_shl_cdnPt_nv_V4;
-
- case Hexagon::STrih_indexed_shl_cNotPt_V4:
- return Hexagon::STrih_indexed_shl_cNotPt_nv_V4;
-
- case Hexagon::STrih_indexed_shl_cdnNotPt_V4:
- return Hexagon::STrih_indexed_shl_cdnNotPt_nv_V4;
-
- case Hexagon::POST_SThri_cPt:
- return Hexagon::POST_SThri_cPt_nv_V4;
-
- case Hexagon::POST_SThri_cdnPt_V4:
- return Hexagon::POST_SThri_cdnPt_nv_V4;
-
- case Hexagon::POST_SThri_cNotPt:
- return Hexagon::POST_SThri_cNotPt_nv_V4;
-
- case Hexagon::POST_SThri_cdnNotPt_V4:
- return Hexagon::POST_SThri_cdnNotPt_nv_V4;
-
- case Hexagon::STh_GP_cPt_V4:
- return Hexagon::STh_GP_cPt_nv_V4;
-
- case Hexagon::STh_GP_cNotPt_V4:
- return Hexagon::STh_GP_cNotPt_nv_V4;
-
- case Hexagon::STh_GP_cdnPt_V4:
- return Hexagon::STh_GP_cdnPt_nv_V4;
-
- case Hexagon::STh_GP_cdnNotPt_V4:
- return Hexagon::STh_GP_cdnNotPt_nv_V4;
-
- // store new value word
- case Hexagon::STriw:
- return Hexagon::STriw_nv_V4;
-
- case Hexagon::STriw_indexed:
- return Hexagon::STriw_indexed_nv_V4;
-
- case Hexagon::STriw_indexed_shl_V4:
- return Hexagon::STriw_indexed_shl_nv_V4;
-
- case Hexagon::STriw_shl_V4:
- return Hexagon::STriw_shl_nv_V4;
-
- case Hexagon::STw_GP_V4:
- return Hexagon::STw_GP_nv_V4;
-
- case Hexagon::POST_STwri:
- return Hexagon::POST_STwri_nv_V4;
-
- case Hexagon::STriw_cPt:
- return Hexagon::STriw_cPt_nv_V4;
-
- case Hexagon::STriw_cdnPt_V4:
- return Hexagon::STriw_cdnPt_nv_V4;
-
- case Hexagon::STriw_cNotPt:
- return Hexagon::STriw_cNotPt_nv_V4;
-
- case Hexagon::STriw_cdnNotPt_V4:
- return Hexagon::STriw_cdnNotPt_nv_V4;
-
- case Hexagon::STriw_indexed_cPt:
- return Hexagon::STriw_indexed_cPt_nv_V4;
-
- case Hexagon::STriw_indexed_cdnPt_V4:
- return Hexagon::STriw_indexed_cdnPt_nv_V4;
-
- case Hexagon::STriw_indexed_cNotPt:
- return Hexagon::STriw_indexed_cNotPt_nv_V4;
-
- case Hexagon::STriw_indexed_cdnNotPt_V4:
- return Hexagon::STriw_indexed_cdnNotPt_nv_V4;
-
- case Hexagon::STriw_indexed_shl_cPt_V4:
- return Hexagon::STriw_indexed_shl_cPt_nv_V4;
-
- case Hexagon::STriw_indexed_shl_cdnPt_V4:
- return Hexagon::STriw_indexed_shl_cdnPt_nv_V4;
-
- case Hexagon::STriw_indexed_shl_cNotPt_V4:
- return Hexagon::STriw_indexed_shl_cNotPt_nv_V4;
-
- case Hexagon::STriw_indexed_shl_cdnNotPt_V4:
- return Hexagon::STriw_indexed_shl_cdnNotPt_nv_V4;
-
- case Hexagon::POST_STwri_cPt:
- return Hexagon::POST_STwri_cPt_nv_V4;
-
- case Hexagon::POST_STwri_cdnPt_V4:
- return Hexagon::POST_STwri_cdnPt_nv_V4;
-
- case Hexagon::POST_STwri_cNotPt:
- return Hexagon::POST_STwri_cNotPt_nv_V4;
-
- case Hexagon::POST_STwri_cdnNotPt_V4:
- return Hexagon::POST_STwri_cdnNotPt_nv_V4;
-
- case Hexagon::STw_GP_cPt_V4:
- return Hexagon::STw_GP_cPt_nv_V4;
-
- case Hexagon::STw_GP_cNotPt_V4:
- return Hexagon::STw_GP_cNotPt_nv_V4;
-
- case Hexagon::STw_GP_cdnPt_V4:
- return Hexagon::STw_GP_cdnPt_nv_V4;
-
- case Hexagon::STw_GP_cdnNotPt_V4:
- return Hexagon::STw_GP_cdnNotPt_nv_V4;
-
- }
-}
-
-// Return .new predicate version for an instruction
-static int GetDotNewPredOp(MachineInstr *MI,
- const MachineBranchProbabilityInfo *MBPI,
- const HexagonInstrInfo *QII) {
- switch (MI->getOpcode()) {
- default: llvm_unreachable("Unknown .new type");
- // Conditional stores
- // Store byte conditionally
- case Hexagon::STrib_cPt :
- return Hexagon::STrib_cdnPt_V4;
-
- case Hexagon::STrib_cNotPt :
- return Hexagon::STrib_cdnNotPt_V4;
-
- case Hexagon::STrib_indexed_cPt :
- return Hexagon::STrib_indexed_cdnPt_V4;
-
- case Hexagon::STrib_indexed_cNotPt :
- return Hexagon::STrib_indexed_cdnNotPt_V4;
-
- case Hexagon::STrib_imm_cPt_V4 :
- return Hexagon::STrib_imm_cdnPt_V4;
-
- case Hexagon::STrib_imm_cNotPt_V4 :
- return Hexagon::STrib_imm_cdnNotPt_V4;
-
- case Hexagon::POST_STbri_cPt :
- return Hexagon::POST_STbri_cdnPt_V4;
-
- case Hexagon::POST_STbri_cNotPt :
- return Hexagon::POST_STbri_cdnNotPt_V4;
-
- case Hexagon::STrib_indexed_shl_cPt_V4 :
- return Hexagon::STrib_indexed_shl_cdnPt_V4;
-
- case Hexagon::STrib_indexed_shl_cNotPt_V4 :
- return Hexagon::STrib_indexed_shl_cdnNotPt_V4;
-
- case Hexagon::STb_GP_cPt_V4 :
- return Hexagon::STb_GP_cdnPt_V4;
-
- case Hexagon::STb_GP_cNotPt_V4 :
- return Hexagon::STb_GP_cdnNotPt_V4;
-
- // Store doubleword conditionally
- case Hexagon::STrid_cPt :
- return Hexagon::STrid_cdnPt_V4;
-
- case Hexagon::STrid_cNotPt :
- return Hexagon::STrid_cdnNotPt_V4;
-
- case Hexagon::STrid_indexed_cPt :
- return Hexagon::STrid_indexed_cdnPt_V4;
-
- case Hexagon::STrid_indexed_cNotPt :
- return Hexagon::STrid_indexed_cdnNotPt_V4;
-
- case Hexagon::STrid_indexed_shl_cPt_V4 :
- return Hexagon::STrid_indexed_shl_cdnPt_V4;
-
- case Hexagon::STrid_indexed_shl_cNotPt_V4 :
- return Hexagon::STrid_indexed_shl_cdnNotPt_V4;
-
- case Hexagon::POST_STdri_cPt :
- return Hexagon::POST_STdri_cdnPt_V4;
-
- case Hexagon::POST_STdri_cNotPt :
- return Hexagon::POST_STdri_cdnNotPt_V4;
-
- case Hexagon::STd_GP_cPt_V4 :
- return Hexagon::STd_GP_cdnPt_V4;
-
- case Hexagon::STd_GP_cNotPt_V4 :
- return Hexagon::STd_GP_cdnNotPt_V4;
-
- // Store halfword conditionally
- case Hexagon::STrih_cPt :
- return Hexagon::STrih_cdnPt_V4;
-
- case Hexagon::STrih_cNotPt :
- return Hexagon::STrih_cdnNotPt_V4;
-
- case Hexagon::STrih_indexed_cPt :
- return Hexagon::STrih_indexed_cdnPt_V4;
-
- case Hexagon::STrih_indexed_cNotPt :
- return Hexagon::STrih_indexed_cdnNotPt_V4;
-
- case Hexagon::STrih_imm_cPt_V4 :
- return Hexagon::STrih_imm_cdnPt_V4;
-
- case Hexagon::STrih_imm_cNotPt_V4 :
- return Hexagon::STrih_imm_cdnNotPt_V4;
-
- case Hexagon::STrih_indexed_shl_cPt_V4 :
- return Hexagon::STrih_indexed_shl_cdnPt_V4;
-
- case Hexagon::STrih_indexed_shl_cNotPt_V4 :
- return Hexagon::STrih_indexed_shl_cdnNotPt_V4;
-
- case Hexagon::POST_SThri_cPt :
- return Hexagon::POST_SThri_cdnPt_V4;
-
- case Hexagon::POST_SThri_cNotPt :
- return Hexagon::POST_SThri_cdnNotPt_V4;
-
- case Hexagon::STh_GP_cPt_V4 :
- return Hexagon::STh_GP_cdnPt_V4;
-
- case Hexagon::STh_GP_cNotPt_V4 :
- return Hexagon::STh_GP_cdnNotPt_V4;
-
- // Store word conditionally
- case Hexagon::STriw_cPt :
- return Hexagon::STriw_cdnPt_V4;
-
- case Hexagon::STriw_cNotPt :
- return Hexagon::STriw_cdnNotPt_V4;
-
- case Hexagon::STriw_indexed_cPt :
- return Hexagon::STriw_indexed_cdnPt_V4;
-
- case Hexagon::STriw_indexed_cNotPt :
- return Hexagon::STriw_indexed_cdnNotPt_V4;
-
- case Hexagon::STriw_imm_cPt_V4 :
- return Hexagon::STriw_imm_cdnPt_V4;
-
- case Hexagon::STriw_imm_cNotPt_V4 :
- return Hexagon::STriw_imm_cdnNotPt_V4;
-
- case Hexagon::STriw_indexed_shl_cPt_V4 :
- return Hexagon::STriw_indexed_shl_cdnPt_V4;
-
- case Hexagon::STriw_indexed_shl_cNotPt_V4 :
- return Hexagon::STriw_indexed_shl_cdnNotPt_V4;
-
- case Hexagon::POST_STwri_cPt :
- return Hexagon::POST_STwri_cdnPt_V4;
-
- case Hexagon::POST_STwri_cNotPt :
- return Hexagon::POST_STwri_cdnNotPt_V4;
-
- case Hexagon::STw_GP_cPt_V4 :
- return Hexagon::STw_GP_cdnPt_V4;
-
- case Hexagon::STw_GP_cNotPt_V4 :
- return Hexagon::STw_GP_cdnNotPt_V4;
-
- // Condtional Jumps
- case Hexagon::JMP_t:
- case Hexagon::JMP_f:
- return QII->getDotNewPredJumpOp(MI, MBPI);
-
- case Hexagon::JMPR_t:
- return Hexagon::JMPR_tnew_tV3;
-
- case Hexagon::JMPR_f:
- return Hexagon::JMPR_fnew_tV3;
-
- // Conditional Transfers
- case Hexagon::TFR_cPt:
- return Hexagon::TFR_cdnPt;
-
- case Hexagon::TFR_cNotPt:
- return Hexagon::TFR_cdnNotPt;
-
- case Hexagon::TFRI_cPt:
- return Hexagon::TFRI_cdnPt;
-
- case Hexagon::TFRI_cNotPt:
- return Hexagon::TFRI_cdnNotPt;
-
- // Load double word
- case Hexagon::LDrid_cPt :
- return Hexagon::LDrid_cdnPt;
-
- case Hexagon::LDrid_cNotPt :
- return Hexagon::LDrid_cdnNotPt;
-
- case Hexagon::LDrid_indexed_cPt :
- return Hexagon::LDrid_indexed_cdnPt;
-
- case Hexagon::LDrid_indexed_cNotPt :
- return Hexagon::LDrid_indexed_cdnNotPt;
-
- case Hexagon::POST_LDrid_cPt :
- return Hexagon::POST_LDrid_cdnPt_V4;
-
- case Hexagon::POST_LDrid_cNotPt :
- return Hexagon::POST_LDrid_cdnNotPt_V4;
-
- // Load word
- case Hexagon::LDriw_cPt :
- return Hexagon::LDriw_cdnPt;
-
- case Hexagon::LDriw_cNotPt :
- return Hexagon::LDriw_cdnNotPt;
-
- case Hexagon::LDriw_indexed_cPt :
- return Hexagon::LDriw_indexed_cdnPt;
-
- case Hexagon::LDriw_indexed_cNotPt :
- return Hexagon::LDriw_indexed_cdnNotPt;
-
- case Hexagon::POST_LDriw_cPt :
- return Hexagon::POST_LDriw_cdnPt_V4;
-
- case Hexagon::POST_LDriw_cNotPt :
- return Hexagon::POST_LDriw_cdnNotPt_V4;
-
- // Load halfword
- case Hexagon::LDrih_cPt :
- return Hexagon::LDrih_cdnPt;
-
- case Hexagon::LDrih_cNotPt :
- return Hexagon::LDrih_cdnNotPt;
-
- case Hexagon::LDrih_indexed_cPt :
- return Hexagon::LDrih_indexed_cdnPt;
-
- case Hexagon::LDrih_indexed_cNotPt :
- return Hexagon::LDrih_indexed_cdnNotPt;
-
- case Hexagon::POST_LDrih_cPt :
- return Hexagon::POST_LDrih_cdnPt_V4;
-
- case Hexagon::POST_LDrih_cNotPt :
- return Hexagon::POST_LDrih_cdnNotPt_V4;
-
- // Load byte
- case Hexagon::LDrib_cPt :
- return Hexagon::LDrib_cdnPt;
-
- case Hexagon::LDrib_cNotPt :
- return Hexagon::LDrib_cdnNotPt;
-
- case Hexagon::LDrib_indexed_cPt :
- return Hexagon::LDrib_indexed_cdnPt;
-
- case Hexagon::LDrib_indexed_cNotPt :
- return Hexagon::LDrib_indexed_cdnNotPt;
-
- case Hexagon::POST_LDrib_cPt :
- return Hexagon::POST_LDrib_cdnPt_V4;
-
- case Hexagon::POST_LDrib_cNotPt :
- return Hexagon::POST_LDrib_cdnNotPt_V4;
-
- // Load unsigned halfword
- case Hexagon::LDriuh_cPt :
- return Hexagon::LDriuh_cdnPt;
-
- case Hexagon::LDriuh_cNotPt :
- return Hexagon::LDriuh_cdnNotPt;
-
- case Hexagon::LDriuh_indexed_cPt :
- return Hexagon::LDriuh_indexed_cdnPt;
-
- case Hexagon::LDriuh_indexed_cNotPt :
- return Hexagon::LDriuh_indexed_cdnNotPt;
-
- case Hexagon::POST_LDriuh_cPt :
- return Hexagon::POST_LDriuh_cdnPt_V4;
-
- case Hexagon::POST_LDriuh_cNotPt :
- return Hexagon::POST_LDriuh_cdnNotPt_V4;
-
- // Load unsigned byte
- case Hexagon::LDriub_cPt :
- return Hexagon::LDriub_cdnPt;
-
- case Hexagon::LDriub_cNotPt :
- return Hexagon::LDriub_cdnNotPt;
-
- case Hexagon::LDriub_indexed_cPt :
- return Hexagon::LDriub_indexed_cdnPt;
-
- case Hexagon::LDriub_indexed_cNotPt :
- return Hexagon::LDriub_indexed_cdnNotPt;
-
- case Hexagon::POST_LDriub_cPt :
- return Hexagon::POST_LDriub_cdnPt_V4;
-
- case Hexagon::POST_LDriub_cNotPt :
- return Hexagon::POST_LDriub_cdnNotPt_V4;
-
- // V4 indexed+scaled load
-
- case Hexagon::LDrid_indexed_shl_cPt_V4 :
- return Hexagon::LDrid_indexed_shl_cdnPt_V4;
-
- case Hexagon::LDrid_indexed_shl_cNotPt_V4 :
- return Hexagon::LDrid_indexed_shl_cdnNotPt_V4;
-
- case Hexagon::LDrib_indexed_shl_cPt_V4 :
- return Hexagon::LDrib_indexed_shl_cdnPt_V4;
-
- case Hexagon::LDrib_indexed_shl_cNotPt_V4 :
- return Hexagon::LDrib_indexed_shl_cdnNotPt_V4;
-
- case Hexagon::LDriub_indexed_shl_cPt_V4 :
- return Hexagon::LDriub_indexed_shl_cdnPt_V4;
-
- case Hexagon::LDriub_indexed_shl_cNotPt_V4 :
- return Hexagon::LDriub_indexed_shl_cdnNotPt_V4;
-
- case Hexagon::LDrih_indexed_shl_cPt_V4 :
- return Hexagon::LDrih_indexed_shl_cdnPt_V4;
-
- case Hexagon::LDrih_indexed_shl_cNotPt_V4 :
- return Hexagon::LDrih_indexed_shl_cdnNotPt_V4;
-
- case Hexagon::LDriuh_indexed_shl_cPt_V4 :
- return Hexagon::LDriuh_indexed_shl_cdnPt_V4;
-
- case Hexagon::LDriuh_indexed_shl_cNotPt_V4 :
- return Hexagon::LDriuh_indexed_shl_cdnNotPt_V4;
-
- case Hexagon::LDriw_indexed_shl_cPt_V4 :
- return Hexagon::LDriw_indexed_shl_cdnPt_V4;
-
- case Hexagon::LDriw_indexed_shl_cNotPt_V4 :
- return Hexagon::LDriw_indexed_shl_cdnNotPt_V4;
-
- // V4 global address load
-
- case Hexagon::LDd_GP_cPt_V4:
- return Hexagon::LDd_GP_cdnPt_V4;
-
- case Hexagon::LDd_GP_cNotPt_V4:
- return Hexagon::LDd_GP_cdnNotPt_V4;
-
- case Hexagon::LDb_GP_cPt_V4:
- return Hexagon::LDb_GP_cdnPt_V4;
-
- case Hexagon::LDb_GP_cNotPt_V4:
- return Hexagon::LDb_GP_cdnNotPt_V4;
-
- case Hexagon::LDub_GP_cPt_V4:
- return Hexagon::LDub_GP_cdnPt_V4;
-
- case Hexagon::LDub_GP_cNotPt_V4:
- return Hexagon::LDub_GP_cdnNotPt_V4;
-
- case Hexagon::LDh_GP_cPt_V4:
- return Hexagon::LDh_GP_cdnPt_V4;
-
- case Hexagon::LDh_GP_cNotPt_V4:
- return Hexagon::LDh_GP_cdnNotPt_V4;
-
- case Hexagon::LDuh_GP_cPt_V4:
- return Hexagon::LDuh_GP_cdnPt_V4;
-
- case Hexagon::LDuh_GP_cNotPt_V4:
- return Hexagon::LDuh_GP_cdnNotPt_V4;
-
- case Hexagon::LDw_GP_cPt_V4:
- return Hexagon::LDw_GP_cdnPt_V4;
-
- case Hexagon::LDw_GP_cNotPt_V4:
- return Hexagon::LDw_GP_cdnNotPt_V4;
-
- // Conditional store new-value byte
- case Hexagon::STrib_cPt_nv_V4 :
- return Hexagon::STrib_cdnPt_nv_V4;
- case Hexagon::STrib_cNotPt_nv_V4 :
- return Hexagon::STrib_cdnNotPt_nv_V4;
-
- case Hexagon::STrib_indexed_cPt_nv_V4 :
- return Hexagon::STrib_indexed_cdnPt_nv_V4;
- case Hexagon::STrib_indexed_cNotPt_nv_V4 :
- return Hexagon::STrib_indexed_cdnNotPt_nv_V4;
-
- case Hexagon::STrib_indexed_shl_cPt_nv_V4 :
- return Hexagon::STrib_indexed_shl_cdnPt_nv_V4;
- case Hexagon::STrib_indexed_shl_cNotPt_nv_V4 :
- return Hexagon::STrib_indexed_shl_cdnNotPt_nv_V4;
-
- case Hexagon::POST_STbri_cPt_nv_V4 :
- return Hexagon::POST_STbri_cdnPt_nv_V4;
- case Hexagon::POST_STbri_cNotPt_nv_V4 :
- return Hexagon::POST_STbri_cdnNotPt_nv_V4;
-
- case Hexagon::STb_GP_cPt_nv_V4 :
- return Hexagon::STb_GP_cdnPt_nv_V4;
-
- case Hexagon::STb_GP_cNotPt_nv_V4 :
- return Hexagon::STb_GP_cdnNotPt_nv_V4;
-
- // Conditional store new-value halfword
- case Hexagon::STrih_cPt_nv_V4 :
- return Hexagon::STrih_cdnPt_nv_V4;
- case Hexagon::STrih_cNotPt_nv_V4 :
- return Hexagon::STrih_cdnNotPt_nv_V4;
-
- case Hexagon::STrih_indexed_cPt_nv_V4 :
- return Hexagon::STrih_indexed_cdnPt_nv_V4;
- case Hexagon::STrih_indexed_cNotPt_nv_V4 :
- return Hexagon::STrih_indexed_cdnNotPt_nv_V4;
-
- case Hexagon::STrih_indexed_shl_cPt_nv_V4 :
- return Hexagon::STrih_indexed_shl_cdnPt_nv_V4;
- case Hexagon::STrih_indexed_shl_cNotPt_nv_V4 :
- return Hexagon::STrih_indexed_shl_cdnNotPt_nv_V4;
-
- case Hexagon::POST_SThri_cPt_nv_V4 :
- return Hexagon::POST_SThri_cdnPt_nv_V4;
- case Hexagon::POST_SThri_cNotPt_nv_V4 :
- return Hexagon::POST_SThri_cdnNotPt_nv_V4;
-
- case Hexagon::STh_GP_cPt_nv_V4 :
- return Hexagon::STh_GP_cdnPt_nv_V4;
-
- case Hexagon::STh_GP_cNotPt_nv_V4 :
- return Hexagon::STh_GP_cdnNotPt_nv_V4;
-
- // Conditional store new-value word
- case Hexagon::STriw_cPt_nv_V4 :
- return Hexagon::STriw_cdnPt_nv_V4;
- case Hexagon::STriw_cNotPt_nv_V4 :
- return Hexagon::STriw_cdnNotPt_nv_V4;
-
- case Hexagon::STriw_indexed_cPt_nv_V4 :
- return Hexagon::STriw_indexed_cdnPt_nv_V4;
- case Hexagon::STriw_indexed_cNotPt_nv_V4 :
- return Hexagon::STriw_indexed_cdnNotPt_nv_V4;
-
- case Hexagon::STriw_indexed_shl_cPt_nv_V4 :
- return Hexagon::STriw_indexed_shl_cdnPt_nv_V4;
- case Hexagon::STriw_indexed_shl_cNotPt_nv_V4 :
- return Hexagon::STriw_indexed_shl_cdnNotPt_nv_V4;
-
- case Hexagon::POST_STwri_cPt_nv_V4 :
- return Hexagon::POST_STwri_cdnPt_nv_V4;
- case Hexagon::POST_STwri_cNotPt_nv_V4:
- return Hexagon::POST_STwri_cdnNotPt_nv_V4;
-
- case Hexagon::STw_GP_cPt_nv_V4 :
- return Hexagon::STw_GP_cdnPt_nv_V4;
-
- case Hexagon::STw_GP_cNotPt_nv_V4 :
- return Hexagon::STw_GP_cdnNotPt_nv_V4;
-
- // Conditional add
- case Hexagon::ADD_ri_cPt :
- return Hexagon::ADD_ri_cdnPt;
- case Hexagon::ADD_ri_cNotPt :
- return Hexagon::ADD_ri_cdnNotPt;
-
- case Hexagon::ADD_rr_cPt :
- return Hexagon::ADD_rr_cdnPt;
- case Hexagon::ADD_rr_cNotPt :
- return Hexagon::ADD_rr_cdnNotPt;
-
- // Conditional logical Operations
- case Hexagon::XOR_rr_cPt :
- return Hexagon::XOR_rr_cdnPt;
- case Hexagon::XOR_rr_cNotPt :
- return Hexagon::XOR_rr_cdnNotPt;
-
- case Hexagon::AND_rr_cPt :
- return Hexagon::AND_rr_cdnPt;
- case Hexagon::AND_rr_cNotPt :
- return Hexagon::AND_rr_cdnNotPt;
-
- case Hexagon::OR_rr_cPt :
- return Hexagon::OR_rr_cdnPt;
- case Hexagon::OR_rr_cNotPt :
- return Hexagon::OR_rr_cdnNotPt;
-
- // Conditional Subtract
- case Hexagon::SUB_rr_cPt :
- return Hexagon::SUB_rr_cdnPt;
- case Hexagon::SUB_rr_cNotPt :
- return Hexagon::SUB_rr_cdnNotPt;
-
- // Conditional combine
- case Hexagon::COMBINE_rr_cPt :
- return Hexagon::COMBINE_rr_cdnPt;
- case Hexagon::COMBINE_rr_cNotPt :
- return Hexagon::COMBINE_rr_cdnNotPt;
-
- case Hexagon::ASLH_cPt_V4 :
- return Hexagon::ASLH_cdnPt_V4;
- case Hexagon::ASLH_cNotPt_V4 :
- return Hexagon::ASLH_cdnNotPt_V4;
-
- case Hexagon::ASRH_cPt_V4 :
- return Hexagon::ASRH_cdnPt_V4;
- case Hexagon::ASRH_cNotPt_V4 :
- return Hexagon::ASRH_cdnNotPt_V4;
-
- case Hexagon::SXTB_cPt_V4 :
- return Hexagon::SXTB_cdnPt_V4;
- case Hexagon::SXTB_cNotPt_V4 :
- return Hexagon::SXTB_cdnNotPt_V4;
-
- case Hexagon::SXTH_cPt_V4 :
- return Hexagon::SXTH_cdnPt_V4;
- case Hexagon::SXTH_cNotPt_V4 :
- return Hexagon::SXTH_cdnNotPt_V4;
-
- case Hexagon::ZXTB_cPt_V4 :
- return Hexagon::ZXTB_cdnPt_V4;
- case Hexagon::ZXTB_cNotPt_V4 :
- return Hexagon::ZXTB_cdnNotPt_V4;
-
- case Hexagon::ZXTH_cPt_V4 :
- return Hexagon::ZXTH_cdnPt_V4;
- case Hexagon::ZXTH_cNotPt_V4 :
- return Hexagon::ZXTH_cdnNotPt_V4;
- }
-}
-
// Returns true if an instruction can be promoted to .new predicate
// or new-value store.
bool HexagonPacketizerList::isNewifiable(MachineInstr* MI) {
- if ( isCondInst(MI) || IsNewifyStore(MI))
+ const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+ if ( isCondInst(MI) || QII->mayBeNewStore(MI))
return true;
else
return false;
@@ -1296,896 +437,38 @@ bool HexagonPacketizerList::PromoteToDotNew(MachineInstr* MI,
int NewOpcode;
if (RC == &Hexagon::PredRegsRegClass)
- NewOpcode = GetDotNewPredOp(MI, MBPI, QII);
+ NewOpcode = QII->GetDotNewPredOp(MI, MBPI);
else
- NewOpcode = GetDotNewOp(MI->getOpcode());
+ NewOpcode = QII->GetDotNewOp(MI);
MI->setDesc(QII->get(NewOpcode));
return true;
}
-// Returns the most basic instruction for the .new predicated instructions and
-// new-value stores.
-// For example, all of the following instructions will be converted back to the
-// same instruction:
-// 1) if (p0.new) memw(R0+#0) = R1.new --->
-// 2) if (p0) memw(R0+#0)= R1.new -------> if (p0) memw(R0+#0) = R1
-// 3) if (p0.new) memw(R0+#0) = R1 --->
-//
-// To understand the translation of instruction 1 to its original form, consider
-// a packet with 3 instructions.
-// { p0 = cmp.eq(R0,R1)
-// if (p0.new) R2 = add(R3, R4)
-// R5 = add (R3, R1)
-// }
-// if (p0) memw(R5+#0) = R2 <--- trying to include it in the previous packet
-//
-// This instruction can be part of the previous packet only if both p0 and R2
-// are promoted to .new values. This promotion happens in steps, first
-// predicate register is promoted to .new and in the next iteration R2 is
-// promoted. Therefore, in case of dependence check failure (due to R5) during
-// next iteration, it should be converted back to its most basic form.
-
-static int GetDotOldOp(const int opc) {
- switch (opc) {
- default: llvm_unreachable("Unknown .old type");
- case Hexagon::TFR_cdnPt:
- return Hexagon::TFR_cPt;
-
- case Hexagon::TFR_cdnNotPt:
- return Hexagon::TFR_cNotPt;
-
- case Hexagon::TFRI_cdnPt:
- return Hexagon::TFRI_cPt;
-
- case Hexagon::TFRI_cdnNotPt:
- return Hexagon::TFRI_cNotPt;
-
- case Hexagon::JMP_tnew_t:
- return Hexagon::JMP_t;
-
- case Hexagon::JMP_fnew_t:
- return Hexagon::JMP_f;
-
- case Hexagon::JMPR_tnew_tV3:
- return Hexagon::JMPR_t;
-
- case Hexagon::JMPR_fnew_tV3:
- return Hexagon::JMPR_f;
-
- // Load double word
-
- case Hexagon::LDrid_cdnPt :
- return Hexagon::LDrid_cPt;
-
- case Hexagon::LDrid_cdnNotPt :
- return Hexagon::LDrid_cNotPt;
-
- case Hexagon::LDrid_indexed_cdnPt :
- return Hexagon::LDrid_indexed_cPt;
-
- case Hexagon::LDrid_indexed_cdnNotPt :
- return Hexagon::LDrid_indexed_cNotPt;
-
- case Hexagon::POST_LDrid_cdnPt_V4 :
- return Hexagon::POST_LDrid_cPt;
-
- case Hexagon::POST_LDrid_cdnNotPt_V4 :
- return Hexagon::POST_LDrid_cNotPt;
-
- // Load word
-
- case Hexagon::LDriw_cdnPt :
- return Hexagon::LDriw_cPt;
-
- case Hexagon::LDriw_cdnNotPt :
- return Hexagon::LDriw_cNotPt;
-
- case Hexagon::LDriw_indexed_cdnPt :
- return Hexagon::LDriw_indexed_cPt;
-
- case Hexagon::LDriw_indexed_cdnNotPt :
- return Hexagon::LDriw_indexed_cNotPt;
-
- case Hexagon::POST_LDriw_cdnPt_V4 :
- return Hexagon::POST_LDriw_cPt;
-
- case Hexagon::POST_LDriw_cdnNotPt_V4 :
- return Hexagon::POST_LDriw_cNotPt;
-
- // Load half
-
- case Hexagon::LDrih_cdnPt :
- return Hexagon::LDrih_cPt;
-
- case Hexagon::LDrih_cdnNotPt :
- return Hexagon::LDrih_cNotPt;
-
- case Hexagon::LDrih_indexed_cdnPt :
- return Hexagon::LDrih_indexed_cPt;
-
- case Hexagon::LDrih_indexed_cdnNotPt :
- return Hexagon::LDrih_indexed_cNotPt;
-
- case Hexagon::POST_LDrih_cdnPt_V4 :
- return Hexagon::POST_LDrih_cPt;
-
- case Hexagon::POST_LDrih_cdnNotPt_V4 :
- return Hexagon::POST_LDrih_cNotPt;
-
- // Load byte
-
- case Hexagon::LDrib_cdnPt :
- return Hexagon::LDrib_cPt;
-
- case Hexagon::LDrib_cdnNotPt :
- return Hexagon::LDrib_cNotPt;
-
- case Hexagon::LDrib_indexed_cdnPt :
- return Hexagon::LDrib_indexed_cPt;
-
- case Hexagon::LDrib_indexed_cdnNotPt :
- return Hexagon::LDrib_indexed_cNotPt;
-
- case Hexagon::POST_LDrib_cdnPt_V4 :
- return Hexagon::POST_LDrib_cPt;
-
- case Hexagon::POST_LDrib_cdnNotPt_V4 :
- return Hexagon::POST_LDrib_cNotPt;
-
- // Load unsigned half
-
- case Hexagon::LDriuh_cdnPt :
- return Hexagon::LDriuh_cPt;
-
- case Hexagon::LDriuh_cdnNotPt :
- return Hexagon::LDriuh_cNotPt;
-
- case Hexagon::LDriuh_indexed_cdnPt :
- return Hexagon::LDriuh_indexed_cPt;
-
- case Hexagon::LDriuh_indexed_cdnNotPt :
- return Hexagon::LDriuh_indexed_cNotPt;
-
- case Hexagon::POST_LDriuh_cdnPt_V4 :
- return Hexagon::POST_LDriuh_cPt;
-
- case Hexagon::POST_LDriuh_cdnNotPt_V4 :
- return Hexagon::POST_LDriuh_cNotPt;
-
- // Load unsigned byte
- case Hexagon::LDriub_cdnPt :
- return Hexagon::LDriub_cPt;
-
- case Hexagon::LDriub_cdnNotPt :
- return Hexagon::LDriub_cNotPt;
-
- case Hexagon::LDriub_indexed_cdnPt :
- return Hexagon::LDriub_indexed_cPt;
-
- case Hexagon::LDriub_indexed_cdnNotPt :
- return Hexagon::LDriub_indexed_cNotPt;
-
- case Hexagon::POST_LDriub_cdnPt_V4 :
- return Hexagon::POST_LDriub_cPt;
-
- case Hexagon::POST_LDriub_cdnNotPt_V4 :
- return Hexagon::POST_LDriub_cNotPt;
-
- // V4 indexed+scaled Load
-
- case Hexagon::LDrid_indexed_shl_cdnPt_V4 :
- return Hexagon::LDrid_indexed_shl_cPt_V4;
-
- case Hexagon::LDrid_indexed_shl_cdnNotPt_V4 :
- return Hexagon::LDrid_indexed_shl_cNotPt_V4;
-
- case Hexagon::LDrib_indexed_shl_cdnPt_V4 :
- return Hexagon::LDrib_indexed_shl_cPt_V4;
-
- case Hexagon::LDrib_indexed_shl_cdnNotPt_V4 :
- return Hexagon::LDrib_indexed_shl_cNotPt_V4;
-
- case Hexagon::LDriub_indexed_shl_cdnPt_V4 :
- return Hexagon::LDriub_indexed_shl_cPt_V4;
-
- case Hexagon::LDriub_indexed_shl_cdnNotPt_V4 :
- return Hexagon::LDriub_indexed_shl_cNotPt_V4;
-
- case Hexagon::LDrih_indexed_shl_cdnPt_V4 :
- return Hexagon::LDrih_indexed_shl_cPt_V4;
-
- case Hexagon::LDrih_indexed_shl_cdnNotPt_V4 :
- return Hexagon::LDrih_indexed_shl_cNotPt_V4;
-
- case Hexagon::LDriuh_indexed_shl_cdnPt_V4 :
- return Hexagon::LDriuh_indexed_shl_cPt_V4;
-
- case Hexagon::LDriuh_indexed_shl_cdnNotPt_V4 :
- return Hexagon::LDriuh_indexed_shl_cNotPt_V4;
-
- case Hexagon::LDriw_indexed_shl_cdnPt_V4 :
- return Hexagon::LDriw_indexed_shl_cPt_V4;
-
- case Hexagon::LDriw_indexed_shl_cdnNotPt_V4 :
- return Hexagon::LDriw_indexed_shl_cNotPt_V4;
-
- // V4 global address load
-
- case Hexagon::LDd_GP_cdnPt_V4:
- return Hexagon::LDd_GP_cPt_V4;
-
- case Hexagon::LDd_GP_cdnNotPt_V4:
- return Hexagon::LDd_GP_cNotPt_V4;
-
- case Hexagon::LDb_GP_cdnPt_V4:
- return Hexagon::LDb_GP_cPt_V4;
-
- case Hexagon::LDb_GP_cdnNotPt_V4:
- return Hexagon::LDb_GP_cNotPt_V4;
-
- case Hexagon::LDub_GP_cdnPt_V4:
- return Hexagon::LDub_GP_cPt_V4;
-
- case Hexagon::LDub_GP_cdnNotPt_V4:
- return Hexagon::LDub_GP_cNotPt_V4;
-
- case Hexagon::LDh_GP_cdnPt_V4:
- return Hexagon::LDh_GP_cPt_V4;
-
- case Hexagon::LDh_GP_cdnNotPt_V4:
- return Hexagon::LDh_GP_cNotPt_V4;
-
- case Hexagon::LDuh_GP_cdnPt_V4:
- return Hexagon::LDuh_GP_cPt_V4;
-
- case Hexagon::LDuh_GP_cdnNotPt_V4:
- return Hexagon::LDuh_GP_cNotPt_V4;
-
- case Hexagon::LDw_GP_cdnPt_V4:
- return Hexagon::LDw_GP_cPt_V4;
-
- case Hexagon::LDw_GP_cdnNotPt_V4:
- return Hexagon::LDw_GP_cNotPt_V4;
-
- // Conditional add
-
- case Hexagon::ADD_ri_cdnPt :
- return Hexagon::ADD_ri_cPt;
- case Hexagon::ADD_ri_cdnNotPt :
- return Hexagon::ADD_ri_cNotPt;
-
- case Hexagon::ADD_rr_cdnPt :
- return Hexagon::ADD_rr_cPt;
- case Hexagon::ADD_rr_cdnNotPt:
- return Hexagon::ADD_rr_cNotPt;
-
- // Conditional logical Operations
-
- case Hexagon::XOR_rr_cdnPt :
- return Hexagon::XOR_rr_cPt;
- case Hexagon::XOR_rr_cdnNotPt :
- return Hexagon::XOR_rr_cNotPt;
-
- case Hexagon::AND_rr_cdnPt :
- return Hexagon::AND_rr_cPt;
- case Hexagon::AND_rr_cdnNotPt :
- return Hexagon::AND_rr_cNotPt;
-
- case Hexagon::OR_rr_cdnPt :
- return Hexagon::OR_rr_cPt;
- case Hexagon::OR_rr_cdnNotPt :
- return Hexagon::OR_rr_cNotPt;
-
- // Conditional Subtract
-
- case Hexagon::SUB_rr_cdnPt :
- return Hexagon::SUB_rr_cPt;
- case Hexagon::SUB_rr_cdnNotPt :
- return Hexagon::SUB_rr_cNotPt;
-
- // Conditional combine
-
- case Hexagon::COMBINE_rr_cdnPt :
- return Hexagon::COMBINE_rr_cPt;
- case Hexagon::COMBINE_rr_cdnNotPt :
- return Hexagon::COMBINE_rr_cNotPt;
-
-// Conditional shift operations
-
- case Hexagon::ASLH_cdnPt_V4 :
- return Hexagon::ASLH_cPt_V4;
- case Hexagon::ASLH_cdnNotPt_V4 :
- return Hexagon::ASLH_cNotPt_V4;
-
- case Hexagon::ASRH_cdnPt_V4 :
- return Hexagon::ASRH_cPt_V4;
- case Hexagon::ASRH_cdnNotPt_V4 :
- return Hexagon::ASRH_cNotPt_V4;
-
- case Hexagon::SXTB_cdnPt_V4 :
- return Hexagon::SXTB_cPt_V4;
- case Hexagon::SXTB_cdnNotPt_V4 :
- return Hexagon::SXTB_cNotPt_V4;
-
- case Hexagon::SXTH_cdnPt_V4 :
- return Hexagon::SXTH_cPt_V4;
- case Hexagon::SXTH_cdnNotPt_V4 :
- return Hexagon::SXTH_cNotPt_V4;
-
- case Hexagon::ZXTB_cdnPt_V4 :
- return Hexagon::ZXTB_cPt_V4;
- case Hexagon::ZXTB_cdnNotPt_V4 :
- return Hexagon::ZXTB_cNotPt_V4;
-
- case Hexagon::ZXTH_cdnPt_V4 :
- return Hexagon::ZXTH_cPt_V4;
- case Hexagon::ZXTH_cdnNotPt_V4 :
- return Hexagon::ZXTH_cNotPt_V4;
-
- // Store byte
-
- case Hexagon::STrib_imm_cdnPt_V4 :
- return Hexagon::STrib_imm_cPt_V4;
-
- case Hexagon::STrib_imm_cdnNotPt_V4 :
- return Hexagon::STrib_imm_cNotPt_V4;
-
- case Hexagon::STrib_cdnPt_nv_V4 :
- case Hexagon::STrib_cPt_nv_V4 :
- case Hexagon::STrib_cdnPt_V4 :
- return Hexagon::STrib_cPt;
-
- case Hexagon::STrib_cdnNotPt_nv_V4 :
- case Hexagon::STrib_cNotPt_nv_V4 :
- case Hexagon::STrib_cdnNotPt_V4 :
- return Hexagon::STrib_cNotPt;
-
- case Hexagon::STrib_indexed_cdnPt_V4 :
- case Hexagon::STrib_indexed_cPt_nv_V4 :
- case Hexagon::STrib_indexed_cdnPt_nv_V4 :
- return Hexagon::STrib_indexed_cPt;
-
- case Hexagon::STrib_indexed_cdnNotPt_V4 :
- case Hexagon::STrib_indexed_cNotPt_nv_V4 :
- case Hexagon::STrib_indexed_cdnNotPt_nv_V4 :
- return Hexagon::STrib_indexed_cNotPt;
-
- case Hexagon::STrib_indexed_shl_cdnPt_nv_V4:
- case Hexagon::STrib_indexed_shl_cPt_nv_V4 :
- case Hexagon::STrib_indexed_shl_cdnPt_V4 :
- return Hexagon::STrib_indexed_shl_cPt_V4;
-
- case Hexagon::STrib_indexed_shl_cdnNotPt_nv_V4:
- case Hexagon::STrib_indexed_shl_cNotPt_nv_V4 :
- case Hexagon::STrib_indexed_shl_cdnNotPt_V4 :
- return Hexagon::STrib_indexed_shl_cNotPt_V4;
-
- case Hexagon::POST_STbri_cdnPt_nv_V4 :
- case Hexagon::POST_STbri_cPt_nv_V4 :
- case Hexagon::POST_STbri_cdnPt_V4 :
- return Hexagon::POST_STbri_cPt;
-
- case Hexagon::POST_STbri_cdnNotPt_nv_V4 :
- case Hexagon::POST_STbri_cNotPt_nv_V4:
- case Hexagon::POST_STbri_cdnNotPt_V4 :
- return Hexagon::POST_STbri_cNotPt;
-
- case Hexagon::STb_GP_cdnPt_nv_V4:
- case Hexagon::STb_GP_cdnPt_V4:
- case Hexagon::STb_GP_cPt_nv_V4:
- return Hexagon::STb_GP_cPt_V4;
-
- case Hexagon::STb_GP_cdnNotPt_nv_V4:
- case Hexagon::STb_GP_cdnNotPt_V4:
- case Hexagon::STb_GP_cNotPt_nv_V4:
- return Hexagon::STb_GP_cNotPt_V4;
-
- // Store new-value byte - unconditional
- case Hexagon::STrib_nv_V4:
- return Hexagon::STrib;
-
- case Hexagon::STrib_indexed_nv_V4:
- return Hexagon::STrib_indexed;
-
- case Hexagon::STrib_indexed_shl_nv_V4:
- return Hexagon::STrib_indexed_shl_V4;
-
- case Hexagon::STrib_shl_nv_V4:
- return Hexagon::STrib_shl_V4;
-
- case Hexagon::STb_GP_nv_V4:
- return Hexagon::STb_GP_V4;
-
- case Hexagon::POST_STbri_nv_V4:
- return Hexagon::POST_STbri;
-
- // Store halfword
- case Hexagon::STrih_imm_cdnPt_V4 :
- return Hexagon::STrih_imm_cPt_V4;
-
- case Hexagon::STrih_imm_cdnNotPt_V4 :
- return Hexagon::STrih_imm_cNotPt_V4;
-
- case Hexagon::STrih_cdnPt_nv_V4 :
- case Hexagon::STrih_cPt_nv_V4 :
- case Hexagon::STrih_cdnPt_V4 :
- return Hexagon::STrih_cPt;
-
- case Hexagon::STrih_cdnNotPt_nv_V4 :
- case Hexagon::STrih_cNotPt_nv_V4 :
- case Hexagon::STrih_cdnNotPt_V4 :
- return Hexagon::STrih_cNotPt;
-
- case Hexagon::STrih_indexed_cdnPt_nv_V4:
- case Hexagon::STrih_indexed_cPt_nv_V4 :
- case Hexagon::STrih_indexed_cdnPt_V4 :
- return Hexagon::STrih_indexed_cPt;
-
- case Hexagon::STrih_indexed_cdnNotPt_nv_V4:
- case Hexagon::STrih_indexed_cNotPt_nv_V4 :
- case Hexagon::STrih_indexed_cdnNotPt_V4 :
- return Hexagon::STrih_indexed_cNotPt;
-
- case Hexagon::STrih_indexed_shl_cdnPt_nv_V4 :
- case Hexagon::STrih_indexed_shl_cPt_nv_V4 :
- case Hexagon::STrih_indexed_shl_cdnPt_V4 :
- return Hexagon::STrih_indexed_shl_cPt_V4;
-
- case Hexagon::STrih_indexed_shl_cdnNotPt_nv_V4 :
- case Hexagon::STrih_indexed_shl_cNotPt_nv_V4 :
- case Hexagon::STrih_indexed_shl_cdnNotPt_V4 :
- return Hexagon::STrih_indexed_shl_cNotPt_V4;
-
- case Hexagon::POST_SThri_cdnPt_nv_V4 :
- case Hexagon::POST_SThri_cPt_nv_V4 :
- case Hexagon::POST_SThri_cdnPt_V4 :
- return Hexagon::POST_SThri_cPt;
-
- case Hexagon::POST_SThri_cdnNotPt_nv_V4 :
- case Hexagon::POST_SThri_cNotPt_nv_V4 :
- case Hexagon::POST_SThri_cdnNotPt_V4 :
- return Hexagon::POST_SThri_cNotPt;
-
- case Hexagon::STh_GP_cdnPt_nv_V4:
- case Hexagon::STh_GP_cdnPt_V4:
- case Hexagon::STh_GP_cPt_nv_V4:
- return Hexagon::STh_GP_cPt_V4;
-
- case Hexagon::STh_GP_cdnNotPt_nv_V4:
- case Hexagon::STh_GP_cdnNotPt_V4:
- case Hexagon::STh_GP_cNotPt_nv_V4:
- return Hexagon::STh_GP_cNotPt_V4;
-
- // Store new-value halfword - unconditional
-
- case Hexagon::STrih_nv_V4:
- return Hexagon::STrih;
-
- case Hexagon::STrih_indexed_nv_V4:
- return Hexagon::STrih_indexed;
-
- case Hexagon::STrih_indexed_shl_nv_V4:
- return Hexagon::STrih_indexed_shl_V4;
-
- case Hexagon::STrih_shl_nv_V4:
- return Hexagon::STrih_shl_V4;
-
- case Hexagon::STh_GP_nv_V4:
- return Hexagon::STh_GP_V4;
-
- case Hexagon::POST_SThri_nv_V4:
- return Hexagon::POST_SThri;
-
- // Store word
-
- case Hexagon::STriw_imm_cdnPt_V4 :
- return Hexagon::STriw_imm_cPt_V4;
-
- case Hexagon::STriw_imm_cdnNotPt_V4 :
- return Hexagon::STriw_imm_cNotPt_V4;
-
- case Hexagon::STriw_cdnPt_nv_V4 :
- case Hexagon::STriw_cPt_nv_V4 :
- case Hexagon::STriw_cdnPt_V4 :
- return Hexagon::STriw_cPt;
-
- case Hexagon::STriw_cdnNotPt_nv_V4 :
- case Hexagon::STriw_cNotPt_nv_V4 :
- case Hexagon::STriw_cdnNotPt_V4 :
- return Hexagon::STriw_cNotPt;
-
- case Hexagon::STriw_indexed_cdnPt_nv_V4 :
- case Hexagon::STriw_indexed_cPt_nv_V4 :
- case Hexagon::STriw_indexed_cdnPt_V4 :
- return Hexagon::STriw_indexed_cPt;
-
- case Hexagon::STriw_indexed_cdnNotPt_nv_V4 :
- case Hexagon::STriw_indexed_cNotPt_nv_V4 :
- case Hexagon::STriw_indexed_cdnNotPt_V4 :
- return Hexagon::STriw_indexed_cNotPt;
-
- case Hexagon::STriw_indexed_shl_cdnPt_nv_V4 :
- case Hexagon::STriw_indexed_shl_cPt_nv_V4 :
- case Hexagon::STriw_indexed_shl_cdnPt_V4 :
- return Hexagon::STriw_indexed_shl_cPt_V4;
-
- case Hexagon::STriw_indexed_shl_cdnNotPt_nv_V4 :
- case Hexagon::STriw_indexed_shl_cNotPt_nv_V4 :
- case Hexagon::STriw_indexed_shl_cdnNotPt_V4 :
- return Hexagon::STriw_indexed_shl_cNotPt_V4;
-
- case Hexagon::POST_STwri_cdnPt_nv_V4 :
- case Hexagon::POST_STwri_cPt_nv_V4 :
- case Hexagon::POST_STwri_cdnPt_V4 :
- return Hexagon::POST_STwri_cPt;
-
- case Hexagon::POST_STwri_cdnNotPt_nv_V4 :
- case Hexagon::POST_STwri_cNotPt_nv_V4 :
- case Hexagon::POST_STwri_cdnNotPt_V4 :
- return Hexagon::POST_STwri_cNotPt;
-
- case Hexagon::STw_GP_cdnPt_nv_V4:
- case Hexagon::STw_GP_cdnPt_V4:
- case Hexagon::STw_GP_cPt_nv_V4:
- return Hexagon::STw_GP_cPt_V4;
-
- case Hexagon::STw_GP_cdnNotPt_nv_V4:
- case Hexagon::STw_GP_cdnNotPt_V4:
- case Hexagon::STw_GP_cNotPt_nv_V4:
- return Hexagon::STw_GP_cNotPt_V4;
-
- // Store new-value word - unconditional
-
- case Hexagon::STriw_nv_V4:
- return Hexagon::STriw;
-
- case Hexagon::STriw_indexed_nv_V4:
- return Hexagon::STriw_indexed;
-
- case Hexagon::STriw_indexed_shl_nv_V4:
- return Hexagon::STriw_indexed_shl_V4;
-
- case Hexagon::STriw_shl_nv_V4:
- return Hexagon::STriw_shl_V4;
-
- case Hexagon::STw_GP_nv_V4:
- return Hexagon::STw_GP_V4;
-
- case Hexagon::POST_STwri_nv_V4:
- return Hexagon::POST_STwri;
-
- // Store doubleword
-
- case Hexagon::STrid_cdnPt_V4 :
- return Hexagon::STrid_cPt;
-
- case Hexagon::STrid_cdnNotPt_V4 :
- return Hexagon::STrid_cNotPt;
-
- case Hexagon::STrid_indexed_cdnPt_V4 :
- return Hexagon::STrid_indexed_cPt;
-
- case Hexagon::STrid_indexed_cdnNotPt_V4 :
- return Hexagon::STrid_indexed_cNotPt;
-
- case Hexagon::STrid_indexed_shl_cdnPt_V4 :
- return Hexagon::STrid_indexed_shl_cPt_V4;
-
- case Hexagon::STrid_indexed_shl_cdnNotPt_V4 :
- return Hexagon::STrid_indexed_shl_cNotPt_V4;
-
- case Hexagon::POST_STdri_cdnPt_V4 :
- return Hexagon::POST_STdri_cPt;
-
- case Hexagon::POST_STdri_cdnNotPt_V4 :
- return Hexagon::POST_STdri_cNotPt;
-
- case Hexagon::STd_GP_cdnPt_V4 :
- return Hexagon::STd_GP_cPt_V4;
-
- case Hexagon::STd_GP_cdnNotPt_V4 :
- return Hexagon::STd_GP_cNotPt_V4;
-
- }
-}
-
bool HexagonPacketizerList::DemoteToDotOld(MachineInstr* MI) {
const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
- int NewOpcode = GetDotOldOp(MI->getOpcode());
+ int NewOpcode = QII->GetDotOldOp(MI->getOpcode());
MI->setDesc(QII->get(NewOpcode));
return true;
}
-// Returns true if an instruction is predicated on p0 and false if it's
-// predicated on !p0.
+enum PredicateKind {
+ PK_False,
+ PK_True,
+ PK_Unknown
+};
-static bool GetPredicateSense(MachineInstr* MI,
- const HexagonInstrInfo *QII) {
+/// Returns the predicate sense of an instruction: PK_True if it is predicated
+/// on p0, PK_False if it is predicated on !p0, and PK_Unknown if it is not
+/// predicated at all.
+static PredicateKind getPredicateSense(MachineInstr* MI,
+ const HexagonInstrInfo *QII) {
+ if (!QII->isPredicated(MI))
+ return PK_Unknown;
- switch (MI->getOpcode()) {
- default: llvm_unreachable("Unknown predicate sense of the instruction");
- case Hexagon::TFR_cPt:
- case Hexagon::TFR_cdnPt:
- case Hexagon::TFRI_cPt:
- case Hexagon::TFRI_cdnPt:
- case Hexagon::STrib_cPt :
- case Hexagon::STrib_cdnPt_V4 :
- case Hexagon::STrib_indexed_cPt :
- case Hexagon::STrib_indexed_cdnPt_V4 :
- case Hexagon::STrib_indexed_shl_cPt_V4 :
- case Hexagon::STrib_indexed_shl_cdnPt_V4 :
- case Hexagon::POST_STbri_cPt :
- case Hexagon::POST_STbri_cdnPt_V4 :
- case Hexagon::STrih_cPt :
- case Hexagon::STrih_cdnPt_V4 :
- case Hexagon::STrih_indexed_cPt :
- case Hexagon::STrih_indexed_cdnPt_V4 :
- case Hexagon::STrih_indexed_shl_cPt_V4 :
- case Hexagon::STrih_indexed_shl_cdnPt_V4 :
- case Hexagon::POST_SThri_cPt :
- case Hexagon::POST_SThri_cdnPt_V4 :
- case Hexagon::STriw_cPt :
- case Hexagon::STriw_cdnPt_V4 :
- case Hexagon::STriw_indexed_cPt :
- case Hexagon::STriw_indexed_cdnPt_V4 :
- case Hexagon::STriw_indexed_shl_cPt_V4 :
- case Hexagon::STriw_indexed_shl_cdnPt_V4 :
- case Hexagon::POST_STwri_cPt :
- case Hexagon::POST_STwri_cdnPt_V4 :
- case Hexagon::STrib_imm_cPt_V4 :
- case Hexagon::STrib_imm_cdnPt_V4 :
- case Hexagon::STrid_cPt :
- case Hexagon::STrid_cdnPt_V4 :
- case Hexagon::STrid_indexed_cPt :
- case Hexagon::STrid_indexed_cdnPt_V4 :
- case Hexagon::STrid_indexed_shl_cPt_V4 :
- case Hexagon::STrid_indexed_shl_cdnPt_V4 :
- case Hexagon::POST_STdri_cPt :
- case Hexagon::POST_STdri_cdnPt_V4 :
- case Hexagon::STrih_imm_cPt_V4 :
- case Hexagon::STrih_imm_cdnPt_V4 :
- case Hexagon::STriw_imm_cPt_V4 :
- case Hexagon::STriw_imm_cdnPt_V4 :
- case Hexagon::JMP_tnew_t :
- case Hexagon::LDrid_cPt :
- case Hexagon::LDrid_cdnPt :
- case Hexagon::LDrid_indexed_cPt :
- case Hexagon::LDrid_indexed_cdnPt :
- case Hexagon::POST_LDrid_cPt :
- case Hexagon::POST_LDrid_cdnPt_V4 :
- case Hexagon::LDriw_cPt :
- case Hexagon::LDriw_cdnPt :
- case Hexagon::LDriw_indexed_cPt :
- case Hexagon::LDriw_indexed_cdnPt :
- case Hexagon::POST_LDriw_cPt :
- case Hexagon::POST_LDriw_cdnPt_V4 :
- case Hexagon::LDrih_cPt :
- case Hexagon::LDrih_cdnPt :
- case Hexagon::LDrih_indexed_cPt :
- case Hexagon::LDrih_indexed_cdnPt :
- case Hexagon::POST_LDrih_cPt :
- case Hexagon::POST_LDrih_cdnPt_V4 :
- case Hexagon::LDrib_cPt :
- case Hexagon::LDrib_cdnPt :
- case Hexagon::LDrib_indexed_cPt :
- case Hexagon::LDrib_indexed_cdnPt :
- case Hexagon::POST_LDrib_cPt :
- case Hexagon::POST_LDrib_cdnPt_V4 :
- case Hexagon::LDriuh_cPt :
- case Hexagon::LDriuh_cdnPt :
- case Hexagon::LDriuh_indexed_cPt :
- case Hexagon::LDriuh_indexed_cdnPt :
- case Hexagon::POST_LDriuh_cPt :
- case Hexagon::POST_LDriuh_cdnPt_V4 :
- case Hexagon::LDriub_cPt :
- case Hexagon::LDriub_cdnPt :
- case Hexagon::LDriub_indexed_cPt :
- case Hexagon::LDriub_indexed_cdnPt :
- case Hexagon::POST_LDriub_cPt :
- case Hexagon::POST_LDriub_cdnPt_V4 :
- case Hexagon::LDrid_indexed_shl_cPt_V4 :
- case Hexagon::LDrid_indexed_shl_cdnPt_V4 :
- case Hexagon::LDrib_indexed_shl_cPt_V4 :
- case Hexagon::LDrib_indexed_shl_cdnPt_V4 :
- case Hexagon::LDriub_indexed_shl_cPt_V4 :
- case Hexagon::LDriub_indexed_shl_cdnPt_V4 :
- case Hexagon::LDrih_indexed_shl_cPt_V4 :
- case Hexagon::LDrih_indexed_shl_cdnPt_V4 :
- case Hexagon::LDriuh_indexed_shl_cPt_V4 :
- case Hexagon::LDriuh_indexed_shl_cdnPt_V4 :
- case Hexagon::LDriw_indexed_shl_cPt_V4 :
- case Hexagon::LDriw_indexed_shl_cdnPt_V4 :
- case Hexagon::ADD_ri_cPt :
- case Hexagon::ADD_ri_cdnPt :
- case Hexagon::ADD_rr_cPt :
- case Hexagon::ADD_rr_cdnPt :
- case Hexagon::XOR_rr_cPt :
- case Hexagon::XOR_rr_cdnPt :
- case Hexagon::AND_rr_cPt :
- case Hexagon::AND_rr_cdnPt :
- case Hexagon::OR_rr_cPt :
- case Hexagon::OR_rr_cdnPt :
- case Hexagon::SUB_rr_cPt :
- case Hexagon::SUB_rr_cdnPt :
- case Hexagon::COMBINE_rr_cPt :
- case Hexagon::COMBINE_rr_cdnPt :
- case Hexagon::ASLH_cPt_V4 :
- case Hexagon::ASLH_cdnPt_V4 :
- case Hexagon::ASRH_cPt_V4 :
- case Hexagon::ASRH_cdnPt_V4 :
- case Hexagon::SXTB_cPt_V4 :
- case Hexagon::SXTB_cdnPt_V4 :
- case Hexagon::SXTH_cPt_V4 :
- case Hexagon::SXTH_cdnPt_V4 :
- case Hexagon::ZXTB_cPt_V4 :
- case Hexagon::ZXTB_cdnPt_V4 :
- case Hexagon::ZXTH_cPt_V4 :
- case Hexagon::ZXTH_cdnPt_V4 :
- case Hexagon::LDd_GP_cPt_V4 :
- case Hexagon::LDb_GP_cPt_V4 :
- case Hexagon::LDub_GP_cPt_V4 :
- case Hexagon::LDh_GP_cPt_V4 :
- case Hexagon::LDuh_GP_cPt_V4 :
- case Hexagon::LDw_GP_cPt_V4 :
- case Hexagon::STd_GP_cPt_V4 :
- case Hexagon::STb_GP_cPt_V4 :
- case Hexagon::STh_GP_cPt_V4 :
- case Hexagon::STw_GP_cPt_V4 :
- case Hexagon::LDd_GP_cdnPt_V4 :
- case Hexagon::LDb_GP_cdnPt_V4 :
- case Hexagon::LDub_GP_cdnPt_V4 :
- case Hexagon::LDh_GP_cdnPt_V4 :
- case Hexagon::LDuh_GP_cdnPt_V4 :
- case Hexagon::LDw_GP_cdnPt_V4 :
- case Hexagon::STd_GP_cdnPt_V4 :
- case Hexagon::STb_GP_cdnPt_V4 :
- case Hexagon::STh_GP_cdnPt_V4 :
- case Hexagon::STw_GP_cdnPt_V4 :
- return true;
+ if (QII->isPredicatedTrue(MI))
+ return PK_True;
- case Hexagon::TFR_cNotPt:
- case Hexagon::TFR_cdnNotPt:
- case Hexagon::TFRI_cNotPt:
- case Hexagon::TFRI_cdnNotPt:
- case Hexagon::STrib_cNotPt :
- case Hexagon::STrib_cdnNotPt_V4 :
- case Hexagon::STrib_indexed_cNotPt :
- case Hexagon::STrib_indexed_cdnNotPt_V4 :
- case Hexagon::STrib_indexed_shl_cNotPt_V4 :
- case Hexagon::STrib_indexed_shl_cdnNotPt_V4 :
- case Hexagon::POST_STbri_cNotPt :
- case Hexagon::POST_STbri_cdnNotPt_V4 :
- case Hexagon::STrih_cNotPt :
- case Hexagon::STrih_cdnNotPt_V4 :
- case Hexagon::STrih_indexed_cNotPt :
- case Hexagon::STrih_indexed_cdnNotPt_V4 :
- case Hexagon::STrih_indexed_shl_cNotPt_V4 :
- case Hexagon::STrih_indexed_shl_cdnNotPt_V4 :
- case Hexagon::POST_SThri_cNotPt :
- case Hexagon::POST_SThri_cdnNotPt_V4 :
- case Hexagon::STriw_cNotPt :
- case Hexagon::STriw_cdnNotPt_V4 :
- case Hexagon::STriw_indexed_cNotPt :
- case Hexagon::STriw_indexed_cdnNotPt_V4 :
- case Hexagon::STriw_indexed_shl_cNotPt_V4 :
- case Hexagon::STriw_indexed_shl_cdnNotPt_V4 :
- case Hexagon::POST_STwri_cNotPt :
- case Hexagon::POST_STwri_cdnNotPt_V4 :
- case Hexagon::STrib_imm_cNotPt_V4 :
- case Hexagon::STrib_imm_cdnNotPt_V4 :
- case Hexagon::STrid_cNotPt :
- case Hexagon::STrid_cdnNotPt_V4 :
- case Hexagon::STrid_indexed_cdnNotPt_V4 :
- case Hexagon::STrid_indexed_cNotPt :
- case Hexagon::STrid_indexed_shl_cNotPt_V4 :
- case Hexagon::STrid_indexed_shl_cdnNotPt_V4 :
- case Hexagon::POST_STdri_cNotPt :
- case Hexagon::POST_STdri_cdnNotPt_V4 :
- case Hexagon::STrih_imm_cNotPt_V4 :
- case Hexagon::STrih_imm_cdnNotPt_V4 :
- case Hexagon::STriw_imm_cNotPt_V4 :
- case Hexagon::STriw_imm_cdnNotPt_V4 :
- case Hexagon::JMP_fnew_t :
- case Hexagon::LDrid_cNotPt :
- case Hexagon::LDrid_cdnNotPt :
- case Hexagon::LDrid_indexed_cNotPt :
- case Hexagon::LDrid_indexed_cdnNotPt :
- case Hexagon::POST_LDrid_cNotPt :
- case Hexagon::POST_LDrid_cdnNotPt_V4 :
- case Hexagon::LDriw_cNotPt :
- case Hexagon::LDriw_cdnNotPt :
- case Hexagon::LDriw_indexed_cNotPt :
- case Hexagon::LDriw_indexed_cdnNotPt :
- case Hexagon::POST_LDriw_cNotPt :
- case Hexagon::POST_LDriw_cdnNotPt_V4 :
- case Hexagon::LDrih_cNotPt :
- case Hexagon::LDrih_cdnNotPt :
- case Hexagon::LDrih_indexed_cNotPt :
- case Hexagon::LDrih_indexed_cdnNotPt :
- case Hexagon::POST_LDrih_cNotPt :
- case Hexagon::POST_LDrih_cdnNotPt_V4 :
- case Hexagon::LDrib_cNotPt :
- case Hexagon::LDrib_cdnNotPt :
- case Hexagon::LDrib_indexed_cNotPt :
- case Hexagon::LDrib_indexed_cdnNotPt :
- case Hexagon::POST_LDrib_cNotPt :
- case Hexagon::POST_LDrib_cdnNotPt_V4 :
- case Hexagon::LDriuh_cNotPt :
- case Hexagon::LDriuh_cdnNotPt :
- case Hexagon::LDriuh_indexed_cNotPt :
- case Hexagon::LDriuh_indexed_cdnNotPt :
- case Hexagon::POST_LDriuh_cNotPt :
- case Hexagon::POST_LDriuh_cdnNotPt_V4 :
- case Hexagon::LDriub_cNotPt :
- case Hexagon::LDriub_cdnNotPt :
- case Hexagon::LDriub_indexed_cNotPt :
- case Hexagon::LDriub_indexed_cdnNotPt :
- case Hexagon::POST_LDriub_cNotPt :
- case Hexagon::POST_LDriub_cdnNotPt_V4 :
- case Hexagon::LDrid_indexed_shl_cNotPt_V4 :
- case Hexagon::LDrid_indexed_shl_cdnNotPt_V4 :
- case Hexagon::LDrib_indexed_shl_cNotPt_V4 :
- case Hexagon::LDrib_indexed_shl_cdnNotPt_V4 :
- case Hexagon::LDriub_indexed_shl_cNotPt_V4 :
- case Hexagon::LDriub_indexed_shl_cdnNotPt_V4 :
- case Hexagon::LDrih_indexed_shl_cNotPt_V4 :
- case Hexagon::LDrih_indexed_shl_cdnNotPt_V4 :
- case Hexagon::LDriuh_indexed_shl_cNotPt_V4 :
- case Hexagon::LDriuh_indexed_shl_cdnNotPt_V4 :
- case Hexagon::LDriw_indexed_shl_cNotPt_V4 :
- case Hexagon::LDriw_indexed_shl_cdnNotPt_V4 :
- case Hexagon::ADD_ri_cNotPt :
- case Hexagon::ADD_ri_cdnNotPt :
- case Hexagon::ADD_rr_cNotPt :
- case Hexagon::ADD_rr_cdnNotPt :
- case Hexagon::XOR_rr_cNotPt :
- case Hexagon::XOR_rr_cdnNotPt :
- case Hexagon::AND_rr_cNotPt :
- case Hexagon::AND_rr_cdnNotPt :
- case Hexagon::OR_rr_cNotPt :
- case Hexagon::OR_rr_cdnNotPt :
- case Hexagon::SUB_rr_cNotPt :
- case Hexagon::SUB_rr_cdnNotPt :
- case Hexagon::COMBINE_rr_cNotPt :
- case Hexagon::COMBINE_rr_cdnNotPt :
- case Hexagon::ASLH_cNotPt_V4 :
- case Hexagon::ASLH_cdnNotPt_V4 :
- case Hexagon::ASRH_cNotPt_V4 :
- case Hexagon::ASRH_cdnNotPt_V4 :
- case Hexagon::SXTB_cNotPt_V4 :
- case Hexagon::SXTB_cdnNotPt_V4 :
- case Hexagon::SXTH_cNotPt_V4 :
- case Hexagon::SXTH_cdnNotPt_V4 :
- case Hexagon::ZXTB_cNotPt_V4 :
- case Hexagon::ZXTB_cdnNotPt_V4 :
- case Hexagon::ZXTH_cNotPt_V4 :
- case Hexagon::ZXTH_cdnNotPt_V4 :
-
- case Hexagon::LDd_GP_cNotPt_V4 :
- case Hexagon::LDb_GP_cNotPt_V4 :
- case Hexagon::LDub_GP_cNotPt_V4 :
- case Hexagon::LDh_GP_cNotPt_V4 :
- case Hexagon::LDuh_GP_cNotPt_V4 :
- case Hexagon::LDw_GP_cNotPt_V4 :
- case Hexagon::STd_GP_cNotPt_V4 :
- case Hexagon::STb_GP_cNotPt_V4 :
- case Hexagon::STh_GP_cNotPt_V4 :
- case Hexagon::STw_GP_cNotPt_V4 :
- case Hexagon::LDd_GP_cdnNotPt_V4 :
- case Hexagon::LDb_GP_cdnNotPt_V4 :
- case Hexagon::LDub_GP_cdnNotPt_V4 :
- case Hexagon::LDh_GP_cdnNotPt_V4 :
- case Hexagon::LDuh_GP_cdnNotPt_V4 :
- case Hexagon::LDw_GP_cdnNotPt_V4 :
- case Hexagon::STd_GP_cdnNotPt_V4 :
- case Hexagon::STb_GP_cdnNotPt_V4 :
- case Hexagon::STh_GP_cdnNotPt_V4 :
- case Hexagon::STw_GP_cdnNotPt_V4 :
- return false;
- }
- // return *some value* to avoid compiler warning
- return false;
+ return PK_False;
}
static MachineOperand& GetPostIncrementOperand(MachineInstr *MI,
@@ -2254,10 +537,10 @@ static MachineOperand& GetStoreValueOperand(MachineInstr *MI) {
// Arch Spec: 3.4.4.2
bool HexagonPacketizerList::CanPromoteToNewValueStore( MachineInstr *MI,
MachineInstr *PacketMI, unsigned DepReg,
- std::map <MachineInstr*, SUnit*> MIToSUnit)
-{
- // Make sure we are looking at the store
- if (!IsNewifyStore(MI))
+ std::map <MachineInstr*, SUnit*> MIToSUnit) {
+ const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  // Make sure we are looking at a store that can be promoted.
+ if (!QII->mayBeNewStore(MI))
return false;
// Make sure there is dependency and can be new value'ed
@@ -2265,12 +548,11 @@ bool HexagonPacketizerList::CanPromoteToNewValueStore( MachineInstr *MI,
GetStoreValueOperand(MI).getReg() != DepReg)
return false;
- const HexagonRegisterInfo* QRI =
+ const HexagonRegisterInfo* QRI =
(const HexagonRegisterInfo *) TM.getRegisterInfo();
const MCInstrDesc& MCID = PacketMI->getDesc();
// first operand is always the result
- const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
const TargetRegisterClass* PacketRC = QII->getRegClass(MCID, 0, QRI, MF);
// if there is already an store in the packet, no can do new value store
@@ -2313,7 +595,7 @@ bool HexagonPacketizerList::CanPromoteToNewValueStore( MachineInstr *MI,
}
// If the source that feeds the store is predicated, new value store must
- // also be also predicated.
+ // also be predicated.
if (QII->isPredicated(PacketMI)) {
if (!QII->isPredicated(MI))
return false;
@@ -2359,7 +641,7 @@ bool HexagonPacketizerList::CanPromoteToNewValueStore( MachineInstr *MI,
if (( predRegNumDst != predRegNumSrc) ||
QII->isDotNewInst(PacketMI) != QII->isDotNewInst(MI) ||
- GetPredicateSense(MI, QII) != GetPredicateSense(PacketMI, QII)) {
+ getPredicateSense(MI, QII) != getPredicateSense(PacketMI, QII)) {
return false;
}
}
@@ -2440,10 +722,11 @@ bool HexagonPacketizerList::CanPromoteToNewValue( MachineInstr *MI,
MachineBasicBlock::iterator &MII)
{
+ const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
const HexagonRegisterInfo* QRI =
(const HexagonRegisterInfo *) TM.getRegisterInfo();
if (!QRI->Subtarget.hasV4TOps() ||
- !IsNewifyStore(MI))
+ !QII->mayBeNewStore(MI))
return false;
MachineInstr *PacketMI = PacketSU->getInstr();
@@ -2470,7 +753,7 @@ bool HexagonPacketizerList::CanPromoteToDotNew( MachineInstr *MI,
{
const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
// Already a dot new instruction.
- if (QII->isDotNewInst(MI) && !IsNewifyStore(MI))
+ if (QII->isDotNewInst(MI) && !QII->mayBeNewStore(MI))
return false;
if (!isNewifiable(MI))
@@ -2480,12 +763,12 @@ bool HexagonPacketizerList::CanPromoteToDotNew( MachineInstr *MI,
if (RC == &Hexagon::PredRegsRegClass && isCondInst(MI))
return true;
else if (RC != &Hexagon::PredRegsRegClass &&
- !IsNewifyStore(MI)) // MI is not a new-value store
+ !QII->mayBeNewStore(MI)) // MI is not a new-value store
return false;
else {
// Create a dot new machine instruction to see if resources can be
// allocated. If not, bail out now.
- int NewOpcode = GetDotNewOp(MI->getOpcode());
+ int NewOpcode = QII->GetDotNewOp(MI);
const MCInstrDesc &desc = QII->get(NewOpcode);
DebugLoc dl;
MachineInstr *NewMI =
@@ -2554,16 +837,39 @@ bool HexagonPacketizerList::RestrictingDepExistInPacket (MachineInstr* MI,
}
+/// Gets the predicate register of a predicated instruction.
+static unsigned getPredicatedRegister(MachineInstr *MI,
+ const HexagonInstrInfo *QII) {
+ /// We use the following rule: The first predicate register that is a use is
+ /// the predicate register of a predicated instruction.
+
+ assert(QII->isPredicated(MI) && "Must be predicated instruction");
+
+ for (MachineInstr::mop_iterator OI = MI->operands_begin(),
+ OE = MI->operands_end(); OI != OE; ++OI) {
+ MachineOperand &Op = *OI;
+ if (Op.isReg() && Op.getReg() && Op.isUse() &&
+ Hexagon::PredRegsRegClass.contains(Op.getReg()))
+ return Op.getReg();
+ }
+
+ llvm_unreachable("Unknown instruction operand layout");
+
+ return 0;
+}
+
// Given two predicated instructions, this function detects whether
// the predicates are complements
bool HexagonPacketizerList::ArePredicatesComplements (MachineInstr* MI1,
MachineInstr* MI2, std::map <MachineInstr*, SUnit*> MIToSUnit) {
const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
- // Currently can only reason about conditional transfers
- if (!QII->isConditionalTransfer(MI1) || !QII->isConditionalTransfer(MI2)) {
+
+ // If we don't know the predicate sense of the instructions, bail out early;
+ // we will need it later.
+ if (getPredicateSense(MI1, QII) == PK_Unknown ||
+ getPredicateSense(MI2, QII) == PK_Unknown)
return false;
- }
// Scheduling unit for candidate
SUnit* SU = MIToSUnit[MI1];
@@ -2602,9 +908,9 @@ bool HexagonPacketizerList::ArePredicatesComplements (MachineInstr* MI1,
// there already exist anti dep on the same pred in
// the packet.
if (PacketSU->Succs[i].getSUnit() == SU &&
+ PacketSU->Succs[i].getKind() == SDep::Data &&
Hexagon::PredRegsRegClass.contains(
PacketSU->Succs[i].getReg()) &&
- PacketSU->Succs[i].getKind() == SDep::Data &&
// Here I know that *VIN is predicate setting instruction
// with true data dep to candidate on the register
// we care about - c) in the above example.
@@ -2625,8 +931,12 @@ bool HexagonPacketizerList::ArePredicatesComplements (MachineInstr* MI1,
// that the predicate sense is different
// We also need to differentiate .old vs. .new:
// !p0 is not complementary to p0.new
- return ((MI1->getOperand(1).getReg() == MI2->getOperand(1).getReg()) &&
- (GetPredicateSense(MI1, QII) != GetPredicateSense(MI2, QII)) &&
+ unsigned PReg1 = getPredicatedRegister(MI1, QII);
+ unsigned PReg2 = getPredicatedRegister(MI2, QII);
+ return ((PReg1 == PReg2) &&
+ Hexagon::PredRegsRegClass.contains(PReg1) &&
+ Hexagon::PredRegsRegClass.contains(PReg2) &&
+ (getPredicateSense(MI1, QII) != getPredicateSense(MI2, QII)) &&
(QII->isDotNewInst(MI1) == QII->isDotNewInst(MI2)));
}
@@ -2724,24 +1034,21 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
}
// A LoopN instruction cannot appear in the same packet as a jump or call.
- if (IsLoopN(I) && ( IsDirectJump(J)
- || MCIDJ.isCall()
- || QII->isDeallocRet(J))) {
+ if (IsLoopN(I) &&
+ (IsDirectJump(J) || MCIDJ.isCall() || QII->isDeallocRet(J))) {
Dependence = true;
return false;
}
- if (IsLoopN(J) && ( IsDirectJump(I)
- || MCIDI.isCall()
- || QII->isDeallocRet(I))) {
+ if (IsLoopN(J) &&
+ (IsDirectJump(I) || MCIDI.isCall() || QII->isDeallocRet(I))) {
Dependence = true;
return false;
}
// dealloc_return cannot appear in the same packet as a conditional or
// unconditional jump.
- if (QII->isDeallocRet(I) && ( MCIDJ.isBranch()
- || MCIDJ.isCall()
- || MCIDJ.isBarrier())) {
+ if (QII->isDeallocRet(I) &&
+ (MCIDJ.isBranch() || MCIDJ.isCall() || MCIDJ.isBarrier())) {
Dependence = true;
return false;
}
@@ -2766,7 +1073,7 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
}
//if dealloc_return
- if (MCIDJ.mayStore() && QII->isDeallocRet(I)){
+ if (MCIDJ.mayStore() && QII->isDeallocRet(I)) {
Dependence = true;
return false;
}
@@ -2774,9 +1081,8 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
// If an instruction feeds new value jump, glue it.
MachineBasicBlock::iterator NextMII = I;
++NextMII;
- MachineInstr *NextMI = NextMII;
-
- if (QII->isNewValueJump(NextMI)) {
+ if (NextMII != I->getParent()->end() && QII->isNewValueJump(NextMII)) {
+ MachineInstr *NextMI = NextMII;
bool secondRegMatch = false;
bool maintainNewValueJump = false;
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
index d4a93b5..e0f5a27 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -65,7 +65,8 @@ namespace HexagonII {
AbsoluteSet = 2, // Absolute set addressing mode
BaseImmOffset = 3, // Indirect with offset
BaseLongOffset = 4, // Indirect with long offset
- BaseRegOffset = 5 // Indirect with register offset
+ BaseRegOffset = 5, // Indirect with register offset
+ PostInc = 6 // Post increment addressing mode
};
enum MemAccessSize {
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
index 3deb8d1..495dbb9 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
@@ -15,7 +15,7 @@
using namespace llvm;
-HexagonMCAsmInfo::HexagonMCAsmInfo(const Target &T, StringRef TT) {
+HexagonMCAsmInfo::HexagonMCAsmInfo(StringRef TT) {
Data16bitsDirective = "\t.half\t";
Data32bitsDirective = "\t.word\t";
Data64bitsDirective = 0; // .xword is only supported by V9.
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
index d336cd5..0b94d21 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
@@ -18,11 +18,9 @@
#include "llvm/MC/MCAsmInfo.h"
namespace llvm {
- class Target;
-
class HexagonMCAsmInfo : public MCAsmInfo {
public:
- explicit HexagonMCAsmInfo(const Target &T, StringRef TT);
+ explicit HexagonMCAsmInfo(StringRef TT);
};
} // namespace llvm
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 6b1d2d1..2f93a52 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -54,13 +54,14 @@ static MCSubtargetInfo *createHexagonMCSubtargetInfo(StringRef TT,
return X;
}
-static MCAsmInfo *createHexagonMCAsmInfo(const Target &T, StringRef TT) {
- MCAsmInfo *MAI = new HexagonMCAsmInfo(T, TT);
+static MCAsmInfo *createHexagonMCAsmInfo(const MCRegisterInfo &MRI,
+ StringRef TT) {
+ MCAsmInfo *MAI = new HexagonMCAsmInfo(TT);
// VirtualFP = (R30 + #0).
- MachineLocation Dst(MachineLocation::VirtualFP);
- MachineLocation Src(Hexagon::R30, 0);
- MAI->addInitialFrameState(0, Dst, Src);
+ MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(
+ 0, Hexagon::R30, 0);
+ MAI->addInitialFrameState(Inst);
return MAI;
}
diff --git a/lib/Target/LLVMBuild.txt b/lib/Target/LLVMBuild.txt
index c06e8bc..1022ae9 100644
--- a/lib/Target/LLVMBuild.txt
+++ b/lib/Target/LLVMBuild.txt
@@ -16,7 +16,7 @@
;===------------------------------------------------------------------------===;
[common]
-subdirectories = AArch64 ARM CppBackend Hexagon MBlaze MSP430 NVPTX Mips PowerPC R600 Sparc X86 XCore
+subdirectories = AArch64 ARM CppBackend Hexagon MBlaze MSP430 NVPTX Mips PowerPC R600 Sparc SystemZ X86 XCore
; This is a special group whose required libraries are extended (by llvm-build)
; with the best execution engine (the native JIT, if available, or the
diff --git a/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp b/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp
index c03ab38..0acfb3e 100644
--- a/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp
+++ b/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp
@@ -501,14 +501,13 @@ MCDisassembler::DecodeStatus MBlazeDisassembler::getInstruction(MCInst &instr,
raw_ostream &cStream) const {
// The machine instruction.
uint32_t insn;
- uint64_t read;
uint8_t bytes[4];
// By default we consume 1 byte on failure
size = 1;
// We want to read exactly 4 bytes of data.
- if (region.readBytes(address, 4, (uint8_t*)bytes, &read) == -1 || read < 4)
+ if (region.readBytes(address, 4, bytes) == -1)
return Fail;
// Encoded as a big-endian 32-bit word in the stream.
diff --git a/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp b/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp
index 3d0d1ce..1d18cc4 100644
--- a/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp
+++ b/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp
@@ -37,13 +37,11 @@ static cl::opt<bool> MBDisableDelaySlotFiller(
namespace {
struct Filler : public MachineFunctionPass {
-
TargetMachine &TM;
- const TargetInstrInfo *TII;
static char ID;
Filler(TargetMachine &tm)
- : MachineFunctionPass(ID), TM(tm), TII(tm.getInstrInfo()) { }
+ : MachineFunctionPass(ID), TM(tm) { }
virtual const char *getPassName() const {
return "MBlaze Delay Slot Filler";
@@ -239,7 +237,7 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
Changed = true;
if (D == MBB.end())
- BuildMI(MBB, ++J, I->getDebugLoc(), TII->get(MBlaze::NOP));
+ BuildMI(MBB, ++J, I->getDebugLoc(),TM.getInstrInfo()->get(MBlaze::NOP));
else
MBB.splice(++J, &MBB, D);
}
diff --git a/lib/Target/MBlaze/MBlazeISelDAGToDAG.cpp b/lib/Target/MBlaze/MBlazeISelDAGToDAG.cpp
index 34e33fd..9d6dfe6 100644
--- a/lib/Target/MBlaze/MBlazeISelDAGToDAG.cpp
+++ b/lib/Target/MBlaze/MBlazeISelDAGToDAG.cpp
@@ -181,14 +181,14 @@ SelectAddrRegImm(SDValue N, SDValue &Base, SDValue &Disp) {
/// GOT address into a register.
SDNode *MBlazeDAGToDAGISel::getGlobalBaseReg() {
unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
- return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode();
+ return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy()).getNode();
}
/// Select instructions not customized! Used for
/// expanded, promoted and normal instructions
SDNode* MBlazeDAGToDAGISel::Select(SDNode *Node) {
unsigned Opcode = Node->getOpcode();
- DebugLoc dl = Node->getDebugLoc();
+ SDLoc dl(Node);
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode())
diff --git a/lib/Target/MBlaze/MBlazeISelLowering.cpp b/lib/Target/MBlaze/MBlazeISelLowering.cpp
index d4f9432..e07ceec 100644
--- a/lib/Target/MBlaze/MBlazeISelLowering.cpp
+++ b/lib/Target/MBlaze/MBlazeISelLowering.cpp
@@ -192,7 +192,7 @@ MBlazeTargetLowering::MBlazeTargetLowering(MBlazeTargetMachine &TM)
computeRegisterProperties();
}
-EVT MBlazeTargetLowering::getSetCCResultType(EVT VT) const {
+EVT MBlazeTargetLowering::getSetCCResultType(LLVMContext &, EVT) const {
return MVT::i32;
}
@@ -575,7 +575,7 @@ SDValue MBlazeTargetLowering::LowerSELECT_CC(SDValue Op,
SDValue RHS = Op.getOperand(1);
SDValue TrueVal = Op.getOperand(2);
SDValue FalseVal = Op.getOperand(3);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
unsigned Opc;
SDValue CompareFlag;
@@ -594,7 +594,7 @@ SDValue MBlazeTargetLowering::LowerSELECT_CC(SDValue Op,
SDValue MBlazeTargetLowering::
LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
// FIXME there isn't actually debug info here
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32);
@@ -611,7 +611,7 @@ LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
SDValue ResNode;
SDValue HiPart;
// FIXME there isn't actually debug info here
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
EVT PtrVT = Op.getValueType();
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
@@ -625,7 +625,7 @@ LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
SDValue ResNode;
ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
const Constant *C = N->getConstVal();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue CP = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(),
N->getOffset(), 0);
@@ -637,7 +637,7 @@ SDValue MBlazeTargetLowering::LowerVASTART(SDValue Op,
MachineFunction &MF = DAG.getMachineFunction();
MBlazeFunctionInfo *FuncInfo = MF.getInfo<MBlazeFunctionInfo>();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue FI = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
getPointerTy());
@@ -686,7 +686,7 @@ SDValue MBlazeTargetLowering::
LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
- DebugLoc &dl = CLI.DL;
+ SDLoc dl = CLI.DL;
SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
@@ -719,7 +719,8 @@ LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Variable argument function calls require a minimum of 24-bytes of stack
if (isVarArg && NumBytes < 24) NumBytes = 24;
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
+ dl);
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
@@ -829,7 +830,7 @@ LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Create the CALLSEQ_END node.
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
- DAG.getIntPtrConstant(0, true), InFlag);
+ DAG.getIntPtrConstant(0, true), InFlag, dl);
if (!Ins.empty())
InFlag = Chain.getValue(1);
@@ -844,7 +845,7 @@ LowerCall(TargetLowering::CallLoweringInfo &CLI,
SDValue MBlazeTargetLowering::
LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv,
bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
@@ -874,7 +875,7 @@ LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv,
SDValue MBlazeTargetLowering::
LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -1017,7 +1018,7 @@ SDValue MBlazeTargetLowering::
LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc dl, SelectionDAG &DAG) const {
+ SDLoc dl, SelectionDAG &DAG) const {
// CCValAssign - represent the assignment of
// the return value to a location
SmallVector<CCValAssign, 16> RVLocs;
diff --git a/lib/Target/MBlaze/MBlazeISelLowering.h b/lib/Target/MBlaze/MBlazeISelLowering.h
index f6b4095..f874113 100644
--- a/lib/Target/MBlaze/MBlazeISelLowering.h
+++ b/lib/Target/MBlaze/MBlazeISelLowering.h
@@ -102,7 +102,7 @@ namespace llvm {
virtual const char *getTargetNodeName(unsigned Opcode) const;
/// getSetCCResultType - get the ISD::SETCC result ValueType
- EVT getSetCCResultType(EVT VT) const;
+ EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
private:
// Subtarget Info
@@ -113,7 +113,7 @@ namespace llvm {
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
// Lower Operand specifics
@@ -128,7 +128,7 @@ namespace llvm {
LowerFormalArguments(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
virtual SDValue
@@ -140,7 +140,7 @@ namespace llvm {
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc dl, SelectionDAG &DAG) const;
+ SDLoc dl, SelectionDAG &DAG) const;
virtual MachineBasicBlock*
EmitCustomShift(MachineInstr *MI, MachineBasicBlock *MBB) const;
diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.cpp b/lib/Target/MBlaze/MBlazeInstrInfo.cpp
index 79449f7..ab069e6 100644
--- a/lib/Target/MBlaze/MBlazeInstrInfo.cpp
+++ b/lib/Target/MBlaze/MBlazeInstrInfo.cpp
@@ -29,7 +29,7 @@ using namespace llvm;
MBlazeInstrInfo::MBlazeInstrInfo(MBlazeTargetMachine &tm)
: MBlazeGenInstrInfo(MBlaze::ADJCALLSTACKDOWN, MBlaze::ADJCALLSTACKUP),
- TM(tm), RI(*TM.getSubtargetImpl(), *this) {}
+ TM(tm), RI(*TM.getSubtargetImpl()) {}
static bool isZeroImm(const MachineOperand &op) {
return op.isImm() && op.getImm() == 0;
diff --git a/lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp b/lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp
index 8d262a0..0d3f7d8 100644
--- a/lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp
+++ b/lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp
@@ -58,9 +58,8 @@ std::string MBlazeIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
unsigned MBlazeIntrinsicInfo::
lookupName(const char *Name, unsigned Len) const {
- if (Len < 5 || Name[4] != '.' || Name[0] != 'l' || Name[1] != 'l'
- || Name[2] != 'v' || Name[3] != 'm')
- return 0; // All intrinsics start with 'llvm.'
+ if (!StringRef(Name, Len).startswith("llvm."))
+ return 0; // All intrinsics start with 'llvm.'
#define GET_FUNCTION_RECOGNIZER
#include "MBlazeGenIntrinsics.inc"
diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
index bd83afc..72fb8c6 100644
--- a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
+++ b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
@@ -42,8 +42,8 @@
using namespace llvm;
MBlazeRegisterInfo::
-MBlazeRegisterInfo(const MBlazeSubtarget &ST, const TargetInstrInfo &tii)
- : MBlazeGenRegisterInfo(MBlaze::R15), Subtarget(ST), TII(tii) {}
+MBlazeRegisterInfo(const MBlazeSubtarget &ST)
+ : MBlazeGenRegisterInfo(MBlaze::R15), Subtarget(ST) {}
unsigned MBlazeRegisterInfo::getPICCallReg() {
return MBlaze::R20;
diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.h b/lib/Target/MBlaze/MBlazeRegisterInfo.h
index 497f386..b463478 100644
--- a/lib/Target/MBlaze/MBlazeRegisterInfo.h
+++ b/lib/Target/MBlaze/MBlazeRegisterInfo.h
@@ -37,10 +37,8 @@ namespace MBlaze {
struct MBlazeRegisterInfo : public MBlazeGenRegisterInfo {
const MBlazeSubtarget &Subtarget;
- const TargetInstrInfo &TII;
- MBlazeRegisterInfo(const MBlazeSubtarget &Subtarget,
- const TargetInstrInfo &tii);
+ MBlazeRegisterInfo(const MBlazeSubtarget &Subtarget);
/// Get PIC indirect call register
static unsigned getPICCallReg();
diff --git a/lib/Target/MBlaze/MBlazeTargetMachine.cpp b/lib/Target/MBlaze/MBlazeTargetMachine.cpp
index bcdd32f..c758955 100644
--- a/lib/Target/MBlaze/MBlazeTargetMachine.cpp
+++ b/lib/Target/MBlaze/MBlazeTargetMachine.cpp
@@ -43,6 +43,7 @@ MBlazeTargetMachine(const Target &T, StringRef TT,
FrameLowering(Subtarget),
TLInfo(*this), TSInfo(*this),
InstrItins(Subtarget.getInstrItineraryData()) {
+ initAsmInfo();
}
namespace {
diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp
index 380750d..5bc0668 100644
--- a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp
+++ b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp
@@ -53,7 +53,7 @@ static MCSubtargetInfo *createMBlazeMCSubtargetInfo(StringRef TT, StringRef CPU,
return X;
}
-static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) {
+static MCAsmInfo *createMCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) {
Triple TheTriple(TT);
switch (TheTriple.getOS()) {
default:
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
index 3c95760..d213a45 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
@@ -17,7 +17,7 @@ using namespace llvm;
void MSP430MCAsmInfo::anchor() { }
-MSP430MCAsmInfo::MSP430MCAsmInfo(const Target &T, StringRef TT) {
+MSP430MCAsmInfo::MSP430MCAsmInfo(StringRef TT) {
PointerSize = CalleeSaveStackSlotSize = 2;
PrivateGlobalPrefix = ".L";
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
index e5c2fc2..feb040d 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
@@ -18,12 +18,11 @@
namespace llvm {
class StringRef;
- class Target;
class MSP430MCAsmInfo : public MCAsmInfo {
virtual void anchor();
public:
- explicit MSP430MCAsmInfo(const Target &T, StringRef TT);
+ explicit MSP430MCAsmInfo(StringRef TT);
};
} // namespace llvm
diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
index 1566c09..76bc1e7 100644
--- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
+++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -259,11 +259,11 @@ bool MSP430DAGToDAGISel::SelectAddr(SDValue N,
}
Base = (AM.BaseType == MSP430ISelAddressMode::FrameIndexBase) ?
- CurDAG->getTargetFrameIndex(AM.Base.FrameIndex, TLI.getPointerTy()) :
+ CurDAG->getTargetFrameIndex(AM.Base.FrameIndex, TLI->getPointerTy()) :
AM.Base.Reg;
if (AM.GV)
- Disp = CurDAG->getTargetGlobalAddress(AM.GV, N->getDebugLoc(),
+ Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(N),
MVT::i16, AM.Disp,
0/*AM.SymbolFlags*/);
else if (AM.CP)
@@ -345,7 +345,7 @@ SDNode *MSP430DAGToDAGISel::SelectIndexedLoad(SDNode *N) {
return NULL;
}
- return CurDAG->getMachineNode(Opcode, N->getDebugLoc(),
+ return CurDAG->getMachineNode(Opcode, SDLoc(N),
VT, MVT::i16, MVT::Other,
LD->getBasePtr(), LD->getChain());
}
@@ -382,7 +382,7 @@ SDNode *MSP430DAGToDAGISel::SelectIndexedBinOp(SDNode *Op,
SDNode *MSP430DAGToDAGISel::Select(SDNode *Node) {
- DebugLoc dl = Node->getDebugLoc();
+ SDLoc dl(Node);
// Dump information about the Node being selected
DEBUG(errs() << "Selecting: ");
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index 09cdf32..a4818b2 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -254,7 +254,7 @@ MSP430TargetLowering::LowerFormalArguments(SDValue Chain,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg>
&Ins,
- DebugLoc dl,
+ SDLoc dl,
SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals)
const {
@@ -276,7 +276,7 @@ SDValue
MSP430TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
- DebugLoc &dl = CLI.DL;
+ SDLoc &dl = CLI.DL;
SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
@@ -310,7 +310,7 @@ MSP430TargetLowering::LowerCCCArguments(SDValue Chain,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg>
&Ins,
- DebugLoc dl,
+ SDLoc dl,
SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals)
const {
@@ -407,7 +407,7 @@ MSP430TargetLowering::LowerReturn(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc dl, SelectionDAG &DAG) const {
+ SDLoc dl, SelectionDAG &DAG) const {
// CCValAssign - represent the assignment of the return value to a location
SmallVector<CCValAssign, 16> RVLocs;
@@ -463,7 +463,7 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
&Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
@@ -476,7 +476,8 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
unsigned NumBytes = CCInfo.getNextStackOffset();
Chain = DAG.getCALLSEQ_START(Chain ,DAG.getConstant(NumBytes,
- getPointerTy(), true));
+ getPointerTy(), true),
+ dl);
SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
SmallVector<SDValue, 12> MemOpChains;
@@ -583,7 +584,7 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
Chain = DAG.getCALLSEQ_END(Chain,
DAG.getConstant(NumBytes, getPointerTy(), true),
DAG.getConstant(0, getPointerTy(), true),
- InFlag);
+ InFlag, dl);
InFlag = Chain.getValue(1);
// Handle result values, copying them out of physregs into vregs that we
@@ -599,7 +600,7 @@ SDValue
MSP430TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
// Assign locations to each value returned by this call.
@@ -625,7 +626,7 @@ SDValue MSP430TargetLowering::LowerShifts(SDValue Op,
unsigned Opc = Op.getOpcode();
SDNode* N = Op.getNode();
EVT VT = Op.getValueType();
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
// Expand non-constant shifts to loops:
if (!isa<ConstantSDNode>(N->getOperand(1)))
@@ -669,15 +670,15 @@ SDValue MSP430TargetLowering::LowerGlobalAddress(SDValue Op,
int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
// Create the TargetGlobalAddress node, folding in the constant offset.
- SDValue Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
+ SDValue Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
getPointerTy(), Offset);
- return DAG.getNode(MSP430ISD::Wrapper, Op.getDebugLoc(),
+ return DAG.getNode(MSP430ISD::Wrapper, SDLoc(Op),
getPointerTy(), Result);
}
SDValue MSP430TargetLowering::LowerExternalSymbol(SDValue Op,
SelectionDAG &DAG) const {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy());
@@ -686,7 +687,7 @@ SDValue MSP430TargetLowering::LowerExternalSymbol(SDValue Op,
SDValue MSP430TargetLowering::LowerBlockAddress(SDValue Op,
SelectionDAG &DAG) const {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy());
@@ -695,7 +696,7 @@ SDValue MSP430TargetLowering::LowerBlockAddress(SDValue Op,
static SDValue EmitCMP(SDValue &LHS, SDValue &RHS, SDValue &TargetCC,
ISD::CondCode CC,
- DebugLoc dl, SelectionDAG &DAG) {
+ SDLoc dl, SelectionDAG &DAG) {
// FIXME: Handle bittests someday
assert(!LHS.getValueType().isFloatingPoint() && "We don't handle FP yet");
@@ -782,7 +783,7 @@ SDValue MSP430TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue LHS = Op.getOperand(2);
SDValue RHS = Op.getOperand(3);
SDValue Dest = Op.getOperand(4);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl (Op);
SDValue TargetCC;
SDValue Flag = EmitCMP(LHS, RHS, TargetCC, CC, dl, DAG);
@@ -794,7 +795,7 @@ SDValue MSP430TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue MSP430TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl (Op);
// If we are doing an AND and testing against zero, then the CMP
// will not be generated. The AND (or BIT) will generate the condition codes,
@@ -878,7 +879,7 @@ SDValue MSP430TargetLowering::LowerSELECT_CC(SDValue Op,
SDValue TrueV = Op.getOperand(2);
SDValue FalseV = Op.getOperand(3);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl (Op);
SDValue TargetCC;
SDValue Flag = EmitCMP(LHS, RHS, TargetCC, CC, dl, DAG);
@@ -897,7 +898,7 @@ SDValue MSP430TargetLowering::LowerSIGN_EXTEND(SDValue Op,
SelectionDAG &DAG) const {
SDValue Val = Op.getOperand(0);
EVT VT = Op.getValueType();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
assert(VT == MVT::i16 && "Only support i16 for now!");
@@ -929,7 +930,7 @@ SDValue MSP430TargetLowering::LowerRETURNADDR(SDValue Op,
MFI->setReturnAddressIsTaken(true);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
if (Depth > 0) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
@@ -953,7 +954,7 @@ SDValue MSP430TargetLowering::LowerFRAMEADDR(SDValue Op,
MFI->setFrameAddressIsTaken(true);
EVT VT = Op.getValueType();
- DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful
+ SDLoc dl(Op); // FIXME probably not meaningful
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
MSP430::FPW, VT);
@@ -975,7 +976,7 @@ SDValue MSP430TargetLowering::LowerVASTART(SDValue Op,
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
// Create a store of the frame index to the location operand
- return DAG.getStore(Op.getOperand(0), Op.getDebugLoc(), FrameIndex,
+ return DAG.getStore(Op.getOperand(0), SDLoc(Op), FrameIndex,
Op.getOperand(1), MachinePointerInfo(SV),
false, false, 0);
}
diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h
index e0ed870..9570ef2 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.h
+++ b/lib/Target/MSP430/MSP430ISelLowering.h
@@ -130,28 +130,28 @@ namespace llvm {
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
SDValue LowerCCCArguments(SDValue Chain,
CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl,
+ SDLoc dl,
SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
virtual SDValue
LowerFormalArguments(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
virtual SDValue
LowerCall(TargetLowering::CallLoweringInfo &CLI,
@@ -162,7 +162,7 @@ namespace llvm {
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc dl, SelectionDAG &DAG) const;
+ SDLoc dl, SelectionDAG &DAG) const;
virtual bool getPostIndexedAddressParts(SDNode *N, SDNode *Op,
SDValue &Base,
diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp
index a6b5f2f..c850594 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.cpp
+++ b/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -29,7 +29,7 @@ using namespace llvm;
MSP430InstrInfo::MSP430InstrInfo(MSP430TargetMachine &tm)
: MSP430GenInstrInfo(MSP430::ADJCALLSTACKDOWN, MSP430::ADJCALLSTACKUP),
- RI(tm, *this) {}
+ RI(tm) {}
void MSP430InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp
index 0b3e9e2..1a5e312 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.cpp
+++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp
@@ -32,9 +32,8 @@
using namespace llvm;
// FIXME: Provide proper call frame setup / destroy opcodes.
-MSP430RegisterInfo::MSP430RegisterInfo(MSP430TargetMachine &tm,
- const TargetInstrInfo &tii)
- : MSP430GenRegisterInfo(MSP430::PCW), TM(tm), TII(tii) {
+MSP430RegisterInfo::MSP430RegisterInfo(MSP430TargetMachine &tm)
+ : MSP430GenRegisterInfo(MSP430::PCW), TM(tm) {
StackAlign = TM.getFrameLowering()->getStackAlignment();
}
@@ -132,6 +131,7 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// This is actually "load effective address" of the stack slot
// instruction. We have only two-address instructions, thus we need to
// expand it into mov + add
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
MI.setDesc(TII.get(MSP430::MOV16rr));
MI.getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.h b/lib/Target/MSP430/MSP430RegisterInfo.h
index 69cccb2..78047cc 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.h
+++ b/lib/Target/MSP430/MSP430RegisterInfo.h
@@ -27,13 +27,12 @@ class MSP430TargetMachine;
struct MSP430RegisterInfo : public MSP430GenRegisterInfo {
private:
MSP430TargetMachine &TM;
- const TargetInstrInfo &TII;
/// StackAlign - Default stack alignment.
///
unsigned StackAlign;
public:
- MSP430RegisterInfo(MSP430TargetMachine &tm, const TargetInstrInfo &tii);
+ MSP430RegisterInfo(MSP430TargetMachine &tm);
/// Code Generation virtual methods...
const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.td b/lib/Target/MSP430/MSP430RegisterInfo.td
index 07619d0..4010781 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.td
+++ b/lib/Target/MSP430/MSP430RegisterInfo.td
@@ -43,7 +43,7 @@ def R13B : MSP430Reg<13, "r13">;
def R14B : MSP430Reg<14, "r14">;
def R15B : MSP430Reg<15, "r15">;
-def subreg_8bit : SubRegIndex { let Namespace = "MSP430"; }
+def subreg_8bit : SubRegIndex<8> { let Namespace = "MSP430"; }
let SubRegIndices = [subreg_8bit] in {
def PCW : MSP430RegWithSubregs<0, "r0", [PCB]>;
diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp
index 164e351..6710a09 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.cpp
+++ b/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -36,7 +36,9 @@ MSP430TargetMachine::MSP430TargetMachine(const Target &T,
// FIXME: Check DataLayout string.
DL("e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16"),
InstrInfo(*this), TLInfo(*this), TSInfo(*this),
- FrameLowering(Subtarget) { }
+ FrameLowering(Subtarget) {
+ initAsmInfo();
+}
namespace {
/// MSP430 Code Generator Pass Configuration Options.
diff --git a/lib/Target/Mangler.cpp b/lib/Target/Mangler.cpp
index d31efa8..2269b73 100644
--- a/lib/Target/Mangler.cpp
+++ b/lib/Target/Mangler.cpp
@@ -19,6 +19,7 @@
#include "llvm/IR/Function.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -226,7 +227,7 @@ void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
// "Pure" variadic functions do not receive @0 suffix.
(!FT->isVarArg() || FT->getNumParams() == 0 ||
(FT->getNumParams() == 1 && F->hasStructRetAttr())))
- AddFastCallStdCallSuffix(OutName, F, TD);
+ AddFastCallStdCallSuffix(OutName, F, *TM->getDataLayout());
}
}
}
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 0795cb9..d1d69d8 100644
--- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -20,6 +20,7 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/ADT/APInt.h"
using namespace llvm;
@@ -1290,8 +1291,16 @@ bool MipsAsmParser::searchSymbolAlias(
const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr);
const StringRef DefSymbol = Ref->getSymbol().getName();
if (DefSymbol.startswith("$")) {
- // Lookup for the register with the corresponding name.
- int RegNum = matchRegisterName(DefSymbol.substr(1), isMips64());
+ int RegNum = -1;
+ APInt IntVal(32, -1);
+ if (!DefSymbol.substr(1).getAsInteger(10, IntVal))
+ RegNum = matchRegisterByNumber(IntVal.getZExtValue(),
+ isMips64()
+ ? Mips::CPU64RegsRegClassID
+ : Mips::CPURegsRegClassID);
+ else
+ // Look up the register with the corresponding name.
+ RegNum = matchRegisterName(DefSymbol.substr(1), isMips64());
if (RegNum > -1) {
Parser.Lex();
MipsOperand *op = MipsOperand::CreateReg(RegNum, S,
@@ -1305,7 +1314,7 @@ bool MipsAsmParser::searchSymbolAlias(
Parser.Lex();
const MCConstantExpr *Const = static_cast<const MCConstantExpr*>(Expr);
MipsOperand *op = MipsOperand::CreateImm(Const, S,
- Parser.getTok().getLoc());
+ Parser.getTok().getLoc());
Operands.push_back(op);
return true;
}
@@ -1451,7 +1460,7 @@ MCSymbolRefExpr::VariantKind MipsAsmParser::getVariantKind(StringRef Symbol) {
return VK;
}
-
+// Converts a condition string to an immediate operand value.
static int ConvertCcString(StringRef CondString) {
int CC = StringSwitch<unsigned>(CondString)
.Case(".f", 0)
@@ -1741,8 +1750,22 @@ bool MipsAsmParser::parseSetAssignment() {
return reportParseError("unexpected token in .set directive");
Lex(); // Eat comma
- if (Parser.parseExpression(Value))
- reportParseError("expected valid expression after comma");
+ if (getLexer().is(AsmToken::Dollar)) {
+ MCSymbol *Symbol;
+ SMLoc DollarLoc = getLexer().getLoc();
+ // Consume the dollar sign, and check for a following identifier.
+ Parser.Lex();
+ // We have a '$' followed by something, make sure they are adjacent.
+ if (DollarLoc.getPointer() + 1 != getTok().getLoc().getPointer())
+ return true;
+ StringRef Res = StringRef(DollarLoc.getPointer(),
+ getTok().getEndLoc().getPointer() - DollarLoc.getPointer());
+ Symbol = getContext().GetOrCreateSymbol(Res);
+ Parser.Lex();
+ Value = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None,
+ getContext());
+ } else if (Parser.parseExpression(Value))
+ return reportParseError("expected valid expression after comma");
// Check if the Name already exists as a symbol.
MCSymbol *Sym = getContext().LookupSymbol(Name);
diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt
index 78a9f70..834a998 100644
--- a/lib/Target/Mips/CMakeLists.txt
+++ b/lib/Target/Mips/CMakeLists.txt
@@ -15,6 +15,7 @@ add_public_tablegen_target(MipsCommonTableGen)
add_llvm_target(MipsCodeGen
Mips16FrameLowering.cpp
+ Mips16HardFloat.cpp
Mips16InstrInfo.cpp
Mips16ISelDAGToDAG.cpp
Mips16ISelLowering.cpp
@@ -34,6 +35,7 @@ add_llvm_target(MipsCodeGen
MipsMachineFunction.cpp
MipsModuleISelDAGToDAG.cpp
MipsOs16.cpp
+ MipsOptimizeMathLibCalls.cpp
MipsRegisterInfo.cpp
MipsSEFrameLowering.cpp
MipsSEInstrInfo.cpp
diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index 0dba33a..4af6703 100644
--- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -252,7 +252,7 @@ static DecodeStatus readInstruction32(const MemoryObject &region,
uint8_t Bytes[4];
// We want to read exactly 4 Bytes of data.
- if (region.readBytes(address, 4, (uint8_t*)Bytes, NULL) == -1) {
+ if (region.readBytes(address, 4, Bytes) == -1) {
size = 0;
return MCDisassembler::Fail;
}
diff --git a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
index 4212c94..1f08789 100644
--- a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
@@ -1,6 +1,5 @@
add_llvm_library(LLVMMipsDesc
MipsAsmBackend.cpp
- MipsDirectObjLower.cpp
MipsMCAsmInfo.cpp
MipsMCCodeEmitter.cpp
MipsMCTargetDesc.cpp
diff --git a/lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.cpp b/lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.cpp
deleted file mode 100644
index 15c4282..0000000
--- a/lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-//===-- MipsDirectObjLower.cpp - Mips LLVM direct object lowering -----===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains code to lower Mips MCInst records that are normally
-// left to the assembler to lower such as large shifts.
-//
-//===----------------------------------------------------------------------===//
-#include "MipsInstrInfo.h"
-#include "MCTargetDesc/MipsDirectObjLower.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCStreamer.h"
-
-using namespace llvm;
-
-// If the D<shift> instruction has a shift amount that is greater
-// than 31 (checked in calling routine), lower it to a D<shift>32 instruction
-void Mips::LowerLargeShift(MCInst& Inst) {
-
- assert(Inst.getNumOperands() == 3 && "Invalid no. of operands for shift!");
- assert(Inst.getOperand(2).isImm());
-
- int64_t Shift = Inst.getOperand(2).getImm();
- if (Shift <= 31)
- return; // Do nothing
- Shift -= 32;
-
- // saminus32
- Inst.getOperand(2).setImm(Shift);
-
- switch (Inst.getOpcode()) {
- default:
- // Calling function is not synchronized
- llvm_unreachable("Unexpected shift instruction");
- case Mips::DSLL:
- Inst.setOpcode(Mips::DSLL32);
- return;
- case Mips::DSRL:
- Inst.setOpcode(Mips::DSRL32);
- return;
- case Mips::DSRA:
- Inst.setOpcode(Mips::DSRA32);
- return;
- }
-}
-
-// Pick a DEXT or DINS instruction variant based on the pos and size operands
-void Mips::LowerDextDins(MCInst& InstIn) {
- int Opcode = InstIn.getOpcode();
-
- if (Opcode == Mips::DEXT)
- assert(InstIn.getNumOperands() == 4 &&
- "Invalid no. of machine operands for DEXT!");
- else // Only DEXT and DINS are possible
- assert(InstIn.getNumOperands() == 5 &&
- "Invalid no. of machine operands for DINS!");
-
- assert(InstIn.getOperand(2).isImm());
- int64_t pos = InstIn.getOperand(2).getImm();
- assert(InstIn.getOperand(3).isImm());
- int64_t size = InstIn.getOperand(3).getImm();
-
- if (size <= 32) {
- if (pos < 32) // DEXT/DINS, do nothing
- return;
- // DEXTU/DINSU
- InstIn.getOperand(2).setImm(pos - 32);
- InstIn.setOpcode((Opcode == Mips::DEXT) ? Mips::DEXTU : Mips::DINSU);
- return;
- }
- // DEXTM/DINSM
- assert(pos < 32 && "DEXT/DINS cannot have both size and pos > 32");
- InstIn.getOperand(3).setImm(size - 32);
- InstIn.setOpcode((Opcode == Mips::DEXT) ? Mips::DEXTM : Mips::DINSM);
- return;
-}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.h b/lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.h
deleted file mode 100644
index 8813cc9..0000000
--- a/lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.h
+++ /dev/null
@@ -1,28 +0,0 @@
-//===-- MipsDirectObjLower.h - Mips LLVM direct object lowering *- C++ -*--===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MIPSDIRECTOBJLOWER_H
-#define MIPSDIRECTOBJLOWER_H
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/Compiler.h"
-
-namespace llvm {
- class MCInst;
- class MCStreamer;
-
- namespace Mips {
- /// MipsDirectObjLower - This name space is used to lower MCInstr in cases
- // where the assembler usually finishes the lowering
- // such as large shifts.
- void LowerLargeShift(MCInst &Inst);
- void LowerDextDins(MCInst &Inst);
- }
-}
-
-#endif
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index 5d4b32d..33f6f96 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -18,7 +18,7 @@ using namespace llvm;
void MipsMCAsmInfo::anchor() { }
-MipsMCAsmInfo::MipsMCAsmInfo(const Target &T, StringRef TT) {
+MipsMCAsmInfo::MipsMCAsmInfo(StringRef TT) {
Triple TheTriple(TT);
if ((TheTriple.getArch() == Triple::mips) ||
(TheTriple.getArch() == Triple::mips64))
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
index e1d8789..772234e 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
@@ -18,12 +18,11 @@
namespace llvm {
class StringRef;
- class Target;
class MipsMCAsmInfo : public MCAsmInfo {
virtual void anchor();
public:
- explicit MipsMCAsmInfo(const Target &T, StringRef TT);
+ explicit MipsMCAsmInfo(StringRef TT);
};
} // namespace llvm
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index 9460731..a464dfe 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -13,7 +13,6 @@
//
#define DEBUG_TYPE "mccodeemitter"
#include "MCTargetDesc/MipsBaseInfo.h"
-#include "MCTargetDesc/MipsDirectObjLower.h"
#include "MCTargetDesc/MipsFixupKinds.h"
#include "MCTargetDesc/MipsMCTargetDesc.h"
#include "llvm/ADT/APFloat.h"
@@ -114,6 +113,69 @@ MCCodeEmitter *llvm::createMipsMCCodeEmitterEL(const MCInstrInfo &MCII,
return new MipsMCCodeEmitter(MCII, Ctx, STI, true);
}
+
+// If the D<shift> instruction has a shift amount that is greater
+// than 31 (checked in calling routine), lower it to a D<shift>32 instruction
+static void LowerLargeShift(MCInst& Inst) {
+
+ assert(Inst.getNumOperands() == 3 && "Invalid no. of operands for shift!");
+ assert(Inst.getOperand(2).isImm());
+
+ int64_t Shift = Inst.getOperand(2).getImm();
+ if (Shift <= 31)
+ return; // Do nothing
+ Shift -= 32;
+
+ // saminus32
+ Inst.getOperand(2).setImm(Shift);
+
+ switch (Inst.getOpcode()) {
+ default:
+ // Calling function is not synchronized
+ llvm_unreachable("Unexpected shift instruction");
+ case Mips::DSLL:
+ Inst.setOpcode(Mips::DSLL32);
+ return;
+ case Mips::DSRL:
+ Inst.setOpcode(Mips::DSRL32);
+ return;
+ case Mips::DSRA:
+ Inst.setOpcode(Mips::DSRA32);
+ return;
+ }
+}
+
+// Pick a DEXT or DINS instruction variant based on the pos and size operands
+static void LowerDextDins(MCInst& InstIn) {
+ int Opcode = InstIn.getOpcode();
+
+ if (Opcode == Mips::DEXT)
+ assert(InstIn.getNumOperands() == 4 &&
+ "Invalid no. of machine operands for DEXT!");
+ else // Only DEXT and DINS are possible
+ assert(InstIn.getNumOperands() == 5 &&
+ "Invalid no. of machine operands for DINS!");
+
+ assert(InstIn.getOperand(2).isImm());
+ int64_t pos = InstIn.getOperand(2).getImm();
+ assert(InstIn.getOperand(3).isImm());
+ int64_t size = InstIn.getOperand(3).getImm();
+
+ if (size <= 32) {
+ if (pos < 32) // DEXT/DINS, do nothing
+ return;
+ // DEXTU/DINSU
+ InstIn.getOperand(2).setImm(pos - 32);
+ InstIn.setOpcode((Opcode == Mips::DEXT) ? Mips::DEXTU : Mips::DINSU);
+ return;
+ }
+ // DEXTM/DINSM
+ assert(pos < 32 && "DEXT/DINS cannot have both size and pos > 32");
+ InstIn.getOperand(3).setImm(size - 32);
+ InstIn.setOpcode((Opcode == Mips::DEXT) ? Mips::DEXTM : Mips::DINSM);
+ return;
+}
+
/// EncodeInstruction - Emit the instruction.
/// Size the instruction (currently only 4 bytes
void MipsMCCodeEmitter::
@@ -131,12 +193,12 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
case Mips::DSLL:
case Mips::DSRL:
case Mips::DSRA:
- Mips::LowerLargeShift(TmpInst);
+ LowerLargeShift(TmpInst);
break;
// Double extract instruction is chosen by pos and size operands
case Mips::DEXT:
case Mips::DINS:
- Mips::LowerDextDins(TmpInst);
+ LowerDextDins(TmpInst);
}
uint32_t Binary = getBinaryCodeForInstr(TmpInst, Fixups);
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index be83b54..837fabe 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -93,12 +93,12 @@ static MCSubtargetInfo *createMipsMCSubtargetInfo(StringRef TT, StringRef CPU,
return X;
}
-static MCAsmInfo *createMipsMCAsmInfo(const Target &T, StringRef TT) {
- MCAsmInfo *MAI = new MipsMCAsmInfo(T, TT);
+static MCAsmInfo *createMipsMCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) {
+ MCAsmInfo *MAI = new MipsMCAsmInfo(TT);
- MachineLocation Dst(MachineLocation::VirtualFP);
- MachineLocation Src(Mips::SP, 0);
- MAI->addInitialFrameState(0, Dst, Src);
+ unsigned SP = MRI.getDwarfRegNum(Mips::SP, true);
+ MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(0, SP, 0);
+ MAI->addInitialFrameState(Inst);
return MAI;
}
diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td
index 74cdccd..7a42719 100644
--- a/lib/Target/Mips/MicroMipsInstrInfo.td
+++ b/lib/Target/Mips/MicroMipsInstrInfo.td
@@ -8,11 +8,13 @@ let isCodeGenOnly = 1 in {
SLTI_FM_MM<0x24>;
def SLTiu_MM : MMRel, SetCC_I<"sltiu", setult, simm16, immSExt16, CPURegs>,
SLTI_FM_MM<0x2c>;
- def ANDi_MM : MMRel, ArithLogicI<"andi", uimm16, CPURegsOpnd, immZExt16, and>,
+ def ANDi_MM : MMRel, ArithLogicI<"andi", uimm16, CPURegsOpnd, immZExt16,
+ and>,
ADDI_FM_MM<0x34>;
def ORi_MM : MMRel, ArithLogicI<"ori", uimm16, CPURegsOpnd, immZExt16, or>,
ADDI_FM_MM<0x14>;
- def XORi_MM : MMRel, ArithLogicI<"xori", uimm16, CPURegsOpnd, immZExt16, xor>,
+ def XORi_MM : MMRel, ArithLogicI<"xori", uimm16, CPURegsOpnd, immZExt16,
+ xor>,
ADDI_FM_MM<0x1c>;
def LUi_MM : MMRel, LoadUpper<"lui", CPURegs, uimm16>, LUI_FM_MM;
diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h
index 8c65bb4..b88c0d2 100644
--- a/lib/Target/Mips/Mips.h
+++ b/lib/Target/Mips/Mips.h
@@ -28,7 +28,7 @@ namespace llvm {
FunctionPass *createMipsJITCodeEmitterPass(MipsTargetMachine &TM,
JITCodeEmitter &JCE);
FunctionPass *createMipsConstantIslandPass(MipsTargetMachine &tm);
-
+ FunctionPass *createMipsOptimizeMathLibCalls(MipsTargetMachine &TM);
} // end namespace llvm;
#endif
diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp
index 1bb6fe4..e180c49 100644
--- a/lib/Target/Mips/Mips16FrameLowering.cpp
+++ b/lib/Target/Mips/Mips16FrameLowering.cpp
@@ -40,7 +40,7 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF) const {
if (StackSize == 0 && !MFI->adjustsStack()) return;
MachineModuleInfo &MMI = MF.getMMI();
- std::vector<MachineMove> &Moves = MMI.getFrameMoves();
+ const MCRegisterInfo &MRI = MMI.getContext().getRegisterInfo();
MachineLocation DstML, SrcML;
// Adjust stack.
@@ -50,24 +50,20 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF) const {
MCSymbol *AdjustSPLabel = MMI.getContext().CreateTempSymbol();
BuildMI(MBB, MBBI, dl,
TII.get(TargetOpcode::PROLOG_LABEL)).addSym(AdjustSPLabel);
- DstML = MachineLocation(MachineLocation::VirtualFP);
- SrcML = MachineLocation(MachineLocation::VirtualFP, -StackSize);
- Moves.push_back(MachineMove(AdjustSPLabel, DstML, SrcML));
+ MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(AdjustSPLabel, -StackSize));
MCSymbol *CSLabel = MMI.getContext().CreateTempSymbol();
BuildMI(MBB, MBBI, dl,
TII.get(TargetOpcode::PROLOG_LABEL)).addSym(CSLabel);
- DstML = MachineLocation(MachineLocation::VirtualFP, -8);
- SrcML = MachineLocation(Mips::S1);
- Moves.push_back(MachineMove(CSLabel, DstML, SrcML));
+ unsigned S1 = MRI.getDwarfRegNum(Mips::S1, true);
+ MMI.addFrameInst(MCCFIInstruction::createOffset(CSLabel, S1, -8));
- DstML = MachineLocation(MachineLocation::VirtualFP, -12);
- SrcML = MachineLocation(Mips::S0);
- Moves.push_back(MachineMove(CSLabel, DstML, SrcML));
+ unsigned S0 = MRI.getDwarfRegNum(Mips::S0, true);
+ MMI.addFrameInst(MCCFIInstruction::createOffset(CSLabel, S0, -12));
- DstML = MachineLocation(MachineLocation::VirtualFP, -4);
- SrcML = MachineLocation(Mips::RA);
- Moves.push_back(MachineMove(CSLabel, DstML, SrcML));
+ unsigned RA = MRI.getDwarfRegNum(Mips::RA, true);
+ MMI.addFrameInst(MCCFIInstruction::createOffset(CSLabel, RA, -4));
if (hasFP(MF))
BuildMI(MBB, MBBI, dl, TII.get(Mips::MoveR3216), Mips::S0)
diff --git a/lib/Target/Mips/Mips16HardFloat.cpp b/lib/Target/Mips/Mips16HardFloat.cpp
new file mode 100644
index 0000000..45dd5d7
--- /dev/null
+++ b/lib/Target/Mips/Mips16HardFloat.cpp
@@ -0,0 +1,458 @@
+//===---- Mips16HardFloat.cpp for Mips16 Hard Float --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass needed for Mips16 Hard Float
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mips16-hard-float"
+#include "Mips16HardFloat.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <string>
+
+static void inlineAsmOut
+ (LLVMContext &C, StringRef AsmString, BasicBlock *BB ) {
+ std::vector<llvm::Type *> AsmArgTypes;
+ std::vector<llvm::Value*> AsmArgs;
+ llvm::FunctionType *AsmFTy =
+ llvm::FunctionType::get(Type::getVoidTy(C),
+ AsmArgTypes, false);
+ llvm::InlineAsm *IA =
+ llvm::InlineAsm::get(AsmFTy, AsmString, "", true,
+ /* IsAlignStack */ false,
+ llvm::InlineAsm::AD_ATT);
+ CallInst::Create(IA, AsmArgs, "", BB);
+}
+
+namespace {
+
+class InlineAsmHelper {
+ LLVMContext &C;
+ BasicBlock *BB;
+public:
+ InlineAsmHelper(LLVMContext &C_, BasicBlock *BB_) :
+ C(C_), BB(BB_) {
+ }
+
+ void Out(StringRef AsmString) {
+ inlineAsmOut(C, AsmString, BB);
+ }
+
+};
+}
+//
+// Return types that matter for hard float are:
+// float, double, complex float, and complex double
+//
+enum FPReturnVariant {
+ FRet, DRet, CFRet, CDRet, NoFPRet
+};
+
+//
+// Determine which FP return type this function has
+//
+static FPReturnVariant whichFPReturnVariant(Type *T) {
+ switch (T->getTypeID()) {
+ case Type::FloatTyID:
+ return FRet;
+ case Type::DoubleTyID:
+ return DRet;
+ case Type::StructTyID:
+ if (T->getStructNumElements() != 2)
+ break;
+ if ((T->getContainedType(0)->isFloatTy()) &&
+ (T->getContainedType(1)->isFloatTy()))
+ return CFRet;
+ if ((T->getContainedType(0)->isDoubleTy()) &&
+ (T->getContainedType(1)->isDoubleTy()))
+ return CDRet;
+ break;
+ default:
+ break;
+ }
+ return NoFPRet;
+}
+
+//
+// Parameter types that matter are float, (float, float), (float, double),
+// double, (double, double), (double, float)
+//
+enum FPParamVariant {
+ FSig, FFSig, FDSig,
+ DSig, DDSig, DFSig, NoSig
+};
+
+// which floating point parameter signature variant we are dealing with
+//
+typedef Type::TypeID TypeID;
+const Type::TypeID FloatTyID = Type::FloatTyID;
+const Type::TypeID DoubleTyID = Type::DoubleTyID;
+
+static FPParamVariant whichFPParamVariantNeeded(Function &F) {
+ switch (F.arg_size()) {
+ case 0:
+ return NoSig;
+ case 1:{
+ TypeID ArgTypeID = F.getFunctionType()->getParamType(0)->getTypeID();
+ switch (ArgTypeID) {
+ case FloatTyID:
+ return FSig;
+ case DoubleTyID:
+ return DSig;
+ default:
+ return NoSig;
+ }
+ }
+ default: {
+ TypeID ArgTypeID0 = F.getFunctionType()->getParamType(0)->getTypeID();
+ TypeID ArgTypeID1 = F.getFunctionType()->getParamType(1)->getTypeID();
+ switch(ArgTypeID0) {
+ case FloatTyID: {
+ switch (ArgTypeID1) {
+ case FloatTyID:
+ return FFSig;
+ case DoubleTyID:
+ return FDSig;
+ default:
+ return FSig;
+ }
+ }
+ case DoubleTyID: {
+ switch (ArgTypeID1) {
+ case FloatTyID:
+ return DFSig;
+ case DoubleTyID:
+ return DDSig;
+ default:
+ return DSig;
+ }
+ }
+ default:
+ return NoSig;
+ }
+ }
+ }
+ llvm_unreachable("can't get here");
+}
+
+// Figure out if we need floating point based on the function parameters.
+// We need to move variables in and/or out of floating point
+// registers because of the ABI
+//
+static bool needsFPStubFromParams(Function &F) {
+ if (F.arg_size() >=1) {
+ Type *ArgType = F.getFunctionType()->getParamType(0);
+ switch (ArgType->getTypeID()) {
+ case Type::FloatTyID:
+ case Type::DoubleTyID:
+ return true;
+ default:
+ break;
+ }
+ }
+ return false;
+}
+
+static bool needsFPReturnHelper(Function &F) {
+ Type* RetType = F.getReturnType();
+ return whichFPReturnVariant(RetType) != NoFPRet;
+}
+
+static bool needsFPHelperFromSig(Function &F) {
+ return needsFPStubFromParams(F) || needsFPReturnHelper(F);
+}
+
+//
+// We swap between FP and Integer registers to allow Mips16 and Mips32 to
+// interoperate
+//
+
+static void swapFPIntParams
+ (FPParamVariant PV, Module *M, InlineAsmHelper &IAH,
+ bool LE, bool ToFP) {
+ //LLVMContext &Context = M->getContext();
+ std::string MI = ToFP? "mtc1 ": "mfc1 ";
+ switch (PV) {
+ case FSig:
+ IAH.Out(MI + "$$4,$$f12");
+ break;
+ case FFSig:
+ IAH.Out(MI +"$$4,$$f12");
+ IAH.Out(MI + "$$5,$$f14");
+ break;
+ case FDSig:
+ IAH.Out(MI + "$$4,$$f12");
+ if (LE) {
+ IAH.Out(MI + "$$6,$$f14");
+ IAH.Out(MI + "$$7,$$f15");
+ } else {
+ IAH.Out(MI + "$$7,$$f14");
+ IAH.Out(MI + "$$6,$$f15");
+ }
+ break;
+ case DSig:
+ if (LE) {
+ IAH.Out(MI + "$$4,$$f12");
+ IAH.Out(MI + "$$5,$$f13");
+ } else {
+ IAH.Out(MI + "$$5,$$f12");
+ IAH.Out(MI + "$$4,$$f13");
+ }
+ break;
+ case DDSig:
+ if (LE) {
+ IAH.Out(MI + "$$4,$$f12");
+ IAH.Out(MI + "$$5,$$f13");
+ IAH.Out(MI + "$$6,$$f14");
+ IAH.Out(MI + "$$7,$$f15");
+ } else {
+ IAH.Out(MI + "$$5,$$f12");
+ IAH.Out(MI + "$$4,$$f13");
+ IAH.Out(MI + "$$7,$$f14");
+ IAH.Out(MI + "$$6,$$f15");
+ }
+ break;
+ case DFSig:
+ if (LE) {
+ IAH.Out(MI + "$$4,$$f12");
+ IAH.Out(MI + "$$5,$$f13");
+ } else {
+ IAH.Out(MI + "$$5,$$f12");
+ IAH.Out(MI + "$$4,$$f13");
+ }
+ IAH.Out(MI + "$$6,$$f14");
+ break;
+ case NoSig:
+ return;
+ }
+}
+//
+// Create a call stub for this function. The caller must already have
+// established that a stub is needed by calling needsFPHelperFromSig.
+//
+static void assureFPCallStub(Function &F, Module *M,
+ const MipsSubtarget &Subtarget){
+  // For now we only need these stubs for static relocation.
+ if (Subtarget.getRelocationModel() == Reloc::PIC_)
+ return;
+ LLVMContext &Context = M->getContext();
+ bool LE = Subtarget.isLittle();
+ std::string Name = F.getName();
+ std::string SectionName = ".mips16.call.fp." + Name;
+ std::string StubName = "__call_stub_" + Name;
+ //
+ // see if we already have the stub
+ //
+ Function *FStub = M->getFunction(StubName);
+ if (FStub && !FStub->isDeclaration()) return;
+ FStub = Function::Create(F.getFunctionType(),
+ Function::InternalLinkage, StubName, M);
+ FStub->addFnAttr("mips16_fp_stub");
+ FStub->addFnAttr(llvm::Attribute::Naked);
+ FStub->addFnAttr(llvm::Attribute::NoUnwind);
+ FStub->addFnAttr("nomips16");
+ FStub->setSection(SectionName);
+ BasicBlock *BB = BasicBlock::Create(Context, "entry", FStub);
+ InlineAsmHelper IAH(Context, BB);
+ FPReturnVariant RV = whichFPReturnVariant(FStub->getReturnType());
+ FPParamVariant PV = whichFPParamVariantNeeded(F);
+ swapFPIntParams(PV, M, IAH, LE, true);
+ if (RV != NoFPRet) {
+ IAH.Out("move $$18, $$31");
+ IAH.Out("jal " + Name);
+ } else {
+ IAH.Out("lui $$25,%hi(" + Name + ")");
+ IAH.Out("addiu $$25,$$25,%lo(" + Name + ")" );
+ }
+ switch (RV) {
+ case FRet:
+ IAH.Out("mfc1 $$2,$$f0");
+ break;
+ case DRet:
+ if (LE) {
+ IAH.Out("mfc1 $$2,$$f0");
+ IAH.Out("mfc1 $$3,$$f1");
+ } else {
+ IAH.Out("mfc1 $$3,$$f0");
+ IAH.Out("mfc1 $$2,$$f1");
+ }
+ break;
+ case CFRet:
+ if (LE) {
+ IAH.Out("mfc1 $$2,$$f0");
+ IAH.Out("mfc1 $$3,$$f2");
+ } else {
+ IAH.Out("mfc1 $$3,$$f0");
+      IAH.Out("mfc1 $$2,$$f2");
+ }
+ break;
+ case CDRet:
+ if (LE) {
+ IAH.Out("mfc1 $$4,$$f2");
+ IAH.Out("mfc1 $$5,$$f3");
+ IAH.Out("mfc1 $$2,$$f0");
+ IAH.Out("mfc1 $$3,$$f1");
+
+ } else {
+ IAH.Out("mfc1 $$5,$$f2");
+ IAH.Out("mfc1 $$4,$$f3");
+ IAH.Out("mfc1 $$3,$$f0");
+ IAH.Out("mfc1 $$2,$$f1");
+ }
+ break;
+ case NoFPRet:
+ break;
+ }
+ if (RV != NoFPRet)
+ IAH.Out("jr $$18");
+ else
+ IAH.Out("jr $$25");
+ new UnreachableInst(Context, BB);
+}
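
A worked trace of the builder above for a hypothetical callee `float foo(float)` compiled for static relocation on a little-endian target (the doubled $$ escapes to a single $ in the emitted assembly): the resulting __call_stub_foo is placed in section .mips16.call.fp.foo and its body is produced by these calls, in order:

  swapFPIntParams(FSig, ...);   // IAH.Out("mtc1 $$4,$$f12"): argument into $f12
  IAH.Out("move $$18, $$31");   // FP return: save RA so we can come back for the fixup
  IAH.Out("jal foo");
  IAH.Out("mfc1 $$2,$$f0");     // FRet: copy the FP result back to its soft-float home
  IAH.Out("jr $$18");
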
+
+//
+// Returns of float, double and complex need to be handled with a helper
+// function.
+//
+static bool fixupFPReturnAndCall
+ (Function &F, Module *M, const MipsSubtarget &Subtarget) {
+ bool Modified = false;
+ LLVMContext &C = M->getContext();
+ Type *MyVoid = Type::getVoidTy(C);
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end();
+ I != E; ++I) {
+ Instruction &Inst = *I;
+ if (const ReturnInst *RI = dyn_cast<ReturnInst>(I)) {
+ Value *RVal = RI->getReturnValue();
+ if (!RVal) continue;
+ //
+ // If there is a return value and it needs a helper function,
+ // figure out which one and add a call before the actual
+ // return to this helper. The purpose of the helper is to move
+ // floating point values from their soft float return mapping to
+ // where they would have been mapped to in floating point registers.
+ //
+ Type *T = RVal->getType();
+ FPReturnVariant RV = whichFPReturnVariant(T);
+ if (RV == NoFPRet) continue;
+ static const char* Helper[NoFPRet] =
+ {"__mips16_ret_sf", "__mips16_ret_df", "__mips16_ret_sc",
+ "__mips16_ret_dc"};
+ const char *Name = Helper[RV];
+ AttributeSet A;
+ Value *Params[] = {RVal};
+ Modified = true;
+ //
+ // These helper functions have a different calling ABI so
+ // this __Mips16RetHelper indicates that so that later
+ // during call setup, the proper call lowering to the helper
+ // functions will take place.
+ //
+ A = A.addAttribute(C, AttributeSet::FunctionIndex,
+ "__Mips16RetHelper");
+ A = A.addAttribute(C, AttributeSet::FunctionIndex,
+ Attribute::ReadNone);
+        Value *FR = M->getOrInsertFunction(Name, A, MyVoid, T, NULL);
+        CallInst::Create(FR, Params, "", &Inst);
+ } else if (const CallInst *CI = dyn_cast<CallInst>(I)) {
+        // PIC mode calls are handled by helper functions that are
+        // already defined in libc.
+ if (Subtarget.getRelocationModel() != Reloc::PIC_ ) {
+ Function *F_ = CI->getCalledFunction();
+ if (F_ && needsFPHelperFromSig(*F_)) {
+ assureFPCallStub(*F_, M, Subtarget);
+ Modified=true;
+ }
+ }
+ }
+ }
+ return Modified;
+}
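
The return fixup is a pure IR rewrite. For a hypothetical mips16 function that returns a float, the loop above turns

  ret float %v

into

  call void @__mips16_ret_sf(float %v)   ; readnone, tagged "__Mips16RetHelper"
  ret float %v

so that the helper, with its special calling convention, can move the value from its soft-float return register into the floating point register a hard-float caller expects, before control returns.
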
+
+static void createFPFnStub(Function *F, Module *M, FPParamVariant PV,
+ const MipsSubtarget &Subtarget ) {
+ bool PicMode = Subtarget.getRelocationModel() == Reloc::PIC_;
+ bool LE = Subtarget.isLittle();
+ LLVMContext &Context = M->getContext();
+ std::string Name = F->getName();
+ std::string SectionName = ".mips16.fn." + Name;
+ std::string StubName = "__fn_stub_" + Name;
+ std::string LocalName = "__fn_local_" + Name;
+ Function *FStub = Function::Create
+ (F->getFunctionType(),
+ Function::ExternalLinkage, StubName, M);
+ FStub->addFnAttr("mips16_fp_stub");
+ FStub->addFnAttr(llvm::Attribute::Naked);
+ FStub->addFnAttr(llvm::Attribute::NoUnwind);
+ FStub->addFnAttr("nomips16");
+ FStub->setSection(SectionName);
+ BasicBlock *BB = BasicBlock::Create(Context, "entry", FStub);
+ InlineAsmHelper IAH(Context, BB);
+ IAH.Out(" .set macro");
+ if (PicMode) {
+ IAH.Out(".set noreorder");
+ IAH.Out(".cpload $$2");
+ IAH.Out(".set reorder");
+ IAH.Out(".reloc 0,R_MIPS_NONE," + Name);
+ IAH.Out("la $$25," + LocalName);
+ }
+ else
+ IAH.Out("la $$25, " + Name);
+ swapFPIntParams(PV, M, IAH, LE, false);
+ IAH.Out("jr $$25");
+ IAH.Out(LocalName + " = " + Name);
+ new UnreachableInst(FStub->getContext(), BB);
+}
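
This is the mirror image of the call stub: it lets a mips32 caller, which passes FP arguments in FP registers, enter a mips16 (soft-float) function body. For a hypothetical `void bar(double)` built without PIC on a little-endian target, the calls above produce __fn_stub_bar in section .mips16.fn.bar:

  IAH.Out(" .set macro");
  IAH.Out("la $$25, bar");
  IAH.Out("mfc1 $$4,$$f12");    // swapFPIntParams(DSig, ..., ToFP=false):
  IAH.Out("mfc1 $$5,$$f13");    //   move the double out of $f12/$f13 into $4/$5
  IAH.Out("jr $$25");
  IAH.Out("__fn_local_bar = bar");
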
+
+namespace llvm {
+
+//
+// This pass only makes sense when the underlying chip has floating point but
+// we are compiling as mips16.
+// For all mips16 functions (that are not stubs we have already generated and
+// are not declared nomips16 via attribute), we must:
+//    1) fix up all returns of float, double, complex float and complex double
+//       by calling a helper function before the actual return.
+//    2) generate helper functions (stubs) that can be called by mips32
+//       functions and that move parameters normally passed in floating point
+//       registers to their soft float equivalents.
+//    3) in the case of static relocation, generate helper functions so that
+//       mips16 functions can call extern functions of unknown type (mips16 or
+//       mips32).
+//    4) TBD. For PIC, calls to extern functions of unknown type are handled by
+//       predefined helper functions in libc; this work is currently done
+//       during call lowering but should be moved here in the future.
+//
+bool Mips16HardFloat::runOnModule(Module &M) {
+ DEBUG(errs() << "Run on Module Mips16HardFloat\n");
+ bool Modified = false;
+ for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+ if (F->isDeclaration() || F->hasFnAttribute("mips16_fp_stub") ||
+ F->hasFnAttribute("nomips16")) continue;
+ Modified |= fixupFPReturnAndCall(*F, &M, Subtarget);
+ FPParamVariant V = whichFPParamVariantNeeded(*F);
+ if (V != NoSig) {
+ Modified = true;
+ createFPFnStub(F, &M, V, Subtarget);
+ }
+ }
+ return Modified;
+}
+
+char Mips16HardFloat::ID = 0;
+
+}
+
+ModulePass *llvm::createMips16HardFloat(MipsTargetMachine &TM) {
+ return new Mips16HardFloat(TM);
+}
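
The wiring of this factory into the backend is not shown here; a plausible sketch (hypothetical, following the usual TargetPassConfig pattern, with the accessor names assumed) would add the module pass to the Mips IR pipeline:

  void MipsPassConfig::addIRPasses() {
    if (getMipsSubtarget().inMips16HardFloat())
      addPass(createMips16HardFloat(getMipsTargetMachine()));
    TargetPassConfig::addIRPasses();
  }
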
+
diff --git a/lib/Target/Mips/Mips16HardFloat.h b/lib/Target/Mips/Mips16HardFloat.h
new file mode 100644
index 0000000..b7f712a
--- /dev/null
+++ b/lib/Target/Mips/Mips16HardFloat.h
@@ -0,0 +1,54 @@
+//===---- Mips16HardFloat.h for Mips16 Hard Float --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a phase which implements part of the floating point
+// interoperability between Mips16 and Mips32 code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "MipsTargetMachine.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetMachine.h"
+
+
+#ifndef MIPS16HARDFLOAT_H
+#define MIPS16HARDFLOAT_H
+
+using namespace llvm;
+
+namespace llvm {
+
+class Mips16HardFloat : public ModulePass {
+
+public:
+ static char ID;
+
+ Mips16HardFloat(MipsTargetMachine &TM_) : ModulePass(ID),
+ TM(TM_), Subtarget(TM.getSubtarget<MipsSubtarget>()) {
+ }
+
+ virtual const char *getPassName() const {
+ return "MIPS16 Hard Float Pass";
+ }
+
+ virtual bool runOnModule(Module &M);
+
+protected:
+  /// Keep references to the target machine and subtarget around so that we
+  /// can make the right decisions when generating code for different targets.
+ const TargetMachine &TM;
+ const MipsSubtarget &Subtarget;
+
+};
+
+ModulePass *createMips16HardFloat(MipsTargetMachine &TM);
+
+}
+#endif
diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
index c1c635c..f70abda 100644
--- a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
+++ b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
@@ -42,7 +42,7 @@ bool Mips16DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
}
/// Select multiply instructions.
std::pair<SDNode*, SDNode*>
-Mips16DAGToDAGISel::selectMULT(SDNode *N, unsigned Opc, DebugLoc DL, EVT Ty,
+Mips16DAGToDAGISel::selectMULT(SDNode *N, unsigned Opc, SDLoc DL, EVT Ty,
bool HasLo, bool HasHi) {
SDNode *Lo = 0, *Hi = 0;
SDNode *Mul = CurDAG->getMachineNode(Opc, DL, MVT::Glue, N->getOperand(0),
@@ -118,11 +118,11 @@ void Mips16DAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
SDValue Mips16DAGToDAGISel::getMips16SPAliasReg() {
unsigned Mips16SPAliasReg =
MF->getInfo<MipsFunctionInfo>()->getMips16SPAliasReg();
- return CurDAG->getRegister(Mips16SPAliasReg, TLI.getPointerTy());
+ return CurDAG->getRegister(Mips16SPAliasReg, TLI->getPointerTy());
}
void Mips16DAGToDAGISel::getMips16SPRefReg(SDNode *Parent, SDValue &AliasReg) {
- SDValue AliasFPReg = CurDAG->getRegister(Mips::S0, TLI.getPointerTy());
+ SDValue AliasFPReg = CurDAG->getRegister(Mips::S0, TLI->getPointerTy());
if (Parent) {
switch (Parent->getOpcode()) {
case ISD::LOAD: {
@@ -149,7 +149,7 @@ void Mips16DAGToDAGISel::getMips16SPRefReg(SDNode *Parent, SDValue &AliasReg) {
}
}
}
- AliasReg = CurDAG->getRegister(Mips::SP, TLI.getPointerTy());
+ AliasReg = CurDAG->getRegister(Mips::SP, TLI->getPointerTy());
return;
}
@@ -235,7 +235,7 @@ bool Mips16DAGToDAGISel::selectAddr16(
/// expanded, promoted and normal instructions
std::pair<bool, SDNode*> Mips16DAGToDAGISel::selectNode(SDNode *Node) {
unsigned Opcode = Node->getOpcode();
- DebugLoc DL = Node->getDebugLoc();
+ SDLoc DL(Node);
///
// Instruction Selection not handled by the auto-generated
diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.h b/lib/Target/Mips/Mips16ISelDAGToDAG.h
index f05f9b7..49dc6e5 100644
--- a/lib/Target/Mips/Mips16ISelDAGToDAG.h
+++ b/lib/Target/Mips/Mips16ISelDAGToDAG.h
@@ -23,7 +23,7 @@ public:
explicit Mips16DAGToDAGISel(MipsTargetMachine &TM) : MipsDAGToDAGISel(TM) {}
private:
- std::pair<SDNode*, SDNode*> selectMULT(SDNode *N, unsigned Opc, DebugLoc DL,
+ std::pair<SDNode*, SDNode*> selectMULT(SDNode *N, unsigned Opc, SDLoc DL,
EVT Ty, bool HasLo, bool HasHi);
SDValue getMips16SPAliasReg();
diff --git a/lib/Target/Mips/Mips16ISelLowering.cpp b/lib/Target/Mips/Mips16ISelLowering.cpp
index f63318f..d8dd88c 100644
--- a/lib/Target/Mips/Mips16ISelLowering.cpp
+++ b/lib/Target/Mips/Mips16ISelLowering.cpp
@@ -13,6 +13,7 @@
#define DEBUG_TYPE "mips-lower"
#include "Mips16ISelLowering.h"
#include "MipsRegisterInfo.h"
+#include "MipsTargetMachine.h"
#include "MCTargetDesc/MipsBaseInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/CommandLine.h"
@@ -21,11 +22,6 @@
using namespace llvm;
-static cl::opt<bool>
-Mips16HardFloat("mips16-hard-float", cl::NotHidden,
- cl::desc("MIPS: mips16 hard float enable."),
- cl::init(false));
-
static cl::opt<bool> DontExpandCondPseudos16(
"mips16-dont-expand-cond-pseudo",
cl::init(false),
@@ -50,9 +46,13 @@ Mips16TargetLowering::Mips16TargetLowering(MipsTargetMachine &TM)
// Set up the register classes
addRegisterClass(MVT::i32, &Mips::CPU16RegsRegClass);
- if (Mips16HardFloat)
+ if (Subtarget->inMips16HardFloat()) {
setMips16HardFloatLibCalls();
-
+ NoHelperNeeded.insert("__mips16_ret_sf");
+ NoHelperNeeded.insert("__mips16_ret_df");
+ NoHelperNeeded.insert("__mips16_ret_sc");
+ NoHelperNeeded.insert("__mips16_ret_dc");
+ }
setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand);
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand);
setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand);
@@ -131,17 +131,17 @@ Mips16TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// altogether.
return emitFEXT_T8I816_ins(Mips::BtnezX16, Mips::SltuRxRy16, MI, BB);
case Mips::BteqzT8CmpiX16: return emitFEXT_T8I8I16_ins(
- Mips::BteqzX16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, MI, BB);
+ Mips::BteqzX16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, false, MI, BB);
case Mips::BteqzT8SltiX16: return emitFEXT_T8I8I16_ins(
- Mips::BteqzX16, Mips::SltiRxImm16, Mips::SltiRxImmX16, MI, BB);
+ Mips::BteqzX16, Mips::SltiRxImm16, Mips::SltiRxImmX16, true, MI, BB);
case Mips::BteqzT8SltiuX16: return emitFEXT_T8I8I16_ins(
- Mips::BteqzX16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, MI, BB);
+ Mips::BteqzX16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, false, MI, BB);
case Mips::BtnezT8CmpiX16: return emitFEXT_T8I8I16_ins(
- Mips::BtnezX16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, MI, BB);
+ Mips::BtnezX16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, false, MI, BB);
case Mips::BtnezT8SltiX16: return emitFEXT_T8I8I16_ins(
- Mips::BtnezX16, Mips::SltiRxImm16, Mips::SltiRxImmX16, MI, BB);
+ Mips::BtnezX16, Mips::SltiRxImm16, Mips::SltiRxImmX16, true, MI, BB);
case Mips::BtnezT8SltiuX16: return emitFEXT_T8I8I16_ins(
- Mips::BtnezX16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, MI, BB);
+ Mips::BtnezX16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, false, MI, BB);
break;
case Mips::SltCCRxRy16:
return emitFEXT_CCRX16_ins(Mips::SltRxRy16, MI, BB);
@@ -374,7 +374,8 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
const char* Mips16HelperFunction = 0;
bool NeedMips16Helper = false;
- if (getTargetMachine().Options.UseSoftFloat && Mips16HardFloat) {
+ if (getTargetMachine().Options.UseSoftFloat &&
+ Subtarget->inMips16HardFloat()) {
//
// currently we don't have symbols tagged with the mips16 or mips32
// qualifier so we will assume that we don't know what kind it is.
@@ -386,6 +387,13 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
LookupHelper = false;
}
}
+ else if (GlobalAddressSDNode *G =
+ dyn_cast<GlobalAddressSDNode>(CLI.Callee)) {
+ if (NoHelperNeeded.find(G->getGlobal()->getName().data()) !=
+ NoHelperNeeded.end()) {
+ LookupHelper = false;
+ }
+ }
if (LookupHelper) Mips16HelperFunction =
getMips16HelperFunction(CLI.RetTy, CLI.Args, NeedMips16Helper);
@@ -621,7 +629,7 @@ MachineBasicBlock
}
MachineBasicBlock *Mips16TargetLowering::emitFEXT_T8I8I16_ins(
- unsigned BtOpc, unsigned CmpiOpc, unsigned CmpiXOpc,
+ unsigned BtOpc, unsigned CmpiOpc, unsigned CmpiXOpc, bool ImmSigned,
MachineInstr *MI, MachineBasicBlock *BB) const {
if (DontExpandCondPseudos16)
return BB;
@@ -632,7 +640,8 @@ MachineBasicBlock *Mips16TargetLowering::emitFEXT_T8I8I16_ins(
unsigned CmpOpc;
if (isUInt<8>(imm))
CmpOpc = CmpiOpc;
- else if (isUInt<16>(imm))
+ else if ((!ImmSigned && isUInt<16>(imm)) ||
+ (ImmSigned && isInt<16>(imm)))
CmpOpc = CmpiXOpc;
else
llvm_unreachable("immediate field not usable");
diff --git a/lib/Target/Mips/Mips16ISelLowering.h b/lib/Target/Mips/Mips16ISelLowering.h
index b23e2a1..d3c7028 100644
--- a/lib/Target/Mips/Mips16ISelLowering.h
+++ b/lib/Target/Mips/Mips16ISelLowering.h
@@ -64,7 +64,7 @@ namespace llvm {
MachineBasicBlock *BB) const;
MachineBasicBlock *emitFEXT_T8I8I16_ins(
- unsigned BtOpc, unsigned CmpiOpc, unsigned CmpiXOpc,
+ unsigned BtOpc, unsigned CmpiOpc, unsigned CmpiXOpc, bool ImmSigned,
MachineInstr *MI, MachineBasicBlock *BB) const;
MachineBasicBlock *emitFEXT_CCRX16_ins(
diff --git a/lib/Target/Mips/Mips16InstrFormats.td b/lib/Target/Mips/Mips16InstrFormats.td
index 4ff62ef..1e49934 100644
--- a/lib/Target/Mips/Mips16InstrFormats.td
+++ b/lib/Target/Mips/Mips16InstrFormats.td
@@ -61,7 +61,7 @@ class MipsInst16<dag outs, dag ins, string asmstr, list<dag> pattern,
// Top 5 bits are the 'opcode' field
let Inst{15-11} = Opcode;
-
+
let Size=2;
field bits<16> SoftFail = 0;
}
@@ -74,7 +74,7 @@ class MipsInst16_32<dag outs, dag ins, string asmstr, list<dag> pattern,
MipsInst16_Base<outs, ins, asmstr, pattern, itin>
{
field bits<32> Inst;
-
+
let Size=4;
field bits<32> SoftFail = 0;
}
diff --git a/lib/Target/Mips/Mips16InstrInfo.cpp b/lib/Target/Mips/Mips16InstrInfo.cpp
index 17dd2c0..c2a496c 100644
--- a/lib/Target/Mips/Mips16InstrInfo.cpp
+++ b/lib/Target/Mips/Mips16InstrInfo.cpp
@@ -37,7 +37,7 @@ static cl::opt<bool> NeverUseSaveRestore(
Mips16InstrInfo::Mips16InstrInfo(MipsTargetMachine &tm)
: MipsInstrInfo(tm, Mips::BimmX16),
- RI(*tm.getSubtargetImpl(), *this) {}
+ RI(*tm.getSubtargetImpl()) {}
const MipsRegisterInfo &Mips16InstrInfo::getRegisterInfo() const {
return RI;
@@ -145,7 +145,7 @@ bool Mips16InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
/// GetOppositeBranchOpc - Return the inverse of the specified
/// opcode, e.g. turning BEQ to BNE.
-unsigned Mips16InstrInfo::GetOppositeBranchOpc(unsigned Opc) const {
+unsigned Mips16InstrInfo::getOppositeBranchOpc(unsigned Opc) const {
switch (Opc) {
default: llvm_unreachable("Illegal opcode!");
case Mips::BeqzRxImmX16: return Mips::BnezRxImmX16;
@@ -380,7 +380,7 @@ Mips16InstrInfo::loadImmediate(unsigned FrameReg,
return Reg;
}
-unsigned Mips16InstrInfo::GetAnalyzableBrOpc(unsigned Opc) const {
+unsigned Mips16InstrInfo::getAnalyzableBrOpc(unsigned Opc) const {
return (Opc == Mips::BeqzRxImmX16 || Opc == Mips::BimmX16 ||
Opc == Mips::BnezRxImmX16 || Opc == Mips::BteqzX16 ||
Opc == Mips::BteqzT8CmpX16 || Opc == Mips::BteqzT8CmpiX16 ||
diff --git a/lib/Target/Mips/Mips16InstrInfo.h b/lib/Target/Mips/Mips16InstrInfo.h
index a77a904..a3bd31e 100644
--- a/lib/Target/Mips/Mips16InstrInfo.h
+++ b/lib/Target/Mips/Mips16InstrInfo.h
@@ -64,7 +64,7 @@ public:
virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
- virtual unsigned GetOppositeBranchOpc(unsigned Opc) const;
+ virtual unsigned getOppositeBranchOpc(unsigned Opc) const;
// Adjust SP by FrameSize bytes. Save RA, S0, S1
void makeFrame(unsigned SP, int64_t FrameSize, MachineBasicBlock &MBB,
@@ -102,7 +102,7 @@ public:
(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, int64_t Imm) const;
private:
- virtual unsigned GetAnalyzableBrOpc(unsigned Opc) const;
+ virtual unsigned getAnalyzableBrOpc(unsigned Opc) const;
void ExpandRetRA16(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
unsigned Opc) const;
diff --git a/lib/Target/Mips/Mips16RegisterInfo.cpp b/lib/Target/Mips/Mips16RegisterInfo.cpp
index 7ad18f2..018f56c 100644
--- a/lib/Target/Mips/Mips16RegisterInfo.cpp
+++ b/lib/Target/Mips/Mips16RegisterInfo.cpp
@@ -41,17 +41,16 @@
using namespace llvm;
-Mips16RegisterInfo::Mips16RegisterInfo(const MipsSubtarget &ST,
- const Mips16InstrInfo &I)
- : MipsRegisterInfo(ST), TII(I) {}
+Mips16RegisterInfo::Mips16RegisterInfo(const MipsSubtarget &ST)
+ : MipsRegisterInfo(ST) {}
bool Mips16RegisterInfo::requiresRegisterScavenging
(const MachineFunction &MF) const {
- return true;
+ return false;
}
bool Mips16RegisterInfo::requiresFrameIndexScavenging
(const MachineFunction &MF) const {
- return true;
+ return false;
}
bool Mips16RegisterInfo::useFPForScavengingIndex
@@ -66,6 +65,7 @@ bool Mips16RegisterInfo::saveScavengerRegister
const TargetRegisterClass *RC,
unsigned Reg) const {
DebugLoc DL;
+ const TargetInstrInfo &TII = *MBB.getParent()->getTarget().getInstrInfo();
TII.copyPhysReg(MBB, I, DL, Mips::T0, Reg, true);
TII.copyPhysReg(MBB, UseMI, DL, Reg, Mips::T0, true);
return true;
@@ -139,6 +139,9 @@ void Mips16RegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = II->getDebugLoc();
unsigned NewImm;
+ const Mips16InstrInfo &TII =
+ *static_cast<const Mips16InstrInfo*>(
+ MBB.getParent()->getTarget().getInstrInfo());
FrameReg = TII.loadImmediate(FrameReg, Offset, MBB, II, DL, NewImm);
Offset = SignExtend64<16>(NewImm);
IsKill = true;
diff --git a/lib/Target/Mips/Mips16RegisterInfo.h b/lib/Target/Mips/Mips16RegisterInfo.h
index 2b3d2b1..13e82a3 100644
--- a/lib/Target/Mips/Mips16RegisterInfo.h
+++ b/lib/Target/Mips/Mips16RegisterInfo.h
@@ -20,10 +20,8 @@ namespace llvm {
class Mips16InstrInfo;
class Mips16RegisterInfo : public MipsRegisterInfo {
- const Mips16InstrInfo &TII;
public:
- Mips16RegisterInfo(const MipsSubtarget &Subtarget,
- const Mips16InstrInfo &TII);
+ Mips16RegisterInfo(const MipsSubtarget &Subtarget);
bool requiresRegisterScavenging(const MachineFunction &MF) const;
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index fc533fb..df717fe 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -167,12 +167,12 @@ let Predicates = [IsN64, HasStdEnc], isCodeGenOnly = 1 in {
/// Jump and Branch Instructions
def JR64 : IndirectBranch<CPU64Regs>, MTLO_FM<8>;
-def BEQ64 : CBranch<"beq", seteq, CPU64Regs>, BEQ_FM<4>;
-def BNE64 : CBranch<"bne", setne, CPU64Regs>, BEQ_FM<5>;
-def BGEZ64 : CBranchZero<"bgez", setge, CPU64Regs>, BGEZ_FM<1, 1>;
-def BGTZ64 : CBranchZero<"bgtz", setgt, CPU64Regs>, BGEZ_FM<7, 0>;
-def BLEZ64 : CBranchZero<"blez", setle, CPU64Regs>, BGEZ_FM<6, 0>;
-def BLTZ64 : CBranchZero<"bltz", setlt, CPU64Regs>, BGEZ_FM<1, 0>;
+def BEQ64 : CBranch<"beq", seteq, CPU64RegsOpnd>, BEQ_FM<4>;
+def BNE64 : CBranch<"bne", setne, CPU64RegsOpnd>, BEQ_FM<5>;
+def BGEZ64 : CBranchZero<"bgez", setge, CPU64RegsOpnd>, BGEZ_FM<1, 1>;
+def BGTZ64 : CBranchZero<"bgtz", setgt, CPU64RegsOpnd>, BGEZ_FM<7, 0>;
+def BLEZ64 : CBranchZero<"blez", setle, CPU64RegsOpnd>, BGEZ_FM<6, 0>;
+def BLTZ64 : CBranchZero<"bltz", setlt, CPU64RegsOpnd>, BGEZ_FM<1, 0>;
}
let DecoderNamespace = "Mips64" in
def JALR64 : JumpLinkReg<"jalr", CPU64Regs>, JALR_FM;
@@ -192,9 +192,9 @@ def PseudoDMULTu : MultDivPseudo<DMULTu, ACRegs128, CPU64RegsOpnd, MipsMultu,
def DSDIV : Div<"ddiv", IIIdiv, CPU64RegsOpnd, [HI64, LO64]>, MULT_FM<0, 0x1e>;
def DUDIV : Div<"ddivu", IIIdiv, CPU64RegsOpnd, [HI64, LO64]>, MULT_FM<0, 0x1f>;
def PseudoDSDIV : MultDivPseudo<DSDIV, ACRegs128, CPU64RegsOpnd, MipsDivRem,
- IIIdiv, 0>;
+ IIIdiv, 0, 1, 1>;
def PseudoDUDIV : MultDivPseudo<DUDIV, ACRegs128, CPU64RegsOpnd, MipsDivRemU,
- IIIdiv, 0>;
+ IIIdiv, 0, 1, 1>;
def MTHI64 : MoveToLOHI<"mthi", CPU64Regs, [HI64]>, MTLO_FM<0x11>;
def MTLO64 : MoveToLOHI<"mtlo", CPU64Regs, [LO64]>, MTLO_FM<0x13>;
@@ -294,6 +294,11 @@ def : WrapperPat<tglobaltlsaddr, DADDiu, CPU64Regs>;
defm : BrcondPats<CPU64Regs, BEQ64, BNE64, SLT64, SLTu64, SLTi64, SLTiu64,
ZERO_64>;
+def : MipsPat<(brcond (i32 (setlt i64:$lhs, 1)), bb:$dst),
+ (BLEZ64 i64:$lhs, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setgt i64:$lhs, -1)), bb:$dst),
+ (BGEZ64 i64:$lhs, bb:$dst)>;
+
// setcc patterns
defm : SeteqPats<CPU64Regs, SLTiu64, XOR64, SLTu64, ZERO_64>;
defm : SetlePats<CPU64Regs, SLT64, SLTu64>;
@@ -361,8 +366,14 @@ def : InstAlias<"dadd $rs, $rt, $imm",
def : InstAlias<"or $rs, $rt, $imm",
(ORi64 CPU64RegsOpnd:$rs, CPU64RegsOpnd:$rt, uimm16_64:$imm),
1>, Requires<[HasMips64]>;
-/// Move between CPU and coprocessor registers
+def : InstAlias<"bnez $rs,$offset",
+ (BNE64 CPU64RegsOpnd:$rs, ZERO_64, brtarget:$offset), 1>,
+ Requires<[HasMips64]>;
+def : InstAlias<"beqz $rs,$offset",
+ (BEQ64 CPU64RegsOpnd:$rs, ZERO_64, brtarget:$offset), 1>,
+ Requires<[HasMips64]>;
+/// Move between CPU and coprocessor registers
let DecoderNamespace = "Mips64" in {
def DMFC0_3OP64 : MFC3OP<(outs CPU64RegsOpnd:$rt),
(ins CPU64RegsOpnd:$rd, uimm16:$sel),
diff --git a/lib/Target/Mips/MipsAnalyzeImmediate.cpp b/lib/Target/Mips/MipsAnalyzeImmediate.cpp
index 99b163e..31a9b7d 100644
--- a/lib/Target/Mips/MipsAnalyzeImmediate.cpp
+++ b/lib/Target/Mips/MipsAnalyzeImmediate.cpp
@@ -40,7 +40,7 @@ void MipsAnalyzeImmediate::GetInstSeqLsORi(uint64_t Imm, unsigned RemSize,
void MipsAnalyzeImmediate::GetInstSeqLsSLL(uint64_t Imm, unsigned RemSize,
InstSeqLs &SeqLs) {
- unsigned Shamt = CountTrailingZeros_64(Imm);
+ unsigned Shamt = countTrailingZeros(Imm);
GetInstSeqLs(Imm >> Shamt, RemSize - Shamt, SeqLs);
AddInstr(SeqLs, Inst(SLL, Shamt));
}
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index f4f71cb..6e4feda 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -249,12 +249,18 @@ void MipsAsmPrinter::EmitFunctionEntryLabel() {
void MipsAsmPrinter::EmitFunctionBodyStart() {
MCInstLowering.Initialize(Mang, &MF->getContext());
- emitFrameDirective();
+ bool IsNakedFunction =
+ MF->getFunction()->
+ getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::Naked);
+ if (!IsNakedFunction)
+ emitFrameDirective();
if (OutStreamer.hasRawTextSupport()) {
SmallString<128> Str;
raw_svector_ostream OS(Str);
- printSavedRegsBitmask(OS);
+ if (!IsNakedFunction)
+ printSavedRegsBitmask(OS);
OutStreamer.EmitRawText(OS.str());
if (!Subtarget->inMips16Mode()) {
OutStreamer.EmitRawText(StringRef("\t.set\tnoreorder"));
diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td
index 462def7..ac40b11 100644
--- a/lib/Target/Mips/MipsCallingConv.td
+++ b/lib/Target/Mips/MipsCallingConv.td
@@ -196,6 +196,13 @@ def CC_Mips_FastCC : CallingConv<[
CCDelegateTo<CC_MipsN_FastCC>
]>;
+//===----------------------------------------------------------------------===//
+// Calling convention for the Mips16 return helper functions.
+//===----------------------------------------------------------------------===//
+
+def CC_Mips16RetHelper : CallingConv<[
+ // Integer arguments are passed in integer registers.
+ CCIfType<[i32], CCAssignToReg<[V0, V1, A0, A1]>>
+]>;
+
//===----------------------------------------------------------------------===//
// Mips Calling Convention Dispatch
//===----------------------------------------------------------------------===//
@@ -223,3 +230,6 @@ def CSR_N32 : CalleeSavedRegs<(add D31_64, D29_64, D27_64, D25_64, D24_64,
def CSR_N64 : CalleeSavedRegs<(add (sequence "D%u_64", 31, 24), RA_64, FP_64,
GP_64, (sequence "S%u_64", 7, 0))>;
+
+def CSR_Mips16RetHelper :
+ CalleeSavedRegs<(add V0, V1, (sequence "A%u", 3, 0), S0, S1)>;
diff --git a/lib/Target/Mips/MipsCodeEmitter.cpp b/lib/Target/Mips/MipsCodeEmitter.cpp
index 3fc402b..813037e 100644
--- a/lib/Target/Mips/MipsCodeEmitter.cpp
+++ b/lib/Target/Mips/MipsCodeEmitter.cpp
@@ -65,8 +65,7 @@ class MipsCodeEmitter : public MachineFunctionPass {
public:
MipsCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce)
- : MachineFunctionPass(ID), JTI(0),
- II((const MipsInstrInfo *) tm.getInstrInfo()), TD(tm.getDataLayout()),
+ : MachineFunctionPass(ID), JTI(0), II(0), TD(0),
TM(tm), MCE(mce), MCPEs(0), MJTEs(0),
IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
diff --git a/lib/Target/Mips/MipsConstantIslandPass.cpp b/lib/Target/Mips/MipsConstantIslandPass.cpp
index 1951324..bda0167 100644
--- a/lib/Target/Mips/MipsConstantIslandPass.cpp
+++ b/lib/Target/Mips/MipsConstantIslandPass.cpp
@@ -50,7 +50,6 @@ namespace {
static char ID;
MipsConstantIslands(TargetMachine &tm)
: MachineFunctionPass(ID), TM(tm),
- TII(static_cast<const MipsInstrInfo*>(tm.getInstrInfo())),
IsPIC(TM.getRelocationModel() == Reloc::PIC_),
ABI(TM.getSubtarget<MipsSubtarget>().getTargetABI()) {}
@@ -61,13 +60,9 @@ namespace {
bool runOnMachineFunction(MachineFunction &F);
private:
-
-
const TargetMachine &TM;
- const MipsInstrInfo *TII;
bool IsPIC;
unsigned ABI;
-
};
char MipsConstantIslands::ID = 0;
diff --git a/lib/Target/Mips/MipsDSPInstrInfo.td b/lib/Target/Mips/MipsDSPInstrInfo.td
index 710b40d..c12878a 100644
--- a/lib/Target/Mips/MipsDSPInstrInfo.td
+++ b/lib/Target/Mips/MipsDSPInstrInfo.td
@@ -83,16 +83,12 @@ def MipsSETCC_DSP : MipsDSPBase<"SETCC_DSP", SDTSetCC>;
def MipsSELECT_CC_DSP : MipsDSPBase<"SELECT_CC_DSP", SDTSelectCC>;
// Flags.
-class UseAC {
- list<Register> Uses = [AC0];
+class Uses<list<Register> Regs> {
+ list<Register> Uses = Regs;
}
-class UseDSPCtrl {
- list<Register> Uses = [DSPCtrl];
-}
-
-class ClearDefs {
- list<Register> Defs = [];
+class Defs<list<Register> Regs> {
+ list<Register> Defs = Regs;
}
// Instruction encoding.
@@ -267,7 +263,6 @@ class ADDU_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
list<dag> Pattern = [(set RCD:$rd, (OpNode RCS:$rs, RCT:$rt))];
InstrItinClass Itinerary = itin;
- list<Register> Defs = [DSPCtrl];
}
class RADDU_W_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -278,7 +273,6 @@ class RADDU_W_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
string AsmString = !strconcat(instr_asm, "\t$rd, $rs");
list<dag> Pattern = [(set RCD:$rd, (OpNode RCS:$rs))];
InstrItinClass Itinerary = itin;
- list<Register> Defs = [DSPCtrl];
}
class CMP_EQ_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -289,7 +283,6 @@ class CMP_EQ_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
string AsmString = !strconcat(instr_asm, "\t$rs, $rt");
list<dag> Pattern = [(OpNode RCS:$rs, RCT:$rt)];
InstrItinClass Itinerary = itin;
- list<Register> Defs = [DSPCtrl];
}
class CMP_EQ_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -300,7 +293,6 @@ class CMP_EQ_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
list<dag> Pattern = [(set RCD:$rd, (OpNode RCS:$rs, RCT:$rt))];
InstrItinClass Itinerary = itin;
- list<Register> Defs = [DSPCtrl];
}
class PRECR_SRA_PH_W_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -311,7 +303,6 @@ class PRECR_SRA_PH_W_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa");
list<dag> Pattern = [(set RCT:$rt, (OpNode RCS:$src, RCS:$rs, immZExt5:$sa))];
InstrItinClass Itinerary = itin;
- list<Register> Defs = [DSPCtrl];
string Constraints = "$src = $rt";
}
@@ -323,7 +314,6 @@ class ABSQ_S_PH_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
string AsmString = !strconcat(instr_asm, "\t$rd, $rt");
list<dag> Pattern = [(set RCD:$rd, (OpNode RCT:$rt))];
InstrItinClass Itinerary = itin;
- list<Register> Defs = [DSPCtrl];
}
class REPL_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -333,7 +323,6 @@ class REPL_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
string AsmString = !strconcat(instr_asm, "\t$rd, $imm");
list<dag> Pattern = [(set RC:$rd, (OpNode immPat:$imm))];
InstrItinClass Itinerary = itin;
- list<Register> Defs = [DSPCtrl];
}
class SHLL_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -343,7 +332,6 @@ class SHLL_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs_sa");
list<dag> Pattern = [(set RC:$rd, (OpNode RC:$rt, CPURegs:$rs_sa))];
InstrItinClass Itinerary = itin;
- list<Register> Defs = [DSPCtrl];
}
class SHLL_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -354,7 +342,6 @@ class SHLL_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs_sa");
list<dag> Pattern = [(set RC:$rd, (OpNode RC:$rt, ImmPat:$rs_sa))];
InstrItinClass Itinerary = itin;
- list<Register> Defs = [DSPCtrl];
bit hasSideEffects = 1;
}
@@ -366,7 +353,6 @@ class LX_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
list<dag> Pattern = [(set CPURegs:$rd,
(OpNode CPURegs:$base, CPURegs:$index))];
InstrItinClass Itinerary = itin;
- list<Register> Defs = [DSPCtrl];
bit mayLoad = 1;
}
@@ -378,7 +364,6 @@ class ADDUH_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
list<dag> Pattern = [(set RCD:$rd, (OpNode RCS:$rs, RCT:$rt))];
InstrItinClass Itinerary = itin;
- list<Register> Defs = [DSPCtrl];
}
class APPEND_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -389,7 +374,6 @@ class APPEND_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
list<dag> Pattern = [(set CPURegs:$rt,
(OpNode CPURegs:$src, CPURegs:$rs, ImmOp:$sa))];
InstrItinClass Itinerary = itin;
- list<Register> Defs = [DSPCtrl];
string Constraints = "$src = $rt";
}
@@ -399,7 +383,6 @@ class EXTR_W_TY1_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
dag InOperandList = (ins ACRegsDSP:$ac, CPURegs:$shift_rs);
string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $shift_rs");
InstrItinClass Itinerary = itin;
- list<Register> Defs = [DSPCtrl];
}
class EXTR_W_TY1_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -408,7 +391,6 @@ class EXTR_W_TY1_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
dag InOperandList = (ins ACRegsDSP:$ac, uimm16:$shift_rs);
string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $shift_rs");
InstrItinClass Itinerary = itin;
- list<Register> Defs = [DSPCtrl];
}
class SHILO_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
@@ -417,7 +399,6 @@ class SHILO_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
string AsmString = !strconcat(instr_asm, "\t$ac, $shift");
list<dag> Pattern = [(set ACRegsDSP:$ac,
(OpNode immSExt6:$shift, ACRegsDSP:$acin))];
- list<Register> Defs = [DSPCtrl];
string Constraints = "$acin = $ac";
}
@@ -427,7 +408,6 @@ class SHILO_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
string AsmString = !strconcat(instr_asm, "\t$ac, $rs");
list<dag> Pattern = [(set ACRegsDSP:$ac,
(OpNode CPURegs:$rs, ACRegsDSP:$acin))];
- list<Register> Defs = [DSPCtrl];
string Constraints = "$acin = $ac";
}
@@ -437,7 +417,6 @@ class MTHLIP_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
string AsmString = !strconcat(instr_asm, "\t$rs, $ac");
list<dag> Pattern = [(set ACRegsDSP:$ac,
(OpNode CPURegs:$rs, ACRegsDSP:$acin))];
- list<Register> Uses = [DSPCtrl];
string Constraints = "$acin = $ac";
}
@@ -448,7 +427,6 @@ class RDDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
string AsmString = !strconcat(instr_asm, "\t$rd, $mask");
list<dag> Pattern = [(set CPURegs:$rd, (OpNode immZExt10:$mask))];
InstrItinClass Itinerary = itin;
- list<Register> Uses = [DSPCtrl];
}
class WRDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -458,7 +436,6 @@ class WRDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
string AsmString = !strconcat(instr_asm, "\t$rs, $mask");
list<dag> Pattern = [(OpNode CPURegs:$rs, immZExt10:$mask)];
InstrItinClass Itinerary = itin;
- list<Register> Defs = [DSPCtrl];
}
class DPA_W_PH_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
@@ -467,7 +444,6 @@ class DPA_W_PH_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
string AsmString = !strconcat(instr_asm, "\t$ac, $rs, $rt");
list<dag> Pattern = [(set ACRegsDSP:$ac,
(OpNode CPURegs:$rs, CPURegs:$rt, ACRegsDSP:$acin))];
- list<Register> Defs = [DSPCtrl];
string Constraints = "$acin = $ac";
}
@@ -510,7 +486,6 @@ class MTHI_DESC_BASE<string instr_asm, RegisterClass RC, InstrItinClass itin> {
class BPOSGE32_PSEUDO_DESC_BASE<SDPatternOperator OpNode, InstrItinClass itin> :
MipsPseudo<(outs CPURegs:$dst), (ins), [(set CPURegs:$dst, (OpNode))]> {
- list<Register> Uses = [DSPCtrl];
bit usesCustomInserter = 1;
}
@@ -519,7 +494,6 @@ class BPOSGE32_DESC_BASE<string instr_asm, InstrItinClass itin> {
dag InOperandList = (ins brtarget:$offset);
string AsmString = !strconcat(instr_asm, "\t$offset");
InstrItinClass Itinerary = itin;
- list<Register> Uses = [DSPCtrl];
bit isBranch = 1;
bit isTerminator = 1;
bit hasDelaySlot = 1;
@@ -532,7 +506,6 @@ class INSV_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
string AsmString = !strconcat(instr_asm, "\t$rt, $rs");
list<dag> Pattern = [(set CPURegs:$rt, (OpNode CPURegs:$src, CPURegs:$rs))];
InstrItinClass Itinerary = itin;
- list<Register> Uses = [DSPCtrl];
string Constraints = "$src = $rt";
}
@@ -542,177 +515,182 @@ class INSV_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
// Addition/subtraction
class ADDU_QB_DESC : ADDU_QB_DESC_BASE<"addu.qb", null_frag, NoItinerary,
- DSPRegs, DSPRegs>, IsCommutable;
+ DSPRegs, DSPRegs>, IsCommutable,
+ Defs<[DSPOutFlag20]>;
class ADDU_S_QB_DESC : ADDU_QB_DESC_BASE<"addu_s.qb", int_mips_addu_s_qb,
NoItinerary, DSPRegs, DSPRegs>,
- IsCommutable;
+ IsCommutable, Defs<[DSPOutFlag20]>;
class SUBU_QB_DESC : ADDU_QB_DESC_BASE<"subu.qb", null_frag, NoItinerary,
- DSPRegs, DSPRegs>;
+ DSPRegs, DSPRegs>,
+ Defs<[DSPOutFlag20]>;
class SUBU_S_QB_DESC : ADDU_QB_DESC_BASE<"subu_s.qb", int_mips_subu_s_qb,
- NoItinerary, DSPRegs, DSPRegs>;
+ NoItinerary, DSPRegs, DSPRegs>,
+ Defs<[DSPOutFlag20]>;
class ADDQ_PH_DESC : ADDU_QB_DESC_BASE<"addq.ph", null_frag, NoItinerary,
- DSPRegs, DSPRegs>, IsCommutable;
+ DSPRegs, DSPRegs>, IsCommutable,
+ Defs<[DSPOutFlag20]>;
class ADDQ_S_PH_DESC : ADDU_QB_DESC_BASE<"addq_s.ph", int_mips_addq_s_ph,
NoItinerary, DSPRegs, DSPRegs>,
- IsCommutable;
+ IsCommutable, Defs<[DSPOutFlag20]>;
class SUBQ_PH_DESC : ADDU_QB_DESC_BASE<"subq.ph", null_frag, NoItinerary,
- DSPRegs, DSPRegs>;
+ DSPRegs, DSPRegs>,
+ Defs<[DSPOutFlag20]>;
class SUBQ_S_PH_DESC : ADDU_QB_DESC_BASE<"subq_s.ph", int_mips_subq_s_ph,
- NoItinerary, DSPRegs, DSPRegs>;
+ NoItinerary, DSPRegs, DSPRegs>,
+ Defs<[DSPOutFlag20]>;
class ADDQ_S_W_DESC : ADDU_QB_DESC_BASE<"addq_s.w", int_mips_addq_s_w,
NoItinerary, CPURegs, CPURegs>,
- IsCommutable;
+ IsCommutable, Defs<[DSPOutFlag20]>;
class SUBQ_S_W_DESC : ADDU_QB_DESC_BASE<"subq_s.w", int_mips_subq_s_w,
- NoItinerary, CPURegs, CPURegs>;
+ NoItinerary, CPURegs, CPURegs>,
+ Defs<[DSPOutFlag20]>;
class ADDSC_DESC : ADDU_QB_DESC_BASE<"addsc", null_frag, NoItinerary,
- CPURegs, CPURegs>, IsCommutable;
+ CPURegs, CPURegs>, IsCommutable,
+ Defs<[DSPCarry]>;
class ADDWC_DESC : ADDU_QB_DESC_BASE<"addwc", null_frag, NoItinerary,
CPURegs, CPURegs>,
- IsCommutable, UseDSPCtrl;
+ IsCommutable, Uses<[DSPCarry]>, Defs<[DSPOutFlag20]>;
class MODSUB_DESC : ADDU_QB_DESC_BASE<"modsub", int_mips_modsub, NoItinerary,
- CPURegs, CPURegs>, ClearDefs;
+ CPURegs, CPURegs>;
class RADDU_W_QB_DESC : RADDU_W_QB_DESC_BASE<"raddu.w.qb", int_mips_raddu_w_qb,
- NoItinerary, CPURegs, DSPRegs>,
- ClearDefs;
+ NoItinerary, CPURegs, DSPRegs>;
// Absolute value
class ABSQ_S_PH_DESC : ABSQ_S_PH_R2_DESC_BASE<"absq_s.ph", int_mips_absq_s_ph,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPRegs>,
+ Defs<[DSPOutFlag20]>;
class ABSQ_S_W_DESC : ABSQ_S_PH_R2_DESC_BASE<"absq_s.w", int_mips_absq_s_w,
- NoItinerary, CPURegs>;
+ NoItinerary, CPURegs>,
+ Defs<[DSPOutFlag20]>;
// Precision reduce/expand
class PRECRQ_QB_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrq.qb.ph",
int_mips_precrq_qb_ph,
- NoItinerary, DSPRegs, DSPRegs>,
- ClearDefs;
+ NoItinerary, DSPRegs, DSPRegs>;
class PRECRQ_PH_W_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrq.ph.w",
int_mips_precrq_ph_w,
- NoItinerary, DSPRegs, CPURegs>,
- ClearDefs;
+ NoItinerary, DSPRegs, CPURegs>;
class PRECRQ_RS_PH_W_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrq_rs.ph.w",
int_mips_precrq_rs_ph_w,
NoItinerary, DSPRegs,
- CPURegs>;
+ CPURegs>,
+ Defs<[DSPOutFlag22]>;
class PRECRQU_S_QB_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrqu_s.qb.ph",
int_mips_precrqu_s_qb_ph,
NoItinerary, DSPRegs,
- DSPRegs>;
+ DSPRegs>,
+ Defs<[DSPOutFlag22]>;
class PRECEQ_W_PHL_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceq.w.phl",
int_mips_preceq_w_phl,
- NoItinerary, CPURegs, DSPRegs>,
- ClearDefs;
+ NoItinerary, CPURegs, DSPRegs>;
class PRECEQ_W_PHR_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceq.w.phr",
int_mips_preceq_w_phr,
- NoItinerary, CPURegs, DSPRegs>,
- ClearDefs;
+ NoItinerary, CPURegs, DSPRegs>;
class PRECEQU_PH_QBL_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbl",
int_mips_precequ_ph_qbl,
- NoItinerary, DSPRegs>,
- ClearDefs;
+ NoItinerary, DSPRegs>;
class PRECEQU_PH_QBR_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbr",
int_mips_precequ_ph_qbr,
- NoItinerary, DSPRegs>,
- ClearDefs;
+ NoItinerary, DSPRegs>;
class PRECEQU_PH_QBLA_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbla",
int_mips_precequ_ph_qbla,
- NoItinerary, DSPRegs>,
- ClearDefs;
+ NoItinerary, DSPRegs>;
class PRECEQU_PH_QBRA_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbra",
int_mips_precequ_ph_qbra,
- NoItinerary, DSPRegs>,
- ClearDefs;
+ NoItinerary, DSPRegs>;
class PRECEU_PH_QBL_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbl",
int_mips_preceu_ph_qbl,
- NoItinerary, DSPRegs>,
- ClearDefs;
+ NoItinerary, DSPRegs>;
class PRECEU_PH_QBR_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbr",
int_mips_preceu_ph_qbr,
- NoItinerary, DSPRegs>,
- ClearDefs;
+ NoItinerary, DSPRegs>;
class PRECEU_PH_QBLA_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbla",
int_mips_preceu_ph_qbla,
- NoItinerary, DSPRegs>,
- ClearDefs;
+ NoItinerary, DSPRegs>;
class PRECEU_PH_QBRA_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbra",
int_mips_preceu_ph_qbra,
- NoItinerary, DSPRegs>,
- ClearDefs;
+ NoItinerary, DSPRegs>;
// Shift
class SHLL_QB_DESC : SHLL_QB_R2_DESC_BASE<"shll.qb", null_frag, immZExt3,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPRegs>,
+ Defs<[DSPOutFlag22]>;
class SHLLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shllv.qb", int_mips_shll_qb,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPRegs>,
+ Defs<[DSPOutFlag22]>;
class SHRL_QB_DESC : SHLL_QB_R2_DESC_BASE<"shrl.qb", null_frag, immZExt3,
- NoItinerary, DSPRegs>, ClearDefs;
+ NoItinerary, DSPRegs>;
class SHRLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrlv.qb", int_mips_shrl_qb,
- NoItinerary, DSPRegs>, ClearDefs;
+ NoItinerary, DSPRegs>;
class SHLL_PH_DESC : SHLL_QB_R2_DESC_BASE<"shll.ph", null_frag, immZExt4,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPRegs>,
+ Defs<[DSPOutFlag22]>;
class SHLLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv.ph", int_mips_shll_ph,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPRegs>,
+ Defs<[DSPOutFlag22]>;
class SHLL_S_PH_DESC : SHLL_QB_R2_DESC_BASE<"shll_s.ph", int_mips_shll_s_ph,
- immZExt4, NoItinerary, DSPRegs>;
+ immZExt4, NoItinerary, DSPRegs>,
+ Defs<[DSPOutFlag22]>;
class SHLLV_S_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.ph", int_mips_shll_s_ph,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPRegs>,
+ Defs<[DSPOutFlag22]>;
class SHRA_PH_DESC : SHLL_QB_R2_DESC_BASE<"shra.ph", null_frag, immZExt4,
- NoItinerary, DSPRegs>, ClearDefs;
+ NoItinerary, DSPRegs>;
class SHRAV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrav.ph", int_mips_shra_ph,
- NoItinerary, DSPRegs>, ClearDefs;
+ NoItinerary, DSPRegs>;
class SHRA_R_PH_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.ph", int_mips_shra_r_ph,
- immZExt4, NoItinerary, DSPRegs>,
- ClearDefs;
+ immZExt4, NoItinerary, DSPRegs>;
class SHRAV_R_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.ph", int_mips_shra_r_ph,
- NoItinerary, DSPRegs>, ClearDefs;
+ NoItinerary, DSPRegs>;
class SHLL_S_W_DESC : SHLL_QB_R2_DESC_BASE<"shll_s.w", int_mips_shll_s_w,
- immZExt5, NoItinerary, CPURegs>;
+ immZExt5, NoItinerary, CPURegs>,
+ Defs<[DSPOutFlag22]>;
class SHLLV_S_W_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.w", int_mips_shll_s_w,
- NoItinerary, CPURegs>;
+ NoItinerary, CPURegs>,
+ Defs<[DSPOutFlag22]>;
class SHRA_R_W_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.w", int_mips_shra_r_w,
- immZExt5, NoItinerary, CPURegs>,
- ClearDefs;
+ immZExt5, NoItinerary, CPURegs>;
class SHRAV_R_W_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.w", int_mips_shra_r_w,
NoItinerary, CPURegs>;
@@ -720,36 +698,43 @@ class SHRAV_R_W_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.w", int_mips_shra_r_w,
// Multiplication
class MULEU_S_PH_QBL_DESC : ADDU_QB_DESC_BASE<"muleu_s.ph.qbl",
int_mips_muleu_s_ph_qbl,
- NoItinerary, DSPRegs, DSPRegs>;
+ NoItinerary, DSPRegs, DSPRegs>,
+ Defs<[DSPOutFlag21]>;
class MULEU_S_PH_QBR_DESC : ADDU_QB_DESC_BASE<"muleu_s.ph.qbr",
int_mips_muleu_s_ph_qbr,
- NoItinerary, DSPRegs, DSPRegs>;
+ NoItinerary, DSPRegs, DSPRegs>,
+ Defs<[DSPOutFlag21]>;
class MULEQ_S_W_PHL_DESC : ADDU_QB_DESC_BASE<"muleq_s.w.phl",
int_mips_muleq_s_w_phl,
NoItinerary, CPURegs, DSPRegs>,
- IsCommutable;
+ IsCommutable, Defs<[DSPOutFlag21]>;
class MULEQ_S_W_PHR_DESC : ADDU_QB_DESC_BASE<"muleq_s.w.phr",
int_mips_muleq_s_w_phr,
NoItinerary, CPURegs, DSPRegs>,
- IsCommutable;
+ IsCommutable, Defs<[DSPOutFlag21]>;
class MULQ_RS_PH_DESC : ADDU_QB_DESC_BASE<"mulq_rs.ph", int_mips_mulq_rs_ph,
NoItinerary, DSPRegs, DSPRegs>,
- IsCommutable;
+ IsCommutable, Defs<[DSPOutFlag21]>;
class MULSAQ_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"mulsaq_s.w.ph",
- MipsMULSAQ_S_W_PH>;
+ MipsMULSAQ_S_W_PH>,
+ Defs<[DSPOutFlag16_19]>;
-class MAQ_S_W_PHL_DESC : DPA_W_PH_DESC_BASE<"maq_s.w.phl", MipsMAQ_S_W_PHL>;
+class MAQ_S_W_PHL_DESC : DPA_W_PH_DESC_BASE<"maq_s.w.phl", MipsMAQ_S_W_PHL>,
+ Defs<[DSPOutFlag16_19]>;
-class MAQ_S_W_PHR_DESC : DPA_W_PH_DESC_BASE<"maq_s.w.phr", MipsMAQ_S_W_PHR>;
+class MAQ_S_W_PHR_DESC : DPA_W_PH_DESC_BASE<"maq_s.w.phr", MipsMAQ_S_W_PHR>,
+ Defs<[DSPOutFlag16_19]>;
-class MAQ_SA_W_PHL_DESC : DPA_W_PH_DESC_BASE<"maq_sa.w.phl", MipsMAQ_SA_W_PHL>;
+class MAQ_SA_W_PHL_DESC : DPA_W_PH_DESC_BASE<"maq_sa.w.phl", MipsMAQ_SA_W_PHL>,
+ Defs<[DSPOutFlag16_19]>;
-class MAQ_SA_W_PHR_DESC : DPA_W_PH_DESC_BASE<"maq_sa.w.phr", MipsMAQ_SA_W_PHR>;
+class MAQ_SA_W_PHR_DESC : DPA_W_PH_DESC_BASE<"maq_sa.w.phr", MipsMAQ_SA_W_PHR>,
+ Defs<[DSPOutFlag16_19]>;
// Move from/to hi/lo.
class MFHI_DESC : MFHI_DESC_BASE<"mfhi", HIRegsDSP, NoItinerary>;
@@ -766,13 +751,17 @@ class DPSU_H_QBL_DESC : DPA_W_PH_DESC_BASE<"dpsu.h.qbl", MipsDPSU_H_QBL>;
class DPSU_H_QBR_DESC : DPA_W_PH_DESC_BASE<"dpsu.h.qbr", MipsDPSU_H_QBR>;
-class DPAQ_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpaq_s.w.ph", MipsDPAQ_S_W_PH>;
+class DPAQ_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpaq_s.w.ph", MipsDPAQ_S_W_PH>,
+ Defs<[DSPOutFlag16_19]>;
-class DPSQ_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpsq_s.w.ph", MipsDPSQ_S_W_PH>;
+class DPSQ_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpsq_s.w.ph", MipsDPSQ_S_W_PH>,
+ Defs<[DSPOutFlag16_19]>;
-class DPAQ_SA_L_W_DESC : DPA_W_PH_DESC_BASE<"dpaq_sa.l.w", MipsDPAQ_SA_L_W>;
+class DPAQ_SA_L_W_DESC : DPA_W_PH_DESC_BASE<"dpaq_sa.l.w", MipsDPAQ_SA_L_W>,
+ Defs<[DSPOutFlag16_19]>;
-class DPSQ_SA_L_W_DESC : DPA_W_PH_DESC_BASE<"dpsq_sa.l.w", MipsDPSQ_SA_L_W>;
+class DPSQ_SA_L_W_DESC : DPA_W_PH_DESC_BASE<"dpsq_sa.l.w", MipsDPSQ_SA_L_W>,
+ Defs<[DSPOutFlag16_19]>;
class MULT_DSP_DESC : MULT_DESC_BASE<"mult", MipsMult, NoItinerary>;
class MULTU_DSP_DESC : MULT_DESC_BASE<"multu", MipsMultu, NoItinerary>;
@@ -784,15 +773,16 @@ class MSUBU_DSP_DESC : MADD_DESC_BASE<"msubu", MipsMSubu, NoItinerary>;
// Comparison
class CMPU_EQ_QB_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmpu.eq.qb",
int_mips_cmpu_eq_qb, NoItinerary,
- DSPRegs>, IsCommutable;
+ DSPRegs>,
+ IsCommutable, Defs<[DSPCCond]>;
class CMPU_LT_QB_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmpu.lt.qb",
int_mips_cmpu_lt_qb, NoItinerary,
- DSPRegs>;
+ DSPRegs>, Defs<[DSPCCond]>;
class CMPU_LE_QB_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmpu.le.qb",
int_mips_cmpu_le_qb, NoItinerary,
- DSPRegs>;
+ DSPRegs>, Defs<[DSPCCond]>;
class CMPGU_EQ_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgu.eq.qb",
int_mips_cmpgu_eq_qb,
@@ -809,208 +799,227 @@ class CMPGU_LE_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgu.le.qb",
class CMP_EQ_PH_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmp.eq.ph", int_mips_cmp_eq_ph,
NoItinerary, DSPRegs>,
- IsCommutable;
+ IsCommutable, Defs<[DSPCCond]>;
class CMP_LT_PH_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmp.lt.ph", int_mips_cmp_lt_ph,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPRegs>,
+ Defs<[DSPCCond]>;
class CMP_LE_PH_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmp.le.ph", int_mips_cmp_le_ph,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPRegs>,
+ Defs<[DSPCCond]>;
// Misc
class BITREV_DESC : ABSQ_S_PH_R2_DESC_BASE<"bitrev", int_mips_bitrev,
- NoItinerary, CPURegs>, ClearDefs;
+ NoItinerary, CPURegs>;
class PACKRL_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"packrl.ph", int_mips_packrl_ph,
- NoItinerary, DSPRegs, DSPRegs>,
- ClearDefs;
+ NoItinerary, DSPRegs, DSPRegs>;
class REPL_QB_DESC : REPL_DESC_BASE<"repl.qb", int_mips_repl_qb, immZExt8,
- NoItinerary, DSPRegs>, ClearDefs;
+ NoItinerary, DSPRegs>;
class REPL_PH_DESC : REPL_DESC_BASE<"repl.ph", int_mips_repl_ph, immZExt10,
- NoItinerary, DSPRegs>, ClearDefs;
+ NoItinerary, DSPRegs>;
class REPLV_QB_DESC : ABSQ_S_PH_R2_DESC_BASE<"replv.qb", int_mips_repl_qb,
- NoItinerary, DSPRegs, CPURegs>,
- ClearDefs;
+ NoItinerary, DSPRegs, CPURegs>;
class REPLV_PH_DESC : ABSQ_S_PH_R2_DESC_BASE<"replv.ph", int_mips_repl_ph,
- NoItinerary, DSPRegs, CPURegs>,
- ClearDefs;
+ NoItinerary, DSPRegs, CPURegs>;
class PICK_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"pick.qb", int_mips_pick_qb,
NoItinerary, DSPRegs, DSPRegs>,
- ClearDefs, UseDSPCtrl;
+ Uses<[DSPCCond]>;
class PICK_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"pick.ph", int_mips_pick_ph,
NoItinerary, DSPRegs, DSPRegs>,
- ClearDefs, UseDSPCtrl;
+ Uses<[DSPCCond]>;
-class LWX_DESC : LX_DESC_BASE<"lwx", int_mips_lwx, NoItinerary>, ClearDefs;
+class LWX_DESC : LX_DESC_BASE<"lwx", int_mips_lwx, NoItinerary>;
-class LHX_DESC : LX_DESC_BASE<"lhx", int_mips_lhx, NoItinerary>, ClearDefs;
+class LHX_DESC : LX_DESC_BASE<"lhx", int_mips_lhx, NoItinerary>;
-class LBUX_DESC : LX_DESC_BASE<"lbux", int_mips_lbux, NoItinerary>, ClearDefs;
+class LBUX_DESC : LX_DESC_BASE<"lbux", int_mips_lbux, NoItinerary>;
class BPOSGE32_DESC : BPOSGE32_DESC_BASE<"bposge32", NoItinerary>;
// Extr
-class EXTP_DESC : EXTR_W_TY1_R1_DESC_BASE<"extp", MipsEXTP, NoItinerary>;
+class EXTP_DESC : EXTR_W_TY1_R1_DESC_BASE<"extp", MipsEXTP, NoItinerary>,
+ Uses<[DSPPos]>, Defs<[DSPEFI]>;
-class EXTPV_DESC : EXTR_W_TY1_R2_DESC_BASE<"extpv", MipsEXTP, NoItinerary>;
+class EXTPV_DESC : EXTR_W_TY1_R2_DESC_BASE<"extpv", MipsEXTP, NoItinerary>,
+ Uses<[DSPPos]>, Defs<[DSPEFI]>;
-class EXTPDP_DESC : EXTR_W_TY1_R1_DESC_BASE<"extpdp", MipsEXTPDP, NoItinerary>;
+class EXTPDP_DESC : EXTR_W_TY1_R1_DESC_BASE<"extpdp", MipsEXTPDP, NoItinerary>,
+ Uses<[DSPPos]>, Defs<[DSPPos, DSPEFI]>;
class EXTPDPV_DESC : EXTR_W_TY1_R2_DESC_BASE<"extpdpv", MipsEXTPDP,
- NoItinerary>;
+ NoItinerary>,
+ Uses<[DSPPos]>, Defs<[DSPPos, DSPEFI]>;
-class EXTR_W_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr.w", MipsEXTR_W, NoItinerary>;
+class EXTR_W_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr.w", MipsEXTR_W, NoItinerary>,
+ Defs<[DSPOutFlag23]>;
class EXTRV_W_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv.w", MipsEXTR_W,
- NoItinerary>;
+ NoItinerary>, Defs<[DSPOutFlag23]>;
class EXTR_R_W_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr_r.w", MipsEXTR_R_W,
- NoItinerary>;
+ NoItinerary>,
+ Defs<[DSPOutFlag23]>;
class EXTRV_R_W_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv_r.w", MipsEXTR_R_W,
- NoItinerary>;
+ NoItinerary>,
+ Defs<[DSPOutFlag23]>;
class EXTR_RS_W_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr_rs.w", MipsEXTR_RS_W,
- NoItinerary>;
+ NoItinerary>,
+ Defs<[DSPOutFlag23]>;
class EXTRV_RS_W_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv_rs.w", MipsEXTR_RS_W,
- NoItinerary>;
+ NoItinerary>,
+ Defs<[DSPOutFlag23]>;
class EXTR_S_H_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr_s.h", MipsEXTR_S_H,
- NoItinerary>;
+ NoItinerary>,
+ Defs<[DSPOutFlag23]>;
class EXTRV_S_H_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv_s.h", MipsEXTR_S_H,
- NoItinerary>;
+ NoItinerary>,
+ Defs<[DSPOutFlag23]>;
class SHILO_DESC : SHILO_R1_DESC_BASE<"shilo", MipsSHILO>;
class SHILOV_DESC : SHILO_R2_DESC_BASE<"shilov", MipsSHILO>;
-class MTHLIP_DESC : MTHLIP_DESC_BASE<"mthlip", MipsMTHLIP>;
+class MTHLIP_DESC : MTHLIP_DESC_BASE<"mthlip", MipsMTHLIP>, Defs<[DSPPos]>;
class RDDSP_DESC : RDDSP_DESC_BASE<"rddsp", int_mips_rddsp, NoItinerary>;
class WRDSP_DESC : WRDSP_DESC_BASE<"wrdsp", int_mips_wrdsp, NoItinerary>;
-class INSV_DESC : INSV_DESC_BASE<"insv", int_mips_insv, NoItinerary>;
+class INSV_DESC : INSV_DESC_BASE<"insv", int_mips_insv, NoItinerary>,
+ Uses<[DSPPos, DSPSCount]>;
//===----------------------------------------------------------------------===//
// MIPS DSP Rev 2
// Addition/subtraction
class ADDU_PH_DESC : ADDU_QB_DESC_BASE<"addu.ph", int_mips_addu_ph, NoItinerary,
- DSPRegs, DSPRegs>, IsCommutable;
+ DSPRegs, DSPRegs>, IsCommutable,
+ Defs<[DSPOutFlag20]>;
class ADDU_S_PH_DESC : ADDU_QB_DESC_BASE<"addu_s.ph", int_mips_addu_s_ph,
NoItinerary, DSPRegs, DSPRegs>,
- IsCommutable;
+ IsCommutable, Defs<[DSPOutFlag20]>;
class SUBU_PH_DESC : ADDU_QB_DESC_BASE<"subu.ph", int_mips_subu_ph, NoItinerary,
- DSPRegs, DSPRegs>;
+ DSPRegs, DSPRegs>,
+ Defs<[DSPOutFlag20]>;
class SUBU_S_PH_DESC : ADDU_QB_DESC_BASE<"subu_s.ph", int_mips_subu_s_ph,
- NoItinerary, DSPRegs, DSPRegs>;
+ NoItinerary, DSPRegs, DSPRegs>,
+ Defs<[DSPOutFlag20]>;
class ADDUH_QB_DESC : ADDUH_QB_DESC_BASE<"adduh.qb", int_mips_adduh_qb,
- NoItinerary, DSPRegs>,
- ClearDefs, IsCommutable;
+ NoItinerary, DSPRegs>, IsCommutable;
class ADDUH_R_QB_DESC : ADDUH_QB_DESC_BASE<"adduh_r.qb", int_mips_adduh_r_qb,
- NoItinerary, DSPRegs>,
- ClearDefs, IsCommutable;
+ NoItinerary, DSPRegs>, IsCommutable;
class SUBUH_QB_DESC : ADDUH_QB_DESC_BASE<"subuh.qb", int_mips_subuh_qb,
- NoItinerary, DSPRegs>, ClearDefs;
+ NoItinerary, DSPRegs>;
class SUBUH_R_QB_DESC : ADDUH_QB_DESC_BASE<"subuh_r.qb", int_mips_subuh_r_qb,
- NoItinerary, DSPRegs>, ClearDefs;
+ NoItinerary, DSPRegs>;
class ADDQH_PH_DESC : ADDUH_QB_DESC_BASE<"addqh.ph", int_mips_addqh_ph,
- NoItinerary, DSPRegs>,
- ClearDefs, IsCommutable;
+ NoItinerary, DSPRegs>, IsCommutable;
class ADDQH_R_PH_DESC : ADDUH_QB_DESC_BASE<"addqh_r.ph", int_mips_addqh_r_ph,
- NoItinerary, DSPRegs>,
- ClearDefs, IsCommutable;
+ NoItinerary, DSPRegs>, IsCommutable;
class SUBQH_PH_DESC : ADDUH_QB_DESC_BASE<"subqh.ph", int_mips_subqh_ph,
- NoItinerary, DSPRegs>, ClearDefs;
+ NoItinerary, DSPRegs>;
class SUBQH_R_PH_DESC : ADDUH_QB_DESC_BASE<"subqh_r.ph", int_mips_subqh_r_ph,
- NoItinerary, DSPRegs>, ClearDefs;
+ NoItinerary, DSPRegs>;
class ADDQH_W_DESC : ADDUH_QB_DESC_BASE<"addqh.w", int_mips_addqh_w,
- NoItinerary, CPURegs>,
- ClearDefs, IsCommutable;
+ NoItinerary, CPURegs>, IsCommutable;
class ADDQH_R_W_DESC : ADDUH_QB_DESC_BASE<"addqh_r.w", int_mips_addqh_r_w,
- NoItinerary, CPURegs>,
- ClearDefs, IsCommutable;
+ NoItinerary, CPURegs>, IsCommutable;
class SUBQH_W_DESC : ADDUH_QB_DESC_BASE<"subqh.w", int_mips_subqh_w,
- NoItinerary, CPURegs>, ClearDefs;
+ NoItinerary, CPURegs>;
class SUBQH_R_W_DESC : ADDUH_QB_DESC_BASE<"subqh_r.w", int_mips_subqh_r_w,
- NoItinerary, CPURegs>, ClearDefs;
+ NoItinerary, CPURegs>;
// Comparison
class CMPGDU_EQ_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgdu.eq.qb",
int_mips_cmpgdu_eq_qb,
NoItinerary, CPURegs, DSPRegs>,
- IsCommutable;
+ IsCommutable, Defs<[DSPCCond]>;
class CMPGDU_LT_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgdu.lt.qb",
int_mips_cmpgdu_lt_qb,
- NoItinerary, CPURegs, DSPRegs>;
+ NoItinerary, CPURegs, DSPRegs>,
+ Defs<[DSPCCond]>;
class CMPGDU_LE_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgdu.le.qb",
int_mips_cmpgdu_le_qb,
- NoItinerary, CPURegs, DSPRegs>;
+ NoItinerary, CPURegs, DSPRegs>,
+ Defs<[DSPCCond]>;
// Absolute
class ABSQ_S_QB_DESC : ABSQ_S_PH_R2_DESC_BASE<"absq_s.qb", int_mips_absq_s_qb,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPRegs>,
+ Defs<[DSPOutFlag20]>;
// Multiplication
class MUL_PH_DESC : ADDUH_QB_DESC_BASE<"mul.ph", null_frag, NoItinerary,
- DSPRegs>, IsCommutable;
+ DSPRegs>, IsCommutable,
+ Defs<[DSPOutFlag21]>;
class MUL_S_PH_DESC : ADDUH_QB_DESC_BASE<"mul_s.ph", int_mips_mul_s_ph,
- NoItinerary, DSPRegs>, IsCommutable;
+ NoItinerary, DSPRegs>, IsCommutable,
+ Defs<[DSPOutFlag21]>;
class MULQ_S_W_DESC : ADDUH_QB_DESC_BASE<"mulq_s.w", int_mips_mulq_s_w,
- NoItinerary, CPURegs>, IsCommutable;
+ NoItinerary, CPURegs>, IsCommutable,
+ Defs<[DSPOutFlag21]>;
class MULQ_RS_W_DESC : ADDUH_QB_DESC_BASE<"mulq_rs.w", int_mips_mulq_rs_w,
- NoItinerary, CPURegs>, IsCommutable;
+ NoItinerary, CPURegs>, IsCommutable,
+ Defs<[DSPOutFlag21]>;
class MULQ_S_PH_DESC : ADDU_QB_DESC_BASE<"mulq_s.ph", int_mips_mulq_s_ph,
NoItinerary, DSPRegs, DSPRegs>,
- IsCommutable;
+ IsCommutable, Defs<[DSPOutFlag21]>;
// Dot product with accumulate/subtract
class DPA_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpa.w.ph", MipsDPA_W_PH>;
class DPS_W_PH_DESC : DPA_W_PH_DESC_BASE<"dps.w.ph", MipsDPS_W_PH>;
-class DPAQX_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpaqx_s.w.ph", MipsDPAQX_S_W_PH>;
+class DPAQX_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpaqx_s.w.ph", MipsDPAQX_S_W_PH>,
+ Defs<[DSPOutFlag16_19]>;
class DPAQX_SA_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpaqx_sa.w.ph",
- MipsDPAQX_SA_W_PH>;
+ MipsDPAQX_SA_W_PH>,
+ Defs<[DSPOutFlag16_19]>;
class DPAX_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpax.w.ph", MipsDPAX_W_PH>;
class DPSX_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpsx.w.ph", MipsDPSX_W_PH>;
-class DPSQX_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpsqx_s.w.ph", MipsDPSQX_S_W_PH>;
+class DPSQX_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpsqx_s.w.ph", MipsDPSQX_S_W_PH>,
+ Defs<[DSPOutFlag16_19]>;
class DPSQX_SA_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpsqx_sa.w.ph",
- MipsDPSQX_SA_W_PH>;
+ MipsDPSQX_SA_W_PH>,
+ Defs<[DSPOutFlag16_19]>;
class MULSA_W_PH_DESC : DPA_W_PH_DESC_BASE<"mulsa.w.ph", MipsMULSA_W_PH>;
@@ -1022,45 +1031,45 @@ class PRECR_QB_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"precr.qb.ph",
class PRECR_SRA_PH_W_DESC : PRECR_SRA_PH_W_DESC_BASE<"precr_sra.ph.w",
int_mips_precr_sra_ph_w,
NoItinerary, DSPRegs,
- CPURegs>, ClearDefs;
+ CPURegs>;
class PRECR_SRA_R_PH_W_DESC : PRECR_SRA_PH_W_DESC_BASE<"precr_sra_r.ph.w",
int_mips_precr_sra_r_ph_w,
NoItinerary, DSPRegs,
- CPURegs>, ClearDefs;
+ CPURegs>;
// Shift
class SHRA_QB_DESC : SHLL_QB_R2_DESC_BASE<"shra.qb", null_frag, immZExt3,
- NoItinerary, DSPRegs>, ClearDefs;
+ NoItinerary, DSPRegs>;
class SHRAV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrav.qb", int_mips_shra_qb,
- NoItinerary, DSPRegs>, ClearDefs;
+ NoItinerary, DSPRegs>;
class SHRA_R_QB_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.qb", int_mips_shra_r_qb,
- immZExt3, NoItinerary, DSPRegs>,
- ClearDefs;
+ immZExt3, NoItinerary, DSPRegs>;
class SHRAV_R_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.qb", int_mips_shra_r_qb,
- NoItinerary, DSPRegs>, ClearDefs;
+ NoItinerary, DSPRegs>;
class SHRL_PH_DESC : SHLL_QB_R2_DESC_BASE<"shrl.ph", null_frag, immZExt4,
- NoItinerary, DSPRegs>, ClearDefs;
+ NoItinerary, DSPRegs>;
class SHRLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrlv.ph", int_mips_shrl_ph,
- NoItinerary, DSPRegs>, ClearDefs;
+ NoItinerary, DSPRegs>;
// Misc
class APPEND_DESC : APPEND_DESC_BASE<"append", int_mips_append, immZExt5,
- NoItinerary>, ClearDefs;
+ NoItinerary>;
class BALIGN_DESC : APPEND_DESC_BASE<"balign", int_mips_balign, immZExt2,
- NoItinerary>, ClearDefs;
+ NoItinerary>;
class PREPEND_DESC : APPEND_DESC_BASE<"prepend", int_mips_prepend, immZExt5,
- NoItinerary>, ClearDefs;
+ NoItinerary>;
// Pseudos.
-def BPOSGE32_PSEUDO : BPOSGE32_PSEUDO_DESC_BASE<int_mips_bposge32, NoItinerary>;
+def BPOSGE32_PSEUDO : BPOSGE32_PSEUDO_DESC_BASE<int_mips_bposge32,
+ NoItinerary>, Uses<[DSPPos]>;
// Instruction defs.
// MIPS DSP Rev 1
@@ -1231,10 +1240,14 @@ def PREPEND : PREPEND_ENC, PREPEND_DESC;
}
// Pseudos.
-/// Pseudo instructions for loading and storing accumulator registers.
let isPseudo = 1 in {
+ // Pseudo instructions for loading and storing accumulator registers.
defm LOAD_AC_DSP : LoadM<"load_ac_dsp", ACRegsDSP>;
defm STORE_AC_DSP : StoreM<"store_ac_dsp", ACRegsDSP>;
+
+ // Pseudos for loading and storing ccond field of DSP control register.
+ defm LOAD_CCOND_DSP : LoadM<"load_ccond_dsp", DSPCC>;
+ defm STORE_CCOND_DSP : StoreM<"store_ccond_dsp", DSPCC>;
}
// Pseudo CMP and PICK instructions.
diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp
index d07a595..928a43d 100644
--- a/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -177,7 +177,7 @@ namespace {
class Filler : public MachineFunctionPass {
public:
Filler(TargetMachine &tm)
- : MachineFunctionPass(ID), TM(tm), TII(tm.getInstrInfo()) { }
+ : MachineFunctionPass(ID), TM(tm) { }
virtual const char *getPassName() const {
return "Mips Delay Slot Filler";
@@ -243,7 +243,6 @@ namespace {
bool terminateSearch(const MachineInstr &Candidate) const;
TargetMachine &TM;
- const TargetInstrInfo *TII;
static char ID;
};
@@ -514,6 +513,8 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
}
// Bundle the NOP to the instruction with the delay slot.
+ const MipsInstrInfo *TII =
+ static_cast<const MipsInstrInfo*>(TM.getInstrInfo());
BuildMI(MBB, llvm::next(I), I->getDebugLoc(), TII->get(Mips::NOP));
MIBundleBuilder(MBB, I, llvm::next(llvm::next(I)));
}
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp
index 968e536..a1de174 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -57,7 +57,7 @@ bool MipsDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
/// GOT address into a register.
SDNode *MipsDAGToDAGISel::getGlobalBaseReg() {
unsigned GlobalBaseReg = MF->getInfo<MipsFunctionInfo>()->getGlobalBaseReg();
- return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode();
+ return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy()).getNode();
}
/// ComplexPattern used on MipsInstrInfo
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index 4d76181..6351073 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -43,6 +43,11 @@ static cl::opt<bool>
LargeGOT("mxgot", cl::Hidden,
cl::desc("MIPS: Enable GOT larger than 64k."), cl::init(false));
+static cl::opt<bool>
+NoZeroDivCheck("mno-check-zero-division", cl::Hidden,
+ cl::desc("MIPS: Don't trap on integer division by zero."),
+ cl::init(false));
+
static const uint16_t O32IntRegs[4] = {
Mips::A0, Mips::A1, Mips::A2, Mips::A3
};
@@ -65,7 +70,7 @@ static bool isShiftedMask(uint64_t I, uint64_t &Pos, uint64_t &Size) {
return false;
Size = CountPopulation_64(I);
- Pos = CountTrailingZeros_64(I);
+ Pos = countTrailingZeros(I);
return true;
}
@@ -78,7 +83,7 @@ static SDValue getTargetNode(SDValue Op, SelectionDAG &DAG, unsigned Flag) {
EVT Ty = Op.getValueType();
if (GlobalAddressSDNode *N = dyn_cast<GlobalAddressSDNode>(Op))
- return DAG.getTargetGlobalAddress(N->getGlobal(), Op.getDebugLoc(), Ty, 0,
+ return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(Op), Ty, 0,
Flag);
if (ExternalSymbolSDNode *N = dyn_cast<ExternalSymbolSDNode>(Op))
return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
@@ -95,7 +100,7 @@ static SDValue getTargetNode(SDValue Op, SelectionDAG &DAG, unsigned Flag) {
}
static SDValue getAddrNonPIC(SDValue Op, SelectionDAG &DAG) {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
EVT Ty = Op.getValueType();
SDValue Hi = getTargetNode(Op, DAG, MipsII::MO_ABS_HI);
SDValue Lo = getTargetNode(Op, DAG, MipsII::MO_ABS_LO);
@@ -106,7 +111,7 @@ static SDValue getAddrNonPIC(SDValue Op, SelectionDAG &DAG) {
SDValue MipsTargetLowering::getAddrLocal(SDValue Op, SelectionDAG &DAG,
bool HasMips64) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
EVT Ty = Op.getValueType();
unsigned GOTFlag = HasMips64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT;
SDValue GOT = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
@@ -121,7 +126,7 @@ SDValue MipsTargetLowering::getAddrLocal(SDValue Op, SelectionDAG &DAG,
SDValue MipsTargetLowering::getAddrGlobal(SDValue Op, SelectionDAG &DAG,
unsigned Flag) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
EVT Ty = Op.getValueType();
SDValue Tgt = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
getTargetNode(Op, DAG, Flag));
@@ -132,7 +137,7 @@ SDValue MipsTargetLowering::getAddrGlobal(SDValue Op, SelectionDAG &DAG,
SDValue MipsTargetLowering::getAddrGlobalLargeGOT(SDValue Op, SelectionDAG &DAG,
unsigned HiFlag,
unsigned LoFlag) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
EVT Ty = Op.getValueType();
SDValue Hi = DAG.getNode(MipsISD::Hi, DL, Ty, getTargetNode(Op, DAG, HiFlag));
Hi = DAG.getNode(ISD::ADD, DL, Ty, Hi, getGlobalReg(DAG, Ty));
@@ -156,7 +161,7 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
case MipsISD::FPCmp: return "MipsISD::FPCmp";
case MipsISD::CMovFP_T: return "MipsISD::CMovFP_T";
case MipsISD::CMovFP_F: return "MipsISD::CMovFP_F";
- case MipsISD::FPRound: return "MipsISD::FPRound";
+ case MipsISD::TruncIntFP: return "MipsISD::TruncIntFP";
case MipsISD::ExtractLOHI: return "MipsISD::ExtractLOHI";
case MipsISD::InsertLOHI: return "MipsISD::InsertLOHI";
case MipsISD::Mult: return "MipsISD::Mult";
@@ -250,6 +255,7 @@ MipsTargetLowering(MipsTargetMachine &TM)
setOperationAction(ISD::VASTART, MVT::Other, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
if (!TM.Options.NoNaNsFPMath) {
setOperationAction(ISD::FABS, MVT::f32, Custom);
@@ -265,6 +271,7 @@ MipsTargetLowering(MipsTargetMachine &TM)
setOperationAction(ISD::SELECT, MVT::i64, Custom);
setOperationAction(ISD::LOAD, MVT::i64, Custom);
setOperationAction(ISD::STORE, MVT::i64, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
}
if (!HasMips64) {
@@ -407,7 +414,7 @@ const MipsTargetLowering *MipsTargetLowering::create(MipsTargetMachine &TM) {
return llvm::createMipsSETargetLowering(TM);
}
-EVT MipsTargetLowering::getSetCCResultType(EVT VT) const {
+EVT MipsTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
if (!VT.isVector())
return MVT::i32;
return VT.changeVectorElementTypeToInteger();
@@ -424,7 +431,7 @@ static SDValue performDivRemCombine(SDNode *N, SelectionDAG &DAG,
unsigned HI = (Ty == MVT::i32) ? Mips::HI : Mips::HI64;
unsigned Opc = N->getOpcode() == ISD::SDIVREM ? MipsISD::DivRem16 :
MipsISD::DivRemU16;
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
SDValue DivRem = DAG.getNode(Opc, DL, MVT::Glue,
N->getOperand(0), N->getOperand(1));
@@ -502,7 +509,7 @@ static SDValue createFPCmp(SelectionDAG &DAG, const SDValue &Op) {
return Op;
SDValue RHS = Op.getOperand(1);
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
// Assume the 3rd operand is a CondCodeSDNode. Add code to check the type of
// node if necessary.
@@ -514,7 +521,7 @@ static SDValue createFPCmp(SelectionDAG &DAG, const SDValue &Op) {
// Creates and returns a CMovFPT/F node.
static SDValue createCMovFP(SelectionDAG &DAG, SDValue Cond, SDValue True,
- SDValue False, DebugLoc DL) {
+ SDValue False, SDLoc DL) {
ConstantSDNode *CC = cast<ConstantSDNode>(Cond.getOperand(2));
bool invert = invertFPCondCodeUser((Mips::CondCode)CC->getSExtValue());
@@ -545,7 +552,7 @@ static SDValue performSELECTCombine(SDNode *N, SelectionDAG &DAG,
if (!CN || CN->getZExtValue())
return SDValue();
- const DebugLoc DL = N->getDebugLoc();
+ const SDLoc DL(N);
ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
SDValue True = N->getOperand(1);
@@ -590,7 +597,7 @@ static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG,
if (SMPos != 0 || Pos + SMSize > ValTy.getSizeInBits())
return SDValue();
- return DAG.getNode(MipsISD::Ext, N->getDebugLoc(), ValTy,
+ return DAG.getNode(MipsISD::Ext, SDLoc(N), ValTy,
ShiftRight.getOperand(0), DAG.getConstant(Pos, MVT::i32),
DAG.getConstant(SMSize, MVT::i32));
}
@@ -644,7 +651,7 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
if ((Shamt != SMPos0) || (SMPos0 + SMSize0 > ValTy.getSizeInBits()))
return SDValue();
- return DAG.getNode(MipsISD::Ins, N->getDebugLoc(), ValTy, Shl.getOperand(0),
+ return DAG.getNode(MipsISD::Ins, SDLoc(N), ValTy, Shl.getOperand(0),
DAG.getConstant(SMPos0, MVT::i32),
DAG.getConstant(SMSize0, MVT::i32), And0.getOperand(0));
}
@@ -669,7 +676,7 @@ static SDValue performADDCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
EVT ValTy = N->getValueType(0);
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
SDValue Add1 = DAG.getNode(ISD::ADD, DL, ValTy, N->getOperand(0),
Add.getOperand(0));
@@ -744,6 +751,7 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const
case ISD::LOAD: return lowerLOAD(Op, DAG);
case ISD::STORE: return lowerSTORE(Op, DAG);
case ISD::ADD: return lowerADD(Op, DAG);
+ case ISD::FP_TO_SINT: return lowerFP_TO_SINT(Op, DAG);
}
return SDValue();
}
@@ -763,6 +771,26 @@ addLiveIn(MachineFunction &MF, unsigned PReg, const TargetRegisterClass *RC)
return VReg;
}
+static MachineBasicBlock *expandPseudoDIV(MachineInstr *MI,
+ MachineBasicBlock &MBB,
+ const TargetInstrInfo &TII,
+ bool Is64Bit) {
+ if (NoZeroDivCheck)
+ return &MBB;
+
+ // Insert instruction "teq $divisor_reg, $zero, 7".
+ MachineBasicBlock::iterator I(MI);
+ MachineInstrBuilder MIB;
+ MIB = BuildMI(MBB, llvm::next(I), MI->getDebugLoc(), TII.get(Mips::TEQ))
+ .addOperand(MI->getOperand(2)).addReg(Mips::ZERO).addImm(7);
+
+ // Use the 32-bit sub-register if this is a 64-bit division.
+ if (Is64Bit)
+ MIB->getOperand(0).setSubReg(Mips::sub_32);
+
+ return &MBB;
+}
+
MachineBasicBlock *
MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MachineBasicBlock *BB) const {
@@ -872,6 +900,12 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case Mips::ATOMIC_CMP_SWAP_I64:
case Mips::ATOMIC_CMP_SWAP_I64_P8:
return emitAtomicCmpSwap(MI, BB, 8);
+ case Mips::PseudoSDIV:
+ case Mips::PseudoUDIV:
+ return expandPseudoDIV(MI, *BB, *getTargetMachine().getInstrInfo(), false);
+ case Mips::PseudoDSDIV:
+ case Mips::PseudoDUDIV:
+ return expandPseudoDIV(MI, *BB, *getTargetMachine().getInstrInfo(), true);
}
}
@@ -1039,7 +1073,14 @@ MipsTargetLowering::emitAtomicBinaryPartword(MachineInstr *MI,
BuildMI(BB, DL, TII->get(Mips::AND), AlignedAddr)
.addReg(Ptr).addReg(MaskLSB2);
BuildMI(BB, DL, TII->get(Mips::ANDi), PtrLSB2).addReg(Ptr).addImm(3);
- BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
+ if (Subtarget->isLittle()) {
+ BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
+ } else {
+ unsigned Off = RegInfo.createVirtualRegister(RC);
+ BuildMI(BB, DL, TII->get(Mips::XORi), Off)
+ .addReg(PtrLSB2).addImm((Size == 1) ? 3 : 2);
+ BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(Off).addImm(3);
+ }
BuildMI(BB, DL, TII->get(Mips::ORi), MaskUpper)
.addReg(Mips::ZERO).addImm(MaskImm);
BuildMI(BB, DL, TII->get(Mips::SLLV), Mask)
@@ -1282,7 +1323,14 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
BuildMI(BB, DL, TII->get(Mips::AND), AlignedAddr)
.addReg(Ptr).addReg(MaskLSB2);
BuildMI(BB, DL, TII->get(Mips::ANDi), PtrLSB2).addReg(Ptr).addImm(3);
- BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
+ if (Subtarget->isLittle()) {
+ BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
+ } else {
+ unsigned Off = RegInfo.createVirtualRegister(RC);
+ BuildMI(BB, DL, TII->get(Mips::XORi), Off)
+ .addReg(PtrLSB2).addImm((Size == 1) ? 3 : 2);
+ BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(Off).addImm(3);
+ }
BuildMI(BB, DL, TII->get(Mips::ORi), MaskUpper)
.addReg(Mips::ZERO).addImm(MaskImm);
BuildMI(BB, DL, TII->get(Mips::SLLV), Mask)
@@ -1349,7 +1397,7 @@ SDValue MipsTargetLowering::lowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
SDValue Table = Op.getOperand(1);
SDValue Index = Op.getOperand(2);
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
EVT PTy = getPointerTy();
unsigned EntrySize =
DAG.getMachineFunction().getJumpTableInfo()->getEntrySize(*getDataLayout());
@@ -1382,7 +1430,7 @@ lowerBRCOND(SDValue Op, SelectionDAG &DAG) const
// the block to branch to if the condition is true.
SDValue Chain = Op.getOperand(0);
SDValue Dest = Op.getOperand(2);
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
SDValue CondRes = createFPCmp(DAG, Op.getOperand(1));
@@ -1409,15 +1457,16 @@ lowerSELECT(SDValue Op, SelectionDAG &DAG) const
return Op;
return createCMovFP(DAG, Cond, Op.getOperand(1), Op.getOperand(2),
- Op.getDebugLoc());
+ SDLoc(Op));
}
SDValue MipsTargetLowering::
lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
{
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
EVT Ty = Op.getOperand(0).getValueType();
- SDValue Cond = DAG.getNode(ISD::SETCC, DL, getSetCCResultType(Ty),
+ SDValue Cond = DAG.getNode(ISD::SETCC, DL,
+ getSetCCResultType(*DAG.getContext(), Ty),
Op.getOperand(0), Op.getOperand(1),
Op.getOperand(4));
@@ -1434,13 +1483,13 @@ SDValue MipsTargetLowering::lowerSETCC(SDValue Op, SelectionDAG &DAG) const {
SDValue True = DAG.getConstant(1, MVT::i32);
SDValue False = DAG.getConstant(0, MVT::i32);
- return createCMovFP(DAG, Cond, True, False, Op.getDebugLoc());
+ return createCMovFP(DAG, Cond, True, False, SDLoc(Op));
}
SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
// FIXME there isn't actually debug info here
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64) {
@@ -1488,7 +1537,7 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
// Local Exec TLS Model.
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
- DebugLoc DL = GA->getDebugLoc();
+ SDLoc DL(GA);
const GlobalValue *GV = GA->getGlobal();
EVT PtrVT = getPointerTy();
@@ -1593,7 +1642,7 @@ SDValue MipsTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
MipsFunctionInfo *FuncInfo = MF.getInfo<MipsFunctionInfo>();
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
SDValue FI = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
getPointerTy());
@@ -1609,7 +1658,7 @@ static SDValue lowerFCOPYSIGN32(SDValue Op, SelectionDAG &DAG, bool HasR2) {
EVT TyY = Op.getOperand(1).getValueType();
SDValue Const1 = DAG.getConstant(1, MVT::i32);
SDValue Const31 = DAG.getConstant(31, MVT::i32);
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
SDValue Res;
// If operand is of type f64, extract the upper 32-bit. Otherwise, bitcast it
@@ -1654,7 +1703,7 @@ static SDValue lowerFCOPYSIGN64(SDValue Op, SelectionDAG &DAG, bool HasR2) {
unsigned WidthY = Op.getOperand(1).getValueSizeInBits();
EVT TyX = MVT::getIntegerVT(WidthX), TyY = MVT::getIntegerVT(WidthY);
SDValue Const1 = DAG.getConstant(1, MVT::i32);
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
// Bitcast to integer nodes.
SDValue X = DAG.getNode(ISD::BITCAST, DL, TyX, Op.getOperand(0));
@@ -1707,7 +1756,7 @@ MipsTargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
static SDValue lowerFABS32(SDValue Op, SelectionDAG &DAG, bool HasR2) {
SDValue Res, Const1 = DAG.getConstant(1, MVT::i32);
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
// If operand is of type f64, extract the upper 32-bit. Otherwise, bitcast it
// to i32.
@@ -1736,7 +1785,7 @@ static SDValue lowerFABS32(SDValue Op, SelectionDAG &DAG, bool HasR2) {
static SDValue lowerFABS64(SDValue Op, SelectionDAG &DAG, bool HasR2) {
SDValue Res, Const1 = DAG.getConstant(1, MVT::i32);
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
// Bitcast to integer node.
SDValue X = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op.getOperand(0));
@@ -1771,7 +1820,7 @@ lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
MFI->setFrameAddressIsTaken(true);
EVT VT = Op.getValueType();
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL,
IsN64 ? Mips::FP_64 : Mips::FP, VT);
return FrameAddr;
@@ -1791,7 +1840,7 @@ SDValue MipsTargetLowering::lowerRETURNADDR(SDValue Op,
// Return RA, which contains the return address. Mark it an implicit live-in.
unsigned Reg = MF.addLiveIn(RA, getRegClassFor(VT));
- return DAG.getCopyFromReg(DAG.getEntryNode(), Op.getDebugLoc(), Reg, VT);
+ return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(Op), Reg, VT);
}
// An EH_RETURN is the result of lowering llvm.eh.return which in turn is
@@ -1807,7 +1856,7 @@ SDValue MipsTargetLowering::lowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
SDValue Chain = Op.getOperand(0);
SDValue Offset = Op.getOperand(1);
SDValue Handler = Op.getOperand(2);
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
EVT Ty = IsN64 ? MVT::i64 : MVT::i32;
// Store stack offset in V1, store jump target in V0. Glue CopyToReg and
@@ -1827,14 +1876,14 @@ SDValue MipsTargetLowering::lowerATOMIC_FENCE(SDValue Op,
// FIXME: Need pseudo-fence for 'singlethread' fences
// FIXME: Set SType for weaker fences where supported/appropriate.
unsigned SType = 0;
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
return DAG.getNode(MipsISD::Sync, DL, MVT::Other, Op.getOperand(0),
DAG.getConstant(SType, MVT::i32));
}
SDValue MipsTargetLowering::lowerShiftLeftParts(SDValue Op,
SelectionDAG &DAG) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
SDValue Lo = Op.getOperand(0), Hi = Op.getOperand(1);
SDValue Shamt = Op.getOperand(2);
@@ -1865,7 +1914,7 @@ SDValue MipsTargetLowering::lowerShiftLeftParts(SDValue Op,
SDValue MipsTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG,
bool IsSRA) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
SDValue Lo = Op.getOperand(0), Hi = Op.getOperand(1);
SDValue Shamt = Op.getOperand(2);
@@ -1909,7 +1958,7 @@ static SDValue createLoadLR(unsigned Opc, SelectionDAG &DAG, LoadSDNode *LD,
SDValue Ptr = LD->getBasePtr();
EVT VT = LD->getValueType(0), MemVT = LD->getMemoryVT();
EVT BasePtrVT = Ptr.getValueType();
- DebugLoc DL = LD->getDebugLoc();
+ SDLoc DL(LD);
SDVTList VTList = DAG.getVTList(VT, MVT::Other);
if (Offset)
@@ -1975,7 +2024,7 @@ SDValue MipsTargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
// (set tmp1, (lwr baseptr, tmp0))
// (set tmp2, (shl tmp1, 32))
// (set dst, (srl tmp2, 32))
- DebugLoc DL = LD->getDebugLoc();
+ SDLoc DL(LD);
SDValue Const32 = DAG.getConstant(32, MVT::i32);
SDValue SLL = DAG.getNode(ISD::SHL, DL, MVT::i64, LWR, Const32);
SDValue SRL = DAG.getNode(ISD::SRL, DL, MVT::i64, SLL, Const32);
@@ -1987,7 +2036,7 @@ static SDValue createStoreLR(unsigned Opc, SelectionDAG &DAG, StoreSDNode *SD,
SDValue Chain, unsigned Offset) {
SDValue Ptr = SD->getBasePtr(), Value = SD->getValue();
EVT MemVT = SD->getMemoryVT(), BasePtrVT = Ptr.getValueType();
- DebugLoc DL = SD->getDebugLoc();
+ SDLoc DL(SD);
SDVTList VTList = DAG.getVTList(MVT::Other);
if (Offset)
@@ -2000,16 +2049,8 @@ static SDValue createStoreLR(unsigned Opc, SelectionDAG &DAG, StoreSDNode *SD,
}
// Expand an unaligned 32 or 64-bit integer store node.
-SDValue MipsTargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
- StoreSDNode *SD = cast<StoreSDNode>(Op);
- EVT MemVT = SD->getMemoryVT();
-
- // Return if store is aligned or if MemVT is neither i32 nor i64.
- if ((SD->getAlignment() >= MemVT.getSizeInBits() / 8) ||
- ((MemVT != MVT::i32) && (MemVT != MVT::i64)))
- return SDValue();
-
- bool IsLittle = Subtarget->isLittle();
+static SDValue lowerUnalignedIntStore(StoreSDNode *SD, SelectionDAG &DAG,
+ bool IsLittle) {
SDValue Value = SD->getValue(), Chain = SD->getChain();
EVT VT = Value.getValueType();
@@ -2036,6 +2077,34 @@ SDValue MipsTargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return createStoreLR(MipsISD::SDR, DAG, SD, SDL, IsLittle ? 0 : 7);
}
+// Lower (store (fp_to_sint $fp) $ptr) to (store (TruncIntFP $fp), $ptr).
+static SDValue lowerFP_TO_SINT_STORE(StoreSDNode *SD, SelectionDAG &DAG) {
+ SDValue Val = SD->getValue();
+
+ if (Val.getOpcode() != ISD::FP_TO_SINT)
+ return SDValue();
+
+ EVT FPTy = EVT::getFloatingPointVT(Val.getValueSizeInBits());
+ SDValue Tr = DAG.getNode(MipsISD::TruncIntFP, SDLoc(Val), FPTy,
+ Val.getOperand(0));
+
+ return DAG.getStore(SD->getChain(), SDLoc(SD), Tr, SD->getBasePtr(),
+ SD->getPointerInfo(), SD->isVolatile(),
+ SD->isNonTemporal(), SD->getAlignment());
+}
+
+SDValue MipsTargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+ StoreSDNode *SD = cast<StoreSDNode>(Op);
+ EVT MemVT = SD->getMemoryVT();
+
+ // Lower unaligned integer stores.
+ if ((SD->getAlignment() < MemVT.getSizeInBits() / 8) &&
+ ((MemVT == MVT::i32) || (MemVT == MVT::i64)))
+ return lowerUnalignedIntStore(SD, DAG, Subtarget->isLittle());
+
+ return lowerFP_TO_SINT_STORE(SD, DAG);
+}
+
SDValue MipsTargetLowering::lowerADD(SDValue Op, SelectionDAG &DAG) const {
if (Op->getOperand(0).getOpcode() != ISD::FRAMEADDR
|| cast<ConstantSDNode>
@@ -2053,10 +2122,18 @@ SDValue MipsTargetLowering::lowerADD(SDValue Op, SelectionDAG &DAG) const {
EVT ValTy = Op->getValueType(0);
int FI = MFI->CreateFixedObject(Op.getValueSizeInBits() / 8, 0, false);
SDValue InArgsAddr = DAG.getFrameIndex(FI, ValTy);
- return DAG.getNode(ISD::ADD, Op->getDebugLoc(), ValTy, InArgsAddr,
+ return DAG.getNode(ISD::ADD, SDLoc(Op), ValTy, InArgsAddr,
DAG.getConstant(0, ValTy));
}
+SDValue MipsTargetLowering::lowerFP_TO_SINT(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT FPTy = EVT::getFloatingPointVT(Op.getValueSizeInBits());
+ SDValue Trunc = DAG.getNode(MipsISD::TruncIntFP, SDLoc(Op), FPTy,
+ Op.getOperand(0));
+ return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op.getValueType(), Trunc);
+}
+
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
@@ -2175,7 +2252,7 @@ static unsigned getNextIntArgReg(unsigned Reg) {
SDValue
MipsTargetLowering::passArgOnStack(SDValue StackPtr, unsigned Offset,
- SDValue Chain, SDValue Arg, DebugLoc DL,
+ SDValue Chain, SDValue Arg, SDLoc DL,
bool IsTailCall, SelectionDAG &DAG) const {
if (!IsTailCall) {
SDValue PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr,
@@ -2229,6 +2306,15 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
const uint32_t *Mask = TRI->getCallPreservedMask(CLI.CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
+ if (Subtarget->inMips16HardFloat()) {
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(CLI.Callee)) {
+ llvm::StringRef Sym = G->getGlobal()->getName();
+ Function *F = G->getGlobal()->getParent()->getFunction(Sym);
+ if (F->hasFnAttribute("__Mips16RetHelper")) {
+ Mask = MipsRegisterInfo::getMips16RetHelperMask();
+ }
+ }
+ }
Ops.push_back(CLI.DAG.getRegisterMask(Mask));
if (InFlag.getNode())
@@ -2241,7 +2327,7 @@ SDValue
MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
- DebugLoc &DL = CLI.DL;
+ SDLoc DL = CLI.DL;
SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
@@ -2260,7 +2346,9 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
getTargetMachine(), ArgLocs, *DAG.getContext());
- MipsCC MipsCCInfo(CallConv, IsO32, CCInfo);
+ MipsCC::SpecialCallingConvType SpecialCallingConv =
+ getSpecialCallingConv(Callee);
+ MipsCC MipsCCInfo(CallConv, IsO32, CCInfo, SpecialCallingConv);
MipsCCInfo.analyzeCallOperands(Outs, IsVarArg,
getTargetMachine().Options.UseSoftFloat,
@@ -2286,7 +2374,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SDValue NextStackOffsetVal = DAG.getIntPtrConstant(NextStackOffset, true);
if (!IsTailCall)
- Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal);
+ Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal, DL);
SDValue StackPtr = DAG.getCopyFromReg(Chain, DL,
IsN64 ? Mips::SP_64 : Mips::SP,
@@ -2424,7 +2512,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Create the CALLSEQ_END node.
Chain = DAG.getCALLSEQ_END(Chain, NextStackOffsetVal,
- DAG.getIntPtrConstant(0, true), InFlag);
+ DAG.getIntPtrConstant(0, true), InFlag, DL);
InFlag = Chain.getValue(1);
// Handle result values, copying them out of physregs into vregs that we
@@ -2439,7 +2527,7 @@ SDValue
MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc DL, SelectionDAG &DAG,
+ SDLoc DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals,
const SDNode *CallNode,
const Type *RetTy) const {
@@ -2478,7 +2566,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
CallingConv::ID CallConv,
bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc DL, SelectionDAG &DAG,
+ SDLoc DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals)
const {
MachineFunction &MF = DAG.getMachineFunction();
@@ -2644,7 +2732,7 @@ MipsTargetLowering::LowerReturn(SDValue Chain,
CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc DL, SelectionDAG &DAG) const {
+ SDLoc DL, SelectionDAG &DAG) const {
// CCValAssign - represent the assignment of
// the return value to a location
SmallVector<CCValAssign, 16> RVLocs;
@@ -3029,13 +3117,32 @@ static bool originalTypeIsF128(const Type *Ty, const SDNode *CallNode) {
return (ES && Ty->isIntegerTy(128) && isF128SoftLibCall(ES->getSymbol()));
}
-MipsTargetLowering::MipsCC::MipsCC(CallingConv::ID CC, bool IsO32_,
- CCState &Info)
- : CCInfo(Info), CallConv(CC), IsO32(IsO32_) {
+MipsTargetLowering::MipsCC::SpecialCallingConvType
+ MipsTargetLowering::getSpecialCallingConv(SDValue Callee) const {
+ MipsCC::SpecialCallingConvType SpecialCallingConv =
+    MipsCC::NoSpecialCallingConv;
+ if (Subtarget->inMips16HardFloat()) {
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ llvm::StringRef Sym = G->getGlobal()->getName();
+ Function *F = G->getGlobal()->getParent()->getFunction(Sym);
+ if (F->hasFnAttribute("__Mips16RetHelper")) {
+ SpecialCallingConv = MipsCC::Mips16RetHelperConv;
+ }
+ }
+ }
+ return SpecialCallingConv;
+}
+
+MipsTargetLowering::MipsCC::MipsCC(
+ CallingConv::ID CC, bool IsO32_, CCState &Info,
+ MipsCC::SpecialCallingConvType SpecialCallingConv_)
+ : CCInfo(Info), CallConv(CC), IsO32(IsO32_),
+      SpecialCallingConv(SpecialCallingConv_) {
// Pre-allocate reserved argument area.
CCInfo.AllocateStack(reservedArgArea(), 1);
}
+
void MipsTargetLowering::MipsCC::
analyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Args,
bool IsVarArg, bool IsSoftFloat, const SDNode *CallNode,
@@ -3183,6 +3290,8 @@ llvm::CCAssignFn *MipsTargetLowering::MipsCC::fixedArgFn() const {
if (CallConv == CallingConv::Fast)
return CC_Mips_FastCC;
+ if (SpecialCallingConv == Mips16RetHelperConv)
+ return CC_Mips16RetHelper;
return IsO32 ? CC_MipsO32 : CC_MipsN;
}
@@ -3233,7 +3342,7 @@ MVT MipsTargetLowering::MipsCC::getRegVT(MVT VT, const Type *OrigTy,
}
void MipsTargetLowering::
-copyByValRegs(SDValue Chain, DebugLoc DL, std::vector<SDValue> &OutChains,
+copyByValRegs(SDValue Chain, SDLoc DL, std::vector<SDValue> &OutChains,
SelectionDAG &DAG, const ISD::ArgFlagsTy &Flags,
SmallVectorImpl<SDValue> &InVals, const Argument *FuncArg,
const MipsCC &CC, const ByValArgInfo &ByVal) const {
@@ -3277,7 +3386,7 @@ copyByValRegs(SDValue Chain, DebugLoc DL, std::vector<SDValue> &OutChains,
// Copy byVal arg to registers and stack.
void MipsTargetLowering::
-passByValArg(SDValue Chain, DebugLoc DL,
+passByValArg(SDValue Chain, SDLoc DL,
std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
SmallVector<SDValue, 8> &MemOpChains, SDValue StackPtr,
MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg,
@@ -3375,7 +3484,7 @@ passByValArg(SDValue Chain, DebugLoc DL,
void
MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains,
const MipsCC &CC, SDValue Chain,
- DebugLoc DL, SelectionDAG &DAG) const {
+ SDLoc DL, SelectionDAG &DAG) const {
unsigned NumRegs = CC.numIntArgRegs();
const uint16_t *ArgRegs = CC.intArgRegs();
const CCState &CCInfo = CC.getCCInfo();
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index 5587e8f..fe043ae 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -60,8 +60,8 @@ namespace llvm {
CMovFP_T,
CMovFP_F,
- // Floating Point Rounding
- FPRound,
+ // FP-to-int truncation node.
+ TruncIntFP,
// Return
Ret,
@@ -195,7 +195,7 @@ namespace llvm {
virtual const char *getTargetNodeName(unsigned Opcode) const;
/// getSetCCResultType - get the ISD::SETCC result ValueType
- EVT getSetCCResultType(EVT VT) const;
+ EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -240,7 +240,14 @@ namespace llvm {
/// arguments and inquire about calling convention information.
class MipsCC {
public:
- MipsCC(CallingConv::ID CallConv, bool IsO32, CCState &Info);
+ enum SpecialCallingConvType {
+ Mips16RetHelperConv, NoSpecialCallingConv
+ };
+
+ MipsCC(
+ CallingConv::ID CallConv, bool IsO32, CCState &Info,
+ SpecialCallingConvType SpecialCallingConv = NoSpecialCallingConv);
+
void analyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
bool IsVarArg, bool IsSoftFloat,
@@ -313,20 +320,23 @@ namespace llvm {
CCState &CCInfo;
CallingConv::ID CallConv;
bool IsO32;
+ SpecialCallingConvType SpecialCallingConv;
SmallVector<ByValArgInfo, 2> ByValArgs;
};
-
+ protected:
// Subtarget Info
const MipsSubtarget *Subtarget;
bool HasMips64, IsN64, IsO32;
private:
+
+ MipsCC::SpecialCallingConvType getSpecialCallingConv(SDValue Callee) const;
// Lower Operand helpers
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals,
const SDNode *CallNode, const Type *RetTy) const;
@@ -354,6 +364,7 @@ namespace llvm {
SDValue lowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerADD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
/// isEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization.
@@ -365,7 +376,7 @@ namespace llvm {
/// copyByValArg - Copy argument registers which were used to pass a byval
/// argument to the stack. Create a stack frame object for the byval
/// argument.
- void copyByValRegs(SDValue Chain, DebugLoc DL,
+ void copyByValRegs(SDValue Chain, SDLoc DL,
std::vector<SDValue> &OutChains, SelectionDAG &DAG,
const ISD::ArgFlagsTy &Flags,
SmallVectorImpl<SDValue> &InVals,
@@ -373,7 +384,7 @@ namespace llvm {
const MipsCC &CC, const ByValArgInfo &ByVal) const;
/// passByValArg - Pass a byval argument in registers or on stack.
- void passByValArg(SDValue Chain, DebugLoc DL,
+ void passByValArg(SDValue Chain, SDLoc DL,
std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
SmallVector<SDValue, 8> &MemOpChains, SDValue StackPtr,
MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg,
@@ -384,17 +395,17 @@ namespace llvm {
/// to the stack. Also create a stack frame object for the first variable
/// argument.
void writeVarArgRegs(std::vector<SDValue> &OutChains, const MipsCC &CC,
- SDValue Chain, DebugLoc DL, SelectionDAG &DAG) const;
+ SDValue Chain, SDLoc DL, SelectionDAG &DAG) const;
virtual SDValue
LowerFormalArguments(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
SDValue passArgOnStack(SDValue StackPtr, unsigned Offset, SDValue Chain,
- SDValue Arg, DebugLoc DL, bool IsTailCall,
+ SDValue Arg, SDLoc DL, bool IsTailCall,
SelectionDAG &DAG) const;
virtual SDValue
@@ -412,7 +423,7 @@ namespace llvm {
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc dl, SelectionDAG &DAG) const;
+ SDLoc dl, SelectionDAG &DAG) const;
// Inline asm support
ConstraintType getConstraintType(const std::string &Constraint) const;
diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td
index 6b23057..e2acf28 100644
--- a/lib/Target/Mips/MipsInstrFPU.td
+++ b/lib/Target/Mips/MipsInstrFPU.td
@@ -30,6 +30,7 @@ def SDT_MipsFPCmp : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>, SDTCisFP<1>,
SDTCisVT<2, i32>]>;
def SDT_MipsCMovFP : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
SDTCisSameAs<1, 2>]>;
+def SDT_MipsTruncIntFP : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>;
def SDT_MipsBuildPairF64 : SDTypeProfile<1, 2, [SDTCisVT<0, f64>,
SDTCisVT<1, i32>,
SDTCisSameAs<1, 2>]>;
@@ -42,6 +43,7 @@ def MipsCMovFP_T : SDNode<"MipsISD::CMovFP_T", SDT_MipsCMovFP, [SDNPInGlue]>;
def MipsCMovFP_F : SDNode<"MipsISD::CMovFP_F", SDT_MipsCMovFP, [SDNPInGlue]>;
def MipsFPBrcond : SDNode<"MipsISD::FPBrcond", SDT_MipsFPBrcond,
[SDNPHasChain, SDNPOptInGlue]>;
+def MipsTruncIntFP : SDNode<"MipsISD::TruncIntFP", SDT_MipsTruncIntFP>;
def MipsBuildPairF64 : SDNode<"MipsISD::BuildPairF64", SDT_MipsBuildPairF64>;
def MipsExtractElementF64 : SDNode<"MipsISD::ExtractElementF64",
SDT_MipsExtractElementF64>;
@@ -154,6 +156,7 @@ class LW_FT<string opstr, RegisterClass RC, InstrItinClass Itin,
InstSE<(outs RC:$rt), (ins MemOpnd:$addr), !strconcat(opstr, "\t$rt, $addr"),
[(set RC:$rt, (OpNode addrDefault:$addr))], Itin, FrmFI> {
let DecoderMethod = "DecodeFMem";
+ let mayLoad = 1;
}
class SW_FT<string opstr, RegisterClass RC, InstrItinClass Itin,
@@ -161,6 +164,7 @@ class SW_FT<string opstr, RegisterClass RC, InstrItinClass Itin,
InstSE<(outs), (ins RC:$rt, MemOpnd:$addr), !strconcat(opstr, "\t$rt, $addr"),
[(OpNode RC:$rt, addrDefault:$addr)], Itin, FrmFI> {
let DecoderMethod = "DecodeFMem";
+ let mayStore = 1;
}
class MADDS_FT<string opstr, RegisterClass RC, InstrItinClass Itin,
@@ -251,11 +255,19 @@ let Predicates = [NotFP64bit, HasStdEnc] in {
}
let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in {
- def CVT_S_D64 : ABSS_FT<"cvt.s.d", FGR32, FGR64, IIFcvt>, ABSS_FM<0x20, 17>;
- def CVT_S_L : ABSS_FT<"cvt.s.l", FGR32, FGR64, IIFcvt>, ABSS_FM<0x20, 21>;
- def CVT_D64_W : ABSS_FT<"cvt.d.w", FGR64, FGR32, IIFcvt>, ABSS_FM<0x21, 20>;
- def CVT_D64_S : ABSS_FT<"cvt.d.s", FGR64, FGR32, IIFcvt>, ABSS_FM<0x21, 16>;
- def CVT_D64_L : ABSS_FT<"cvt.d.l", FGR64, FGR64, IIFcvt>, ABSS_FM<0x21, 21>;
+ def CVT_S_D64 : ABSS_FT<"cvt.s.d", FGR32, FGR64, IIFcvt>, ABSS_FM<0x20, 17>;
+ def CVT_S_L : ABSS_FT<"cvt.s.l", FGR32, FGR64, IIFcvt>, ABSS_FM<0x20, 21>;
+ def CVT_D64_W : ABSS_FT<"cvt.d.w", FGR64, FGR32, IIFcvt>, ABSS_FM<0x21, 20>;
+ def CVT_D64_S : ABSS_FT<"cvt.d.s", FGR64, FGR32, IIFcvt>, ABSS_FM<0x21, 16>;
+ def CVT_D64_L : ABSS_FT<"cvt.d.l", FGR64, FGR64, IIFcvt>, ABSS_FM<0x21, 21>;
+}
+
+let isPseudo = 1, isCodeGenOnly = 1 in {
+ def PseudoCVT_S_W : ABSS_FT<"", FGR32, CPURegs, IIFcvt>;
+ def PseudoCVT_D32_W : ABSS_FT<"", AFGR64, CPURegs, IIFcvt>;
+ def PseudoCVT_S_L : ABSS_FT<"", FGR64, CPU64Regs, IIFcvt>;
+ def PseudoCVT_D64_W : ABSS_FT<"", FGR64, CPURegs, IIFcvt>;
+ def PseudoCVT_D64_L : ABSS_FT<"", FGR64, CPU64Regs, IIFcvt>;
}
let Predicates = [NoNaNsFPMath, HasStdEnc] in {
@@ -314,8 +326,12 @@ let Predicates = [NotN64, HasMips64, HasStdEnc],
}
let Predicates = [NotN64, NotMips64, HasStdEnc] in {
- def LDC1 : LW_FT<"ldc1", AFGR64, IILoad, mem, load>, LW_FM<0x35>;
- def SDC1 : SW_FT<"sdc1", AFGR64, IIStore, mem, store>, LW_FM<0x3d>;
+ let isPseudo = 1, isCodeGenOnly = 1 in {
+ def PseudoLDC1 : LW_FT<"", AFGR64, IILoad, mem, load>;
+ def PseudoSDC1 : SW_FT<"", AFGR64, IIStore, mem, store>;
+ }
+ def LDC1 : LW_FT<"ldc1", AFGR64, IILoad, mem>, LW_FM<0x35>;
+ def SDC1 : SW_FT<"sdc1", AFGR64, IIStore, mem>, LW_FM<0x3d>;
}
// Indexed loads and stores.
@@ -470,14 +486,13 @@ def ExtractElementF64 :
def : MipsPat<(f32 fpimm0), (MTC1 ZERO)>;
def : MipsPat<(f32 fpimm0neg), (FNEG_S (MTC1 ZERO))>;
-def : MipsPat<(f32 (sint_to_fp CPURegs:$src)), (CVT_S_W (MTC1 CPURegs:$src))>;
-def : MipsPat<(i32 (fp_to_sint FGR32:$src)), (MFC1 (TRUNC_W_S FGR32:$src))>;
+def : MipsPat<(f32 (sint_to_fp CPURegs:$src)), (PseudoCVT_S_W CPURegs:$src)>;
+def : MipsPat<(MipsTruncIntFP FGR32:$src), (TRUNC_W_S FGR32:$src)>;
let Predicates = [NotFP64bit, HasStdEnc] in {
def : MipsPat<(f64 (sint_to_fp CPURegs:$src)),
- (CVT_D32_W (MTC1 CPURegs:$src))>;
- def : MipsPat<(i32 (fp_to_sint AFGR64:$src)),
- (MFC1 (TRUNC_W_D32 AFGR64:$src))>;
+ (PseudoCVT_D32_W CPURegs:$src)>;
+ def : MipsPat<(MipsTruncIntFP AFGR64:$src), (TRUNC_W_D32 AFGR64:$src)>;
def : MipsPat<(f32 (fround AFGR64:$src)), (CVT_S_D32 AFGR64:$src)>;
def : MipsPat<(f64 (fextend FGR32:$src)), (CVT_D32_S FGR32:$src)>;
}
@@ -487,17 +502,15 @@ let Predicates = [IsFP64bit, HasStdEnc] in {
def : MipsPat<(f64 fpimm0neg), (FNEG_D64 (DMTC1 ZERO_64))>;
def : MipsPat<(f64 (sint_to_fp CPURegs:$src)),
- (CVT_D64_W (MTC1 CPURegs:$src))>;
+ (PseudoCVT_D64_W CPURegs:$src)>;
def : MipsPat<(f32 (sint_to_fp CPU64Regs:$src)),
- (CVT_S_L (DMTC1 CPU64Regs:$src))>;
+ (EXTRACT_SUBREG (PseudoCVT_S_L CPU64Regs:$src), sub_32)>;
def : MipsPat<(f64 (sint_to_fp CPU64Regs:$src)),
- (CVT_D64_L (DMTC1 CPU64Regs:$src))>;
+ (PseudoCVT_D64_L CPU64Regs:$src)>;
- def : MipsPat<(i32 (fp_to_sint FGR64:$src)),
- (MFC1 (TRUNC_W_D64 FGR64:$src))>;
- def : MipsPat<(i64 (fp_to_sint FGR32:$src)), (DMFC1 (TRUNC_L_S FGR32:$src))>;
- def : MipsPat<(i64 (fp_to_sint FGR64:$src)),
- (DMFC1 (TRUNC_L_D64 FGR64:$src))>;
+ def : MipsPat<(MipsTruncIntFP FGR64:$src), (TRUNC_W_D64 FGR64:$src)>;
+ def : MipsPat<(MipsTruncIntFP FGR32:$src), (TRUNC_L_S FGR32:$src)>;
+ def : MipsPat<(MipsTruncIntFP FGR64:$src), (TRUNC_L_D64 FGR64:$src)>;
def : MipsPat<(f32 (fround FGR64:$src)), (CVT_S_D64 FGR64:$src)>;
def : MipsPat<(f64 (fextend FGR32:$src)), (CVT_D64_S FGR32:$src)>;
@@ -523,7 +536,7 @@ let AddedComplexity = 40 in {
}
let Predicates = [NotN64, NotMips64, HasStdEnc] in {
- def : LoadRegImmPat<LDC1, f64, load>;
- def : StoreRegImmPat<SDC1, f64>;
+ def : LoadRegImmPat<PseudoLDC1, f64, load>;
+ def : StoreRegImmPat<PseudoSDC1, f64>;
}
}
diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td
index ea07372..14cfcf9 100644
--- a/lib/Target/Mips/MipsInstrFormats.td
+++ b/lib/Target/Mips/MipsInstrFormats.td
@@ -476,6 +476,20 @@ class RDHWR_FM {
let Inst{5-0} = 0x3b;
}
+class TEQ_FM<bits<6> funct> {
+ bits<5> rs;
+ bits<5> rt;
+ bits<10> code_;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-6} = code_;
+ let Inst{5-0} = funct;
+}
+
//===----------------------------------------------------------------------===//
//
// FLOATING POINT INSTRUCTION FORMATS
diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
index ad92d41..3144dae 100644
--- a/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/lib/Target/Mips/MipsInstrInfo.cpp
@@ -77,7 +77,7 @@ MipsInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
void MipsInstrInfo::AnalyzeCondBr(const MachineInstr *Inst, unsigned Opc,
MachineBasicBlock *&BB,
SmallVectorImpl<MachineOperand> &Cond) const {
- assert(GetAnalyzableBrOpc(Opc) && "Not an analyzable branch");
+ assert(getAnalyzableBrOpc(Opc) && "Not an analyzable branch");
int NumOp = Inst->getNumExplicitOperands();
// for both int and fp branches, the last explicit operand is the
@@ -167,7 +167,7 @@ RemoveBranch(MachineBasicBlock &MBB) const
// Up to 2 branches are removed.
// Note that indirect branches are not removed.
for(removed = 0; I != REnd && removed < 2; ++I, ++removed)
- if (!GetAnalyzableBrOpc(I->getOpcode()))
+ if (!getAnalyzableBrOpc(I->getOpcode()))
break;
MBB.erase(I.base(), FirstBr.base());
@@ -182,7 +182,7 @@ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const
{
assert( (Cond.size() && Cond.size() <= 3) &&
"Invalid Mips branch condition!");
- Cond[0].setImm(GetOppositeBranchOpc(Cond[0].getImm()));
+ Cond[0].setImm(getOppositeBranchOpc(Cond[0].getImm()));
return false;
}
@@ -210,7 +210,7 @@ AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
BranchInstrs.push_back(LastInst);
// Not an analyzable branch (e.g., indirect jump).
- if (!GetAnalyzableBrOpc(LastOpc))
+ if (!getAnalyzableBrOpc(LastOpc))
return LastInst->isIndirectBranch() ? BT_Indirect : BT_None;
// Get the second to last instruction in the block.
@@ -219,7 +219,7 @@ AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
if (++I != REnd) {
SecondLastInst = &*I;
- SecondLastOpc = GetAnalyzableBrOpc(SecondLastInst->getOpcode());
+ SecondLastOpc = getAnalyzableBrOpc(SecondLastInst->getOpcode());
// Not an analyzable branch (must be an indirect jump).
if (isUnpredicatedTerminator(SecondLastInst) && !SecondLastOpc)
@@ -282,3 +282,16 @@ unsigned MipsInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
}
}
}
+
+MachineInstrBuilder
+MipsInstrInfo::genInstrWithNewOpc(unsigned NewOpc,
+ MachineBasicBlock::iterator I) const {
+ MachineInstrBuilder MIB;
+ MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), get(NewOpc));
+
+ for (unsigned J = 0, E = I->getDesc().getNumOperands(); J < E; ++J)
+ MIB.addOperand(I->getOperand(J));
+
+ MIB.setMemRefs(I->memoperands_begin(), I->memoperands_end());
+ return MIB;
+}
diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
index 8c05d97..0f075ec 100644
--- a/lib/Target/Mips/MipsInstrInfo.h
+++ b/lib/Target/Mips/MipsInstrInfo.h
@@ -17,6 +17,7 @@
#include "Mips.h"
#include "MipsAnalyzeImmediate.h"
#include "MipsRegisterInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetInstrInfo.h"
@@ -81,7 +82,7 @@ public:
///
virtual const MipsRegisterInfo &getRegisterInfo() const = 0;
- virtual unsigned GetOppositeBranchOpc(unsigned Opc) const = 0;
+ virtual unsigned getOppositeBranchOpc(unsigned Opc) const = 0;
/// Return the number of bytes of code the specified instruction may be.
unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
@@ -116,6 +117,11 @@ public:
const TargetRegisterInfo *TRI,
int64_t Offset) const = 0;
+ /// Create an instruction which has the same operands and memory operands
+ /// as MI but has a new opcode.
+ MachineInstrBuilder genInstrWithNewOpc(unsigned NewOpc,
+ MachineBasicBlock::iterator I) const;
+
protected:
bool isZeroImm(const MachineOperand &op) const;
@@ -123,7 +129,7 @@ protected:
unsigned Flag) const;
private:
- virtual unsigned GetAnalyzableBrOpc(unsigned Opc) const = 0;
+ virtual unsigned getAnalyzableBrOpc(unsigned Opc) const = 0;
void AnalyzeCondBr(const MachineInstr *Inst, unsigned Opc,
MachineBasicBlock *&BB,
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index 86ec729..dc3e4be 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -26,7 +26,8 @@ def SDT_MipsCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
def SDT_ExtractLOHI : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisVT<1, untyped>,
SDTCisVT<2, i32>]>;
def SDT_InsertLOHI : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>,
- SDTCisVT<1, i32>, SDTCisSameAs<1, 2>]>;
+ SDTCisVT<1, i32>,
+ SDTCisSameAs<1, 2>]>;
def SDT_MipsMultDiv : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>, SDTCisInt<1>,
SDTCisSameAs<1, 2>]>;
def SDT_MipsMAddMSub : SDTypeProfile<1, 3,
@@ -104,7 +105,8 @@ def MipsMSubu : SDNode<"MipsISD::MSubu", SDT_MipsMAddMSub>;
// DivRem(u) nodes
def MipsDivRem : SDNode<"MipsISD::DivRem", SDT_MipsMultDiv>;
def MipsDivRemU : SDNode<"MipsISD::DivRemU", SDT_MipsMultDiv>;
-def MipsDivRem16 : SDNode<"MipsISD::DivRem16", SDT_MipsDivRem16, [SDNPOutGlue]>;
+def MipsDivRem16 : SDNode<"MipsISD::DivRem16", SDT_MipsDivRem16,
+ [SDNPOutGlue]>;
def MipsDivRemU16 : SDNode<"MipsISD::DivRemU16", SDT_MipsDivRem16,
[SDNPOutGlue]>;
@@ -389,6 +391,7 @@ class ArithLogicI<string opstr, Operand Od, RegisterOperand RO,
[(set RO:$rt, (OpNode RO:$rs, imm_type:$imm16))],
IIAlu, FrmI, opstr> {
let isReMaterializable = 1;
+ let TwoOperandAliasConstraint = "$rs = $rt";
}
// Arithmetic Multiply ADD/SUB
@@ -521,7 +524,7 @@ multiclass StoreLeftRightM<string opstr, SDNode OpNode, RegisterClass RC> {
}
// Conditional Branch
-class CBranch<string opstr, PatFrag cond_op, RegisterClass RC> :
+class CBranch<string opstr, PatFrag cond_op, RegisterOperand RC> :
InstSE<(outs), (ins RC:$rs, RC:$rt, brtarget:$offset),
!strconcat(opstr, "\t$rs, $rt, $offset"),
[(brcond (i32 (cond_op RC:$rs, RC:$rt)), bb:$offset)], IIBranch,
@@ -532,7 +535,7 @@ class CBranch<string opstr, PatFrag cond_op, RegisterClass RC> :
let Defs = [AT];
}
-class CBranchZero<string opstr, PatFrag cond_op, RegisterClass RC> :
+class CBranchZero<string opstr, PatFrag cond_op, RegisterOperand RC> :
InstSE<(outs), (ins RC:$rs, brtarget:$offset),
!strconcat(opstr, "\t$rs, $offset"),
[(brcond (i32 (cond_op RC:$rs, 0)), bb:$offset)], IIBranch, FrmI> {
@@ -637,6 +640,11 @@ class SYNC_FT :
InstSE<(outs), (ins i32imm:$stype), "sync $stype", [(MipsSync imm:$stype)],
NoItinerary, FrmOther>;
+let hasSideEffects = 1 in
+class TEQ_FT<string opstr, RegisterOperand RO> :
+ InstSE<(outs), (ins RO:$rs, RO:$rt, uimm16:$code_),
+ !strconcat(opstr, "\t$rs, $rt, $code_"), [], NoItinerary, FrmI>;
+
// Mul, Div
class Mult<string opstr, InstrItinClass itin, RegisterOperand RO,
list<Register> DefRegs> :
@@ -651,12 +659,14 @@ class Mult<string opstr, InstrItinClass itin, RegisterOperand RO,
// operands.
class MultDivPseudo<Instruction RealInst, RegisterClass R0, RegisterOperand R1,
SDPatternOperator OpNode, InstrItinClass Itin,
- bit IsComm = 1, bit HasSideEffects = 0> :
+ bit IsComm = 1, bit HasSideEffects = 0,
+ bit UsesCustomInserter = 0> :
PseudoSE<(outs R0:$ac), (ins R1:$rs, R1:$rt),
[(set R0:$ac, (OpNode R1:$rs, R1:$rt))], Itin>,
PseudoInstExpansion<(RealInst R1:$rs, R1:$rt)> {
let isCommutable = IsComm;
let hasSideEffects = HasSideEffects;
+ let usesCustomInserter = UsesCustomInserter;
}
// Pseudo multiply add/sub instruction with explicit accumulator register
@@ -923,6 +933,7 @@ defm SWL : StoreLeftRightM<"swl", MipsSWL, CPURegs>, LW_FM<0x2a>;
defm SWR : StoreLeftRightM<"swr", MipsSWR, CPURegs>, LW_FM<0x2e>;
def SYNC : SYNC_FT, SYNC_FM;
+def TEQ : TEQ_FT<"teq", CPURegsOpnd>, TEQ_FM<0x34>;
/// Load-linked, Store-conditional
let Predicates = [NotN64, HasStdEnc] in {
@@ -940,12 +951,12 @@ def J : JumpFJ<jmptarget, "j", br, bb>, FJ<2>,
Requires<[RelocStatic, HasStdEnc]>, IsBranch;
def JR : IndirectBranch<CPURegs>, MTLO_FM<8>;
def B : UncondBranch<"b">, B_FM;
-def BEQ : CBranch<"beq", seteq, CPURegs>, BEQ_FM<4>;
-def BNE : CBranch<"bne", setne, CPURegs>, BEQ_FM<5>;
-def BGEZ : CBranchZero<"bgez", setge, CPURegs>, BGEZ_FM<1, 1>;
-def BGTZ : CBranchZero<"bgtz", setgt, CPURegs>, BGEZ_FM<7, 0>;
-def BLEZ : CBranchZero<"blez", setle, CPURegs>, BGEZ_FM<6, 0>;
-def BLTZ : CBranchZero<"bltz", setlt, CPURegs>, BGEZ_FM<1, 0>;
+def BEQ : CBranch<"beq", seteq, CPURegsOpnd>, BEQ_FM<4>;
+def BNE : CBranch<"bne", setne, CPURegsOpnd>, BEQ_FM<5>;
+def BGEZ : CBranchZero<"bgez", setge, CPURegsOpnd>, BGEZ_FM<1, 1>;
+def BGTZ : CBranchZero<"bgtz", setgt, CPURegsOpnd>, BGEZ_FM<7, 0>;
+def BLEZ : CBranchZero<"blez", setle, CPURegsOpnd>, BGEZ_FM<6, 0>;
+def BLTZ : CBranchZero<"bltz", setlt, CPURegsOpnd>, BGEZ_FM<1, 0>;
def BAL_BR: BAL_FT, BAL_FM;
@@ -989,9 +1000,10 @@ def PseudoMULT : MultDivPseudo<MULT, ACRegs, CPURegsOpnd, MipsMult, IIImul>;
def PseudoMULTu : MultDivPseudo<MULTu, ACRegs, CPURegsOpnd, MipsMultu, IIImul>;
def SDIV : Div<"div", IIIdiv, CPURegsOpnd, [HI, LO]>, MULT_FM<0, 0x1a>;
def UDIV : Div<"divu", IIIdiv, CPURegsOpnd, [HI, LO]>, MULT_FM<0, 0x1b>;
-def PseudoSDIV : MultDivPseudo<SDIV, ACRegs, CPURegsOpnd, MipsDivRem, IIIdiv, 0>;
+def PseudoSDIV : MultDivPseudo<SDIV, ACRegs, CPURegsOpnd, MipsDivRem, IIIdiv,
+ 0, 1, 1>;
def PseudoUDIV : MultDivPseudo<UDIV, ACRegs, CPURegsOpnd, MipsDivRemU, IIIdiv,
- 0>;
+ 0, 1, 1>;
def MTHI : MoveToLOHI<"mthi", CPURegs, [HI]>, MTLO_FM<0x11>;
def MTLO : MoveToLOHI<"mtlo", CPURegs, [LO]>, MTLO_FM<0x13>;
@@ -1095,7 +1107,12 @@ def : InstAlias<"mfc2 $rt, $rd",
(MFC2_3OP CPURegsOpnd:$rt, CPURegsOpnd:$rd, 0), 0>;
def : InstAlias<"mtc2 $rt, $rd",
(MTC2_3OP CPURegsOpnd:$rd, 0, CPURegsOpnd:$rt), 0>;
-
+def : InstAlias<"bnez $rs,$offset",
+ (BNE CPURegsOpnd:$rs, ZERO, brtarget:$offset), 1>,
+ Requires<[NotMips64]>;
+def : InstAlias<"beqz $rs,$offset",
+ (BEQ CPURegsOpnd:$rs, ZERO, brtarget:$offset), 1>,
+ Requires<[NotMips64]>;
//===----------------------------------------------------------------------===//
// Assembler Pseudo Instructions
//===----------------------------------------------------------------------===//
@@ -1248,6 +1265,10 @@ def : MipsPat<(brcond (i32 (setge RC:$lhs, immSExt16:$rhs)), bb:$dst),
(BEQ (SLTiOp RC:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>;
def : MipsPat<(brcond (i32 (setuge RC:$lhs, immSExt16:$rhs)), bb:$dst),
(BEQ (SLTiuOp RC:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setgt RC:$lhs, immSExt16Plus1:$rhs)), bb:$dst),
+ (BEQ (SLTiOp RC:$lhs, (Plus1 imm:$rhs)), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setugt RC:$lhs, immSExt16Plus1:$rhs)), bb:$dst),
+ (BEQ (SLTiuOp RC:$lhs, (Plus1 imm:$rhs)), ZERO, bb:$dst)>;
def : MipsPat<(brcond (i32 (setle RC:$lhs, RC:$rhs)), bb:$dst),
(BEQ (SLTOp RC:$rhs, RC:$lhs), ZERO, bb:$dst)>;
@@ -1260,9 +1281,18 @@ def : MipsPat<(brcond RC:$cond, bb:$dst),
defm : BrcondPats<CPURegs, BEQ, BNE, SLT, SLTu, SLTi, SLTiu, ZERO>;
+def : MipsPat<(brcond (i32 (setlt i32:$lhs, 1)), bb:$dst),
+ (BLEZ i32:$lhs, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setgt i32:$lhs, -1)), bb:$dst),
+ (BGEZ i32:$lhs, bb:$dst)>;
+
// setcc patterns
multiclass SeteqPats<RegisterClass RC, Instruction SLTiuOp, Instruction XOROp,
Instruction SLTuOp, Register ZEROReg> {
+ def : MipsPat<(seteq RC:$lhs, 0),
+ (SLTiuOp RC:$lhs, 1)>;
+ def : MipsPat<(setne RC:$lhs, 0),
+ (SLTuOp ZEROReg, RC:$lhs)>;
def : MipsPat<(seteq RC:$lhs, RC:$rhs),
(SLTiuOp (XOROp RC:$lhs, RC:$rhs), 1)>;
def : MipsPat<(setne RC:$lhs, RC:$rhs),
diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp
index bf5ad37..073daba 100644
--- a/lib/Target/Mips/MipsLongBranch.cpp
+++ b/lib/Target/Mips/MipsLongBranch.cpp
@@ -65,7 +65,6 @@ namespace {
static char ID;
MipsLongBranch(TargetMachine &tm)
: MachineFunctionPass(ID), TM(tm),
- TII(static_cast<const MipsInstrInfo*>(tm.getInstrInfo())),
IsPIC(TM.getRelocationModel() == Reloc::PIC_),
ABI(TM.getSubtarget<MipsSubtarget>().getTargetABI()),
LongBranchSeqSize(!IsPIC ? 2 : (ABI == MipsSubtarget::N64 ? 13 : 9)) {}
@@ -85,7 +84,6 @@ namespace {
void expandToLongBranch(MBBInfo &Info);
const TargetMachine &TM;
- const MipsInstrInfo *TII;
MachineFunction *MF;
SmallVector<MBBInfo, 16> MBBInfos;
bool IsPIC;
@@ -172,6 +170,8 @@ void MipsLongBranch::initMBBInfo() {
MBBInfos.clear();
MBBInfos.resize(MF->size());
+ const MipsInstrInfo *TII =
+ static_cast<const MipsInstrInfo*>(TM.getInstrInfo());
for (unsigned I = 0, E = MBBInfos.size(); I < E; ++I) {
MachineBasicBlock *MBB = MF->getBlockNumbered(I);
@@ -217,7 +217,9 @@ int64_t MipsLongBranch::computeOffset(const MachineInstr *Br) {
// MachineBasicBlock operand MBBOpnd.
void MipsLongBranch::replaceBranch(MachineBasicBlock &MBB, Iter Br,
DebugLoc DL, MachineBasicBlock *MBBOpnd) {
- unsigned NewOpc = TII->GetOppositeBranchOpc(Br->getOpcode());
+ const MipsInstrInfo *TII =
+ static_cast<const MipsInstrInfo*>(TM.getInstrInfo());
+ unsigned NewOpc = TII->getOppositeBranchOpc(Br->getOpcode());
const MCInstrDesc &NewDesc = TII->get(NewOpc);
MachineInstrBuilder MIB = BuildMI(MBB, Br, DL, NewDesc);
@@ -247,6 +249,9 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
MachineFunction::iterator FallThroughMBB = ++MachineFunction::iterator(MBB);
MachineBasicBlock *LongBrMBB = MF->CreateMachineBasicBlock(BB);
+ const MipsInstrInfo *TII =
+ static_cast<const MipsInstrInfo*>(TM.getInstrInfo());
+
MF->insert(FallThroughMBB, LongBrMBB);
MBB->removeSuccessor(TgtMBB);
MBB->addSuccessor(LongBrMBB);
@@ -399,6 +404,9 @@ static void emitGPDisp(MachineFunction &F, const MipsInstrInfo *TII) {
}
bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
+ const MipsInstrInfo *TII =
+ static_cast<const MipsInstrInfo*>(TM.getInstrInfo());
+
if (TM.getSubtarget<MipsSubtarget>().inMips16Mode())
return false;
if ((TM.getRelocationModel() == Reloc::PIC_) &&
diff --git a/lib/Target/Mips/MipsOptimizeMathLibCalls.cpp b/lib/Target/Mips/MipsOptimizeMathLibCalls.cpp
new file mode 100644
index 0000000..de3f09c
--- /dev/null
+++ b/lib/Target/Mips/MipsOptimizeMathLibCalls.cpp
@@ -0,0 +1,175 @@
+//===---- MipsOptimizeMathLibCalls.cpp - Optimize math lib calls. ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass does an IR transformation which enables the backend to emit native
+// math instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsTargetMachine.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+static cl::opt<bool> DisableOpt("disable-mips-math-optimization",
+ cl::init(false),
+ cl::desc("MIPS: Disable math lib call "
+ "optimization."), cl::Hidden);
+
+namespace {
+ class MipsOptimizeMathLibCalls : public FunctionPass {
+ public:
+ static char ID;
+
+ MipsOptimizeMathLibCalls(MipsTargetMachine &TM_) :
+ FunctionPass(ID), TM(TM_) {}
+
+ virtual const char *getPassName() const {
+ return "MIPS: Optimize calls to math library functions.";
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+
+ virtual bool runOnFunction(Function &F);
+
+ private:
+ /// Optimize calls to sqrt.
+ bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
+ BasicBlock &CurrBB,
+ Function::iterator &BB);
+
+ const TargetMachine &TM;
+ };
+
+ char MipsOptimizeMathLibCalls::ID = 0;
+}
+
+FunctionPass *llvm::createMipsOptimizeMathLibCalls(MipsTargetMachine &TM) {
+ return new MipsOptimizeMathLibCalls(TM);
+}
+
+void MipsOptimizeMathLibCalls::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetLibraryInfo>();
+ FunctionPass::getAnalysisUsage(AU);
+}
+
+bool MipsOptimizeMathLibCalls::runOnFunction(Function &F) {
+ if (DisableOpt)
+ return false;
+
+ const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
+
+ if (Subtarget.inMips16Mode())
+ return false;
+
+ bool Changed = false;
+ Function::iterator CurrBB;
+ const TargetLibraryInfo *LibInfo = &getAnalysis<TargetLibraryInfo>();
+
+ for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE;) {
+ CurrBB = BB++;
+
+ for (BasicBlock::iterator II = CurrBB->begin(), IE = CurrBB->end();
+ II != IE; ++II) {
+ CallInst *Call = dyn_cast<CallInst>(&*II);
+ Function *CalledFunc;
+
+ if (!Call || !(CalledFunc = Call->getCalledFunction()))
+ continue;
+
+ LibFunc::Func LibFunc;
+ Attribute A = CalledFunc->getAttributes()
+ .getAttribute(AttributeSet::FunctionIndex, "use-soft-float");
+
+ // Skip if function has "use-soft-float" attribute.
+ if ((A.isStringAttribute() && (A.getValueAsString() == "true")) ||
+ TM.Options.UseSoftFloat)
+ continue;
+
+ // Skip if function either has local linkage or is not a known library
+ // function.
+ if (CalledFunc->hasLocalLinkage() || !CalledFunc->hasName() ||
+ !LibInfo->getLibFunc(CalledFunc->getName(), LibFunc))
+ continue;
+
+ switch (LibFunc) {
+ case LibFunc::sqrtf:
+ case LibFunc::sqrt:
+ if (optimizeSQRT(Call, CalledFunc, *CurrBB, BB))
+ break;
+ continue;
+ default:
+ continue;
+ }
+
+ Changed = true;
+ break;
+ }
+ }
+
+ return Changed;
+}
+
+bool MipsOptimizeMathLibCalls::optimizeSQRT(CallInst *Call,
+ Function *CalledFunc,
+ BasicBlock &CurrBB,
+ Function::iterator &BB) {
+  // There is no need to change the IR, since the backend will emit a sqrt
+  // instruction if the call has already been marked read-only.
+ if (Call->onlyReadsMemory())
+ return false;
+
+ // Do the following transformation:
+ //
+ // (before)
+ // dst = sqrt(src)
+ //
+ // (after)
+ // v0 = sqrt_noreadmem(src) # native sqrt instruction.
+ // if (v0 is a NaN)
+ // v1 = sqrt(src) # library call.
+ // dst = phi(v0, v1)
+ //
+
+ // Move all instructions following Call to newly created block JoinBB.
+ // Create phi and replace all uses.
+ BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode(), this);
+ IRBuilder<> Builder(JoinBB, JoinBB->begin());
+ PHINode *Phi = Builder.CreatePHI(Call->getType(), 2);
+ Call->replaceAllUsesWith(Phi);
+
+ // Create basic block LibCallBB and insert a call to library function sqrt.
+ BasicBlock *LibCallBB = BasicBlock::Create(CurrBB.getContext(), "call.sqrt",
+ CurrBB.getParent(), JoinBB);
+ Builder.SetInsertPoint(LibCallBB);
+ Instruction *LibCall = Call->clone();
+ Builder.Insert(LibCall);
+ Builder.CreateBr(JoinBB);
+
+  // Add the "readnone" attribute so that the backend can use a native sqrt
+  // instruction for this call. Insert an FP compare instruction and a
+  // conditional branch at the end of CurrBB.
+ Call->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone);
+ CurrBB.getTerminator()->eraseFromParent();
+ Builder.SetInsertPoint(&CurrBB);
+ Value *FCmp = Builder.CreateFCmpOEQ(Call, Call);
+ Builder.CreateCondBr(FCmp, JoinBB, LibCallBB);
+
+ // Add phi operands.
+ Phi->addIncoming(Call, &CurrBB);
+ Phi->addIncoming(LibCall, LibCallBB);
+
+ BB = JoinBB;
+ return true;
+}
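The transformation optimizeSQRT performs can be read as ordinary control flow. A minimal C++ sketch of the resulting semantics follows; it is illustrative only, the function and variable names are not part of the patch, and std::sqrt stands in for both the readnone clone (which the backend may lower to a native sqrt instruction) and the library fallback placed in the call.sqrt block.

    #include <cmath>

    // Sketch of the fast-path/fallback split built by optimizeSQRT: the first
    // call corresponds to the readnone clone, the self-compare corresponds to
    // the FCmpOEQ at the end of CurrBB, and the second call corresponds to the
    // cloned library call in the call.sqrt block.
    double sqrtWithFallback(double Src) {
      double V0 = std::sqrt(Src);   // native-sqrt candidate (readnone call)
      if (V0 == V0)                 // ordered-equal: false only if V0 is a NaN
        return V0;                  // fast path; the phi in JoinBB takes V0
      double V1 = std::sqrt(Src);   // library call taken on the NaN path
      return V1;                    // the phi in JoinBB takes V1
    }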
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
index 3250733..ae25e45 100644
--- a/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -100,6 +100,10 @@ MipsRegisterInfo::getCallPreservedMask(CallingConv::ID) const {
return CSR_N64_RegMask;
}
+const uint32_t *MipsRegisterInfo::getMips16RetHelperMask() {
+ return CSR_Mips16RetHelper_RegMask;
+}
+
BitVector MipsRegisterInfo::
getReservedRegs(const MachineFunction &MF) const {
static const uint16_t ReservedCPURegs[] = {
@@ -145,7 +149,11 @@ getReservedRegs(const MachineFunction &MF) const {
Reserved.set(Mips::HWR29_64);
// Reserve DSP control register.
- Reserved.set(Mips::DSPCtrl);
+ Reserved.set(Mips::DSPPos);
+ Reserved.set(Mips::DSPSCount);
+ Reserved.set(Mips::DSPCarry);
+ Reserved.set(Mips::DSPEFI);
+ Reserved.set(Mips::DSPOutFlag);
// Reserve RA if in mips16 mode.
if (Subtarget.inMips16Mode()) {
diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h
index 5ed5124..20ba41d 100644
--- a/lib/Target/Mips/MipsRegisterInfo.h
+++ b/lib/Target/Mips/MipsRegisterInfo.h
@@ -46,6 +46,7 @@ public:
MachineFunction &MF) const;
const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
const uint32_t *getCallPreservedMask(CallingConv::ID) const;
+ static const uint32_t *getMips16RetHelperMask();
BitVector getReservedRegs(const MachineFunction &MF) const;
diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td
index 865d4d7..ad6912c 100644
--- a/lib/Target/Mips/MipsRegisterInfo.td
+++ b/lib/Target/Mips/MipsRegisterInfo.td
@@ -11,11 +11,16 @@
// Declarations that describe the MIPS register file
//===----------------------------------------------------------------------===//
let Namespace = "Mips" in {
-def sub_fpeven : SubRegIndex;
-def sub_fpodd : SubRegIndex;
-def sub_32 : SubRegIndex;
-def sub_lo : SubRegIndex;
-def sub_hi : SubRegIndex;
+def sub_fpeven : SubRegIndex<32>;
+def sub_fpodd : SubRegIndex<32, 32>;
+def sub_32 : SubRegIndex<32>;
+def sub_lo : SubRegIndex<32>;
+def sub_hi : SubRegIndex<32, 32>;
+def sub_dsp16_19 : SubRegIndex<4, 16>;
+def sub_dsp20 : SubRegIndex<1, 20>;
+def sub_dsp21 : SubRegIndex<1, 21>;
+def sub_dsp22 : SubRegIndex<1, 22>;
+def sub_dsp23 : SubRegIndex<1, 23>;
}
class Unallocatable {
@@ -264,8 +269,23 @@ let Namespace = "Mips" in {
def AC0_64 : ACC<0, "ac0", [LO64, HI64]>;
- def DSPCtrl : Register<"dspctrl">;
+ // DSP-ASE control register fields.
+ def DSPPos : Register<"">;
+ def DSPSCount : Register<"">;
+ def DSPCarry : Register<"">;
+ def DSPEFI : Register<"">;
+ def DSPOutFlag16_19 : Register<"">;
+ def DSPOutFlag20 : Register<"">;
+ def DSPOutFlag21 : Register<"">;
+ def DSPOutFlag22 : Register<"">;
+ def DSPOutFlag23 : Register<"">;
def DSPCCond : Register<"">;
+
+ let SubRegIndices = [sub_dsp16_19, sub_dsp20, sub_dsp21, sub_dsp22,
+ sub_dsp23] in
+ def DSPOutFlag : RegisterWithSubRegs<"", [DSPOutFlag16_19, DSPOutFlag20,
+ DSPOutFlag21, DSPOutFlag22,
+ DSPOutFlag23]>;
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp
index 2b76704..91ffb94 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -40,6 +40,8 @@ public:
private:
bool expandInstr(MachineBasicBlock &MBB, Iter I);
+ void expandLoadCCond(MachineBasicBlock &MBB, Iter I);
+ void expandStoreCCond(MachineBasicBlock &MBB, Iter I);
void expandLoadACC(MachineBasicBlock &MBB, Iter I, unsigned RegSize);
void expandStoreACC(MachineBasicBlock &MBB, Iter I, unsigned RegSize);
bool expandCopy(MachineBasicBlock &MBB, Iter I);
@@ -47,16 +49,12 @@ private:
unsigned Src, unsigned RegSize);
MachineFunction &MF;
- const MipsSEInstrInfo &TII;
- const MipsRegisterInfo &RegInfo;
MachineRegisterInfo &MRI;
};
}
ExpandPseudo::ExpandPseudo(MachineFunction &MF_)
- : MF(MF_),
- TII(*static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo())),
- RegInfo(TII.getRegisterInfo()), MRI(MF.getRegInfo()) {}
+ : MF(MF_), MRI(MF.getRegInfo()) {}
bool ExpandPseudo::expand() {
bool Expanded = false;
@@ -71,6 +69,14 @@ bool ExpandPseudo::expand() {
bool ExpandPseudo::expandInstr(MachineBasicBlock &MBB, Iter I) {
switch(I->getOpcode()) {
+ case Mips::LOAD_CCOND_DSP:
+ case Mips::LOAD_CCOND_DSP_P8:
+ expandLoadCCond(MBB, I);
+ break;
+ case Mips::STORE_CCOND_DSP:
+ case Mips::STORE_CCOND_DSP_P8:
+ expandStoreCCond(MBB, I);
+ break;
case Mips::LOAD_AC64:
case Mips::LOAD_AC64_P8:
case Mips::LOAD_AC_DSP:
@@ -103,6 +109,46 @@ bool ExpandPseudo::expandInstr(MachineBasicBlock &MBB, Iter I) {
return true;
}
+void ExpandPseudo::expandLoadCCond(MachineBasicBlock &MBB, Iter I) {
+ // load $vr, FI
+ // copy ccond, $vr
+
+ assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
+
+ const MipsSEInstrInfo &TII =
+ *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
+ const MipsRegisterInfo &RegInfo =
+ *static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
+ const TargetRegisterClass *RC = RegInfo.intRegClass(4);
+ unsigned VR = MRI.createVirtualRegister(RC);
+ unsigned Dst = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex();
+
+ TII.loadRegFromStack(MBB, I, VR, FI, RC, &RegInfo, 0);
+ BuildMI(MBB, I, I->getDebugLoc(), TII.get(TargetOpcode::COPY), Dst)
+ .addReg(VR, RegState::Kill);
+}
+
+void ExpandPseudo::expandStoreCCond(MachineBasicBlock &MBB, Iter I) {
+ // copy $vr, ccond
+ // store $vr, FI
+
+ assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
+
+ const MipsSEInstrInfo &TII =
+ *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
+ const MipsRegisterInfo &RegInfo =
+ *static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
+ const TargetRegisterClass *RC = RegInfo.intRegClass(4);
+ unsigned VR = MRI.createVirtualRegister(RC);
+ unsigned Src = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex();
+
+ BuildMI(MBB, I, I->getDebugLoc(), TII.get(TargetOpcode::COPY), VR)
+ .addReg(Src, getKillRegState(I->getOperand(0).isKill()));
+ TII.storeRegToStack(MBB, I, VR, true, FI, RC, &RegInfo, 0);
+}
+
void ExpandPseudo::expandLoadACC(MachineBasicBlock &MBB, Iter I,
unsigned RegSize) {
// load $vr0, FI
@@ -112,6 +158,11 @@ void ExpandPseudo::expandLoadACC(MachineBasicBlock &MBB, Iter I,
assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
+ const MipsSEInstrInfo &TII =
+ *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
+ const MipsRegisterInfo &RegInfo =
+ *static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
const TargetRegisterClass *RC = RegInfo.intRegClass(RegSize);
unsigned VR0 = MRI.createVirtualRegister(RC);
unsigned VR1 = MRI.createVirtualRegister(RC);
@@ -136,6 +187,11 @@ void ExpandPseudo::expandStoreACC(MachineBasicBlock &MBB, Iter I,
assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
+ const MipsSEInstrInfo &TII =
+ *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
+ const MipsRegisterInfo &RegInfo =
+ *static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
const TargetRegisterClass *RC = RegInfo.intRegClass(RegSize);
unsigned VR0 = MRI.createVirtualRegister(RC);
unsigned VR1 = MRI.createVirtualRegister(RC);
@@ -170,6 +226,11 @@ bool ExpandPseudo::expandCopyACC(MachineBasicBlock &MBB, Iter I, unsigned Dst,
// copy $vr1, src_hi
// copy dst_hi, $vr1
+ const MipsSEInstrInfo &TII =
+ *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
+ const MipsRegisterInfo &RegInfo =
+ *static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
const TargetRegisterClass *RC = RegInfo.intRegClass(RegSize);
unsigned VR0 = MRI.createVirtualRegister(RC);
unsigned VR1 = MRI.createVirtualRegister(RC);
@@ -204,10 +265,12 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF) const {
MachineBasicBlock &MBB = MF.front();
MachineFrameInfo *MFI = MF.getFrameInfo();
MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
- const MipsRegisterInfo *RegInfo =
- static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
const MipsSEInstrInfo &TII =
*static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
+ const MipsRegisterInfo &RegInfo =
+ *static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
MachineBasicBlock::iterator MBBI = MBB.begin();
DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
unsigned SP = STI.isABI_N64() ? Mips::SP_64 : Mips::SP;
@@ -222,7 +285,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF) const {
if (StackSize == 0 && !MFI->adjustsStack()) return;
MachineModuleInfo &MMI = MF.getMMI();
- std::vector<MachineMove> &Moves = MMI.getFrameMoves();
+ const MCRegisterInfo &MRI = MMI.getContext().getRegisterInfo();
MachineLocation DstML, SrcML;
// Adjust stack.
@@ -232,9 +295,8 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF) const {
MCSymbol *AdjustSPLabel = MMI.getContext().CreateTempSymbol();
BuildMI(MBB, MBBI, dl,
TII.get(TargetOpcode::PROLOG_LABEL)).addSym(AdjustSPLabel);
- DstML = MachineLocation(MachineLocation::VirtualFP);
- SrcML = MachineLocation(MachineLocation::VirtualFP, -StackSize);
- Moves.push_back(MachineMove(AdjustSPLabel, DstML, SrcML));
+ MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(AdjustSPLabel, -StackSize));
const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
@@ -258,21 +320,22 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF) const {
// If Reg is a double precision register, emit two cfa_offsets,
// one for each of the paired single precision registers.
if (Mips::AFGR64RegClass.contains(Reg)) {
- MachineLocation DstML0(MachineLocation::VirtualFP, Offset);
- MachineLocation DstML1(MachineLocation::VirtualFP, Offset + 4);
- MachineLocation SrcML0(RegInfo->getSubReg(Reg, Mips::sub_fpeven));
- MachineLocation SrcML1(RegInfo->getSubReg(Reg, Mips::sub_fpodd));
+ unsigned Reg0 =
+ MRI.getDwarfRegNum(RegInfo.getSubReg(Reg, Mips::sub_fpeven), true);
+ unsigned Reg1 =
+ MRI.getDwarfRegNum(RegInfo.getSubReg(Reg, Mips::sub_fpodd), true);
if (!STI.isLittle())
- std::swap(SrcML0, SrcML1);
+ std::swap(Reg0, Reg1);
- Moves.push_back(MachineMove(CSLabel, DstML0, SrcML0));
- Moves.push_back(MachineMove(CSLabel, DstML1, SrcML1));
+ MMI.addFrameInst(
+ MCCFIInstruction::createOffset(CSLabel, Reg0, Offset));
+ MMI.addFrameInst(
+ MCCFIInstruction::createOffset(CSLabel, Reg1, Offset + 4));
} else {
// Reg is either in CPURegs or FGR32.
- DstML = MachineLocation(MachineLocation::VirtualFP, Offset);
- SrcML = MachineLocation(Reg);
- Moves.push_back(MachineMove(CSLabel, DstML, SrcML));
+ MMI.addFrameInst(MCCFIInstruction::createOffset(
+ CSLabel, MRI.getDwarfRegNum(Reg, 1), Offset));
}
}
}
@@ -286,7 +349,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF) const {
if (!MBB.isLiveIn(ehDataReg(I)))
MBB.addLiveIn(ehDataReg(I));
TII.storeRegToStackSlot(MBB, MBBI, ehDataReg(I), false,
- MipsFI->getEhDataRegFI(I), RC, RegInfo);
+ MipsFI->getEhDataRegFI(I), RC, &RegInfo);
}
// Emit .cfi_offset directives for eh data registers.
@@ -295,9 +358,8 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF) const {
TII.get(TargetOpcode::PROLOG_LABEL)).addSym(CSLabel2);
for (int I = 0; I < 4; ++I) {
int64_t Offset = MFI->getObjectOffset(MipsFI->getEhDataRegFI(I));
- DstML = MachineLocation(MachineLocation::VirtualFP, Offset);
- SrcML = MachineLocation(ehDataReg(I));
- Moves.push_back(MachineMove(CSLabel2, DstML, SrcML));
+ unsigned Reg = MRI.getDwarfRegNum(ehDataReg(I), true);
+ MMI.addFrameInst(MCCFIInstruction::createOffset(CSLabel2, Reg, Offset));
}
}
@@ -310,9 +372,8 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF) const {
MCSymbol *SetFPLabel = MMI.getContext().CreateTempSymbol();
BuildMI(MBB, MBBI, dl,
TII.get(TargetOpcode::PROLOG_LABEL)).addSym(SetFPLabel);
- DstML = MachineLocation(FP);
- SrcML = MachineLocation(MachineLocation::VirtualFP);
- Moves.push_back(MachineMove(SetFPLabel, DstML, SrcML));
+ MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(
+ SetFPLabel, MRI.getDwarfRegNum(FP, true)));
}
}
@@ -321,10 +382,12 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
MachineFrameInfo *MFI = MF.getFrameInfo();
MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
- const MipsRegisterInfo *RegInfo =
- static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
const MipsSEInstrInfo &TII =
*static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
+ const MipsRegisterInfo &RegInfo =
+ *static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
DebugLoc dl = MBBI->getDebugLoc();
unsigned SP = STI.isABI_N64() ? Mips::SP_64 : Mips::SP;
unsigned FP = STI.isABI_N64() ? Mips::FP_64 : Mips::FP;
@@ -355,7 +418,7 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF,
// Insert instructions that restore eh data registers.
for (int J = 0; J < 4; ++J) {
TII.loadRegFromStackSlot(MBB, I, ehDataReg(J), MipsFI->getEhDataRegFI(J),
- RC, RegInfo);
+ RC, &RegInfo);
}
}
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index b54f1f4..7684bec 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -41,6 +41,31 @@ bool MipsSEDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
return MipsDAGToDAGISel::runOnMachineFunction(MF);
}
+void MipsSEDAGToDAGISel::addDSPCtrlRegOperands(bool IsDef, MachineInstr &MI,
+ MachineFunction &MF) {
+ MachineInstrBuilder MIB(MF, &MI);
+ unsigned Mask = MI.getOperand(1).getImm();
+ unsigned Flag = IsDef ? RegState::ImplicitDefine : RegState::Implicit;
+
+ if (Mask & 1)
+ MIB.addReg(Mips::DSPPos, Flag);
+
+ if (Mask & 2)
+ MIB.addReg(Mips::DSPSCount, Flag);
+
+ if (Mask & 4)
+ MIB.addReg(Mips::DSPCarry, Flag);
+
+ if (Mask & 8)
+ MIB.addReg(Mips::DSPOutFlag, Flag);
+
+ if (Mask & 16)
+ MIB.addReg(Mips::DSPCCond, Flag);
+
+ if (Mask & 32)
+ MIB.addReg(Mips::DSPEFI, Flag);
+}
+
bool MipsSEDAGToDAGISel::replaceUsesWithZeroReg(MachineRegisterInfo *MRI,
const MachineInstr& MI) {
unsigned DstReg = 0, ZeroReg = 0;
@@ -178,12 +203,18 @@ void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
for (MachineFunction::iterator MFI = MF.begin(), MFE = MF.end(); MFI != MFE;
++MFI)
- for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I)
- replaceUsesWithZeroReg(MRI, *I);
+ for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) {
+ if (I->getOpcode() == Mips::RDDSP)
+ addDSPCtrlRegOperands(false, *I, MF);
+ else if (I->getOpcode() == Mips::WRDSP)
+ addDSPCtrlRegOperands(true, *I, MF);
+ else
+ replaceUsesWithZeroReg(MRI, *I);
+ }
}
SDNode *MipsSEDAGToDAGISel::selectAddESubE(unsigned MOp, SDValue InFlag,
- SDValue CmpLHS, DebugLoc DL,
+ SDValue CmpLHS, SDLoc DL,
SDNode *Node) const {
unsigned Opc = InFlag.getOpcode(); (void)Opc;
@@ -285,7 +316,7 @@ bool MipsSEDAGToDAGISel::selectIntAddr(SDValue Addr, SDValue &Base,
std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
unsigned Opcode = Node->getOpcode();
- DebugLoc DL = Node->getDebugLoc();
+ SDLoc DL(Node);
///
// Instruction Selection not handled by the auto-generated
@@ -343,7 +374,7 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
AnalyzeImm.Analyze(Imm, Size, false);
MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin();
- DebugLoc DL = CN->getDebugLoc();
+ SDLoc DL(CN);
SDNode *RegOpnd;
SDValue ImmOpnd = CurDAG->getTargetConstant(SignExtend64<16>(Inst->ImmOpnd),
MVT::i64);
@@ -371,7 +402,7 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
}
case MipsISD::ThreadPointer: {
- EVT PtrVT = TLI.getPointerTy();
+ EVT PtrVT = TLI->getPointerTy();
unsigned RdhwrOpc, SrcReg, DestReg;
if (PtrVT == MVT::i32) {
@@ -385,7 +416,7 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
}
SDNode *Rdhwr =
- CurDAG->getMachineNode(RdhwrOpc, Node->getDebugLoc(),
+ CurDAG->getMachineNode(RdhwrOpc, SDLoc(Node),
Node->getValueType(0),
CurDAG->getRegister(SrcReg, PtrVT));
SDValue Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, DestReg,
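A note on addDSPCtrlRegOperands above: the immediate mask operand of RDDSP/WRDSP selects which fields of the DSP control register the instruction touches, and each set bit becomes one implicit operand on the newly split control sub-registers. The standalone sketch below mirrors that bit-to-field mapping; it is for illustration only and is not part of the patch.

    #include <cstdio>

    // Same mapping as addDSPCtrlRegOperands: bits 0..5 of the mask select
    // DSPPos, DSPSCount, DSPCarry, DSPOutFlag, DSPCCond and DSPEFI.
    void printDSPCtrlFields(unsigned Mask) {
      static const char *const Fields[] = {
        "DSPPos", "DSPSCount", "DSPCarry", "DSPOutFlag", "DSPCCond", "DSPEFI"
      };
      for (unsigned I = 0; I != 6; ++I)
        if (Mask & (1u << I))
          std::printf("%s\n", Fields[I]);
    }

For example, the mask value 1 << 4 used for DSPCC copies in the MipsSEInstrInfo.cpp hunk further down selects only DSPCCond.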
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.h b/lib/Target/Mips/MipsSEISelDAGToDAG.h
index 0dae73d..03ed1f9 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.h
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.h
@@ -27,13 +27,16 @@ private:
virtual bool runOnMachineFunction(MachineFunction &MF);
+ void addDSPCtrlRegOperands(bool IsDef, MachineInstr &MI,
+ MachineFunction &MF);
+
bool replaceUsesWithZeroReg(MachineRegisterInfo *MRI, const MachineInstr&);
- std::pair<SDNode*, SDNode*> selectMULT(SDNode *N, unsigned Opc, DebugLoc dl,
+ std::pair<SDNode*, SDNode*> selectMULT(SDNode *N, unsigned Opc, SDLoc dl,
EVT Ty, bool HasLo, bool HasHi);
SDNode *selectAddESubE(unsigned MOp, SDValue InFlag, SDValue CmpLHS,
- DebugLoc DL, SDNode *Node) const;
+ SDLoc DL, SDNode *Node) const;
virtual bool selectAddrRegImm(SDValue Addr, SDValue &Base,
SDValue &Offset) const;
diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
index 8544bb8..f640ecc 100644
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -186,7 +186,7 @@ static bool selectMADD(SDNode *ADDENode, SelectionDAG *CurDAG) {
if (!MultHi.hasOneUse() || !MultLo.hasOneUse())
return false;
- DebugLoc DL = ADDENode->getDebugLoc();
+ SDLoc DL(ADDENode);
// Initialize accumulator.
SDValue ACCIn = CurDAG->getNode(MipsISD::InsertLOHI, DL, MVT::Untyped,
@@ -262,7 +262,7 @@ static bool selectMSUB(SDNode *SUBENode, SelectionDAG *CurDAG) {
if (!MultHi.hasOneUse() || !MultLo.hasOneUse())
return false;
- DebugLoc DL = SUBENode->getDebugLoc();
+ SDLoc DL(SUBENode);
// Initialize accumulator.
SDValue ACCIn = CurDAG->getNode(MipsISD::InsertLOHI, DL, MVT::Untyped,
@@ -337,7 +337,7 @@ static SDValue performDSPShiftCombine(unsigned Opc, SDNode *N, EVT Ty,
(SplatValue.getZExtValue() >= EltSize))
return SDValue();
- return DAG.getNode(Opc, N->getDebugLoc(), Ty, N->getOperand(0),
+ return DAG.getNode(Opc, SDLoc(N), Ty, N->getOperand(0),
DAG.getConstant(SplatValue.getZExtValue(), MVT::i32));
}
@@ -402,7 +402,7 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) {
if (!isLegalDSPCondCode(Ty, cast<CondCodeSDNode>(N->getOperand(2))->get()))
return SDValue();
- return DAG.getNode(MipsISD::SETCC_DSP, N->getDebugLoc(), Ty, N->getOperand(0),
+ return DAG.getNode(MipsISD::SETCC_DSP, SDLoc(N), Ty, N->getOperand(0),
N->getOperand(1), N->getOperand(2));
}
@@ -417,7 +417,7 @@ static SDValue performVSELECTCombine(SDNode *N, SelectionDAG &DAG) {
if (SetCC.getOpcode() != MipsISD::SETCC_DSP)
return SDValue();
- return DAG.getNode(MipsISD::SELECT_CC_DSP, N->getDebugLoc(), Ty,
+ return DAG.getNode(MipsISD::SELECT_CC_DSP, SDLoc(N), Ty,
SetCC.getOperand(0), SetCC.getOperand(1), N->getOperand(1),
N->getOperand(2), SetCC.getOperand(2));
}
@@ -500,7 +500,7 @@ SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc,
bool HasLo, bool HasHi,
SelectionDAG &DAG) const {
EVT Ty = Op.getOperand(0).getValueType();
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
SDValue Mult = DAG.getNode(NewOpc, DL, MVT::Untyped,
Op.getOperand(0), Op.getOperand(1));
SDValue Lo, Hi;
@@ -520,7 +520,7 @@ SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc,
}
-static SDValue initAccumulator(SDValue In, DebugLoc DL, SelectionDAG &DAG) {
+static SDValue initAccumulator(SDValue In, SDLoc DL, SelectionDAG &DAG) {
SDValue InLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, In,
DAG.getConstant(0, MVT::i32));
SDValue InHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, In,
@@ -528,7 +528,7 @@ static SDValue initAccumulator(SDValue In, DebugLoc DL, SelectionDAG &DAG) {
return DAG.getNode(MipsISD::InsertLOHI, DL, MVT::Untyped, InLo, InHi);
}
-static SDValue extractLOHI(SDValue Op, DebugLoc DL, SelectionDAG &DAG) {
+static SDValue extractLOHI(SDValue Op, SDLoc DL, SelectionDAG &DAG) {
SDValue Lo = DAG.getNode(MipsISD::ExtractLOHI, DL, MVT::i32, Op,
DAG.getConstant(Mips::sub_lo, MVT::i32));
SDValue Hi = DAG.getNode(MipsISD::ExtractLOHI, DL, MVT::i32, Op,
@@ -549,7 +549,7 @@ static SDValue extractLOHI(SDValue Op, DebugLoc DL, SelectionDAG &DAG) {
// out64 = merge-values (v0, v1)
//
static SDValue lowerDSPIntr(SDValue Op, SelectionDAG &DAG, unsigned Opc) {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
bool HasChainIn = Op->getOperand(0).getValueType() == MVT::Other;
SmallVector<SDValue, 3> Ops;
unsigned OpNo = 0;
diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp
index 2e7048d..e2a33dd 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.cpp
+++ b/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -18,15 +18,21 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
+static cl::opt<bool> NoDPLoadStore("mno-ldc1-sdc1", cl::init(false),
+ cl::desc("Expand double precision loads and "
+ "stores to their single precision "
+ "counterparts."));
+
MipsSEInstrInfo::MipsSEInstrInfo(MipsTargetMachine &tm)
: MipsInstrInfo(tm,
tm.getRelocationModel() == Reloc::PIC_ ? Mips::B : Mips::J),
- RI(*tm.getSubtargetImpl(), *this),
+ RI(*tm.getSubtargetImpl()),
IsN64(tm.getSubtarget<MipsSubtarget>().isABI_N64()) {}
const MipsRegisterInfo &MipsSEInstrInfo::getRegisterInfo() const {
@@ -103,6 +109,11 @@ void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Opc = Mips::MFHI_DSP;
else if (Mips::LORegsDSPRegClass.contains(SrcReg))
Opc = Mips::MFLO_DSP;
+ else if (Mips::DSPCCRegClass.contains(SrcReg)) {
+ BuildMI(MBB, I, DL, get(Mips::RDDSP), DestReg).addImm(1 << 4)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ return;
+ }
}
else if (Mips::CPURegsRegClass.contains(SrcReg)) { // Copy from CPU Reg.
if (Mips::CCRRegClass.contains(DestReg))
@@ -117,6 +128,12 @@ void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Opc = Mips::MTHI_DSP;
else if (Mips::LORegsDSPRegClass.contains(DestReg))
Opc = Mips::MTLO_DSP;
+ else if (Mips::DSPCCRegClass.contains(DestReg)) {
+ BuildMI(MBB, I, DL, get(Mips::WRDSP))
+ .addReg(SrcReg, getKillRegState(KillSrc)).addImm(1 << 4)
+ .addReg(DestReg, RegState::ImplicitDefine);
+ return;
+ }
}
else if (Mips::FGR32RegClass.contains(DestReg, SrcReg))
Opc = Mips::FMOV_S;
@@ -180,6 +197,8 @@ storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
Opc = IsN64 ? Mips::STORE_AC_DSP_P8 : Mips::STORE_AC_DSP;
else if (Mips::ACRegs128RegClass.hasSubClassEq(RC))
Opc = IsN64 ? Mips::STORE_AC128_P8 : Mips::STORE_AC128;
+ else if (Mips::DSPCCRegClass.hasSubClassEq(RC))
+ Opc = IsN64 ? Mips::STORE_CCOND_DSP_P8 : Mips::STORE_CCOND_DSP;
else if (Mips::FGR32RegClass.hasSubClassEq(RC))
Opc = IsN64 ? Mips::SWC1_P8 : Mips::SWC1;
else if (Mips::AFGR64RegClass.hasSubClassEq(RC))
@@ -211,6 +230,8 @@ loadRegFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
Opc = IsN64 ? Mips::LOAD_AC_DSP_P8 : Mips::LOAD_AC_DSP;
else if (Mips::ACRegs128RegClass.hasSubClassEq(RC))
Opc = IsN64 ? Mips::LOAD_AC128_P8 : Mips::LOAD_AC128;
+ else if (Mips::DSPCCRegClass.hasSubClassEq(RC))
+ Opc = IsN64 ? Mips::LOAD_CCOND_DSP_P8 : Mips::LOAD_CCOND_DSP;
else if (Mips::FGR32RegClass.hasSubClassEq(RC))
Opc = IsN64 ? Mips::LWC1_P8 : Mips::LWC1;
else if (Mips::AFGR64RegClass.hasSubClassEq(RC))
@@ -230,17 +251,38 @@ bool MipsSEInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
default:
return false;
case Mips::RetRA:
- ExpandRetRA(MBB, MI, Mips::RET);
+ expandRetRA(MBB, MI, Mips::RET);
+ break;
+ case Mips::PseudoCVT_S_W:
+ expandCvtFPInt(MBB, MI, Mips::CVT_S_W, Mips::MTC1, false);
+ break;
+ case Mips::PseudoCVT_D32_W:
+ expandCvtFPInt(MBB, MI, Mips::CVT_D32_W, Mips::MTC1, false);
+ break;
+ case Mips::PseudoCVT_S_L:
+ expandCvtFPInt(MBB, MI, Mips::CVT_S_L, Mips::DMTC1, true);
+ break;
+ case Mips::PseudoCVT_D64_W:
+ expandCvtFPInt(MBB, MI, Mips::CVT_D64_W, Mips::MTC1, true);
+ break;
+ case Mips::PseudoCVT_D64_L:
+ expandCvtFPInt(MBB, MI, Mips::CVT_D64_L, Mips::DMTC1, true);
break;
case Mips::BuildPairF64:
- ExpandBuildPairF64(MBB, MI);
+ expandBuildPairF64(MBB, MI);
break;
case Mips::ExtractElementF64:
- ExpandExtractElementF64(MBB, MI);
+ expandExtractElementF64(MBB, MI);
+ break;
+ case Mips::PseudoLDC1:
+ expandDPLoadStore(MBB, MI, Mips::LDC1, Mips::LWC1);
+ break;
+ case Mips::PseudoSDC1:
+ expandDPLoadStore(MBB, MI, Mips::SDC1, Mips::SWC1);
break;
case Mips::MIPSeh_return32:
case Mips::MIPSeh_return64:
- ExpandEhReturn(MBB, MI);
+ expandEhReturn(MBB, MI);
break;
}
@@ -248,9 +290,9 @@ bool MipsSEInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
return true;
}
-/// GetOppositeBranchOpc - Return the inverse of the specified
+/// getOppositeBranchOpc - Return the inverse of the specified
/// opcode, e.g. turning BEQ to BNE.
-unsigned MipsSEInstrInfo::GetOppositeBranchOpc(unsigned Opc) const {
+unsigned MipsSEInstrInfo::getOppositeBranchOpc(unsigned Opc) const {
switch (Opc) {
default: llvm_unreachable("Illegal opcode!");
case Mips::BEQ: return Mips::BNE;
@@ -331,7 +373,7 @@ MipsSEInstrInfo::loadImmediate(int64_t Imm, MachineBasicBlock &MBB,
return Reg;
}
-unsigned MipsSEInstrInfo::GetAnalyzableBrOpc(unsigned Opc) const {
+unsigned MipsSEInstrInfo::getAnalyzableBrOpc(unsigned Opc) const {
return (Opc == Mips::BEQ || Opc == Mips::BNE || Opc == Mips::BGTZ ||
Opc == Mips::BGEZ || Opc == Mips::BLTZ || Opc == Mips::BLEZ ||
Opc == Mips::BEQ64 || Opc == Mips::BNE64 || Opc == Mips::BGTZ64 ||
@@ -341,13 +383,49 @@ unsigned MipsSEInstrInfo::GetAnalyzableBrOpc(unsigned Opc) const {
Opc : 0;
}
-void MipsSEInstrInfo::ExpandRetRA(MachineBasicBlock &MBB,
+void MipsSEInstrInfo::expandRetRA(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
unsigned Opc) const {
BuildMI(MBB, I, I->getDebugLoc(), get(Opc)).addReg(Mips::RA);
}
-void MipsSEInstrInfo::ExpandExtractElementF64(MachineBasicBlock &MBB,
+std::pair<bool, bool>
+MipsSEInstrInfo::compareOpndSize(unsigned Opc,
+ const MachineFunction &MF) const {
+ const MCInstrDesc &Desc = get(Opc);
+ assert(Desc.NumOperands == 2 && "Unary instruction expected.");
+ const MipsRegisterInfo *RI = &getRegisterInfo();
+ unsigned DstRegSize = getRegClass(Desc, 0, RI, MF)->getSize();
+ unsigned SrcRegSize = getRegClass(Desc, 1, RI, MF)->getSize();
+
+ return std::make_pair(DstRegSize > SrcRegSize, DstRegSize < SrcRegSize);
+}
+
+void MipsSEInstrInfo::expandCvtFPInt(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned CvtOpc, unsigned MovOpc,
+ bool IsI64) const {
+ const MCInstrDesc &CvtDesc = get(CvtOpc), &MovDesc = get(MovOpc);
+ const MachineOperand &Dst = I->getOperand(0), &Src = I->getOperand(1);
+ unsigned DstReg = Dst.getReg(), SrcReg = Src.getReg(), TmpReg = DstReg;
+ unsigned KillSrc = getKillRegState(Src.isKill());
+ DebugLoc DL = I->getDebugLoc();
+ unsigned SubIdx = (IsI64 ? Mips::sub_32 : Mips::sub_fpeven);
+ bool DstIsLarger, SrcIsLarger;
+
+ tie(DstIsLarger, SrcIsLarger) = compareOpndSize(CvtOpc, *MBB.getParent());
+
+ if (DstIsLarger)
+ TmpReg = getRegisterInfo().getSubReg(DstReg, SubIdx);
+
+ if (SrcIsLarger)
+ DstReg = getRegisterInfo().getSubReg(DstReg, SubIdx);
+
+ BuildMI(MBB, I, DL, MovDesc, TmpReg).addReg(SrcReg, KillSrc);
+ BuildMI(MBB, I, DL, CvtDesc, DstReg).addReg(TmpReg, RegState::Kill);
+}
+
+void MipsSEInstrInfo::expandExtractElementF64(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
unsigned DstReg = I->getOperand(0).getReg();
unsigned SrcReg = I->getOperand(1).getReg();
@@ -362,7 +440,7 @@ void MipsSEInstrInfo::ExpandExtractElementF64(MachineBasicBlock &MBB,
BuildMI(MBB, I, dl, Mfc1Tdd, DstReg).addReg(SubReg);
}
-void MipsSEInstrInfo::ExpandBuildPairF64(MachineBasicBlock &MBB,
+void MipsSEInstrInfo::expandBuildPairF64(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
unsigned DstReg = I->getOperand(0).getReg();
unsigned LoReg = I->getOperand(1).getReg(), HiReg = I->getOperand(2).getReg();
@@ -378,7 +456,57 @@ void MipsSEInstrInfo::ExpandBuildPairF64(MachineBasicBlock &MBB,
.addReg(HiReg);
}
-void MipsSEInstrInfo::ExpandEhReturn(MachineBasicBlock &MBB,
+/// Add 4 to the displacement of operand MO.
+static void fixDisp(MachineOperand &MO) {
+ switch (MO.getType()) {
+ default:
+ llvm_unreachable("Unhandled operand type.");
+ case MachineOperand::MO_Immediate:
+ MO.setImm(MO.getImm() + 4);
+ break;
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_BlockAddress:
+ case MachineOperand::MO_TargetIndex:
+ case MachineOperand::MO_ExternalSymbol:
+ MO.setOffset(MO.getOffset() + 4);
+ break;
+ }
+}
+
+void MipsSEInstrInfo::expandDPLoadStore(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned OpcD, unsigned OpcS) const {
+ // If NoDPLoadStore is false, just change the opcode.
+ if (!NoDPLoadStore) {
+ genInstrWithNewOpc(OpcD, I);
+ return;
+ }
+
+ // Expand a double precision FP load or store to two single precision
+ // instructions.
+
+ const TargetRegisterInfo &TRI = getRegisterInfo();
+ const MachineOperand &ValReg = I->getOperand(0);
+ unsigned LoReg = TRI.getSubReg(ValReg.getReg(), Mips::sub_fpeven);
+ unsigned HiReg = TRI.getSubReg(ValReg.getReg(), Mips::sub_fpodd);
+
+ if (!TM.getSubtarget<MipsSubtarget>().isLittle())
+ std::swap(LoReg, HiReg);
+
+ // Create an instruction which loads from or stores to the lower memory
+ // address.
+ MachineInstrBuilder MIB = genInstrWithNewOpc(OpcS, I);
+ MIB->getOperand(0).setReg(LoReg);
+
+ // Create an instruction which loads from or stores to the higher memory
+ // address.
+ MIB = genInstrWithNewOpc(OpcS, I);
+ MIB->getOperand(0).setReg(HiReg);
+ fixDisp(MIB->getOperand(2));
+}
+
+void MipsSEInstrInfo::expandEhReturn(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
// This pseudo instruction is generated as part of the lowering of
// ISD::EH_RETURN. We convert it to a stack increment by OffsetReg, and
diff --git a/lib/Target/Mips/MipsSEInstrInfo.h b/lib/Target/Mips/MipsSEInstrInfo.h
index 0bf7876..d962ef0 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.h
+++ b/lib/Target/Mips/MipsSEInstrInfo.h
@@ -65,7 +65,7 @@ public:
virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
- virtual unsigned GetOppositeBranchOpc(unsigned Opc) const;
+ virtual unsigned getOppositeBranchOpc(unsigned Opc) const;
/// Adjust SP by Amount bytes.
void adjustStackPtr(unsigned SP, int64_t Amount, MachineBasicBlock &MBB,
@@ -79,15 +79,35 @@ public:
unsigned *NewImm) const;
private:
- virtual unsigned GetAnalyzableBrOpc(unsigned Opc) const;
+ virtual unsigned getAnalyzableBrOpc(unsigned Opc) const;
- void ExpandRetRA(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ void expandRetRA(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
unsigned Opc) const;
- void ExpandExtractElementF64(MachineBasicBlock &MBB,
+
+ std::pair<bool, bool> compareOpndSize(unsigned Opc,
+ const MachineFunction &MF) const;
+
+ /// Expand pseudo Int-to-FP conversion instructions.
+ ///
+ /// For example, the following pseudo instruction
+ /// PseudoCVT_D32_W D2, A5
+ /// gets expanded into these two instructions:
+ /// MTC1 F4, A5
+ /// CVT_D32_W D2, F4
+ ///
+ /// We do this expansion post-RA to avoid inserting a floating point copy
+ /// instruction between MTC1 and CVT_D32_W.
+ void expandCvtFPInt(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned CvtOpc, unsigned MovOpc, bool IsI64) const;
+
+ void expandExtractElementF64(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const;
- void ExpandBuildPairF64(MachineBasicBlock &MBB,
+ void expandBuildPairF64(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const;
- void ExpandEhReturn(MachineBasicBlock &MBB,
+ void expandDPLoadStore(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, unsigned OpcD,
+ unsigned OpcS) const;
+ void expandEhReturn(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const;
};
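To make the expandDPLoadStore hook above concrete (a hedged example derived from the MipsSEInstrInfo.cpp changes, not text from the patch): when -mno-ldc1-sdc1 is in effect, a PseudoLDC1 of a 64-bit FPU pair such as $D0 from a frame slot at displacement 8 is rewritten into two LWC1 instructions, one filling the even sub-register ($F0) from displacement 8 and one filling the odd sub-register ($F1) from displacement 12; on big-endian subtargets the two halves swap roles. Without the option, the pseudo is simply rewritten to a real LDC1 or SDC1.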
diff --git a/lib/Target/Mips/MipsSERegisterInfo.cpp b/lib/Target/Mips/MipsSERegisterInfo.cpp
index 9696738..9763f85 100644
--- a/lib/Target/Mips/MipsSERegisterInfo.cpp
+++ b/lib/Target/Mips/MipsSERegisterInfo.cpp
@@ -40,9 +40,8 @@
using namespace llvm;
-MipsSERegisterInfo::MipsSERegisterInfo(const MipsSubtarget &ST,
- const MipsSEInstrInfo &I)
- : MipsRegisterInfo(ST), TII(I) {}
+MipsSERegisterInfo::MipsSERegisterInfo(const MipsSubtarget &ST)
+ : MipsRegisterInfo(ST) {}
bool MipsSERegisterInfo::
requiresRegisterScavenging(const MachineFunction &MF) const {
@@ -119,7 +118,9 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
DebugLoc DL = II->getDebugLoc();
unsigned ADDu = Subtarget.isABI_N64() ? Mips::DADDu : Mips::ADDu;
unsigned NewImm;
-
+ const MipsSEInstrInfo &TII =
+ *static_cast<const MipsSEInstrInfo*>(
+ MBB.getParent()->getTarget().getInstrInfo());
unsigned Reg = TII.loadImmediate(Offset, MBB, II, DL, &NewImm);
BuildMI(MBB, II, DL, TII.get(ADDu), Reg).addReg(FrameReg)
.addReg(Reg, RegState::Kill);
diff --git a/lib/Target/Mips/MipsSERegisterInfo.h b/lib/Target/Mips/MipsSERegisterInfo.h
index 2f7c37b..76cdd9d 100644
--- a/lib/Target/Mips/MipsSERegisterInfo.h
+++ b/lib/Target/Mips/MipsSERegisterInfo.h
@@ -21,11 +21,8 @@ namespace llvm {
class MipsSEInstrInfo;
class MipsSERegisterInfo : public MipsRegisterInfo {
- const MipsSEInstrInfo &TII;
-
public:
- MipsSERegisterInfo(const MipsSubtarget &Subtarget,
- const MipsSEInstrInfo &TII);
+ MipsSERegisterInfo(const MipsSubtarget &Subtarget);
bool requiresRegisterScavenging(const MachineFunction &MF) const;
diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp
index 14a2b27..259e68d 100644
--- a/lib/Target/Mips/MipsSubtarget.cpp
+++ b/lib/Target/Mips/MipsSubtarget.cpp
@@ -48,6 +48,11 @@ static cl::opt<bool> Mips_Os16(
"floating point as Mips 16"),
cl::Hidden);
+static cl::opt<bool>
+Mips16HardFloat("mips16-hard-float", cl::NotHidden,
+                cl::desc("MIPS: Enable mips16 hard float."),
+ cl::init(false));
+
void MipsSubtarget::anchor() { }
MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
@@ -58,7 +63,8 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
IsSingleFloat(false), IsFP64bit(false), IsGP64bit(false), HasVFPU(false),
IsLinux(true), HasSEInReg(false), HasCondMov(false), HasSwap(false),
HasBitCount(false), HasFPIdx(false),
- InMips16Mode(false), InMicroMipsMode(false), HasDSP(false), HasDSPR2(false),
+ InMips16Mode(false), InMips16HardFloat(Mips16HardFloat),
+ InMicroMipsMode(false), HasDSP(false), HasDSPR2(false),
AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16),
RM(_RM), OverrideMode(NoOverride), TM(_TM)
{
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
index f2f0e15..ef7568a 100644
--- a/lib/Target/Mips/MipsSubtarget.h
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -93,6 +93,9 @@ protected:
// InMips16 -- can process Mips16 instructions
bool InMips16Mode;
+ // Mips16 hard float
+ bool InMips16HardFloat;
+
// PreviousInMips16 -- the function we just processed was in Mips 16 Mode
bool PreviousInMips16Mode;
@@ -170,9 +173,12 @@ public:
}
llvm_unreachable("Unexpected mode");
}
- bool inMips16ModeDefault() {
+ bool inMips16ModeDefault() const {
return InMips16Mode;
}
+ bool inMips16HardFloat() const {
+ return inMips16Mode() && InMips16HardFloat;
+ }
bool inMicroMipsMode() const { return InMicroMipsMode; }
bool hasDSP() const { return HasDSP; }
bool hasDSPR2() const { return HasDSPR2; }
@@ -188,7 +194,8 @@ public:
bool hasBitCount() const { return HasBitCount; }
bool hasFPIdx() const { return HasFPIdx; }
- bool allowMixed16_32() const { return AllowMixed16_32;};
+  bool allowMixed16_32() const { return inMips16ModeDefault() ||
+                                        AllowMixed16_32; }
bool os16() const { return Os16;};
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index ee28e2a..9af2f1b 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -22,6 +22,7 @@
#include "MipsSEISelLowering.h"
#include "MipsSEISelDAGToDAG.h"
#include "Mips16FrameLowering.h"
+#include "Mips16HardFloat.h"
#include "Mips16InstrInfo.h"
#include "Mips16ISelDAGToDAG.h"
#include "Mips16ISelLowering.h"
@@ -71,6 +72,7 @@ MipsTargetMachine(const Target &T, StringRef TT,
FrameLowering(MipsFrameLowering::create(*this, Subtarget)),
TLInfo(MipsTargetLowering::create(*this)),
TSInfo(*this), JITInfo() {
+ initAsmInfo();
}
@@ -156,6 +158,9 @@ void MipsPassConfig::addIRPasses() {
TargetPassConfig::addIRPasses();
if (getMipsSubtarget().os16())
addPass(createMipsOs16(getMipsTargetMachine()));
+ if (getMipsSubtarget().inMips16HardFloat())
+ addPass(createMips16HardFloat(getMipsTargetMachine()));
+ addPass(createMipsOptimizeMathLibCalls(getMipsTargetMachine()));
}
// Install an instruction selector pass using
// the ISelDag to gen Mips code.
diff --git a/lib/Target/NVPTX/CMakeLists.txt b/lib/Target/NVPTX/CMakeLists.txt
index 7da2fed..a8293da 100644
--- a/lib/Target/NVPTX/CMakeLists.txt
+++ b/lib/Target/NVPTX/CMakeLists.txt
@@ -23,6 +23,8 @@ set(NVPTXCodeGen_sources
NVPTXAsmPrinter.cpp
NVPTXUtilities.cpp
NVVMReflect.cpp
+ NVPTXGenericToNVVM.cpp
+ NVPTXPrologEpilogPass.cpp
)
add_llvm_target(NVPTXCodeGen ${NVPTXCodeGen_sources})
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
index b3e8b5d..edf4a80 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
@@ -22,7 +22,6 @@ namespace llvm {
enum AddressSpace {
ADDRESS_SPACE_GENERIC = 0,
ADDRESS_SPACE_GLOBAL = 1,
- ADDRESS_SPACE_CONST_NOT_GEN = 2, // Not part of generic space
ADDRESS_SPACE_SHARED = 3,
ADDRESS_SPACE_CONST = 4,
ADDRESS_SPACE_LOCAL = 5,
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
index 459cd96..dfa1ff5 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
@@ -17,17 +17,15 @@
using namespace llvm;
-bool CompileForDebugging;
-
// -debug-compile - Command line option to inform opt and llc passes to
// compile for debugging
-static cl::opt<bool, true>
-Debug("debug-compile", cl::desc("Compile for debugging"), cl::Hidden,
- cl::location(CompileForDebugging), cl::init(false));
+static cl::opt<bool> CompileForDebugging("debug-compile",
+ cl::desc("Compile for debugging"),
+ cl::Hidden, cl::init(false));
void NVPTXMCAsmInfo::anchor() {}
-NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Target &T, const StringRef &TT) {
+NVPTXMCAsmInfo::NVPTXMCAsmInfo(const StringRef &TT) {
Triple TheTriple(TT);
if (TheTriple.getArch() == Triple::nvptx64) {
PointerSize = CalleeSaveStackSlotSize = 8;
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
index 82097da..7d1633f 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
@@ -23,7 +23,7 @@ class StringRef;
class NVPTXMCAsmInfo : public MCAsmInfo {
virtual void anchor();
public:
- explicit NVPTXMCAsmInfo(const Target &T, const StringRef &TT);
+ explicit NVPTXMCAsmInfo(const StringRef &TT);
};
} // namespace llvm
diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h
index 6a53a44..179dc27 100644
--- a/lib/Target/NVPTX/NVPTX.h
+++ b/lib/Target/NVPTX/NVPTX.h
@@ -16,6 +16,7 @@
#define LLVM_TARGET_NVPTX_H
#include "MCTargetDesc/NVPTXBaseInfo.h"
+#include "llvm/ADT/StringMap.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/ErrorHandling.h"
@@ -26,6 +27,7 @@
namespace llvm {
class NVPTXTargetMachine;
class FunctionPass;
+class MachineFunctionPass;
class formatted_raw_ostream;
namespace NVPTXCC {
@@ -62,6 +64,10 @@ createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOpt::Level OptLevel);
FunctionPass *createLowerStructArgsPass(NVPTXTargetMachine &);
FunctionPass *createNVPTXReMatPass(NVPTXTargetMachine &);
FunctionPass *createNVPTXReMatBlockPass(NVPTXTargetMachine &);
+ModulePass *createGenericToNVVMPass();
+ModulePass *createNVVMReflectPass();
+ModulePass *createNVVMReflectPass(const StringMap<int>& Mapping);
+MachineFunctionPass *createNVPTXPrologEpilogPass();
bool isImageOrSamplerVal(const Value *, const Module *);
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index ce5d78a..ff73931 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -68,11 +68,12 @@ InterleaveSrc("nvptx-emit-src", cl::ZeroOrMore,
namespace {
/// DiscoverDependentGlobals - Return a set of GlobalVariables on which \p V
/// depends.
-void DiscoverDependentGlobals(Value *V, DenseSet<GlobalVariable *> &Globals) {
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+void DiscoverDependentGlobals(const Value *V,
+ DenseSet<const GlobalVariable *> &Globals) {
+ if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
Globals.insert(GV);
else {
- if (User *U = dyn_cast<User>(V)) {
+ if (const User *U = dyn_cast<User>(V)) {
for (unsigned i = 0, e = U->getNumOperands(); i != e; ++i) {
DiscoverDependentGlobals(U->getOperand(i), Globals);
}
@@ -84,8 +85,9 @@ void DiscoverDependentGlobals(Value *V, DenseSet<GlobalVariable *> &Globals) {
/// instances to be emitted, but only after any dependents have been added
/// first.
void VisitGlobalVariableForEmission(
- GlobalVariable *GV, SmallVectorImpl<GlobalVariable *> &Order,
- DenseSet<GlobalVariable *> &Visited, DenseSet<GlobalVariable *> &Visiting) {
+ const GlobalVariable *GV, SmallVectorImpl<const GlobalVariable *> &Order,
+ DenseSet<const GlobalVariable *> &Visited,
+ DenseSet<const GlobalVariable *> &Visiting) {
// Have we already visited this one?
if (Visited.count(GV))
return;
@@ -98,12 +100,12 @@ void VisitGlobalVariableForEmission(
Visiting.insert(GV);
// Make sure we visit all dependents first
- DenseSet<GlobalVariable *> Others;
+ DenseSet<const GlobalVariable *> Others;
for (unsigned i = 0, e = GV->getNumOperands(); i != e; ++i)
DiscoverDependentGlobals(GV->getOperand(i), Others);
- for (DenseSet<GlobalVariable *>::iterator I = Others.begin(),
- E = Others.end();
+ for (DenseSet<const GlobalVariable *>::iterator I = Others.begin(),
+ E = Others.end();
I != E; ++I)
VisitGlobalVariableForEmission(*I, Order, Visited, Visiting);
@@ -405,6 +407,11 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() {
SmallString<128> Str;
raw_svector_ostream O(Str);
+ if (!GlobalsEmitted) {
+ emitGlobals(*MF->getFunction()->getParent());
+ GlobalsEmitted = true;
+ }
+
// Set up
MRI = &MF->getRegInfo();
F = MF->getFunction();
@@ -429,9 +436,7 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() {
}
void NVPTXAsmPrinter::EmitFunctionBodyStart() {
- const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
- unsigned numRegClasses = TRI.getNumRegClasses();
- VRidGlobal2LocalMap = new std::map<unsigned, unsigned>[numRegClasses + 1];
+ VRegMapping.clear();
OutStreamer.EmitRawText(StringRef("{\n"));
setAndEmitFunctionVirtualRegisters(*MF);
@@ -443,7 +448,7 @@ void NVPTXAsmPrinter::EmitFunctionBodyStart() {
void NVPTXAsmPrinter::EmitFunctionBodyEnd() {
OutStreamer.EmitRawText(StringRef("}\n"));
- delete[] VRidGlobal2LocalMap;
+ VRegMapping.clear();
}
void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
@@ -500,9 +505,8 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
void NVPTXAsmPrinter::getVirtualRegisterName(unsigned vr, bool isVec,
raw_ostream &O) {
const TargetRegisterClass *RC = MRI->getRegClass(vr);
- unsigned id = RC->getID();
- std::map<unsigned, unsigned> &regmap = VRidGlobal2LocalMap[id];
+ DenseMap<unsigned, unsigned> &regmap = VRegMapping[RC];
unsigned mapped_vr = regmap[vr];
if (!isVec) {
@@ -695,7 +699,7 @@ void NVPTXAsmPrinter::emitDeclaration(const Function *F, raw_ostream &O) {
else
O << ".func ";
printReturnValStr(F, O);
- O << *CurrentFnSym << "\n";
+ O << *Mang->getSymbol(F) << "\n";
emitFunctionParamList(F, O);
O << ";\n";
}
@@ -795,7 +799,7 @@ static bool useFuncSeen(const Constant *C,
return false;
}
-void NVPTXAsmPrinter::emitDeclarations(Module &M, raw_ostream &O) {
+void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) {
llvm::DenseMap<const Function *, bool> seenMap;
for (Module::const_iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) {
const Function *F = FI;
@@ -805,7 +809,6 @@ void NVPTXAsmPrinter::emitDeclarations(Module &M, raw_ostream &O) {
continue;
if (F->getIntrinsicID())
continue;
- CurrentFnSym = Mang->getSymbol(F);
emitDeclaration(F, O);
continue;
}
@@ -817,14 +820,12 @@ void NVPTXAsmPrinter::emitDeclarations(Module &M, raw_ostream &O) {
// The use is in the initialization of a global variable
// that is a function pointer, so print a declaration
// for the original function
- CurrentFnSym = Mang->getSymbol(F);
emitDeclaration(F, O);
break;
}
// Emit a declaration of this function if the function that
// uses this constant expr has already been seen.
if (useFuncSeen(C, seenMap)) {
- CurrentFnSym = Mang->getSymbol(F);
emitDeclaration(F, O);
break;
}
@@ -844,7 +845,6 @@ void NVPTXAsmPrinter::emitDeclarations(Module &M, raw_ostream &O) {
// appearing in the module before the callee. so print out
// a declaration for the callee.
if (seenMap.find(caller) != seenMap.end()) {
- CurrentFnSym = Mang->getSymbol(F);
emitDeclaration(F, O);
break;
}
@@ -909,7 +909,7 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) {
const_cast<TargetLoweringObjectFile &>(getObjFileLowering())
.Initialize(OutContext, TM);
- Mang = new Mangler(OutContext, *TM.getDataLayout());
+ Mang = new Mangler(OutContext, &TM);
// Emit header before any dwarf directives are emitted below.
emitHeader(M, OS1);
@@ -921,6 +921,12 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) {
if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA)
recordAndEmitFilenames(M);
+ GlobalsEmitted = false;
+
+ return false; // success
+}
+
+void NVPTXAsmPrinter::emitGlobals(const Module &M) {
SmallString<128> Str2;
raw_svector_ostream OS2(Str2);
@@ -931,13 +937,13 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) {
// global variable in order, and ensure that we emit it *after* its dependent
// globals. We use a little extra memory maintaining both a set and a list to
// have fast searches while maintaining a strict ordering.
- SmallVector<GlobalVariable *, 8> Globals;
- DenseSet<GlobalVariable *> GVVisited;
- DenseSet<GlobalVariable *> GVVisiting;
+ SmallVector<const GlobalVariable *, 8> Globals;
+ DenseSet<const GlobalVariable *> GVVisited;
+ DenseSet<const GlobalVariable *> GVVisiting;
// Visit each global variable, in order
- for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E;
- ++I)
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I)
VisitGlobalVariableForEmission(I, Globals, GVVisited, GVVisiting);
assert(GVVisited.size() == M.getGlobalList().size() &&
@@ -951,7 +957,6 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) {
OS2 << '\n';
OutStreamer.EmitRawText(OS2.str());
- return false; // success
}
void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O) {
@@ -989,6 +994,14 @@ void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O) {
}
bool NVPTXAsmPrinter::doFinalization(Module &M) {
+
+ // If we did not emit any functions, then the global declarations have not
+ // yet been emitted.
+ if (!GlobalsEmitted) {
+ emitGlobals(M);
+ GlobalsEmitted = true;
+ }
+
  // XXX Temporarily remove global variables so that doFinalization() will not
// emit them again (global variables are emitted at beginning).
@@ -1063,7 +1076,8 @@ void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue *V,
}
}
-void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O,
+void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
+ raw_ostream &O,
bool processDemoted) {
// Skip meta data
@@ -1107,10 +1121,10 @@ void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O,
if (llvm::isSampler(*GVar)) {
O << ".global .samplerref " << llvm::getSamplerName(*GVar);
- Constant *Initializer = NULL;
+ const Constant *Initializer = NULL;
if (GVar->hasInitializer())
Initializer = GVar->getInitializer();
- ConstantInt *CI = NULL;
+ const ConstantInt *CI = NULL;
if (Initializer)
CI = dyn_cast<ConstantInt>(Initializer);
if (CI) {
@@ -1183,7 +1197,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O,
if (localDecls.find(demotedFunc) != localDecls.end())
localDecls[demotedFunc].push_back(GVar);
else {
- std::vector<GlobalVariable *> temp;
+ std::vector<const GlobalVariable *> temp;
temp.push_back(GVar);
localDecls[demotedFunc] = temp;
}
@@ -1199,17 +1213,20 @@ void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O,
if (ETy->isPrimitiveType() || ETy->isIntegerTy() || isa<PointerType>(ETy)) {
O << " .";
- O << getPTXFundamentalTypeStr(ETy, false);
+ // Special case: ABI requires that we use .u8 for predicates
+ if (ETy->isIntegerTy(1))
+ O << "u8";
+ else
+ O << getPTXFundamentalTypeStr(ETy, false);
O << " ";
O << *Mang->getSymbol(GVar);
  // Ptx allows variable initialization only for constant and global state
// spaces.
if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) ||
- (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST_NOT_GEN) ||
(PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) &&
GVar->hasInitializer()) {
- Constant *Initializer = GVar->getInitializer();
+ const Constant *Initializer = GVar->getInitializer();
if (!Initializer->isNullValue()) {
O << " = ";
printScalarConstant(Initializer, O);
@@ -1230,10 +1247,9 @@ void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O,
  // Ptx allows variable initialization only for constant and
// global state spaces.
if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) ||
- (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST_NOT_GEN) ||
(PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) &&
GVar->hasInitializer()) {
- Constant *Initializer = GVar->getInitializer();
+ const Constant *Initializer = GVar->getInitializer();
if (!isa<UndefValue>(Initializer) && !Initializer->isNullValue()) {
AggBuffer aggBuffer(ElementSize, O, *this);
bufferAggregateConstant(Initializer, &aggBuffer);
@@ -1283,7 +1299,7 @@ void NVPTXAsmPrinter::emitDemotedVars(const Function *f, raw_ostream &O) {
if (localDecls.find(f) == localDecls.end())
return;
- std::vector<GlobalVariable *> &gvars = localDecls[f];
+ std::vector<const GlobalVariable *> &gvars = localDecls[f];
for (unsigned i = 0, e = gvars.size(); i != e; ++i) {
O << "\t// demoted variable\n\t";
@@ -1301,14 +1317,6 @@ void NVPTXAsmPrinter::emitPTXAddressSpace(unsigned int AddressSpace,
O << "global";
break;
case llvm::ADDRESS_SPACE_CONST:
- // This logic should be consistent with that in
- // getCodeAddrSpace() (NVPTXISelDATToDAT.cpp)
- if (nvptxSubtarget.hasGenericLdSt())
- O << "global";
- else
- O << "const";
- break;
- case llvm::ADDRESS_SPACE_CONST_NOT_GEN:
O << "const";
break;
case llvm::ADDRESS_SPACE_SHARED:
@@ -1448,7 +1456,7 @@ void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I,
int paramIndex, raw_ostream &O) {
if ((nvptxSubtarget.getDrvInterface() == NVPTX::NVCL) ||
(nvptxSubtarget.getDrvInterface() == NVPTX::CUDA))
- O << *CurrentFnSym << "_param_" << paramIndex;
+ O << *Mang->getSymbol(I->getParent()) << "_param_" << paramIndex;
else {
std::string argName = I->getName();
const char *p = argName.c_str();
@@ -1507,11 +1515,13 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
if (llvm::isImage(*I)) {
std::string sname = I->getName();
if (llvm::isImageWriteOnly(*I))
- O << "\t.param .surfref " << *CurrentFnSym << "_param_" << paramIndex;
+ O << "\t.param .surfref " << *Mang->getSymbol(F) << "_param_"
+ << paramIndex;
else // Default image is read_only
- O << "\t.param .texref " << *CurrentFnSym << "_param_" << paramIndex;
+ O << "\t.param .texref " << *Mang->getSymbol(F) << "_param_"
+ << paramIndex;
} else // Should be llvm::isSampler(*I)
- O << "\t.param .samplerref " << *CurrentFnSym << "_param_"
+ O << "\t.param .samplerref " << *Mang->getSymbol(F) << "_param_"
<< paramIndex;
continue;
}
@@ -1546,14 +1556,13 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
default:
O << ".ptr ";
break;
- case llvm::ADDRESS_SPACE_CONST_NOT_GEN:
+ case llvm::ADDRESS_SPACE_CONST:
O << ".ptr .const ";
break;
case llvm::ADDRESS_SPACE_SHARED:
O << ".ptr .shared ";
break;
case llvm::ADDRESS_SPACE_GLOBAL:
- case llvm::ADDRESS_SPACE_CONST:
O << ".ptr .global ";
break;
}
@@ -1564,7 +1573,13 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
}
// non-pointer scalar to kernel func
- O << "\t.param ." << getPTXFundamentalTypeStr(Ty) << " ";
+ O << "\t.param .";
+ // Special case: predicate operands become .u8 types
+ if (Ty->isIntegerTy(1))
+ O << "u8";
+ else
+ O << getPTXFundamentalTypeStr(Ty);
+ O << " ";
printParamName(I, paramIndex, O);
continue;
}
@@ -1680,48 +1695,36 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters(
for (unsigned i = 0; i < numVRs; i++) {
unsigned int vr = TRI->index2VirtReg(i);
const TargetRegisterClass *RC = MRI->getRegClass(vr);
- std::map<unsigned, unsigned> &regmap = VRidGlobal2LocalMap[RC->getID()];
+ DenseMap<unsigned, unsigned> &regmap = VRegMapping[RC];
int n = regmap.size();
regmap.insert(std::make_pair(vr, n + 1));
}
// Emit register declarations
// @TODO: Extract out the real register usage
- O << "\t.reg .pred %p<" << NVPTXNumRegisters << ">;\n";
- O << "\t.reg .s16 %rc<" << NVPTXNumRegisters << ">;\n";
- O << "\t.reg .s16 %rs<" << NVPTXNumRegisters << ">;\n";
- O << "\t.reg .s32 %r<" << NVPTXNumRegisters << ">;\n";
- O << "\t.reg .s64 %rl<" << NVPTXNumRegisters << ">;\n";
- O << "\t.reg .f32 %f<" << NVPTXNumRegisters << ">;\n";
- O << "\t.reg .f64 %fl<" << NVPTXNumRegisters << ">;\n";
+ // O << "\t.reg .pred %p<" << NVPTXNumRegisters << ">;\n";
+ // O << "\t.reg .s16 %rc<" << NVPTXNumRegisters << ">;\n";
+ // O << "\t.reg .s16 %rs<" << NVPTXNumRegisters << ">;\n";
+ // O << "\t.reg .s32 %r<" << NVPTXNumRegisters << ">;\n";
+ // O << "\t.reg .s64 %rl<" << NVPTXNumRegisters << ">;\n";
+ // O << "\t.reg .f32 %f<" << NVPTXNumRegisters << ">;\n";
+ // O << "\t.reg .f64 %fl<" << NVPTXNumRegisters << ">;\n";
// Emit declaration of the virtual registers or 'physical' registers for
// each register class
- //for (unsigned i=0; i< numRegClasses; i++) {
- // std::map<unsigned, unsigned> &regmap = VRidGlobal2LocalMap[i];
- // const TargetRegisterClass *RC = TRI->getRegClass(i);
- // std::string rcname = getNVPTXRegClassName(RC);
- // std::string rcStr = getNVPTXRegClassStr(RC);
- // //int n = regmap.size();
- // if (!isNVPTXVectorRegClass(RC)) {
- // O << "\t.reg " << rcname << " \t" << rcStr << "<"
- // << NVPTXNumRegisters << ">;\n";
- // }
-
- // Only declare those registers that may be used. And do not emit vector
- // registers as
- // they are all elementized to scalar registers.
- //if (n && !isNVPTXVectorRegClass(RC)) {
- // if (RegAllocNilUsed) {
- // O << "\t.reg " << rcname << " \t" << rcStr << "<" << (n+1)
- // << ">;\n";
- // }
- // else {
- // O << "\t.reg " << rcname << " \t" << StrToUpper(rcStr)
- // << "<" << 32 << ">;\n";
- // }
- //}
- //}
+ for (unsigned i=0; i< TRI->getNumRegClasses(); i++) {
+ const TargetRegisterClass *RC = TRI->getRegClass(i);
+ DenseMap<unsigned, unsigned> &regmap = VRegMapping[RC];
+ std::string rcname = getNVPTXRegClassName(RC);
+ std::string rcStr = getNVPTXRegClassStr(RC);
+ int n = regmap.size();
+
+ // Only declare those registers that may be used.
+ if (n) {
+ O << "\t.reg " << rcname << " \t" << rcStr << "<" << (n+1)
+ << ">;\n";
+ }
+ }
OutStreamer.EmitRawText(O.str());
}
@@ -1751,12 +1754,12 @@ void NVPTXAsmPrinter::printFPConstant(const ConstantFP *Fp, raw_ostream &O) {
O << utohexstr(API.getZExtValue());
}
-void NVPTXAsmPrinter::printScalarConstant(Constant *CPV, raw_ostream &O) {
- if (ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) {
+void NVPTXAsmPrinter::printScalarConstant(const Constant *CPV, raw_ostream &O) {
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) {
O << CI->getValue();
return;
}
- if (ConstantFP *CFP = dyn_cast<ConstantFP>(CPV)) {
+ if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CPV)) {
printFPConstant(CFP, O);
return;
}
@@ -1764,13 +1767,13 @@ void NVPTXAsmPrinter::printScalarConstant(Constant *CPV, raw_ostream &O) {
O << "0";
return;
}
- if (GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) {
+ if (const GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) {
O << *Mang->getSymbol(GVar);
return;
}
- if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
- Value *v = Cexpr->stripPointerCasts();
- if (GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
+ if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+ const Value *v = Cexpr->stripPointerCasts();
+ if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
O << *Mang->getSymbol(GVar);
return;
} else {
@@ -1781,7 +1784,7 @@ void NVPTXAsmPrinter::printScalarConstant(Constant *CPV, raw_ostream &O) {
llvm_unreachable("Not scalar type found in printScalarConstant()");
}
-void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes,
+void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
AggBuffer *aggBuffer) {
const DataLayout *TD = TM.getDataLayout();
@@ -1809,13 +1812,13 @@ void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes,
ptr = (unsigned char *)&int16;
aggBuffer->addBytes(ptr, 2, Bytes);
} else if (ETy == Type::getInt32Ty(CPV->getContext())) {
- if (ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) {
+ if (const ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) {
int int32 = (int)(constInt->getZExtValue());
ptr = (unsigned char *)&int32;
aggBuffer->addBytes(ptr, 4, Bytes);
break;
- } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
- if (ConstantInt *constInt = dyn_cast<ConstantInt>(
+ } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+ if (const ConstantInt *constInt = dyn_cast<ConstantInt>(
ConstantFoldConstantExpression(Cexpr, TD))) {
int int32 = (int)(constInt->getZExtValue());
ptr = (unsigned char *)&int32;
@@ -1831,13 +1834,13 @@ void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes,
}
llvm_unreachable("unsupported integer const type");
} else if (ETy == Type::getInt64Ty(CPV->getContext())) {
- if (ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) {
+ if (const ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) {
long long int64 = (long long)(constInt->getZExtValue());
ptr = (unsigned char *)&int64;
aggBuffer->addBytes(ptr, 8, Bytes);
break;
- } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
- if (ConstantInt *constInt = dyn_cast<ConstantInt>(
+ } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+ if (const ConstantInt *constInt = dyn_cast<ConstantInt>(
ConstantFoldConstantExpression(Cexpr, TD))) {
long long int64 = (long long)(constInt->getZExtValue());
ptr = (unsigned char *)&int64;
@@ -1858,7 +1861,7 @@ void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes,
}
case Type::FloatTyID:
case Type::DoubleTyID: {
- ConstantFP *CFP = dyn_cast<ConstantFP>(CPV);
+ const ConstantFP *CFP = dyn_cast<ConstantFP>(CPV);
const Type *Ty = CFP->getType();
if (Ty == Type::getFloatTy(CPV->getContext())) {
float float32 = (float) CFP->getValueAPF().convertToFloat();
@@ -1874,10 +1877,10 @@ void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes,
break;
}
case Type::PointerTyID: {
- if (GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) {
+ if (const GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) {
aggBuffer->addSymbol(GVar);
- } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
- Value *v = Cexpr->stripPointerCasts();
+ } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+ const Value *v = Cexpr->stripPointerCasts();
aggBuffer->addSymbol(v);
}
unsigned int s = TD->getTypeAllocSize(CPV->getType());
@@ -1906,7 +1909,7 @@ void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes,
}
}
-void NVPTXAsmPrinter::bufferAggregateConstant(Constant *CPV,
+void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV,
AggBuffer *aggBuffer) {
const DataLayout *TD = TM.getDataLayout();
int Bytes;
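
An illustrative sketch (not taken from the patch; names are assumptions) of the per-class virtual-register numbering that the rewritten setAndEmitFunctionVirtualRegisters keeps in VRegMapping: each register class gets its own 1-based counter, so the printer can emit a .reg declaration only for classes whose map is non-empty.

    #include "llvm/ADT/DenseMap.h"
    #include "llvm/Target/TargetRegisterInfo.h"

    using namespace llvm;

    // Per-register-class renumbering: the first virtual register of a class
    // becomes local id 1, the next 2, and so on, hence the "<n+1>" bound in
    // the emitted ".reg" declarations.
    typedef DenseMap<unsigned, unsigned> VRegMap;
    static DenseMap<const TargetRegisterClass *, VRegMap> LocalVRegMapping;

    static unsigned getLocalId(const TargetRegisterClass *RC, unsigned VReg) {
      VRegMap &RegMap = LocalVRegMapping[RC];
      VRegMap::iterator It = RegMap.find(VReg);
      if (It != RegMap.end())
        return It->second;
      unsigned Id = RegMap.size() + 1; // ids start at 1
      RegMap[VReg] = Id;
      return Id;
    }
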
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h
index 6dc9fc0..55f2943 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -91,7 +91,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
unsigned char *buffer; // the buffer
unsigned numSymbols; // number of symbol addresses
SmallVector<unsigned, 4> symbolPosInBuffer;
- SmallVector<Value *, 4> Symbols;
+ SmallVector<const Value *, 4> Symbols;
private:
unsigned curpos;
@@ -128,7 +128,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
}
return curpos;
}
- void addSymbol(Value *GVar) {
+ void addSymbol(const Value *GVar) {
symbolPosInBuffer.push_back(curpos);
Symbols.push_back(GVar);
numSymbols++;
@@ -153,11 +153,11 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
if (pos)
O << ", ";
if (pos == nextSymbolPos) {
- Value *v = Symbols[nSym];
- if (GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
+ const Value *v = Symbols[nSym];
+ if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
MCSymbol *Name = AP.Mang->getSymbol(GVar);
O << *Name;
- } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(v)) {
+ } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(v)) {
O << *nvptx::LowerConstant(Cexpr, AP);
} else
llvm_unreachable("symbol type unknown");
@@ -205,10 +205,12 @@ private:
void printImplicitDef(const MachineInstr *MI, raw_ostream &O) const;
// definition autogenerated.
void printInstruction(const MachineInstr *MI, raw_ostream &O);
- void printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O, bool = false);
+ void printModuleLevelGV(const GlobalVariable *GVar, raw_ostream &O,
+ bool = false);
void printParamName(int paramIndex, raw_ostream &O);
void printParamName(Function::const_arg_iterator I, int paramIndex,
raw_ostream &O);
+ void emitGlobals(const Module &M);
void emitHeader(Module &M, raw_ostream &O);
void emitKernelFunctionDirectives(const Function &F, raw_ostream &O) const;
void emitVirtualRegister(unsigned int vr, bool isVec, raw_ostream &O);
@@ -234,12 +236,16 @@ protected:
private:
std::string CurrentBankselLabelInBasicBlock;
+ bool GlobalsEmitted;
+
// This is specific per MachineFunction.
const MachineRegisterInfo *MRI;
// The contents are specific for each
// MachineFunction. But the size of the
// array is not.
- std::map<unsigned, unsigned> *VRidGlobal2LocalMap;
+ typedef DenseMap<unsigned, unsigned> VRegMap;
+ typedef DenseMap<const TargetRegisterClass *, VRegMap> VRegRCMap;
+ VRegRCMap VRegMapping;
// cache the subtarget here.
const NVPTXSubtarget &nvptxSubtarget;
// Build the map between type name and ID based on module's type
@@ -247,7 +253,7 @@ private:
std::map<const Type *, std::string> TypeNameMap;
// List of variables demoted to a function scope.
- std::map<const Function *, std::vector<GlobalVariable *> > localDecls;
+ std::map<const Function *, std::vector<const GlobalVariable *> > localDecls;
// To record filename to ID mapping
std::map<std::string, unsigned> filenameMap;
@@ -256,15 +262,15 @@ private:
void emitPTXGlobalVariable(const GlobalVariable *GVar, raw_ostream &O);
void emitPTXAddressSpace(unsigned int AddressSpace, raw_ostream &O) const;
std::string getPTXFundamentalTypeStr(const Type *Ty, bool = true) const;
- void printScalarConstant(Constant *CPV, raw_ostream &O);
+ void printScalarConstant(const Constant *CPV, raw_ostream &O);
void printFPConstant(const ConstantFP *Fp, raw_ostream &O);
- void bufferLEByte(Constant *CPV, int Bytes, AggBuffer *aggBuffer);
- void bufferAggregateConstant(Constant *CV, AggBuffer *aggBuffer);
+ void bufferLEByte(const Constant *CPV, int Bytes, AggBuffer *aggBuffer);
+ void bufferAggregateConstant(const Constant *CV, AggBuffer *aggBuffer);
void printOperandProper(const MachineOperand &MO);
void emitLinkageDirective(const GlobalValue *V, raw_ostream &O);
- void emitDeclarations(Module &, raw_ostream &O);
+ void emitDeclarations(const Module &, raw_ostream &O);
void emitDeclaration(const Function *, raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
@@ -277,7 +283,6 @@ public:
: AsmPrinter(TM, Streamer),
nvptxSubtarget(TM.getSubtarget<NVPTXSubtarget>()) {
CurrentBankselLabelInBasicBlock = "";
- VRidGlobal2LocalMap = NULL;
reader = NULL;
}
diff --git a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
new file mode 100644
index 0000000..1077c46
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
@@ -0,0 +1,436 @@
+//===-- GenericToNVVM.cpp - Convert generic module to NVVM module - C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Convert generic global variables into either .global or .const access based
+// on the variable's "constant" qualifier.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXUtilities.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
+
+#include "llvm/PassManager.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/ADT/ValueMap.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/IRBuilder.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeGenericToNVVMPass(PassRegistry &);
+}
+
+namespace {
+class GenericToNVVM : public ModulePass {
+public:
+ static char ID;
+
+ GenericToNVVM() : ModulePass(ID) {}
+
+ virtual bool runOnModule(Module &M);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ }
+
+private:
+ Value *getOrInsertCVTA(Module *M, Function *F, GlobalVariable *GV,
+ IRBuilder<> &Builder);
+ Value *remapConstant(Module *M, Function *F, Constant *C,
+ IRBuilder<> &Builder);
+ Value *remapConstantVectorOrConstantAggregate(Module *M, Function *F,
+ Constant *C,
+ IRBuilder<> &Builder);
+ Value *remapConstantExpr(Module *M, Function *F, ConstantExpr *C,
+ IRBuilder<> &Builder);
+ void remapNamedMDNode(Module *M, NamedMDNode *N);
+ MDNode *remapMDNode(Module *M, MDNode *N);
+
+ typedef ValueMap<GlobalVariable *, GlobalVariable *> GVMapTy;
+ typedef ValueMap<Constant *, Value *> ConstantToValueMapTy;
+ GVMapTy GVMap;
+ ConstantToValueMapTy ConstantToValueMap;
+};
+}
+
+char GenericToNVVM::ID = 0;
+
+ModulePass *llvm::createGenericToNVVMPass() { return new GenericToNVVM(); }
+
+INITIALIZE_PASS(
+ GenericToNVVM, "generic-to-nvvm",
+ "Ensure that the global variables are in the global address space", false,
+ false)
+
+bool GenericToNVVM::runOnModule(Module &M) {
+ // Create a clone of each global variable that has the default address space.
+ // The clone is created with the global address space specifier, and the pair
+ // of original global variable and its clone is placed in the GVMap for later
+ // use.
+
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E;) {
+ GlobalVariable *GV = I++;
+ if (GV->getType()->getAddressSpace() == llvm::ADDRESS_SPACE_GENERIC &&
+ !llvm::isTexture(*GV) && !llvm::isSurface(*GV) &&
+ !GV->getName().startswith("llvm.")) {
+ GlobalVariable *NewGV = new GlobalVariable(
+ M, GV->getType()->getElementType(), GV->isConstant(),
+ GV->getLinkage(), GV->hasInitializer() ? GV->getInitializer() : NULL,
+ "", GV, GV->getThreadLocalMode(), llvm::ADDRESS_SPACE_GLOBAL);
+ NewGV->copyAttributesFrom(GV);
+ GVMap[GV] = NewGV;
+ }
+ }
+
+ // Return immediately if every global variable has a specific address space
+ // specifier.
+ if (GVMap.empty()) {
+ return false;
+ }
+
+ // Walk through the instructions in function definitions, and replace any use
+ // of original global variables in GVMap with a use of the corresponding
+ // copies in GVMap. If necessary, promote constants to instructions.
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+ if (I->isDeclaration()) {
+ continue;
+ }
+ IRBuilder<> Builder(I->getEntryBlock().getFirstNonPHIOrDbg());
+ for (Function::iterator BBI = I->begin(), BBE = I->end(); BBI != BBE;
+ ++BBI) {
+ for (BasicBlock::iterator II = BBI->begin(), IE = BBI->end(); II != IE;
+ ++II) {
+ for (unsigned i = 0, e = II->getNumOperands(); i < e; ++i) {
+ Value *Operand = II->getOperand(i);
+ if (isa<Constant>(Operand)) {
+ II->setOperand(
+ i, remapConstant(&M, I, cast<Constant>(Operand), Builder));
+ }
+ }
+ }
+ }
+ ConstantToValueMap.clear();
+ }
+
+ // Walk through the metadata section and update the debug information
+ // associated with the global variables in the default address space.
+ for (Module::named_metadata_iterator I = M.named_metadata_begin(),
+ E = M.named_metadata_end();
+ I != E; I++) {
+ remapNamedMDNode(&M, I);
+ }
+
+ // Walk through the global variable initializers, and replace any use of
+ // original global variables in GVMap with a use of the corresponding copies
+ // in GVMap. The copies need to be bitcast to the original global variable
+ // types, as we cannot use cvta in global variable initializers.
+ for (GVMapTy::iterator I = GVMap.begin(), E = GVMap.end(); I != E;) {
+ GlobalVariable *GV = I->first;
+ GlobalVariable *NewGV = I->second;
+ ++I;
+ Constant *BitCastNewGV = ConstantExpr::getBitCast(NewGV, GV->getType());
+ // At this point, the remaining uses of GV should be found only in global
+ // variable initializers, as other uses have already been removed
+ // while walking through the instructions in function definitions.
+ for (Value::use_iterator UI = GV->use_begin(), UE = GV->use_end();
+ UI != UE;) {
+ Use &U = (UI++).getUse();
+ U.set(BitCastNewGV);
+ }
+ std::string Name = GV->getName();
+ GV->removeDeadConstantUsers();
+ GV->eraseFromParent();
+ NewGV->setName(Name);
+ }
+ GVMap.clear();
+
+ return true;
+}
+
+Value *GenericToNVVM::getOrInsertCVTA(Module *M, Function *F,
+ GlobalVariable *GV,
+ IRBuilder<> &Builder) {
+ PointerType *GVType = GV->getType();
+ Value *CVTA = NULL;
+
+ // See if the address space conversion requires the operand to be bitcast
+ // to i8 addrspace(n)* first.
+ EVT ExtendedGVType = EVT::getEVT(GVType->getElementType(), true);
+ if (!ExtendedGVType.isInteger() && !ExtendedGVType.isFloatingPoint()) {
+ // A bitcast to i8 addrspace(n)* on the operand is needed.
+ LLVMContext &Context = M->getContext();
+ unsigned int AddrSpace = GVType->getAddressSpace();
+ Type *DestTy = PointerType::get(Type::getInt8Ty(Context), AddrSpace);
+ CVTA = Builder.CreateBitCast(GV, DestTy, "cvta");
+ // Insert the address space conversion.
+ Type *ResultType =
+ PointerType::get(Type::getInt8Ty(Context), llvm::ADDRESS_SPACE_GENERIC);
+ SmallVector<Type *, 2> ParamTypes;
+ ParamTypes.push_back(ResultType);
+ ParamTypes.push_back(DestTy);
+ Function *CVTAFunction = Intrinsic::getDeclaration(
+ M, Intrinsic::nvvm_ptr_global_to_gen, ParamTypes);
+ CVTA = Builder.CreateCall(CVTAFunction, CVTA, "cvta");
+ // Another bitcast from i8 * to <the element type of GVType> * is
+ // required.
+ DestTy =
+ PointerType::get(GVType->getElementType(), llvm::ADDRESS_SPACE_GENERIC);
+ CVTA = Builder.CreateBitCast(CVTA, DestTy, "cvta");
+ } else {
+ // A simple CVTA is enough.
+ SmallVector<Type *, 2> ParamTypes;
+ ParamTypes.push_back(PointerType::get(GVType->getElementType(),
+ llvm::ADDRESS_SPACE_GENERIC));
+ ParamTypes.push_back(GVType);
+ Function *CVTAFunction = Intrinsic::getDeclaration(
+ M, Intrinsic::nvvm_ptr_global_to_gen, ParamTypes);
+ CVTA = Builder.CreateCall(CVTAFunction, GV, "cvta");
+ }
+
+ return CVTA;
+}
+
+Value *GenericToNVVM::remapConstant(Module *M, Function *F, Constant *C,
+ IRBuilder<> &Builder) {
+ // If the constant C has been converted already in the given function F, just
+ // return the converted value.
+ ConstantToValueMapTy::iterator CTII = ConstantToValueMap.find(C);
+ if (CTII != ConstantToValueMap.end()) {
+ return CTII->second;
+ }
+
+ Value *NewValue = C;
+ if (isa<GlobalVariable>(C)) {
+ // If the constant C is a global variable and is found in GVMap, generate a
+ // set of instructions that convert the clone of C with the global
+ // address space specifier to a generic pointer.
+ // The constant C cannot be used here, as it will be erased from the
+ // module eventually. And the clone of C with the global address space
+ // specifier cannot be used here either, as it will affect the types of
+ // other instructions in the function. Hence, this address space conversion
+ // is required.
+ GVMapTy::iterator I = GVMap.find(cast<GlobalVariable>(C));
+ if (I != GVMap.end()) {
+ NewValue = getOrInsertCVTA(M, F, I->second, Builder);
+ }
+ } else if (isa<ConstantVector>(C) || isa<ConstantArray>(C) ||
+ isa<ConstantStruct>(C)) {
+ // If any element in the constant vector or aggregate C is or uses a global
+ // variable in GVMap, the constant C needs to be reconstructed, using a set
+ // of instructions.
+ NewValue = remapConstantVectorOrConstantAggregate(M, F, C, Builder);
+ } else if (isa<ConstantExpr>(C)) {
+ // If any operand in the constant expression C is or uses a global variable
+ // in GVMap, the constant expression C needs to be reconstructed, using a
+ // set of instructions.
+ NewValue = remapConstantExpr(M, F, cast<ConstantExpr>(C), Builder);
+ }
+
+ ConstantToValueMap[C] = NewValue;
+ return NewValue;
+}
+
+Value *GenericToNVVM::remapConstantVectorOrConstantAggregate(
+ Module *M, Function *F, Constant *C, IRBuilder<> &Builder) {
+ bool OperandChanged = false;
+ SmallVector<Value *, 4> NewOperands;
+ unsigned NumOperands = C->getNumOperands();
+
+ // Check if any element is or uses a global variable in GVMap, and thus
+ // converted to another value.
+ for (unsigned i = 0; i < NumOperands; ++i) {
+ Value *Operand = C->getOperand(i);
+ Value *NewOperand = remapConstant(M, F, cast<Constant>(Operand), Builder);
+ OperandChanged |= Operand != NewOperand;
+ NewOperands.push_back(NewOperand);
+ }
+
+ // If none of the elements has been modified, return C as it is.
+ if (!OperandChanged) {
+ return C;
+ }
+
+ // If any of the elements has been modified, construct the equivalent
+ // vector or aggregate value with a set of instructions and the converted
+ // elements.
+ Value *NewValue = UndefValue::get(C->getType());
+ if (isa<ConstantVector>(C)) {
+ for (unsigned i = 0; i < NumOperands; ++i) {
+ Value *Idx = ConstantInt::get(Type::getInt32Ty(M->getContext()), i);
+ NewValue = Builder.CreateInsertElement(NewValue, NewOperands[i], Idx);
+ }
+ } else {
+ for (unsigned i = 0; i < NumOperands; ++i) {
+ NewValue =
+ Builder.CreateInsertValue(NewValue, NewOperands[i], makeArrayRef(i));
+ }
+ }
+
+ return NewValue;
+}
+
+Value *GenericToNVVM::remapConstantExpr(Module *M, Function *F, ConstantExpr *C,
+ IRBuilder<> &Builder) {
+ bool OperandChanged = false;
+ SmallVector<Value *, 4> NewOperands;
+ unsigned NumOperands = C->getNumOperands();
+
+ // Check if any operand is or uses a global variable in GVMap, and thus
+ // converted to another value.
+ for (unsigned i = 0; i < NumOperands; ++i) {
+ Value *Operand = C->getOperand(i);
+ Value *NewOperand = remapConstant(M, F, cast<Constant>(Operand), Builder);
+ OperandChanged |= Operand != NewOperand;
+ NewOperands.push_back(NewOperand);
+ }
+
+ // If none of the operands has been modified, return C as it is.
+ if (!OperandChanged) {
+ return C;
+ }
+
+ // If any of the operands has been modified, construct the instruction with
+ // the converted operands.
+ unsigned Opcode = C->getOpcode();
+ switch (Opcode) {
+ case Instruction::ICmp:
+ // CompareConstantExpr (icmp)
+ return Builder.CreateICmp(CmpInst::Predicate(C->getPredicate()),
+ NewOperands[0], NewOperands[1]);
+ case Instruction::FCmp:
+ // CompareConstantExpr (fcmp)
+ assert(false && "Address space conversion should have no effect "
+ "on floating point CompareConstantExpr (fcmp)!");
+ return C;
+ case Instruction::ExtractElement:
+ // ExtractElementConstantExpr
+ return Builder.CreateExtractElement(NewOperands[0], NewOperands[1]);
+ case Instruction::InsertElement:
+ // InsertElementConstantExpr
+ return Builder.CreateInsertElement(NewOperands[0], NewOperands[1],
+ NewOperands[2]);
+ case Instruction::ShuffleVector:
+ // ShuffleVector
+ return Builder.CreateShuffleVector(NewOperands[0], NewOperands[1],
+ NewOperands[2]);
+ case Instruction::ExtractValue:
+ // ExtractValueConstantExpr
+ return Builder.CreateExtractValue(NewOperands[0], C->getIndices());
+ case Instruction::InsertValue:
+ // InsertValueConstantExpr
+ return Builder.CreateInsertValue(NewOperands[0], NewOperands[1],
+ C->getIndices());
+ case Instruction::GetElementPtr:
+ // GetElementPtrConstantExpr
+ return cast<GEPOperator>(C)->isInBounds()
+ ? Builder.CreateInBoundsGEP(
+ NewOperands[0],
+ makeArrayRef(&NewOperands[1], NumOperands - 1))
+ : Builder.CreateGEP(
+ NewOperands[0],
+ makeArrayRef(&NewOperands[1], NumOperands - 1));
+ case Instruction::Select:
+ // SelectConstantExpr
+ return Builder.CreateSelect(NewOperands[0], NewOperands[1], NewOperands[2]);
+ default:
+ // BinaryConstantExpr
+ if (Instruction::isBinaryOp(Opcode)) {
+ return Builder.CreateBinOp(Instruction::BinaryOps(C->getOpcode()),
+ NewOperands[0], NewOperands[1]);
+ }
+ // UnaryConstantExpr
+ if (Instruction::isCast(Opcode)) {
+ return Builder.CreateCast(Instruction::CastOps(C->getOpcode()),
+ NewOperands[0], C->getType());
+ }
+ assert(false && "GenericToNVVM encountered an unsupported ConstantExpr");
+ return C;
+ }
+}
+
+void GenericToNVVM::remapNamedMDNode(Module *M, NamedMDNode *N) {
+
+ bool OperandChanged = false;
+ SmallVector<MDNode *, 16> NewOperands;
+ unsigned NumOperands = N->getNumOperands();
+
+ // Check if any operand is or contains a global variable in GVMap, and thus
+ // converted to another value.
+ for (unsigned i = 0; i < NumOperands; ++i) {
+ MDNode *Operand = N->getOperand(i);
+ MDNode *NewOperand = remapMDNode(M, Operand);
+ OperandChanged |= Operand != NewOperand;
+ NewOperands.push_back(NewOperand);
+ }
+
+ // If none of the operands has been modified, return immediately.
+ if (!OperandChanged) {
+ return;
+ }
+
+ // Replace the old operands with the new operands.
+ N->dropAllReferences();
+ for (SmallVector<MDNode *, 16>::iterator I = NewOperands.begin(),
+ E = NewOperands.end();
+ I != E; ++I) {
+ N->addOperand(*I);
+ }
+}
+
+MDNode *GenericToNVVM::remapMDNode(Module *M, MDNode *N) {
+
+ bool OperandChanged = false;
+ SmallVector<Value *, 8> NewOperands;
+ unsigned NumOperands = N->getNumOperands();
+
+ // Check if any operand is or contains a global variable in GVMap, and thus
+ // converted to another value.
+ for (unsigned i = 0; i < NumOperands; ++i) {
+ Value *Operand = N->getOperand(i);
+ Value *NewOperand = Operand;
+ if (Operand) {
+ if (isa<GlobalVariable>(Operand)) {
+ GVMapTy::iterator I = GVMap.find(cast<GlobalVariable>(Operand));
+ if (I != GVMap.end()) {
+ NewOperand = I->second;
+ if (++i < NumOperands) {
+ NewOperands.push_back(NewOperand);
+ // Address space of the global variable follows the global variable
+ // in the global variable debug info (see createGlobalVariable in
+ // lib/Analysis/DIBuilder.cpp).
+ NewOperand =
+ ConstantInt::get(Type::getInt32Ty(M->getContext()),
+ I->second->getType()->getAddressSpace());
+ }
+ }
+ } else if (isa<MDNode>(Operand)) {
+ NewOperand = remapMDNode(M, cast<MDNode>(Operand));
+ }
+ }
+ OperandChanged |= Operand != NewOperand;
+ NewOperands.push_back(NewOperand);
+ }
+
+ // If none of the operands has been modified, return N as it is.
+ if (!OperandChanged) {
+ return N;
+ }
+
+ // If any of the operands has been modified, create a new MDNode with the new
+ // operands.
+ return MDNode::get(M->getContext(), makeArrayRef(NewOperands));
+}
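
A minimal usage sketch (assuming the legacy PassManager API used by this tree, and that createGenericToNVVMPass is declared in NVPTX.h, as its use in NVPTXTargetMachine.cpp below suggests): running the new pass standalone over a module, mirroring what NVPTXPassConfig::addIRPasses() now does inside the codegen pipeline.

    #include "NVPTX.h"
    #include "llvm/IR/Module.h"
    #include "llvm/PassManager.h"

    using namespace llvm;

    // Clone generic-address-space globals into the global address space and
    // rewrite their uses.
    static void runGenericToNVVM(Module &M) {
      PassManager PM;                      // legacy pass manager
      PM.add(createGenericToNVVMPass());
      PM.run(M);
    }
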
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 0f4c8db..ac6dbb9 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -42,6 +42,11 @@ static cl::opt<int> UsePrecDivF32(
" IEEE Compliant F32 div.rnd if available."),
cl::init(2));
+static cl::opt<bool>
+UsePrecSqrtF32("nvptx-prec-sqrtf32",
+ cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
+ cl::init(true));
+
/// createNVPTXISelDag - This pass converts a legalized DAG into a
/// NVPTX-specific DAG, ready for instruction scheduling.
FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
@@ -74,6 +79,8 @@ NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
// Decide how to translate f32 div
do_DIVF32_PREC = UsePrecDivF32;
+ // Decide how to translate f32 sqrt
+ do_SQRTF32_PREC = UsePrecSqrtF32;
// sm less than sm_20 does not support div.rnd. Use div.full.
if (do_DIVF32_PREC == 2 && !Subtarget.reqPTX20())
do_DIVF32_PREC = 1;
@@ -120,42 +127,26 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
static unsigned int getCodeAddrSpace(MemSDNode *N,
const NVPTXSubtarget &Subtarget) {
const Value *Src = N->getSrcValue();
+
if (!Src)
- return NVPTX::PTXLdStInstCode::LOCAL;
+ return NVPTX::PTXLdStInstCode::GENERIC;
if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) {
switch (PT->getAddressSpace()) {
- case llvm::ADDRESS_SPACE_LOCAL:
- return NVPTX::PTXLdStInstCode::LOCAL;
- case llvm::ADDRESS_SPACE_GLOBAL:
- return NVPTX::PTXLdStInstCode::GLOBAL;
- case llvm::ADDRESS_SPACE_SHARED:
- return NVPTX::PTXLdStInstCode::SHARED;
- case llvm::ADDRESS_SPACE_CONST_NOT_GEN:
- return NVPTX::PTXLdStInstCode::CONSTANT;
- case llvm::ADDRESS_SPACE_GENERIC:
- return NVPTX::PTXLdStInstCode::GENERIC;
- case llvm::ADDRESS_SPACE_PARAM:
- return NVPTX::PTXLdStInstCode::PARAM;
- case llvm::ADDRESS_SPACE_CONST:
- // If the arch supports generic address space, translate it to GLOBAL
- // for correctness.
- // If the arch does not support generic address space, then the arch
- // does not really support ADDRESS_SPACE_CONST, translate it to
- // to CONSTANT for better performance.
- if (Subtarget.hasGenericLdSt())
- return NVPTX::PTXLdStInstCode::GLOBAL;
- else
- return NVPTX::PTXLdStInstCode::CONSTANT;
- default:
- break;
+ case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
+ case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
+ case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
+ case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
+ case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
+ case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
+ default: break;
}
}
- return NVPTX::PTXLdStInstCode::LOCAL;
+ return NVPTX::PTXLdStInstCode::GENERIC;
}
SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
LoadSDNode *LD = cast<LoadSDNode>(N);
EVT LoadedVT = LD->getMemoryVT();
SDNode *NVPTXLD = NULL;
@@ -198,7 +189,8 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
// type is integer
// Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
MVT ScalarVT = SimpleVT.getScalarType();
- unsigned fromTypeWidth = ScalarVT.getSizeInBits();
+ // Read at least 8 bits (predicates are stored as 8-bit values)
+ unsigned fromTypeWidth = std::max(8U, ScalarVT.getSizeInBits());
unsigned int fromType;
if ((LD->getExtensionType() == ISD::SEXTLOAD))
fromType = NVPTX::PTXLdStInstCode::Signed;
@@ -394,7 +386,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
SDValue Op1 = N->getOperand(1);
SDValue Addr, Offset, Base;
unsigned Opcode;
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
SDNode *LD;
MemSDNode *MemSD = cast<MemSDNode>(N);
EVT LoadedVT = MemSD->getMemoryVT();
@@ -423,7 +415,8 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
// type is integer
// Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
MVT ScalarVT = SimpleVT.getScalarType();
- unsigned FromTypeWidth = ScalarVT.getSizeInBits();
+ // Read at least 8 bits (predicates are stored as 8-bit values)
+ unsigned FromTypeWidth = std::max(8U, ScalarVT.getSizeInBits());
unsigned int FromType;
// The last operand holds the original LoadSDNode::getExtensionType() value
unsigned ExtensionType = cast<ConstantSDNode>(
@@ -775,7 +768,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
SDValue Chain = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
unsigned Opcode;
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
SDNode *LD;
EVT RetVT = N->getValueType(0);
@@ -972,7 +965,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
}
SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
StoreSDNode *ST = cast<StoreSDNode>(N);
EVT StoreVT = ST->getMemoryVT();
SDNode *NVPTXST = NULL;
@@ -1207,7 +1200,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
SDValue Op1 = N->getOperand(1);
SDValue Addr, Offset, Base;
unsigned Opcode;
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
SDNode *ST;
EVT EltVT = Op1.getValueType();
MemSDNode *MemSD = cast<MemSDNode>(N);
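
A small worked example (illustrative only) of the new fromTypeWidth clamp in SelectLoad and SelectLoadVector: an i1 (predicate) element is just 1 bit wide, so the clamp widens the memory access to a full byte, matching the .u8 storage convention used for predicates elsewhere in this patch.

    #include <algorithm>
    #include "llvm/CodeGen/ValueTypes.h"

    using namespace llvm;

    // The clamp applied to a predicate load.
    static unsigned predicateLoadWidth() {
      MVT ScalarVT = MVT::i1;
      return std::max(8U, ScalarVT.getSizeInBits()); // == 8, i.e. one byte
    }
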
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 70e8e46..ed16d44 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -41,6 +41,10 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
// Otherwise, use div.full
int do_DIVF32_PREC;
+ // If true, generate sqrt.rn, else generate sqrt.approx. If FTZ
+ // is true, then generate the corresponding FTZ version.
+ bool do_SQRTF32_PREC;
+
// If true, add .ftz to f32 instructions.
// This is only meaningful for sm_20 and later, as the default
// is not ftz.
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 6e01a5a..6cc850e 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -275,7 +275,7 @@ bool NVPTXTargetLowering::shouldSplitVectorElementType(EVT VT) const {
SDValue
NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
Op = DAG.getTargetGlobalAddress(GV, dl, getPointerTy());
return DAG.getNode(NVPTXISD::Wrapper, dl, getPointerTy(), Op);
@@ -435,7 +435,7 @@ std::string NVPTXTargetLowering::getPrototype(
SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
- DebugLoc &dl = CLI.DL;
+ SDLoc dl = CLI.DL;
SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
@@ -449,8 +449,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
SDValue tempChain = Chain;
- Chain =
- DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(uniqueCallSite, true));
+ Chain = DAG.getCALLSEQ_START(Chain,
+ DAG.getIntPtrConstant(uniqueCallSite, true),
+ dl);
SDValue InFlag = Chain.getValue(1);
assert((Outs.size() == Args.size()) &&
@@ -795,7 +796,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(uniqueCallSite, true),
DAG.getIntPtrConstant(uniqueCallSite + 1, true),
- InFlag);
+ InFlag, dl);
uniqueCallSite++;
// set isTailCall to false for now, until we figure out how to express
@@ -810,7 +811,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SDValue
NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
SDNode *Node = Op.getNode();
- DebugLoc dl = Node->getDebugLoc();
+ SDLoc dl(Node);
SmallVector<SDValue, 8> Ops;
unsigned NumOperands = Node->getNumOperands();
for (unsigned i = 0; i < NumOperands; ++i) {
@@ -866,7 +867,7 @@ SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
SDNode *Node = Op.getNode();
LoadSDNode *LD = cast<LoadSDNode>(Node);
- DebugLoc dl = Node->getDebugLoc();
+ SDLoc dl(Node);
assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
assert(Node->getValueType(0) == MVT::i1 &&
"Custom lowering for i1 load only");
@@ -896,7 +897,7 @@ SDValue
NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
SDNode *N = Op.getNode();
SDValue Val = N->getOperand(1);
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
EVT ValVT = Val.getValueType();
if (ValVT.isVector()) {
@@ -985,7 +986,7 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
// st i8, addr
SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
SDNode *Node = Op.getNode();
- DebugLoc dl = Node->getDebugLoc();
+ SDLoc dl(Node);
StoreSDNode *ST = cast<StoreSDNode>(Node);
SDValue Tmp1 = ST->getChain();
SDValue Tmp2 = ST->getBasePtr();
@@ -1046,7 +1047,7 @@ bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) {
SDValue NVPTXTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
const DataLayout *TD = getDataLayout();
@@ -1145,14 +1146,14 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
false,
TD->getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
if (p.getNode())
- DAG.AssignOrdering(p.getNode(), idx + 1);
+ p.getNode()->setIROrder(idx + 1);
InVals.push_back(p);
} else {
// If no ABI, just move the param symbol
SDValue Arg = getParamSymbol(DAG, idx, ObjectVT);
SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
if (p.getNode())
- DAG.AssignOrdering(p.getNode(), idx + 1);
+ p.getNode()->setIROrder(idx + 1);
InVals.push_back(p);
}
continue;
@@ -1169,7 +1170,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
if (p.getNode())
- DAG.AssignOrdering(p.getNode(), idx + 1);
+ p.getNode()->setIROrder(idx + 1);
if (isKernel)
InVals.push_back(p);
else {
@@ -1240,7 +1241,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
SDValue NVPTXTargetLowering::LowerReturn(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl,
+ const SmallVectorImpl<SDValue> &OutVals, SDLoc dl,
SelectionDAG &DAG) const {
bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
@@ -1450,7 +1451,7 @@ unsigned NVPTXTargetLowering::getFunctionAlignment(const Function *) const {
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &Results) {
EVT ResVT = N->getValueType(0);
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
assert(ResVT.isVector() && "Vector load must have vector type");
@@ -1543,7 +1544,7 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &Results) {
SDValue Chain = N->getOperand(0);
SDValue Intrin = N->getOperand(1);
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
// Get the intrinsic ID
unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
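
The DebugLoc-to-SDLoc changes above all follow one pattern; a minimal sketch (illustrative, modeled on the Wrapper lowering in this file): the location is constructed from the node being lowered rather than read via getDebugLoc(), and since SDLoc also carries the node's IR order, the separate DAG.AssignOrdering() calls could become setIROrder() on the node.

    #include "NVPTXISelLowering.h"
    #include "llvm/CodeGen/SelectionDAG.h"

    using namespace llvm;

    static SDValue lowerWrapper(SDValue Op, SelectionDAG &DAG, EVT PtrVT) {
      SDLoc dl(Op); // replaces "DebugLoc dl = Op.getDebugLoc()"
      return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
    }
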
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h
index 3cd49d3..d3ed63a 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -100,7 +100,7 @@ public:
/// getFunctionAlignment - Return the Log2 alignment of this function.
virtual unsigned getFunctionAlignment(const Function *F) const;
- virtual EVT getSetCCResultType(EVT VT) const {
+ virtual EVT getSetCCResultType(LLVMContext &, EVT VT) const {
if (VT.isVector())
return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
return MVT::i1;
@@ -112,7 +112,7 @@ public:
virtual SDValue LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
virtual SDValue
@@ -125,7 +125,7 @@ public:
virtual SDValue
LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl,
+ const SmallVectorImpl<SDValue> &OutVals, SDLoc dl,
SelectionDAG &DAG) const;
virtual void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index 33a63c2..52be287 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -32,36 +32,36 @@ NVPTXInstrInfo::NVPTXInstrInfo(NVPTXTargetMachine &tm)
void NVPTXInstrInfo::copyPhysReg(
MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL,
unsigned DestReg, unsigned SrcReg, bool KillSrc) const {
- if (NVPTX::Int32RegsRegClass.contains(DestReg) &&
- NVPTX::Int32RegsRegClass.contains(SrcReg))
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterClass *DestRC = MRI.getRegClass(DestReg);
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+
+ if (DestRC != SrcRC)
+ report_fatal_error("Attempted to create cross-class register copy");
+
+ if (DestRC == &NVPTX::Int32RegsRegClass)
BuildMI(MBB, I, DL, get(NVPTX::IMOV32rr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- else if (NVPTX::Int8RegsRegClass.contains(DestReg) &&
- NVPTX::Int8RegsRegClass.contains(SrcReg))
- BuildMI(MBB, I, DL, get(NVPTX::IMOV8rr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- else if (NVPTX::Int1RegsRegClass.contains(DestReg) &&
- NVPTX::Int1RegsRegClass.contains(SrcReg))
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (DestRC == &NVPTX::Int1RegsRegClass)
BuildMI(MBB, I, DL, get(NVPTX::IMOV1rr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- else if (NVPTX::Float32RegsRegClass.contains(DestReg) &&
- NVPTX::Float32RegsRegClass.contains(SrcReg))
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (DestRC == &NVPTX::Float32RegsRegClass)
BuildMI(MBB, I, DL, get(NVPTX::FMOV32rr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- else if (NVPTX::Int16RegsRegClass.contains(DestReg) &&
- NVPTX::Int16RegsRegClass.contains(SrcReg))
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (DestRC == &NVPTX::Int16RegsRegClass)
BuildMI(MBB, I, DL, get(NVPTX::IMOV16rr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- else if (NVPTX::Int64RegsRegClass.contains(DestReg) &&
- NVPTX::Int64RegsRegClass.contains(SrcReg))
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (DestRC == &NVPTX::Int8RegsRegClass)
+ BuildMI(MBB, I, DL, get(NVPTX::IMOV8rr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (DestRC == &NVPTX::Int64RegsRegClass)
BuildMI(MBB, I, DL, get(NVPTX::IMOV64rr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- else if (NVPTX::Float64RegsRegClass.contains(DestReg) &&
- NVPTX::Float64RegsRegClass.contains(SrcReg))
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (DestRC == &NVPTX::Float64RegsRegClass)
BuildMI(MBB, I, DL, get(NVPTX::FMOV64rr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ .addReg(SrcReg, getKillRegState(KillSrc));
else {
- llvm_unreachable("Don't know how to copy a register");
+ llvm_unreachable("Bad register copy");
}
}
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td
index f43abe2..da6dd39 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -75,6 +75,9 @@ def allowFMA_ftz : Predicate<"(allowFMA && UseF32FTZ)">;
def do_DIVF32_APPROX : Predicate<"do_DIVF32_PREC==0">;
def do_DIVF32_FULL : Predicate<"do_DIVF32_PREC==1">;
+def do_SQRTF32_APPROX : Predicate<"do_SQRTF32_PREC==0">;
+def do_SQRTF32_RN : Predicate<"do_SQRTF32_PREC==1">;
+
def hasHWROT32 : Predicate<"Subtarget.hasHWROT32()">;
def true : Predicate<"1">;
diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td
index 49e2568..24037ca 100644
--- a/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -512,6 +512,16 @@ def INT_NVVM_SQRT_RM_D : F_MATH_1<"sqrt.rm.f64 \t$dst, $src0;", Float64Regs,
def INT_NVVM_SQRT_RP_D : F_MATH_1<"sqrt.rp.f64 \t$dst, $src0;", Float64Regs,
Float64Regs, int_nvvm_sqrt_rp_d>;
+// nvvm_sqrt intrinsic
+def : Pat<(int_nvvm_sqrt_f Float32Regs:$a),
+ (INT_NVVM_SQRT_RN_FTZ_F Float32Regs:$a)>, Requires<[doF32FTZ, do_SQRTF32_RN]>;
+def : Pat<(int_nvvm_sqrt_f Float32Regs:$a),
+ (INT_NVVM_SQRT_RN_F Float32Regs:$a)>, Requires<[do_SQRTF32_RN]>;
+def : Pat<(int_nvvm_sqrt_f Float32Regs:$a),
+ (INT_NVVM_SQRT_APPROX_FTZ_F Float32Regs:$a)>, Requires<[doF32FTZ]>;
+def : Pat<(int_nvvm_sqrt_f Float32Regs:$a),
+ (INT_NVVM_SQRT_APPROX_F Float32Regs:$a)>;
+
//
// Rsqrt
//
@@ -1510,38 +1520,12 @@ multiclass G_TO_NG<string Str, Intrinsic Intrin> {
defm cvta_local : NG_TO_G<"local", int_nvvm_ptr_local_to_gen>;
defm cvta_shared : NG_TO_G<"shared", int_nvvm_ptr_shared_to_gen>;
defm cvta_global : NG_TO_G<"global", int_nvvm_ptr_global_to_gen>;
+defm cvta_const : NG_TO_G<"const", int_nvvm_ptr_constant_to_gen>;
defm cvta_to_local : G_TO_NG<"local", int_nvvm_ptr_gen_to_local>;
defm cvta_to_shared : G_TO_NG<"shared", int_nvvm_ptr_gen_to_shared>;
defm cvta_to_global : G_TO_NG<"global", int_nvvm_ptr_gen_to_global>;
-
-def cvta_const : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
- "mov.u32 \t$result, $src;",
- [(set Int32Regs:$result, (int_nvvm_ptr_constant_to_gen Int32Regs:$src))]>;
-def cvta_const_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
- "mov.u64 \t$result, $src;",
- [(set Int64Regs:$result, (int_nvvm_ptr_constant_to_gen Int64Regs:$src))]>;
-
-
-
-// @TODO: Revisit this. There is a type
-// contradiction between iPTRAny and iPTR for the def.
-/*def cvta_const_addr : NVPTXInst<(outs Int32Regs:$result), (ins imemAny:$src),
- "mov.u32 \t$result, $src;",
- [(set Int32Regs:$result, (int_nvvm_ptr_constant_to_gen
- (Wrapper tglobaladdr:$src)))]>;
-def cvta_const_addr_64 : NVPTXInst<(outs Int64Regs:$result), (ins imemAny:$src),
- "mov.u64 \t$result, $src;",
- [(set Int64Regs:$result, (int_nvvm_ptr_constant_to_gen
- (Wrapper tglobaladdr:$src)))]>;*/
-
-
-def cvta_to_const : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
- "mov.u32 \t$result, $src;",
- [(set Int32Regs:$result, (int_nvvm_ptr_gen_to_constant Int32Regs:$src))]>;
-def cvta_to_const_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
- "mov.u64 \t$result, $src;",
- [(set Int64Regs:$result, (int_nvvm_ptr_gen_to_constant Int64Regs:$src))]>;
+defm cvta_to_const : G_TO_NG<"const", int_nvvm_ptr_gen_to_constant>;
// nvvm.ptr.gen.to.param
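
A hedged restatement of the new int_nvvm_sqrt_f pattern priority in plain C++ (assuming the patterns are tried in the order written and that doF32FTZ tests the backend's UseF32FTZ flag):

    // Which PTX sqrt form the patterns above select for int_nvvm_sqrt_f.
    static const char *pickSqrtF32(bool UseF32FTZ, bool PrecSqrtF32) {
      if (PrecSqrtF32)                     // -nvptx-prec-sqrtf32 (default: true)
        return UseF32FTZ ? "sqrt.rn.ftz.f32" : "sqrt.rn.f32";
      return UseF32FTZ ? "sqrt.approx.ftz.f32" : "sqrt.approx.f32";
    }
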
diff --git a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
new file mode 100644
index 0000000..843ebed
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
@@ -0,0 +1,225 @@
+//===-- NVPTXPrologEpilogPass.cpp - NVPTX prolog/epilog inserter ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a copy of the generic LLVM PrologEpilogInserter pass, modified
+// to remove unneeded functionality and to handle virtual registers. Most code
+// here is a copy of PrologEpilogInserter.cpp.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/Pass.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+class NVPTXPrologEpilogPass : public MachineFunctionPass {
+public:
+ static char ID;
+ NVPTXPrologEpilogPass() : MachineFunctionPass(ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+private:
+ void calculateFrameObjectOffsets(MachineFunction &Fn);
+};
+}
+
+MachineFunctionPass *llvm::createNVPTXPrologEpilogPass() {
+ return new NVPTXPrologEpilogPass();
+}
+
+char NVPTXPrologEpilogPass::ID = 0;
+
+bool NVPTXPrologEpilogPass::runOnMachineFunction(MachineFunction &MF) {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetFrameLowering &TFI = *TM.getFrameLowering();
+ const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
+ bool Modified = false;
+
+ calculateFrameObjectOffsets(MF);
+
+ for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) {
+ for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) {
+ MachineInstr *MI = I;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ if (!MI->getOperand(i).isFI())
+ continue;
+ TRI.eliminateFrameIndex(MI, 0, i, NULL);
+ Modified = true;
+ }
+ }
+ }
+
+ // Add function prolog/epilog
+ TFI.emitPrologue(MF);
+
+ for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+ // If last instruction is a return instruction, add an epilogue
+ if (!I->empty() && I->back().isReturn())
+ TFI.emitEpilogue(MF, *I);
+ }
+
+ return Modified;
+}
+
+/// AdjustStackOffset - Helper function used to adjust the stack frame offset.
+static inline void
+AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx,
+ bool StackGrowsDown, int64_t &Offset,
+ unsigned &MaxAlign) {
+ // If the stack grows down, add the object size to find the lowest address.
+ if (StackGrowsDown)
+ Offset += MFI->getObjectSize(FrameIdx);
+
+ unsigned Align = MFI->getObjectAlignment(FrameIdx);
+
+ // If the alignment of this object is greater than that of the stack, then
+ // increase the stack alignment to match.
+ MaxAlign = std::max(MaxAlign, Align);
+
+ // Adjust to alignment boundary.
+ Offset = (Offset + Align - 1) / Align * Align;
+
+ if (StackGrowsDown) {
+ DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset << "]\n");
+ MFI->setObjectOffset(FrameIdx, -Offset); // Set the computed offset
+ } else {
+ DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset << "]\n");
+ MFI->setObjectOffset(FrameIdx, Offset);
+ Offset += MFI->getObjectSize(FrameIdx);
+ }
+}
+
+void
+NVPTXPrologEpilogPass::calculateFrameObjectOffsets(MachineFunction &Fn) {
+ const TargetFrameLowering &TFI = *Fn.getTarget().getFrameLowering();
+ const TargetRegisterInfo *RegInfo = Fn.getTarget().getRegisterInfo();
+
+ bool StackGrowsDown =
+ TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
+
+ // Loop over all of the stack objects, assigning sequential addresses...
+ MachineFrameInfo *MFI = Fn.getFrameInfo();
+
+ // Start at the beginning of the local area.
+ // The Offset is the distance from the stack top in the direction
+ // of stack growth -- so it's always nonnegative.
+ int LocalAreaOffset = TFI.getOffsetOfLocalArea();
+ if (StackGrowsDown)
+ LocalAreaOffset = -LocalAreaOffset;
+ assert(LocalAreaOffset >= 0
+ && "Local area offset should be in direction of stack growth");
+ int64_t Offset = LocalAreaOffset;
+
+ // If there are fixed sized objects that are preallocated in the local area,
+ // non-fixed objects can't be allocated right at the start of local area.
+ // We currently don't support filling in holes in between fixed sized
+ // objects, so we adjust 'Offset' to point to the end of last fixed sized
+ // preallocated object.
+ for (int i = MFI->getObjectIndexBegin(); i != 0; ++i) {
+ int64_t FixedOff;
+ if (StackGrowsDown) {
+ // The maximum distance from the stack pointer is at lower address of
+ // the object -- which is given by offset. For down growing stack
+ // the offset is negative, so we negate the offset to get the distance.
+ FixedOff = -MFI->getObjectOffset(i);
+ } else {
+ // The maximum distance from the start pointer is at the upper
+ // address of the object.
+ FixedOff = MFI->getObjectOffset(i) + MFI->getObjectSize(i);
+ }
+ if (FixedOff > Offset) Offset = FixedOff;
+ }
+
+ // NOTE: We do not have a call stack
+
+ unsigned MaxAlign = MFI->getMaxAlignment();
+
+ // No scavenger
+
+ // FIXME: Once this is working, then enable flag will change to a target
+ // check for whether the frame is large enough to want to use virtual
+ // frame index registers. Functions which don't want/need this optimization
+ // will continue to use the existing code path.
+ if (MFI->getUseLocalStackAllocationBlock()) {
+ unsigned Align = MFI->getLocalFrameMaxAlign();
+
+ // Adjust to alignment boundary.
+ Offset = (Offset + Align - 1) / Align * Align;
+
+ DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");
+
+ // Resolve offsets for objects in the local block.
+ for (unsigned i = 0, e = MFI->getLocalFrameObjectCount(); i != e; ++i) {
+ std::pair<int, int64_t> Entry = MFI->getLocalFrameObjectMap(i);
+ int64_t FIOffset = (StackGrowsDown ? -Offset : Offset) + Entry.second;
+ DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" <<
+ FIOffset << "]\n");
+ MFI->setObjectOffset(Entry.first, FIOffset);
+ }
+ // Allocate the local block
+ Offset += MFI->getLocalFrameSize();
+
+ MaxAlign = std::max(Align, MaxAlign);
+ }
+
+ // No stack protector
+
+ // Then assign frame offsets to stack objects that are not used to spill
+ // callee saved registers.
+ for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
+ if (MFI->isObjectPreAllocated(i) &&
+ MFI->getUseLocalStackAllocationBlock())
+ continue;
+ if (MFI->isDeadObjectIndex(i))
+ continue;
+
+ AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign);
+ }
+
+ // No scavenger
+
+ if (!TFI.targetHandlesStackFrameRounding()) {
+ // If we have reserved argument space for call sites in the function
+ // immediately on entry to the current function, count it as part of the
+ // overall stack size.
+ if (MFI->adjustsStack() && TFI.hasReservedCallFrame(Fn))
+ Offset += MFI->getMaxCallFrameSize();
+
+ // Round up the size to a multiple of the alignment. If the function has
+ // any calls or alloca's, align to the target's StackAlignment value to
+ // ensure that the callee's frame or the alloca data is suitably aligned;
+ // otherwise, for leaf functions, align to the TransientStackAlignment
+ // value.
+ unsigned StackAlign;
+ if (MFI->adjustsStack() || MFI->hasVarSizedObjects() ||
+ (RegInfo->needsStackRealignment(Fn) && MFI->getObjectIndexEnd() != 0))
+ StackAlign = TFI.getStackAlignment();
+ else
+ StackAlign = TFI.getTransientStackAlignment();
+
+ // If the frame pointer is eliminated, all frame offsets will be relative to
+ // SP not FP. Align to MaxAlign so this works.
+ StackAlign = std::max(StackAlign, MaxAlign);
+ unsigned AlignMask = StackAlign - 1;
+ Offset = (Offset + AlignMask) & ~uint64_t(AlignMask);
+ }
+
+ // Update frame info to pretend that this is part of the stack...
+ int64_t StackSize = Offset - LocalAreaOffset;
+ MFI->setStackSize(StackSize);
+}
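
A short worked example (illustrative only) of the rounding that AdjustStackOffset and the final stack-size computation above both rely on:

    #include <stdint.h>

    // Round Offset up to the next multiple of Align.
    // For example, Offset = 10, Align = 8 gives (10 + 7) / 8 * 8 = 16.
    static int64_t roundUpToAlignment(int64_t Offset, unsigned Align) {
      return (Offset + Align - 1) / Align * Align;
    }
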
diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
index 2824653..bb039f8 100644
--- a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -57,9 +57,9 @@ std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) {
return "%f";
}
if (RC == &NVPTX::Float64RegsRegClass) {
- return "%fd";
+ return "%fl";
} else if (RC == &NVPTX::Int64RegsRegClass) {
- return "%rd";
+ return "%rl";
} else if (RC == &NVPTX::Int32RegsRegClass) {
return "%r";
} else if (RC == &NVPTX::Int16RegsRegClass) {
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 67ca6b5..72afe8d 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -49,6 +49,7 @@ using namespace llvm;
namespace llvm {
void initializeNVVMReflectPass(PassRegistry&);
+void initializeGenericToNVVMPass(PassRegistry&);
}
extern "C" void LLVMInitializeNVPTXTarget() {
@@ -62,6 +63,7 @@ extern "C" void LLVMInitializeNVPTXTarget() {
// FIXME: This pass is really intended to be invoked during IR optimization,
// but it's very NVPTX-specific.
initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
+ initializeGenericToNVVMPass(*PassRegistry::getPassRegistry());
}
NVPTXTargetMachine::NVPTXTargetMachine(
@@ -72,7 +74,9 @@ NVPTXTargetMachine::NVPTXTargetMachine(
Subtarget(TT, CPU, FS, is64bit), DL(Subtarget.getDataLayout()),
InstrInfo(*this), TLInfo(*this), TSInfo(*this),
FrameLowering(
- *this, is64bit) /*FrameInfo(TargetFrameInfo::StackGrowsUp, 8, 0)*/ {}
+ *this, is64bit) /*FrameInfo(TargetFrameInfo::StackGrowsUp, 8, 0)*/ {
+ initAsmInfo();
+}
void NVPTXTargetMachine32::anchor() {}
@@ -90,7 +94,7 @@ NVPTXTargetMachine64::NVPTXTargetMachine64(
CodeGenOpt::Level OL)
: NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
-namespace llvm {
+namespace {
class NVPTXPassConfig : public TargetPassConfig {
public:
NVPTXPassConfig(NVPTXTargetMachine *TM, PassManagerBase &PM)
@@ -100,16 +104,36 @@ public:
return getTM<NVPTXTargetMachine>();
}
+ virtual void addIRPasses();
virtual bool addInstSelector();
virtual bool addPreRegAlloc();
+ virtual bool addPostRegAlloc();
+
+ virtual FunctionPass *createTargetRegisterAllocator(bool) LLVM_OVERRIDE;
+ virtual void addFastRegAlloc(FunctionPass *RegAllocPass);
+ virtual void addOptimizedRegAlloc(FunctionPass *RegAllocPass);
};
-}
+} // end anonymous namespace
TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
NVPTXPassConfig *PassConfig = new NVPTXPassConfig(this, PM);
return PassConfig;
}
+void NVPTXPassConfig::addIRPasses() {
+ // The following passes are known to not play well with virtual regs hanging
+ // around after register allocation (which in our case, is *all* registers).
+ // We explicitly disable them here. We do, however, need some functionality
+ // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
+ // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
+ disablePass(&PrologEpilogCodeInserterID);
+ disablePass(&MachineCopyPropagationID);
+ disablePass(&BranchFolderPassID);
+
+ TargetPassConfig::addIRPasses();
+ addPass(createGenericToNVVMPass());
+}
+
bool NVPTXPassConfig::addInstSelector() {
addPass(createLowerAggrCopies());
addPass(createSplitBBatBarPass());
@@ -119,3 +143,21 @@ bool NVPTXPassConfig::addInstSelector() {
}
bool NVPTXPassConfig::addPreRegAlloc() { return false; }
+bool NVPTXPassConfig::addPostRegAlloc() {
+ addPass(createNVPTXPrologEpilogPass());
+ return false;
+}
+
+FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
+ return 0; // No reg alloc
+}
+
+void NVPTXPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
+ assert(!RegAllocPass && "NVPTX uses no regalloc!");
+ addPass(&StrongPHIEliminationID);
+}
+
+void NVPTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
+ assert(!RegAllocPass && "NVPTX uses no regalloc!");
+ addPass(&StrongPHIEliminationID);
+}
diff --git a/lib/Target/NVPTX/NVVMReflect.cpp b/lib/Target/NVPTX/NVVMReflect.cpp
index 0ad62ce..3cc324b 100644
--- a/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/lib/Target/NVPTX/NVVMReflect.cpp
@@ -14,6 +14,7 @@
//
//===----------------------------------------------------------------------===//
+#include "NVPTX.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
@@ -40,7 +41,7 @@ using namespace llvm;
namespace llvm { void initializeNVVMReflectPass(PassRegistry &); }
namespace {
-class LLVM_LIBRARY_VISIBILITY NVVMReflect : public ModulePass {
+class NVVMReflect : public ModulePass {
private:
StringMap<int> VarMap;
typedef DenseMap<std::string, int>::iterator VarMapIter;
@@ -48,9 +49,18 @@ private:
public:
static char ID;
- NVVMReflect() : ModulePass(ID) {
+ NVVMReflect() : ModulePass(ID), ReflectFunction(0) {
+ initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
VarMap.clear();
- ReflectFunction = 0;
+ }
+
+ NVVMReflect(const StringMap<int> &Mapping)
+ : ModulePass(ID), ReflectFunction(0) {
+ initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
+ for (StringMap<int>::const_iterator I = Mapping.begin(), E = Mapping.end();
+ I != E; ++I) {
+ VarMap[(*I).getKey()] = (*I).getValue();
+ }
}
void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); }
@@ -60,6 +70,14 @@ public:
};
}
+ModulePass *llvm::createNVVMReflectPass() {
+ return new NVVMReflect();
+}
+
+ModulePass *llvm::createNVVMReflectPass(const StringMap<int>& Mapping) {
+ return new NVVMReflect(Mapping);
+}
+
static cl::opt<bool>
NVVMReflectEnabled("nvvm-reflect-enable", cl::init(true),
cl::desc("NVVM reflection, enabled by default"));
diff --git a/lib/Target/PowerPC/AsmParser/CMakeLists.txt b/lib/Target/PowerPC/AsmParser/CMakeLists.txt
new file mode 100644
index 0000000..3aa59c0
--- /dev/null
+++ b/lib/Target/PowerPC/AsmParser/CMakeLists.txt
@@ -0,0 +1,8 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/..
+ ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMPowerPCAsmParser
+ PPCAsmParser.cpp
+ )
+
+add_dependencies(LLVMPowerPCAsmParser PowerPCCommonTableGen)
diff --git a/lib/Target/PowerPC/AsmParser/LLVMBuild.txt b/lib/Target/PowerPC/AsmParser/LLVMBuild.txt
new file mode 100644
index 0000000..bd08c13
--- /dev/null
+++ b/lib/Target/PowerPC/AsmParser/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===- ./lib/Target/PowerPC/AsmParser/LLVMBuild.txt --------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = PowerPCAsmParser
+parent = PowerPC
+required_libraries = PowerPCInfo MC MCParser Support
+add_to_library_groups = PowerPC
diff --git a/lib/Target/PowerPC/AsmParser/Makefile b/lib/Target/PowerPC/AsmParser/Makefile
new file mode 100644
index 0000000..c8a8915
--- /dev/null
+++ b/lib/Target/PowerPC/AsmParser/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/PowerPC/AsmParser/Makefile ----------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMPowerPCAsmParser
+
+# Hack: we need to include 'main' PowerPC target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
new file mode 100644
index 0000000..9cf16f0
--- /dev/null
+++ b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -0,0 +1,723 @@
+//===-- PPCAsmParser.cpp - Parse PowerPC asm to MCInst instructions ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+static unsigned RRegs[32] = {
+ PPC::R0, PPC::R1, PPC::R2, PPC::R3,
+ PPC::R4, PPC::R5, PPC::R6, PPC::R7,
+ PPC::R8, PPC::R9, PPC::R10, PPC::R11,
+ PPC::R12, PPC::R13, PPC::R14, PPC::R15,
+ PPC::R16, PPC::R17, PPC::R18, PPC::R19,
+ PPC::R20, PPC::R21, PPC::R22, PPC::R23,
+ PPC::R24, PPC::R25, PPC::R26, PPC::R27,
+ PPC::R28, PPC::R29, PPC::R30, PPC::R31
+};
+static unsigned RRegsNoR0[32] = {
+ PPC::ZERO,
+ PPC::R1, PPC::R2, PPC::R3,
+ PPC::R4, PPC::R5, PPC::R6, PPC::R7,
+ PPC::R8, PPC::R9, PPC::R10, PPC::R11,
+ PPC::R12, PPC::R13, PPC::R14, PPC::R15,
+ PPC::R16, PPC::R17, PPC::R18, PPC::R19,
+ PPC::R20, PPC::R21, PPC::R22, PPC::R23,
+ PPC::R24, PPC::R25, PPC::R26, PPC::R27,
+ PPC::R28, PPC::R29, PPC::R30, PPC::R31
+};
+static unsigned XRegs[32] = {
+ PPC::X0, PPC::X1, PPC::X2, PPC::X3,
+ PPC::X4, PPC::X5, PPC::X6, PPC::X7,
+ PPC::X8, PPC::X9, PPC::X10, PPC::X11,
+ PPC::X12, PPC::X13, PPC::X14, PPC::X15,
+ PPC::X16, PPC::X17, PPC::X18, PPC::X19,
+ PPC::X20, PPC::X21, PPC::X22, PPC::X23,
+ PPC::X24, PPC::X25, PPC::X26, PPC::X27,
+ PPC::X28, PPC::X29, PPC::X30, PPC::X31
+};
+static unsigned XRegsNoX0[32] = {
+ PPC::ZERO8,
+ PPC::X1, PPC::X2, PPC::X3,
+ PPC::X4, PPC::X5, PPC::X6, PPC::X7,
+ PPC::X8, PPC::X9, PPC::X10, PPC::X11,
+ PPC::X12, PPC::X13, PPC::X14, PPC::X15,
+ PPC::X16, PPC::X17, PPC::X18, PPC::X19,
+ PPC::X20, PPC::X21, PPC::X22, PPC::X23,
+ PPC::X24, PPC::X25, PPC::X26, PPC::X27,
+ PPC::X28, PPC::X29, PPC::X30, PPC::X31
+};
+static unsigned FRegs[32] = {
+ PPC::F0, PPC::F1, PPC::F2, PPC::F3,
+ PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+ PPC::F8, PPC::F9, PPC::F10, PPC::F11,
+ PPC::F12, PPC::F13, PPC::F14, PPC::F15,
+ PPC::F16, PPC::F17, PPC::F18, PPC::F19,
+ PPC::F20, PPC::F21, PPC::F22, PPC::F23,
+ PPC::F24, PPC::F25, PPC::F26, PPC::F27,
+ PPC::F28, PPC::F29, PPC::F30, PPC::F31
+};
+static unsigned VRegs[32] = {
+ PPC::V0, PPC::V1, PPC::V2, PPC::V3,
+ PPC::V4, PPC::V5, PPC::V6, PPC::V7,
+ PPC::V8, PPC::V9, PPC::V10, PPC::V11,
+ PPC::V12, PPC::V13, PPC::V14, PPC::V15,
+ PPC::V16, PPC::V17, PPC::V18, PPC::V19,
+ PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+ PPC::V24, PPC::V25, PPC::V26, PPC::V27,
+ PPC::V28, PPC::V29, PPC::V30, PPC::V31
+};
+static unsigned CRBITRegs[32] = {
+ PPC::CR0LT, PPC::CR0GT, PPC::CR0EQ, PPC::CR0UN,
+ PPC::CR1LT, PPC::CR1GT, PPC::CR1EQ, PPC::CR1UN,
+ PPC::CR2LT, PPC::CR2GT, PPC::CR2EQ, PPC::CR2UN,
+ PPC::CR3LT, PPC::CR3GT, PPC::CR3EQ, PPC::CR3UN,
+ PPC::CR4LT, PPC::CR4GT, PPC::CR4EQ, PPC::CR4UN,
+ PPC::CR5LT, PPC::CR5GT, PPC::CR5EQ, PPC::CR5UN,
+ PPC::CR6LT, PPC::CR6GT, PPC::CR6EQ, PPC::CR6UN,
+ PPC::CR7LT, PPC::CR7GT, PPC::CR7EQ, PPC::CR7UN
+};
+static unsigned CRRegs[8] = {
+ PPC::CR0, PPC::CR1, PPC::CR2, PPC::CR3,
+ PPC::CR4, PPC::CR5, PPC::CR6, PPC::CR7
+};
+
+struct PPCOperand;
+
+class PPCAsmParser : public MCTargetAsmParser {
+ MCSubtargetInfo &STI;
+ MCAsmParser &Parser;
+ bool IsPPC64;
+
+ MCAsmParser &getParser() const { return Parser; }
+ MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
+ void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); }
+ bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); }
+
+ bool isPPC64() const { return IsPPC64; }
+
+ bool MatchRegisterName(const AsmToken &Tok,
+ unsigned &RegNo, int64_t &IntVal);
+
+ virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
+
+ bool ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+ bool ParseDirectiveWord(unsigned Size, SMLoc L);
+ bool ParseDirectiveTC(unsigned Size, SMLoc L);
+
+ bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ MCStreamer &Out, unsigned &ErrorInfo,
+ bool MatchingInlineAsm);
+
+ void ProcessInstruction(MCInst &Inst,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
+
+ /// @name Auto-generated Match Functions
+ /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "PPCGenAsmMatcher.inc"
+
+ /// }
+
+
+public:
+ PPCAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser)
+ : MCTargetAsmParser(), STI(_STI), Parser(_Parser) {
+ // Check for 64-bit vs. 32-bit pointer mode.
+ Triple TheTriple(STI.getTargetTriple());
+ IsPPC64 = TheTriple.getArch() == Triple::ppc64;
+ // Initialize the set of available features.
+ setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+ }
+
+ virtual bool ParseInstruction(ParseInstructionInfo &Info,
+ StringRef Name, SMLoc NameLoc,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+ virtual bool ParseDirective(AsmToken DirectiveID);
+};
+
+/// PPCOperand - Instances of this class represent a parsed PowerPC machine
+/// instruction.
+struct PPCOperand : public MCParsedAsmOperand {
+ enum KindTy {
+ Token,
+ Immediate,
+ Expression
+ } Kind;
+
+ SMLoc StartLoc, EndLoc;
+ bool IsPPC64;
+
+ struct TokOp {
+ const char *Data;
+ unsigned Length;
+ };
+
+ struct ImmOp {
+ int64_t Val;
+ };
+
+ struct ExprOp {
+ const MCExpr *Val;
+ };
+
+ union {
+ struct TokOp Tok;
+ struct ImmOp Imm;
+ struct ExprOp Expr;
+ };
+
+ PPCOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+public:
+ PPCOperand(const PPCOperand &o) : MCParsedAsmOperand() {
+ Kind = o.Kind;
+ StartLoc = o.StartLoc;
+ EndLoc = o.EndLoc;
+ IsPPC64 = o.IsPPC64;
+ switch (Kind) {
+ case Token:
+ Tok = o.Tok;
+ break;
+ case Immediate:
+ Imm = o.Imm;
+ break;
+ case Expression:
+ Expr = o.Expr;
+ break;
+ }
+ }
+
+ /// getStartLoc - Get the location of the first token of this operand.
+ SMLoc getStartLoc() const { return StartLoc; }
+
+ /// getEndLoc - Get the location of the last token of this operand.
+ SMLoc getEndLoc() const { return EndLoc; }
+
+ /// isPPC64 - True if this operand is for an instruction in 64-bit mode.
+ bool isPPC64() const { return IsPPC64; }
+
+ int64_t getImm() const {
+ assert(Kind == Immediate && "Invalid access!");
+ return Imm.Val;
+ }
+
+ const MCExpr *getExpr() const {
+ assert(Kind == Expression && "Invalid access!");
+ return Expr.Val;
+ }
+
+ unsigned getReg() const {
+ assert(isRegNumber() && "Invalid access!");
+ return (unsigned) Imm.Val;
+ }
+
+ unsigned getCCReg() const {
+ assert(isCCRegNumber() && "Invalid access!");
+ return (unsigned) Imm.Val;
+ }
+
+ unsigned getCRBitMask() const {
+ assert(isCRBitMask() && "Invalid access!");
+ return 7 - countTrailingZeros<uint64_t>(Imm.Val);
+ }
+
+ bool isToken() const { return Kind == Token; }
+ bool isImm() const { return Kind == Immediate || Kind == Expression; }
+ bool isU5Imm() const { return Kind == Immediate && isUInt<5>(getImm()); }
+ bool isS5Imm() const { return Kind == Immediate && isInt<5>(getImm()); }
+ bool isU6Imm() const { return Kind == Immediate && isUInt<6>(getImm()); }
+ bool isU16Imm() const { return Kind == Expression ||
+ (Kind == Immediate && isUInt<16>(getImm())); }
+ bool isS16Imm() const { return Kind == Expression ||
+ (Kind == Immediate && isInt<16>(getImm())); }
+ bool isS16ImmX4() const { return Kind == Expression ||
+ (Kind == Immediate && isInt<16>(getImm()) &&
+ (getImm() & 3) == 0); }
+ bool isRegNumber() const { return Kind == Immediate && isUInt<5>(getImm()); }
+ bool isCCRegNumber() const { return Kind == Immediate &&
+ isUInt<3>(getImm()); }
+ bool isCRBitMask() const { return Kind == Immediate && isUInt<8>(getImm()) &&
+ isPowerOf2_32(getImm()); }
+ bool isMem() const { return false; }
+ bool isReg() const { return false; }
+
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+ llvm_unreachable("addRegOperands");
+ }
+
+ void addRegGPRCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(RRegs[getReg()]));
+ }
+
+ void addRegGPRCNoR0Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(RRegsNoR0[getReg()]));
+ }
+
+ void addRegG8RCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(XRegs[getReg()]));
+ }
+
+ void addRegG8RCNoX0Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(XRegsNoX0[getReg()]));
+ }
+
+ void addRegGxRCOperands(MCInst &Inst, unsigned N) const {
+ if (isPPC64())
+ addRegG8RCOperands(Inst, N);
+ else
+ addRegGPRCOperands(Inst, N);
+ }
+
+ void addRegGxRCNoR0Operands(MCInst &Inst, unsigned N) const {
+ if (isPPC64())
+ addRegG8RCNoX0Operands(Inst, N);
+ else
+ addRegGPRCNoR0Operands(Inst, N);
+ }
+
+ void addRegF4RCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(FRegs[getReg()]));
+ }
+
+ void addRegF8RCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(FRegs[getReg()]));
+ }
+
+ void addRegVRRCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(VRegs[getReg()]));
+ }
+
+ void addRegCRBITRCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(CRBITRegs[getReg()]));
+ }
+
+ void addRegCRRCOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(CRRegs[getCCReg()]));
+ }
+
+ void addCRBitMaskOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(CRRegs[getCRBitMask()]));
+ }
+
+ void addImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ if (Kind == Immediate)
+ Inst.addOperand(MCOperand::CreateImm(getImm()));
+ else
+ Inst.addOperand(MCOperand::CreateExpr(getExpr()));
+ }
+
+ StringRef getToken() const {
+ assert(Kind == Token && "Invalid access!");
+ return StringRef(Tok.Data, Tok.Length);
+ }
+
+ virtual void print(raw_ostream &OS) const;
+
+ static PPCOperand *CreateToken(StringRef Str, SMLoc S, bool IsPPC64) {
+ PPCOperand *Op = new PPCOperand(Token);
+ Op->Tok.Data = Str.data();
+ Op->Tok.Length = Str.size();
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ Op->IsPPC64 = IsPPC64;
+ return Op;
+ }
+
+ static PPCOperand *CreateImm(int64_t Val, SMLoc S, SMLoc E, bool IsPPC64) {
+ PPCOperand *Op = new PPCOperand(Immediate);
+ Op->Imm.Val = Val;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ Op->IsPPC64 = IsPPC64;
+ return Op;
+ }
+
+ static PPCOperand *CreateExpr(const MCExpr *Val,
+ SMLoc S, SMLoc E, bool IsPPC64) {
+ PPCOperand *Op = new PPCOperand(Expression);
+ Op->Expr.Val = Val;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ Op->IsPPC64 = IsPPC64;
+ return Op;
+ }
+};
+
+} // end anonymous namespace.
+
+void PPCOperand::print(raw_ostream &OS) const {
+ switch (Kind) {
+ case Token:
+ OS << "'" << getToken() << "'";
+ break;
+ case Immediate:
+ OS << getImm();
+ break;
+ case Expression:
+ getExpr()->print(OS);
+ break;
+ }
+}
+
+
+void PPCAsmParser::
+ProcessInstruction(MCInst &Inst,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ switch (Inst.getOpcode()) {
+ case PPC::SLWI: {
+ MCInst TmpInst;
+ int64_t N = Inst.getOperand(2).getImm();
+ TmpInst.setOpcode(PPC::RLWINM);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(MCOperand::CreateImm(N));
+ TmpInst.addOperand(MCOperand::CreateImm(0));
+ TmpInst.addOperand(MCOperand::CreateImm(31 - N));
+ Inst = TmpInst;
+ break;
+ }
+ case PPC::SRWI: {
+ MCInst TmpInst;
+ int64_t N = Inst.getOperand(2).getImm();
+ TmpInst.setOpcode(PPC::RLWINM);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(MCOperand::CreateImm(32 - N));
+ TmpInst.addOperand(MCOperand::CreateImm(N));
+ TmpInst.addOperand(MCOperand::CreateImm(31));
+ Inst = TmpInst;
+ break;
+ }
+ case PPC::SLDI: {
+ MCInst TmpInst;
+ int64_t N = Inst.getOperand(2).getImm();
+ TmpInst.setOpcode(PPC::RLDICR);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(MCOperand::CreateImm(N));
+ TmpInst.addOperand(MCOperand::CreateImm(63 - N));
+ Inst = TmpInst;
+ break;
+ }
+ case PPC::SRDI: {
+ MCInst TmpInst;
+ int64_t N = Inst.getOperand(2).getImm();
+ TmpInst.setOpcode(PPC::RLDICL);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(MCOperand::CreateImm(64 - N));
+ TmpInst.addOperand(MCOperand::CreateImm(N));
+ Inst = TmpInst;
+ break;
+ }
+ }
+}
+
+bool PPCAsmParser::
+MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ MCStreamer &Out, unsigned &ErrorInfo,
+ bool MatchingInlineAsm) {
+ MCInst Inst;
+
+ switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) {
+ default: break;
+ case Match_Success:
+ // Post-process instructions (typically extended mnemonics)
+ ProcessInstruction(Inst, Operands);
+ Inst.setLoc(IDLoc);
+ Out.EmitInstruction(Inst);
+ return false;
+ case Match_MissingFeature:
+ return Error(IDLoc, "instruction use requires an option to be enabled");
+ case Match_MnemonicFail:
+ return Error(IDLoc, "unrecognized instruction mnemonic");
+ case Match_InvalidOperand: {
+ SMLoc ErrorLoc = IDLoc;
+ if (ErrorInfo != ~0U) {
+ if (ErrorInfo >= Operands.size())
+ return Error(IDLoc, "too few operands for instruction");
+
+ ErrorLoc = ((PPCOperand*)Operands[ErrorInfo])->getStartLoc();
+ if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+ }
+
+ return Error(ErrorLoc, "invalid operand for instruction");
+ }
+ }
+
+ llvm_unreachable("Implement any new match types added!");
+}
+
+bool PPCAsmParser::
+MatchRegisterName(const AsmToken &Tok, unsigned &RegNo, int64_t &IntVal) {
+ if (Tok.is(AsmToken::Identifier)) {
+ StringRef Name = Tok.getString();
+
+ if (Name.equals_lower("lr")) {
+ RegNo = isPPC64()? PPC::LR8 : PPC::LR;
+ IntVal = 8;
+ return false;
+ } else if (Name.equals_lower("ctr")) {
+ RegNo = isPPC64()? PPC::CTR8 : PPC::CTR;
+ IntVal = 9;
+ return false;
+ } else if (Name.substr(0, 1).equals_lower("r") &&
+ !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
+ RegNo = isPPC64()? XRegs[IntVal] : RRegs[IntVal];
+ return false;
+ } else if (Name.substr(0, 1).equals_lower("f") &&
+ !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
+ RegNo = FRegs[IntVal];
+ return false;
+ } else if (Name.substr(0, 1).equals_lower("v") &&
+ !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
+ RegNo = VRegs[IntVal];
+ return false;
+ } else if (Name.substr(0, 2).equals_lower("cr") &&
+ !Name.substr(2).getAsInteger(10, IntVal) && IntVal < 8) {
+ RegNo = CRRegs[IntVal];
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool PPCAsmParser::
+ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) {
+ const AsmToken &Tok = Parser.getTok();
+ StartLoc = Tok.getLoc();
+ EndLoc = Tok.getEndLoc();
+ RegNo = 0;
+ int64_t IntVal;
+
+ if (!MatchRegisterName(Tok, RegNo, IntVal)) {
+ Parser.Lex(); // Eat identifier token.
+ return false;
+ }
+
+ return Error(StartLoc, "invalid register name");
+}
+
+bool PPCAsmParser::
+ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ SMLoc S = Parser.getTok().getLoc();
+ SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+ const MCExpr *EVal;
+ PPCOperand *Op;
+
+ // Attempt to parse the next token as an immediate
+ switch (getLexer().getKind()) {
+ // Special handling for register names. These are interpreted
+ // as immediates corresponding to the register number.
+ case AsmToken::Percent:
+ Parser.Lex(); // Eat the '%'.
+ unsigned RegNo;
+ int64_t IntVal;
+ if (!MatchRegisterName(Parser.getTok(), RegNo, IntVal)) {
+ Parser.Lex(); // Eat the identifier token.
+ Op = PPCOperand::CreateImm(IntVal, S, E, isPPC64());
+ Operands.push_back(Op);
+ return false;
+ }
+ return Error(S, "invalid register name");
+
+ // All other expressions
+ case AsmToken::LParen:
+ case AsmToken::Plus:
+ case AsmToken::Minus:
+ case AsmToken::Integer:
+ case AsmToken::Identifier:
+ case AsmToken::Dot:
+ case AsmToken::Dollar:
+ if (!getParser().parseExpression(EVal))
+ break;
+ /* fall through */
+ default:
+ return Error(S, "unknown operand");
+ }
+
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(EVal))
+ Op = PPCOperand::CreateImm(CE->getValue(), S, E, isPPC64());
+ else
+ Op = PPCOperand::CreateExpr(EVal, S, E, isPPC64());
+
+ // Push the parsed operand into the list of operands
+ Operands.push_back(Op);
+
+ // Check for D-form memory operands
+ if (getLexer().is(AsmToken::LParen)) {
+ Parser.Lex(); // Eat the '('.
+ S = Parser.getTok().getLoc();
+
+ int64_t IntVal;
+ switch (getLexer().getKind()) {
+ case AsmToken::Percent:
+ Parser.Lex(); // Eat the '%'.
+ unsigned RegNo;
+ if (MatchRegisterName(Parser.getTok(), RegNo, IntVal))
+ return Error(S, "invalid register name");
+ Parser.Lex(); // Eat the identifier token.
+ break;
+
+ case AsmToken::Integer:
+ if (getParser().parseAbsoluteExpression(IntVal) ||
+ IntVal < 0 || IntVal > 31)
+ return Error(S, "invalid register number");
+ break;
+
+ default:
+ return Error(S, "invalid memory operand");
+ }
+
+ if (getLexer().isNot(AsmToken::RParen))
+ return Error(Parser.getTok().getLoc(), "missing ')'");
+ E = Parser.getTok().getLoc();
+ Parser.Lex(); // Eat the ')'.
+
+ Op = PPCOperand::CreateImm(IntVal, S, E, isPPC64());
+ Operands.push_back(Op);
+ }
+
+ return false;
+}
+
+/// Parse an instruction mnemonic followed by its operands.
+bool PPCAsmParser::
+ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ // The first operand is the token for the instruction name.
+ // If the instruction ends in a '.', we need to create a separate
+ // token for it, to match what TableGen is doing.
+ size_t Dot = Name.find('.');
+ StringRef Mnemonic = Name.slice(0, Dot);
+ Operands.push_back(PPCOperand::CreateToken(Mnemonic, NameLoc, isPPC64()));
+ if (Dot != StringRef::npos) {
+ SMLoc DotLoc = SMLoc::getFromPointer(NameLoc.getPointer() + Dot);
+ StringRef DotStr = Name.slice(Dot, StringRef::npos);
+ Operands.push_back(PPCOperand::CreateToken(DotStr, DotLoc, isPPC64()));
+ }
+
+ // If there are no more operands then finish
+ if (getLexer().is(AsmToken::EndOfStatement))
+ return false;
+
+ // Parse the first operand
+ if (ParseOperand(Operands))
+ return true;
+
+ while (getLexer().isNot(AsmToken::EndOfStatement) &&
+ getLexer().is(AsmToken::Comma)) {
+ // Consume the comma token
+ getLexer().Lex();
+
+ // Parse the next operand
+ if (ParseOperand(Operands))
+ return true;
+ }
+
+ return false;
+}
+
+/// ParseDirective parses the PPC specific directives
+bool PPCAsmParser::ParseDirective(AsmToken DirectiveID) {
+ StringRef IDVal = DirectiveID.getIdentifier();
+ if (IDVal == ".word")
+ return ParseDirectiveWord(4, DirectiveID.getLoc());
+ if (IDVal == ".tc")
+ return ParseDirectiveTC(isPPC64()? 8 : 4, DirectiveID.getLoc());
+ return true;
+}
+
+/// ParseDirectiveWord
+/// ::= .word [ expression (, expression)* ]
+bool PPCAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ for (;;) {
+ const MCExpr *Value;
+ if (getParser().parseExpression(Value))
+ return true;
+
+ getParser().getStreamer().EmitValue(Value, Size);
+
+ if (getLexer().is(AsmToken::EndOfStatement))
+ break;
+
+ if (getLexer().isNot(AsmToken::Comma))
+ return Error(L, "unexpected token in directive");
+ Parser.Lex();
+ }
+ }
+
+ Parser.Lex();
+ return false;
+}
+
+/// ParseDirectiveTC
+/// ::= .tc [ symbol (, expression)* ]
+bool PPCAsmParser::ParseDirectiveTC(unsigned Size, SMLoc L) {
+ // Skip TC symbol, which is only used with XCOFF.
+ while (getLexer().isNot(AsmToken::EndOfStatement)
+ && getLexer().isNot(AsmToken::Comma))
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Comma))
+ return Error(L, "unexpected token in directive");
+ Parser.Lex();
+
+ // Align to word size.
+ getParser().getStreamer().EmitValueToAlignment(Size);
+
+ // Emit expressions.
+ return ParseDirectiveWord(Size, L);
+}
+
+/// Force static initialization.
+extern "C" void LLVMInitializePowerPCAsmParser() {
+ RegisterMCAsmParser<PPCAsmParser> A(ThePPC32Target);
+ RegisterMCAsmParser<PPCAsmParser> B(ThePPC64Target);
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "PPCGenAsmMatcher.inc"
diff --git a/lib/Target/PowerPC/CMakeLists.txt b/lib/Target/PowerPC/CMakeLists.txt
index 6036428..e5c5204 100644
--- a/lib/Target/PowerPC/CMakeLists.txt
+++ b/lib/Target/PowerPC/CMakeLists.txt
@@ -1,6 +1,7 @@
set(LLVM_TARGET_DEFINITIONS PPC.td)
tablegen(LLVM PPCGenAsmWriter.inc -gen-asm-writer)
+tablegen(LLVM PPCGenAsmMatcher.inc -gen-asm-matcher)
tablegen(LLVM PPCGenCodeEmitter.inc -gen-emitter)
tablegen(LLVM PPCGenMCCodeEmitter.inc -gen-emitter -mc-emitter)
tablegen(LLVM PPCGenRegisterInfo.inc -gen-register-info)
@@ -26,12 +27,14 @@ add_llvm_target(PowerPCCodeGen
PPCRegisterInfo.cpp
PPCSubtarget.cpp
PPCTargetMachine.cpp
+ PPCTargetObjectFile.cpp
PPCTargetTransformInfo.cpp
PPCSelectionDAGInfo.cpp
)
add_dependencies(LLVMPowerPCCodeGen intrinsics_gen)
+add_subdirectory(AsmParser)
add_subdirectory(InstPrinter)
add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
index bacc108..432167e 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -129,7 +129,10 @@ void PPCInstPrinter::printU6ImmOperand(const MCInst *MI, unsigned OpNo,
void PPCInstPrinter::printS16ImmOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- O << (short)MI->getOperand(OpNo).getImm();
+ if (MI->getOperand(OpNo).isImm())
+ O << (short)MI->getOperand(OpNo).getImm();
+ else
+ printOperand(MI, OpNo, O);
}
void PPCInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
@@ -137,22 +140,14 @@ void PPCInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
O << (unsigned short)MI->getOperand(OpNo).getImm();
}
-void PPCInstPrinter::printS16X4ImmOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- if (MI->getOperand(OpNo).isImm())
- O << (short)(MI->getOperand(OpNo).getImm()*4);
- else
- printOperand(MI, OpNo, O);
-}
-
void PPCInstPrinter::printBranchOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
if (!MI->getOperand(OpNo).isImm())
return printOperand(MI, OpNo, O);
// Branches can take an immediate operand. This is used by the branch
- // selection pass to print $+8, an eight byte displacement from the PC.
- O << "$+";
+ // selection pass to print .+8, an eight byte displacement from the PC.
+ O << ".+";
printAbsAddrOperand(MI, OpNo, O);
}
@@ -182,7 +177,7 @@ void PPCInstPrinter::printcrbitm(const MCInst *MI, unsigned OpNo,
void PPCInstPrinter::printMemRegImm(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- printSymbolLo(MI, OpNo, O);
+ printS16ImmOperand(MI, OpNo, O);
O << '(';
if (MI->getOperand(OpNo+1).getReg() == PPC::R0)
O << "0";
@@ -191,22 +186,6 @@ void PPCInstPrinter::printMemRegImm(const MCInst *MI, unsigned OpNo,
O << ')';
}
-void PPCInstPrinter::printMemRegImmShifted(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- if (MI->getOperand(OpNo).isImm())
- printS16X4ImmOperand(MI, OpNo, O);
- else
- printSymbolLo(MI, OpNo, O);
- O << '(';
-
- if (MI->getOperand(OpNo+1).getReg() == PPC::R0)
- O << "0";
- else
- printOperand(MI, OpNo+1, O);
- O << ')';
-}
-
-
void PPCInstPrinter::printMemRegReg(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
// When used as the base register, r0 reads constant zero rather than
@@ -256,39 +235,4 @@ void PPCInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
assert(Op.isExpr() && "unknown operand kind in printOperand");
O << *Op.getExpr();
}
-
-void PPCInstPrinter::printSymbolLo(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- if (MI->getOperand(OpNo).isImm())
- return printS16ImmOperand(MI, OpNo, O);
-
- // FIXME: This is a terrible hack because we can't encode lo16() as an operand
- // flag of a subtraction. See the FIXME in GetSymbolRef in PPCMCInstLower.
- if (MI->getOperand(OpNo).isExpr() &&
- isa<MCBinaryExpr>(MI->getOperand(OpNo).getExpr())) {
- O << "lo16(";
- printOperand(MI, OpNo, O);
- O << ')';
- } else {
- printOperand(MI, OpNo, O);
- }
-}
-
-void PPCInstPrinter::printSymbolHi(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- if (MI->getOperand(OpNo).isImm())
- return printS16ImmOperand(MI, OpNo, O);
-
- // FIXME: This is a terrible hack because we can't encode lo16() as an operand
- // flag of a subtraction. See the FIXME in GetSymbolRef in PPCMCInstLower.
- if (MI->getOperand(OpNo).isExpr() &&
- isa<MCBinaryExpr>(MI->getOperand(OpNo).getExpr())) {
- O << "ha16(";
- printOperand(MI, OpNo, O);
- O << ')';
- } else {
- printOperand(MI, OpNo, O);
- }
-}
-
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
index 8f1e211..f64a329 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
@@ -50,19 +50,13 @@ public:
void printU6ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printS16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printS16X4ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printAbsAddrOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printcrbitm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printMemRegImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printMemRegImmShifted(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printMemRegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-
- // FIXME: Remove
- void printSymbolLo(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printSymbolHi(const MCInst *MI, unsigned OpNo, raw_ostream &O);
};
} // end namespace llvm
diff --git a/lib/Target/PowerPC/LLVMBuild.txt b/lib/Target/PowerPC/LLVMBuild.txt
index 95fac54..7b3e843 100644
--- a/lib/Target/PowerPC/LLVMBuild.txt
+++ b/lib/Target/PowerPC/LLVMBuild.txt
@@ -16,7 +16,7 @@
;===------------------------------------------------------------------------===;
[common]
-subdirectories = InstPrinter MCTargetDesc TargetInfo
+subdirectories = AsmParser InstPrinter MCTargetDesc TargetInfo
[component_0]
type = TargetGroup
diff --git a/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt b/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt
index b674883..45be471 100644
--- a/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt
@@ -3,6 +3,7 @@ add_llvm_library(LLVMPowerPCDesc
PPCMCTargetDesc.cpp
PPCMCAsmInfo.cpp
PPCMCCodeEmitter.cpp
+ PPCMCExpr.cpp
PPCPredicates.cpp
PPCELFObjectWriter.cpp
)
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index ec26574..3fa2e09 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -22,7 +22,7 @@
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
-static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
+static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
switch (Kind) {
default:
llvm_unreachable("Unknown fixup kind!");
@@ -37,19 +37,35 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
return Value & 0xfffc;
case PPC::fixup_ppc_br24:
return Value & 0x3fffffc;
-#if 0
- case PPC::fixup_ppc_hi16:
- return (Value >> 16) & 0xffff;
-#endif
- case PPC::fixup_ppc_ha16:
- return ((Value >> 16) + ((Value & 0x8000) ? 1 : 0)) & 0xffff;
- case PPC::fixup_ppc_lo16:
+ case PPC::fixup_ppc_half16:
return Value & 0xffff;
- case PPC::fixup_ppc_lo16_ds:
+ case PPC::fixup_ppc_half16ds:
return Value & 0xfffc;
}
}
+static unsigned getFixupKindNumBytes(unsigned Kind) {
+ switch (Kind) {
+ default:
+ llvm_unreachable("Unknown fixup kind!");
+ case FK_Data_1:
+ return 1;
+ case FK_Data_2:
+ case PPC::fixup_ppc_half16:
+ case PPC::fixup_ppc_half16ds:
+ return 2;
+ case FK_Data_4:
+ case PPC::fixup_ppc_brcond14:
+ case PPC::fixup_ppc_br24:
+ return 4;
+ case FK_Data_8:
+ return 8;
+ case PPC::fixup_ppc_tlsreg:
+ case PPC::fixup_ppc_nofixup:
+ return 0;
+ }
+}
+
namespace {
class PPCMachObjectWriter : public MCMachObjectTargetWriter {
public:
@@ -77,9 +93,8 @@ public:
// name offset bits flags
{ "fixup_ppc_br24", 6, 24, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_ppc_brcond14", 16, 14, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_ppc_lo16", 16, 16, 0 },
- { "fixup_ppc_ha16", 16, 16, 0 },
- { "fixup_ppc_lo16_ds", 16, 14, 0 },
+ { "fixup_ppc_half16", 0, 16, 0 },
+ { "fixup_ppc_half16ds", 0, 14, 0 },
{ "fixup_ppc_tlsreg", 0, 0, 0 },
{ "fixup_ppc_nofixup", 0, 0, 0 }
};
@@ -98,12 +113,13 @@ public:
if (!Value) return; // Doesn't change encoding.
unsigned Offset = Fixup.getOffset();
+ unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
// For each byte of the fragment that the fixup touches, mask in the bits
// from the fixup value. The Value has been "split up" into the appropriate
// bitfields above.
- for (unsigned i = 0; i != 4; ++i)
- Data[Offset + i] |= uint8_t((Value >> ((4 - i - 1)*8)) & 0xff);
+ for (unsigned i = 0; i != NumBytes; ++i)
+ Data[Offset + i] |= uint8_t((Value >> ((NumBytes - i - 1)*8)) & 0xff);
}
bool mayNeedRelaxation(const MCInst &Inst) const {
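With getFixupKindNumBytes in place, applyFixup above ORs the fixup value into only as many big-endian bytes as the fixup kind actually covers, instead of always touching four. A standalone illustration of that mask-in loop (the "lis r3, 0" byte image and the offset of 2 are illustrative, matching the half16 fixups created at offset 2 elsewhere in this patch):

#include <cstdint>
#include <cstdio>

// OR Value into NumBytes big-endian bytes at Data[Offset], mirroring the
// loop in applyFixup above.
static void maskInBigEndian(uint8_t *Data, unsigned Offset,
                            unsigned NumBytes, uint64_t Value) {
  for (unsigned i = 0; i != NumBytes; ++i)
    Data[Offset + i] |= uint8_t((Value >> ((NumBytes - i - 1) * 8)) & 0xff);
}

int main() {
  uint8_t Insn[4] = { 0x3c, 0x60, 0x00, 0x00 };  // illustrative "lis r3, 0" image
  maskInBigEndian(Insn, 2, 2, 0x1234);           // 16-bit half16 fixup at byte 2
  std::printf("%02x %02x %02x %02x\n", Insn[0], Insn[1], Insn[2], Insn[3]);
  return 0;                                      // prints: 3c 60 12 34
}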
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
index 81a86dc..7188f93 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
@@ -33,26 +33,9 @@ namespace {
virtual const MCSymbol *undefinedExplicitRelSym(const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const;
- virtual void adjustFixupOffset(const MCFixup &Fixup, uint64_t &RelocOffset);
-
- virtual void sortRelocs(const MCAssembler &Asm,
- std::vector<ELFRelocationEntry> &Relocs);
- };
-
- class PPCELFRelocationEntry : public ELFRelocationEntry {
- public:
- PPCELFRelocationEntry(const ELFRelocationEntry &RE);
- bool operator<(const PPCELFRelocationEntry &RE) const {
- return (RE.r_offset < r_offset ||
- (RE.r_offset == r_offset && RE.Type > Type));
- }
};
}
-PPCELFRelocationEntry::PPCELFRelocationEntry(const ELFRelocationEntry &RE)
- : ELFRelocationEntry(RE.r_offset, RE.Index, RE.Type, RE.Symbol,
- RE.r_addend, *RE.Fixup) {}
-
PPCELFObjectWriter::PPCELFObjectWriter(bool Is64Bit, uint8_t OSABI)
: MCELFObjectTargetWriter(Is64Bit, OSABI,
Is64Bit ? ELF::EM_PPC64 : ELF::EM_PPC,
@@ -98,7 +81,7 @@ unsigned PPCELFObjectWriter::getRelocTypeInner(const MCValue &Target,
case PPC::fixup_ppc_brcond14:
Type = ELF::R_PPC_ADDR14; // XXX: or BRNTAKEN?_
break;
- case PPC::fixup_ppc_ha16:
+ case PPC::fixup_ppc_half16:
switch (Modifier) {
default: llvm_unreachable("Unsupported Modifier");
case MCSymbolRefExpr::VK_PPC_TPREL16_HA:
@@ -107,7 +90,7 @@ unsigned PPCELFObjectWriter::getRelocTypeInner(const MCValue &Target,
case MCSymbolRefExpr::VK_PPC_DTPREL16_HA:
Type = ELF::R_PPC64_DTPREL16_HA;
break;
- case MCSymbolRefExpr::VK_None:
+ case MCSymbolRefExpr::VK_PPC_ADDR16_HA:
Type = ELF::R_PPC_ADDR16_HA;
break;
case MCSymbolRefExpr::VK_PPC_TOC16_HA:
@@ -122,11 +105,6 @@ unsigned PPCELFObjectWriter::getRelocTypeInner(const MCValue &Target,
case MCSymbolRefExpr::VK_PPC_GOT_TLSLD16_HA:
Type = ELF::R_PPC64_GOT_TLSLD16_HA;
break;
- }
- break;
- case PPC::fixup_ppc_lo16:
- switch (Modifier) {
- default: llvm_unreachable("Unsupported Modifier");
case MCSymbolRefExpr::VK_PPC_TPREL16_LO:
Type = ELF::R_PPC_TPREL16_LO;
break;
@@ -134,6 +112,9 @@ unsigned PPCELFObjectWriter::getRelocTypeInner(const MCValue &Target,
Type = ELF::R_PPC64_DTPREL16_LO;
break;
case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_PPC_ADDR16;
+ break;
+ case MCSymbolRefExpr::VK_PPC_ADDR16_LO:
Type = ELF::R_PPC_ADDR16_LO;
break;
case MCSymbolRefExpr::VK_PPC_TOC_ENTRY:
@@ -150,12 +131,15 @@ unsigned PPCELFObjectWriter::getRelocTypeInner(const MCValue &Target,
break;
}
break;
- case PPC::fixup_ppc_lo16_ds:
+ case PPC::fixup_ppc_half16ds:
switch (Modifier) {
default: llvm_unreachable("Unsupported Modifier");
case MCSymbolRefExpr::VK_None:
Type = ELF::R_PPC64_ADDR16_DS;
break;
+ case MCSymbolRefExpr::VK_PPC_ADDR16_LO:
+ Type = ELF::R_PPC64_ADDR16_LO_DS;
+ break;
case MCSymbolRefExpr::VK_PPC_TOC_ENTRY:
Type = ELF::R_PPC64_TOC16_DS;
break;
@@ -231,47 +215,6 @@ const MCSymbol *PPCELFObjectWriter::undefinedExplicitRelSym(const MCValue &Targe
return NULL;
}
-void PPCELFObjectWriter::
-adjustFixupOffset(const MCFixup &Fixup, uint64_t &RelocOffset) {
- switch ((unsigned)Fixup.getKind()) {
- case PPC::fixup_ppc_ha16:
- case PPC::fixup_ppc_lo16:
- case PPC::fixup_ppc_lo16_ds:
- RelocOffset += 2;
- break;
- default:
- break;
- }
-}
-
-// The standard sorter only sorts on the r_offset field, but PowerPC can
-// have multiple relocations at the same offset. Sort secondarily on the
-// relocation type to avoid nondeterminism.
-void PPCELFObjectWriter::sortRelocs(const MCAssembler &Asm,
- std::vector<ELFRelocationEntry> &Relocs) {
-
- // Copy to a temporary vector of relocation entries having a different
- // sort function.
- std::vector<PPCELFRelocationEntry> TmpRelocs;
-
- for (std::vector<ELFRelocationEntry>::iterator R = Relocs.begin();
- R != Relocs.end(); ++R) {
- TmpRelocs.push_back(PPCELFRelocationEntry(*R));
- }
-
- // Sort in place by ascending r_offset and descending r_type.
- array_pod_sort(TmpRelocs.begin(), TmpRelocs.end());
-
- // Copy back to the original vector.
- unsigned I = 0;
- for (std::vector<PPCELFRelocationEntry>::iterator R = TmpRelocs.begin();
- R != TmpRelocs.end(); ++R, ++I) {
- Relocs[I] = ELFRelocationEntry(R->r_offset, R->Index, R->Type,
- R->Symbol, R->r_addend, *R->Fixup);
- }
-}
-
-
MCObjectWriter *llvm::createPPCELFObjectWriter(raw_ostream &OS,
bool Is64Bit,
uint8_t OSABI) {
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
index 86c44f5..3ea59f0 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
@@ -25,17 +25,13 @@ enum Fixups {
/// branches.
fixup_ppc_brcond14,
- /// fixup_ppc_lo16 - A 16-bit fixup corresponding to lo16(_foo) for instrs
- /// like 'li'.
- fixup_ppc_lo16,
+ /// fixup_ppc_half16 - A 16-bit fixup corresponding to lo16(_foo)
+ /// or ha16(_foo) for instrs like 'li' or 'addis'.
+ fixup_ppc_half16,
- /// fixup_ppc_ha16 - A 16-bit fixup corresponding to ha16(_foo) for instrs
- /// like 'lis'.
- fixup_ppc_ha16,
-
- /// fixup_ppc_lo16_ds - A 14-bit fixup corresponding to lo16(_foo) with
+ /// fixup_ppc_half16ds - A 14-bit fixup corresponding to lo16(_foo) with
/// implied 2 zero bits for instrs like 'std'.
- fixup_ppc_lo16_ds,
+ fixup_ppc_half16ds,
/// fixup_ppc_tlsreg - Insert thread-pointer register number.
fixup_ppc_tlsreg,
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index a25d7fe..bb7ce6f 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -59,6 +59,7 @@ PPCLinuxMCAsmInfo::PPCLinuxMCAsmInfo(bool is64Bit) {
// Set up DWARF directives
HasLEB128 = true; // Target asm supports leb128 directives (little-endian)
+ MinInstAlignment = 4;
// Exceptions handling
ExceptionsType = ExceptionHandling::DwarfCFI;
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
index 2223cd6..31c73ae 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -48,10 +48,8 @@ public:
SmallVectorImpl<MCFixup> &Fixups) const;
unsigned getCondBrEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups) const;
- unsigned getHA16Encoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const;
- unsigned getLO16Encoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const;
+ unsigned getS16ImmEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups) const;
unsigned getMemRIEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups) const;
unsigned getMemRIXEncoding(const MCInst &MI, unsigned OpNo,
@@ -136,25 +134,14 @@ unsigned PPCMCCodeEmitter::getCondBrEncoding(const MCInst &MI, unsigned OpNo,
return 0;
}
-unsigned PPCMCCodeEmitter::getHA16Encoding(const MCInst &MI, unsigned OpNo,
+unsigned PPCMCCodeEmitter::getS16ImmEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups) const {
const MCOperand &MO = MI.getOperand(OpNo);
if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups);
// Add a fixup for the branch target.
- Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
- (MCFixupKind)PPC::fixup_ppc_ha16));
- return 0;
-}
-
-unsigned PPCMCCodeEmitter::getLO16Encoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups) const {
- const MCOperand &MO = MI.getOperand(OpNo);
- if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups);
-
- // Add a fixup for the branch target.
- Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
- (MCFixupKind)PPC::fixup_ppc_lo16));
+ Fixups.push_back(MCFixup::Create(2, MO.getExpr(),
+ (MCFixupKind)PPC::fixup_ppc_half16));
return 0;
}
@@ -170,8 +157,8 @@ unsigned PPCMCCodeEmitter::getMemRIEncoding(const MCInst &MI, unsigned OpNo,
return (getMachineOpValue(MI, MO, Fixups) & 0xFFFF) | RegBits;
// Add a fixup for the displacement field.
- Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
- (MCFixupKind)PPC::fixup_ppc_lo16));
+ Fixups.push_back(MCFixup::Create(2, MO.getExpr(),
+ (MCFixupKind)PPC::fixup_ppc_half16));
return RegBits;
}
@@ -185,11 +172,11 @@ unsigned PPCMCCodeEmitter::getMemRIXEncoding(const MCInst &MI, unsigned OpNo,
const MCOperand &MO = MI.getOperand(OpNo);
if (MO.isImm())
- return (getMachineOpValue(MI, MO, Fixups) & 0x3FFF) | RegBits;
+ return ((getMachineOpValue(MI, MO, Fixups) >> 2) & 0x3FFF) | RegBits;
// Add a fixup for the displacement field.
- Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
- (MCFixupKind)PPC::fixup_ppc_lo16_ds));
+ Fixups.push_back(MCFixup::Create(2, MO.getExpr(),
+ (MCFixupKind)PPC::fixup_ppc_half16ds));
return RegBits;
}
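The getMemRIXEncoding change above now shifts the immediate right by two before masking, because DS-form displacements (used by 'std', 'ld', and friends) are 4-byte aligned and the instruction field stores displacement/4 in 14 bits. A standalone sketch of that encoding step:

#include <cassert>
#include <cstdint>
#include <iostream>

// Encode a DS-form displacement: must be a multiple of 4; the field holds
// the displacement shifted right by two, as in getMemRIXEncoding above.
static unsigned encodeDSDisp(int64_t Disp) {
  assert((Disp & 3) == 0 && "DS-form displacement must be 4-byte aligned");
  return (static_cast<uint64_t>(Disp) >> 2) & 0x3FFF;
}

int main() {
  std::cout << encodeDSDisp(8) << "\n";   // prints 2
  std::cout << encodeDSDisp(-8) << "\n";  // prints 16382 (0x3FFE after masking)
  return 0;
}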
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
new file mode 100644
index 0000000..f0613ff
--- /dev/null
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
@@ -0,0 +1,108 @@
+//===-- PPCMCExpr.cpp - PPC specific MC expression classes ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ppcmcexpr"
+#include "PPCMCExpr.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+
+using namespace llvm;
+
+const PPCMCExpr*
+PPCMCExpr::Create(VariantKind Kind, const MCExpr *Expr,
+ MCContext &Ctx) {
+ return new (Ctx) PPCMCExpr(Kind, Expr);
+}
+
+void PPCMCExpr::PrintImpl(raw_ostream &OS) const {
+ switch (Kind) {
+ default: llvm_unreachable("Invalid kind!");
+ case VK_PPC_HA16: OS << "ha16"; break;
+ case VK_PPC_LO16: OS << "lo16"; break;
+ }
+
+ OS << '(';
+ getSubExpr()->print(OS);
+ OS << ')';
+}
+
+bool
+PPCMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
+ const MCAsmLayout *Layout) const {
+ MCValue Value;
+
+ if (!getSubExpr()->EvaluateAsRelocatable(Value, *Layout))
+ return false;
+
+ if (Value.isAbsolute()) {
+ int64_t Result = Value.getConstant();
+ switch (Kind) {
+ default:
+ llvm_unreachable("Invalid kind!");
+ case VK_PPC_HA16:
+ Result = ((Result >> 16) + ((Result & 0x8000) ? 1 : 0)) & 0xffff;
+ break;
+ case VK_PPC_LO16:
+ Result = Result & 0xffff;
+ break;
+ }
+ Res = MCValue::get(Result);
+ } else {
+ MCContext &Context = Layout->getAssembler().getContext();
+ const MCSymbolRefExpr *Sym = Value.getSymA();
+ MCSymbolRefExpr::VariantKind Modifier = Sym->getKind();
+ if (Modifier != MCSymbolRefExpr::VK_None)
+ return false;
+ switch (Kind) {
+ default:
+ llvm_unreachable("Invalid kind!");
+ case VK_PPC_HA16:
+ Modifier = MCSymbolRefExpr::VK_PPC_ADDR16_HA;
+ break;
+ case VK_PPC_LO16:
+ Modifier = MCSymbolRefExpr::VK_PPC_ADDR16_LO;
+ break;
+ }
+ Sym = MCSymbolRefExpr::Create(&Sym->getSymbol(), Modifier, Context);
+ Res = MCValue::get(Sym, Value.getSymB(), Value.getConstant());
+ }
+
+ return true;
+}
+
+// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps
+// that method should be made public?
+static void AddValueSymbols_(const MCExpr *Value, MCAssembler *Asm) {
+ switch (Value->getKind()) {
+ case MCExpr::Target:
+ llvm_unreachable("Can't handle nested target expr!");
+
+ case MCExpr::Constant:
+ break;
+
+ case MCExpr::Binary: {
+ const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value);
+ AddValueSymbols_(BE->getLHS(), Asm);
+ AddValueSymbols_(BE->getRHS(), Asm);
+ break;
+ }
+
+ case MCExpr::SymbolRef:
+ Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol());
+ break;
+
+ case MCExpr::Unary:
+ AddValueSymbols_(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm);
+ break;
+ }
+}
+
+void PPCMCExpr::AddValueSymbols(MCAssembler *Asm) const {
+ AddValueSymbols_(getSubExpr(), Asm);
+}
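The ha16/lo16 evaluation in PPCMCExpr::EvaluateAsRelocatableImpl above uses the usual carry-adjusted split: lo16 is the low half, and ha16 is the high half plus one whenever the low half is negative as a signed 16-bit value, so that an addis/addi (or addis/lwz) pair reconstructs the full 32-bit value. A standalone check of that arithmetic:

#include <cassert>
#include <cstdint>
#include <iostream>

// ha16/lo16 split as computed in EvaluateAsRelocatableImpl above.
static uint16_t lo16(uint32_t V) { return V & 0xffff; }
static uint16_t ha16(uint32_t V) {
  return ((V >> 16) + ((V & 0x8000) ? 1 : 0)) & 0xffff;
}

int main() {
  uint32_t Addr = 0x1000FFF8;  // low half is negative as a signed 16-bit value
  uint32_t Rebuilt = (uint32_t(ha16(Addr)) << 16) + int16_t(lo16(Addr));
  assert(Rebuilt == Addr);     // addis(ha16) + addi(lo16) gets the address back
  std::cout << std::hex << "ha16=0x" << ha16(Addr)
            << " lo16=0x" << lo16(Addr) << "\n";  // ha16=0x1001 lo16=0xfff8
  return 0;
}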
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
new file mode 100644
index 0000000..a080537
--- /dev/null
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
@@ -0,0 +1,78 @@
+//===-- PPCMCExpr.h - PPC specific MC expression classes --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCMCEXPR_H
+#define PPCMCEXPR_H
+
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCAsmLayout.h"
+
+namespace llvm {
+
+class PPCMCExpr : public MCTargetExpr {
+public:
+ enum VariantKind {
+ VK_PPC_None,
+ VK_PPC_HA16,
+ VK_PPC_LO16
+ };
+
+private:
+ const VariantKind Kind;
+ const MCExpr *Expr;
+
+ explicit PPCMCExpr(VariantKind _Kind, const MCExpr *_Expr)
+ : Kind(_Kind), Expr(_Expr) {}
+
+public:
+ /// @name Construction
+ /// @{
+
+ static const PPCMCExpr *Create(VariantKind Kind, const MCExpr *Expr,
+ MCContext &Ctx);
+
+ static const PPCMCExpr *CreateHa16(const MCExpr *Expr, MCContext &Ctx) {
+ return Create(VK_PPC_HA16, Expr, Ctx);
+ }
+
+ static const PPCMCExpr *CreateLo16(const MCExpr *Expr, MCContext &Ctx) {
+ return Create(VK_PPC_LO16, Expr, Ctx);
+ }
+
+ /// @}
+ /// @name Accessors
+ /// @{
+
+ /// getOpcode - Get the kind of this expression.
+ VariantKind getKind() const { return Kind; }
+
+ /// getSubExpr - Get the child of this expression.
+ const MCExpr *getSubExpr() const { return Expr; }
+
+ /// @}
+
+ void PrintImpl(raw_ostream &OS) const;
+ bool EvaluateAsRelocatableImpl(MCValue &Res,
+ const MCAsmLayout *Layout) const;
+ void AddValueSymbols(MCAssembler *) const;
+ const MCSection *FindAssociatedSection() const {
+ return getSubExpr()->FindAssociatedSection();
+ }
+
+ // There are no TLS PPCMCExprs at the moment.
+ void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {}
+
+ static bool classof(const MCExpr *E) {
+ return E->getKind() == MCExpr::Target;
+ }
+};
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 2209f93..2da30f9 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -58,7 +58,7 @@ static MCSubtargetInfo *createPPCMCSubtargetInfo(StringRef TT, StringRef CPU,
return X;
}
-static MCAsmInfo *createPPCMCAsmInfo(const Target &T, StringRef TT) {
+static MCAsmInfo *createPPCMCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) {
Triple TheTriple(TT);
bool isPPC64 = TheTriple.getArch() == Triple::ppc64;
@@ -69,9 +69,10 @@ static MCAsmInfo *createPPCMCAsmInfo(const Target &T, StringRef TT) {
MAI = new PPCLinuxMCAsmInfo(isPPC64);
// Initial state of the frame pointer is R1.
- MachineLocation Dst(MachineLocation::VirtualFP);
- MachineLocation Src(isPPC64? PPC::X1 : PPC::R1, 0);
- MAI->addInitialFrameState(0, Dst, Src);
+ unsigned Reg = isPPC64 ? PPC::X1 : PPC::R1;
+ MCCFIInstruction Inst =
+ MCCFIInstruction::createDefCfa(0, MRI.getDwarfRegNum(Reg, true), 0);
+ MAI->addInitialFrameState(Inst);
return MAI;
}
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
index 444758c..3ab9005 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
@@ -32,7 +32,8 @@ namespace PPC {
PRED_GT = (1 << 5) | 12,
PRED_NE = (2 << 5) | 4,
PRED_UN = (3 << 5) | 12,
- PRED_NU = (3 << 5) | 4
+ PRED_NU = (3 << 5) | 4,
+ PRED_BAD = 0
};
/// Invert the specified predicate. != -> ==, < -> >=.
diff --git a/lib/Target/PowerPC/Makefile b/lib/Target/PowerPC/Makefile
index 1617b26..6666694 100644
--- a/lib/Target/PowerPC/Makefile
+++ b/lib/Target/PowerPC/Makefile
@@ -12,12 +12,12 @@ LIBRARYNAME = LLVMPowerPCCodeGen
TARGET = PPC
# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = PPCGenRegisterInfo.inc \
+BUILT_SOURCES = PPCGenRegisterInfo.inc PPCGenAsmMatcher.inc \
PPCGenAsmWriter.inc PPCGenCodeEmitter.inc \
PPCGenInstrInfo.inc PPCGenDAGISel.inc \
PPCGenSubtargetInfo.inc PPCGenCallingConv.inc \
PPCGenMCCodeEmitter.inc
-DIRS = InstPrinter TargetInfo MCTargetDesc
+DIRS = AsmParser InstPrinter TargetInfo MCTargetDesc
include $(LEVEL)/Makefile.common
diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h
index b4be51a..2e79610 100644
--- a/lib/Target/PowerPC/PPC.h
+++ b/lib/Target/PowerPC/PPC.h
@@ -30,7 +30,10 @@ namespace llvm {
class AsmPrinter;
class MCInst;
- FunctionPass *createPPCCTRLoops();
+ FunctionPass *createPPCCTRLoops(PPCTargetMachine &TM);
+#ifndef NDEBUG
+ FunctionPass *createPPCCTRLoopsVerify();
+#endif
FunctionPass *createPPCEarlyReturnPass();
FunctionPass *createPPCBranchSelectionPass();
FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td
index 649ffc1..eb73c67 100644
--- a/lib/Target/PowerPC/PPC.td
+++ b/lib/Target/PowerPC/PPC.td
@@ -268,9 +268,14 @@ def PPCAsmWriter : AsmWriter {
bit isMCAsmWriter = 1;
}
+def PPCAsmParser : AsmParser {
+ let ShouldEmitMatchRegisterName = 0;
+}
+
def PPC : Target {
// Information about the instructions.
let InstructionSet = PPCInstrInfo;
let AssemblyWriters = [PPCAsmWriter];
+ let AssemblyParsers = [PPCAsmParser];
}
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 3c7cc4e..c43b5c9 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -20,6 +20,7 @@
#include "PPC.h"
#include "InstPrinter/PPCInstPrinter.h"
#include "MCTargetDesc/PPCPredicates.h"
+#include "MCTargetDesc/PPCMCExpr.h"
#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
#include "llvm/ADT/MapVector.h"
@@ -910,6 +911,9 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
OutStreamer.EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol);
const MCExpr *Anon = MCSymbolRefExpr::Create(AnonSymbol, OutContext);
+ const MCExpr *LazyPtrExpr = MCSymbolRefExpr::Create(LazyPtr, OutContext);
+ const MCExpr *Sub =
+ MCBinaryExpr::CreateSub(LazyPtrExpr, Anon, OutContext);
// mflr r0
OutStreamer.EmitInstruction(MCInstBuilder(PPC::MFLR).addReg(PPC::R0));
@@ -919,21 +923,20 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
// mflr r11
OutStreamer.EmitInstruction(MCInstBuilder(PPC::MFLR).addReg(PPC::R11));
// addis r11, r11, ha16(LazyPtr - AnonSymbol)
- const MCExpr *Sub =
- MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create(LazyPtr, OutContext),
- Anon, OutContext);
+ const MCExpr *SubHa16 = PPCMCExpr::CreateHa16(Sub, OutContext);
OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDIS)
.addReg(PPC::R11)
.addReg(PPC::R11)
- .addExpr(Sub));
+ .addExpr(SubHa16));
// mtlr r0
OutStreamer.EmitInstruction(MCInstBuilder(PPC::MTLR).addReg(PPC::R0));
// ldu r12, lo16(LazyPtr - AnonSymbol)(r11)
// lwzu r12, lo16(LazyPtr - AnonSymbol)(r11)
+ const MCExpr *SubLo16 = PPCMCExpr::CreateLo16(Sub, OutContext);
OutStreamer.EmitInstruction(MCInstBuilder(isPPC64 ? PPC::LDU : PPC::LWZU)
.addReg(PPC::R12)
- .addExpr(Sub).addExpr(Sub)
+ .addExpr(SubLo16).addExpr(SubLo16)
.addReg(PPC::R11));
// mtctr r12
OutStreamer.EmitInstruction(MCInstBuilder(PPC::MTCTR).addReg(PPC::R12));
@@ -967,24 +970,22 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
MCSymbol *Stub = Stubs[i].first;
MCSymbol *RawSym = Stubs[i].second.getPointer();
MCSymbol *LazyPtr = GetLazyPtr(Stub, OutContext);
+ const MCExpr *LazyPtrExpr = MCSymbolRefExpr::Create(LazyPtr, OutContext);
OutStreamer.SwitchSection(StubSection);
EmitAlignment(4);
OutStreamer.EmitLabel(Stub);
OutStreamer.EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol);
+
// lis r11, ha16(LazyPtr)
- const MCExpr *LazyPtrHa16 =
- MCSymbolRefExpr::Create(LazyPtr, MCSymbolRefExpr::VK_PPC_DARWIN_HA16,
- OutContext);
+ const MCExpr *LazyPtrHa16 = PPCMCExpr::CreateHa16(LazyPtrExpr, OutContext);
OutStreamer.EmitInstruction(MCInstBuilder(PPC::LIS)
.addReg(PPC::R11)
.addExpr(LazyPtrHa16));
- const MCExpr *LazyPtrLo16 =
- MCSymbolRefExpr::Create(LazyPtr, MCSymbolRefExpr::VK_PPC_DARWIN_LO16,
- OutContext);
// ldu r12, lo16(LazyPtr)(r11)
// lwzu r12, lo16(LazyPtr)(r11)
+ const MCExpr *LazyPtrLo16 = PPCMCExpr::CreateLo16(LazyPtrExpr, OutContext);
OutStreamer.EmitInstruction(MCInstBuilder(isPPC64 ? PPC::LDU : PPC::LWZU)
.addReg(PPC::R12)
.addExpr(LazyPtrLo16).addExpr(LazyPtrLo16)
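
The hunk above replaces the VK_PPC_DARWIN_HA16/LO16 symbol-ref variants with PPCMCExpr::CreateHa16/CreateLo16 wrappers around a shared sub-expression. As a reminder of the arithmetic those fixups resolve to (standard PowerPC convention, not code from this patch), a minimal self-contained sketch:

#include <cassert>
#include <cstdint>

// lo16 is consumed by instructions that sign-extend their 16-bit
// immediate (addi, lwzu), so ha16 pre-adds 0x8000 to compensate.
static uint16_t lo16(uint32_t Addr) { return Addr & 0xFFFF; }
static uint16_t ha16(uint32_t Addr) { return (Addr + 0x8000) >> 16; }

int main() {
  uint32_t Addr = 0x1000FFF0;
  int32_t Lo = (int16_t)lo16(Addr);                  // sign-extended low half
  uint32_t Rebuilt = ((uint32_t)ha16(Addr) << 16) + (uint32_t)Lo;
  assert(Rebuilt == Addr);                           // lis ha16 + lo16 round-trips
  return 0;
}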
diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp
index bd1c378..3e608ca 100644
--- a/lib/Target/PowerPC/PPCBranchSelector.cpp
+++ b/lib/Target/PowerPC/PPCBranchSelector.cpp
@@ -112,15 +112,21 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
unsigned MBBStartOffset = 0;
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
I != E; ++I) {
- if (I->getOpcode() != PPC::BCC || I->getOperand(2).isImm()) {
+ MachineBasicBlock *Dest = 0;
+ if (I->getOpcode() == PPC::BCC && !I->getOperand(2).isImm())
+ Dest = I->getOperand(2).getMBB();
+ else if ((I->getOpcode() == PPC::BDNZ8 || I->getOpcode() == PPC::BDNZ ||
+ I->getOpcode() == PPC::BDZ8 || I->getOpcode() == PPC::BDZ) &&
+ !I->getOperand(0).isImm())
+ Dest = I->getOperand(0).getMBB();
+
+ if (!Dest) {
MBBStartOffset += TII->GetInstSizeInBytes(I);
continue;
}
// Determine the offset from the current branch to the destination
// block.
- MachineBasicBlock *Dest = I->getOperand(2).getMBB();
-
int BranchSize;
if (Dest->getNumber() <= MBB.getNumber()) {
// If this is a backwards branch, the delta is the offset from the
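
The branch selector now also resolves the destinations of the counter branches (BDNZ/BDZ and their 64-bit forms) introduced for CTR loops, so they are range-checked just like BCC. A hedged sketch of the range check involved (ISA numbers, not code from this patch): conditional branches carry a 14-bit signed word displacement.

#include <cassert>
#include <cstdint>

// bcc/bdnz/bdz encode a 14-bit signed word displacement, i.e. a byte
// offset in [-32768, 32764] with the low two bits zero; anything farther
// away has to be rewritten as an inverted branch around an unconditional b.
static bool fitsInCondBranch(int64_t ByteOffset) {
  return (ByteOffset & 3) == 0 && ByteOffset >= -32768 && ByteOffset <= 32764;
}

int main() {
  assert(fitsInCondBranch(-32768));
  assert(!fitsInCondBranch(40000));
  return 0;
}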
diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp
index 81a54d7..08247c2 100644
--- a/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -9,767 +9,619 @@
//
// This pass identifies loops where we can generate the PPC branch instructions
// that decrement and test the count register (CTR) (bdnz and friends).
-// This pass is based on the HexagonHardwareLoops pass.
//
// The pattern that defines the induction variable can change depending on
// prior optimizations. For example, the IndVarSimplify phase run by 'opt'
// normalizes induction variables, and the Loop Strength Reduction pass
// run by 'llc' may also make changes to the induction variable.
-// The pattern detected by this phase is due to running Strength Reduction.
//
// Criteria for CTR loops:
// - Countable loops (w/ ind. var for a trip count)
-// - Assumes loops are normalized by IndVarSimplify
// - Try inner-most loops first
// - No nested CTR loops.
// - No function calls in loops.
//
-// Note: As with unconverted loops, PPCBranchSelector must be run after this
-// pass in order to convert long-displacement jumps into jump pairs.
-//
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "ctrloops"
-#include "PPC.h"
-#include "MCTargetDesc/PPCPredicates.h"
-#include "PPCTargetMachine.h"
-#include "llvm/ADT/DenseMap.h"
+
+#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
#include "llvm/PassSupport.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ValueHandle.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "PPCTargetMachine.h"
+#include "PPC.h"
+
+#ifndef NDEBUG
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#endif
+
#include <algorithm>
+#include <vector>
using namespace llvm;
+#ifndef NDEBUG
+static cl::opt<int> CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1));
+#endif
+
STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops");
namespace llvm {
void initializePPCCTRLoopsPass(PassRegistry&);
+#ifndef NDEBUG
+ void initializePPCCTRLoopsVerifyPass(PassRegistry&);
+#endif
}
namespace {
- class CountValue;
- struct PPCCTRLoops : public MachineFunctionPass {
- MachineLoopInfo *MLI;
- MachineRegisterInfo *MRI;
- const TargetInstrInfo *TII;
+ struct PPCCTRLoops : public FunctionPass {
+
+#ifndef NDEBUG
+ static int Counter;
+#endif
public:
- static char ID; // Pass identification, replacement for typeid
+ static char ID;
- PPCCTRLoops() : MachineFunctionPass(ID) {
+ PPCCTRLoops() : FunctionPass(ID), TM(0) {
+ initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry());
+ }
+ PPCCTRLoops(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) {
initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry());
}
- virtual bool runOnMachineFunction(MachineFunction &MF);
-
- const char *getPassName() const { return "PPC CTR Loops"; }
+ virtual bool runOnFunction(Function &F);
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesCFG();
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
- AU.addRequired<MachineLoopInfo>();
- AU.addPreserved<MachineLoopInfo>();
- MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<LoopInfo>();
+ AU.addPreserved<LoopInfo>();
+ AU.addRequired<DominatorTree>();
+ AU.addPreserved<DominatorTree>();
+ AU.addRequired<ScalarEvolution>();
}
private:
- /// getCanonicalInductionVariable - Check to see if the loop has a canonical
- /// induction variable.
- /// Should be defined in MachineLoop. Based upon version in class Loop.
- void getCanonicalInductionVariable(MachineLoop *L,
- SmallVector<MachineInstr *, 4> &IVars,
- SmallVector<MachineInstr *, 4> &IOps) const;
-
- /// getTripCount - Return a loop-invariant LLVM register indicating the
- /// number of times the loop will be executed. If the trip-count cannot
- /// be determined, this return null.
- CountValue *getTripCount(MachineLoop *L,
- SmallVector<MachineInstr *, 2> &OldInsts) const;
-
- /// isInductionOperation - Return true if the instruction matches the
- /// pattern for an opertion that defines an induction variable.
- bool isInductionOperation(const MachineInstr *MI, unsigned IVReg) const;
-
- /// isInvalidOperation - Return true if the instruction is not valid within
- /// a CTR loop.
- bool isInvalidLoopOperation(const MachineInstr *MI) const;
-
- /// containsInavlidInstruction - Return true if the loop contains an
- /// instruction that inhibits using the CTR loop.
- bool containsInvalidInstruction(MachineLoop *L) const;
-
- /// converToCTRLoop - Given a loop, check if we can convert it to a
- /// CTR loop. If so, then perform the conversion and return true.
- bool convertToCTRLoop(MachineLoop *L);
-
- /// isDead - Return true if the instruction is now dead.
- bool isDead(const MachineInstr *MI,
- SmallVector<MachineInstr *, 1> &DeadPhis) const;
-
- /// removeIfDead - Remove the instruction if it is now dead.
- void removeIfDead(MachineInstr *MI);
+ bool mightUseCTR(const Triple &TT, BasicBlock *BB);
+ bool convertToCTRLoop(Loop *L);
+
+ private:
+ PPCTargetMachine *TM;
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+ DataLayout *TD;
+ DominatorTree *DT;
+ const TargetLibraryInfo *LibInfo;
};
char PPCCTRLoops::ID = 0;
+#ifndef NDEBUG
+ int PPCCTRLoops::Counter = 0;
+#endif
-
- // CountValue class - Abstraction for a trip count of a loop. A
- // smaller vesrsion of the MachineOperand class without the concerns
- // of changing the operand representation.
- class CountValue {
+#ifndef NDEBUG
+ struct PPCCTRLoopsVerify : public MachineFunctionPass {
public:
- enum CountValueType {
- CV_Register,
- CV_Immediate
- };
- private:
- CountValueType Kind;
- union Values {
- unsigned RegNum;
- int64_t ImmVal;
- Values(unsigned r) : RegNum(r) {}
- Values(int64_t i) : ImmVal(i) {}
- } Contents;
- bool isNegative;
+ static char ID;
- public:
- CountValue(unsigned r, bool neg) : Kind(CV_Register), Contents(r),
- isNegative(neg) {}
- explicit CountValue(int64_t i) : Kind(CV_Immediate), Contents(i),
- isNegative(i < 0) {}
- CountValueType getType() const { return Kind; }
- bool isReg() const { return Kind == CV_Register; }
- bool isImm() const { return Kind == CV_Immediate; }
- bool isNeg() const { return isNegative; }
-
- unsigned getReg() const {
- assert(isReg() && "Wrong CountValue accessor");
- return Contents.RegNum;
- }
- void setReg(unsigned Val) {
- Contents.RegNum = Val;
- }
- int64_t getImm() const {
- assert(isImm() && "Wrong CountValue accessor");
- if (isNegative) {
- return -Contents.ImmVal;
- }
- return Contents.ImmVal;
- }
- void setImm(int64_t Val) {
- Contents.ImmVal = Val;
+ PPCCTRLoopsVerify() : MachineFunctionPass(ID) {
+ initializePPCCTRLoopsVerifyPass(*PassRegistry::getPassRegistry());
}
- void print(raw_ostream &OS, const TargetMachine *TM = 0) const {
- if (isReg()) { OS << PrintReg(getReg()); }
- if (isImm()) { OS << getImm(); }
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
}
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ private:
+ MachineDominatorTree *MDT;
};
+
+ char PPCCTRLoopsVerify::ID = 0;
+#endif // NDEBUG
} // end anonymous namespace
INITIALIZE_PASS_BEGIN(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
false, false)
-/// isCompareEquals - Returns true if the instruction is a compare equals
-/// instruction with an immediate operand.
-static bool isCompareEqualsImm(const MachineInstr *MI, bool &SignedCmp,
- bool &Int64Cmp) {
- if (MI->getOpcode() == PPC::CMPWI) {
- SignedCmp = true;
- Int64Cmp = false;
- return true;
- } else if (MI->getOpcode() == PPC::CMPDI) {
- SignedCmp = true;
- Int64Cmp = true;
- return true;
- } else if (MI->getOpcode() == PPC::CMPLWI) {
- SignedCmp = false;
- Int64Cmp = false;
- return true;
- } else if (MI->getOpcode() == PPC::CMPLDI) {
- SignedCmp = false;
- Int64Cmp = true;
- return true;
- }
-
- return false;
+FunctionPass *llvm::createPPCCTRLoops(PPCTargetMachine &TM) {
+ return new PPCCTRLoops(TM);
}
+#ifndef NDEBUG
+INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify",
+ "PowerPC CTR Loops Verify", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(PPCCTRLoopsVerify, "ppc-ctr-loops-verify",
+ "PowerPC CTR Loops Verify", false, false)
-/// createPPCCTRLoops - Factory for creating
-/// the CTR loop phase.
-FunctionPass *llvm::createPPCCTRLoops() {
- return new PPCCTRLoops();
+FunctionPass *llvm::createPPCCTRLoopsVerify() {
+ return new PPCCTRLoopsVerify();
}
+#endif // NDEBUG
+bool PPCCTRLoops::runOnFunction(Function &F) {
+ LI = &getAnalysis<LoopInfo>();
+ SE = &getAnalysis<ScalarEvolution>();
+ DT = &getAnalysis<DominatorTree>();
+ TD = getAnalysisIfAvailable<DataLayout>();
+ LibInfo = getAnalysisIfAvailable<TargetLibraryInfo>();
-bool PPCCTRLoops::runOnMachineFunction(MachineFunction &MF) {
- DEBUG(dbgs() << "********* PPC CTR Loops *********\n");
-
- bool Changed = false;
+ bool MadeChange = false;
- // get the loop information
- MLI = &getAnalysis<MachineLoopInfo>();
- // get the register information
- MRI = &MF.getRegInfo();
- // the target specific instructio info.
- TII = MF.getTarget().getInstrInfo();
-
- for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end();
+ for (LoopInfo::iterator I = LI->begin(), E = LI->end();
I != E; ++I) {
- MachineLoop *L = *I;
- if (!L->getParentLoop()) {
- Changed |= convertToCTRLoop(L);
- }
+ Loop *L = *I;
+ if (!L->getParentLoop())
+ MadeChange |= convertToCTRLoop(L);
}
- return Changed;
+ return MadeChange;
}
-/// getCanonicalInductionVariable - Check to see if the loop has a canonical
-/// induction variable. We check for a simple recurrence pattern - an
-/// integer recurrence that decrements by one each time through the loop and
-/// ends at zero. If so, return the phi node that corresponds to it.
-///
-/// Based upon the similar code in LoopInfo except this code is specific to
-/// the machine.
-/// This method assumes that the IndVarSimplify pass has been run by 'opt'.
-///
-void
-PPCCTRLoops::getCanonicalInductionVariable(MachineLoop *L,
- SmallVector<MachineInstr *, 4> &IVars,
- SmallVector<MachineInstr *, 4> &IOps) const {
- MachineBasicBlock *TopMBB = L->getTopBlock();
- MachineBasicBlock::pred_iterator PI = TopMBB->pred_begin();
- assert(PI != TopMBB->pred_end() &&
- "Loop must have more than one incoming edge!");
- MachineBasicBlock *Backedge = *PI++;
- if (PI == TopMBB->pred_end()) return; // dead loop
- MachineBasicBlock *Incoming = *PI++;
- if (PI != TopMBB->pred_end()) return; // multiple backedges?
-
- // make sure there is one incoming and one backedge and determine which
- // is which.
- if (L->contains(Incoming)) {
- if (L->contains(Backedge))
- return;
- std::swap(Incoming, Backedge);
- } else if (!L->contains(Backedge))
- return;
-
- // Loop over all of the PHI nodes, looking for a canonical induction variable:
- // - The PHI node is "reg1 = PHI reg2, BB1, reg3, BB2".
- // - The recurrence comes from the backedge.
- // - the definition is an induction operatio.n
- for (MachineBasicBlock::iterator I = TopMBB->begin(), E = TopMBB->end();
- I != E && I->isPHI(); ++I) {
- MachineInstr *MPhi = &*I;
- unsigned DefReg = MPhi->getOperand(0).getReg();
- for (unsigned i = 1; i != MPhi->getNumOperands(); i += 2) {
- // Check each operand for the value from the backedge.
- MachineBasicBlock *MBB = MPhi->getOperand(i+1).getMBB();
- if (L->contains(MBB)) { // operands comes from the backedge
- // Check if the definition is an induction operation.
- MachineInstr *DI = MRI->getVRegDef(MPhi->getOperand(i).getReg());
- if (isInductionOperation(DI, DefReg)) {
- IOps.push_back(DI);
- IVars.push_back(MPhi);
+bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
+ for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
+ J != JE; ++J) {
+ if (CallInst *CI = dyn_cast<CallInst>(J)) {
+ if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) {
+ // Inline ASM is okay, unless it clobbers the ctr register.
+ InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints();
+ for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) {
+ InlineAsm::ConstraintInfo &C = CIV[i];
+ if (C.Type != InlineAsm::isInput)
+ for (unsigned j = 0, je = C.Codes.size(); j < je; ++j)
+ if (StringRef(C.Codes[j]).equals_lower("{ctr}"))
+ return true;
}
- }
- }
- }
- return;
-}
-/// getTripCount - Return a loop-invariant LLVM value indicating the
-/// number of times the loop will be executed. The trip count can
-/// be either a register or a constant value. If the trip-count
-/// cannot be determined, this returns null.
-///
-/// We find the trip count from the phi instruction that defines the
-/// induction variable. We follow the links to the CMP instruction
-/// to get the trip count.
-///
-/// Based upon getTripCount in LoopInfo.
-///
-CountValue *PPCCTRLoops::getTripCount(MachineLoop *L,
- SmallVector<MachineInstr *, 2> &OldInsts) const {
- MachineBasicBlock *LastMBB = L->getExitingBlock();
- // Don't generate a CTR loop if the loop has more than one exit.
- if (LastMBB == 0)
- return 0;
-
- MachineBasicBlock::iterator LastI = LastMBB->getFirstTerminator();
- if (LastI->getOpcode() != PPC::BCC)
- return 0;
-
- // We need to make sure that this compare is defining the condition
- // register actually used by the terminating branch.
-
- unsigned PredReg = LastI->getOperand(1).getReg();
- DEBUG(dbgs() << "Examining loop with first terminator: " << *LastI);
-
- unsigned PredCond = LastI->getOperand(0).getImm();
- if (PredCond != PPC::PRED_EQ && PredCond != PPC::PRED_NE)
- return 0;
-
- // Check that the loop has a induction variable.
- SmallVector<MachineInstr *, 4> IVars, IOps;
- getCanonicalInductionVariable(L, IVars, IOps);
- for (unsigned i = 0; i < IVars.size(); ++i) {
- MachineInstr *IOp = IOps[i];
- MachineInstr *IV_Inst = IVars[i];
-
- // Canonical loops will end with a 'cmpwi/cmpdi cr, IV, Imm',
- // if Imm is 0, get the count from the PHI opnd
- // if Imm is -M, than M is the count
- // Otherwise, Imm is the count
- MachineOperand *IV_Opnd;
- const MachineOperand *InitialValue;
- if (!L->contains(IV_Inst->getOperand(2).getMBB())) {
- InitialValue = &IV_Inst->getOperand(1);
- IV_Opnd = &IV_Inst->getOperand(3);
- } else {
- InitialValue = &IV_Inst->getOperand(3);
- IV_Opnd = &IV_Inst->getOperand(1);
- }
+ continue;
+ }
- DEBUG(dbgs() << "Considering:\n");
- DEBUG(dbgs() << " induction operation: " << *IOp);
- DEBUG(dbgs() << " induction variable: " << *IV_Inst);
- DEBUG(dbgs() << " initial value: " << *InitialValue << "\n");
-
- // Look for the cmp instruction to determine if we
- // can get a useful trip count. The trip count can
- // be either a register or an immediate. The location
- // of the value depends upon the type (reg or imm).
- for (MachineRegisterInfo::reg_iterator
- RI = MRI->reg_begin(IV_Opnd->getReg()), RE = MRI->reg_end();
- RI != RE; ++RI) {
- IV_Opnd = &RI.getOperand();
- bool SignedCmp, Int64Cmp;
- MachineInstr *MI = IV_Opnd->getParent();
- if (L->contains(MI) && isCompareEqualsImm(MI, SignedCmp, Int64Cmp) &&
- MI->getOperand(0).getReg() == PredReg) {
-
- OldInsts.push_back(MI);
- OldInsts.push_back(IOp);
-
- DEBUG(dbgs() << " compare: " << *MI);
-
- const MachineOperand &MO = MI->getOperand(2);
- assert(MO.isImm() && "IV Cmp Operand should be an immediate");
-
- int64_t ImmVal;
- if (SignedCmp)
- ImmVal = (short) MO.getImm();
- else
- ImmVal = MO.getImm();
-
- const MachineInstr *IV_DefInstr = MRI->getVRegDef(IV_Opnd->getReg());
- assert(L->contains(IV_DefInstr->getParent()) &&
- "IV definition should occurs in loop");
- int64_t iv_value = (short) IV_DefInstr->getOperand(2).getImm();
-
- assert(InitialValue->isReg() && "Expecting register for init value");
- unsigned InitialValueReg = InitialValue->getReg();
-
- MachineInstr *DefInstr = MRI->getVRegDef(InitialValueReg);
-
- // Here we need to look for an immediate load (an li or lis/ori pair).
- if (DefInstr && (DefInstr->getOpcode() == PPC::ORI8 ||
- DefInstr->getOpcode() == PPC::ORI)) {
- int64_t start = DefInstr->getOperand(2).getImm();
- MachineInstr *DefInstr2 =
- MRI->getVRegDef(DefInstr->getOperand(1).getReg());
- if (DefInstr2 && (DefInstr2->getOpcode() == PPC::LIS8 ||
- DefInstr2->getOpcode() == PPC::LIS)) {
- DEBUG(dbgs() << " initial constant: " << *DefInstr);
- DEBUG(dbgs() << " initial constant: " << *DefInstr2);
-
- start |= int64_t(short(DefInstr2->getOperand(1).getImm())) << 16;
-
- int64_t count = ImmVal - start;
- if ((count % iv_value) != 0) {
- return 0;
- }
-
- OldInsts.push_back(DefInstr);
- OldInsts.push_back(DefInstr2);
-
- // count/iv_value, the trip count, should be positive here. If it
- // is negative, that indicates that the counter will wrap.
- if (Int64Cmp)
- return new CountValue(count/iv_value);
- else
- return new CountValue(uint32_t(count/iv_value));
- }
- } else if (DefInstr && (DefInstr->getOpcode() == PPC::LI8 ||
- DefInstr->getOpcode() == PPC::LI)) {
- DEBUG(dbgs() << " initial constant: " << *DefInstr);
-
- int64_t count = ImmVal -
- int64_t(short(DefInstr->getOperand(1).getImm()));
- if ((count % iv_value) != 0) {
- return 0;
+ if (!TM)
+ return true;
+ const TargetLowering *TLI = TM->getTargetLowering();
+
+ if (Function *F = CI->getCalledFunction()) {
+ // Most intrinsics don't become function calls, but some might.
+ // sin, cos, exp and log are always calls.
+ unsigned Opcode;
+ if (F->getIntrinsicID() != Intrinsic::not_intrinsic) {
+ switch (F->getIntrinsicID()) {
+ default: continue;
+
+// VisualStudio defines setjmp as _setjmp
+#if defined(_MSC_VER) && defined(setjmp) && \
+ !defined(setjmp_undefined_for_msvc)
+# pragma push_macro("setjmp")
+# undef setjmp
+# define setjmp_undefined_for_msvc
+#endif
+
+ case Intrinsic::setjmp:
+
+#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc)
+ // let's return it to _setjmp state
+# pragma pop_macro("setjmp")
+# undef setjmp_undefined_for_msvc
+#endif
+
+ case Intrinsic::longjmp:
+ case Intrinsic::memcpy:
+ case Intrinsic::memmove:
+ case Intrinsic::memset:
+ case Intrinsic::powi:
+ case Intrinsic::log:
+ case Intrinsic::log2:
+ case Intrinsic::log10:
+ case Intrinsic::exp:
+ case Intrinsic::exp2:
+ case Intrinsic::pow:
+ case Intrinsic::sin:
+ case Intrinsic::cos:
+ return true;
+ case Intrinsic::sqrt: Opcode = ISD::FSQRT; break;
+ case Intrinsic::floor: Opcode = ISD::FFLOOR; break;
+ case Intrinsic::ceil: Opcode = ISD::FCEIL; break;
+ case Intrinsic::trunc: Opcode = ISD::FTRUNC; break;
+ case Intrinsic::rint: Opcode = ISD::FRINT; break;
+ case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
}
+ }
- OldInsts.push_back(DefInstr);
-
- if (Int64Cmp)
- return new CountValue(count/iv_value);
- else
- return new CountValue(uint32_t(count/iv_value));
- } else if (iv_value == 1 || iv_value == -1) {
- // We can't determine a constant starting value.
- if (ImmVal == 0) {
- return new CountValue(InitialValueReg, iv_value > 0);
+ // PowerPC does not use [US]DIVREM or other library calls for
+ // operations on regular types which are not otherwise library calls
+ // (i.e. soft float or atomics). If adapting for targets that do,
+ // additional care is required here.
+
+ LibFunc::Func Func;
+ if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
+ LibInfo->getLibFunc(F->getName(), Func) &&
+ LibInfo->hasOptimizedCodeGen(Func)) {
+ // Non-read-only functions are never treated as intrinsics.
+ if (!CI->onlyReadsMemory())
+ return true;
+
+ // Conversion happens only for FP calls.
+ if (!CI->getArgOperand(0)->getType()->isFloatingPointTy())
+ return true;
+
+ switch (Func) {
+ default: return true;
+ case LibFunc::copysign:
+ case LibFunc::copysignf:
+ case LibFunc::copysignl:
+ continue; // ISD::FCOPYSIGN is never a library call.
+ case LibFunc::fabs:
+ case LibFunc::fabsf:
+ case LibFunc::fabsl:
+ continue; // ISD::FABS is never a library call.
+ case LibFunc::sqrt:
+ case LibFunc::sqrtf:
+ case LibFunc::sqrtl:
+ Opcode = ISD::FSQRT; break;
+ case LibFunc::floor:
+ case LibFunc::floorf:
+ case LibFunc::floorl:
+ Opcode = ISD::FFLOOR; break;
+ case LibFunc::nearbyint:
+ case LibFunc::nearbyintf:
+ case LibFunc::nearbyintl:
+ Opcode = ISD::FNEARBYINT; break;
+ case LibFunc::ceil:
+ case LibFunc::ceilf:
+ case LibFunc::ceill:
+ Opcode = ISD::FCEIL; break;
+ case LibFunc::rint:
+ case LibFunc::rintf:
+ case LibFunc::rintl:
+ Opcode = ISD::FRINT; break;
+ case LibFunc::trunc:
+ case LibFunc::truncf:
+ case LibFunc::truncl:
+ Opcode = ISD::FTRUNC; break;
}
- // FIXME: handle non-zero end value.
+
+ MVT VTy =
+ TLI->getSimpleValueType(CI->getArgOperand(0)->getType(), true);
+ if (VTy == MVT::Other)
+ return true;
+
+ if (TLI->isOperationLegalOrCustom(Opcode, VTy))
+ continue;
+ else if (VTy.isVector() &&
+ TLI->isOperationLegalOrCustom(Opcode, VTy.getScalarType()))
+ continue;
+
+ return true;
}
- // FIXME: handle non-unit increments (we might not want to introduce
- // division but we can handle some 2^n cases with shifts).
-
}
- }
- }
- return 0;
-}
-
-/// isInductionOperation - return true if the operation is matches the
-/// pattern that defines an induction variable:
-/// addi iv, c
-///
-bool
-PPCCTRLoops::isInductionOperation(const MachineInstr *MI,
- unsigned IVReg) const {
- return ((MI->getOpcode() == PPC::ADDI || MI->getOpcode() == PPC::ADDI8) &&
- MI->getOperand(1).isReg() && // could be a frame index instead
- MI->getOperand(1).getReg() == IVReg);
-}
-/// isInvalidOperation - Return true if the operation is invalid within
-/// CTR loop.
-bool
-PPCCTRLoops::isInvalidLoopOperation(const MachineInstr *MI) const {
-
- // call is not allowed because the callee may use a CTR loop
- if (MI->getDesc().isCall()) {
- return true;
- }
- // check if the instruction defines a CTR loop register
- // (this will also catch nested CTR loops)
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
- if (MO.isReg() && MO.isDef() &&
- (MO.getReg() == PPC::CTR || MO.getReg() == PPC::CTR8)) {
return true;
- }
- }
- return false;
-}
+ } else if (isa<BinaryOperator>(J) &&
+ J->getType()->getScalarType()->isPPC_FP128Ty()) {
+ // Most operations on ppc_f128 values become calls.
+ return true;
+ } else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
+ isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
+ CastInst *CI = cast<CastInst>(J);
+ if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
+ CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
+ (TT.isArch32Bit() &&
+ (CI->getSrcTy()->getScalarType()->isIntegerTy(64) ||
+ CI->getDestTy()->getScalarType()->isIntegerTy(64))
+ ))
+ return true;
+ } else if (TT.isArch32Bit() &&
+ J->getType()->getScalarType()->isIntegerTy(64) &&
+ (J->getOpcode() == Instruction::UDiv ||
+ J->getOpcode() == Instruction::SDiv ||
+ J->getOpcode() == Instruction::URem ||
+ J->getOpcode() == Instruction::SRem)) {
+ return true;
+ } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
+ // On PowerPC, indirect jumps use the counter register.
+ return true;
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
+ if (!TM)
+ return true;
+ const TargetLowering *TLI = TM->getTargetLowering();
-/// containsInvalidInstruction - Return true if the loop contains
-/// an instruction that inhibits the use of the CTR loop function.
-///
-bool PPCCTRLoops::containsInvalidInstruction(MachineLoop *L) const {
- const std::vector<MachineBasicBlock*> Blocks = L->getBlocks();
- for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
- MachineBasicBlock *MBB = Blocks[i];
- for (MachineBasicBlock::iterator
- MII = MBB->begin(), E = MBB->end(); MII != E; ++MII) {
- const MachineInstr *MI = &*MII;
- if (isInvalidLoopOperation(MI)) {
+ if (TLI->supportJumpTables() &&
+ SI->getNumCases()+1 >= (unsigned) TLI->getMinimumJumpTableEntries())
return true;
- }
}
}
+
return false;
}
-/// isDead returns true if the instruction is dead
-/// (this was essentially copied from DeadMachineInstructionElim::isDead, but
-/// with special cases for inline asm, physical registers and instructions with
-/// side effects removed)
-bool PPCCTRLoops::isDead(const MachineInstr *MI,
- SmallVector<MachineInstr *, 1> &DeadPhis) const {
- // Examine each operand.
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
- if (MO.isReg() && MO.isDef()) {
- unsigned Reg = MO.getReg();
- if (!MRI->use_nodbg_empty(Reg)) {
- // This instruction has users, but if the only user is the phi node for
- // the parent block, and the only use of that phi node is this
- // instruction, then this instruction is dead: both it (and the phi
- // node) can be removed.
- MachineRegisterInfo::use_iterator I = MRI->use_begin(Reg);
- if (llvm::next(I) == MRI->use_end() &&
- I.getOperand().getParent()->isPHI()) {
- MachineInstr *OnePhi = I.getOperand().getParent();
-
- for (unsigned j = 0, f = OnePhi->getNumOperands(); j != f; ++j) {
- const MachineOperand &OPO = OnePhi->getOperand(j);
- if (OPO.isReg() && OPO.isDef()) {
- unsigned OPReg = OPO.getReg();
-
- MachineRegisterInfo::use_iterator nextJ;
- for (MachineRegisterInfo::use_iterator J = MRI->use_begin(OPReg),
- E = MRI->use_end(); J!=E; J=nextJ) {
- nextJ = llvm::next(J);
- MachineOperand& Use = J.getOperand();
- MachineInstr *UseMI = Use.getParent();
-
- if (MI != UseMI) {
- // The phi node has a user that is not MI, bail...
- return false;
- }
- }
- }
- }
+bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
+ bool MadeChange = false;
- DeadPhis.push_back(OnePhi);
- } else {
- // This def has a non-debug use. Don't delete the instruction!
- return false;
- }
- }
- }
+ Triple TT = Triple(L->getHeader()->getParent()->getParent()->
+ getTargetTriple());
+ if (!TT.isArch32Bit() && !TT.isArch64Bit())
+ return MadeChange; // Unknown arch. type.
+
+ // Process nested loops first.
+ for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
+ MadeChange |= convertToCTRLoop(*I);
}
- // If there are no defs with uses, the instruction is dead.
- return true;
-}
+ // If a nested loop has been converted, then we can't convert this loop.
+ if (MadeChange)
+ return MadeChange;
+
+#ifndef NDEBUG
+ // Stop trying after reaching the limit (if any).
+ int Limit = CTRLoopLimit;
+ if (Limit >= 0) {
+ if (Counter >= CTRLoopLimit)
+ return false;
+ Counter++;
+ }
+#endif
+
+ // We don't want to spill/restore the counter register, and so we don't
+ // want to use the counter register if the loop contains calls.
+ for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
+ I != IE; ++I)
+ if (mightUseCTR(TT, *I))
+ return MadeChange;
+
+ SmallVector<BasicBlock*, 4> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+
+ BasicBlock *CountedExitBlock = 0;
+ const SCEV *ExitCount = 0;
+ BranchInst *CountedExitBranch = 0;
+ for (SmallVector<BasicBlock*, 4>::iterator I = ExitingBlocks.begin(),
+ IE = ExitingBlocks.end(); I != IE; ++I) {
+ const SCEV *EC = SE->getExitCount(L, *I);
+ DEBUG(dbgs() << "Exit Count for " << *L << " from block " <<
+ (*I)->getName() << ": " << *EC << "\n");
+ if (isa<SCEVCouldNotCompute>(EC))
+ continue;
+ if (const SCEVConstant *ConstEC = dyn_cast<SCEVConstant>(EC)) {
+ if (ConstEC->getValue()->isZero())
+ continue;
+ } else if (!SE->isLoopInvariant(EC, L))
+ continue;
+
+ // We now have a loop-invariant count of loop iterations (which is not the
+ // constant zero) for which we know that this loop will not exit via this
+  // exiting block.
+
+ // We need to make sure that this block will run on every loop iteration.
+ // For this to be true, we must dominate all blocks with backedges. Such
+ // blocks are in-loop predecessors to the header block.
+ bool NotAlways = false;
+ for (pred_iterator PI = pred_begin(L->getHeader()),
+ PIE = pred_end(L->getHeader()); PI != PIE; ++PI) {
+ if (!L->contains(*PI))
+ continue;
-void PPCCTRLoops::removeIfDead(MachineInstr *MI) {
- // This procedure was essentially copied from DeadMachineInstructionElim
+ if (!DT->dominates(*I, *PI)) {
+ NotAlways = true;
+ break;
+ }
+ }
- SmallVector<MachineInstr *, 1> DeadPhis;
- if (isDead(MI, DeadPhis)) {
- DEBUG(dbgs() << "CTR looping will remove: " << *MI);
+ if (NotAlways)
+ continue;
- // It is possible that some DBG_VALUE instructions refer to this
- // instruction. Examine each def operand for such references;
- // if found, mark the DBG_VALUE as undef (but don't delete it).
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
- if (!MO.isReg() || !MO.isDef())
+    // Make sure this block ends with a conditional branch.
+ Instruction *TI = (*I)->getTerminator();
+ if (!TI)
+ continue;
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ if (!BI->isConditional())
continue;
- unsigned Reg = MO.getReg();
- MachineRegisterInfo::use_iterator nextI;
- for (MachineRegisterInfo::use_iterator I = MRI->use_begin(Reg),
- E = MRI->use_end(); I!=E; I=nextI) {
- nextI = llvm::next(I); // I is invalidated by the setReg
- MachineOperand& Use = I.getOperand();
- MachineInstr *UseMI = Use.getParent();
- if (UseMI==MI)
- continue;
- if (Use.isDebug()) // this might also be a instr -> phi -> instr case
- // which can also be removed.
- UseMI->getOperand(0).setReg(0U);
- }
- }
- MI->eraseFromParent();
- for (unsigned i = 0; i < DeadPhis.size(); ++i) {
- DeadPhis[i]->eraseFromParent();
- }
+ CountedExitBranch = BI;
+ } else
+ continue;
+
+ // Note that this block may not be the loop latch block, even if the loop
+ // has a latch block.
+ CountedExitBlock = *I;
+ ExitCount = EC;
+ break;
}
+
+ if (!CountedExitBlock)
+ return MadeChange;
+
+ BasicBlock *Preheader = L->getLoopPreheader();
+
+ // If we don't have a preheader, then insert one. If we already have a
+ // preheader, then we can use it (except if the preheader contains a use of
+ // the CTR register because some such uses might be reordered by the
+ // selection DAG after the mtctr instruction).
+ if (!Preheader || mightUseCTR(TT, Preheader))
+ Preheader = InsertPreheaderForLoop(L, this);
+ if (!Preheader)
+ return MadeChange;
+
+ DEBUG(dbgs() << "Preheader for exit count: " << Preheader->getName() << "\n");
+
+ // Insert the count into the preheader and replace the condition used by the
+ // selected branch.
+ MadeChange = true;
+
+ SCEVExpander SCEVE(*SE, "loopcnt");
+ LLVMContext &C = SE->getContext();
+ Type *CountType = TT.isArch64Bit() ? Type::getInt64Ty(C) :
+ Type::getInt32Ty(C);
+ if (!ExitCount->getType()->isPointerTy() &&
+ ExitCount->getType() != CountType)
+ ExitCount = SE->getZeroExtendExpr(ExitCount, CountType);
+ ExitCount = SE->getAddExpr(ExitCount,
+ SE->getConstant(CountType, 1));
+ Value *ECValue = SCEVE.expandCodeFor(ExitCount, CountType,
+ Preheader->getTerminator());
+
+ IRBuilder<> CountBuilder(Preheader->getTerminator());
+ Module *M = Preheader->getParent()->getParent();
+ Value *MTCTRFunc = Intrinsic::getDeclaration(M, Intrinsic::ppc_mtctr,
+ CountType);
+ CountBuilder.CreateCall(MTCTRFunc, ECValue);
+
+ IRBuilder<> CondBuilder(CountedExitBranch);
+ Value *DecFunc =
+ Intrinsic::getDeclaration(M, Intrinsic::ppc_is_decremented_ctr_nonzero);
+ Value *NewCond = CondBuilder.CreateCall(DecFunc);
+ Value *OldCond = CountedExitBranch->getCondition();
+ CountedExitBranch->setCondition(NewCond);
+
+ // The false branch must exit the loop.
+ if (!L->contains(CountedExitBranch->getSuccessor(0)))
+ CountedExitBranch->swapSuccessors();
+
+ // The old condition may be dead now, and may have even created a dead PHI
+ // (the original induction variable).
+ RecursivelyDeleteTriviallyDeadInstructions(OldCond);
+ DeleteDeadPHIs(CountedExitBlock);
+
+ ++NumCTRLoops;
+ return MadeChange;
}
-/// converToCTRLoop - check if the loop is a candidate for
-/// converting to a CTR loop. If so, then perform the
-/// transformation.
-///
-/// This function works on innermost loops first. A loop can
-/// be converted if it is a counting loop; either a register
-/// value or an immediate.
-///
-/// The code makes several assumptions about the representation
-/// of the loop in llvm.
-bool PPCCTRLoops::convertToCTRLoop(MachineLoop *L) {
- bool Changed = false;
- // Process nested loops first.
- for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
- Changed |= convertToCTRLoop(*I);
- }
- // If a nested loop has been converted, then we can't convert this loop.
- if (Changed) {
- return Changed;
+#ifndef NDEBUG
+static bool clobbersCTR(const MachineInstr *MI) {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg()) {
+ if (MO.isDef() && (MO.getReg() == PPC::CTR || MO.getReg() == PPC::CTR8))
+ return true;
+ } else if (MO.isRegMask()) {
+ if (MO.clobbersPhysReg(PPC::CTR) || MO.clobbersPhysReg(PPC::CTR8))
+ return true;
+ }
}
- SmallVector<MachineInstr *, 2> OldInsts;
- // Are we able to determine the trip count for the loop?
- CountValue *TripCount = getTripCount(L, OldInsts);
- if (TripCount == 0) {
- DEBUG(dbgs() << "failed to get trip count!\n");
- return false;
- }
+ return false;
+}
- if (TripCount->isImm()) {
- DEBUG(dbgs() << "constant trip count: " << TripCount->getImm() << "\n");
+static bool verifyCTRBranch(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I) {
+ MachineBasicBlock::iterator BI = I;
+ SmallSet<MachineBasicBlock *, 16> Visited;
+ SmallVector<MachineBasicBlock *, 8> Preds;
+ bool CheckPreds;
+
+ if (I == MBB->begin()) {
+ Visited.insert(MBB);
+ goto queue_preds;
+ } else
+ --I;
+
+check_block:
+ Visited.insert(MBB);
+ if (I == MBB->end())
+ goto queue_preds;
+
+ CheckPreds = true;
+ for (MachineBasicBlock::iterator IE = MBB->begin();; --I) {
+ unsigned Opc = I->getOpcode();
+ if (Opc == PPC::MTCTRloop || Opc == PPC::MTCTR8loop) {
+ CheckPreds = false;
+ break;
+ }
- // FIXME: We currently can't form 64-bit constants
- // (including 32-bit unsigned constants)
- if (!isInt<32>(TripCount->getImm()))
+ if (I != BI && clobbersCTR(I)) {
+ DEBUG(dbgs() << "BB#" << MBB->getNumber() << " (" <<
+ MBB->getFullName() << ") instruction " << *I <<
+ " clobbers CTR, invalidating " << "BB#" <<
+ BI->getParent()->getNumber() << " (" <<
+ BI->getParent()->getFullName() << ") instruction " <<
+ *BI << "\n");
return false;
- }
+ }
- // Does the loop contain any invalid instructions?
- if (containsInvalidInstruction(L)) {
- return false;
+ if (I == IE)
+ break;
}
- MachineBasicBlock *Preheader = L->getLoopPreheader();
- // No preheader means there's not place for the loop instr.
- if (Preheader == 0) {
- return false;
- }
- MachineBasicBlock::iterator InsertPos = Preheader->getFirstTerminator();
- DebugLoc dl;
- if (InsertPos != Preheader->end())
- dl = InsertPos->getDebugLoc();
+ if (!CheckPreds && Preds.empty())
+ return true;
- MachineBasicBlock *LastMBB = L->getExitingBlock();
- // Don't generate CTR loop if the loop has more than one exit.
- if (LastMBB == 0) {
- return false;
- }
- MachineBasicBlock::iterator LastI = LastMBB->getFirstTerminator();
-
- // Determine the loop start.
- MachineBasicBlock *LoopStart = L->getTopBlock();
- if (L->getLoopLatch() != LastMBB) {
- // When the exit and latch are not the same, use the latch block as the
- // start.
- // The loop start address is used only after the 1st iteration, and the loop
- // latch may contains instrs. that need to be executed after the 1st iter.
- LoopStart = L->getLoopLatch();
- // Make sure the latch is a successor of the exit, otherwise it won't work.
- if (!LastMBB->isSuccessor(LoopStart)) {
+ if (CheckPreds) {
+queue_preds:
+ if (MachineFunction::iterator(MBB) == MBB->getParent()->begin()) {
+ DEBUG(dbgs() << "Unable to find a MTCTR instruction for BB#" <<
+ BI->getParent()->getNumber() << " (" <<
+ BI->getParent()->getFullName() << ") instruction " <<
+ *BI << "\n");
return false;
}
+
+ for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+ PIE = MBB->pred_end(); PI != PIE; ++PI)
+ Preds.push_back(*PI);
}
- // Convert the loop to a CTR loop
- DEBUG(dbgs() << "Change to CTR loop at "; L->dump());
-
- MachineFunction *MF = LastMBB->getParent();
- const PPCSubtarget &Subtarget = MF->getTarget().getSubtarget<PPCSubtarget>();
- bool isPPC64 = Subtarget.isPPC64();
-
- const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
- const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
- const TargetRegisterClass *RC = isPPC64 ? G8RC : GPRC;
-
- unsigned CountReg;
- if (TripCount->isReg()) {
- // Create a copy of the loop count register.
- const TargetRegisterClass *SrcRC =
- MF->getRegInfo().getRegClass(TripCount->getReg());
- CountReg = MF->getRegInfo().createVirtualRegister(RC);
- unsigned CopyOp = (isPPC64 && GPRC->hasSubClassEq(SrcRC)) ?
- (unsigned) PPC::EXTSW_32_64 :
- (unsigned) TargetOpcode::COPY;
- BuildMI(*Preheader, InsertPos, dl,
- TII->get(CopyOp), CountReg).addReg(TripCount->getReg());
- if (TripCount->isNeg()) {
- unsigned CountReg1 = CountReg;
- CountReg = MF->getRegInfo().createVirtualRegister(RC);
- BuildMI(*Preheader, InsertPos, dl,
- TII->get(isPPC64 ? PPC::NEG8 : PPC::NEG),
- CountReg).addReg(CountReg1);
+ do {
+ MBB = Preds.pop_back_val();
+ if (!Visited.count(MBB)) {
+ I = MBB->getLastNonDebugInstr();
+ goto check_block;
}
- } else {
- assert(TripCount->isImm() && "Expecting immedate vaule for trip count");
- // Put the trip count in a register for transfer into the count register.
-
- int64_t CountImm = TripCount->getImm();
- if (TripCount->isNeg())
- CountImm = -CountImm;
-
- CountReg = MF->getRegInfo().createVirtualRegister(RC);
- if (abs64(CountImm) > 0x7FFF) {
- BuildMI(*Preheader, InsertPos, dl,
- TII->get(isPPC64 ? PPC::LIS8 : PPC::LIS),
- CountReg).addImm((CountImm >> 16) & 0xFFFF);
- unsigned CountReg1 = CountReg;
- CountReg = MF->getRegInfo().createVirtualRegister(RC);
- BuildMI(*Preheader, InsertPos, dl,
- TII->get(isPPC64 ? PPC::ORI8 : PPC::ORI),
- CountReg).addReg(CountReg1).addImm(CountImm & 0xFFFF);
- } else {
- BuildMI(*Preheader, InsertPos, dl,
- TII->get(isPPC64 ? PPC::LI8 : PPC::LI),
- CountReg).addImm(CountImm);
- }
- }
+ } while (!Preds.empty());
- // Add the mtctr instruction to the beginning of the loop.
- BuildMI(*Preheader, InsertPos, dl,
- TII->get(isPPC64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(CountReg,
- TripCount->isImm() ? RegState::Kill : 0);
-
- // Make sure the loop start always has a reference in the CFG. We need to
- // create a BlockAddress operand to get this mechanism to work both the
- // MachineBasicBlock and BasicBlock objects need the flag set.
- LoopStart->setHasAddressTaken();
- // This line is needed to set the hasAddressTaken flag on the BasicBlock
- // object
- BlockAddress::get(const_cast<BasicBlock *>(LoopStart->getBasicBlock()));
-
- // Replace the loop branch with a bdnz instruction.
- dl = LastI->getDebugLoc();
- const std::vector<MachineBasicBlock*> Blocks = L->getBlocks();
- for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
- MachineBasicBlock *MBB = Blocks[i];
- if (MBB != Preheader)
- MBB->addLiveIn(isPPC64 ? PPC::CTR8 : PPC::CTR);
- }
+ return true;
+}
- // The loop ends with either:
- // - a conditional branch followed by an unconditional branch, or
- // - a conditional branch to the loop start.
- assert(LastI->getOpcode() == PPC::BCC &&
- "loop end must start with a BCC instruction");
- // Either the BCC branches to the beginning of the loop, or it
- // branches out of the loop and there is an unconditional branch
- // to the start of the loop.
- MachineBasicBlock *BranchTarget = LastI->getOperand(2).getMBB();
- BuildMI(*LastMBB, LastI, dl,
- TII->get((BranchTarget == LoopStart) ?
- (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
- (isPPC64 ? PPC::BDZ8 : PPC::BDZ))).addMBB(BranchTarget);
-
- // Conditional branch; just delete it.
- DEBUG(dbgs() << "Removing old branch: " << *LastI);
- LastMBB->erase(LastI);
-
- delete TripCount;
-
- // The induction operation (add) and the comparison (cmpwi) may now be
- // unneeded. If these are unneeded, then remove them.
- for (unsigned i = 0; i < OldInsts.size(); ++i)
- removeIfDead(OldInsts[i]);
+bool PPCCTRLoopsVerify::runOnMachineFunction(MachineFunction &MF) {
+ MDT = &getAnalysis<MachineDominatorTree>();
+
+ // Verify that all bdnz/bdz instructions are dominated by a loop mtctr before
+ // any other instructions that might clobber the ctr register.
+ for (MachineFunction::iterator I = MF.begin(), IE = MF.end();
+ I != IE; ++I) {
+ MachineBasicBlock *MBB = I;
+ if (!MDT->isReachableFromEntry(MBB))
+ continue;
+
+ for (MachineBasicBlock::iterator MII = MBB->getFirstTerminator(),
+ MIIE = MBB->end(); MII != MIIE; ++MII) {
+ unsigned Opc = MII->getOpcode();
+ if (Opc == PPC::BDNZ8 || Opc == PPC::BDNZ ||
+ Opc == PPC::BDZ8 || Opc == PPC::BDZ)
+ if (!verifyCTRBranch(MBB, MII))
+ llvm_unreachable("Invalid PPC CTR loop!");
+ }
+ }
- ++NumCTRLoops;
- return true;
+ return false;
}
+#endif // NDEBUG
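
The rewritten pass works on IR rather than MachineInstrs: it screens each block with mightUseCTR, expands the chosen exit count with SCEVExpander, emits the llvm.ppc.mtctr intrinsic in the preheader, and replaces the exit branch's condition with llvm.ppc.is.decremented.ctr.nonzero. The ExitCount + 1 above reflects bdnz semantics: SCEV's exit count is the number of backedge-taken iterations, one fewer than the number of body executions, and the counter is decremented before it is tested. A minimal sketch of that behavior, modelling CTR as a plain integer (illustration only):

#include <cassert>
#include <cstdint>

// bdnz-style loop: decrement first, then branch while nonzero, so a loop
// whose body must run N times has to start with the counter equal to N.
static unsigned runCtrLoop(uint64_t Ctr) {
  unsigned Bodies = 0;
  do {
    ++Bodies;             // loop body
  } while (--Ctr != 0);   // bdnz: decrement CTR, branch if nonzero
  return Bodies;
}

int main() {
  assert(runCtrLoop(1) == 1);
  assert(runCtrLoop(5) == 5);
  return 0;
}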
diff --git a/lib/Target/PowerPC/PPCCodeEmitter.cpp b/lib/Target/PowerPC/PPCCodeEmitter.cpp
index 6478718..40e4968 100644
--- a/lib/Target/PowerPC/PPCCodeEmitter.cpp
+++ b/lib/Target/PowerPC/PPCCodeEmitter.cpp
@@ -64,8 +64,7 @@ namespace {
unsigned getDirectBrEncoding(const MachineInstr &MI, unsigned OpNo) const;
unsigned getCondBrEncoding(const MachineInstr &MI, unsigned OpNo) const;
- unsigned getHA16Encoding(const MachineInstr &MI, unsigned OpNo) const;
- unsigned getLO16Encoding(const MachineInstr &MI, unsigned OpNo) const;
+ unsigned getS16ImmEncoding(const MachineInstr &MI, unsigned OpNo) const;
unsigned getMemRIEncoding(const MachineInstr &MI, unsigned OpNo) const;
unsigned getMemRIXEncoding(const MachineInstr &MI, unsigned OpNo) const;
unsigned getTLSRegEncoding(const MachineInstr &MI, unsigned OpNo) const;
@@ -194,21 +193,19 @@ unsigned PPCCodeEmitter::getCondBrEncoding(const MachineInstr &MI,
return 0;
}
-unsigned PPCCodeEmitter::getHA16Encoding(const MachineInstr &MI,
- unsigned OpNo) const {
+unsigned PPCCodeEmitter::getS16ImmEncoding(const MachineInstr &MI,
+ unsigned OpNo) const {
const MachineOperand &MO = MI.getOperand(OpNo);
if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO);
- MCE.addRelocation(GetRelocation(MO, PPC::reloc_absolute_high));
- return 0;
-}
+ unsigned RelocID;
+ switch (MO.getTargetFlags() & PPCII::MO_ACCESS_MASK) {
+ default: llvm_unreachable("Unsupported target operand flags!");
+ case PPCII::MO_HA16: RelocID = PPC::reloc_absolute_high; break;
+ case PPCII::MO_LO16: RelocID = PPC::reloc_absolute_low; break;
+ }
-unsigned PPCCodeEmitter::getLO16Encoding(const MachineInstr &MI,
- unsigned OpNo) const {
- const MachineOperand &MO = MI.getOperand(OpNo);
- if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO);
-
- MCE.addRelocation(GetRelocation(MO, PPC::reloc_absolute_low));
+ MCE.addRelocation(GetRelocation(MO, RelocID));
return 0;
}
@@ -237,7 +234,7 @@ unsigned PPCCodeEmitter::getMemRIXEncoding(const MachineInstr &MI,
const MachineOperand &MO = MI.getOperand(OpNo);
if (MO.isImm())
- return (getMachineOpValue(MI, MO) & 0x3FFF) | RegBits;
+ return ((getMachineOpValue(MI, MO) >> 2) & 0x3FFF) | RegBits;
MCE.addRelocation(GetRelocation(MO, PPC::reloc_absolute_low_ix));
return RegBits;
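
getMemRIXEncoding now shifts the immediate right by two because DS-form memory instructions (std, ld, stdu) store a word-scaled 14-bit displacement; the frame-lowering hunks below drop the matching /4 pre-division so the MachineInstr operand carries the raw byte offset. A hedged sketch of the field math (illustrative, not the emitter itself):

#include <cassert>
#include <cstdint>

// DS-form displacement: a byte offset that is a multiple of 4 in
// [-32768, 32764], stored as a 14-bit two's-complement word offset.
static uint32_t encodeDS(int32_t ByteOff) {
  assert((ByteOff & 3) == 0 && ByteOff >= -32768 && ByteOff <= 32764);
  return (uint32_t)(ByteOff >> 2) & 0x3FFF;
}

int main() {
  assert(encodeDS(16) == 4);
  assert(encodeDS(-8) == 0x3FFE);   // two's complement in the 14-bit field
  return 0;
}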
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index 9ec10f6..dabe613 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -334,6 +334,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
*static_cast<const PPCInstrInfo*>(MF.getTarget().getInstrInfo());
MachineModuleInfo &MMI = MF.getMMI();
+ const MCRegisterInfo &MRI = MMI.getContext().getRegisterInfo();
DebugLoc dl;
bool needsFrameMoves = MMI.hasDebugInfo() ||
MF.getFunction()->needsUnwindTableEntry();
@@ -400,13 +401,13 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
if (HasFP)
BuildMI(MBB, MBBI, dl, TII.get(PPC::STD))
.addReg(PPC::X31)
- .addImm(FPOffset/4)
+ .addImm(FPOffset)
.addReg(PPC::X1);
if (MustSaveLR)
BuildMI(MBB, MBBI, dl, TII.get(PPC::STD))
.addReg(PPC::X0)
- .addImm(LROffset / 4)
+ .addImm(LROffset)
.addReg(PPC::X1);
if (!MustSaveCRs.empty())
@@ -500,7 +501,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
} else if (isInt<16>(NegFrameSize)) {
BuildMI(MBB, MBBI, dl, TII.get(PPC::STDU), PPC::X1)
.addReg(PPC::X1)
- .addImm(NegFrameSize / 4)
+ .addImm(NegFrameSize)
.addReg(PPC::X1);
} else {
BuildMI(MBB, MBBI, dl, TII.get(PPC::LIS8), PPC::X0)
@@ -515,8 +516,6 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
}
}
- std::vector<MachineMove> &Moves = MMI.getFrameMoves();
-
// Add the "machine moves" for the instructions we generated above, but in
// reverse order.
if (needsFrameMoves) {
@@ -525,25 +524,22 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
BuildMI(MBB, MBBI, dl, TII.get(PPC::PROLOG_LABEL)).addSym(FrameLabel);
// Show update of SP.
- if (NegFrameSize) {
- MachineLocation SPDst(MachineLocation::VirtualFP);
- MachineLocation SPSrc(MachineLocation::VirtualFP, NegFrameSize);
- Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc));
- } else {
- MachineLocation SP(isPPC64 ? PPC::X31 : PPC::R31);
- Moves.push_back(MachineMove(FrameLabel, SP, SP));
- }
+ assert(NegFrameSize);
+ MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(FrameLabel, NegFrameSize));
if (HasFP) {
- MachineLocation FPDst(MachineLocation::VirtualFP, FPOffset);
- MachineLocation FPSrc(isPPC64 ? PPC::X31 : PPC::R31);
- Moves.push_back(MachineMove(FrameLabel, FPDst, FPSrc));
+ unsigned Reg = isPPC64 ? PPC::X31 : PPC::R31;
+ Reg = MRI.getDwarfRegNum(Reg, true);
+ MMI.addFrameInst(
+ MCCFIInstruction::createOffset(FrameLabel, Reg, FPOffset));
}
if (MustSaveLR) {
- MachineLocation LRDst(MachineLocation::VirtualFP, LROffset);
- MachineLocation LRSrc(isPPC64 ? PPC::LR8 : PPC::LR);
- Moves.push_back(MachineMove(FrameLabel, LRDst, LRSrc));
+ unsigned Reg = isPPC64 ? PPC::LR8 : PPC::LR;
+ Reg = MRI.getDwarfRegNum(Reg, true);
+ MMI.addFrameInst(
+ MCCFIInstruction::createOffset(FrameLabel, Reg, LROffset));
}
}
@@ -567,10 +563,10 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
// Mark effective beginning of when frame pointer is ready.
BuildMI(MBB, MBBI, dl, TII.get(PPC::PROLOG_LABEL)).addSym(ReadyLabel);
- MachineLocation FPDst(HasFP ? (isPPC64 ? PPC::X31 : PPC::R31) :
- (isPPC64 ? PPC::X1 : PPC::R1));
- MachineLocation FPSrc(MachineLocation::VirtualFP);
- Moves.push_back(MachineMove(ReadyLabel, FPDst, FPSrc));
+ unsigned Reg = HasFP ? (isPPC64 ? PPC::X31 : PPC::R31)
+ : (isPPC64 ? PPC::X1 : PPC::R1);
+ Reg = MRI.getDwarfRegNum(Reg, true);
+ MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(ReadyLabel, Reg));
}
}
@@ -600,16 +596,14 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
if (Subtarget.isSVR4ABI()
&& Subtarget.isPPC64()
&& (PPC::CR2 <= Reg && Reg <= PPC::CR4)) {
- MachineLocation CSDst(PPC::X1, 8);
- MachineLocation CSSrc(PPC::CR2);
- Moves.push_back(MachineMove(Label, CSDst, CSSrc));
+ MMI.addFrameInst(MCCFIInstruction::createOffset(
+ Label, MRI.getDwarfRegNum(PPC::CR2, true), 8));
continue;
}
int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx());
- MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
- MachineLocation CSSrc(Reg);
- Moves.push_back(MachineMove(Label, CSDst, CSSrc));
+ MMI.addFrameInst(MCCFIInstruction::createOffset(
+ Label, MRI.getDwarfRegNum(Reg, true), Offset));
}
}
}
@@ -747,7 +741,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
if (isPPC64) {
if (MustSaveLR)
BuildMI(MBB, MBBI, dl, TII.get(PPC::LD), PPC::X0)
- .addImm(LROffset/4).addReg(PPC::X1);
+ .addImm(LROffset).addReg(PPC::X1);
if (!MustSaveCRs.empty())
BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ8), PPC::X12)
@@ -755,7 +749,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
if (HasFP)
BuildMI(MBB, MBBI, dl, TII.get(PPC::LD), PPC::X31)
- .addImm(FPOffset/4).addReg(PPC::X1);
+ .addImm(FPOffset).addReg(PPC::X1);
if (!MustSaveCRs.empty())
for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
@@ -1170,6 +1164,7 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
FuncInfo->addMustSaveCR(Reg);
} else {
CRSpilled = true;
+ FuncInfo->setSpillsCR();
// 32-bit: FP-relative. Note that we made sure CR2-CR4 all have
// the same frame index in PPCRegisterInfo::hasReservedSpillSlot.
diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
index 4bf1e33..0df50e1 100644
--- a/lib/Target/PowerPC/PPCHazardRecognizers.cpp
+++ b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
@@ -71,8 +71,8 @@ void PPCScoreboardHazardRecognizer::Reset() {
// 3. Handling of the esoteric cases in "Resource-based Instruction Grouping".
//
-PPCHazardRecognizer970::PPCHazardRecognizer970(const TargetInstrInfo &tii)
- : TII(tii) {
+PPCHazardRecognizer970::PPCHazardRecognizer970(const TargetMachine &TM)
+ : TM(TM) {
EndDispatchGroup();
}
@@ -91,7 +91,7 @@ PPCHazardRecognizer970::GetInstrType(unsigned Opcode,
bool &isFirst, bool &isSingle,
bool &isCracked,
bool &isLoad, bool &isStore) {
- const MCInstrDesc &MCID = TII.get(Opcode);
+ const MCInstrDesc &MCID = TM.getInstrInfo()->get(Opcode);
isLoad = MCID.mayLoad();
isStore = MCID.mayStore();
diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.h b/lib/Target/PowerPC/PPCHazardRecognizers.h
index 55b45d0..84b8e6d 100644
--- a/lib/Target/PowerPC/PPCHazardRecognizers.h
+++ b/lib/Target/PowerPC/PPCHazardRecognizers.h
@@ -43,7 +43,7 @@ public:
/// setting the CTR register then branching through it within a dispatch group),
/// or storing then loading from the same address within a dispatch group.
class PPCHazardRecognizer970 : public ScheduleHazardRecognizer {
- const TargetInstrInfo &TII;
+ const TargetMachine &TM;
unsigned NumIssued; // Number of insts issued, including advanced cycles.
@@ -64,7 +64,7 @@ class PPCHazardRecognizer970 : public ScheduleHazardRecognizer {
unsigned NumStores;
public:
- PPCHazardRecognizer970(const TargetInstrInfo &TII);
+ PPCHazardRecognizer970(const TargetMachine &TM);
virtual HazardType getHazardType(SUnit *SU, int Stalls);
virtual void EmitInstruction(SUnit *SU);
virtual void AdvanceCycle();
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index aed0fbb..e006945 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -110,13 +110,13 @@ namespace {
/// SelectCC - Select a comparison of the specified values with the
/// specified condition code, returning the CR# of the expression.
- SDValue SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, DebugLoc dl);
+ SDValue SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDLoc dl);
/// SelectAddrImm - Returns true if the address N can be represented by
/// a base register plus a signed 16-bit displacement [r+imm].
bool SelectAddrImm(SDValue N, SDValue &Disp,
SDValue &Base) {
- return PPCLowering.SelectAddressRegImm(N, Disp, Base, *CurDAG);
+ return PPCLowering.SelectAddressRegImm(N, Disp, Base, *CurDAG, false);
}
/// SelectAddrImmOffs - Return true if the operand is valid for a preinc
@@ -145,11 +145,11 @@ namespace {
return PPCLowering.SelectAddressRegRegOnly(N, Base, Index, *CurDAG);
}
- /// SelectAddrImmShift - Returns true if the address N can be represented by
- /// a base register plus a signed 14-bit displacement [r+imm*4]. Suitable
- /// for use by STD and friends.
- bool SelectAddrImmShift(SDValue N, SDValue &Disp, SDValue &Base) {
- return PPCLowering.SelectAddressRegImmShift(N, Disp, Base, *CurDAG);
+ /// SelectAddrImmX4 - Returns true if the address N can be represented by
+ /// a base register plus a signed 16-bit displacement that is a multiple of 4.
+ /// Suitable for use by STD and friends.
+ bool SelectAddrImmX4(SDValue N, SDValue &Disp, SDValue &Base) {
+ return PPCLowering.SelectAddressRegImm(N, Disp, Base, *CurDAG, true);
}
// Select an address into a single register.
@@ -332,17 +332,17 @@ static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
bool PPCDAGToDAGISel::isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME) {
if (isShiftedMask_32(Val)) {
// look for the first non-zero bit
- MB = CountLeadingZeros_32(Val);
+ MB = countLeadingZeros(Val);
// look for the first zero bit after the run of ones
- ME = CountLeadingZeros_32((Val - 1) ^ Val);
+ ME = countLeadingZeros((Val - 1) ^ Val);
return true;
} else {
Val = ~Val; // invert mask
if (isShiftedMask_32(Val)) {
// effectively look for the first zero bit
- ME = CountLeadingZeros_32(Val) - 1;
+ ME = countLeadingZeros(Val) - 1;
// effectively look for the first one bit after the run of zeros
- MB = CountLeadingZeros_32((Val - 1) ^ Val) + 1;
+ MB = countLeadingZeros((Val - 1) ^ Val) + 1;
return true;
}
}
@@ -397,7 +397,7 @@ bool PPCDAGToDAGISel::isRotateAndMask(SDNode *N, unsigned Mask,
SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
APInt LKZ, LKO, RKZ, RKO;
CurDAG->ComputeMaskedBits(Op0, LKZ, LKO);
@@ -466,7 +466,7 @@ SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) {
/// SelectCC - Select a comparison of the specified values with the specified
/// condition code, returning the CR# of the expression.
SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS,
- ISD::CondCode CC, DebugLoc dl) {
+ ISD::CondCode CC, SDLoc dl) {
// Always select the LHS.
unsigned Opc;
@@ -710,7 +710,7 @@ static unsigned int getVCmpEQInst(MVT::SimpleValueType VecVT) {
SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
unsigned Imm;
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
EVT PtrVT = CurDAG->getTargetLoweringInfo().getPointerTy();
@@ -894,7 +894,7 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
// Select - Convert the specified operand from a target-independent to a
// target-specific node if it hasn't already been changed.
SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
if (N->isMachineOpcode())
return NULL; // Already selected.
@@ -912,7 +912,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
// If it can't be represented as a 32 bit value.
if (!isInt<32>(Imm)) {
- Shift = CountTrailingZeros_64(Imm);
+ Shift = countTrailingZeros<uint64_t>(Imm);
int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift;
// If the shifted value fits 32 bits.
@@ -1242,6 +1242,15 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
getI32Imm(BROpc) };
return CurDAG->SelectNodeTo(N, SelectCCOp, N->getValueType(0), Ops, 4);
}
+ case PPCISD::BDNZ:
+ case PPCISD::BDZ: {
+ bool IsPPC64 = PPCSubTarget.isPPC64();
+ SDValue Ops[] = { N->getOperand(1), N->getOperand(0) };
+ return CurDAG->SelectNodeTo(N, N->getOpcode() == PPCISD::BDNZ ?
+ (IsPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
+ (IsPPC64 ? PPC::BDZ8 : PPC::BDZ),
+ MVT::Other, Ops, 2);
+ }
case PPCISD::COND_BRANCH: {
// Op #0 is the Chain.
// Op #1 is the PPC::PRED_* number.
@@ -1519,7 +1528,7 @@ void PPCDAGToDAGISel::PostprocessISelDAG() {
// immediate operand, add it now.
if (ReplaceFlags) {
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) {
- DebugLoc dl = GA->getDebugLoc();
+ SDLoc dl(GA);
const GlobalValue *GV = GA->getGlobal();
ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, 0, Flags);
} else if (ConstantPoolSDNode *CP =
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 3fcafdc..e2433e7 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -16,6 +16,7 @@
#include "PPCMachineFunctionInfo.h"
#include "PPCPerfectShuffle.h"
#include "PPCTargetMachine.h"
+#include "PPCTargetObjectFile.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -36,21 +37,6 @@
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
-static bool CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State);
-static bool CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
- MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State);
-static bool CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
- MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State);
-
static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
@@ -64,14 +50,15 @@ static TargetLoweringObjectFile *CreateTLOF(const PPCTargetMachine &TM) {
if (TM.getSubtargetImpl()->isDarwin())
return new TargetLoweringObjectFileMachO();
+ if (TM.getSubtargetImpl()->isSVR4ABI())
+ return new PPC64LinuxTargetObjectFile();
+
return new TargetLoweringObjectFileELF();
}
PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
: TargetLowering(TM, CreateTLOF(TM)), PPCSubTarget(*TM.getSubtargetImpl()) {
const PPCSubtarget *Subtarget = &TM.getSubtarget<PPCSubtarget>();
- PPCRegInfo = TM.getRegisterInfo();
- PPCII = TM.getInstrInfo();
setPow2DivIsCheap();
@@ -309,6 +296,9 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+ // To handle counter-based loop conditions.
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom);
+
// Comparisons that require checking two conditions.
setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
@@ -529,9 +519,11 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::SINT_TO_FP);
+ setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::BR_CC);
setTargetDAGCombine(ISD::BSWAP);
+ setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
// Use reciprocal estimates.
if (TM.Options.UnsafeFPMath) {
@@ -642,6 +634,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::LARX: return "PPCISD::LARX";
case PPCISD::STCX: return "PPCISD::STCX";
case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH";
+ case PPCISD::BDNZ: return "PPCISD::BDNZ";
+ case PPCISD::BDZ: return "PPCISD::BDZ";
case PPCISD::MFFS: return "PPCISD::MFFS";
case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ";
case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN";
@@ -662,10 +656,11 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA";
case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L";
case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT";
+ case PPCISD::SC: return "PPCISD::SC";
}
}
-EVT PPCTargetLowering::getSetCCResultType(EVT VT) const {
+EVT PPCTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
if (!VT.isVector())
return MVT::i32;
return VT.changeVectorElementTypeToInteger();
@@ -1038,20 +1033,23 @@ bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
/// Returns true if the address N can be represented by a base register plus
/// a signed 16-bit displacement [r+imm], and if it is not better
-/// represented as reg+reg.
+/// represented as reg+reg. If Aligned is true, only accept displacements
+/// suitable for STD and friends, i.e. multiples of 4.
bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
SDValue &Base,
- SelectionDAG &DAG) const {
+ SelectionDAG &DAG,
+ bool Aligned) const {
// FIXME dl should come from parent load or store, not from address
- DebugLoc dl = N.getDebugLoc();
+ SDLoc dl(N);
// If this can be more profitably realized as r+r, fail.
if (SelectAddressRegReg(N, Disp, Base, DAG))
return false;
if (N.getOpcode() == ISD::ADD) {
short imm = 0;
- if (isIntS16Immediate(N.getOperand(1), imm)) {
- Disp = DAG.getTargetConstant((int)imm & 0xFFFF, MVT::i32);
+ if (isIntS16Immediate(N.getOperand(1), imm) &&
+ (!Aligned || (imm & 3) == 0)) {
+ Disp = DAG.getTargetConstant(imm, N.getValueType());
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
} else {
@@ -1072,7 +1070,8 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
}
} else if (N.getOpcode() == ISD::OR) {
short imm = 0;
- if (isIntS16Immediate(N.getOperand(1), imm)) {
+ if (isIntS16Immediate(N.getOperand(1), imm) &&
+ (!Aligned || (imm & 3) == 0)) {
// If this is an or of disjoint bitfields, we can codegen this as an add
// (for better address arithmetic) if the LHS and RHS of the OR are
// provably disjoint.
@@ -1083,7 +1082,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
// If all of the bits are known zero on the LHS or RHS, the add won't
// carry.
Base = N.getOperand(0);
- Disp = DAG.getTargetConstant((int)imm & 0xFFFF, MVT::i32);
+ Disp = DAG.getTargetConstant(imm, N.getValueType());
return true;
}
}
@@ -1093,7 +1092,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
// If this address fits entirely in a 16-bit sext immediate field, codegen
// this as "d, 0"
short Imm;
- if (isIntS16Immediate(CN, Imm)) {
+ if (isIntS16Immediate(CN, Imm) && (!Aligned || (Imm & 3) == 0)) {
Disp = DAG.getTargetConstant(Imm, CN->getValueType(0));
Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
CN->getValueType(0));
@@ -1101,8 +1100,9 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
}
// Handle 32-bit sext immediates with LIS + addr mode.
- if (CN->getValueType(0) == MVT::i32 ||
- (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) {
+ if ((CN->getValueType(0) == MVT::i32 ||
+ (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) &&
+ (!Aligned || (CN->getZExtValue() & 3) == 0)) {
int Addr = (int)CN->getZExtValue();
// Otherwise, break this down into an LIS + disp.
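The new Aligned flag only adds the low-two-bits check seen in the hunks above: D-form instructions accept any signed 16-bit displacement, while DS-form instructions (STD, LD and friends) also require a multiple of 4. A self-contained illustration of that test, with a helper name invented for this sketch:

#include <cassert>
#include <cstdint>

// Illustrative only; mirrors the displacement test SelectAddressRegImm now
// performs when its Aligned parameter is set.
static bool isUsableDisplacement(int64_t Imm, bool Aligned) {
  bool FitsS16 = Imm >= -32768 && Imm <= 32767; // signed 16-bit immediate
  bool IsMult4 = (Imm & 3) == 0;                // low two bits clear
  return FitsS16 && (!Aligned || IsMult4);
}

int main() {
  assert(isUsableDisplacement(100, /*Aligned=*/true));     // ok for STD
  assert(!isUsableDisplacement(102, /*Aligned=*/true));    // not a multiple of 4
  assert(isUsableDisplacement(102, /*Aligned=*/false));    // fine for STW
  assert(!isUsableDisplacement(40000, /*Aligned=*/false)); // does not fit s16
  return 0;
}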
@@ -1150,92 +1150,6 @@ bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
return true;
}
-/// SelectAddressRegImmShift - Returns true if the address N can be
-/// represented by a base register plus a signed 14-bit displacement
-/// [r+imm*4]. Suitable for use by STD and friends.
-bool PPCTargetLowering::SelectAddressRegImmShift(SDValue N, SDValue &Disp,
- SDValue &Base,
- SelectionDAG &DAG) const {
- // FIXME dl should come from the parent load or store, not the address
- DebugLoc dl = N.getDebugLoc();
- // If this can be more profitably realized as r+r, fail.
- if (SelectAddressRegReg(N, Disp, Base, DAG))
- return false;
-
- if (N.getOpcode() == ISD::ADD) {
- short imm = 0;
- if (isIntS16Immediate(N.getOperand(1), imm) && (imm & 3) == 0) {
- Disp = DAG.getTargetConstant(((int)imm & 0xFFFF) >> 2, MVT::i32);
- if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
- Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
- } else {
- Base = N.getOperand(0);
- }
- return true; // [r+i]
- } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
- // Match LOAD (ADD (X, Lo(G))).
- assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
- && "Cannot handle constant offsets yet!");
- Disp = N.getOperand(1).getOperand(0); // The global address.
- assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
- Disp.getOpcode() == ISD::TargetConstantPool ||
- Disp.getOpcode() == ISD::TargetJumpTable);
- Base = N.getOperand(0);
- return true; // [&g+r]
- }
- } else if (N.getOpcode() == ISD::OR) {
- short imm = 0;
- if (isIntS16Immediate(N.getOperand(1), imm) && (imm & 3) == 0) {
- // If this is an or of disjoint bitfields, we can codegen this as an add
- // (for better address arithmetic) if the LHS and RHS of the OR are
- // provably disjoint.
- APInt LHSKnownZero, LHSKnownOne;
- DAG.ComputeMaskedBits(N.getOperand(0), LHSKnownZero, LHSKnownOne);
- if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
- // If all of the bits are known zero on the LHS or RHS, the add won't
- // carry.
- Base = N.getOperand(0);
- Disp = DAG.getTargetConstant(((int)imm & 0xFFFF) >> 2, MVT::i32);
- return true;
- }
- }
- } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
- // Loading from a constant address. Verify low two bits are clear.
- if ((CN->getZExtValue() & 3) == 0) {
- // If this address fits entirely in a 14-bit sext immediate field, codegen
- // this as "d, 0"
- short Imm;
- if (isIntS16Immediate(CN, Imm)) {
- Disp = DAG.getTargetConstant((unsigned short)Imm >> 2, getPointerTy());
- Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
- CN->getValueType(0));
- return true;
- }
-
- // Fold the low-part of 32-bit absolute addresses into addr mode.
- if (CN->getValueType(0) == MVT::i32 ||
- (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) {
- int Addr = (int)CN->getZExtValue();
-
- // Otherwise, break this down into an LIS + disp.
- Disp = DAG.getTargetConstant((short)Addr >> 2, MVT::i32);
- Base = DAG.getTargetConstant((Addr-(signed short)Addr) >> 16, MVT::i32);
- unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
- Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base),0);
- return true;
- }
- }
- }
-
- Disp = DAG.getTargetConstant(0, getPointerTy());
- if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N))
- Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
- else
- Base = N;
- return true; // [r+0]
-}
-
-
/// getPreIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if the node's address
/// can be legally represented as pre-indexed load / store address.
@@ -1288,18 +1202,16 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
return true;
}
- // LDU/STU use reg+imm*4, others use reg+imm.
+ // LDU/STU can only handle immediates that are a multiple of 4.
if (VT != MVT::i64) {
- // reg + imm
- if (!SelectAddressRegImm(Ptr, Offset, Base, DAG))
+ if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, false))
return false;
} else {
// LDU/STU need an address with at least 4-byte alignment.
if (Alignment < 4)
return false;
- // reg + imm * 4.
- if (!SelectAddressRegImmShift(Ptr, Offset, Base, DAG))
+ if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, true))
return false;
}
@@ -1355,7 +1267,7 @@ static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC,
SelectionDAG &DAG) {
EVT PtrVT = HiPart.getValueType();
SDValue Zero = DAG.getConstant(0, PtrVT);
- DebugLoc DL = HiPart.getDebugLoc();
+ SDLoc DL(HiPart);
SDValue Hi = DAG.getNode(PPCISD::Hi, DL, PtrVT, HiPart, Zero);
SDValue Lo = DAG.getNode(PPCISD::Lo, DL, PtrVT, LoPart, Zero);
@@ -1380,7 +1292,7 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
// The actual address of the GlobalValue is stored in the TOC.
if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) {
SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0);
- return DAG.getNode(PPCISD::TOC_ENTRY, CP->getDebugLoc(), MVT::i64, GA,
+ return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(CP), MVT::i64, GA,
DAG.getRegister(PPC::X2, MVT::i64));
}
@@ -1401,7 +1313,7 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
// The actual address of the GlobalValue is stored in the TOC.
if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) {
SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
- return DAG.getNode(PPCISD::TOC_ENTRY, JT->getDebugLoc(), MVT::i64, GA,
+ return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(JT), MVT::i64, GA,
DAG.getRegister(PPC::X2, MVT::i64));
}
@@ -1429,7 +1341,7 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
- DebugLoc dl = GA->getDebugLoc();
+ SDLoc dl(GA);
const GlobalValue *GV = GA->getGlobal();
EVT PtrVT = getPointerTy();
bool is64bit = PPCSubTarget.isPPC64();
@@ -1515,7 +1427,7 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
EVT PtrVT = Op.getValueType();
GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
- DebugLoc DL = GSDN->getDebugLoc();
+ SDLoc DL(GSDN);
const GlobalValue *GV = GSDN->getGlobal();
// 64-bit SVR4 ABI code is always position-independent.
@@ -1546,7 +1458,7 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
// If we're comparing for equality to zero, expose the fact that this is
// implemented as a ctlz/srl pair on ppc, so that the dag combiner can
@@ -1595,7 +1507,7 @@ SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG,
SDValue InChain = Node->getOperand(0);
SDValue VAListPtr = Node->getOperand(1);
const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
- DebugLoc dl = Node->getDebugLoc();
+ SDLoc dl(Node);
assert(!Subtarget.isPPC64() && "LowerVAARG is PPC32 only");
@@ -1706,7 +1618,7 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
SDValue Trmp = Op.getOperand(1); // trampoline
SDValue FPtr = Op.getOperand(2); // nested function
SDValue Nest = Op.getOperand(3); // 'nest' parameter value
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
bool isPPC64 = (PtrVT == MVT::i64);
@@ -1748,7 +1660,7 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG,
MachineFunction &MF = DAG.getMachineFunction();
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
if (Subtarget.isDarwinABI() || Subtarget.isPPC64()) {
// vastart just stores the address of the VarArgsFrameIndex slot into the
@@ -1842,18 +1754,18 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG,
#include "PPCGenCallingConv.inc"
-static bool CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State) {
+bool llvm::CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
return true;
}
-static bool CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
- MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State) {
+bool llvm::CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
+ MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
static const uint16_t ArgRegs[] = {
PPC::R3, PPC::R4, PPC::R5, PPC::R6,
PPC::R7, PPC::R8, PPC::R9, PPC::R10,
@@ -1876,11 +1788,11 @@ static bool CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
return false;
}
-static bool CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
- MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State) {
+bool llvm::CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
+ MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
static const uint16_t ArgRegs[] = {
PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
PPC::F8
@@ -1931,7 +1843,7 @@ PPCTargetLowering::LowerFormalArguments(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg>
&Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals)
const {
if (PPCSubTarget.isSVR4ABI()) {
@@ -1953,7 +1865,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg>
&Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
// 32-bit SVR4 ABI Stack Frame Layout:
@@ -2170,14 +2082,14 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
SDValue
PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT,
SelectionDAG &DAG, SDValue ArgVal,
- DebugLoc dl) const {
+ SDLoc dl) const {
if (Flags.isSExt())
ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal,
DAG.getValueType(ObjectVT));
else if (Flags.isZExt())
ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal,
DAG.getValueType(ObjectVT));
-
+
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal);
}
@@ -2213,7 +2125,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg>
&Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
// TODO: add description of PPC stack frame format, or at least some docs.
//
@@ -2502,7 +2414,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg>
&Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
// TODO: add description of PPC stack frame format, or at least some docs.
//
@@ -2600,17 +2512,17 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
SmallVector<SDValue, 8> MemOps;
unsigned nAltivecParamsAtEnd = 0;
- // FIXME: FuncArg and Ins[ArgNo] must reference the same argument.
- // When passing anonymous aggregates, this is currently not true.
- // See LowerFormalArguments_64SVR4 for a fix.
Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin();
- for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo, ++FuncArg) {
+ unsigned CurArgIdx = 0;
+ for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
SDValue ArgVal;
bool needsLoad = false;
EVT ObjectVT = Ins[ArgNo].VT;
unsigned ObjSize = ObjectVT.getSizeInBits()/8;
unsigned ArgSize = ObjSize;
ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
+ std::advance(FuncArg, Ins[ArgNo].OrigArgIndex - CurArgIdx);
+ CurArgIdx = Ins[ArgNo].OrigArgIndex;
unsigned CurArgOffset = ArgOffset;
@@ -3004,7 +2916,7 @@ StoreTailCallArgumentsToStackSlot(SelectionDAG &DAG,
SDValue Chain,
const SmallVector<TailCallArgumentInfo, 8> &TailCallArgs,
SmallVector<SDValue, 8> &MemOpChains,
- DebugLoc dl) {
+ SDLoc dl) {
for (unsigned i = 0, e = TailCallArgs.size(); i != e; ++i) {
SDValue Arg = TailCallArgs[i].Arg;
SDValue FIN = TailCallArgs[i].FrameIdxOp;
@@ -3026,7 +2938,7 @@ static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG,
int SPDiff,
bool isPPC64,
bool isDarwinABI,
- DebugLoc dl) {
+ SDLoc dl) {
if (SPDiff) {
// Calculate the new stack slot for the return address.
int SlotSize = isPPC64 ? 8 : 4;
@@ -3083,7 +2995,7 @@ SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG,
SDValue &LROpOut,
SDValue &FPOpOut,
bool isDarwinABI,
- DebugLoc dl) const {
+ SDLoc dl) const {
if (SPDiff) {
// Load the LR and FP stack slot for later adjusting.
EVT VT = PPCSubTarget.isPPC64() ? MVT::i64 : MVT::i32;
@@ -3113,7 +3025,7 @@ SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG,
static SDValue
CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
- DebugLoc dl) {
+ SDLoc dl) {
SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
false, false, MachinePointerInfo(0),
@@ -3128,7 +3040,7 @@ LowerMemOpCallTo(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain,
unsigned ArgOffset, bool isPPC64, bool isTailCall,
bool isVector, SmallVector<SDValue, 8> &MemOpChains,
SmallVector<TailCallArgumentInfo, 8> &TailCallArguments,
- DebugLoc dl) {
+ SDLoc dl) {
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
if (!isTailCall) {
if (isVector) {
@@ -3149,7 +3061,7 @@ LowerMemOpCallTo(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain,
static
void PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain,
- DebugLoc dl, bool isPPC64, int SPDiff, unsigned NumBytes,
+ SDLoc dl, bool isPPC64, int SPDiff, unsigned NumBytes,
SDValue LROp, SDValue FPOp, bool isDarwinABI,
SmallVector<TailCallArgumentInfo, 8> &TailCallArguments) {
MachineFunction &MF = DAG.getMachineFunction();
@@ -3171,13 +3083,13 @@ void PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain,
// Emit callseq_end just before tailcall node.
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
- DAG.getIntPtrConstant(0, true), InFlag);
+ DAG.getIntPtrConstant(0, true), InFlag, dl);
InFlag = Chain.getValue(1);
}
static
unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
- SDValue &Chain, DebugLoc dl, int SPDiff, bool isTailCall,
+ SDValue &Chain, SDLoc dl, int SPDiff, bool isTailCall,
SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass,
SmallVector<SDValue, 8> &Ops, std::vector<EVT> &NodeTys,
const PPCSubtarget &PPCSubTarget) {
@@ -3363,7 +3275,7 @@ SDValue
PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
SmallVector<CCValAssign, 16> RVLocs;
@@ -3406,7 +3318,7 @@ PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
}
SDValue
-PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl,
+PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
bool isTailCall, bool isVarArg,
SelectionDAG &DAG,
SmallVector<std::pair<unsigned, SDValue>, 8>
@@ -3493,7 +3405,7 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl,
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
DAG.getIntPtrConstant(BytesCalleePops, true),
- InFlag);
+ InFlag, dl);
if (!Ins.empty())
InFlag = Chain.getValue(1);
@@ -3505,7 +3417,7 @@ SDValue
PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
- DebugLoc &dl = CLI.DL;
+ SDLoc &dl = CLI.DL;
SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
@@ -3542,7 +3454,7 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
// See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
// of the 32-bit SVR4 ABI stack frame layout.
@@ -3628,7 +3540,8 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
+ dl);
SDValue CallSeqStart = Chain;
// Load the return address and frame pointer so it can be moved somewhere else
@@ -3679,7 +3592,8 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
// This must go outside the CALLSEQ_START..END.
SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
- CallSeqStart.getNode()->getOperand(1));
+ CallSeqStart.getNode()->getOperand(1),
+ SDLoc(MemcpyCall));
DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
NewCallSeqStart.getNode());
Chain = CallSeqStart = NewCallSeqStart;
@@ -3755,13 +3669,14 @@ PPCTargetLowering::createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff,
SDValue CallSeqStart,
ISD::ArgFlagsTy Flags,
SelectionDAG &DAG,
- DebugLoc dl) const {
+ SDLoc dl) const {
SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff,
CallSeqStart.getNode()->getOperand(0),
Flags, DAG, dl);
// The MEMCPY must go outside the CALLSEQ_START..END.
SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
- CallSeqStart.getNode()->getOperand(1));
+ CallSeqStart.getNode()->getOperand(1),
+ SDLoc(MemcpyCall));
DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
NewCallSeqStart.getNode());
return NewCallSeqStart;
@@ -3774,7 +3689,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
unsigned NumOps = Outs.size();
@@ -3815,7 +3730,8 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
+ dl);
SDValue CallSeqStart = Chain;
// Load the return address and frame pointer so it can be moved somewhere else
@@ -3940,7 +3856,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
// register.
// FIXME: The memcpy seems to produce pretty awful code for
// small aggregates, particularly for packed ones.
- // FIXME: It would be preferable to use the slot in the
+ // FIXME: It would be preferable to use the slot in the
// parameter save area instead of a new local variable.
SDValue Const = DAG.getConstant(8 - Size, PtrOff.getValueType());
SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
@@ -4145,7 +4061,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
unsigned NumOps = Outs.size();
@@ -4186,7 +4102,8 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
+ dl);
SDValue CallSeqStart = Chain;
// Load the return address and frame pointer so it can be moved somewhere else
@@ -4502,7 +4419,7 @@ PPCTargetLowering::LowerReturn(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc dl, SelectionDAG &DAG) const {
+ SDLoc dl, SelectionDAG &DAG) const {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
@@ -4551,7 +4468,7 @@ PPCTargetLowering::LowerReturn(SDValue Chain,
SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG,
const PPCSubtarget &Subtarget) const {
// When we pop the dynamic allocation we need to restore the SP link.
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
// Get the correct type for pointers.
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
@@ -4636,7 +4553,7 @@ SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
// Get the inputs.
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
// Get the correct type for pointers.
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
@@ -4653,7 +4570,7 @@ SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
SelectionDAG &DAG) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
return DAG.getNode(PPCISD::EH_SJLJ_SETJMP, DL,
DAG.getVTList(MVT::i32, MVT::Other),
Op.getOperand(0), Op.getOperand(1));
@@ -4661,7 +4578,7 @@ SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
SelectionDAG &DAG) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
return DAG.getNode(PPCISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
Op.getOperand(0), Op.getOperand(1));
}
@@ -4687,7 +4604,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
EVT CmpVT = Op.getOperand(0).getValueType();
SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
SDValue TV = Op.getOperand(2), FV = Op.getOperand(3);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
// If the RHS of the comparison is a 0.0, we don't need to do the
// subtraction at all.
@@ -4768,7 +4685,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
// FIXME: Split this code up when LegalizeDAGTypes lands.
SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
- DebugLoc dl) const {
+ SDLoc dl) const {
assert(Op.getOperand(0).getValueType().isFloatingPoint());
SDValue Src = Op.getOperand(0);
if (Src.getValueType() == MVT::f32)
@@ -4827,7 +4744,7 @@ SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
// Don't handle ppc_fp128 here; let it be lowered to a libcall.
if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
return SDValue();
@@ -4961,7 +4878,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
SelectionDAG &DAG) const {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
/*
The rounding mode is in bits 30:31 of FPSR, and has the following
settings:
@@ -5027,7 +4944,7 @@ SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
unsigned BitWidth = VT.getSizeInBits();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
assert(Op.getNumOperands() == 3 &&
VT == Op.getOperand(1).getValueType() &&
"Unexpected SHL!");
@@ -5055,7 +4972,7 @@ SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const {
SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
unsigned BitWidth = VT.getSizeInBits();
assert(Op.getNumOperands() == 3 &&
VT == Op.getOperand(1).getValueType() &&
@@ -5083,7 +5000,7 @@ SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const {
}
SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
EVT VT = Op.getValueType();
unsigned BitWidth = VT.getSizeInBits();
assert(Op.getNumOperands() == 3 &&
@@ -5118,7 +5035,7 @@ SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {
/// BuildSplatI - Build a canonical splati of Val with an element size of
/// SplatSize. Cast the result to VT.
static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT,
- SelectionDAG &DAG, DebugLoc dl) {
+ SelectionDAG &DAG, SDLoc dl) {
assert(Val >= -16 && Val <= 15 && "vsplti is out of range!");
static const EVT VTys[] = { // canonical VT to use for each size.
@@ -5142,10 +5059,20 @@ static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT,
return DAG.getNode(ISD::BITCAST, dl, ReqVT, Res);
}
+/// BuildIntrinsicOp - Return a unary operator intrinsic node with the
+/// specified intrinsic ID.
+static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op,
+ SelectionDAG &DAG, SDLoc dl,
+ EVT DestVT = MVT::Other) {
+ if (DestVT == MVT::Other) DestVT = Op.getValueType();
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
+ DAG.getConstant(IID, MVT::i32), Op);
+}
+
/// BuildIntrinsicOp - Return a binary operator intrinsic node with the
/// specified intrinsic ID.
static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS,
- SelectionDAG &DAG, DebugLoc dl,
+ SelectionDAG &DAG, SDLoc dl,
EVT DestVT = MVT::Other) {
if (DestVT == MVT::Other) DestVT = LHS.getValueType();
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
@@ -5156,7 +5083,7 @@ static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS,
/// specified intrinsic ID.
static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1,
SDValue Op2, SelectionDAG &DAG,
- DebugLoc dl, EVT DestVT = MVT::Other) {
+ SDLoc dl, EVT DestVT = MVT::Other) {
if (DestVT == MVT::Other) DestVT = Op0.getValueType();
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
DAG.getConstant(IID, MVT::i32), Op0, Op1, Op2);
@@ -5166,7 +5093,7 @@ static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1,
/// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified
/// amount. The result has the specified value type.
static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt,
- EVT VT, SelectionDAG &DAG, DebugLoc dl) {
+ EVT VT, SelectionDAG &DAG, SDLoc dl) {
// Force LHS/RHS to be the right type.
LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, LHS);
RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, RHS);
@@ -5185,7 +5112,7 @@ static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt,
// sequence of ops that should be used.
SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
assert(BVN != 0 && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");
@@ -5341,7 +5268,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
/// the specified operations to build the shuffle.
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
SDValue RHS, SelectionDAG &DAG,
- DebugLoc dl) {
+ SDLoc dl) {
unsigned OpNum = (PFEntry >> 26) & 0x0F;
unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
@@ -5420,7 +5347,7 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
/// lowered into a vperm.
SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SelectionDAG &DAG) const {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
@@ -5587,7 +5514,7 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
// If this is a lowered altivec predicate compare, CompareOpc is set to the
// opcode number of the comparison.
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
int CompareOpc;
bool isDot;
if (!getAltivecCompareInfo(Op, CompareOpc, isDot))
@@ -5651,7 +5578,7 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
// Create a stack slot that is 16-byte aligned.
MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
int FrameIdx = FrameInfo->CreateStackObject(16, 16, false);
@@ -5668,7 +5595,7 @@ SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
}
SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
if (Op.getValueType() == MVT::v4i32) {
SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
@@ -5755,7 +5682,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
case ISD::FP_TO_UINT:
case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG,
- Op.getDebugLoc());
+ SDLoc(Op));
case ISD::UINT_TO_FP:
case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
@@ -5772,6 +5699,9 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
case ISD::MUL: return LowerMUL(Op, DAG);
+ // For counter-based loop handling.
+ case ISD::INTRINSIC_W_CHAIN: return SDValue();
+
// Frame & Return address.
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
@@ -5782,10 +5712,26 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG) const {
const TargetMachine &TM = getTargetMachine();
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
switch (N->getOpcode()) {
default:
llvm_unreachable("Do not know how to custom type legalize this operation!");
+ case ISD::INTRINSIC_W_CHAIN: {
+ if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() !=
+ Intrinsic::ppc_is_decremented_ctr_nonzero)
+ break;
+
+ assert(N->getValueType(0) == MVT::i1 &&
+ "Unexpected result type for CTR decrement intrinsic");
+ EVT SVT = getSetCCResultType(*DAG.getContext(), N->getValueType(0));
+ SDVTList VTs = DAG.getVTList(SVT, MVT::Other);
+ SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0),
+ N->getOperand(1));
+
+ Results.push_back(NewInt);
+ Results.push_back(NewInt.getValue(1));
+ break;
+ }
case ISD::VAARG: {
if (!TM.getSubtarget<PPCSubtarget>().isSVR4ABI()
|| TM.getSubtarget<PPCSubtarget>().isPPC64())
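The intrinsic being legalized here, ppc_is_decremented_ctr_nonzero, exists to express hardware counted loops. As a rough illustration (not part of the patch), a loop of the following shape is the kind of code that can use the CTR register, assuming the separate CTR-loops pass chooses to convert it:

#include <cstddef>
#include <cstdio>

// A counted loop like this is what the CTR lowering is aimed at: the trip
// count can be moved into CTR (mtctr) and the backedge becomes a single bdnz,
// so no general-purpose register is decremented and compared each iteration.
// Whether that actually happens is decided elsewhere; this is only a sketch.
static float sumArray(const float *Data, std::size_t N) {
  float Sum = 0.0f;
  for (std::size_t I = 0; I != N; ++I)
    Sum += Data[I];
  return Sum;
}

int main() {
  const float Vals[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  std::printf("%f\n", sumArray(Vals, 4));
  return 0;
}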
@@ -6101,7 +6047,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
if (PPCSubTarget.isPPC64() && PPCSubTarget.isSVR4ABI()) {
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD))
.addReg(PPC::X2)
- .addImm(TOCOffset / 4)
+ .addImm(TOCOffset)
.addReg(BufReg);
MIB.setMemRefs(MMOBegin, MMOEnd);
@@ -6109,7 +6055,9 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
// Setup
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB);
- MIB.addRegMask(PPCRegInfo->getNoPreservedMask());
+ const PPCRegisterInfo *TRI =
+ static_cast<const PPCRegisterInfo*>(getTargetMachine().getRegisterInfo());
+ MIB.addRegMask(TRI->getNoPreservedMask());
BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1);
@@ -6129,7 +6077,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
if (PPCSubTarget.isPPC64()) {
MIB = BuildMI(mainMBB, DL, TII->get(PPC::STD))
.addReg(LabelReg)
- .addImm(LabelOffset / 4)
+ .addImm(LabelOffset)
.addReg(BufReg);
} else {
MIB = BuildMI(mainMBB, DL, TII->get(PPC::STW))
@@ -6202,7 +6150,7 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
// Reload IP
if (PVT == MVT::i64) {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), Tmp)
- .addImm(LabelOffset / 4)
+ .addImm(LabelOffset)
.addReg(BufReg);
} else {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), Tmp)
@@ -6214,7 +6162,7 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
// Reload SP
if (PVT == MVT::i64) {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), SP)
- .addImm(SPOffset / 4)
+ .addImm(SPOffset)
.addReg(BufReg);
} else {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), SP)
@@ -6229,7 +6177,7 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
// Reload TOC
if (PVT == MVT::i64 && PPCSubTarget.isSVR4ABI()) {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2)
- .addImm(TOCOffset / 4)
+ .addImm(TOCOffset)
.addReg(BufReg);
MIB.setMemRefs(MMOBegin, MMOEnd);
@@ -6272,8 +6220,10 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
Cond.push_back(MI->getOperand(1));
DebugLoc dl = MI->getDebugLoc();
- PPCII->insertSelect(*BB, MI, dl, MI->getOperand(0).getReg(), Cond,
- MI->getOperand(2).getReg(), MI->getOperand(3).getReg());
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ TII->insertSelect(*BB, MI, dl, MI->getOperand(0).getReg(),
+ Cond, MI->getOperand(2).getReg(),
+ MI->getOperand(3).getReg());
} else if (MI->getOpcode() == PPC::SELECT_CC_I4 ||
MI->getOpcode() == PPC::SELECT_CC_I8 ||
MI->getOpcode() == PPC::SELECT_CC_F4 ||
@@ -6717,7 +6667,7 @@ SDValue PPCTargetLowering::DAGCombineFastRecip(SDValue Op,
++Iterations;
SelectionDAG &DAG = DCI.DAG;
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue FPOne =
DAG.getConstantFP(1.0, VT.getScalarType());
@@ -6779,7 +6729,7 @@ SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDValue Op,
++Iterations;
SelectionDAG &DAG = DCI.DAG;
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue FPThreeHalves =
DAG.getConstantFP(1.5, VT.getScalarType());
@@ -6823,11 +6773,120 @@ SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDValue Op,
return SDValue();
}
+// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does
+// not enforce equality of the chain operands.
+static bool isConsecutiveLS(LSBaseSDNode *LS, LSBaseSDNode *Base,
+ unsigned Bytes, int Dist,
+ SelectionDAG &DAG) {
+ EVT VT = LS->getMemoryVT();
+ if (VT.getSizeInBits() / 8 != Bytes)
+ return false;
+
+ SDValue Loc = LS->getBasePtr();
+ SDValue BaseLoc = Base->getBasePtr();
+ if (Loc.getOpcode() == ISD::FrameIndex) {
+ if (BaseLoc.getOpcode() != ISD::FrameIndex)
+ return false;
+ const MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ int FI = cast<FrameIndexSDNode>(Loc)->getIndex();
+ int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex();
+ int FS = MFI->getObjectSize(FI);
+ int BFS = MFI->getObjectSize(BFI);
+ if (FS != BFS || FS != (int)Bytes) return false;
+ return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes);
+ }
+
+ // Handle X+C
+ if (DAG.isBaseWithConstantOffset(Loc) && Loc.getOperand(0) == BaseLoc &&
+ cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue() == Dist*Bytes)
+ return true;
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ const GlobalValue *GV1 = NULL;
+ const GlobalValue *GV2 = NULL;
+ int64_t Offset1 = 0;
+ int64_t Offset2 = 0;
+ bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1);
+ bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);
+ if (isGA1 && isGA2 && GV1 == GV2)
+ return Offset1 == (Offset2 + Dist*Bytes);
+ return false;
+}
+
+// Return true if there is a nearby consecutive load to the one provided
+// (regardless of alignment). We search up and down the chain, looking through
+// token factors and other loads (but nothing else). As a result, a true
+// result indicates that it is safe to create a new consecutive load adjacent
+// to the load provided.
+static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
+ SDValue Chain = LD->getChain();
+ EVT VT = LD->getMemoryVT();
+
+ SmallSet<SDNode *, 16> LoadRoots;
+ SmallVector<SDNode *, 8> Queue(1, Chain.getNode());
+ SmallSet<SDNode *, 16> Visited;
+
+ // First, search up the chain, branching to follow all token-factor operands.
+ // If we find a consecutive load, then we're done, otherwise, record all
+ // nodes just above the top-level loads and token factors.
+ while (!Queue.empty()) {
+ SDNode *ChainNext = Queue.pop_back_val();
+ if (!Visited.insert(ChainNext))
+ continue;
+
+ if (LoadSDNode *ChainLD = dyn_cast<LoadSDNode>(ChainNext)) {
+ if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
+ return true;
+
+ if (!Visited.count(ChainLD->getChain().getNode()))
+ Queue.push_back(ChainLD->getChain().getNode());
+ } else if (ChainNext->getOpcode() == ISD::TokenFactor) {
+ for (SDNode::op_iterator O = ChainNext->op_begin(),
+ OE = ChainNext->op_end(); O != OE; ++O)
+ if (!Visited.count(O->getNode()))
+ Queue.push_back(O->getNode());
+ } else
+ LoadRoots.insert(ChainNext);
+ }
+
+ // Second, search down the chain, starting from the top-level nodes recorded
+ // in the first phase. These top-level nodes are the nodes just above all
+  // loads and token factors. Starting with their uses, recursively look through
+ // all loads (just the chain uses) and token factors to find a consecutive
+ // load.
+ Visited.clear();
+ Queue.clear();
+
+ for (SmallSet<SDNode *, 16>::iterator I = LoadRoots.begin(),
+ IE = LoadRoots.end(); I != IE; ++I) {
+ Queue.push_back(*I);
+
+ while (!Queue.empty()) {
+ SDNode *LoadRoot = Queue.pop_back_val();
+ if (!Visited.insert(LoadRoot))
+ continue;
+
+ if (LoadSDNode *ChainLD = dyn_cast<LoadSDNode>(LoadRoot))
+ if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
+ return true;
+
+ for (SDNode::use_iterator UI = LoadRoot->use_begin(),
+ UE = LoadRoot->use_end(); UI != UE; ++UI)
+ if (((isa<LoadSDNode>(*UI) &&
+ cast<LoadSDNode>(*UI)->getChain().getNode() == LoadRoot) ||
+ UI->getOpcode() == ISD::TokenFactor) && !Visited.count(*UI))
+ Queue.push_back(*UI);
+ }
+ }
+
+ return false;
+}
+
SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
const TargetMachine &TM = getTargetMachine();
SelectionDAG &DAG = DCI.DAG;
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
switch (N->getOpcode()) {
default: break;
case PPCISD::SHL:
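The frame-index, base-plus-constant and global-plus-offset cases in the new isConsecutiveLS all reduce to the same byte arithmetic. A tiny standalone model of that check (the MemAccess struct is invented for this sketch, not an LLVM type):

#include <cassert>
#include <cstdint>

// Two accesses of 'Bytes' bytes each are consecutive with distance 'Dist'
// when the second starts exactly Dist*Bytes past the first. The real code
// compares frame indices, base+constant nodes, or global+offset pairs; here
// each access is just modelled as (base id, byte offset).
struct MemAccess {
  int BaseId;      // stands in for a frame index / base pointer / global
  int64_t Offset;  // byte offset from that base
};

static bool isConsecutive(const MemAccess &A, const MemAccess &Base,
                          unsigned Bytes, int Dist) {
  return A.BaseId == Base.BaseId &&
         A.Offset == Base.Offset + (int64_t)Dist * Bytes;
}

int main() {
  MemAccess First = {/*BaseId=*/0, /*Offset=*/0};
  MemAccess Second = {/*BaseId=*/0, /*Offset=*/16};
  assert(isConsecutive(Second, First, /*Bytes=*/16, /*Dist=*/1));
  assert(!isConsecutive(Second, First, /*Bytes=*/16, /*Dist=*/2));
  return 0;
}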
@@ -6868,7 +6927,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
DCI);
if (RV.getNode() != 0) {
DCI.AddToWorklist(RV.getNode());
- RV = DAG.getNode(ISD::FP_EXTEND, N->getOperand(1).getDebugLoc(),
+ RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N->getOperand(1)),
N->getValueType(0), RV);
DCI.AddToWorklist(RV.getNode());
return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
@@ -6881,7 +6940,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
DCI);
if (RV.getNode() != 0) {
DCI.AddToWorklist(RV.getNode());
- RV = DAG.getNode(ISD::FP_ROUND, N->getOperand(1).getDebugLoc(),
+ RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N->getOperand(1)),
N->getValueType(0), RV,
N->getOperand(1).getOperand(1));
DCI.AddToWorklist(RV.getNode());
@@ -6999,6 +7058,157 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
cast<StoreSDNode>(N)->getMemOperand());
}
break;
+ case ISD::LOAD: {
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ EVT VT = LD->getValueType(0);
+ Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
+ unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);
+ if (ISD::isNON_EXTLoad(N) && VT.isVector() &&
+ TM.getSubtarget<PPCSubtarget>().hasAltivec() &&
+ DCI.getDAGCombineLevel() == AfterLegalizeTypes &&
+ LD->getAlignment() < ABIAlignment) {
+ // This is a type-legal unaligned Altivec load.
+ SDValue Chain = LD->getChain();
+ SDValue Ptr = LD->getBasePtr();
+
+ // This implements the loading of unaligned vectors as described in
+ // the venerable Apple Velocity Engine overview. Specifically:
+ // https://developer.apple.com/hardwaredrivers/ve/alignment.html
+ // https://developer.apple.com/hardwaredrivers/ve/code_optimization.html
+ //
+ // The general idea is to expand a sequence of one or more unaligned
+      // loads into an alignment-based permutation-control instruction (lvsl),
+ // a series of regular vector loads (which always truncate their
+ // input address to an aligned address), and a series of permutations.
+ // The results of these permutations are the requested loaded values.
+ // The trick is that the last "extra" load is not taken from the address
+ // you might suspect (sizeof(vector) bytes after the last requested
+ // load), but rather sizeof(vector) - 1 bytes after the last
+ // requested vector. The point of this is to avoid a page fault if the
+      // base address happened to be aligned. This works because if the base
+ // address is aligned, then adding less than a full vector length will
+ // cause the last vector in the sequence to be (re)loaded. Otherwise,
+ // the next vector will be fetched as you might suspect was necessary.
+
+ // We might be able to reuse the permutation generation from
+ // a different base address offset from this one by an aligned amount.
+ // The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this
+ // optimization later.
+ SDValue PermCntl = BuildIntrinsicOp(Intrinsic::ppc_altivec_lvsl, Ptr,
+ DAG, dl, MVT::v16i8);
+
+ // Refine the alignment of the original load (a "new" load created here
+ // which was identical to the first except for the alignment would be
+ // merged with the existing node regardless).
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(LD->getPointerInfo(),
+ LD->getMemOperand()->getFlags(),
+ LD->getMemoryVT().getStoreSize(),
+ ABIAlignment);
+ LD->refineAlignment(MMO);
+ SDValue BaseLoad = SDValue(LD, 0);
+
+ // Note that the value of IncOffset (which is provided to the next
+ // load's pointer info offset value, and thus used to calculate the
+ // alignment), and the value of IncValue (which is actually used to
+ // increment the pointer value) are different! This is because we
+ // require the next load to appear to be aligned, even though it
+ // is actually offset from the base pointer by a lesser amount.
+ int IncOffset = VT.getSizeInBits() / 8;
+ int IncValue = IncOffset;
+
+ // Walk (both up and down) the chain looking for another load at the real
+ // (aligned) offset (the alignment of the other load does not matter in
+ // this case). If found, then do not use the offset reduction trick, as
+ // that will prevent the loads from being later combined (as they would
+ // otherwise be duplicates).
+ if (!findConsecutiveLoad(LD, DAG))
+ --IncValue;
+
+ SDValue Increment = DAG.getConstant(IncValue, getPointerTy());
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+
+ SDValue ExtraLoad =
+ DAG.getLoad(VT, dl, Chain, Ptr,
+ LD->getPointerInfo().getWithOffset(IncOffset),
+ LD->isVolatile(), LD->isNonTemporal(),
+ LD->isInvariant(), ABIAlignment);
+
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ BaseLoad.getValue(1), ExtraLoad.getValue(1));
+
+ if (BaseLoad.getValueType() != MVT::v4i32)
+ BaseLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, BaseLoad);
+
+ if (ExtraLoad.getValueType() != MVT::v4i32)
+ ExtraLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ExtraLoad);
+
+ SDValue Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm,
+ BaseLoad, ExtraLoad, PermCntl, DAG, dl);
+
+ if (VT != MVT::v4i32)
+ Perm = DAG.getNode(ISD::BITCAST, dl, VT, Perm);
+
+ // Now we need to be really careful about how we update the users of the
+ // original load. We cannot just call DCI.CombineTo (or
+ // DAG.ReplaceAllUsesWith for that matter), because the load still has
+ // uses created here (the permutation for example) that need to stay.
+ SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+ while (UI != UE) {
+ SDUse &Use = UI.getUse();
+ SDNode *User = *UI;
+ // Note: BaseLoad is checked here because it might not be N, but a
+ // bitcast of N.
+ if (User == Perm.getNode() || User == BaseLoad.getNode() ||
+ User == TF.getNode() || Use.getResNo() > 1) {
+ ++UI;
+ continue;
+ }
+
+ SDValue To = Use.getResNo() ? TF : Perm;
+ ++UI;
+
+ SmallVector<SDValue, 8> Ops;
+ for (SDNode::op_iterator O = User->op_begin(),
+ OE = User->op_end(); O != OE; ++O) {
+ if (*O == Use)
+ Ops.push_back(To);
+ else
+ Ops.push_back(*O);
+ }
+
+ DAG.UpdateNodeOperands(User, Ops.data(), Ops.size());
+ }
+
+ return SDValue(N, 0);
+ }
+ }
+ break;
+ case ISD::INTRINSIC_WO_CHAIN:
+ if (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue() ==
+ Intrinsic::ppc_altivec_lvsl &&
+ N->getOperand(1)->getOpcode() == ISD::ADD) {
+ SDValue Add = N->getOperand(1);
+
+ if (DAG.MaskedValueIsZero(Add->getOperand(1),
+ APInt::getAllOnesValue(4 /* 16 byte alignment */).zext(
+ Add.getValueType().getScalarType().getSizeInBits()))) {
+ SDNode *BasePtr = Add->getOperand(0).getNode();
+ for (SDNode::use_iterator UI = BasePtr->use_begin(),
+ UE = BasePtr->use_end(); UI != UE; ++UI) {
+ if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+ cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() ==
+ Intrinsic::ppc_altivec_lvsl) {
+              // We've found another LVSL, and this address is offset from
+              // that one by an aligned (multiple-of-16) amount. The results
+              // will be the same, so use the one we've just found instead.
+
+ return SDValue(*UI, 0);
+ }
+ }
+ }
+ }
case ISD::BSWAP:
// Turn BSWAP (LOAD) -> lhbrx/lwbrx.
if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
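For comparison, the sequence that the unaligned AltiVec load combine above builds at the DAG level corresponds to the classic source-level recipe below. This sketch assumes a PowerPC compiler with AltiVec enabled (-maltivec) and <altivec.h>; it illustrates the technique and is not code from the patch:

#include <altivec.h>

// Classic AltiVec misaligned load: two aligned loads that bracket the data,
// an lvsl-generated permute mask, and a vperm to extract the 16 wanted bytes.
// The second load uses offset 15 (sizeof(vector) - 1), the same trick the
// comment above describes, so an already-aligned pointer never touches the
// next 16-byte block unnecessarily.
vector unsigned char loadUnaligned(const unsigned char *P) {
  vector unsigned char MSQ  = vec_ld(0, P);    // aligned block containing P
  vector unsigned char LSQ  = vec_ld(15, P);   // following block, if misaligned
  vector unsigned char Mask = vec_lvsl(0, P);  // alignment permute control
  return vec_perm(MSQ, LSQ, Mask);             // realign into one vector
}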
@@ -7097,6 +7307,39 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
// compare down to code that is difficult to reassemble.
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
SDValue LHS = N->getOperand(2), RHS = N->getOperand(3);
+
+  // Sometimes the promoted value of the intrinsic is ANDed with some non-zero
+  // value. If so, look through the AND to get to the intrinsic.
+ if (LHS.getOpcode() == ISD::AND &&
+ LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN &&
+ cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() ==
+ Intrinsic::ppc_is_decremented_ctr_nonzero &&
+ isa<ConstantSDNode>(LHS.getOperand(1)) &&
+ !cast<ConstantSDNode>(LHS.getOperand(1))->getConstantIntValue()->
+ isZero())
+ LHS = LHS.getOperand(0);
+
+ if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
+ cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() ==
+ Intrinsic::ppc_is_decremented_ctr_nonzero &&
+ isa<ConstantSDNode>(RHS)) {
+ assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
+ "Counter decrement comparison is not EQ or NE");
+
+ unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue();
+ bool isBDNZ = (CC == ISD::SETEQ && Val) ||
+ (CC == ISD::SETNE && !Val);
+
+ // We now need to make the intrinsic dead (it cannot be instruction
+ // selected).
+ DAG.ReplaceAllUsesOfValueWith(LHS.getValue(1), LHS.getOperand(0));
+ assert(LHS.getNode()->hasOneUse() &&
+ "Counter decrement has more than one use");
+
+ return DAG.getNode(isBDNZ ? PPCISD::BDNZ : PPCISD::BDZ, dl, MVT::Other,
+ N->getOperand(0), N->getOperand(4));
+ }
+
int CompareOpc;
bool isDot;
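The bdnz/bdz choice made by the BR_CC combine above follows a small truth table. A standalone sketch of that selection (the enum and function names are invented for this example):

#include <cassert>

// Condition codes relevant to the CTR-branch combine.
enum CondCode { SETEQ, SETNE };

// The branch tests the i1 produced by ppc_is_decremented_ctr_nonzero against
// a constant Val. bdnz branches while the decremented CTR is still non-zero,
// bdz when it has reached zero, so:
//   (ctr-nonzero == non-zero const) or (ctr-nonzero != 0) -> bdnz
//   (ctr-nonzero == 0) or (ctr-nonzero != non-zero const) -> bdz
static bool selectsBDNZ(CondCode CC, unsigned Val) {
  return (CC == SETEQ && Val) || (CC == SETNE && !Val);
}

int main() {
  assert(selectsBDNZ(SETEQ, 1));  // "counter still non-zero" -> bdnz
  assert(!selectsBDNZ(SETEQ, 0)); // "counter is zero"        -> bdz
  assert(selectsBDNZ(SETNE, 0));
  assert(!selectsBDNZ(SETNE, 1));
  return 0;
}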
@@ -7406,25 +7649,13 @@ bool PPCTargetLowering::isLegalAddressingMode(const AddrMode &AM,
return true;
}
-/// isLegalAddressImmediate - Return true if the integer value can be used
-/// as the offset of the target addressing mode for load / store of the
-/// given type.
-bool PPCTargetLowering::isLegalAddressImmediate(int64_t V,Type *Ty) const{
- // PPC allows a sign-extended 16-bit immediate field.
- return (V > -(1 << 16) && V < (1 << 16)-1);
-}
-
-bool PPCTargetLowering::isLegalAddressImmediate(GlobalValue* GV) const {
- return false;
-}
-
SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
MFI->setReturnAddressIsTaken(true);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
// Make sure the function does not optimize away the store of the RA to
@@ -7454,7 +7685,7 @@ SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
SelectionDAG &DAG) const {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 423e983..e85f96c 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -20,6 +20,7 @@
#include "PPCRegisterInfo.h"
#include "PPCSubtarget.h"
#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/Target/TargetLowering.h"
namespace llvm {
@@ -146,6 +147,10 @@ namespace llvm {
/// an optional input flag argument.
COND_BRANCH,
+ /// CHAIN = BDNZ CHAIN, DESTBB - These are used to create counter-based
+ /// loops.
+ BDNZ, BDZ,
+
/// F8RC = FADDRTZ F8RC, F8RC - This is an FADD done with rounding
/// towards zero. Used only as part of the long double-to-int
/// conversion sequence.
@@ -175,61 +180,61 @@ namespace llvm {
/// G8RC = ADDIS_GOT_TPREL_HA %X2, Symbol - Used by the initial-exec
/// TLS model, produces an ADDIS8 instruction that adds the GOT
- /// base to sym@got@tprel@ha.
+ /// base to sym\@got\@tprel\@ha.
ADDIS_GOT_TPREL_HA,
/// G8RC = LD_GOT_TPREL_L Symbol, G8RReg - Used by the initial-exec
/// TLS model, produces a LD instruction with base register G8RReg
- /// and offset sym@got@tprel@l. This completes the addition that
+ /// and offset sym\@got\@tprel\@l. This completes the addition that
/// finds the offset of "sym" relative to the thread pointer.
LD_GOT_TPREL_L,
/// G8RC = ADD_TLS G8RReg, Symbol - Used by the initial-exec TLS
/// model, produces an ADD instruction that adds the contents of
/// G8RReg to the thread pointer. Symbol contains a relocation
- /// sym@tls which is to be replaced by the thread pointer and
+ /// sym\@tls which is to be replaced by the thread pointer and
/// identifies to the linker that the instruction is part of a
/// TLS sequence.
ADD_TLS,
/// G8RC = ADDIS_TLSGD_HA %X2, Symbol - For the general-dynamic TLS
/// model, produces an ADDIS8 instruction that adds the GOT base
- /// register to sym@got@tlsgd@ha.
+ /// register to sym\@got\@tlsgd\@ha.
ADDIS_TLSGD_HA,
/// G8RC = ADDI_TLSGD_L G8RReg, Symbol - For the general-dynamic TLS
/// model, produces an ADDI8 instruction that adds G8RReg to
- /// sym@got@tlsgd@l.
+ /// sym\@got\@tlsgd\@l.
ADDI_TLSGD_L,
/// G8RC = GET_TLS_ADDR %X3, Symbol - For the general-dynamic TLS
- /// model, produces a call to __tls_get_addr(sym@tlsgd).
+ /// model, produces a call to __tls_get_addr(sym\@tlsgd).
GET_TLS_ADDR,
/// G8RC = ADDIS_TLSLD_HA %X2, Symbol - For the local-dynamic TLS
/// model, produces an ADDIS8 instruction that adds the GOT base
- /// register to sym@got@tlsld@ha.
+ /// register to sym\@got\@tlsld\@ha.
ADDIS_TLSLD_HA,
/// G8RC = ADDI_TLSLD_L G8RReg, Symbol - For the local-dynamic TLS
/// model, produces an ADDI8 instruction that adds G8RReg to
- /// sym@got@tlsld@l.
+ /// sym\@got\@tlsld\@l.
ADDI_TLSLD_L,
/// G8RC = GET_TLSLD_ADDR %X3, Symbol - For the local-dynamic TLS
- /// model, produces a call to __tls_get_addr(sym@tlsld).
+ /// model, produces a call to __tls_get_addr(sym\@tlsld).
GET_TLSLD_ADDR,
/// G8RC = ADDIS_DTPREL_HA %X3, Symbol, Chain - For the
/// local-dynamic TLS model, produces an ADDIS8 instruction
- /// that adds X3 to sym@dtprel@ha. The Chain operand is needed
+ /// that adds X3 to sym\@dtprel\@ha. The Chain operand is needed
/// to tie this in place following a copy to %X3 from the result
/// of a GET_TLSLD_ADDR.
ADDIS_DTPREL_HA,
/// G8RC = ADDI_DTPREL_L G8RReg, Symbol - For the local-dynamic TLS
/// model, produces an ADDI8 instruction that adds G8RReg to
- /// sym@got@dtprel@l.
+ /// sym\@got\@dtprel\@l.
ADDI_DTPREL_L,
/// VRRC = VADD_SPLAT Elt, EltSize - Temporary node to be expanded
@@ -238,6 +243,10 @@ namespace llvm {
/// optimizations due to constant folding.
VADD_SPLAT,
+ /// CHAIN = SC CHAIN, Imm128 - System call. The 7-bit unsigned
+ /// operand identifies the operating system entry point.
+ SC,
+
/// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a
/// byte-swapping store instruction. It byte-swaps the low "Type" bits of
/// the GPRC input, then stores it through Ptr. Type can be either i16 or
@@ -266,16 +275,16 @@ namespace llvm {
/// G8RC = ADDIS_TOC_HA %X2, Symbol - For medium and large code model,
/// produces an ADDIS8 instruction that adds the TOC base register to
- /// sym@toc@ha.
+ /// sym\@toc\@ha.
ADDIS_TOC_HA,
/// G8RC = LD_TOC_L Symbol, G8RReg - For medium and large code model,
/// produces a LD instruction with base register G8RReg and offset
- /// sym@toc@l. Preceded by an ADDIS_TOC_HA to form a full 32-bit offset.
+ /// sym\@toc\@l. Preceded by an ADDIS_TOC_HA to form a full 32-bit offset.
LD_TOC_L,
/// G8RC = ADDI_TOC_L G8RReg, Symbol - For medium code model, produces
- /// an ADDI8 instruction that adds G8RReg to sym@toc@l.
+ /// an ADDI8 instruction that adds G8RReg to sym\@toc\@l.
/// Preceded by an ADDIS_TOC_HA to form a full 32-bit offset.
ADDI_TOC_L
};
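The @ha/@l pairs documented above split a 32-bit offset so that an addis/addi (or ld) sequence can rebuild it; a self-contained sketch of that split, assuming the usual sign-adjusted high half (the helper names lo/ha are illustrative only):

#include <cassert>
#include <cstdint>

// @l is the sign-extended low 16 bits; @ha is the high 16 bits biased by
// 0x8000 so that ha * 0x10000 + lo reproduces the original offset even
// when lo ends up negative.
int16_t lo(int32_t Off) { return static_cast<int16_t>(Off & 0xFFFF); }
int16_t ha(int32_t Off) {
  return static_cast<int16_t>((static_cast<int64_t>(Off) + 0x8000) >> 16);
}

int main() {
  for (int32_t Off : {0, 1, 0x7FFF, 0x8000, -1, 0x12345678}) {
    int32_t Rebuilt = static_cast<int32_t>(ha(Off)) * 0x10000 + lo(Off);
    assert(Rebuilt == Off);
  }
  return 0;
}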
@@ -327,8 +336,6 @@ namespace llvm {
class PPCTargetLowering : public TargetLowering {
const PPCSubtarget &PPCSubTarget;
- const PPCRegisterInfo *PPCRegInfo;
- const PPCInstrInfo *PPCII;
public:
explicit PPCTargetLowering(PPCTargetMachine &TM);
@@ -340,7 +347,7 @@ namespace llvm {
virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
/// getSetCCResultType - Return the ISD::SETCC ValueType
- virtual EVT getSetCCResultType(EVT VT) const;
+ virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
/// getPreIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if the node's address
@@ -358,21 +365,16 @@ namespace llvm {
/// SelectAddressRegImm - Returns true if the address N can be represented
/// by a base register plus a signed 16-bit displacement [r+imm], and if it
- /// is not better represented as reg+reg.
+ /// is not better represented as reg+reg. If Aligned is true, only accept
+ /// displacements suitable for STD and friends, i.e. multiples of 4.
bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base,
- SelectionDAG &DAG) const;
+ SelectionDAG &DAG, bool Aligned) const;
/// SelectAddressRegRegOnly - Given the specified addressed, force it to be
/// represented as an indexed [r+r] operation.
bool SelectAddressRegRegOnly(SDValue N, SDValue &Base, SDValue &Index,
SelectionDAG &DAG) const;
- /// SelectAddressRegImmShift - Returns true if the address N can be
- /// represented by a base register plus a signed 14-bit displacement
- /// [r+imm*4]. Suitable for use by STD and friends.
- bool SelectAddressRegImmShift(SDValue N, SDValue &Disp, SDValue &Base,
- SelectionDAG &DAG) const;
-
Sched::Preference getSchedulingPreference(SDNode *N) const;
/// LowerOperation - Provide custom lowering hooks for some operations.
@@ -436,15 +438,6 @@ namespace llvm {
/// by AM is legal for this target, for a load/store of the specified type.
virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty)const;
- /// isLegalAddressImmediate - Return true if the integer value can be used
- /// as the offset of the target addressing mode for load / store of the
- /// given type.
- virtual bool isLegalAddressImmediate(int64_t V, Type *Ty) const;
-
- /// isLegalAddressImmediate - Return true if the GlobalValue can be used as
- /// the offset of the target addressing mode.
- virtual bool isLegalAddressImmediate(GlobalValue *GV) const;
-
virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
/// getOptimalMemOpType - Returns the target specific optimal type for load
@@ -459,7 +452,7 @@ namespace llvm {
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
virtual EVT
- getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+ getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
MachineFunction &MF) const;
@@ -490,7 +483,7 @@ namespace llvm {
SDValue &LROpOut,
SDValue &FPOpOut,
bool isDarwinABI,
- DebugLoc dl) const;
+ SDLoc dl) const;
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
@@ -511,7 +504,7 @@ namespace llvm {
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG,
const PPCSubtarget &Subtarget) const;
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, DebugLoc dl) const;
+ SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, SDLoc dl) const;
SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const;
@@ -526,9 +519,9 @@ namespace llvm {
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
- SDValue FinishCall(CallingConv::ID CallConv, DebugLoc dl, bool isTailCall,
+ SDValue FinishCall(CallingConv::ID CallConv, SDLoc dl, bool isTailCall,
bool isVarArg,
SelectionDAG &DAG,
SmallVector<std::pair<unsigned, SDValue>, 8>
@@ -543,7 +536,7 @@ namespace llvm {
LowerFormalArguments(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
virtual SDValue
@@ -561,11 +554,11 @@ namespace llvm {
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc dl, SelectionDAG &DAG) const;
+ SDLoc dl, SelectionDAG &DAG) const;
SDValue
extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT, SelectionDAG &DAG,
- SDValue ArgVal, DebugLoc dl) const;
+ SDValue ArgVal, SDLoc dl) const;
void
setMinReservedArea(MachineFunction &MF, SelectionDAG &DAG,
@@ -576,25 +569,25 @@ namespace llvm {
LowerFormalArguments_Darwin(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
SDValue
LowerFormalArguments_64SVR4(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
SDValue
LowerFormalArguments_32SVR4(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
SDValue
createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff,
SDValue CallSeqStart, ISD::ArgFlagsTy Flags,
- SelectionDAG &DAG, DebugLoc dl) const;
+ SelectionDAG &DAG, SDLoc dl) const;
SDValue
LowerCall_Darwin(SDValue Chain, SDValue Callee,
@@ -603,7 +596,7 @@ namespace llvm {
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
SDValue
LowerCall_64SVR4(SDValue Chain, SDValue Callee,
@@ -612,7 +605,7 @@ namespace llvm {
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
SDValue
LowerCall_32SVR4(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
@@ -620,7 +613,7 @@ namespace llvm {
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
@@ -629,6 +622,23 @@ namespace llvm {
SDValue DAGCombineFastRecip(SDValue Op, DAGCombinerInfo &DCI) const;
SDValue DAGCombineFastRecipFSQRT(SDValue Op, DAGCombinerInfo &DCI) const;
};
+
+ bool CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State);
+
+ bool CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
+ MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State);
+
+ bool CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
+ MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State);
}
#endif // LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index e5d0b91..0245ba7 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -17,17 +17,12 @@
//
def s16imm64 : Operand<i64> {
let PrintMethod = "printS16ImmOperand";
+ let EncoderMethod = "getS16ImmEncoding";
+ let ParserMatchClass = PPCS16ImmAsmOperand;
}
def u16imm64 : Operand<i64> {
let PrintMethod = "printU16ImmOperand";
-}
-def symbolHi64 : Operand<i64> {
- let PrintMethod = "printSymbolHi";
- let EncoderMethod = "getHA16Encoding";
-}
-def symbolLo64 : Operand<i64> {
- let PrintMethod = "printSymbolLo";
- let EncoderMethod = "getLO16Encoding";
+ let ParserMatchClass = PPCU16ImmAsmOperand;
}
def tocentry : Operand<iPTR> {
let MIOperandInfo = (ops i64imm:$imm);
@@ -289,6 +284,12 @@ def MTCTR8 : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS),
"mtctr $rS", SprMTSPR>,
PPC970_DGroup_First, PPC970_Unit_FXU;
}
+let hasSideEffects = 1, isCodeGenOnly = 1, Defs = [CTR8] in {
+let Pattern = [(int_ppc_mtctr i64:$rS)] in
+def MTCTR8loop : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS),
+ "mtctr $rS", SprMTSPR>,
+ PPC970_DGroup_First, PPC970_Unit_FXU;
+}
let Pattern = [(set i64:$rT, readcyclecounter)] in
def MFTB8 : XFXForm_1_ext<31, 339, 268, (outs g8rc:$rT), (ins),
@@ -325,10 +326,10 @@ let Interpretation64Bit = 1 in {
let neverHasSideEffects = 1 in {
let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
-def LI8 : DForm_2_r0<14, (outs g8rc:$rD), (ins symbolLo64:$imm),
+def LI8 : DForm_2_r0<14, (outs g8rc:$rD), (ins s16imm64:$imm),
"li $rD, $imm", IntSimple,
- [(set i64:$rD, immSExt16:$imm)]>;
-def LIS8 : DForm_2_r0<15, (outs g8rc:$rD), (ins symbolHi64:$imm),
+ [(set i64:$rD, imm64SExt16:$imm)]>;
+def LIS8 : DForm_2_r0<15, (outs g8rc:$rD), (ins s16imm64:$imm),
"lis $rD, $imm", IntSimple,
[(set i64:$rD, imm16ShiftedSExt:$imm)]>;
}
@@ -400,18 +401,18 @@ defm ADDC8 : XOForm_1rc<31, 10, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
let Defs = [CARRY] in
def ADDIC8 : DForm_2<12, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm),
"addic $rD, $rA, $imm", IntGeneral,
- [(set i64:$rD, (addc i64:$rA, immSExt16:$imm))]>;
-def ADDI8 : DForm_2<14, (outs g8rc:$rD), (ins g8rc_nox0:$rA, symbolLo64:$imm),
+ [(set i64:$rD, (addc i64:$rA, imm64SExt16:$imm))]>;
+def ADDI8 : DForm_2<14, (outs g8rc:$rD), (ins g8rc_nox0:$rA, s16imm64:$imm),
"addi $rD, $rA, $imm", IntSimple,
- [(set i64:$rD, (add i64:$rA, immSExt16:$imm))]>;
-def ADDIS8 : DForm_2<15, (outs g8rc:$rD), (ins g8rc_nox0:$rA, symbolHi64:$imm),
+ [(set i64:$rD, (add i64:$rA, imm64SExt16:$imm))]>;
+def ADDIS8 : DForm_2<15, (outs g8rc:$rD), (ins g8rc_nox0:$rA, s16imm64:$imm),
"addis $rD, $rA, $imm", IntSimple,
[(set i64:$rD, (add i64:$rA, imm16ShiftedSExt:$imm))]>;
let Defs = [CARRY] in {
def SUBFIC8: DForm_2< 8, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm),
"subfic $rD, $rA, $imm", IntGeneral,
- [(set i64:$rD, (subc immSExt16:$imm, i64:$rA))]>;
+ [(set i64:$rD, (subc imm64SExt16:$imm, i64:$rA))]>;
defm SUBFC8 : XOForm_1r<31, 8, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
"subfc", "$rT, $rA, $rB", IntGeneral,
[(set i64:$rT, (subc i64:$rB, i64:$rA))]>,
@@ -746,25 +747,25 @@ def ADDItocL: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
(PPCaddiTocL i64:$reg, tglobaladdr:$disp))]>, isPPC64;
// Support for thread-local storage.
-def ADDISgotTprelHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, symbolHi64:$disp),
+def ADDISgotTprelHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
"#ADDISgotTprelHA",
[(set i64:$rD,
(PPCaddisGotTprelHA i64:$reg,
tglobaltlsaddr:$disp))]>,
isPPC64;
-def LDgotTprelL: Pseudo<(outs g8rc:$rD), (ins symbolLo64:$disp, g8rc_nox0:$reg),
+def LDgotTprelL: Pseudo<(outs g8rc:$rD), (ins s16imm64:$disp, g8rc_nox0:$reg),
"#LDgotTprelL",
[(set i64:$rD,
(PPCldGotTprelL tglobaltlsaddr:$disp, i64:$reg))]>,
isPPC64;
def : Pat<(PPCaddTls i64:$in, tglobaltlsaddr:$g),
(ADD8TLS $in, tglobaltlsaddr:$g)>;
-def ADDIStlsgdHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, symbolHi64:$disp),
+def ADDIStlsgdHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
"#ADDIStlsgdHA",
[(set i64:$rD,
(PPCaddisTlsgdHA i64:$reg, tglobaltlsaddr:$disp))]>,
isPPC64;
-def ADDItlsgdL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, symbolLo64:$disp),
+def ADDItlsgdL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
"#ADDItlsgdL",
[(set i64:$rD,
(PPCaddiTlsgdL i64:$reg, tglobaltlsaddr:$disp))]>,
@@ -774,12 +775,12 @@ def GETtlsADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym),
[(set i64:$rD,
(PPCgetTlsAddr i64:$reg, tglobaltlsaddr:$sym))]>,
isPPC64;
-def ADDIStlsldHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, symbolHi64:$disp),
+def ADDIStlsldHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
"#ADDIStlsldHA",
[(set i64:$rD,
(PPCaddisTlsldHA i64:$reg, tglobaltlsaddr:$disp))]>,
isPPC64;
-def ADDItlsldL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, symbolLo64:$disp),
+def ADDItlsldL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
"#ADDItlsldL",
[(set i64:$rD,
(PPCaddiTlsldL i64:$reg, tglobaltlsaddr:$disp))]>,
@@ -789,13 +790,13 @@ def GETtlsldADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym),
[(set i64:$rD,
(PPCgetTlsldAddr i64:$reg, tglobaltlsaddr:$sym))]>,
isPPC64;
-def ADDISdtprelHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, symbolHi64:$disp),
+def ADDISdtprelHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
"#ADDISdtprelHA",
[(set i64:$rD,
(PPCaddisDtprelHA i64:$reg,
tglobaltlsaddr:$disp))]>,
isPPC64;
-def ADDIdtprelL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, symbolLo64:$disp),
+def ADDIdtprelL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
"#ADDIdtprelL",
[(set i64:$rD,
(PPCaddiDtprelL i64:$reg, tglobaltlsaddr:$disp))]>,
diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td
index 41b4e01..a244058 100644
--- a/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/lib/Target/PowerPC/PPCInstrFormats.td
@@ -145,6 +145,19 @@ class BForm_2<bits<6> opcode, bits<5> bo, bits<5> bi, bit aa, bit lk,
let Inst{31} = lk;
}
+// 1.7.3 SC-Form
+class SCForm<bits<6> opcode, bits<1> xo,
+ dag OOL, dag IOL, string asmstr, InstrItinClass itin,
+ list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<7> LEV;
+
+ let Pattern = pattern;
+
+ let Inst{20-26} = LEV;
+ let Inst{30} = xo;
+}
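As a worked check of the SC-form field layout just defined (IBM bit numbering, bit 0 being the most significant): the primary opcode 17 sits in bits 0-5, LEV in bits 20-26, and the xo bit in bit 30, so "sc 0" should assemble to the familiar word 0x44000002. A hedged sketch of that arithmetic:

#include <cassert>
#include <cstdint>

uint32_t encodeSC(uint32_t LEV) {
  uint32_t Inst = 0;
  Inst |= 17u << 26;          // primary opcode, bits 0-5
  Inst |= (LEV & 0x7F) << 5;  // LEV, bits 20-26
  Inst |= 1u << 1;            // xo, bit 30
  return Inst;
}

int main() {
  assert(encodeSC(0) == 0x44000002); // plain "sc"
  return 0;
}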
+
// 1.7.4 D-Form
class DForm_base<bits<6> opcode, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
@@ -365,6 +378,12 @@ class XForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
: XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern>;
+class XForm_1a<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+ let RST = 0;
+}
+
class XForm_6<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
: XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> {
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index 847bd22..a3eeb20 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -47,7 +47,7 @@ cl::desc("Disable compare instruction optimization"), cl::Hidden);
PPCInstrInfo::PPCInstrInfo(PPCTargetMachine &tm)
: PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP),
- TM(tm), RI(*TM.getSubtargetImpl(), *this) {}
+ TM(tm), RI(*TM.getSubtargetImpl()) {}
/// CreateTargetHazardRecognizer - Return the hazard recognizer to use for
/// this target when scheduling the DAG.
@@ -74,10 +74,9 @@ ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetPostRAHazardRecognizer(
// Most subtargets use a PPC970 recognizer.
if (Directive != PPC::DIR_440 && Directive != PPC::DIR_A2 &&
Directive != PPC::DIR_E500mc && Directive != PPC::DIR_E5500) {
- const TargetInstrInfo *TII = TM.getInstrInfo();
- assert(TII && "No InstrInfo?");
+ assert(TM.getInstrInfo() && "No InstrInfo?");
- return new PPCHazardRecognizer970(*TII);
+ return new PPCHazardRecognizer970(TM);
}
return new PPCScoreboardHazardRecognizer(II, DAG);
@@ -1096,8 +1095,11 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
int OpC = CmpInstr->getOpcode();
unsigned CRReg = CmpInstr->getOperand(0).getReg();
- bool isFP = OpC == PPC::FCMPUS || OpC == PPC::FCMPUD;
- unsigned CRRecReg = isFP ? PPC::CR1 : PPC::CR0;
+
+ // FP record forms set CR1 based on the exception status bits, not a
+ // comparison with zero.
+ if (OpC == PPC::FCMPUS || OpC == PPC::FCMPUD)
+ return false;
// The record forms set the condition register based on a signed comparison
// with zero (so says the ISA manual). This is not as straightforward as it
@@ -1140,9 +1142,9 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
equalityOnly = true;
} else
return false;
- } else if (!isFP)
+ } else
equalityOnly = is64BitUnsignedCompare;
- } else if (!isFP)
+ } else
equalityOnly = is32BitUnsignedCompare;
if (equalityOnly) {
@@ -1153,25 +1155,19 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
MachineInstr *UseMI = &*I;
if (UseMI->getOpcode() == PPC::BCC) {
unsigned Pred = UseMI->getOperand(0).getImm();
- if (Pred == PPC::PRED_EQ || Pred == PPC::PRED_NE)
- continue;
-
- return false;
+ if (Pred != PPC::PRED_EQ && Pred != PPC::PRED_NE)
+ return false;
} else if (UseMI->getOpcode() == PPC::ISEL ||
UseMI->getOpcode() == PPC::ISEL8) {
unsigned SubIdx = UseMI->getOperand(3).getSubReg();
- if (SubIdx == PPC::sub_eq)
- continue;
-
- return false;
+ if (SubIdx != PPC::sub_eq)
+ return false;
} else
return false;
}
}
- // Get ready to iterate backward from CmpInstr.
- MachineBasicBlock::iterator I = CmpInstr, E = MI,
- B = CmpInstr->getParent()->begin();
+ MachineBasicBlock::iterator I = CmpInstr;
// Scan forward to find the first use of the compare.
for (MachineBasicBlock::iterator EL = CmpInstr->getParent()->end();
@@ -1188,9 +1184,6 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
break;
}
- // Early exit if we're at the beginning of the BB.
- if (I == B) return false;
-
// There are two possible candidates which can be changed to set CR[01].
// One is MI, the other is a SUB instruction.
// For CMPrr(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1).
@@ -1210,13 +1203,18 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
// Search for Sub.
const TargetRegisterInfo *TRI = &getRegisterInfo();
--I;
+
+ // Get ready to iterate backward from CmpInstr.
+ MachineBasicBlock::iterator E = MI,
+ B = CmpInstr->getParent()->begin();
+
for (; I != E && !noSub; --I) {
const MachineInstr &Instr = *I;
unsigned IOpC = Instr.getOpcode();
if (&*I != CmpInstr && (
- Instr.modifiesRegister(CRRecReg, TRI) ||
- Instr.readsRegister(CRRecReg, TRI)))
+ Instr.modifiesRegister(PPC::CR0, TRI) ||
+ Instr.readsRegister(PPC::CR0, TRI)))
// This instruction modifies or uses the record condition register after
// the one we want to change. While we could do this transformation, it
// would likely not be profitable. This transformation removes one
@@ -1236,15 +1234,6 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
break;
}
- if (isFP && (IOpC == PPC::FSUB || IOpC == PPC::FSUBS) &&
- ((Instr.getOperand(1).getReg() == SrcReg &&
- Instr.getOperand(2).getReg() == SrcReg2) ||
- (Instr.getOperand(1).getReg() == SrcReg2 &&
- Instr.getOperand(2).getReg() == SrcReg))) {
- Sub = &*I;
- break;
- }
-
if (I == B)
// The 'and' is below the comparison instruction.
return false;
@@ -1290,8 +1279,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
// The operands to subf are the opposite of sub, so only in the fixed-point
// case, invert the order.
- if (!isFP)
- ShouldSwap = !ShouldSwap;
+ ShouldSwap = !ShouldSwap;
}
if (ShouldSwap)
@@ -1330,7 +1318,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
MachineBasicBlock::iterator MII = MI;
BuildMI(*MI->getParent(), llvm::next(MII), MI->getDebugLoc(),
get(TargetOpcode::COPY), CRReg)
- .addReg(CRRecReg, MIOpC != NewOpC ? RegState::Kill : 0);
+ .addReg(PPC::CR0, MIOpC != NewOpC ? RegState::Kill : 0);
if (MIOpC != NewOpC) {
// We need to be careful here: we're replacing one instruction with
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index 7d3540e..1b7ea93 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -162,6 +162,10 @@ def PPCeh_sjlj_longjmp : SDNode<"PPCISD::EH_SJLJ_LONGJMP",
SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
[SDNPHasChain, SDNPSideEffect]>;
+def SDT_PPCsc : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def PPCsc : SDNode<"PPCISD::SC", SDT_PPCsc,
+ [SDNPHasChain, SDNPSideEffect]>;
+
def PPCvcmp : SDNode<"PPCISD::VCMP" , SDT_PPCvcmp, []>;
def PPCvcmp_o : SDNode<"PPCISD::VCMPo", SDT_PPCvcmp, [SDNPOutGlue]>;
@@ -246,13 +250,15 @@ def maskimm32 : PatLeaf<(imm), [{
return false;
}]>;
-def immSExt16 : PatLeaf<(imm), [{
- // immSExt16 predicate - True if the immediate fits in a 16-bit sign extended
- // field. Used by instructions like 'addi'.
- if (N->getValueType(0) == MVT::i32)
- return (int32_t)N->getZExtValue() == (short)N->getZExtValue();
- else
- return (int64_t)N->getZExtValue() == (short)N->getZExtValue();
+def imm32SExt16 : Operand<i32>, ImmLeaf<i32, [{
+ // imm32SExt16 predicate - True if the i32 immediate fits in a 16-bit
+ // sign extended field. Used by instructions like 'addi'.
+ return (int32_t)Imm == (short)Imm;
+}]>;
+def imm64SExt16 : Operand<i64>, ImmLeaf<i64, [{
+ // imm64SExt16 predicate - True if the i64 immediate fits in a 16-bit
+ // sign extended field. Used by instructions like 'addi'.
+ return (int64_t)Imm == (short)Imm;
}]>;
def immZExt16 : PatLeaf<(imm), [{
// immZExt16 predicate - True if the immediate fits in a 16-bit zero extended
@@ -283,7 +289,7 @@ def imm16ShiftedSExt : PatLeaf<(imm), [{
}], HI16>;
// Some r+i load/store instructions (such as LD, STD, LDU, etc.) that require
-// restricted memrix (offset/4) constants are alignment sensitive. If these
+// restricted memrix (4-aligned) constants are alignment sensitive. If these
// offsets are hidden behind TOC entries then the values of the lower-order
// bits cannot be checked directly. As a result, we need to also incorporate
// an alignment check into the relevant patterns.
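The predicate such operands end up using (isS16ImmX4, referenced later in this patch for the dispRIX operand) amounts to a signed 16-bit range check plus 4-byte alignment; a small illustrative sketch, not the parser code itself:

#include <cassert>
#include <cstdint>

// DS-form displacements (LD/STD and friends) must be multiples of 4 that
// still fit in the signed 16-bit immediate field.
bool isS16ImmX4(int64_t Imm) {
  return (Imm & 3) == 0 && Imm >= -32768 && Imm <= 32767;
}

int main() {
  assert(isS16ImmX4(8));
  assert(!isS16ImmX4(6));      // not a multiple of 4
  assert(!isS16ImmX4(40000));  // out of the 16-bit range
  return 0;
}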
@@ -342,30 +348,102 @@ class NoEncode<string E> {
// all their register operands.
// For this purpose, we define one RegisterOperand for each RegisterClass,
// using the same name as the class, just in lower case.
-def gprc : RegisterOperand<GPRC>;
-def g8rc : RegisterOperand<G8RC>;
-def gprc_nor0 : RegisterOperand<GPRC_NOR0>;
-def g8rc_nox0 : RegisterOperand<G8RC_NOX0>;
-def f8rc : RegisterOperand<F8RC>;
-def f4rc : RegisterOperand<F4RC>;
-def vrrc : RegisterOperand<VRRC>;
-def crbitrc : RegisterOperand<CRBITRC>;
-def crrc : RegisterOperand<CRRC>;
+def PPCRegGPRCAsmOperand : AsmOperandClass {
+ let Name = "RegGPRC"; let PredicateMethod = "isRegNumber";
+}
+def gprc : RegisterOperand<GPRC> {
+ let ParserMatchClass = PPCRegGPRCAsmOperand;
+}
+def PPCRegG8RCAsmOperand : AsmOperandClass {
+ let Name = "RegG8RC"; let PredicateMethod = "isRegNumber";
+}
+def g8rc : RegisterOperand<G8RC> {
+ let ParserMatchClass = PPCRegG8RCAsmOperand;
+}
+def PPCRegGPRCNoR0AsmOperand : AsmOperandClass {
+ let Name = "RegGPRCNoR0"; let PredicateMethod = "isRegNumber";
+}
+def gprc_nor0 : RegisterOperand<GPRC_NOR0> {
+ let ParserMatchClass = PPCRegGPRCNoR0AsmOperand;
+}
+def PPCRegG8RCNoX0AsmOperand : AsmOperandClass {
+ let Name = "RegG8RCNoX0"; let PredicateMethod = "isRegNumber";
+}
+def g8rc_nox0 : RegisterOperand<G8RC_NOX0> {
+ let ParserMatchClass = PPCRegG8RCNoX0AsmOperand;
+}
+def PPCRegF8RCAsmOperand : AsmOperandClass {
+ let Name = "RegF8RC"; let PredicateMethod = "isRegNumber";
+}
+def f8rc : RegisterOperand<F8RC> {
+ let ParserMatchClass = PPCRegF8RCAsmOperand;
+}
+def PPCRegF4RCAsmOperand : AsmOperandClass {
+ let Name = "RegF4RC"; let PredicateMethod = "isRegNumber";
+}
+def f4rc : RegisterOperand<F4RC> {
+ let ParserMatchClass = PPCRegF4RCAsmOperand;
+}
+def PPCRegVRRCAsmOperand : AsmOperandClass {
+ let Name = "RegVRRC"; let PredicateMethod = "isRegNumber";
+}
+def vrrc : RegisterOperand<VRRC> {
+ let ParserMatchClass = PPCRegVRRCAsmOperand;
+}
+def PPCRegCRBITRCAsmOperand : AsmOperandClass {
+ let Name = "RegCRBITRC"; let PredicateMethod = "isRegNumber";
+}
+def crbitrc : RegisterOperand<CRBITRC> {
+ let ParserMatchClass = PPCRegCRBITRCAsmOperand;
+}
+def PPCRegCRRCAsmOperand : AsmOperandClass {
+ let Name = "RegCRRC"; let PredicateMethod = "isCCRegNumber";
+}
+def crrc : RegisterOperand<CRRC> {
+ let ParserMatchClass = PPCRegCRRCAsmOperand;
+}
+
+def PPCS5ImmAsmOperand : AsmOperandClass {
+ let Name = "S5Imm"; let PredicateMethod = "isS5Imm";
+ let RenderMethod = "addImmOperands";
+}
def s5imm : Operand<i32> {
let PrintMethod = "printS5ImmOperand";
+ let ParserMatchClass = PPCS5ImmAsmOperand;
+}
+def PPCU5ImmAsmOperand : AsmOperandClass {
+ let Name = "U5Imm"; let PredicateMethod = "isU5Imm";
+ let RenderMethod = "addImmOperands";
}
def u5imm : Operand<i32> {
let PrintMethod = "printU5ImmOperand";
+ let ParserMatchClass = PPCU5ImmAsmOperand;
+}
+def PPCU6ImmAsmOperand : AsmOperandClass {
+ let Name = "U6Imm"; let PredicateMethod = "isU6Imm";
+ let RenderMethod = "addImmOperands";
}
def u6imm : Operand<i32> {
let PrintMethod = "printU6ImmOperand";
+ let ParserMatchClass = PPCU6ImmAsmOperand;
+}
+def PPCS16ImmAsmOperand : AsmOperandClass {
+ let Name = "S16Imm"; let PredicateMethod = "isS16Imm";
+ let RenderMethod = "addImmOperands";
}
def s16imm : Operand<i32> {
let PrintMethod = "printS16ImmOperand";
+ let EncoderMethod = "getS16ImmEncoding";
+ let ParserMatchClass = PPCS16ImmAsmOperand;
+}
+def PPCU16ImmAsmOperand : AsmOperandClass {
+ let Name = "U16Imm"; let PredicateMethod = "isU16Imm";
+ let RenderMethod = "addImmOperands";
}
def u16imm : Operand<i32> {
let PrintMethod = "printU16ImmOperand";
+ let ParserMatchClass = PPCU16ImmAsmOperand;
}
def directbrtarget : Operand<OtherVT> {
let PrintMethod = "printBranchOperand";
@@ -381,24 +459,44 @@ def calltarget : Operand<iPTR> {
def aaddr : Operand<iPTR> {
let PrintMethod = "printAbsAddrOperand";
}
-def symbolHi: Operand<i32> {
- let PrintMethod = "printSymbolHi";
- let EncoderMethod = "getHA16Encoding";
-}
-def symbolLo: Operand<i32> {
- let PrintMethod = "printSymbolLo";
- let EncoderMethod = "getLO16Encoding";
+def PPCCRBitMaskOperand : AsmOperandClass {
+ let Name = "CRBitMask"; let PredicateMethod = "isCRBitMask";
}
def crbitm: Operand<i8> {
let PrintMethod = "printcrbitm";
let EncoderMethod = "get_crbitm_encoding";
+ let ParserMatchClass = PPCCRBitMaskOperand;
}
// Address operands
// A version of ptr_rc which excludes R0 (or X0 in 64-bit mode).
-def ptr_rc_nor0 : PointerLikeRegClass<1>;
+def PPCRegGxRCNoR0Operand : AsmOperandClass {
+ let Name = "RegGxRCNoR0"; let PredicateMethod = "isRegNumber";
+}
+def ptr_rc_nor0 : Operand<iPTR>, PointerLikeRegClass<1> {
+ let ParserMatchClass = PPCRegGxRCNoR0Operand;
+}
+// A version of ptr_rc usable with the asm parser.
+def PPCRegGxRCOperand : AsmOperandClass {
+ let Name = "RegGxRC"; let PredicateMethod = "isRegNumber";
+}
+def ptr_rc_idx : Operand<iPTR>, PointerLikeRegClass<0> {
+ let ParserMatchClass = PPCRegGxRCOperand;
+}
-def dispRI : Operand<iPTR>;
-def dispRIX : Operand<iPTR>;
+def PPCDispRIOperand : AsmOperandClass {
+ let Name = "DispRI"; let PredicateMethod = "isS16Imm";
+ let RenderMethod = "addImmOperands";
+}
+def dispRI : Operand<iPTR> {
+ let ParserMatchClass = PPCDispRIOperand;
+}
+def PPCDispRIXOperand : AsmOperandClass {
+ let Name = "DispRIX"; let PredicateMethod = "isS16ImmX4";
+ let RenderMethod = "addImmOperands";
+}
+def dispRIX : Operand<iPTR> {
+ let ParserMatchClass = PPCDispRIXOperand;
+}
def memri : Operand<iPTR> {
let PrintMethod = "printMemRegImm";
@@ -407,10 +505,10 @@ def memri : Operand<iPTR> {
}
def memrr : Operand<iPTR> {
let PrintMethod = "printMemRegReg";
- let MIOperandInfo = (ops ptr_rc_nor0:$ptrreg, ptr_rc:$offreg);
+ let MIOperandInfo = (ops ptr_rc_nor0:$ptrreg, ptr_rc_idx:$offreg);
}
-def memrix : Operand<iPTR> { // memri where the imm is shifted 2 bits.
- let PrintMethod = "printMemRegImmShifted";
+def memrix : Operand<iPTR> { // memri where the imm is 4-aligned.
+ let PrintMethod = "printMemRegImm";
let MIOperandInfo = (ops dispRIX:$imm, ptr_rc_nor0:$reg);
let EncoderMethod = "getMemRIXEncoding";
}
@@ -431,7 +529,7 @@ def pred : Operand<OtherVT> {
def iaddr : ComplexPattern<iPTR, 2, "SelectAddrImm", [], []>;
def xaddr : ComplexPattern<iPTR, 2, "SelectAddrIdx", [], []>;
def xoaddr : ComplexPattern<iPTR, 2, "SelectAddrIdxOnly",[], []>;
-def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmShift", [], []>; // "std"
+def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>; // "std"
// The address in a single register. This is used with the SjLj
// pseudo-instructions.
@@ -888,6 +986,12 @@ let isBranch = 1, isTerminator = 1 in {
"#EH_SjLj_Setup\t$dst", []>;
}
+// System call.
+let PPC970_Unit = 7 in {
+ def SC : SCForm<17, 1, (outs), (ins i32imm:$lev),
+ "sc $lev", BrB, [(PPCsc (i32 imm:$lev))]>;
+}
+
// DCB* instructions.
def DCBA : DCB_Form<758, 0, (outs), (ins memrr:$dst),
"dcba $dst", LdStDCBF, [(int_ppc_dcba xoaddr:$dst)]>,
@@ -1290,41 +1394,41 @@ def SYNC : XForm_24_sync<31, 598, (outs), (ins),
//
let PPC970_Unit = 1 in { // FXU Operations.
-def ADDI : DForm_2<14, (outs gprc:$rD), (ins gprc_nor0:$rA, symbolLo:$imm),
+def ADDI : DForm_2<14, (outs gprc:$rD), (ins gprc_nor0:$rA, s16imm:$imm),
"addi $rD, $rA, $imm", IntSimple,
- [(set i32:$rD, (add i32:$rA, immSExt16:$imm))]>;
+ [(set i32:$rD, (add i32:$rA, imm32SExt16:$imm))]>;
let BaseName = "addic" in {
let Defs = [CARRY] in
def ADDIC : DForm_2<12, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
"addic $rD, $rA, $imm", IntGeneral,
- [(set i32:$rD, (addc i32:$rA, immSExt16:$imm))]>,
+ [(set i32:$rD, (addc i32:$rA, imm32SExt16:$imm))]>,
RecFormRel, PPC970_DGroup_Cracked;
let Defs = [CARRY, CR0] in
def ADDICo : DForm_2<13, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
"addic. $rD, $rA, $imm", IntGeneral,
[]>, isDOT, RecFormRel;
}
-def ADDIS : DForm_2<15, (outs gprc:$rD), (ins gprc_nor0:$rA, symbolHi:$imm),
+def ADDIS : DForm_2<15, (outs gprc:$rD), (ins gprc_nor0:$rA, s16imm:$imm),
"addis $rD, $rA, $imm", IntSimple,
[(set i32:$rD, (add i32:$rA, imm16ShiftedSExt:$imm))]>;
let isCodeGenOnly = 1 in
-def LA : DForm_2<14, (outs gprc:$rD), (ins gprc_nor0:$rA, symbolLo:$sym),
+def LA : DForm_2<14, (outs gprc:$rD), (ins gprc_nor0:$rA, s16imm:$sym),
"la $rD, $sym($rA)", IntGeneral,
[(set i32:$rD, (add i32:$rA,
(PPClo tglobaladdr:$sym, 0)))]>;
def MULLI : DForm_2< 7, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
"mulli $rD, $rA, $imm", IntMulLI,
- [(set i32:$rD, (mul i32:$rA, immSExt16:$imm))]>;
+ [(set i32:$rD, (mul i32:$rA, imm32SExt16:$imm))]>;
let Defs = [CARRY] in
def SUBFIC : DForm_2< 8, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
"subfic $rD, $rA, $imm", IntGeneral,
- [(set i32:$rD, (subc immSExt16:$imm, i32:$rA))]>;
+ [(set i32:$rD, (subc imm32SExt16:$imm, i32:$rA))]>;
let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
- def LI : DForm_2_r0<14, (outs gprc:$rD), (ins symbolLo:$imm),
+ def LI : DForm_2_r0<14, (outs gprc:$rD), (ins s16imm:$imm),
"li $rD, $imm", IntSimple,
- [(set i32:$rD, immSExt16:$imm)]>;
- def LIS : DForm_2_r0<15, (outs gprc:$rD), (ins symbolHi:$imm),
+ [(set i32:$rD, imm32SExt16:$imm)]>;
+ def LIS : DForm_2_r0<15, (outs gprc:$rD), (ins s16imm:$imm),
"lis $rD, $imm", IntSimple,
[(set i32:$rD, imm16ShiftedSExt:$imm)]>;
}
@@ -1591,6 +1695,12 @@ def MTCTR : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS),
"mtctr $rS", SprMTSPR>,
PPC970_DGroup_First, PPC970_Unit_FXU;
}
+let hasSideEffects = 1, isCodeGenOnly = 1, Defs = [CTR] in {
+let Pattern = [(int_ppc_mtctr i32:$rS)] in
+def MTCTRloop : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS),
+ "mtctr $rS", SprMTSPR>,
+ PPC970_DGroup_First, PPC970_Unit_FXU;
+}
let Defs = [LR] in {
def MTLR : XFXForm_7_ext<31, 467, 8, (outs), (ins gprc:$rS),
@@ -1905,7 +2015,7 @@ def : Pat<(or i32:$in, imm:$imm),
def : Pat<(xor i32:$in, imm:$imm),
(XORIS (XORI $in, (LO16 imm:$imm)), (HI16 imm:$imm))>;
// SUBFIC
-def : Pat<(sub immSExt16:$imm, i32:$in),
+def : Pat<(sub imm32SExt16:$imm, i32:$in),
(SUBFIC $in, imm:$imm)>;
// SHL/SRL
@@ -2012,3 +2122,82 @@ def : Pat<(fma f32:$A, (fneg f32:$C), f32:$B),
include "PPCInstrAltivec.td"
include "PPCInstr64Bit.td"
+
+
+//===----------------------------------------------------------------------===//
+// PowerPC Instructions used for assembler/disassembler only
+//
+
+def ISYNC : XLForm_2_ext<19, 150, 0, 0, 0, (outs), (ins),
+ "isync", SprISYNC, []>;
+
+def ICBI : XForm_1a<31, 982, (outs), (ins memrr:$src),
+ "icbi $src", LdStICBI, []>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC Assembler Instruction Aliases
+//
+
+// Pseudo-instructions for alternate assembly syntax (never used by codegen).
+// These are aliases that require C++ handling to convert to the target
+// instruction, while InstAliases can be handled directly by tblgen.
+class PPCAsmPseudo<string asm, dag iops>
+ : Instruction {
+ let Namespace = "PPC";
+ bit PPC64 = 0; // Default value, override with isPPC64
+
+ let OutOperandList = (outs);
+ let InOperandList = iops;
+ let Pattern = [];
+ let AsmString = asm;
+ let isAsmParserOnly = 1;
+ let isPseudo = 1;
+}
+
+def : InstAlias<"sc", (SC 0)>;
+
+def : InstAlias<"mr $rA, $rB", (OR8 g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
+
+def SLWI : PPCAsmPseudo<"slwi $rA, $rS, $n",
+ (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def SRWI : PPCAsmPseudo<"srwi $rA, $rS, $n",
+ (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def SLDI : PPCAsmPseudo<"sldi $rA, $rS, $n",
+ (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def SRDI : PPCAsmPseudo<"srdi $rA, $rS, $n",
+ (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+
+multiclass BranchExtendedMnemonic<string name, int bibo> {
+ def : InstAlias<"b"#name#" $cc, $dst",
+ (BCC bibo, crrc:$cc, condbrtarget:$dst)>;
+ def : InstAlias<"b"#name#" $dst",
+ (BCC bibo, CR0, condbrtarget:$dst)>;
+
+ def : InstAlias<"b"#name#"lr $cc",
+ (BCLR bibo, crrc:$cc)>;
+ def : InstAlias<"b"#name#"lr",
+ (BCLR bibo, CR0)>;
+
+ def : InstAlias<"b"#name#"ctr $cc",
+ (BCCTR bibo, crrc:$cc)>;
+ def : InstAlias<"b"#name#"ctr",
+ (BCCTR bibo, CR0)>;
+
+ def : InstAlias<"b"#name#"ctrl $cc",
+ (BCCTRL bibo, crrc:$cc)>;
+ def : InstAlias<"b"#name#"ctrl",
+ (BCCTRL bibo, CR0)>;
+}
+defm : BranchExtendedMnemonic<"lt", 12>;
+defm : BranchExtendedMnemonic<"gt", 44>;
+defm : BranchExtendedMnemonic<"eq", 76>;
+defm : BranchExtendedMnemonic<"un", 108>;
+defm : BranchExtendedMnemonic<"so", 108>;
+defm : BranchExtendedMnemonic<"ge", 4>;
+defm : BranchExtendedMnemonic<"nl", 4>;
+defm : BranchExtendedMnemonic<"le", 36>;
+defm : BranchExtendedMnemonic<"ng", 36>;
+defm : BranchExtendedMnemonic<"ne", 68>;
+defm : BranchExtendedMnemonic<"nu", 100>;
+defm : BranchExtendedMnemonic<"ns", 100>;
+
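The bibo constants used by these aliases decompose neatly (an inference from the values above, not something stated in the patch): the low part is the BO field, 12 for "branch if the CR bit is set" and 4 for "branch if it is clear", while the CR bit index (LT=0, GT=1, EQ=2, SO=3) contributes a multiple of 32:

#include <cassert>

unsigned bibo(bool BranchIfSet, unsigned CRBit) {
  return (BranchIfSet ? 12u : 4u) + 32u * CRBit;
}

int main() {
  assert(bibo(true, 0) == 12);    // blt
  assert(bibo(true, 1) == 44);    // bgt
  assert(bibo(true, 2) == 76);    // beq
  assert(bibo(false, 0) == 4);    // bge / bnl
  assert(bibo(false, 2) == 68);   // bne
  assert(bibo(false, 3) == 100);  // bns / bnu
  return 0;
}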
diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp
index f8cf3a5..ba7efc1 100644
--- a/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "PPC.h"
+#include "MCTargetDesc/PPCMCExpr.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/AsmPrinter.h"
@@ -110,32 +111,32 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
unsigned access = MO.getTargetFlags() & PPCII::MO_ACCESS_MASK;
- switch (access) {
- case PPCII::MO_HA16: RefKind = isDarwin ?
- MCSymbolRefExpr::VK_PPC_DARWIN_HA16 :
- MCSymbolRefExpr::VK_PPC_GAS_HA16;
- break;
- case PPCII::MO_LO16: RefKind = isDarwin ?
- MCSymbolRefExpr::VK_PPC_DARWIN_LO16 :
- MCSymbolRefExpr::VK_PPC_GAS_LO16;
- break;
- case PPCII::MO_TPREL16_HA: RefKind = MCSymbolRefExpr::VK_PPC_TPREL16_HA;
- break;
- case PPCII::MO_TPREL16_LO: RefKind = MCSymbolRefExpr::VK_PPC_TPREL16_LO;
- break;
- case PPCII::MO_DTPREL16_LO: RefKind = MCSymbolRefExpr::VK_PPC_DTPREL16_LO;
- break;
- case PPCII::MO_TLSLD16_LO: RefKind = MCSymbolRefExpr::VK_PPC_GOT_TLSLD16_LO;
- break;
- case PPCII::MO_TOC16_LO: RefKind = MCSymbolRefExpr::VK_PPC_TOC16_LO;
- break;
- }
+ if (!isDarwin) {
+ switch (access) {
+ case PPCII::MO_HA16:
+ RefKind = MCSymbolRefExpr::VK_PPC_ADDR16_HA;
+ break;
+ case PPCII::MO_LO16:
+ RefKind = MCSymbolRefExpr::VK_PPC_ADDR16_LO;
+ break;
+ case PPCII::MO_TPREL16_HA:
+ RefKind = MCSymbolRefExpr::VK_PPC_TPREL16_HA;
+ break;
+ case PPCII::MO_TPREL16_LO:
+ RefKind = MCSymbolRefExpr::VK_PPC_TPREL16_LO;
+ break;
+ case PPCII::MO_DTPREL16_LO:
+ RefKind = MCSymbolRefExpr::VK_PPC_DTPREL16_LO;
+ break;
+ case PPCII::MO_TLSLD16_LO:
+ RefKind = MCSymbolRefExpr::VK_PPC_GOT_TLSLD16_LO;
+ break;
+ case PPCII::MO_TOC16_LO:
+ RefKind = MCSymbolRefExpr::VK_PPC_TOC16_LO;
+ break;
+ }
+ }
- // FIXME: This isn't right, but we don't have a good way to express this in
- // the MC Level, see below.
- if (MO.getTargetFlags() & PPCII::MO_PIC_FLAG)
- RefKind = MCSymbolRefExpr::VK_None;
-
const MCExpr *Expr = MCSymbolRefExpr::Create(Symbol, RefKind, Ctx);
if (!MO.isJTI() && MO.getOffset())
@@ -149,10 +150,20 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
const MCExpr *PB = MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
Expr = MCBinaryExpr::CreateSub(Expr, PB, Ctx);
- // FIXME: We have no way to make the result be VK_PPC_LO16/VK_PPC_HA16,
- // since it is not a symbol!
}
-
+
+ // Add Darwin ha16() / lo16() markers if required.
+ if (isDarwin) {
+ switch (access) {
+ case PPCII::MO_HA16:
+ Expr = PPCMCExpr::CreateHa16(Expr, Ctx);
+ break;
+ case PPCII::MO_LO16:
+ Expr = PPCMCExpr::CreateLo16(Expr, Ctx);
+ break;
+ }
+ }
+
return MCOperand::CreateExpr(Expr);
}
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 2be6324..a4e328e 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -48,12 +48,11 @@
using namespace llvm;
-PPCRegisterInfo::PPCRegisterInfo(const PPCSubtarget &ST,
- const TargetInstrInfo &tii)
+PPCRegisterInfo::PPCRegisterInfo(const PPCSubtarget &ST)
: PPCGenRegisterInfo(ST.isPPC64() ? PPC::LR8 : PPC::LR,
ST.isPPC64() ? 0 : 1,
ST.isPPC64() ? 0 : 1),
- Subtarget(ST), TII(tii) {
+ Subtarget(ST) {
ImmToIdxMap[PPC::LD] = PPC::LDX; ImmToIdxMap[PPC::STD] = PPC::STDX;
ImmToIdxMap[PPC::LBZ] = PPC::LBZX; ImmToIdxMap[PPC::STB] = PPC::STBX;
ImmToIdxMap[PPC::LHZ] = PPC::LHZX; ImmToIdxMap[PPC::LHA] = PPC::LHAX;
@@ -136,6 +135,11 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(PPC::FP);
Reserved.set(PPC::FP8);
+ // The counter registers must be reserved so that counter-based loops can
+ // be correctly formed (and the mtctr instructions are not DCE'd).
+ Reserved.set(PPC::CTR);
+ Reserved.set(PPC::CTR8);
+
Reserved.set(PPC::R1);
Reserved.set(PPC::LR);
Reserved.set(PPC::LR8);
@@ -214,6 +218,8 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const {
MachineFunction &MF = *MBB.getParent();
// Get the frame info.
MachineFrameInfo *MFI = MF.getFrameInfo();
+ // Get the instruction info.
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
// Determine whether 64-bit pointers are used.
bool LP64 = Subtarget.isPPC64();
DebugLoc dl = MI.getDebugLoc();
@@ -307,6 +313,7 @@ void PPCRegisterInfo::lowerCRSpilling(MachineBasicBlock::iterator II,
// Get the instruction's basic block.
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
DebugLoc dl = MI.getDebugLoc();
bool LP64 = Subtarget.isPPC64();
@@ -350,6 +357,7 @@ void PPCRegisterInfo::lowerCRRestore(MachineBasicBlock::iterator II,
// Get the instruction's basic block.
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
DebugLoc dl = MI.getDebugLoc();
bool LP64 = Subtarget.isPPC64();
@@ -391,6 +399,7 @@ void PPCRegisterInfo::lowerVRSAVESpilling(MachineBasicBlock::iterator II,
// Get the instruction's basic block.
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
DebugLoc dl = MI.getDebugLoc();
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
@@ -415,6 +424,7 @@ void PPCRegisterInfo::lowerVRSAVERestore(MachineBasicBlock::iterator II,
// Get the instruction's basic block.
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
DebugLoc dl = MI.getDebugLoc();
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
@@ -454,9 +464,8 @@ PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
return false;
}
-// Figure out if the offset in the instruction is shifted right two bits. This
-// is true for instructions like "STD", which the machine implicitly adds two
-// low zeros to.
+// Figure out if the offset in the instruction must be a multiple of 4.
+// This is true for instructions like "STD".
static bool usesIXAddr(const MachineInstr &MI) {
unsigned OpC = MI.getOpcode();
@@ -493,6 +502,8 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MachineBasicBlock &MBB = *MI.getParent();
// Get the basic block's function.
MachineFunction &MF = *MBB.getParent();
+ // Get the instruction info.
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
// Get the frame info.
MachineFrameInfo *MFI = MF.getFrameInfo();
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
@@ -549,10 +560,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// Now add the frame object offset to the offset from r1.
int Offset = MFI->getObjectOffset(FrameIndex);
- if (!isIXAddr)
- Offset += MI.getOperand(OffsetOperandNo).getImm();
- else
- Offset += MI.getOperand(OffsetOperandNo).getImm() << 2;
+ Offset += MI.getOperand(OffsetOperandNo).getImm();
// If we're not using a Frame Pointer that has been set to the value of the
// SP before having the stack size subtracted from it, then add the stack size
@@ -572,8 +580,6 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
if (OpC == PPC::DBG_VALUE || // DBG_VALUE is always Reg+Imm
(!noImmForm &&
isInt<16>(Offset) && (!isIXAddr || (Offset & 3) == 0))) {
- if (isIXAddr)
- Offset >>= 2; // The actual encoded value has the low two bits zero.
MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset);
return;
}
@@ -650,11 +656,7 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
}
unsigned OffsetOperandNo = getOffsetONFromFION(*MI, FIOperandNum);
-
- if (!usesIXAddr(*MI))
- Offset += MI->getOperand(OffsetOperandNo).getImm();
- else
- Offset += MI->getOperand(OffsetOperandNo).getImm() << 2;
+ Offset += MI->getOperand(OffsetOperandNo).getImm();
// It's the load/store FI references that cause issues, as it can be difficult
// to materialize the offset if it won't fit in the literal field. Estimate
@@ -711,9 +713,10 @@ materializeFrameBaseRegister(MachineBasicBlock *MBB,
if (Ins != MBB->end())
DL = Ins->getDebugLoc();
+ const MachineFunction &MF = *MBB->getParent();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
const MCInstrDesc &MCID = TII.get(ADDriOpc);
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
- const MachineFunction &MF = *MBB->getParent();
MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this, MF));
BuildMI(*MBB, Ins, DL, MCID, BaseReg)
@@ -734,17 +737,7 @@ PPCRegisterInfo::resolveFrameIndex(MachineBasicBlock::iterator I,
MI.getOperand(FIOperandNum).ChangeToRegister(BaseReg, false);
unsigned OffsetOperandNo = getOffsetONFromFION(MI, FIOperandNum);
-
- bool isIXAddr = usesIXAddr(MI);
- if (!isIXAddr)
- Offset += MI.getOperand(OffsetOperandNo).getImm();
- else
- Offset += MI.getOperand(OffsetOperandNo).getImm() << 2;
-
- // Figure out if the offset in the instruction is shifted right two bits.
- if (isIXAddr)
- Offset >>= 2; // The actual encoded value has the low two bits zero.
-
+ Offset += MI.getOperand(OffsetOperandNo).getImm();
MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset);
}
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h
index 7a48b4b..93626a9 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -29,9 +29,8 @@ class Type;
class PPCRegisterInfo : public PPCGenRegisterInfo {
DenseMap<unsigned, unsigned> ImmToIdxMap;
const PPCSubtarget &Subtarget;
- const TargetInstrInfo &TII;
public:
- PPCRegisterInfo(const PPCSubtarget &SubTarget, const TargetInstrInfo &tii);
+ PPCRegisterInfo(const PPCSubtarget &SubTarget);
/// getPointerRegClass - Return the register class to use to hold pointers.
/// This is used for addressing modes.
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.td b/lib/Target/PowerPC/PPCRegisterInfo.td
index 57a25f5..b1b4f06 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -11,11 +11,11 @@
//===----------------------------------------------------------------------===//
let Namespace = "PPC" in {
-def sub_lt : SubRegIndex;
-def sub_gt : SubRegIndex;
-def sub_eq : SubRegIndex;
-def sub_un : SubRegIndex;
-def sub_32 : SubRegIndex;
+def sub_lt : SubRegIndex<1>;
+def sub_gt : SubRegIndex<1, 1>;
+def sub_eq : SubRegIndex<1, 2>;
+def sub_un : SubRegIndex<1, 3>;
+def sub_32 : SubRegIndex<32>;
}
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index 14dc794..da03b4c 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -48,6 +48,7 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, StringRef TT,
// The binutils for the BG/P are too old for CFI.
if (Subtarget.isBGP())
setMCUseCFI(false);
+ initAsmInfo();
}
void PPC32TargetMachine::anchor() { }
@@ -90,7 +91,7 @@ public:
return *getPPCTargetMachine().getSubtargetImpl();
}
- virtual bool addPreRegAlloc();
+ virtual bool addPreISel();
virtual bool addILPOpts();
virtual bool addInstSelector();
virtual bool addPreSched2();
@@ -102,9 +103,9 @@ TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) {
return new PPCPassConfig(this, PM);
}
-bool PPCPassConfig::addPreRegAlloc() {
+bool PPCPassConfig::addPreISel() {
if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
- addPass(createPPCCTRLoops());
+ addPass(createPPCCTRLoops(getPPCTargetMachine()));
return false;
}
@@ -121,6 +122,12 @@ bool PPCPassConfig::addILPOpts() {
bool PPCPassConfig::addInstSelector() {
// Install an instruction selector.
addPass(createPPCISelDag(getPPCTargetMachine()));
+
+#ifndef NDEBUG
+ if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
+ addPass(createPPCCTRLoopsVerify());
+#endif
+
return false;
}
diff --git a/lib/Target/PowerPC/PPCTargetObjectFile.cpp b/lib/Target/PowerPC/PPCTargetObjectFile.cpp
new file mode 100644
index 0000000..90e4f15
--- /dev/null
+++ b/lib/Target/PowerPC/PPCTargetObjectFile.cpp
@@ -0,0 +1,57 @@
+//===-- PPCTargetObjectFile.cpp - PPC Object Info -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCTargetObjectFile.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Target/Mangler.h"
+
+using namespace llvm;
+
+void
+PPC64LinuxTargetObjectFile::
+Initialize(MCContext &Ctx, const TargetMachine &TM) {
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(TM.Options.UseInitArray);
+}
+
+const MCSection * PPC64LinuxTargetObjectFile::
+SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
+ Mangler *Mang, const TargetMachine &TM) const {
+
+ const MCSection *DefaultSection =
+ TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang, TM);
+
+ if (DefaultSection != ReadOnlySection)
+ return DefaultSection;
+
+ // Here we override ReadOnlySection with DataRelROSection for the PPC64 SVR4 ABI
+ // when we have a constant that contains global relocations. This is
+ // necessary because of this ABI's handling of pointers to functions in
+ // a shared library. The address of a function is actually the address
+ // of a function descriptor, which resides in the .opd section. Generated
+ // code uses the descriptor directly rather than going via the GOT as some
+ // other ABIs do, which means that initialized function pointers must
+ // reference the descriptor. The linker must convert copy relocs of
+ // pointers to functions in shared libraries into dynamic relocations,
+ // because of an ordering problem with initialization of copy relocs and
+ // PLT entries. The dynamic relocation will be initialized by the dynamic
+ // linker, so we must use DataRelROSection instead of ReadOnlySection.
+ // For more information, see the description of ELIMINATE_COPY_RELOCS in
+ // GNU ld.
+ const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+
+ if (GVar && GVar->isConstant() &&
+ (GVar->getInitializer()->getRelocationInfo() ==
+ Constant::GlobalRelocations))
+ return DataRelROSection;
+
+ return DefaultSection;
+}
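For context, the kind of global this override targets looks roughly like the following (a hypothetical example, not taken from the patch): a const table of function pointers, whose initializer needs dynamic relocations under the PPC64 function-descriptor ABI and therefore belongs in .data.rel.ro rather than .rodata:

void handlerA() {}
void handlerB() {}

typedef void (*Handler)();

// Declared const, but every entry is the address of a function
// descriptor that the dynamic linker must relocate at load time.
const Handler HandlerTable[] = { handlerA, handlerB };

int main() { return HandlerTable[0] == HandlerTable[1]; }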
diff --git a/lib/Target/PowerPC/PPCTargetObjectFile.h b/lib/Target/PowerPC/PPCTargetObjectFile.h
new file mode 100644
index 0000000..9203e23
--- /dev/null
+++ b/lib/Target/PowerPC/PPCTargetObjectFile.h
@@ -0,0 +1,32 @@
+//===-- PPCTargetObjectFile.h - PPC Object Info -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_PPC_TARGETOBJECTFILE_H
+#define LLVM_TARGET_PPC_TARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+ /// PPC64LinuxTargetObjectFile - This implementation is used for
+ /// 64-bit PowerPC Linux.
+ class PPC64LinuxTargetObjectFile : public TargetLoweringObjectFileELF {
+
+ virtual void Initialize(MCContext &Ctx, const TargetMachine &TM);
+
+ virtual const MCSection *
+ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
+ Mangler *Mang, const TargetMachine &TM) const;
+ };
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
index 9792bd8..f284291 100644
--- a/lib/Target/R600/AMDGPU.h
+++ b/lib/Target/R600/AMDGPU.h
@@ -11,21 +11,28 @@
#ifndef AMDGPU_H
#define AMDGPU_H
-#include "AMDGPUTargetMachine.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
-class FunctionPass;
+class AMDGPUInstrPrinter;
class AMDGPUTargetMachine;
+class FunctionPass;
+class MCAsmInfo;
+class raw_ostream;
+class Target;
+class TargetMachine;
// R600 Passes
-FunctionPass* createR600KernelParametersPass(const DataLayout *TD);
+FunctionPass *createR600VectorRegMerger(TargetMachine &tm);
+FunctionPass *createR600TextureIntrinsicsReplacer();
FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
FunctionPass *createR600EmitClauseMarkers(TargetMachine &tm);
FunctionPass *createR600Packetizer(TargetMachine &tm);
FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm);
+FunctionPass *createAMDGPUCFGPreparationPass(TargetMachine &tm);
+FunctionPass *createAMDGPUCFGStructurizerPass(TargetMachine &tm);
// SI Passes
FunctionPass *createSIAnnotateControlFlowPass();
@@ -36,7 +43,10 @@ FunctionPass *createSIInsertWaits(TargetMachine &tm);
// Passes common to R600 and SI
Pass *createAMDGPUStructurizeCFGPass();
FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm);
-FunctionPass* createAMDGPUIndirectAddressingPass(TargetMachine &tm);
+FunctionPass *createAMDGPUIndirectAddressingPass(TargetMachine &tm);
+FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
+
+extern Target TheAMDGPUTarget;
} // End namespace llvm
@@ -49,4 +59,41 @@ namespace ShaderType {
};
}
+/// OpenCL uses address spaces to differentiate between
+/// various memory regions on the hardware. On the CPU
+/// all of the address spaces point to the same memory,
+/// however on the GPU, each address space points to
+/// a separate piece of memory that is unique from other
+/// memory locations.
+namespace AMDGPUAS {
+enum AddressSpaces {
+ PRIVATE_ADDRESS = 0, ///< Address space for private memory.
+ GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
+ CONSTANT_ADDRESS = 2, ///< Address space for constant memory
+ LOCAL_ADDRESS = 3, ///< Address space for local memory.
+ REGION_ADDRESS = 4, ///< Address space for region memory.
+ ADDRESS_NONE = 5, ///< Address space for unknown memory.
+ PARAM_D_ADDRESS = 6, ///< Address space for direct addressable parameter memory (CONST0)
+ PARAM_I_ADDRESS = 7, ///< Address space for indirect addressable parameter memory (VTX1)
+ CONSTANT_BUFFER_0 = 8,
+ CONSTANT_BUFFER_1 = 9,
+ CONSTANT_BUFFER_2 = 10,
+ CONSTANT_BUFFER_3 = 11,
+ CONSTANT_BUFFER_4 = 12,
+ CONSTANT_BUFFER_5 = 13,
+ CONSTANT_BUFFER_6 = 14,
+ CONSTANT_BUFFER_7 = 15,
+ CONSTANT_BUFFER_8 = 16,
+ CONSTANT_BUFFER_9 = 17,
+ CONSTANT_BUFFER_10 = 18,
+ CONSTANT_BUFFER_11 = 19,
+ CONSTANT_BUFFER_12 = 20,
+ CONSTANT_BUFFER_13 = 21,
+ CONSTANT_BUFFER_14 = 22,
+ CONSTANT_BUFFER_15 = 23,
+ LAST_ADDRESS = 24
+};
+
+} // namespace AMDGPUAS
+
#endif // AMDGPU_H
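A hedged sketch of how backend code typically consumes these enumerators (getAddressSpace() on memory SDNodes is existing LLVM API; the helper name here is made up):

    #include "AMDGPU.h"
    #include "llvm/CodeGen/SelectionDAGNodes.h"

    // Returns true when a load reads from the constant address space.
    static bool isConstantSpaceLoad(const llvm::LoadSDNode *N) {
      return N && N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
    }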
diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td
index 1a26c77..0048e25 100644
--- a/lib/Target/R600/AMDGPU.td
+++ b/lib/Target/R600/AMDGPU.td
@@ -10,6 +10,79 @@
// Include AMDIL TD files
include "AMDILBase.td"
+//===----------------------------------------------------------------------===//
+// Subtarget Features
+//===----------------------------------------------------------------------===//
+
+// Debugging Features
+
+def FeatureDumpCode : SubtargetFeature <"DumpCode",
+ "DumpCode",
+ "true",
+ "Dump MachineInstrs in the CodeEmitter">;
+
+// Target features
+
+def FeatureFP64 : SubtargetFeature<"fp64",
+ "FP64",
+ "true",
+ "Enable 64bit double precision operations">;
+
+def Feature64BitPtr : SubtargetFeature<"64BitPtr",
+ "Is64bit",
+ "true",
+ "Specify if 64bit addressing should be used.">;
+
+def Feature32on64BitPtr : SubtargetFeature<"64on32BitPtr",
+ "Is32on64bit",
+ "false",
+ "Specify if 64bit sized pointers with 32bit addressing should be used.">;
+
+def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst",
+ "R600ALUInst",
+ "false",
+ "Older version of ALU instructions encoding.">;
+
+def FeatureVertexCache : SubtargetFeature<"HasVertexCache",
+ "HasVertexCache",
+ "true",
+ "Specify use of dedicated vertex cache.">;
+
+def FeatureCaymanISA : SubtargetFeature<"caymanISA",
+ "CaymanISA",
+ "true",
+ "Use Cayman ISA">;
+
+class SubtargetFeatureFetchLimit <string Value> :
+ SubtargetFeature <"fetch"#Value,
+ "TexVTXClauseSize",
+ Value,
+ "Limit the maximum number of fetches in a clause to "#Value>;
+
+def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">;
+def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">;
+
+class SubtargetFeatureGeneration <string Value,
+ list<SubtargetFeature> Implies> :
+ SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value,
+ Value#" GPU generation", Implies>;
+
+def FeatureR600 : SubtargetFeatureGeneration<"R600",
+ [FeatureR600ALUInst, FeatureFetchLimit8]>;
+
+def FeatureR700 : SubtargetFeatureGeneration<"R700",
+ [FeatureFetchLimit16]>;
+
+def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN",
+ [FeatureFetchLimit16]>;
+
+def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
+ [FeatureFetchLimit16]>;
+
+def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
+ [Feature64BitPtr, FeatureFP64]>;
+
+//===----------------------------------------------------------------------===//
def AMDGPUInstrInfo : InstrInfo {
let guessInstructionProperties = 1;
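A brief, hedged note on how these definitions are consumed: each SubtargetFeature names an AMDGPUSubtarget member (FP64, Gen, TexVTXClauseSize, and so on) that the TableGen-generated ParseSubtargetFeatures() assigns from the feature string, after which target code only queries the getters added later in this patch. An illustrative helper (hypothetical):

    #include "AMDGPUSubtarget.h"

    // True when the subtarget can use hardware double precision, per the
    // FeatureFP64 / FeatureSouthernIslands definitions above.
    static bool wantsDoubleOps(const llvm::AMDGPUSubtarget &ST) {
      return ST.getGeneration() >= llvm::AMDGPUSubtarget::SOUTHERN_ISLANDS &&
             ST.hasHWFP64();
    }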
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
index c915f50..f720c7e 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -19,11 +19,12 @@
#include "AMDGPUAsmPrinter.h"
#include "AMDGPU.h"
+#include "R600Defines.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
-#include "R600MachineFunctionInfo.h"
-#include "R600RegisterInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
@@ -62,7 +63,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
ELF::SHT_PROGBITS, 0,
SectionKind::getReadOnly());
OutStreamer.SwitchSection(ConfigSection);
- if (STM.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
+ if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
EmitProgramInfoSI(MF);
} else {
EmitProgramInfoR600(MF);
@@ -78,6 +79,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(MachineFunction &MF) {
const R600RegisterInfo * RI =
static_cast<const R600RegisterInfo*>(TM.getRegisterInfo());
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+ const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
BB != BB_E; ++BB) {
@@ -101,9 +103,33 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(MachineFunction &MF) {
}
}
}
- OutStreamer.EmitIntValue(MaxGPR + 1, 4);
- OutStreamer.EmitIntValue(MFI->StackSize, 4);
- OutStreamer.EmitIntValue(killPixel, 4);
+
+ unsigned RsrcReg;
+ if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) {
+ // Evergreen / Northern Islands
+ switch (MFI->ShaderType) {
+ default: // Fall through
+ case ShaderType::COMPUTE: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
+ case ShaderType::GEOMETRY: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
+ case ShaderType::PIXEL: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
+ case ShaderType::VERTEX: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
+ }
+ } else {
+ // R600 / R700
+ switch (MFI->ShaderType) {
+ default: // Fall through
+ case ShaderType::GEOMETRY: // Fall through
+ case ShaderType::COMPUTE: // Fall through
+ case ShaderType::VERTEX: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
+ case ShaderType::PIXEL: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
+ }
+ }
+
+ OutStreamer.EmitIntValue(RsrcReg, 4);
+ OutStreamer.EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
+ S_STACK_SIZE(MFI->StackSize), 4);
+ OutStreamer.EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
+ OutStreamer.EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
}
void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) {
diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td
index 9c30515..84e4f3a 100644
--- a/lib/Target/R600/AMDGPUCallingConv.td
+++ b/lib/Target/R600/AMDGPUCallingConv.td
@@ -32,17 +32,21 @@ def CC_SI : CallingConv<[
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31
- ]>>>,
+ ]>>>
+
+]>;
- // This is the default for i64 values.
- // XXX: We should change this once clang understands the CC_AMDGPU.
- CCIfType<[i64], CCAssignToRegWithShadow<
- [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ],
- [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ]
- >>
+// Calling convention for SI compute kernels
+def CC_SI_Kernel : CallingConv<[
+ CCIfType<[i64], CCAssignToStack <8, 4>>,
+ CCIfType<[i32, f32], CCAssignToStack <4, 4>>,
+ CCIfType<[i16], CCAssignToStack <2, 4>>,
+ CCIfType<[i8], CCAssignToStack <1, 4>>
]>;
def CC_AMDGPU : CallingConv<[
- CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>().device()"#
- "->getGeneration() == AMDGPUDeviceInfo::HD7XXX", CCDelegateTo<CC_SI>>
+ CCIf<"State.getMachineFunction().getInfo<SIMachineFunctionInfo>()->"#
+ "ShaderType == ShaderType::COMPUTE", CCDelegateTo<CC_SI_Kernel>>,
+ CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>()"#
+ ".getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS", CCDelegateTo<CC_SI>>
]>;
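As a hedged worked example of CC_SI_Kernel, assuming arguments are assigned in declaration order starting at byte offset 0: for a kernel taking (i32, i64, i16), the i32 lands at offset 0 (size 4, align 4), the i64 at offset 4 (size 8, align 4), and the i16 at offset 12 (size 2, align 4), leaving 14 as the next free offset.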
diff --git a/lib/Target/R600/AMDGPUFrameLowering.cpp b/lib/Target/R600/AMDGPUFrameLowering.cpp
index 815d6f7..40f14d2 100644
--- a/lib/Target/R600/AMDGPUFrameLowering.cpp
+++ b/lib/Target/R600/AMDGPUFrameLowering.cpp
@@ -78,27 +78,8 @@ int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
int UpperBound = FI == -1 ? MFI->getNumObjects() : FI;
for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) {
- const AllocaInst *Alloca = MFI->getObjectAllocation(i);
- unsigned ArrayElements;
- const Type *AllocaType = Alloca->getAllocatedType();
- const Type *ElementType;
-
- if (AllocaType->isArrayTy()) {
- ArrayElements = AllocaType->getArrayNumElements();
- ElementType = AllocaType->getArrayElementType();
- } else {
- ArrayElements = 1;
- ElementType = AllocaType;
- }
-
- unsigned VectorElements;
- if (ElementType->isVectorTy()) {
- VectorElements = ElementType->getVectorNumElements();
- } else {
- VectorElements = 1;
- }
-
- Offset += (VectorElements / getStackWidth(MF)) * ArrayElements;
+ unsigned Size = MFI->getObjectSize(i);
+ Offset += (Size / (getStackWidth(MF) * 4));
}
return Offset;
}
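As a hedged worked example of the simplified computation above: with getStackWidth(MF) == 1 (4-byte slots), an 8-byte stack object advances Offset by 8 / (1 * 4) = 2 slots, while with a stack width of 4 a 16-byte vector object advances it by 16 / (4 * 4) = 1 slot, which agrees with the removed per-alloca element counting for the simple cases.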
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index a266df5..02d6fab 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -14,9 +14,11 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUISelLowering.h"
+#include "AMDGPU.h"
#include "AMDGPURegisterInfo.h"
-#include "AMDILIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
+#include "AMDILIntrinsicInfo.h"
+#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -46,6 +48,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
setOperationAction(ISD::FRINT, MVT::f32, Legal);
+ // The hardware supports ROTR, but not ROTL
+ setOperationAction(ISD::ROTL, MVT::i32, Expand);
+
// Lower floating point store/load to integer store/load to reduce the number
// of patterns in tablegen.
setOperationAction(ISD::STORE, MVT::f32, Promote);
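Expanding ROTL is inexpensive because a left rotate can always be rephrased in terms of the right rotate the hardware does provide; a hedged C++ sketch of the identity (the legalizer may expand ROTL into shifts and ors directly, the identity just shows the two are interchangeable):

    #include <cstdint>

    // For 32-bit values, rotl(x, n) == rotr(x, (32 - n) & 31).
    static uint32_t rotr32(uint32_t x, uint32_t n) {
      n &= 31;
      return (x >> n) | (x << ((32 - n) & 31));
    }
    static uint32_t rotl32(uint32_t x, uint32_t n) {
      return rotr32(x, (32 - n) & 31);
    }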
@@ -83,7 +88,7 @@ SDValue AMDGPUTargetLowering::LowerReturn(
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc DL, SelectionDAG &DAG) const {
+ SDLoc DL, SelectionDAG &DAG) const {
return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain);
}
@@ -114,7 +119,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
EVT VT = Op.getValueType();
switch (IntrinsicID) {
@@ -154,7 +159,7 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op,
SelectionDAG &DAG) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
Op.getOperand(1));
@@ -166,7 +171,7 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op,
/// LRP(a, b, c) = muladd(a, b, (1 - a) * c)
SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
SelectionDAG &DAG) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT,
DAG.getConstantFP(1.0f, MVT::f32),
@@ -181,7 +186,7 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
/// \brief Generate Min/Max node
SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op,
SelectionDAG &DAG) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue LHS = Op.getOperand(0);
@@ -242,7 +247,7 @@ SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op,
SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
SelectionDAG &DAG) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue Num = Op.getOperand(0);
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index c2a79ea..69a0ac9 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -33,8 +33,9 @@ protected:
/// MachineFunction.
///
/// \returns a RegisterSDNode representing Reg.
- SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC,
- unsigned Reg, EVT VT) const;
+ virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
+ const TargetRegisterClass *RC,
+ unsigned Reg, EVT VT) const;
bool isHWTrueValue(SDValue Op) const;
bool isHWFalseValue(SDValue Op) const;
@@ -49,7 +50,7 @@ public:
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc DL, SelectionDAG &DAG) const;
+ SDLoc DL, SelectionDAG &DAG) const;
virtual SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
CLI.Callee.dump();
@@ -115,8 +116,6 @@ enum {
RET_FLAG,
BRANCH_COND,
// End AMDIL ISD Opcodes
- BITALIGN,
- BUFFER_STORE,
DWORDADDR,
FRACT,
FMAX,
@@ -126,6 +125,8 @@ enum {
SMIN,
UMIN,
URECIP,
+ DOT4,
+ TEXTURE_FETCH,
EXPORT,
CONST_ADDRESS,
REGISTER_LOAD,
diff --git a/lib/Target/R600/AMDGPUIndirectAddressing.cpp b/lib/Target/R600/AMDGPUIndirectAddressing.cpp
index ed6c8ec..3ce3ecf 100644
--- a/lib/Target/R600/AMDGPUIndirectAddressing.cpp
+++ b/lib/Target/R600/AMDGPUIndirectAddressing.cpp
@@ -39,7 +39,7 @@ private:
public:
AMDGPUIndirectAddressingPass(TargetMachine &tm) :
MachineFunctionPass(ID),
- TII(static_cast<const AMDGPUInstrInfo*>(tm.getInstrInfo()))
+ TII(0)
{ }
virtual bool runOnMachineFunction(MachineFunction &MF);
@@ -59,6 +59,8 @@ FunctionPass *llvm::createAMDGPUIndirectAddressingPass(TargetMachine &tm) {
bool AMDGPUIndirectAddressingPass::runOnMachineFunction(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
+ TII = static_cast<const AMDGPUInstrInfo*>(MF.getTarget().getInstrInfo());
+
int IndirectBegin = TII->getIndirectIndexBegin(MF);
int IndirectEnd = TII->getIndirectIndexEnd(MF);
@@ -224,7 +226,7 @@ bool AMDGPUIndirectAddressingPass::runOnMachineFunction(MachineFunction &MF) {
unsigned LiveAddress = RegisterAddressMap[Reg];
// Chain the live-ins
if (LiveAddressRegisterMap.find(LiveAddress) !=
- RegisterAddressMap.end()) {
+ LiveAddressRegisterMap.end()) {
MI.addOperand(MachineOperand::CreateReg(
LiveAddressRegisterMap[LiveAddress],
false, // isDef
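The one-word change above fixes a classic container bug: the result of find() on LiveAddressRegisterMap was being compared against RegisterAddressMap's end() iterator. A generic, hedged illustration of the idiom (std::map stands in for whatever map type the pass actually uses):

    #include <map>

    static bool hasLiveAddress(const std::map<unsigned, unsigned> &LiveMap,
                               unsigned Address) {
      // Compare the find() result against the end() of the same container
      // that was searched; mixing containers compiles but gives meaningless
      // answers, which is exactly the bug fixed above.
      return LiveMap.find(Address) != LiveMap.end();
    }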
diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp
index 30f736c..31b3002 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.cpp
+++ b/lib/Target/R600/AMDGPUInstrInfo.cpp
@@ -16,7 +16,6 @@
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUTargetMachine.h"
-#include "AMDIL.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -28,7 +27,7 @@
using namespace llvm;
AMDGPUInstrInfo::AMDGPUInstrInfo(TargetMachine &tm)
- : AMDGPUGenInstrInfo(0,0), RI(tm, *this), TM(tm) { }
+ : AMDGPUGenInstrInfo(0,0), RI(tm), TM(tm) { }
const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const {
return RI;
@@ -99,27 +98,6 @@ bool AMDGPUInstrInfo::getNextBranchInstr(MachineBasicBlock::iterator &iter,
return false;
}
-MachineBasicBlock::iterator skipFlowControl(MachineBasicBlock *MBB) {
- MachineBasicBlock::iterator tmp = MBB->end();
- if (!MBB->size()) {
- return MBB->end();
- }
- while (--tmp) {
- if (tmp->getOpcode() == AMDGPU::ENDLOOP
- || tmp->getOpcode() == AMDGPU::ENDIF
- || tmp->getOpcode() == AMDGPU::ELSE) {
- if (tmp == MBB->begin()) {
- return tmp;
- } else {
- continue;
- }
- } else {
- return ++tmp;
- }
- }
- return MBB->end();
-}
-
void
AMDGPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td
index b66ae87..48d89dd 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.td
+++ b/lib/Target/R600/AMDGPUInstrInfo.td
@@ -23,12 +23,6 @@ def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [
// AMDGPU DAG Nodes
//
-// out = ((a << 32) | b) >> c)
-//
-// Can be used to optimize rtol:
-// rotl(a, b) = bitalign(a, a, 32 - b)
-def AMDGPUbitalign : SDNode<"AMDGPUISD::BITALIGN", AMDGPUDTIntTernaryOp>;
-
// This argument to this node is a dword address.
def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>;
@@ -71,8 +65,6 @@ def AMDGPUumin : SDNode<"AMDGPUISD::UMIN", SDTIntBinOp,
// e is rounding error
def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>;
-def fpow : SDNode<"ISD::FPOW", SDTFPBinOp>;
-
def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD",
SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
[SDNPHasChain, SDNPMayLoad]>;
diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index 83e1359..29df374 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -90,6 +90,10 @@ def zextloadi8_global : PatFrag<(ops node:$ptr), (zextloadi8 node:$ptr), [{
return isGlobalLoad(dyn_cast<LoadSDNode>(N));
}]>;
+def zextloadi8_constant : PatFrag<(ops node:$ptr), (zextloadi8 node:$ptr), [{
+ return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
class Constants {
int TWO_PI = 0x40c90fdb;
int PI = 0x40490fdb;
@@ -276,6 +280,31 @@ multiclass BFIPatterns <Instruction BFI_INT> {
}
+// SHA-256 Ma patterns
+
+// ((x & z) | (y & (x | z))) -> BFI_INT (XOR x, y), z, y
+class SHA256MaPattern <Instruction BFI_INT, Instruction XOR> : Pat <
+ (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))),
+ (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y)
+>;
+
+// Bitfield extract patterns
+
+def legalshift32 : ImmLeaf <i32, [{return Imm >=0 && Imm < 32;}]>;
+def bfemask : PatLeaf <(imm), [{return isMask_32(N->getZExtValue());}],
+ SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(CountTrailingOnes_32(N->getZExtValue()), MVT::i32);}]>>;
+
+class BFEPattern <Instruction BFE> : Pat <
+ (and (srl i32:$x, legalshift32:$y), bfemask:$z),
+ (BFE $x, $y, $z)
+>;
+
+// rotr pattern
+class ROTRPattern <Instruction BIT_ALIGN> : Pat <
+ (rotr i32:$src0, i32:$src1),
+ (BIT_ALIGN $src0, $src0, $src1)
+>;
+
include "R600Instructions.td"
include "SIInstrInfo.td"
diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp
index fe994d2..3402092 100644
--- a/lib/Target/R600/AMDGPURegisterInfo.cpp
+++ b/lib/Target/R600/AMDGPURegisterInfo.cpp
@@ -17,11 +17,9 @@
using namespace llvm;
-AMDGPURegisterInfo::AMDGPURegisterInfo(TargetMachine &tm,
- const TargetInstrInfo &tii)
+AMDGPURegisterInfo::AMDGPURegisterInfo(TargetMachine &tm)
: AMDGPUGenRegisterInfo(0),
- TM(tm),
- TII(tii)
+ TM(tm)
{ }
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h
index 1fc88e7..7cbd34b 100644
--- a/lib/Target/R600/AMDGPURegisterInfo.h
+++ b/lib/Target/R600/AMDGPURegisterInfo.h
@@ -30,10 +30,9 @@ class TargetInstrInfo;
struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
TargetMachine &TM;
- const TargetInstrInfo &TII;
static const uint16_t CalleeSavedReg;
- AMDGPURegisterInfo(TargetMachine &tm, const TargetInstrInfo &tii);
+ AMDGPURegisterInfo(TargetMachine &tm);
virtual BitVector getReservedRegs(const MachineFunction &MF) const {
assert(!"Unimplemented"); return BitVector();
diff --git a/lib/Target/R600/AMDGPURegisterInfo.td b/lib/Target/R600/AMDGPURegisterInfo.td
index b5aca03..835a146 100644
--- a/lib/Target/R600/AMDGPURegisterInfo.td
+++ b/lib/Target/R600/AMDGPURegisterInfo.td
@@ -14,7 +14,8 @@
let Namespace = "AMDGPU" in {
foreach Index = 0-15 in {
- def sub#Index : SubRegIndex;
+ // Indices are used in a variety of ways here, so don't set a size/offset.
+ def sub#Index : SubRegIndex<-1, -1>;
}
def INDIRECT_BASE_ADDR : Register <"INDIRECT_BASE_ADDR">;
diff --git a/lib/Target/R600/AMDGPUStructurizeCFG.cpp b/lib/Target/R600/AMDGPUStructurizeCFG.cpp
index dea43b8..d26783d 100644
--- a/lib/Target/R600/AMDGPUStructurizeCFG.cpp
+++ b/lib/Target/R600/AMDGPUStructurizeCFG.cpp
@@ -16,14 +16,14 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SCCIterator.h"
#include "llvm/Analysis/RegionInfo.h"
#include "llvm/Analysis/RegionIterator.h"
#include "llvm/Analysis/RegionPass.h"
#include "llvm/IR/Module.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
#include "llvm/Support/PatternMatch.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -353,7 +353,7 @@ Value *AMDGPUStructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx,
if (Term->isConditional()) {
Cond = Term->getCondition();
- if (Idx != Invert)
+ if (Idx != (unsigned)Invert)
Cond = invert(Cond);
}
return Cond;
diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
index a7e1d7b..8ed5a74 100644
--- a/lib/Target/R600/AMDGPUSubtarget.cpp
+++ b/lib/Target/R600/AMDGPUSubtarget.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUSubtarget.h"
+#include <stdio.h>
using namespace llvm;
@@ -25,8 +26,6 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) :
AMDGPUGenSubtargetInfo(TT, CPU, FS), DumpCode(false) {
InstrItins = getInstrItineraryForCPU(CPU);
- memset(CapsOverride, 0, sizeof(*CapsOverride)
- * AMDGPUDeviceInfo::MaxNumberCapabilities);
// Default card
StringRef GPU = CPU;
Is64bit = false;
@@ -34,22 +33,15 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) :
DefaultSize[1] = 1;
DefaultSize[2] = 1;
HasVertexCache = false;
+ TexVTXClauseSize = 0;
+ Gen = AMDGPUSubtarget::R600;
+ FP64 = false;
+ CaymanISA = false;
ParseSubtargetFeatures(GPU, FS);
DevName = GPU;
- Device = AMDGPUDeviceInfo::getDeviceFromName(DevName, this, Is64bit);
-}
-
-AMDGPUSubtarget::~AMDGPUSubtarget() {
- delete Device;
}
bool
-AMDGPUSubtarget::isOverride(AMDGPUDeviceInfo::Caps caps) const {
- assert(caps < AMDGPUDeviceInfo::MaxNumberCapabilities &&
- "Caps index is out of bounds!");
- return CapsOverride[caps];
-}
-bool
AMDGPUSubtarget::is64bit() const {
return Is64bit;
}
@@ -57,6 +49,22 @@ bool
AMDGPUSubtarget::hasVertexCache() const {
return HasVertexCache;
}
+short
+AMDGPUSubtarget::getTexVTXClauseSize() const {
+ return TexVTXClauseSize;
+}
+enum AMDGPUSubtarget::Generation
+AMDGPUSubtarget::getGeneration() const {
+ return Gen;
+}
+bool
+AMDGPUSubtarget::hasHWFP64() const {
+ return FP64;
+}
+bool
+AMDGPUSubtarget::hasCaymanISA() const {
+ return CaymanISA;
+}
bool
AMDGPUSubtarget::isTargetELF() const {
return false;
@@ -72,21 +80,28 @@ AMDGPUSubtarget::getDefaultSize(uint32_t dim) const {
std::string
AMDGPUSubtarget::getDataLayout() const {
- if (!Device) {
- return std::string("e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16"
- "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32"
- "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64"
- "-v96:128:128-v128:128:128-v192:256:256-v256:256:256"
- "-v512:512:512-v1024:1024:1024-v2048:2048:2048-a0:0:64");
- }
- return Device->getDataLayout();
+ std::string DataLayout = std::string(
+ "e"
+ "-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32"
+ "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128"
+ "-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+ "-n32:64"
+ );
+
+ if (hasHWFP64()) {
+ DataLayout.append("-f64:64:64");
+ }
+
+ if (is64bit()) {
+ DataLayout.append("-p:64:64:64");
+ } else {
+ DataLayout.append("-p:32:32:32");
+ }
+
+ return DataLayout;
}
std::string
AMDGPUSubtarget::getDeviceName() const {
return DevName;
}
-const AMDGPUDevice *
-AMDGPUSubtarget::device() const {
- return Device;
-}
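As a hedged example of what this now produces: for a Southern Islands subtarget, where hasHWFP64() and is64bit() are both true, the base string is extended with -f64:64:64 and -p:64:64:64, giving roughly

    e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64-f64:64:64-p:64:64:64

while an R600-class target instead ends with -p:32:32:32 and omits the f64 entry.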
diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
index b6501a4..8c65096 100644
--- a/lib/Target/R600/AMDGPUSubtarget.h
+++ b/lib/Target/R600/AMDGPUSubtarget.h
@@ -14,7 +14,7 @@
#ifndef AMDGPUSUBTARGET_H
#define AMDGPUSUBTARGET_H
-#include "AMDILDevice.h"
+#include "AMDGPU.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Target/TargetSubtargetInfo.h"
@@ -27,9 +27,16 @@
namespace llvm {
class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
+public:
+ enum Generation {
+ R600 = 0,
+ R700,
+ EVERGREEN,
+ NORTHERN_ISLANDS,
+ SOUTHERN_ISLANDS
+ };
+
private:
- bool CapsOverride[AMDGPUDeviceInfo::MaxNumberCapabilities];
- const AMDGPUDevice *Device;
size_t DefaultSize[3];
std::string DevName;
bool Is64bit;
@@ -37,23 +44,28 @@ private:
bool DumpCode;
bool R600ALUInst;
bool HasVertexCache;
+ short TexVTXClauseSize;
+ enum Generation Gen;
+ bool FP64;
+ bool CaymanISA;
InstrItineraryData InstrItins;
public:
AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS);
- virtual ~AMDGPUSubtarget();
const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
virtual void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
- bool isOverride(AMDGPUDeviceInfo::Caps) const;
bool is64bit() const;
bool hasVertexCache() const;
+ short getTexVTXClauseSize() const;
+ enum Generation getGeneration() const;
+ bool hasHWFP64() const;
+ bool hasCaymanISA() const;
// Helper functions to simplify if statements
bool isTargetELF() const;
- const AMDGPUDevice* device() const;
std::string getDataLayout() const;
std::string getDeviceName() const;
virtual size_t getDefaultSize(uint32_t dim) const;
diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
index 0ec67ce..2fba434 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.cpp
+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
@@ -58,18 +58,19 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT,
LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel),
Subtarget(TT, CPU, FS),
Layout(Subtarget.getDataLayout()),
- FrameLowering(TargetFrameLowering::StackGrowsUp,
- Subtarget.device()->getStackAlignment(), 0),
+ FrameLowering(TargetFrameLowering::StackGrowsUp, 16 // Stack Alignment
+ , 0),
IntrinsicInfo(this),
InstrItins(&Subtarget.getInstrItineraryData()) {
// TLInfo uses InstrInfo so it must be initialized after.
- if (Subtarget.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
- InstrInfo = new R600InstrInfo(*this);
- TLInfo = new R600TargetLowering(*this);
+ if (Subtarget.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
+ InstrInfo.reset(new R600InstrInfo(*this));
+ TLInfo.reset(new R600TargetLowering(*this));
} else {
- InstrInfo = new SIInstrInfo(*this);
- TLInfo = new SITargetLowering(*this);
+ InstrInfo.reset(new SIInstrInfo(*this));
+ TLInfo.reset(new SITargetLowering(*this));
}
+ initAsmInfo();
}
AMDGPUTargetMachine::~AMDGPUTargetMachine() {
@@ -81,7 +82,7 @@ public:
AMDGPUPassConfig(AMDGPUTargetMachine *TM, PassManagerBase &PM)
: TargetPassConfig(TM, PM) {
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
- if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
+ if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
enablePass(&MachineSchedulerID);
MachineSchedRegistry::setDefault(createR600MachineScheduler);
}
@@ -107,19 +108,20 @@ TargetPassConfig *AMDGPUTargetMachine::createPassConfig(PassManagerBase &PM) {
bool
AMDGPUPassConfig::addPreISel() {
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
- if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
+ if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
addPass(createAMDGPUStructurizeCFGPass());
addPass(createSIAnnotateControlFlowPass());
+ } else {
+ addPass(createR600TextureIntrinsicsReplacer());
}
return false;
}
bool AMDGPUPassConfig::addInstSelector() {
- addPass(createAMDGPUPeepholeOpt(*TM));
addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
- if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
+ if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
// The callbacks this pass uses are not implemented yet on SI.
addPass(createAMDGPUIndirectAddressingPass(*TM));
}
@@ -128,13 +130,18 @@ bool AMDGPUPassConfig::addInstSelector() {
bool AMDGPUPassConfig::addPreRegAlloc() {
addPass(createAMDGPUConvertToISAPass(*TM));
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+
+ if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
+ addPass(createR600VectorRegMerger(*TM));
+ }
return false;
}
bool AMDGPUPassConfig::addPostRegAlloc() {
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
- if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
+ if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
addPass(createSIInsertWaits(*TM));
}
return false;
@@ -148,7 +155,7 @@ bool AMDGPUPassConfig::addPreSched2() {
bool AMDGPUPassConfig::addPreEmitPass() {
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
- if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
+ if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
addPass(createAMDGPUCFGPreparationPass(*TM));
addPass(createAMDGPUCFGStructurizerPass(*TM));
addPass(createR600EmitClauseMarkers(*TM));
diff --git a/lib/Target/R600/AMDGPUTargetMachine.h b/lib/Target/R600/AMDGPUTargetMachine.h
index 2afe787..bb26ed9 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.h
+++ b/lib/Target/R600/AMDGPUTargetMachine.h
@@ -25,7 +25,7 @@
namespace llvm {
-MCAsmInfo* createMCAsmInfo(const Target &T, StringRef TT);
+MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT);
class AMDGPUTargetMachine : public LLVMTargetMachine {
@@ -33,36 +33,36 @@ class AMDGPUTargetMachine : public LLVMTargetMachine {
const DataLayout Layout;
AMDGPUFrameLowering FrameLowering;
AMDGPUIntrinsicInfo IntrinsicInfo;
- const AMDGPUInstrInfo * InstrInfo;
- AMDGPUTargetLowering * TLInfo;
- const InstrItineraryData* InstrItins;
+ OwningPtr<AMDGPUInstrInfo> InstrInfo;
+ OwningPtr<AMDGPUTargetLowering> TLInfo;
+ const InstrItineraryData *InstrItins;
public:
- AMDGPUTargetMachine(const Target &T, StringRef TT, StringRef FS,
- StringRef CPU,
- TargetOptions Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL);
- ~AMDGPUTargetMachine();
- virtual const AMDGPUFrameLowering* getFrameLowering() const {
- return &FrameLowering;
- }
- virtual const AMDGPUIntrinsicInfo* getIntrinsicInfo() const {
- return &IntrinsicInfo;
- }
- virtual const AMDGPUInstrInfo *getInstrInfo() const {return InstrInfo;}
- virtual const AMDGPUSubtarget *getSubtargetImpl() const {return &Subtarget; }
- virtual const AMDGPURegisterInfo *getRegisterInfo() const {
- return &InstrInfo->getRegisterInfo();
- }
- virtual AMDGPUTargetLowering * getTargetLowering() const {
- return TLInfo;
- }
- virtual const InstrItineraryData* getInstrItineraryData() const {
- return InstrItins;
- }
- virtual const DataLayout* getDataLayout() const { return &Layout; }
- virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+ AMDGPUTargetMachine(const Target &T, StringRef TT, StringRef FS,
+ StringRef CPU, TargetOptions Options, Reloc::Model RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL);
+ ~AMDGPUTargetMachine();
+ virtual const AMDGPUFrameLowering *getFrameLowering() const {
+ return &FrameLowering;
+ }
+ virtual const AMDGPUIntrinsicInfo *getIntrinsicInfo() const {
+ return &IntrinsicInfo;
+ }
+ virtual const AMDGPUInstrInfo *getInstrInfo() const {
+ return InstrInfo.get();
+ }
+ virtual const AMDGPUSubtarget *getSubtargetImpl() const { return &Subtarget; }
+ virtual const AMDGPURegisterInfo *getRegisterInfo() const {
+ return &InstrInfo->getRegisterInfo();
+ }
+ virtual AMDGPUTargetLowering *getTargetLowering() const {
+ return TLInfo.get();
+ }
+ virtual const InstrItineraryData *getInstrItineraryData() const {
+ return InstrItins;
+ }
+ virtual const DataLayout *getDataLayout() const { return &Layout; }
+ virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
};
} // End namespace llvm
diff --git a/lib/Target/R600/AMDIL.h b/lib/Target/R600/AMDIL.h
deleted file mode 100644
index 39ab664..0000000
--- a/lib/Target/R600/AMDIL.h
+++ /dev/null
@@ -1,121 +0,0 @@
-//===-- AMDIL.h - Top-level interface for AMDIL representation --*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-/// This file contains the entry points for global functions defined in the LLVM
-/// AMDGPU back-end.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef AMDIL_H
-#define AMDIL_H
-
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/Target/TargetMachine.h"
-
-#define ARENA_SEGMENT_RESERVED_UAVS 12
-#define DEFAULT_ARENA_UAV_ID 8
-#define DEFAULT_RAW_UAV_ID 7
-#define GLOBAL_RETURN_RAW_UAV_ID 11
-#define HW_MAX_NUM_CB 8
-#define MAX_NUM_UNIQUE_UAVS 8
-#define OPENCL_MAX_NUM_ATOMIC_COUNTERS 8
-#define OPENCL_MAX_READ_IMAGES 128
-#define OPENCL_MAX_WRITE_IMAGES 8
-#define OPENCL_MAX_SAMPLERS 16
-
-// The next two values can never be zero, as zero is the ID that is
-// used to assert against.
-#define DEFAULT_LDS_ID 1
-#define DEFAULT_GDS_ID 1
-#define DEFAULT_SCRATCH_ID 1
-#define DEFAULT_VEC_SLOTS 8
-
-#define OCL_DEVICE_RV710 0x0001
-#define OCL_DEVICE_RV730 0x0002
-#define OCL_DEVICE_RV770 0x0004
-#define OCL_DEVICE_CEDAR 0x0008
-#define OCL_DEVICE_REDWOOD 0x0010
-#define OCL_DEVICE_JUNIPER 0x0020
-#define OCL_DEVICE_CYPRESS 0x0040
-#define OCL_DEVICE_CAICOS 0x0080
-#define OCL_DEVICE_TURKS 0x0100
-#define OCL_DEVICE_BARTS 0x0200
-#define OCL_DEVICE_CAYMAN 0x0400
-#define OCL_DEVICE_ALL 0x3FFF
-
-/// The number of function ID's that are reserved for
-/// internal compiler usage.
-const unsigned int RESERVED_FUNCS = 1024;
-
-namespace llvm {
-class AMDGPUInstrPrinter;
-class FunctionPass;
-class MCAsmInfo;
-class raw_ostream;
-class Target;
-class TargetMachine;
-
-// Instruction selection passes.
-FunctionPass*
- createAMDGPUISelDag(TargetMachine &TM);
-FunctionPass*
- createAMDGPUPeepholeOpt(TargetMachine &TM);
-
-// Pre emit passes.
-FunctionPass*
- createAMDGPUCFGPreparationPass(TargetMachine &TM);
-FunctionPass*
- createAMDGPUCFGStructurizerPass(TargetMachine &TM);
-
-extern Target TheAMDGPUTarget;
-} // end namespace llvm;
-
-// Include device information enumerations
-#include "AMDILDeviceInfo.h"
-
-namespace llvm {
-/// OpenCL uses address spaces to differentiate between
-/// various memory regions on the hardware. On the CPU
-/// all of the address spaces point to the same memory,
-/// however on the GPU, each address space points to
-/// a seperate piece of memory that is unique from other
-/// memory locations.
-namespace AMDGPUAS {
-enum AddressSpaces {
- PRIVATE_ADDRESS = 0, ///< Address space for private memory.
- GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
- CONSTANT_ADDRESS = 2, ///< Address space for constant memory
- LOCAL_ADDRESS = 3, ///< Address space for local memory.
- REGION_ADDRESS = 4, ///< Address space for region memory.
- ADDRESS_NONE = 5, ///< Address space for unknown memory.
- PARAM_D_ADDRESS = 6, ///< Address space for direct addressible parameter memory (CONST0)
- PARAM_I_ADDRESS = 7, ///< Address space for indirect addressible parameter memory (VTX1)
- CONSTANT_BUFFER_0 = 8,
- CONSTANT_BUFFER_1 = 9,
- CONSTANT_BUFFER_2 = 10,
- CONSTANT_BUFFER_3 = 11,
- CONSTANT_BUFFER_4 = 12,
- CONSTANT_BUFFER_5 = 13,
- CONSTANT_BUFFER_6 = 14,
- CONSTANT_BUFFER_7 = 15,
- CONSTANT_BUFFER_8 = 16,
- CONSTANT_BUFFER_9 = 17,
- CONSTANT_BUFFER_10 = 18,
- CONSTANT_BUFFER_11 = 19,
- CONSTANT_BUFFER_12 = 20,
- CONSTANT_BUFFER_13 = 21,
- CONSTANT_BUFFER_14 = 22,
- CONSTANT_BUFFER_15 = 23,
- LAST_ADDRESS = 24
-};
-
-} // namespace AMDGPUAS
-
-} // end namespace llvm
-#endif // AMDIL_H
diff --git a/lib/Target/R600/AMDIL7XXDevice.cpp b/lib/Target/R600/AMDIL7XXDevice.cpp
deleted file mode 100644
index ea6ac34..0000000
--- a/lib/Target/R600/AMDIL7XXDevice.cpp
+++ /dev/null
@@ -1,115 +0,0 @@
-//===-- AMDIL7XXDevice.cpp - Device Info for 7XX GPUs ---------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-// \file
-//==-----------------------------------------------------------------------===//
-#include "AMDIL7XXDevice.h"
-#include "AMDGPUSubtarget.h"
-#include "AMDILDevice.h"
-
-using namespace llvm;
-
-AMDGPU7XXDevice::AMDGPU7XXDevice(AMDGPUSubtarget *ST) : AMDGPUDevice(ST) {
- setCaps();
- std::string name = mSTM->getDeviceName();
- if (name == "rv710") {
- DeviceFlag = OCL_DEVICE_RV710;
- } else if (name == "rv730") {
- DeviceFlag = OCL_DEVICE_RV730;
- } else {
- DeviceFlag = OCL_DEVICE_RV770;
- }
-}
-
-AMDGPU7XXDevice::~AMDGPU7XXDevice() {
-}
-
-void AMDGPU7XXDevice::setCaps() {
- mSWBits.set(AMDGPUDeviceInfo::LocalMem);
-}
-
-size_t AMDGPU7XXDevice::getMaxLDSSize() const {
- if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
- return MAX_LDS_SIZE_700;
- }
- return 0;
-}
-
-size_t AMDGPU7XXDevice::getWavefrontSize() const {
- return AMDGPUDevice::HalfWavefrontSize;
-}
-
-uint32_t AMDGPU7XXDevice::getGeneration() const {
- return AMDGPUDeviceInfo::HD4XXX;
-}
-
-uint32_t AMDGPU7XXDevice::getResourceID(uint32_t DeviceID) const {
- switch (DeviceID) {
- default:
- assert(0 && "ID type passed in is unknown!");
- break;
- case GLOBAL_ID:
- case CONSTANT_ID:
- case RAW_UAV_ID:
- case ARENA_UAV_ID:
- break;
- case LDS_ID:
- if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
- return DEFAULT_LDS_ID;
- }
- break;
- case SCRATCH_ID:
- if (usesHardware(AMDGPUDeviceInfo::PrivateMem)) {
- return DEFAULT_SCRATCH_ID;
- }
- break;
- case GDS_ID:
- assert(0 && "GDS UAV ID is not supported on this chip");
- if (usesHardware(AMDGPUDeviceInfo::RegionMem)) {
- return DEFAULT_GDS_ID;
- }
- break;
- };
-
- return 0;
-}
-
-uint32_t AMDGPU7XXDevice::getMaxNumUAVs() const {
- return 1;
-}
-
-AMDGPU770Device::AMDGPU770Device(AMDGPUSubtarget *ST): AMDGPU7XXDevice(ST) {
- setCaps();
-}
-
-AMDGPU770Device::~AMDGPU770Device() {
-}
-
-void AMDGPU770Device::setCaps() {
- if (mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) {
- mSWBits.set(AMDGPUDeviceInfo::FMA);
- mHWBits.set(AMDGPUDeviceInfo::DoubleOps);
- }
- mSWBits.set(AMDGPUDeviceInfo::BarrierDetect);
- mHWBits.reset(AMDGPUDeviceInfo::LongOps);
- mSWBits.set(AMDGPUDeviceInfo::LongOps);
- mSWBits.set(AMDGPUDeviceInfo::LocalMem);
-}
-
-size_t AMDGPU770Device::getWavefrontSize() const {
- return AMDGPUDevice::WavefrontSize;
-}
-
-AMDGPU710Device::AMDGPU710Device(AMDGPUSubtarget *ST) : AMDGPU7XXDevice(ST) {
-}
-
-AMDGPU710Device::~AMDGPU710Device() {
-}
-
-size_t AMDGPU710Device::getWavefrontSize() const {
- return AMDGPUDevice::QuarterWavefrontSize;
-}
diff --git a/lib/Target/R600/AMDIL7XXDevice.h b/lib/Target/R600/AMDIL7XXDevice.h
deleted file mode 100644
index 1cf4ca4..0000000
--- a/lib/Target/R600/AMDIL7XXDevice.h
+++ /dev/null
@@ -1,72 +0,0 @@
-//==-- AMDIL7XXDevice.h - Define 7XX Device Device for AMDIL ---*- C++ -*--===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-/// \file
-/// \brief Interface for the subtarget data classes.
-///
-/// This file will define the interface that each generation needs to
-/// implement in order to correctly answer queries on the capabilities of the
-/// specific hardware.
-//===----------------------------------------------------------------------===//
-#ifndef AMDIL7XXDEVICEIMPL_H
-#define AMDIL7XXDEVICEIMPL_H
-#include "AMDILDevice.h"
-
-namespace llvm {
-class AMDGPUSubtarget;
-
-//===----------------------------------------------------------------------===//
-// 7XX generation of devices and their respective sub classes
-//===----------------------------------------------------------------------===//
-
-/// \brief The AMDGPU7XXDevice class represents the generic 7XX device.
-///
-/// All 7XX devices are derived from this class. The AMDGPU7XX device will only
-/// support the minimal features that are required to be considered OpenCL 1.0
-/// compliant and nothing more.
-class AMDGPU7XXDevice : public AMDGPUDevice {
-public:
- AMDGPU7XXDevice(AMDGPUSubtarget *ST);
- virtual ~AMDGPU7XXDevice();
- virtual size_t getMaxLDSSize() const;
- virtual size_t getWavefrontSize() const;
- virtual uint32_t getGeneration() const;
- virtual uint32_t getResourceID(uint32_t DeviceID) const;
- virtual uint32_t getMaxNumUAVs() const;
-
-protected:
- virtual void setCaps();
-};
-
-/// \brief The AMDGPU770Device class represents the RV770 chip and it's
-/// derivative cards.
-///
-/// The difference between this device and the base class is this device device
-/// adds support for double precision and has a larger wavefront size.
-class AMDGPU770Device : public AMDGPU7XXDevice {
-public:
- AMDGPU770Device(AMDGPUSubtarget *ST);
- virtual ~AMDGPU770Device();
- virtual size_t getWavefrontSize() const;
-private:
- virtual void setCaps();
-};
-
-/// \brief The AMDGPU710Device class derives from the 7XX base class.
-///
-/// This class is a smaller derivative, so we need to overload some of the
-/// functions in order to correctly specify this information.
-class AMDGPU710Device : public AMDGPU7XXDevice {
-public:
- AMDGPU710Device(AMDGPUSubtarget *ST);
- virtual ~AMDGPU710Device();
- virtual size_t getWavefrontSize() const;
-};
-
-} // namespace llvm
-#endif // AMDILDEVICEIMPL_H
diff --git a/lib/Target/R600/AMDILBase.td b/lib/Target/R600/AMDILBase.td
index e221110..5dcd478 100644
--- a/lib/Target/R600/AMDILBase.td
+++ b/lib/Target/R600/AMDILBase.td
@@ -16,70 +16,6 @@ def ALU_NULL : FuncUnit;
def NullALU : InstrItinClass;
//===----------------------------------------------------------------------===//
-// AMDIL Subtarget features.
-//===----------------------------------------------------------------------===//
-def FeatureFP64 : SubtargetFeature<"fp64",
- "CapsOverride[AMDGPUDeviceInfo::DoubleOps]",
- "true",
- "Enable 64bit double precision operations">;
-def FeatureByteAddress : SubtargetFeature<"byte_addressable_store",
- "CapsOverride[AMDGPUDeviceInfo::ByteStores]",
- "true",
- "Enable byte addressable stores">;
-def FeatureBarrierDetect : SubtargetFeature<"barrier_detect",
- "CapsOverride[AMDGPUDeviceInfo::BarrierDetect]",
- "true",
- "Enable duplicate barrier detection(HD5XXX or later).">;
-def FeatureImages : SubtargetFeature<"images",
- "CapsOverride[AMDGPUDeviceInfo::Images]",
- "true",
- "Enable image functions">;
-def FeatureMultiUAV : SubtargetFeature<"multi_uav",
- "CapsOverride[AMDGPUDeviceInfo::MultiUAV]",
- "true",
- "Generate multiple UAV code(HD5XXX family or later)">;
-def FeatureMacroDB : SubtargetFeature<"macrodb",
- "CapsOverride[AMDGPUDeviceInfo::MacroDB]",
- "true",
- "Use internal macrodb, instead of macrodb in driver">;
-def FeatureNoAlias : SubtargetFeature<"noalias",
- "CapsOverride[AMDGPUDeviceInfo::NoAlias]",
- "true",
- "assert that all kernel argument pointers are not aliased">;
-def FeatureNoInline : SubtargetFeature<"no-inline",
- "CapsOverride[AMDGPUDeviceInfo::NoInline]",
- "true",
- "specify whether to not inline functions">;
-
-def Feature64BitPtr : SubtargetFeature<"64BitPtr",
- "Is64bit",
- "false",
- "Specify if 64bit addressing should be used.">;
-
-def Feature32on64BitPtr : SubtargetFeature<"64on32BitPtr",
- "Is32on64bit",
- "false",
- "Specify if 64bit sized pointers with 32bit addressing should be used.">;
-def FeatureDebug : SubtargetFeature<"debug",
- "CapsOverride[AMDGPUDeviceInfo::Debug]",
- "true",
- "Debug mode is enabled, so disable hardware accelerated address spaces.">;
-def FeatureDumpCode : SubtargetFeature <"DumpCode",
- "DumpCode",
- "true",
- "Dump MachineInstrs in the CodeEmitter">;
-
-def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst",
- "R600ALUInst",
- "false",
- "Older version of ALU instructions encoding.">;
-
-def FeatureVertexCache : SubtargetFeature<"HasVertexCache",
- "HasVertexCache",
- "true",
- "Specify use of dedicated vertex cache.">;
-
-//===----------------------------------------------------------------------===//
// Register File, Calling Conv, Instruction Descriptions
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/R600/AMDILCFGStructurizer.cpp b/lib/Target/R600/AMDILCFGStructurizer.cpp
index b0cd0f9..4910e5d 100644
--- a/lib/Target/R600/AMDILCFGStructurizer.cpp
+++ b/lib/Target/R600/AMDILCFGStructurizer.cpp
@@ -11,8 +11,8 @@
#define DEBUGME 0
#define DEBUG_TYPE "structcfg"
+#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
-#include "AMDIL.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
@@ -28,9 +28,12 @@
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
+#define DEFAULT_VEC_SLOTS 8
+
// TODO: move-begin.
//===----------------------------------------------------------------------===//
@@ -57,7 +60,7 @@ STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions");
// Miscellaneous utility for CFGStructurizer.
//
//===----------------------------------------------------------------------===//
-namespace llvmCFGStruct {
+namespace {
#define SHOWNEWINSTR(i) \
if (DEBUGME) errs() << "New instr: " << *i << "\n"
@@ -98,7 +101,7 @@ void ReverseVector(SmallVector<NodeT *, DEFAULT_VEC_SLOTS> &Src) {
}
}
-} //end namespace llvmCFGStruct
+} // end anonymous namespace
//===----------------------------------------------------------------------===//
//
@@ -106,7 +109,7 @@ void ReverseVector(SmallVector<NodeT *, DEFAULT_VEC_SLOTS> &Src) {
//
//===----------------------------------------------------------------------===//
-namespace llvmCFGStruct {
+namespace {
template<class PassT>
struct CFGStructTraits {
};
@@ -142,7 +145,7 @@ public:
LandInformation() : landBlk(NULL) {}
};
-} //end of namespace llvmCFGStruct
+} // end anonymous namespace
//===----------------------------------------------------------------------===//
//
@@ -150,7 +153,7 @@ public:
//
//===----------------------------------------------------------------------===//
-namespace llvmCFGStruct {
+namespace {
// bixia TODO: port it to BasicBlock, not just MachineBasicBlock.
template<class PassT>
class CFGStructurizer {
@@ -2446,7 +2449,7 @@ CFGStructurizer<PassT>::findNearestCommonPostDom
return commonDom;
} //findNearestCommonPostDom
-} //end namespace llvm
+} // end anonymous namespace
//todo: move-end
@@ -2458,9 +2461,7 @@ CFGStructurizer<PassT>::findNearestCommonPostDom
//===----------------------------------------------------------------------===//
-using namespace llvmCFGStruct;
-
-namespace llvm {
+namespace {
class AMDGPUCFGStructurizer : public MachineFunctionPass {
public:
typedef MachineInstr InstructionType;
@@ -2474,26 +2475,26 @@ public:
protected:
TargetMachine &TM;
- const TargetInstrInfo *TII;
- const AMDGPURegisterInfo *TRI;
public:
AMDGPUCFGStructurizer(char &pid, TargetMachine &tm);
const TargetInstrInfo *getTargetInstrInfo() const;
-
-private:
-
+ const AMDGPURegisterInfo *getTargetRegisterInfo() const;
};
-} //end of namespace llvm
+} // end anonymous namespace
AMDGPUCFGStructurizer::AMDGPUCFGStructurizer(char &pid, TargetMachine &tm)
-: MachineFunctionPass(pid), TM(tm), TII(tm.getInstrInfo()),
- TRI(static_cast<const AMDGPURegisterInfo *>(tm.getRegisterInfo())) {
+ : MachineFunctionPass(pid), TM(tm) {
}
const TargetInstrInfo *AMDGPUCFGStructurizer::getTargetInstrInfo() const {
- return TII;
+ return TM.getInstrInfo();
+}
+
+const AMDGPURegisterInfo *AMDGPUCFGStructurizer::getTargetRegisterInfo() const {
+ return static_cast<const AMDGPURegisterInfo *>(TM.getRegisterInfo());
}
+
//===----------------------------------------------------------------------===//
//
// CFGPrepare
@@ -2501,9 +2502,7 @@ const TargetInstrInfo *AMDGPUCFGStructurizer::getTargetInstrInfo() const {
//===----------------------------------------------------------------------===//
-using namespace llvmCFGStruct;
-
-namespace llvm {
+namespace {
class AMDGPUCFGPrepare : public AMDGPUCFGStructurizer {
public:
static char ID;
@@ -2515,13 +2514,10 @@ public:
virtual void getAnalysisUsage(AnalysisUsage &AU) const;
bool runOnMachineFunction(MachineFunction &F);
-
-private:
-
};
char AMDGPUCFGPrepare::ID = 0;
-} //end of namespace llvm
+} // end anonymous namespace
AMDGPUCFGPrepare::AMDGPUCFGPrepare(TargetMachine &tm)
: AMDGPUCFGStructurizer(ID, tm ) {
@@ -2545,9 +2541,7 @@ void AMDGPUCFGPrepare::getAnalysisUsage(AnalysisUsage &AU) const {
//===----------------------------------------------------------------------===//
-using namespace llvmCFGStruct;
-
-namespace llvm {
+namespace {
class AMDGPUCFGPerform : public AMDGPUCFGStructurizer {
public:
static char ID;
@@ -2557,13 +2551,10 @@ public:
virtual const char *getPassName() const;
virtual void getAnalysisUsage(AnalysisUsage &AU) const;
bool runOnMachineFunction(MachineFunction &F);
-
-private:
-
};
char AMDGPUCFGPerform::ID = 0;
-} //end of namespace llvm
+} // end anonymous namespace
AMDGPUCFGPerform::AMDGPUCFGPerform(TargetMachine &tm)
: AMDGPUCFGStructurizer(ID, tm) {
@@ -2587,7 +2578,7 @@ void AMDGPUCFGPerform::getAnalysisUsage(AnalysisUsage &AU) const {
//
//===----------------------------------------------------------------------===//
-namespace llvmCFGStruct {
+namespace {
// this class is tailored to the AMDGPU backend
template<>
struct CFGStructTraits<AMDGPUCFGStructurizer> {
@@ -3024,28 +3015,24 @@ struct CFGStructTraits<AMDGPUCFGStructurizer> {
return &pass.getAnalysis<MachineLoopInfo>();
}
}; // template class CFGStructTraits
-} //end of namespace llvm
+} // end anonymous namespace
// createAMDGPUCFGPreparationPass- Returns a pass
-FunctionPass *llvm::createAMDGPUCFGPreparationPass(TargetMachine &tm
- ) {
- return new AMDGPUCFGPrepare(tm );
+FunctionPass *llvm::createAMDGPUCFGPreparationPass(TargetMachine &tm) {
+ return new AMDGPUCFGPrepare(tm);
}
bool AMDGPUCFGPrepare::runOnMachineFunction(MachineFunction &func) {
- return llvmCFGStruct::CFGStructurizer<AMDGPUCFGStructurizer>().prepare(func,
- *this,
- TRI);
+ return CFGStructurizer<AMDGPUCFGStructurizer>().prepare(func, *this,
+ getTargetRegisterInfo());
}
// createAMDGPUCFGStructurizerPass- Returns a pass
-FunctionPass *llvm::createAMDGPUCFGStructurizerPass(TargetMachine &tm
- ) {
- return new AMDGPUCFGPerform(tm );
+FunctionPass *llvm::createAMDGPUCFGStructurizerPass(TargetMachine &tm) {
+ return new AMDGPUCFGPerform(tm);
}
bool AMDGPUCFGPerform::runOnMachineFunction(MachineFunction &func) {
- return llvmCFGStruct::CFGStructurizer<AMDGPUCFGStructurizer>().run(func,
- *this,
- TRI);
+ return CFGStructurizer<AMDGPUCFGStructurizer>().run(func, *this,
+ getTargetRegisterInfo());
}
diff --git a/lib/Target/R600/AMDILDevice.cpp b/lib/Target/R600/AMDILDevice.cpp
deleted file mode 100644
index db8e01e..0000000
--- a/lib/Target/R600/AMDILDevice.cpp
+++ /dev/null
@@ -1,132 +0,0 @@
-//===-- AMDILDevice.cpp - Base class for AMDIL Devices --------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-/// \file
-//==-----------------------------------------------------------------------===//
-#include "AMDILDevice.h"
-#include "AMDGPUSubtarget.h"
-
-using namespace llvm;
-// Default implementation for all of the classes.
-AMDGPUDevice::AMDGPUDevice(AMDGPUSubtarget *ST) : mSTM(ST) {
- mHWBits.resize(AMDGPUDeviceInfo::MaxNumberCapabilities);
- mSWBits.resize(AMDGPUDeviceInfo::MaxNumberCapabilities);
- setCaps();
- DeviceFlag = OCL_DEVICE_ALL;
-}
-
-AMDGPUDevice::~AMDGPUDevice() {
- mHWBits.clear();
- mSWBits.clear();
-}
-
-size_t AMDGPUDevice::getMaxGDSSize() const {
- return 0;
-}
-
-uint32_t
-AMDGPUDevice::getDeviceFlag() const {
- return DeviceFlag;
-}
-
-size_t AMDGPUDevice::getMaxNumCBs() const {
- if (usesHardware(AMDGPUDeviceInfo::ConstantMem)) {
- return HW_MAX_NUM_CB;
- }
-
- return 0;
-}
-
-size_t AMDGPUDevice::getMaxCBSize() const {
- if (usesHardware(AMDGPUDeviceInfo::ConstantMem)) {
- return MAX_CB_SIZE;
- }
-
- return 0;
-}
-
-size_t AMDGPUDevice::getMaxScratchSize() const {
- return 65536;
-}
-
-uint32_t AMDGPUDevice::getStackAlignment() const {
- return 16;
-}
-
-void AMDGPUDevice::setCaps() {
- mSWBits.set(AMDGPUDeviceInfo::HalfOps);
- mSWBits.set(AMDGPUDeviceInfo::ByteOps);
- mSWBits.set(AMDGPUDeviceInfo::ShortOps);
- mSWBits.set(AMDGPUDeviceInfo::HW64BitDivMod);
- if (mSTM->isOverride(AMDGPUDeviceInfo::NoInline)) {
- mSWBits.set(AMDGPUDeviceInfo::NoInline);
- }
- if (mSTM->isOverride(AMDGPUDeviceInfo::MacroDB)) {
- mSWBits.set(AMDGPUDeviceInfo::MacroDB);
- }
- if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) {
- mSWBits.set(AMDGPUDeviceInfo::ConstantMem);
- } else {
- mHWBits.set(AMDGPUDeviceInfo::ConstantMem);
- }
- if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) {
- mSWBits.set(AMDGPUDeviceInfo::PrivateMem);
- } else {
- mHWBits.set(AMDGPUDeviceInfo::PrivateMem);
- }
- if (mSTM->isOverride(AMDGPUDeviceInfo::BarrierDetect)) {
- mSWBits.set(AMDGPUDeviceInfo::BarrierDetect);
- }
- mSWBits.set(AMDGPUDeviceInfo::ByteLDSOps);
- mSWBits.set(AMDGPUDeviceInfo::LongOps);
-}
-
-AMDGPUDeviceInfo::ExecutionMode
-AMDGPUDevice::getExecutionMode(AMDGPUDeviceInfo::Caps Caps) const {
- if (mHWBits[Caps]) {
- assert(!mSWBits[Caps] && "Cannot set both SW and HW caps");
- return AMDGPUDeviceInfo::Hardware;
- }
-
- if (mSWBits[Caps]) {
- assert(!mHWBits[Caps] && "Cannot set both SW and HW caps");
- return AMDGPUDeviceInfo::Software;
- }
-
- return AMDGPUDeviceInfo::Unsupported;
-
-}
-
-bool AMDGPUDevice::isSupported(AMDGPUDeviceInfo::Caps Mode) const {
- return getExecutionMode(Mode) != AMDGPUDeviceInfo::Unsupported;
-}
-
-bool AMDGPUDevice::usesHardware(AMDGPUDeviceInfo::Caps Mode) const {
- return getExecutionMode(Mode) == AMDGPUDeviceInfo::Hardware;
-}
-
-bool AMDGPUDevice::usesSoftware(AMDGPUDeviceInfo::Caps Mode) const {
- return getExecutionMode(Mode) == AMDGPUDeviceInfo::Software;
-}
-
-std::string
-AMDGPUDevice::getDataLayout() const {
- std::string DataLayout = std::string(
- "e"
- "-p:32:32:32"
- "-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32"
- "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128"
- "-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048"
- "-n32:64"
- );
-
- if (usesHardware(AMDGPUDeviceInfo::DoubleOps)) {
- DataLayout.append("-f64:64:64");
- }
-
- return DataLayout;
-}
diff --git a/lib/Target/R600/AMDILDevice.h b/lib/Target/R600/AMDILDevice.h
deleted file mode 100644
index 97df98c..0000000
--- a/lib/Target/R600/AMDILDevice.h
+++ /dev/null
@@ -1,117 +0,0 @@
-//===---- AMDILDevice.h - Define Device Data for AMDGPU -----*- C++ -*------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief Interface for the subtarget data classes.
-//
-/// This file will define the interface that each generation needs to
-/// implement in order to correctly answer queries on the capabilities of the
-/// specific hardware.
-//===----------------------------------------------------------------------===//
-#ifndef AMDILDEVICEIMPL_H
-#define AMDILDEVICEIMPL_H
-#include "AMDIL.h"
-#include "llvm/ADT/BitVector.h"
-
-namespace llvm {
- class AMDGPUSubtarget;
- class MCStreamer;
-//===----------------------------------------------------------------------===//
-// Interface for data that is specific to a single device
-//===----------------------------------------------------------------------===//
-class AMDGPUDevice {
-public:
- AMDGPUDevice(AMDGPUSubtarget *ST);
- virtual ~AMDGPUDevice();
-
- // Enum values for the various memory types.
- enum {
- RAW_UAV_ID = 0,
- ARENA_UAV_ID = 1,
- LDS_ID = 2,
- GDS_ID = 3,
- SCRATCH_ID = 4,
- CONSTANT_ID = 5,
- GLOBAL_ID = 6,
- MAX_IDS = 7
- } IO_TYPE_IDS;
-
- /// \returns The max LDS size that the hardware supports. Size is in
- /// bytes.
- virtual size_t getMaxLDSSize() const = 0;
-
- /// \returns The max GDS size that the hardware supports if the GDS is
- /// supported by the hardware. Size is in bytes.
- virtual size_t getMaxGDSSize() const;
-
- /// \returns The max number of hardware constant address spaces that
- /// are supported by this device.
- virtual size_t getMaxNumCBs() const;
-
- /// \returns The max number of bytes a single hardware constant buffer
- /// can support. Size is in bytes.
- virtual size_t getMaxCBSize() const;
-
- /// \returns The max number of bytes allowed by the hardware scratch
- /// buffer. Size is in bytes.
- virtual size_t getMaxScratchSize() const;
-
- /// \brief Get the flag that corresponds to the device.
- virtual uint32_t getDeviceFlag() const;
-
- /// \returns The number of work-items that exist in a single hardware
- /// wavefront.
- virtual size_t getWavefrontSize() const = 0;
-
- /// \brief Get the generational name of this specific device.
- virtual uint32_t getGeneration() const = 0;
-
- /// \brief Get the stack alignment of this specific device.
- virtual uint32_t getStackAlignment() const;
-
- /// \brief Get the resource ID for this specific device.
- virtual uint32_t getResourceID(uint32_t DeviceID) const = 0;
-
- /// \brief Get the max number of UAV's for this device.
- virtual uint32_t getMaxNumUAVs() const = 0;
-
-
- // API utilizing more detailed capabilities of each family of
- // cards. If a capability is supported, then either usesHardware or
- // usesSoftware returns true. If usesHardware returns true, then
- // usesSoftware must return false for the same capability. Hardware
- // execution means that the feature is done natively by the hardware
- // and is not emulated by the software. Software execution means
- // that the feature could be done in the hardware, but there is
- // software that emulates it, possibly using the hardware for
- // support, since the hardware does not fully comply with OpenCL
- // specs.
-
- bool isSupported(AMDGPUDeviceInfo::Caps Mode) const;
- bool usesHardware(AMDGPUDeviceInfo::Caps Mode) const;
- bool usesSoftware(AMDGPUDeviceInfo::Caps Mode) const;
- virtual std::string getDataLayout() const;
- static const unsigned int MAX_LDS_SIZE_700 = 16384;
- static const unsigned int MAX_LDS_SIZE_800 = 32768;
- static const unsigned int WavefrontSize = 64;
- static const unsigned int HalfWavefrontSize = 32;
- static const unsigned int QuarterWavefrontSize = 16;
-protected:
- virtual void setCaps();
- BitVector mHWBits;
- llvm::BitVector mSWBits;
- AMDGPUSubtarget *mSTM;
- uint32_t DeviceFlag;
-private:
- AMDGPUDeviceInfo::ExecutionMode
- getExecutionMode(AMDGPUDeviceInfo::Caps Caps) const;
-};
-
-} // namespace llvm
-#endif // AMDILDEVICEIMPL_H
diff --git a/lib/Target/R600/AMDILDeviceInfo.cpp b/lib/Target/R600/AMDILDeviceInfo.cpp
deleted file mode 100644
index 1787959..0000000
--- a/lib/Target/R600/AMDILDeviceInfo.cpp
+++ /dev/null
@@ -1,96 +0,0 @@
-//===-- AMDILDeviceInfo.cpp - AMDILDeviceInfo class -----------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief Function that creates DeviceInfo from a device name and other information.
-//
-//==-----------------------------------------------------------------------===//
-#include "AMDILDevices.h"
-#include "AMDGPUSubtarget.h"
-
-using namespace llvm;
-namespace llvm {
-namespace AMDGPUDeviceInfo {
-
-AMDGPUDevice* getDeviceFromName(const std::string &deviceName,
- AMDGPUSubtarget *ptr,
- bool is64bit, bool is64on32bit) {
- if (deviceName.c_str()[2] == '7') {
- switch (deviceName.c_str()[3]) {
- case '1':
- return new AMDGPU710Device(ptr);
- case '7':
- return new AMDGPU770Device(ptr);
- default:
- return new AMDGPU7XXDevice(ptr);
- }
- } else if (deviceName == "cypress") {
-#if DEBUG
- assert(!is64bit && "This device does not support 64bit pointers!");
- assert(!is64on32bit && "This device does not support 64bit"
- " on 32bit pointers!");
-#endif
- return new AMDGPUCypressDevice(ptr);
- } else if (deviceName == "juniper") {
-#if DEBUG
- assert(!is64bit && "This device does not support 64bit pointers!");
- assert(!is64on32bit && "This device does not support 64bit"
- " on 32bit pointers!");
-#endif
- return new AMDGPUEvergreenDevice(ptr);
- } else if (deviceName == "redwood" || deviceName == "sumo") {
-#if DEBUG
- assert(!is64bit && "This device does not support 64bit pointers!");
- assert(!is64on32bit && "This device does not support 64bit"
- " on 32bit pointers!");
-#endif
- return new AMDGPURedwoodDevice(ptr);
- } else if (deviceName == "cedar") {
-#if DEBUG
- assert(!is64bit && "This device does not support 64bit pointers!");
- assert(!is64on32bit && "This device does not support 64bit"
- " on 32bit pointers!");
-#endif
- return new AMDGPUCedarDevice(ptr);
- } else if (deviceName == "barts" || deviceName == "turks") {
-#if DEBUG
- assert(!is64bit && "This device does not support 64bit pointers!");
- assert(!is64on32bit && "This device does not support 64bit"
- " on 32bit pointers!");
-#endif
- return new AMDGPUNIDevice(ptr);
- } else if (deviceName == "cayman") {
-#if DEBUG
- assert(!is64bit && "This device does not support 64bit pointers!");
- assert(!is64on32bit && "This device does not support 64bit"
- " on 32bit pointers!");
-#endif
- return new AMDGPUCaymanDevice(ptr);
- } else if (deviceName == "caicos") {
-#if DEBUG
- assert(!is64bit && "This device does not support 64bit pointers!");
- assert(!is64on32bit && "This device does not support 64bit"
- " on 32bit pointers!");
-#endif
- return new AMDGPUNIDevice(ptr);
- } else if (deviceName == "SI" ||
- deviceName == "tahiti" || deviceName == "pitcairn" ||
- deviceName == "verde" || deviceName == "oland") {
- return new AMDGPUSIDevice(ptr);
- } else {
-#if DEBUG
- assert(!is64bit && "This device does not support 64bit pointers!");
- assert(!is64on32bit && "This device does not support 64bit"
- " on 32bit pointers!");
-#endif
- return new AMDGPU7XXDevice(ptr);
- }
-}
-} // End namespace AMDGPUDeviceInfo
-} // End namespace llvm
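
The deleted factory picks the device class purely from the -mcpu string: names whose third character is '7' are dispatched on their fourth character, and anything unrecognised falls back to the generic 7XX device. A simplified, self-contained model of that dispatch (device classes collapsed to an enum, subclasses such as Cypress/Cedar/Redwood/Cayman folded into their families; the "rv710" spelling is an assumption, not shown in this hunk):

    #include <cassert>
    #include <string>

    enum DeviceKind { Dev7XX, Dev710, Dev770, Evergreen, NI, SI };

    static DeviceKind classifyDevice(const std::string &Name) {
      if (Name.size() > 3 && Name[2] == '7') {   // "rv7xx"-style names
        switch (Name[3]) {
        case '1': return Dev710;
        case '7': return Dev770;
        default:  return Dev7XX;
        }
      }
      if (Name == "cypress" || Name == "juniper" || Name == "redwood" ||
          Name == "sumo" || Name == "cedar")
        return Evergreen;
      if (Name == "barts" || Name == "turks" || Name == "caicos" ||
          Name == "cayman")
        return NI;
      if (Name == "SI" || Name == "tahiti" || Name == "pitcairn" ||
          Name == "verde" || Name == "oland")
        return SI;
      return Dev7XX;   // anything unrecognised falls back to the generic 7XX device
    }

    int main() {
      assert(classifyDevice("rv710") == Dev710);
      assert(classifyDevice("cayman") == NI);
      assert(classifyDevice("tahiti") == SI);
      return 0;
    }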
diff --git a/lib/Target/R600/AMDILDeviceInfo.h b/lib/Target/R600/AMDILDeviceInfo.h
deleted file mode 100644
index 4b2c3a5..0000000
--- a/lib/Target/R600/AMDILDeviceInfo.h
+++ /dev/null
@@ -1,88 +0,0 @@
-//===-- AMDILDeviceInfo.h - Constants for describing devices --------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-/// \file
-//==-----------------------------------------------------------------------===//
-#ifndef AMDILDEVICEINFO_H
-#define AMDILDEVICEINFO_H
-
-
-#include <string>
-
-namespace llvm {
- class AMDGPUDevice;
- class AMDGPUSubtarget;
- namespace AMDGPUDeviceInfo {
- /// Each capability can be executed using a hardware instruction,
- /// emulated with a sequence of software instructions, or not
- /// supported at all.
- enum ExecutionMode {
- Unsupported = 0, ///< Unsupported feature on the card (default value)
- /// This is the execution mode that is set if the feature is emulated in
- /// software.
- Software,
- /// This execution mode is set if the feature exists natively in hardware
- Hardware
- };
-
- enum Caps {
- HalfOps = 0x1, ///< Half float is supported or not.
- DoubleOps = 0x2, ///< Double is supported or not.
- ByteOps = 0x3, ///< Byte (char) is supported or not.
- ShortOps = 0x4, ///< Short is supported or not.
- LongOps = 0x5, ///< Long is supported or not.
- Images = 0x6, ///< Images are supported or not.
- ByteStores = 0x7, ///< ByteStores available(!HD4XXX).
- ConstantMem = 0x8, ///< Constant/CB memory.
- LocalMem = 0x9, ///< Local/LDS memory.
- PrivateMem = 0xA, ///< Scratch/Private/Stack memory.
- RegionMem = 0xB, ///< OCL GDS Memory Extension.
- FMA = 0xC, ///< Use HW FMA or SW FMA.
- ArenaSegment = 0xD, ///< Use for Arena UAV per pointer 12-1023.
- MultiUAV = 0xE, ///< Use for UAV per Pointer 0-7.
- Reserved0 = 0xF, ///< ReservedFlag
- NoAlias = 0x10, ///< Cached loads.
- Signed24BitOps = 0x11, ///< Peephole Optimization.
- /// Debug mode implies that no hardware features or optimizations
- /// are performed and that all memory accesses go through a single
- /// UAV (Arena on HD5XXX/HD6XXX and Raw on HD4XXX).
- Debug = 0x12,
- CachedMem = 0x13, ///< Cached mem is available or not.
- BarrierDetect = 0x14, ///< Detect duplicate barriers.
- Reserved1 = 0x15, ///< Reserved flag
- ByteLDSOps = 0x16, ///< Flag to specify if byte LDS ops are available.
- ArenaVectors = 0x17, ///< Flag to specify if vector loads from arena work.
- TmrReg = 0x18, ///< Flag to specify if Tmr register is supported.
- NoInline = 0x19, ///< Flag to specify that no inlining should occur.
- MacroDB = 0x1A, ///< Flag to specify that backend handles macrodb.
- HW64BitDivMod = 0x1B, ///< Flag for backend to generate 64bit div/mod.
- ArenaUAV = 0x1C, ///< Flag to specify that arena uav is supported.
- PrivateUAV = 0x1D, ///< Flag to specify that private memory uses uav's.
- /// If more capabilities are required, then
- /// this number needs to be increased.
- /// All capabilities must come before this
- /// number.
- MaxNumberCapabilities = 0x20
- };
- /// These have to be in order, with the older generations
- /// having the lower enumeration values.
- enum Generation {
- HD4XXX = 0, ///< 7XX based devices.
- HD5XXX, ///< Evergreen based devices.
- HD6XXX, ///< NI/Evergreen+ based devices.
- HD7XXX, ///< Southern Islands based devices.
- HDTEST, ///< Experimental feature testing device.
- HDNUMGEN
- };
-
-
- AMDGPUDevice*
- getDeviceFromName(const std::string &name, AMDGPUSubtarget *ptr,
- bool is64bit = false, bool is64on32bit = false);
- } // namespace AMDILDeviceInfo
-} // namespace llvm
-#endif // AMDILDEVICEINFO_H
diff --git a/lib/Target/R600/AMDILDevices.h b/lib/Target/R600/AMDILDevices.h
deleted file mode 100644
index 636fa6d..0000000
--- a/lib/Target/R600/AMDILDevices.h
+++ /dev/null
@@ -1,19 +0,0 @@
-//===-- AMDILDevices.h - Consolidate AMDIL Device headers -----------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-/// \file
-//==-----------------------------------------------------------------------===//
-#ifndef AMDIL_DEVICES_H
-#define AMDIL_DEVICES_H
-// Include all of the device specific header files
-#include "AMDIL7XXDevice.h"
-#include "AMDILDevice.h"
-#include "AMDILEvergreenDevice.h"
-#include "AMDILNIDevice.h"
-#include "AMDILSIDevice.h"
-
-#endif // AMDIL_DEVICES_H
diff --git a/lib/Target/R600/AMDILEvergreenDevice.cpp b/lib/Target/R600/AMDILEvergreenDevice.cpp
deleted file mode 100644
index c5213a0..0000000
--- a/lib/Target/R600/AMDILEvergreenDevice.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-//===-- AMDILEvergreenDevice.cpp - Device Info for Evergreen --------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-/// \file
-//==-----------------------------------------------------------------------===//
-#include "AMDILEvergreenDevice.h"
-
-using namespace llvm;
-
-AMDGPUEvergreenDevice::AMDGPUEvergreenDevice(AMDGPUSubtarget *ST)
-: AMDGPUDevice(ST) {
- setCaps();
- std::string name = ST->getDeviceName();
- if (name == "cedar") {
- DeviceFlag = OCL_DEVICE_CEDAR;
- } else if (name == "redwood") {
- DeviceFlag = OCL_DEVICE_REDWOOD;
- } else if (name == "cypress") {
- DeviceFlag = OCL_DEVICE_CYPRESS;
- } else {
- DeviceFlag = OCL_DEVICE_JUNIPER;
- }
-}
-
-AMDGPUEvergreenDevice::~AMDGPUEvergreenDevice() {
-}
-
-size_t AMDGPUEvergreenDevice::getMaxLDSSize() const {
- if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
- return MAX_LDS_SIZE_800;
- } else {
- return 0;
- }
-}
-size_t AMDGPUEvergreenDevice::getMaxGDSSize() const {
- if (usesHardware(AMDGPUDeviceInfo::RegionMem)) {
- return MAX_LDS_SIZE_800;
- } else {
- return 0;
- }
-}
-uint32_t AMDGPUEvergreenDevice::getMaxNumUAVs() const {
- return 12;
-}
-
-uint32_t AMDGPUEvergreenDevice::getResourceID(uint32_t id) const {
- switch(id) {
- default:
- assert(0 && "ID type passed in is unknown!");
- break;
- case CONSTANT_ID:
- case RAW_UAV_ID:
- return GLOBAL_RETURN_RAW_UAV_ID;
- case GLOBAL_ID:
- case ARENA_UAV_ID:
- return DEFAULT_ARENA_UAV_ID;
- case LDS_ID:
- if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
- return DEFAULT_LDS_ID;
- } else {
- return DEFAULT_ARENA_UAV_ID;
- }
- case GDS_ID:
- if (usesHardware(AMDGPUDeviceInfo::RegionMem)) {
- return DEFAULT_GDS_ID;
- } else {
- return DEFAULT_ARENA_UAV_ID;
- }
- case SCRATCH_ID:
- if (usesHardware(AMDGPUDeviceInfo::PrivateMem)) {
- return DEFAULT_SCRATCH_ID;
- } else {
- return DEFAULT_ARENA_UAV_ID;
- }
- };
- return 0;
-}
-
-size_t AMDGPUEvergreenDevice::getWavefrontSize() const {
- return AMDGPUDevice::WavefrontSize;
-}
-
-uint32_t AMDGPUEvergreenDevice::getGeneration() const {
- return AMDGPUDeviceInfo::HD5XXX;
-}
-
-void AMDGPUEvergreenDevice::setCaps() {
- mSWBits.set(AMDGPUDeviceInfo::ArenaSegment);
- mHWBits.set(AMDGPUDeviceInfo::ArenaUAV);
- mHWBits.set(AMDGPUDeviceInfo::HW64BitDivMod);
- mSWBits.reset(AMDGPUDeviceInfo::HW64BitDivMod);
- mSWBits.set(AMDGPUDeviceInfo::Signed24BitOps);
- if (mSTM->isOverride(AMDGPUDeviceInfo::ByteStores)) {
- mHWBits.set(AMDGPUDeviceInfo::ByteStores);
- }
- if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) {
- mSWBits.set(AMDGPUDeviceInfo::LocalMem);
- mSWBits.set(AMDGPUDeviceInfo::RegionMem);
- } else {
- mHWBits.set(AMDGPUDeviceInfo::LocalMem);
- mHWBits.set(AMDGPUDeviceInfo::RegionMem);
- }
- mHWBits.set(AMDGPUDeviceInfo::Images);
- if (mSTM->isOverride(AMDGPUDeviceInfo::NoAlias)) {
- mHWBits.set(AMDGPUDeviceInfo::NoAlias);
- }
- mHWBits.set(AMDGPUDeviceInfo::CachedMem);
- if (mSTM->isOverride(AMDGPUDeviceInfo::MultiUAV)) {
- mHWBits.set(AMDGPUDeviceInfo::MultiUAV);
- }
- mHWBits.set(AMDGPUDeviceInfo::ByteLDSOps);
- mSWBits.reset(AMDGPUDeviceInfo::ByteLDSOps);
- mHWBits.set(AMDGPUDeviceInfo::ArenaVectors);
- mHWBits.set(AMDGPUDeviceInfo::LongOps);
- mSWBits.reset(AMDGPUDeviceInfo::LongOps);
- mHWBits.set(AMDGPUDeviceInfo::TmrReg);
-}
-
-AMDGPUCypressDevice::AMDGPUCypressDevice(AMDGPUSubtarget *ST)
- : AMDGPUEvergreenDevice(ST) {
- setCaps();
-}
-
-AMDGPUCypressDevice::~AMDGPUCypressDevice() {
-}
-
-void AMDGPUCypressDevice::setCaps() {
- if (mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) {
- mHWBits.set(AMDGPUDeviceInfo::DoubleOps);
- mHWBits.set(AMDGPUDeviceInfo::FMA);
- }
-}
-
-
-AMDGPUCedarDevice::AMDGPUCedarDevice(AMDGPUSubtarget *ST)
- : AMDGPUEvergreenDevice(ST) {
- setCaps();
-}
-
-AMDGPUCedarDevice::~AMDGPUCedarDevice() {
-}
-
-void AMDGPUCedarDevice::setCaps() {
- mSWBits.set(AMDGPUDeviceInfo::FMA);
-}
-
-size_t AMDGPUCedarDevice::getWavefrontSize() const {
- return AMDGPUDevice::QuarterWavefrontSize;
-}
-
-AMDGPURedwoodDevice::AMDGPURedwoodDevice(AMDGPUSubtarget *ST)
- : AMDGPUEvergreenDevice(ST) {
- setCaps();
-}
-
-AMDGPURedwoodDevice::~AMDGPURedwoodDevice() {
-}
-
-void AMDGPURedwoodDevice::setCaps() {
- mSWBits.set(AMDGPUDeviceInfo::FMA);
-}
-
-size_t AMDGPURedwoodDevice::getWavefrontSize() const {
- return AMDGPUDevice::HalfWavefrontSize;
-}
diff --git a/lib/Target/R600/AMDILEvergreenDevice.h b/lib/Target/R600/AMDILEvergreenDevice.h
deleted file mode 100644
index ea90f77..0000000
--- a/lib/Target/R600/AMDILEvergreenDevice.h
+++ /dev/null
@@ -1,93 +0,0 @@
-//==- AMDILEvergreenDevice.h - Define Evergreen Device for AMDIL -*- C++ -*--=//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief Interface for the subtarget data classes.
-///
-/// This file will define the interface that each generation needs to
-/// implement in order to correctly answer queries on the capabilities of the
-/// specific hardware.
-//===----------------------------------------------------------------------===//
-#ifndef AMDILEVERGREENDEVICE_H
-#define AMDILEVERGREENDEVICE_H
-#include "AMDGPUSubtarget.h"
-#include "AMDILDevice.h"
-
-namespace llvm {
- class AMDGPUSubtarget;
-//===----------------------------------------------------------------------===//
-// Evergreen generation of devices and their respective sub classes
-//===----------------------------------------------------------------------===//
-
-
-/// \brief The AMDGPUEvergreenDevice is the base device class for all of the Evergreen
-/// series of cards.
-///
-/// This class contains information required to differentiate
-/// the Evergreen device from the generic AMDGPUDevice. This device represents
- /// the capabilities of the 'Juniper' cards, also known as the HD57XX.
-class AMDGPUEvergreenDevice : public AMDGPUDevice {
-public:
- AMDGPUEvergreenDevice(AMDGPUSubtarget *ST);
- virtual ~AMDGPUEvergreenDevice();
- virtual size_t getMaxLDSSize() const;
- virtual size_t getMaxGDSSize() const;
- virtual size_t getWavefrontSize() const;
- virtual uint32_t getGeneration() const;
- virtual uint32_t getMaxNumUAVs() const;
- virtual uint32_t getResourceID(uint32_t) const;
-protected:
- virtual void setCaps();
-};
-
- /// The AMDGPUCypressDevice is similar to the AMDGPUEvergreenDevice, except it has
-/// support for double precision operations. This device is used to represent
-/// both the Cypress and Hemlock cards, which are commercially known as HD58XX
-/// and HD59XX cards.
-class AMDGPUCypressDevice : public AMDGPUEvergreenDevice {
-public:
- AMDGPUCypressDevice(AMDGPUSubtarget *ST);
- virtual ~AMDGPUCypressDevice();
-private:
- virtual void setCaps();
-};
-
-
-/// \brief The AMDGPUCedarDevice is the class that represents all of the 'Cedar' based
-/// devices.
-///
-/// This class differs from the base AMDGPUEvergreenDevice in that the
-/// device is a ~quarter of the 'Juniper'. These are commercially known as the
-/// HD54XX and HD53XX series of cards.
-class AMDGPUCedarDevice : public AMDGPUEvergreenDevice {
-public:
- AMDGPUCedarDevice(AMDGPUSubtarget *ST);
- virtual ~AMDGPUCedarDevice();
- virtual size_t getWavefrontSize() const;
-private:
- virtual void setCaps();
-};
-
- /// \brief The AMDGPURedwoodDevice is the class that represents all of the 'Redwood' based
-/// devices.
-///
-/// This class differs from the base class, in that these devices are
-/// considered about half of a 'Juniper' device. These are commercially known as
-/// the HD55XX and HD56XX series of cards.
-class AMDGPURedwoodDevice : public AMDGPUEvergreenDevice {
-public:
- AMDGPURedwoodDevice(AMDGPUSubtarget *ST);
- virtual ~AMDGPURedwoodDevice();
- virtual size_t getWavefrontSize() const;
-private:
- virtual void setCaps();
-};
-
-} // namespace llvm
-#endif // AMDILEVERGREENDEVICE_H
diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp
index ba75a44..93432a2 100644
--- a/lib/Target/R600/AMDILISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp
@@ -14,14 +14,14 @@
#include "AMDGPUInstrInfo.h"
#include "AMDGPUISelLowering.h" // For AMDGPUISD
#include "AMDGPURegisterInfo.h"
-#include "AMDILDevices.h"
#include "R600InstrInfo.h"
#include "SIISelLowering.h"
#include "llvm/ADT/ValueMap.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Support/Compiler.h"
-#include "llvm/CodeGen/SelectionDAG.h"
#include <list>
#include <queue>
@@ -48,7 +48,10 @@ public:
private:
inline SDValue getSmallIPtrImm(unsigned Imm);
+ bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs,
+ const R600InstrInfo *TII, std::vector<unsigned> Cst);
bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
+ bool FoldDotOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
// Complex pattern selectors
bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2);
@@ -164,7 +167,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
default: break;
case ISD::BUILD_VECTOR: {
const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
- if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
+ if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
break;
}
// BUILD_VECTOR is usually lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
@@ -194,7 +197,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
case ISD::BUILD_PAIR: {
SDValue RC, SubReg0, SubReg1;
const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
- if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
+ if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
break;
}
if (N->getValueType(0) == MVT::i128) {
@@ -211,7 +214,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
N->getOperand(1), SubReg1 };
return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
- N->getDebugLoc(), N->getValueType(0), Ops);
+ SDLoc(N), N->getValueType(0), Ops);
}
case ISD::ConstantFP:
@@ -219,7 +222,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
// XXX: Custom immediate lowering not implemented yet. Instead we use
// pseudo instructions defined in SIInstructions.td
- if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
+ if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
break;
}
const R600InstrInfo *TII = static_cast<const R600InstrInfo*>(TM.getInstrInfo());
@@ -314,9 +317,23 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
// Fold operands of selected node
const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
- if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
+ if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
const R600InstrInfo *TII =
static_cast<const R600InstrInfo*>(TM.getInstrInfo());
+ if (Result && Result->isMachineOpcode() && Result->getMachineOpcode() == AMDGPU::DOT_4) {
+ bool IsModified = false;
+ do {
+ std::vector<SDValue> Ops;
+ for(SDNode::op_iterator I = Result->op_begin(), E = Result->op_end();
+ I != E; ++I)
+ Ops.push_back(*I);
+ IsModified = FoldDotOperands(Result->getMachineOpcode(), TII, Ops);
+ if (IsModified) {
+ Result = CurDAG->UpdateNodeOperands(Result, Ops.data(), Ops.size());
+ }
+ } while (IsModified);
+
+ }
if (Result && Result->isMachineOpcode() &&
!(TII->get(Result->getMachineOpcode()).TSFlags & R600_InstFlag::VECTOR)
&& TII->isALUInstr(Result->getMachineOpcode())) {
@@ -359,6 +376,43 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
return Result;
}
+bool AMDGPUDAGToDAGISel::FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg,
+ SDValue &Abs, const R600InstrInfo *TII,
+ std::vector<unsigned> Consts) {
+ switch (Src.getOpcode()) {
+ case AMDGPUISD::CONST_ADDRESS: {
+ SDValue CstOffset;
+ if (Src.getValueType().isVector() ||
+ !SelectGlobalValueConstantOffset(Src.getOperand(0), CstOffset))
+ return false;
+
+ ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset);
+ Consts.push_back(Cst->getZExtValue());
+ if (!TII->fitsConstReadLimitations(Consts))
+ return false;
+
+ Src = CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32);
+ Sel = CstOffset;
+ return true;
+ }
+ case ISD::FNEG:
+ Src = Src.getOperand(0);
+ Neg = CurDAG->getTargetConstant(1, MVT::i32);
+ return true;
+ case ISD::FABS:
+ if (!Abs.getNode())
+ return false;
+ Src = Src.getOperand(0);
+ Abs = CurDAG->getTargetConstant(1, MVT::i32);
+ return true;
+ case ISD::BITCAST:
+ Src = Src.getOperand(0);
+ return true;
+ default:
+ return false;
+ }
+}
+
bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode,
const R600InstrInfo *TII, std::vector<SDValue> &Ops) {
int OperandIdx[] = {
@@ -382,59 +436,101 @@ bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode,
-1
};
+ // Gather constant values
+ std::vector<unsigned> Consts;
+ for (unsigned j = 0; j < 3; j++) {
+ int SrcIdx = OperandIdx[j];
+ if (SrcIdx < 0)
+ break;
+ if (RegisterSDNode *Reg = dyn_cast<RegisterSDNode>(Ops[SrcIdx - 1])) {
+ if (Reg->getReg() == AMDGPU::ALU_CONST) {
+ ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Ops[SelIdx[j] - 1]);
+ Consts.push_back(Cst->getZExtValue());
+ }
+ }
+ }
+
for (unsigned i = 0; i < 3; i++) {
if (OperandIdx[i] < 0)
return false;
- SDValue Operand = Ops[OperandIdx[i] - 1];
- switch (Operand.getOpcode()) {
- case AMDGPUISD::CONST_ADDRESS: {
- SDValue CstOffset;
- if (Operand.getValueType().isVector() ||
- !SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset))
- break;
-
- // Gather others constants values
- std::vector<unsigned> Consts;
- for (unsigned j = 0; j < 3; j++) {
- int SrcIdx = OperandIdx[j];
- if (SrcIdx < 0)
- break;
- if (RegisterSDNode *Reg = dyn_cast<RegisterSDNode>(Ops[SrcIdx - 1])) {
- if (Reg->getReg() == AMDGPU::ALU_CONST) {
- ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Ops[SelIdx[j] - 1]);
- Consts.push_back(Cst->getZExtValue());
- }
- }
- }
+ SDValue &Src = Ops[OperandIdx[i] - 1];
+ SDValue &Sel = Ops[SelIdx[i] - 1];
+ SDValue &Neg = Ops[NegIdx[i] - 1];
+ SDValue FakeAbs;
+ SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
+ if (FoldOperand(Src, Sel, Neg, Abs, TII, Consts))
+ return true;
+ }
+ return false;
+}
- ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset);
- Consts.push_back(Cst->getZExtValue());
- if (!TII->fitsConstReadLimitations(Consts))
- break;
+bool AMDGPUDAGToDAGISel::FoldDotOperands(unsigned Opcode,
+ const R600InstrInfo *TII, std::vector<SDValue> &Ops) {
+ int OperandIdx[] = {
+ TII->getOperandIdx(Opcode, R600Operands::SRC0_X),
+ TII->getOperandIdx(Opcode, R600Operands::SRC0_Y),
+ TII->getOperandIdx(Opcode, R600Operands::SRC0_Z),
+ TII->getOperandIdx(Opcode, R600Operands::SRC0_W),
+ TII->getOperandIdx(Opcode, R600Operands::SRC1_X),
+ TII->getOperandIdx(Opcode, R600Operands::SRC1_Y),
+ TII->getOperandIdx(Opcode, R600Operands::SRC1_Z),
+ TII->getOperandIdx(Opcode, R600Operands::SRC1_W)
+ };
+ int SelIdx[] = {
+ TII->getOperandIdx(Opcode, R600Operands::SRC0_SEL_X),
+ TII->getOperandIdx(Opcode, R600Operands::SRC0_SEL_Y),
+ TII->getOperandIdx(Opcode, R600Operands::SRC0_SEL_Z),
+ TII->getOperandIdx(Opcode, R600Operands::SRC0_SEL_W),
+ TII->getOperandIdx(Opcode, R600Operands::SRC1_SEL_X),
+ TII->getOperandIdx(Opcode, R600Operands::SRC1_SEL_Y),
+ TII->getOperandIdx(Opcode, R600Operands::SRC1_SEL_Z),
+ TII->getOperandIdx(Opcode, R600Operands::SRC1_SEL_W)
+ };
+ int NegIdx[] = {
+ TII->getOperandIdx(Opcode, R600Operands::SRC0_NEG_X),
+ TII->getOperandIdx(Opcode, R600Operands::SRC0_NEG_Y),
+ TII->getOperandIdx(Opcode, R600Operands::SRC0_NEG_Z),
+ TII->getOperandIdx(Opcode, R600Operands::SRC0_NEG_W),
+ TII->getOperandIdx(Opcode, R600Operands::SRC1_NEG_X),
+ TII->getOperandIdx(Opcode, R600Operands::SRC1_NEG_Y),
+ TII->getOperandIdx(Opcode, R600Operands::SRC1_NEG_Z),
+ TII->getOperandIdx(Opcode, R600Operands::SRC1_NEG_W)
+ };
+ int AbsIdx[] = {
+ TII->getOperandIdx(Opcode, R600Operands::SRC0_ABS_X),
+ TII->getOperandIdx(Opcode, R600Operands::SRC0_ABS_Y),
+ TII->getOperandIdx(Opcode, R600Operands::SRC0_ABS_Z),
+ TII->getOperandIdx(Opcode, R600Operands::SRC0_ABS_W),
+ TII->getOperandIdx(Opcode, R600Operands::SRC1_ABS_X),
+ TII->getOperandIdx(Opcode, R600Operands::SRC1_ABS_Y),
+ TII->getOperandIdx(Opcode, R600Operands::SRC1_ABS_Z),
+ TII->getOperandIdx(Opcode, R600Operands::SRC1_ABS_W)
+ };
- Ops[OperandIdx[i] - 1] = CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32);
- Ops[SelIdx[i] - 1] = CstOffset;
- return true;
- }
- case ISD::FNEG:
- if (NegIdx[i] < 0)
- break;
- Ops[OperandIdx[i] - 1] = Operand.getOperand(0);
- Ops[NegIdx[i] - 1] = CurDAG->getTargetConstant(1, MVT::i32);
- return true;
- case ISD::FABS:
- if (AbsIdx[i] < 0)
- break;
- Ops[OperandIdx[i] - 1] = Operand.getOperand(0);
- Ops[AbsIdx[i] - 1] = CurDAG->getTargetConstant(1, MVT::i32);
- return true;
- case ISD::BITCAST:
- Ops[OperandIdx[i] - 1] = Operand.getOperand(0);
- return true;
- default:
+ // Gather constant values
+ std::vector<unsigned> Consts;
+ for (unsigned j = 0; j < 8; j++) {
+ int SrcIdx = OperandIdx[j];
+ if (SrcIdx < 0)
break;
+ if (RegisterSDNode *Reg = dyn_cast<RegisterSDNode>(Ops[SrcIdx - 1])) {
+ if (Reg->getReg() == AMDGPU::ALU_CONST) {
+ ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Ops[SelIdx[j] - 1]);
+ Consts.push_back(Cst->getZExtValue());
+ }
}
}
+
+ for (unsigned i = 0; i < 8; i++) {
+ if (OperandIdx[i] < 0)
+ return false;
+ SDValue &Src = Ops[OperandIdx[i] - 1];
+ SDValue &Sel = Ops[SelIdx[i] - 1];
+ SDValue &Neg = Ops[NegIdx[i] - 1];
+ SDValue &Abs = Ops[AbsIdx[i] - 1];
+ if (FoldOperand(Src, Sel, Neg, Abs, TII, Consts))
+ return true;
+ }
return false;
}
@@ -616,7 +712,7 @@ bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
} else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr))
&& isInt<16>(IMMOffset->getZExtValue())) {
Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
- CurDAG->getEntryNode().getDebugLoc(),
+ SDLoc(CurDAG->getEntryNode()),
AMDGPU::ZERO, MVT::i32);
Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), MVT::i32);
return true;
@@ -649,18 +745,45 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
+ if (Subtarget.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ return;
+ }
+
// Go over all selected nodes and try to fold them a bit more
- const AMDGPUTargetLowering& Lowering = ((const AMDGPUTargetLowering&)TLI);
+ const AMDGPUTargetLowering& Lowering = (*(const AMDGPUTargetLowering*)TLI);
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
E = CurDAG->allnodes_end(); I != E; ++I) {
- MachineSDNode *Node = dyn_cast<MachineSDNode>(I);
- if (!Node)
+ SDNode *Node = I;
+ switch (Node->getOpcode()) {
+ // Fix the register class in CopyToReg nodes - ISel will always
+ // use SReg classes for 64-bit copies, but this is not always what we want.
+ case ISD::CopyToReg: {
+ unsigned Reg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
+ SDValue Val = Node->getOperand(2);
+ const TargetRegisterClass *RC = RegInfo->getRegClass(Reg);
+ if (RC != &AMDGPU::SReg_64RegClass) {
+ continue;
+ }
+
+ if (!Val.getNode()->isMachineOpcode()) {
+ continue;
+ }
+
+ const MCInstrDesc Desc = TM.getInstrInfo()->get(Val.getNode()->getMachineOpcode());
+ const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+ RegInfo->setRegClass(Reg, TRI->getRegClass(Desc.OpInfo[0].RegClass));
continue;
+ }
+ }
- SDNode *ResNode = Lowering.PostISelFolding(Node, *CurDAG);
- if (ResNode != Node)
+ MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(I);
+ if (!MachineNode)
+ continue;
+
+ SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
+ if (ResNode != Node) {
ReplaceUses(Node, ResNode);
+ }
}
}
-
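
The refactor above extracts the per-source logic of FoldOperands() into a shared FoldOperand() helper so that the new FoldDotOperands() (used for DOT_4 nodes) can reuse it. The rules it applies: a CONST_ADDRESS source is rewritten to an ALU_CONST register plus a selector (subject to fitsConstReadLimitations), FNEG and FABS fold into the neg/abs modifier operands, and BITCAST is simply looked through. A self-contained sketch of those rules, with the SelectionDAG types replaced by plain structs (illustration only, not the patch's actual code):

    #include <cassert>

    // Toy stand-ins for the DAG nodes the real FoldOperand() inspects.
    enum Opcode { CONST_ADDRESS, FNEG, FABS, BITCAST, OTHER };

    struct Src {
      Opcode Op;
      const Src *Operand0;  // value under FNEG/FABS/BITCAST, else nullptr
      int ConstSel;         // constant-buffer selector for CONST_ADDRESS, else -1
    };

    struct FoldedOperand {
      const Src *Value;     // possibly rewritten source operand
      int Sel;              // SRC*_SEL field
      bool Neg, Abs;
      bool IsALUConst;      // stands in for rewriting the source to ALU_CONST
    };

    // Mirrors the shape of FoldOperand(): returns true if anything was folded.
    static bool foldOperand(FoldedOperand &F, bool HasAbsField) {
      switch (F.Value->Op) {
      case CONST_ADDRESS:
        F.IsALUConst = true;
        F.Sel = F.Value->ConstSel; // real code also checks fitsConstReadLimitations()
        return true;
      case FNEG:
        F.Value = F.Value->Operand0;
        F.Neg = true;
        return true;
      case FABS:
        if (!HasAbsField)          // some ALU sources have no abs modifier
          return false;
        F.Value = F.Value->Operand0;
        F.Abs = true;
        return true;
      case BITCAST:
        F.Value = F.Value->Operand0; // just look through the cast
        return true;
      default:
        return false;
      }
    }

    int main() {
      Src X = {OTHER, nullptr, -1};
      Src NegX = {FNEG, &X, -1};
      FoldedOperand F = {&NegX, 0, false, false, false};
      assert(foldOperand(F, true) && F.Neg && F.Value == &X);

      Src C = {CONST_ADDRESS, nullptr, 42};
      FoldedOperand G = {&C, 0, false, false, false};
      assert(foldOperand(G, true) && G.IsALUConst && G.Sel == 42);
      return 0;
    }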
diff --git a/lib/Target/R600/AMDILISelLowering.cpp b/lib/Target/R600/AMDILISelLowering.cpp
index 922cac1..d669966 100644
--- a/lib/Target/R600/AMDILISelLowering.cpp
+++ b/lib/Target/R600/AMDILISelLowering.cpp
@@ -15,7 +15,6 @@
#include "AMDGPUISelLowering.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
-#include "AMDILDevices.h"
#include "AMDILIntrinsicInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -138,8 +137,6 @@ void AMDGPUTargetLowering::InitAMDILLowering() {
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
- // GPU doesn't have a rotl, rotr, or byteswap instruction
- setOperationAction(ISD::ROTR, VT, Expand);
setOperationAction(ISD::BSWAP, VT, Expand);
// GPU doesn't have any counting operators
@@ -158,21 +155,19 @@ void AMDGPUTargetLowering::InitAMDILLowering() {
setOperationAction(ISD::SELECT_CC, VT, Expand);
}
- if (STM.device()->isSupported(AMDGPUDeviceInfo::LongOps)) {
- setOperationAction(ISD::MULHU, MVT::i64, Expand);
- setOperationAction(ISD::MULHU, MVT::v2i64, Expand);
- setOperationAction(ISD::MULHS, MVT::i64, Expand);
- setOperationAction(ISD::MULHS, MVT::v2i64, Expand);
- setOperationAction(ISD::ADD, MVT::v2i64, Expand);
- setOperationAction(ISD::SREM, MVT::v2i64, Expand);
- setOperationAction(ISD::Constant , MVT::i64 , Legal);
- setOperationAction(ISD::SDIV, MVT::v2i64, Expand);
- setOperationAction(ISD::TRUNCATE, MVT::v2i64, Expand);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Expand);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Expand);
- setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Expand);
- }
- if (STM.device()->isSupported(AMDGPUDeviceInfo::DoubleOps)) {
+ setOperationAction(ISD::MULHU, MVT::i64, Expand);
+ setOperationAction(ISD::MULHU, MVT::v2i64, Expand);
+ setOperationAction(ISD::MULHS, MVT::i64, Expand);
+ setOperationAction(ISD::MULHS, MVT::v2i64, Expand);
+ setOperationAction(ISD::ADD, MVT::v2i64, Expand);
+ setOperationAction(ISD::SREM, MVT::v2i64, Expand);
+ setOperationAction(ISD::Constant , MVT::i64 , Legal);
+ setOperationAction(ISD::SDIV, MVT::v2i64, Expand);
+ setOperationAction(ISD::TRUNCATE, MVT::v2i64, Expand);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Expand);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Expand);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Expand);
+ if (STM.hasHWFP64()) {
// we support loading/storing v2f64 but not operations on the type
setOperationAction(ISD::FADD, MVT::v2f64, Expand);
setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
@@ -331,7 +326,7 @@ SDValue
AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const {
SDValue Data = Op.getOperand(0);
VTSDNode *BaseType = cast<VTSDNode>(Op.getOperand(1));
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
EVT DVT = Data.getValueType();
EVT BVT = BaseType->getVT();
unsigned baseBits = BVT.getScalarType().getSizeInBits();
@@ -387,7 +382,7 @@ AMDGPUTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue Result;
Result = DAG.getNode(
AMDGPUISD::BRANCH_COND,
- Op.getDebugLoc(),
+ SDLoc(Op),
Op.getValueType(),
Chain, Jump, Cond);
return Result;
@@ -395,7 +390,7 @@ AMDGPUTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue
AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
EVT OVT = Op.getValueType();
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
@@ -476,7 +471,7 @@ AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const {
SDValue
AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
EVT OVT = Op.getValueType();
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
@@ -547,7 +542,7 @@ AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const {
SDValue
AMDGPUTargetLowering::LowerSREM8(SDValue Op, SelectionDAG &DAG) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
EVT OVT = Op.getValueType();
MVT INTTY = MVT::i32;
if (OVT == MVT::v2i8) {
@@ -564,7 +559,7 @@ AMDGPUTargetLowering::LowerSREM8(SDValue Op, SelectionDAG &DAG) const {
SDValue
AMDGPUTargetLowering::LowerSREM16(SDValue Op, SelectionDAG &DAG) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
EVT OVT = Op.getValueType();
MVT INTTY = MVT::i32;
if (OVT == MVT::v2i16) {
@@ -581,7 +576,7 @@ AMDGPUTargetLowering::LowerSREM16(SDValue Op, SelectionDAG &DAG) const {
SDValue
AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
EVT OVT = Op.getValueType();
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
diff --git a/lib/Target/R600/AMDILInstrInfo.td b/lib/Target/R600/AMDILInstrInfo.td
index 110f147..f7d0bd5 100644
--- a/lib/Target/R600/AMDILInstrInfo.td
+++ b/lib/Target/R600/AMDILInstrInfo.td
@@ -10,63 +10,6 @@
// This file describes the AMDIL instructions in TableGen format.
//
//===----------------------------------------------------------------------===//
-// AMDIL Instruction Predicate Definitions
-// Predicate that is set to true if the hardware supports double precision
-// divide
-def HasHWDDiv : Predicate<"Subtarget.device()"
- "->getGeneration() > AMDGPUDeviceInfo::HD4XXX && "
- "Subtarget.device()->usesHardware(AMDGPUDeviceInfo::DoubleOps)">;
-
-// Predicate that is set to true if the hardware supports double, but not double
-// precision divide in hardware
-def HasSWDDiv : Predicate<"Subtarget.device()"
- "->getGeneration() == AMDGPUDeviceInfo::HD4XXX &&"
- "Subtarget.device()->usesHardware(AMDGPUDeviceInfo::DoubleOps)">;
-
-// Predicate that is set to true if the hardware support 24bit signed
-// math ops. Otherwise a software expansion to 32bit math ops is used instead.
-def HasHWSign24Bit : Predicate<"Subtarget.device()"
- "->getGeneration() > AMDGPUDeviceInfo::HD5XXX">;
-
-// Predicate that is set to true if 64bit operations are supported or not
-def HasHW64Bit : Predicate<"Subtarget.device()"
- "->usesHardware(AMDGPUDeviceInfo::LongOps)">;
-def HasSW64Bit : Predicate<"Subtarget.device()"
- "->usesSoftware(AMDGPUDeviceInfo::LongOps)">;
-
-// Predicate that is set to true if the timer register is supported
-def HasTmrRegister : Predicate<"Subtarget.device()"
- "->isSupported(AMDGPUDeviceInfo::TmrReg)">;
-// Predicate that is true if we are at least evergreen series
-def HasDeviceIDInst : Predicate<"Subtarget.device()"
- "->getGeneration() >= AMDGPUDeviceInfo::HD5XXX">;
-
-// Predicate that is true if we have region address space.
-def hasRegionAS : Predicate<"Subtarget.device()"
- "->usesHardware(AMDGPUDeviceInfo::RegionMem)">;
-
-// Predicate that is false if we don't have region address space.
-def noRegionAS : Predicate<"!Subtarget.device()"
- "->isSupported(AMDGPUDeviceInfo::RegionMem)">;
-
-
-// Predicate that is set to true if 64bit Mul is supported in the IL or not
-def HasHW64Mul : Predicate<"Subtarget.calVersion()"
- ">= CAL_VERSION_SC_139"
- "&& Subtarget.device()"
- "->getGeneration() >="
- "AMDGPUDeviceInfo::HD5XXX">;
-def HasSW64Mul : Predicate<"Subtarget.calVersion()"
- "< CAL_VERSION_SC_139">;
-// Predicate that is set to true if 64bit Div/Mod is supported in the IL or not
-def HasHW64DivMod : Predicate<"Subtarget.device()"
- "->usesHardware(AMDGPUDeviceInfo::HW64BitDivMod)">;
-def HasSW64DivMod : Predicate<"Subtarget.device()"
- "->usesSoftware(AMDGPUDeviceInfo::HW64BitDivMod)">;
-
-// Predicate that is set to true if 64bit pointer are used.
-def Has64BitPtr : Predicate<"Subtarget.is64bit()">;
-def Has32BitPtr : Predicate<"!Subtarget.is64bit()">;
//===--------------------------------------------------------------------===//
// Custom Operands
//===--------------------------------------------------------------------===//
diff --git a/lib/Target/R600/AMDILIntrinsicInfo.cpp b/lib/Target/R600/AMDILIntrinsicInfo.cpp
index 4ddb057..762ee39 100644
--- a/lib/Target/R600/AMDILIntrinsicInfo.cpp
+++ b/lib/Target/R600/AMDILIntrinsicInfo.cpp
@@ -14,7 +14,6 @@
#include "AMDILIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
-#include "AMDIL.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
@@ -50,6 +49,9 @@ AMDGPUIntrinsicInfo::getName(unsigned int IntrID, Type **Tys,
unsigned int
AMDGPUIntrinsicInfo::lookupName(const char *Name, unsigned int Len) const {
+ if (!StringRef(Name, Len).startswith("llvm."))
+ return 0; // All intrinsics start with 'llvm.'
+
#define GET_FUNCTION_RECOGNIZER
#include "AMDGPUGenIntrinsics.inc"
#undef GET_FUNCTION_RECOGNIZER
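
The new early-out in lookupName() simply rejects any name that does not start with "llvm." before running the TableGen-generated recognizer. A tiny standalone sketch of that guard (the recognizer itself is stubbed out, and the example names are made up):

    #include <cassert>
    #include <cstring>

    static unsigned lookupName(const char *Name, unsigned Len) {
      if (Len < 5 || std::strncmp(Name, "llvm.", 5) != 0)
        return 0;                 // all intrinsics start with 'llvm.'
      return 1;                   // stand-in for the TableGen-generated recognizer
    }

    int main() {
      assert(lookupName("not.an.intrinsic", 16) == 0);
      assert(lookupName("llvm.something", 14) != 0);
      return 0;
    }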
diff --git a/lib/Target/R600/AMDILNIDevice.cpp b/lib/Target/R600/AMDILNIDevice.cpp
deleted file mode 100644
index 47c3f7f..0000000
--- a/lib/Target/R600/AMDILNIDevice.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-//===-- AMDILNIDevice.cpp - Device Info for Northern Islands devices ------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-/// \file
-//==-----------------------------------------------------------------------===//
-#include "AMDILNIDevice.h"
-#include "AMDGPUSubtarget.h"
-#include "AMDILEvergreenDevice.h"
-
-using namespace llvm;
-
-AMDGPUNIDevice::AMDGPUNIDevice(AMDGPUSubtarget *ST)
- : AMDGPUEvergreenDevice(ST) {
- std::string name = ST->getDeviceName();
- if (name == "caicos") {
- DeviceFlag = OCL_DEVICE_CAICOS;
- } else if (name == "turks") {
- DeviceFlag = OCL_DEVICE_TURKS;
- } else if (name == "cayman") {
- DeviceFlag = OCL_DEVICE_CAYMAN;
- } else {
- DeviceFlag = OCL_DEVICE_BARTS;
- }
-}
-AMDGPUNIDevice::~AMDGPUNIDevice() {
-}
-
-size_t
-AMDGPUNIDevice::getMaxLDSSize() const {
- if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
- return MAX_LDS_SIZE_900;
- } else {
- return 0;
- }
-}
-
-uint32_t
-AMDGPUNIDevice::getGeneration() const {
- return AMDGPUDeviceInfo::HD6XXX;
-}
-
-
-AMDGPUCaymanDevice::AMDGPUCaymanDevice(AMDGPUSubtarget *ST)
- : AMDGPUNIDevice(ST) {
- setCaps();
-}
-
-AMDGPUCaymanDevice::~AMDGPUCaymanDevice() {
-}
-
-void
-AMDGPUCaymanDevice::setCaps() {
- if (mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) {
- mHWBits.set(AMDGPUDeviceInfo::DoubleOps);
- mHWBits.set(AMDGPUDeviceInfo::FMA);
- }
- mHWBits.set(AMDGPUDeviceInfo::Signed24BitOps);
- mSWBits.reset(AMDGPUDeviceInfo::Signed24BitOps);
- mSWBits.set(AMDGPUDeviceInfo::ArenaSegment);
-}
-
diff --git a/lib/Target/R600/AMDILNIDevice.h b/lib/Target/R600/AMDILNIDevice.h
deleted file mode 100644
index 24a6408..0000000
--- a/lib/Target/R600/AMDILNIDevice.h
+++ /dev/null
@@ -1,57 +0,0 @@
-//===------- AMDILNIDevice.h - Define NI Device for AMDIL -*- C++ -*------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-/// \file
-/// \brief Interface for the subtarget data classes.
-///
-/// This file will define the interface that each generation needs to
-/// implement in order to correctly answer queries on the capabilities of the
-/// specific hardware.
-//===---------------------------------------------------------------------===//
-#ifndef AMDILNIDEVICE_H
-#define AMDILNIDEVICE_H
-#include "AMDGPUSubtarget.h"
-#include "AMDILEvergreenDevice.h"
-
-namespace llvm {
-
-class AMDGPUSubtarget;
-//===---------------------------------------------------------------------===//
-// NI generation of devices and their respective sub classes
-//===---------------------------------------------------------------------===//
-
-/// \brief The AMDGPUNIDevice is the base class for all Northern Island series of
-/// cards.
-///
- /// It is very similar to the AMDGPUEvergreenDevice, with the major
- /// exception being differences in wavefront size and hardware capabilities. The
- /// NI devices all have 64-wide wavefronts and also add support for signed 24-bit
- /// integer operations.
-class AMDGPUNIDevice : public AMDGPUEvergreenDevice {
-public:
- AMDGPUNIDevice(AMDGPUSubtarget*);
- virtual ~AMDGPUNIDevice();
- virtual size_t getMaxLDSSize() const;
- virtual uint32_t getGeneration() const;
-};
-
-/// Just as the AMDGPUCypressDevice is the double capable version of the
-/// AMDGPUEvergreenDevice, the AMDGPUCaymanDevice is the double capable version
- /// of the AMDGPUNIDevice. The other major difference is that the Cayman device
- /// has 4-wide ALUs, whereas the rest of the NI family is 5 wide.
-class AMDGPUCaymanDevice: public AMDGPUNIDevice {
-public:
- AMDGPUCaymanDevice(AMDGPUSubtarget*);
- virtual ~AMDGPUCaymanDevice();
-private:
- virtual void setCaps();
-};
-
-static const unsigned int MAX_LDS_SIZE_900 = AMDGPUDevice::MAX_LDS_SIZE_800;
-} // namespace llvm
-#endif // AMDILNIDEVICE_H
diff --git a/lib/Target/R600/AMDILPeepholeOptimizer.cpp b/lib/Target/R600/AMDILPeepholeOptimizer.cpp
deleted file mode 100644
index 3a28038..0000000
--- a/lib/Target/R600/AMDILPeepholeOptimizer.cpp
+++ /dev/null
@@ -1,1215 +0,0 @@
-//===-- AMDILPeepholeOptimizer.cpp - AMDGPU Peephole optimizations ---------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-/// \file
-//==-----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "PeepholeOpt"
-#ifdef DEBUG
-#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
-#else
-#define DEBUGME 0
-#endif
-
-#include "AMDILDevices.h"
-#include "AMDGPUInstrInfo.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionAnalysis.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/MathExtras.h"
-
-#include <sstream>
-
-#if 0
-STATISTIC(PointerAssignments, "Number of dynamic pointer "
- "assigments discovered");
-STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
-#endif
-
-using namespace llvm;
-// The Peephole optimization pass is used to do simple last minute optimizations
-// that are required for correct code or to remove redundant functions
-namespace {
-
-class OpaqueType;
-
-class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass {
-public:
- TargetMachine &TM;
- static char ID;
- AMDGPUPeepholeOpt(TargetMachine &tm);
- ~AMDGPUPeepholeOpt();
- const char *getPassName() const;
- bool runOnFunction(Function &F);
- bool doInitialization(Module &M);
- bool doFinalization(Module &M);
- void getAnalysisUsage(AnalysisUsage &AU) const;
-protected:
-private:
- // Function to initiate all of the instruction level optimizations.
- bool instLevelOptimizations(BasicBlock::iterator *inst);
- // Quick check to see if we need to dump all of the pointers into the
- // arena. If this is correct, then we set all pointers to exist in the arena. This
- // is a workaround for aliasing of pointers in a struct/union.
- bool dumpAllIntoArena(Function &F);
- // Because we don't want to invalidate any pointers while in the
- // safeNestedForEach function, we push atomic conversions to a vector and handle
- // them later. This function does the conversions if required.
- void doAtomicConversionIfNeeded(Function &F);
- // Because __amdil_is_constant cannot be properly evaluated if
- // optimizations are disabled, the calls are placed in a vector
- // and evaluated after the __amdil_image* functions are evaluated,
- // which should allow the __amdil_is_constant function to be
- // evaluated correctly.
- void doIsConstCallConversionIfNeeded();
- bool mChanged;
- bool mDebug;
- bool mConvertAtomics;
- CodeGenOpt::Level optLevel;
- // Run a series of tests to see if we can optimize a CALL instruction.
- bool optimizeCallInst(BasicBlock::iterator *bbb);
- // A peephole optimization to optimize bit extract sequences.
- bool optimizeBitExtract(Instruction *inst);
- // A peephole optimization to optimize bit insert sequences.
- bool optimizeBitInsert(Instruction *inst);
- bool setupBitInsert(Instruction *base,
- Instruction *&src,
- Constant *&mask,
- Constant *&shift);
- // Expand the bit field insert instruction on versions of OpenCL that
- // don't support it.
- bool expandBFI(CallInst *CI);
- // Expand the bit field mask instruction on versions of OpenCL that
- // don't support it.
- bool expandBFM(CallInst *CI);
- // On 7XX and 8XX devices, we do not have 24-bit signed operations, so in
- // this case we need to expand them. These functions check for 24-bit functions
- // and then expand them.
- bool isSigned24BitOps(CallInst *CI);
- void expandSigned24BitOps(CallInst *CI);
- // One optimization that can occur is that if the required workgroup size is
- // specified then the result of get_local_size is known at compile time and
- // can be returned accordingly.
- bool isRWGLocalOpt(CallInst *CI);
- // On Northern Islands cards, the division is slightly less accurate than on
- // previous generations, so we need to use a more accurate division there. On
- // all other cards we can translate the accurate divide to a normal divide.
- bool convertAccurateDivide(CallInst *CI);
- void expandAccurateDivide(CallInst *CI);
- // If the alignment is set incorrectly, it can produce really inefficient
- // code. This checks for this scenario and fixes it if possible.
- bool correctMisalignedMemOp(Instruction *inst);
-
- // If we are in no-opt mode, then we need to make sure that
- // local samplers are properly propagated, as constant propagation
- // doesn't occur and we need to know the value of kernel-defined
- // samplers at compile time.
- bool propagateSamplerInst(CallInst *CI);
-
- // Helper functions
-
- // Group of functions that recursively calculate the size of a structure based
- // on its sub-types.
- size_t getTypeSize(Type * const T, bool dereferencePtr = false);
- size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
- size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
- size_t getTypeSize(FunctionType * const FT,bool dereferencePtr = false);
- size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
- size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
- size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
- size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);
-
- LLVMContext *mCTX;
- Function *mF;
- const AMDGPUSubtarget *mSTM;
- SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs;
- SmallVector<CallInst *, 16> isConstVec;
-}; // class AMDGPUPeepholeOpt
- char AMDGPUPeepholeOpt::ID = 0;
-
-// A template function that has two levels of looping before calling the
-// function with a pointer to the current iterator.
-template<class InputIterator, class SecondIterator, class Function>
-Function safeNestedForEach(InputIterator First, InputIterator Last,
- SecondIterator S, Function F) {
- for ( ; First != Last; ++First) {
- SecondIterator sf, sl;
- for (sf = First->begin(), sl = First->end();
- sf != sl; ) {
- if (!F(&sf)) {
- ++sf;
- }
- }
- }
- return F;
-}
-
-} // anonymous namespace
-
-namespace llvm {
- FunctionPass *
- createAMDGPUPeepholeOpt(TargetMachine &tm) {
- return new AMDGPUPeepholeOpt(tm);
- }
-} // llvm namespace
-
-AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm)
- : FunctionPass(ID), TM(tm) {
- mDebug = DEBUGME;
- optLevel = TM.getOptLevel();
-
-}
-
-AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt() {
-}
-
-const char *
-AMDGPUPeepholeOpt::getPassName() const {
- return "AMDGPU PeepHole Optimization Pass";
-}
-
-bool
-containsPointerType(Type *Ty) {
- if (!Ty) {
- return false;
- }
- switch(Ty->getTypeID()) {
- default:
- return false;
- case Type::StructTyID: {
- const StructType *ST = dyn_cast<StructType>(Ty);
- for (StructType::element_iterator stb = ST->element_begin(),
- ste = ST->element_end(); stb != ste; ++stb) {
- if (!containsPointerType(*stb)) {
- continue;
- }
- return true;
- }
- break;
- }
- case Type::VectorTyID:
- case Type::ArrayTyID:
- return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
- case Type::PointerTyID:
- return true;
- };
- return false;
-}
-
-bool
-AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F) {
- bool dumpAll = false;
- for (Function::const_arg_iterator cab = F.arg_begin(),
- cae = F.arg_end(); cab != cae; ++cab) {
- const Argument *arg = cab;
- const PointerType *PT = dyn_cast<PointerType>(arg->getType());
- if (!PT) {
- continue;
- }
- Type *DereferencedType = PT->getElementType();
- if (!dyn_cast<StructType>(DereferencedType)
- ) {
- continue;
- }
- if (!containsPointerType(DereferencedType)) {
- continue;
- }
- // FIXME: Because a pointer inside of a struct/union may be aliased to
- // another pointer we need to take the conservative approach and place all
- // pointers into the arena until more advanced detection is implemented.
- dumpAll = true;
- }
- return dumpAll;
-}
-void
-AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded() {
- if (isConstVec.empty()) {
- return;
- }
- for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
- CallInst *CI = isConstVec[x];
- Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
- Type *aType = Type::getInt32Ty(*mCTX);
- Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
- : ConstantInt::get(aType, 0);
- CI->replaceAllUsesWith(Val);
- CI->eraseFromParent();
- }
- isConstVec.clear();
-}
-void
-AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F) {
- // Don't do anything if we don't have any atomic operations.
- if (atomicFuncs.empty()) {
- return;
- }
- // Change the function name for the atomic if it is required
- uint32_t size = atomicFuncs.size();
- for (uint32_t x = 0; x < size; ++x) {
- atomicFuncs[x].first->setOperand(
- atomicFuncs[x].first->getNumOperands()-1,
- atomicFuncs[x].second);
-
- }
- mChanged = true;
- if (mConvertAtomics) {
- return;
- }
-}
-
-bool
-AMDGPUPeepholeOpt::runOnFunction(Function &MF) {
- mChanged = false;
- mF = &MF;
- mSTM = &TM.getSubtarget<AMDGPUSubtarget>();
- if (mDebug) {
- MF.dump();
- }
- mCTX = &MF.getType()->getContext();
- mConvertAtomics = true;
- safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
- std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations),
- this));
-
- doAtomicConversionIfNeeded(MF);
- doIsConstCallConversionIfNeeded();
-
- if (mDebug) {
- MF.dump();
- }
- return mChanged;
-}
-
-bool
-AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) {
- Instruction *inst = (*bbb);
- CallInst *CI = dyn_cast<CallInst>(inst);
- if (!CI) {
- return false;
- }
- if (isSigned24BitOps(CI)) {
- expandSigned24BitOps(CI);
- ++(*bbb);
- CI->eraseFromParent();
- return true;
- }
- if (propagateSamplerInst(CI)) {
- return false;
- }
- if (expandBFI(CI) || expandBFM(CI)) {
- ++(*bbb);
- CI->eraseFromParent();
- return true;
- }
- if (convertAccurateDivide(CI)) {
- expandAccurateDivide(CI);
- ++(*bbb);
- CI->eraseFromParent();
- return true;
- }
-
- StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
- if (calleeName.startswith("__amdil_is_constant")) {
- // If we do not have optimizations, then this
- // cannot be properly evaluated, so we add the
- // call instruction to a vector and process
- // them at the end of processing after the
- // samplers have been correctly handled.
- if (optLevel == CodeGenOpt::None) {
- isConstVec.push_back(CI);
- return false;
- } else {
- Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
- Type *aType = Type::getInt32Ty(*mCTX);
- Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
- : ConstantInt::get(aType, 0);
- CI->replaceAllUsesWith(Val);
- ++(*bbb);
- CI->eraseFromParent();
- return true;
- }
- }
-
- if (calleeName.equals("__amdil_is_asic_id_i32")) {
- ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
- Type *aType = Type::getInt32Ty(*mCTX);
- Value *Val = CV;
- if (Val) {
- Val = ConstantInt::get(aType,
- mSTM->device()->getDeviceFlag() & CV->getZExtValue());
- } else {
- Val = ConstantInt::get(aType, 0);
- }
- CI->replaceAllUsesWith(Val);
- ++(*bbb);
- CI->eraseFromParent();
- return true;
- }
- Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
- if (!F) {
- return false;
- }
- if (F->getName().startswith("__atom") && !CI->getNumUses()
- && F->getName().find("_xchg") == StringRef::npos) {
- std::string buffer(F->getName().str() + "_noret");
- F = dyn_cast<Function>(
- F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
- atomicFuncs.push_back(std::make_pair(CI, F));
- }
-
- if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment)
- && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) {
- return false;
- }
- if (!mConvertAtomics) {
- return false;
- }
- StringRef name = F->getName();
- if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
- mConvertAtomics = false;
- }
- return false;
-}
-
-bool
-AMDGPUPeepholeOpt::setupBitInsert(Instruction *base,
- Instruction *&src,
- Constant *&mask,
- Constant *&shift) {
- if (!base) {
- if (mDebug) {
- dbgs() << "Null pointer passed into function.\n";
- }
- return false;
- }
- bool andOp = false;
- if (base->getOpcode() == Instruction::Shl) {
- shift = dyn_cast<Constant>(base->getOperand(1));
- } else if (base->getOpcode() == Instruction::And) {
- mask = dyn_cast<Constant>(base->getOperand(1));
- andOp = true;
- } else {
- if (mDebug) {
- dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
- }
- // If the base is neither a Shl nor an And, we don't fit any of the patterns above.
- return false;
- }
- src = dyn_cast<Instruction>(base->getOperand(0));
- if (!src) {
- if (mDebug) {
- dbgs() << "Failed setup since the base operand is not an instruction!\n";
- }
- return false;
- }
- // If we find an 'and' operation, then we don't need to
- // find the next operation as we already know the
- // bits that are valid at this point.
- if (andOp) {
- return true;
- }
- if (src->getOpcode() == Instruction::Shl && !shift) {
- shift = dyn_cast<Constant>(src->getOperand(1));
- src = dyn_cast<Instruction>(src->getOperand(0));
- } else if (src->getOpcode() == Instruction::And && !mask) {
- mask = dyn_cast<Constant>(src->getOperand(1));
- }
- if (!mask && !shift) {
- if (mDebug) {
- dbgs() << "Failed setup since both mask and shift are NULL!\n";
- }
- // Did not find a constant mask or a shift.
- return false;
- }
- return true;
-}
-bool
-AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst) {
- if (!inst) {
- return false;
- }
- if (!inst->isBinaryOp()) {
- return false;
- }
- if (inst->getOpcode() != Instruction::Or) {
- return false;
- }
- if (optLevel == CodeGenOpt::None) {
- return false;
- }
- // We want to do an optimization on a sequence of ops that in the end equals a
- // single ISA instruction.
- // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
- // Some simplified versions of this pattern are as follows:
- // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
- // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
- // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
- // (A & B) | (D << F) when (1 << F) >= B
- // (A << C) | (D & E) when (1 << C) >= E
- if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
- // The HD4XXX hardware doesn't support the ubit_insert instruction.
- return false;
- }
- Type *aType = inst->getType();
- bool isVector = aType->isVectorTy();
- int numEle = 1;
- // This optimization only works on 32bit integers.
- if (aType->getScalarType()
- != Type::getInt32Ty(inst->getContext())) {
- return false;
- }
- if (isVector) {
- const VectorType *VT = dyn_cast<VectorType>(aType);
- numEle = VT->getNumElements();
- // We currently cannot support more than 4 elements in an intrinsic and we
- // cannot support Vec3 types.
- if (numEle > 4 || numEle == 3) {
- return false;
- }
- }
- // TODO: Handle vectors.
- if (isVector) {
- if (mDebug) {
- dbgs() << "!!! Vectors are not supported yet!\n";
- }
- return false;
- }
- Instruction *LHSSrc = NULL, *RHSSrc = NULL;
- Constant *LHSMask = NULL, *RHSMask = NULL;
- Constant *LHSShift = NULL, *RHSShift = NULL;
- Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
- Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
- if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
- if (mDebug) {
- dbgs() << "Found an OR Operation that failed setup!\n";
- inst->dump();
- if (LHS) { LHS->dump(); }
- if (LHSSrc) { LHSSrc->dump(); }
- if (LHSMask) { LHSMask->dump(); }
- if (LHSShift) { LHSShift->dump(); }
- }
- // There was an issue with the setup for BitInsert.
- return false;
- }
- if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
- if (mDebug) {
- dbgs() << "Found an OR Operation that failed setup!\n";
- inst->dump();
- if (RHS) { RHS->dump(); }
- if (RHSSrc) { RHSSrc->dump(); }
- if (RHSMask) { RHSMask->dump(); }
- if (RHSShift) { RHSShift->dump(); }
- }
- // There was an issue with the setup for BitInsert.
- return false;
- }
- if (mDebug) {
- dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n";
- dbgs() << "Op: "; inst->dump();
- dbgs() << "LHS: "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
- dbgs() << "LHS Src: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
- dbgs() << "LHS Mask: "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
- dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
- dbgs() << "RHS: "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
- dbgs() << "RHS Src: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
- dbgs() << "RHS Mask: "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
- dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
- }
- Constant *offset = NULL;
- Constant *width = NULL;
- uint32_t lhsMaskVal = 0, rhsMaskVal = 0;
- uint32_t lhsShiftVal = 0, rhsShiftVal = 0;
- uint32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
- uint32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
- lhsMaskVal = (LHSMask
- ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
- rhsMaskVal = (RHSMask
- ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
- lhsShiftVal = (LHSShift
- ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
- rhsShiftVal = (RHSShift
- ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
- lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
- rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
- lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
- rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
- // TODO: Handle the case of A & B | D & ~B (i.e. inverted masks).
- if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
- return false;
- }
- if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
- offset = ConstantInt::get(aType, lhsMaskOffset, false);
- width = ConstantInt::get(aType, lhsMaskWidth, false);
- RHSSrc = RHS;
- if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
- return false;
- }
- if (!LHSShift) {
- LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
- "MaskShr", LHS);
- } else if (lhsShiftVal != lhsMaskOffset) {
- LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
- "MaskShr", LHS);
- }
- if (mDebug) {
- dbgs() << "Optimizing LHS!\n";
- }
- } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
- offset = ConstantInt::get(aType, rhsMaskOffset, false);
- width = ConstantInt::get(aType, rhsMaskWidth, false);
- LHSSrc = RHSSrc;
- RHSSrc = LHS;
- if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
- return false;
- }
- if (!RHSShift) {
- LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
- "MaskShr", RHS);
- } else if (rhsShiftVal != rhsMaskOffset) {
- LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
- "MaskShr", RHS);
- }
- if (mDebug) {
- dbgs() << "Optimizing RHS!\n";
- }
- } else {
- if (mDebug) {
- dbgs() << "Failed constraint 3!\n";
- }
- return false;
- }
- if (mDebug) {
- dbgs() << "Width: "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
- dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
- dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
- dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
- }
- if (!offset || !width) {
- if (mDebug) {
- dbgs() << "Either width or offset are NULL, failed detection!\n";
- }
- return false;
- }
- // Let's create the function signature.
- std::vector<Type *> callTypes;
- callTypes.push_back(aType);
- callTypes.push_back(aType);
- callTypes.push_back(aType);
- callTypes.push_back(aType);
- FunctionType *funcType = FunctionType::get(aType, callTypes, false);
- std::string name = "__amdil_ubit_insert";
- if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
- Function *Func =
- dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
- getOrInsertFunction(StringRef(name), funcType));
- Value *Operands[4] = {
- width,
- offset,
- LHSSrc,
- RHSSrc
- };
- CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
- if (mDebug) {
- dbgs() << "Old Inst: ";
- inst->dump();
- dbgs() << "New Inst: ";
- CI->dump();
- dbgs() << "\n\n";
- }
- CI->insertBefore(inst);
- inst->replaceAllUsesWith(CI);
- return true;
-}
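A standalone sketch (not part of this patch) of the scalar transform that the deleted optimizeBitInsert performed: ((A & B) << C) | ((D & E) << F) with disjoint fields collapses into one bit-insert, where the field width comes from the population count of the mask and the offset from its trailing zeros. The helper name ubit_insert and its (width, offset, src, base) operand order mirror the call built above and are assumptions for illustration only.

// Minimal reference model of the scalar u32 bit-insert pattern.
#include <cassert>
#include <cstdint>

static uint32_t ubit_insert(uint32_t width, uint32_t offset,
                            uint32_t src, uint32_t base) {
  // Write the low 'width' bits of src into base at bit position 'offset'.
  uint32_t field = (width >= 32) ? ~0u : ((1u << width) - 1u);
  return ((src & field) << offset) | (base & ~(field << offset));
}

int main() {
  // Pattern matched by the pass: (A & B) | ((D & E) << F) with disjoint masks.
  uint32_t A = 0x12345678, D = 0xCAFEBABE;
  uint32_t B = 0x0000FFFF;          // keeps the low 16 bits of A
  uint32_t E = 0x0000FFFF, F = 16;  // D's low 16 bits land in the high half
  uint32_t orExpr = (A & B) | ((D & E) << F);
  // width = popcount(E) = 16, offset = F = 16, src = D, base = A.
  assert(orExpr == ubit_insert(16, 16, D, A));
  return 0;
}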
-
-bool
-AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst) {
- if (!inst) {
- return false;
- }
- if (!inst->isBinaryOp()) {
- return false;
- }
- if (inst->getOpcode() != Instruction::And) {
- return false;
- }
- if (optLevel == CodeGenOpt::None) {
- return false;
- }
- // We want to do some simple optimizations on shift-right/and patterns. The
- // basic optimization is to turn (A >> B) & C, where A is a 32bit type, B is
- // a value smaller than 32 and C is a mask. If C is a constant value, then
- // the following transformation can occur: for signed integers it turns into
- // the call dst = __amdil_ibit_extract(log2(C), B, A); for unsigned integers
- // it turns into the call dst = __amdil_ubit_extract(log2(C), B, A). The
- // __amdil_[u|i]bit_extract functions can be found in Section 7.9 of the ATI
- // IL spec of the stream SDK for Evergreen hardware.
- if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
- // This does not work on HD4XXX hardware.
- return false;
- }
- Type *aType = inst->getType();
- bool isVector = aType->isVectorTy();
-
- // XXX Support vector types
- if (isVector) {
- return false;
- }
- int numEle = 1;
- // This only works on 32bit integers
- if (aType->getScalarType()
- != Type::getInt32Ty(inst->getContext())) {
- return false;
- }
- if (isVector) {
- const VectorType *VT = dyn_cast<VectorType>(aType);
- numEle = VT->getNumElements();
- // We currently cannot support more than 4 elements in an intrinsic and we
- // cannot support Vec3 types.
- if (numEle > 4 || numEle == 3) {
- return false;
- }
- }
- BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
- // If the first operand is not a shift instruction, then we can return as it
- // doesn't match this pattern.
- if (!ShiftInst || !ShiftInst->isShift()) {
- return false;
- }
- // If this is a shift left, then we don't match this pattern.
- if (ShiftInst->getOpcode() == Instruction::Shl) {
- return false;
- }
- bool isSigned = ShiftInst->isArithmeticShift();
- Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
- Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
- // Let's make sure that the shift value and the and mask are constant integers.
- if (!AndMask || !ShrVal) {
- return false;
- }
- Constant *newMaskConst;
- Constant *shiftValConst;
- if (isVector) {
- // Handle the vector case
- std::vector<Constant *> maskVals;
- std::vector<Constant *> shiftVals;
- ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
- ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
- Type *scalarType = AndMaskVec->getType()->getScalarType();
- assert(AndMaskVec->getNumOperands() ==
- ShrValVec->getNumOperands() && "cannot have a "
- "combination where the number of elements to a "
- "shift and an and are different!");
- for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
- ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
- ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
- if (!AndCI || !ShiftIC) {
- return false;
- }
- uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
- if (!isMask_32(maskVal)) {
- return false;
- }
- maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
- uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
- // If the mask or shiftval is greater than the bitcount, then break out.
- if (maskVal >= 32 || shiftVal >= 32) {
- return false;
- }
- // If the mask val is greater than the number of original bits left
- // then this optimization is invalid.
- if (maskVal > (32 - shiftVal)) {
- return false;
- }
- maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
- shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
- }
- newMaskConst = ConstantVector::get(maskVals);
- shiftValConst = ConstantVector::get(shiftVals);
- } else {
- // Handle the scalar case
- uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
- // This must be a mask value where all lower bits are set to 1 and then any
- // bit higher is set to 0.
- if (!isMask_32(maskVal)) {
- return false;
- }
- maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
- // Count the number of bits set in the mask, this is the width of the
- // resulting bit set that is extracted from the source value.
- uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
- // If the mask or shift val is greater than the bitcount, then break out.
- if (maskVal >= 32 || shiftVal >= 32) {
- return false;
- }
- // If the mask val is greater than the number of original bits left, then
- // this optimization is invalid.
- if (maskVal > (32 - shiftVal)) {
- return false;
- }
- newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
- shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
- }
- // Let's create the function signature.
- std::vector<Type *> callTypes;
- callTypes.push_back(aType);
- callTypes.push_back(aType);
- callTypes.push_back(aType);
- FunctionType *funcType = FunctionType::get(aType, callTypes, false);
- std::string name = "llvm.AMDGPU.bit.extract.u32";
- if (isVector) {
- name += ".v" + itostr(numEle) + "i32";
- } else {
- name += ".";
- }
- // Let's create the function.
- Function *Func =
- dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
- getOrInsertFunction(StringRef(name), funcType));
- Value *Operands[3] = {
- ShiftInst->getOperand(0),
- shiftValConst,
- newMaskConst
- };
- // Let's create the call with the operands.
- CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
- CI->setDoesNotAccessMemory();
- CI->insertBefore(inst);
- inst->replaceAllUsesWith(CI);
- return true;
-}
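For reference, a standalone sketch (not part of this patch) of the unsigned scalar case handled by the deleted optimizeBitExtract: (A >> B) & C, where C is a mask of contiguous low bits, is equivalent to extracting popcount(C) bits of A starting at bit B. The helper name ubit_extract is hypothetical; the (src, offset, width) operand order follows the Operands array built above.

// Minimal reference model of the scalar u32 bit-extract pattern.
#include <cassert>
#include <cstdint>

static uint32_t ubit_extract(uint32_t src, uint32_t offset, uint32_t width) {
  if (width == 0)
    return 0;
  uint32_t mask = (width >= 32) ? ~0u : ((1u << width) - 1u);
  return (src >> offset) & mask;
}

int main() {
  uint32_t A = 0xDEADBEEF, B = 5;
  uint32_t C = 0x7F;                        // 7 contiguous low bits -> width 7
  assert(((A >> B) & C) == ubit_extract(A, B, 7));
  return 0;
}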
-
-bool
-AMDGPUPeepholeOpt::expandBFI(CallInst *CI) {
- if (!CI) {
- return false;
- }
- Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
- if (!LHS->getName().startswith("__amdil_bfi")) {
- return false;
- }
- Type* type = CI->getOperand(0)->getType();
- Constant *negOneConst = NULL;
- if (type->isVectorTy()) {
- std::vector<Constant *> negOneVals;
- negOneConst = ConstantInt::get(CI->getContext(),
- APInt(32, StringRef("-1"), 10));
- for (size_t x = 0,
- y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
- negOneVals.push_back(negOneConst);
- }
- negOneConst = ConstantVector::get(negOneVals);
- } else {
- negOneConst = ConstantInt::get(CI->getContext(),
- APInt(32, StringRef("-1"), 10));
- }
- // __amdil_bfi => (A & B) | (~A & C)
- BinaryOperator *lhs =
- BinaryOperator::Create(Instruction::And, CI->getOperand(0),
- CI->getOperand(1), "bfi_and", CI);
- BinaryOperator *rhs =
- BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
- "bfi_not", CI);
- rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
- "bfi_and", CI);
- lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
- CI->replaceAllUsesWith(lhs);
- return true;
-}
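A standalone sketch (not part of this patch) of the expansion performed by expandBFI, following the comment above: per bit, the result takes src1 where src0 is 1 and src2 where src0 is 0, i.e. (A & B) | (~A & C).

// Minimal reference model of the bitfield-insert (bfi) expansion.
#include <cassert>
#include <cstdint>

static uint32_t bfi(uint32_t src0, uint32_t src1, uint32_t src2) {
  return (src0 & src1) | (~src0 & src2);
}

int main() {
  uint32_t mask = 0x00FF00FF, a = 0x12345678, b = 0x9ABCDEF0;
  // With a constant mask this degenerates to a byte-wise blend of a and b.
  assert(bfi(mask, a, b) == 0x9A34DE78);
  return 0;
}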
-
-bool
-AMDGPUPeepholeOpt::expandBFM(CallInst *CI) {
- if (!CI) {
- return false;
- }
- Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
- if (!LHS->getName().startswith("__amdil_bfm")) {
- return false;
- }
- // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
- Constant *newMaskConst = NULL;
- Constant *newShiftConst = NULL;
- Type* type = CI->getOperand(0)->getType();
- if (type->isVectorTy()) {
- std::vector<Constant*> newMaskVals, newShiftVals;
- newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
- newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
- for (size_t x = 0,
- y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
- newMaskVals.push_back(newMaskConst);
- newShiftVals.push_back(newShiftConst);
- }
- newMaskConst = ConstantVector::get(newMaskVals);
- newShiftConst = ConstantVector::get(newShiftVals);
- } else {
- newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
- newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
- }
- BinaryOperator *lhs =
- BinaryOperator::Create(Instruction::And, CI->getOperand(0),
- newMaskConst, "bfm_mask", CI);
- lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
- lhs, "bfm_shl", CI);
- lhs = BinaryOperator::Create(Instruction::Sub, lhs,
- newShiftConst, "bfm_sub", CI);
- BinaryOperator *rhs =
- BinaryOperator::Create(Instruction::And, CI->getOperand(1),
- newMaskConst, "bfm_mask", CI);
- lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
- CI->replaceAllUsesWith(lhs);
- return true;
-}
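A standalone sketch (not part of this patch) of the expansion performed by expandBFM, matching the comment above: build a field of (src0 & 0x1F) ones and shift it left by (src1 & 0x1F).

// Minimal reference model of the bitfield-mask (bfm) expansion.
#include <cassert>
#include <cstdint>

static uint32_t bfm(uint32_t src0, uint32_t src1) {
  return ((1u << (src0 & 0x1F)) - 1u) << (src1 & 0x1F);
}

int main() {
  assert(bfm(8, 4) == 0x00000FF0);  // eight ones starting at bit 4
  assert(bfm(0, 7) == 0);           // zero-width field yields an empty mask
  return 0;
}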
-
-bool
-AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb) {
- Instruction *inst = (*bbb);
- if (optimizeCallInst(bbb)) {
- return true;
- }
- if (optimizeBitExtract(inst)) {
- return false;
- }
- if (optimizeBitInsert(inst)) {
- return false;
- }
- if (correctMisalignedMemOp(inst)) {
- return false;
- }
- return false;
-}
-bool
-AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst) {
- LoadInst *linst = dyn_cast<LoadInst>(inst);
- StoreInst *sinst = dyn_cast<StoreInst>(inst);
- unsigned alignment;
- Type* Ty = inst->getType();
- if (linst) {
- alignment = linst->getAlignment();
- Ty = inst->getType();
- } else if (sinst) {
- alignment = sinst->getAlignment();
- Ty = sinst->getValueOperand()->getType();
- } else {
- return false;
- }
- unsigned size = getTypeSize(Ty);
- if (size == alignment || size < alignment) {
- return false;
- }
- if (!Ty->isStructTy()) {
- return false;
- }
- if (alignment < 4) {
- if (linst) {
- linst->setAlignment(0);
- return true;
- } else if (sinst) {
- sinst->setAlignment(0);
- return true;
- }
- }
- return false;
-}
-bool
-AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI) {
- if (!CI) {
- return false;
- }
- Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
- std::string namePrefix = LHS->getName().substr(0, 14);
- if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24"
- && namePrefix != "__amdil__imul24_high") {
- return false;
- }
- if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) {
- return false;
- }
- return true;
-}
-
-void
-AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI) {
- assert(isSigned24BitOps(CI) && "Must be a "
- "signed 24 bit operation to call this function!");
- Value *LHS = CI->getOperand(CI->getNumOperands()-1);
- // On 7XX and 8XX we do not have signed 24bit, so we need to
- // expand it to the following:
- // imul24 turns into 32bit imul
- // imad24 turns into 32bit imad
- // imul24_high turns into 32bit imulhigh
- if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
- Type *aType = CI->getOperand(0)->getType();
- bool isVector = aType->isVectorTy();
- int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
- std::vector<Type*> callTypes;
- callTypes.push_back(CI->getOperand(0)->getType());
- callTypes.push_back(CI->getOperand(1)->getType());
- callTypes.push_back(CI->getOperand(2)->getType());
- FunctionType *funcType =
- FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
- std::string name = "__amdil_imad";
- if (isVector) {
- name += "_v" + itostr(numEle) + "i32";
- } else {
- name += "_i32";
- }
- Function *Func = dyn_cast<Function>(
- CI->getParent()->getParent()->getParent()->
- getOrInsertFunction(StringRef(name), funcType));
- Value *Operands[3] = {
- CI->getOperand(0),
- CI->getOperand(1),
- CI->getOperand(2)
- };
- CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
- nCI->insertBefore(CI);
- CI->replaceAllUsesWith(nCI);
- } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
- BinaryOperator *mulOp =
- BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
- CI->getOperand(1), "imul24", CI);
- CI->replaceAllUsesWith(mulOp);
- } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
- Type *aType = CI->getOperand(0)->getType();
-
- bool isVector = aType->isVectorTy();
- int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
- std::vector<Type*> callTypes;
- callTypes.push_back(CI->getOperand(0)->getType());
- callTypes.push_back(CI->getOperand(1)->getType());
- FunctionType *funcType =
- FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
- std::string name = "__amdil_imul_high";
- if (isVector) {
- name += "_v" + itostr(numEle) + "i32";
- } else {
- name += "_i32";
- }
- Function *Func = dyn_cast<Function>(
- CI->getParent()->getParent()->getParent()->
- getOrInsertFunction(StringRef(name), funcType));
- Value *Operands[2] = {
- CI->getOperand(0),
- CI->getOperand(1)
- };
- CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
- nCI->insertBefore(CI);
- CI->replaceAllUsesWith(nCI);
- }
-}
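A standalone sketch (not part of this patch) of the fallback applied by expandSigned24BitOps: without hardware mul24/mad24, the 24-bit builtins are lowered to ordinary 32-bit arithmetic, which matches the 24-bit semantics whenever both operands actually fit in 24 signed bits. The reference function below encodes that assumed semantics purely to show why the lowering is sound.

// Minimal check that a 32-bit mul reproduces imul24 for in-range operands.
#include <cassert>
#include <cstdint>

static int32_t signExtend24(uint32_t v) {
  v &= 0xFFFFFFu;
  if (v & 0x800000u)
    v |= 0xFF000000u;
  return (int32_t)v;
}

// Assumed reference semantics of imul24: multiply the sign-extended low 24 bits.
static int32_t imul24_reference(int32_t a, int32_t b) {
  return signExtend24((uint32_t)a) * signExtend24((uint32_t)b);
}

int main() {
  int32_t a = -1234, b = 5678;              // both factors fit in 24 bits
  assert(imul24_reference(a, b) == a * b);  // fallback: ordinary 32-bit mul
  return 0;
}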
-
-bool
-AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI) {
- return (CI != NULL
- && CI->getOperand(CI->getNumOperands() - 1)->getName()
- == "__amdil_get_local_size_int");
-}
-
-bool
-AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI) {
- if (!CI) {
- return false;
- }
- if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX
- && (mSTM->getDeviceName() == "cayman")) {
- return false;
- }
- return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20)
- == "__amdil_improved_div";
-}
-
-void
-AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI) {
- assert(convertAccurateDivide(CI)
- && "expanding accurate divide can only happen if it is expandable!");
- BinaryOperator *divOp =
- BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
- CI->getOperand(1), "fdiv32", CI);
- CI->replaceAllUsesWith(divOp);
-}
-
-bool
-AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI) {
- if (optLevel != CodeGenOpt::None) {
- return false;
- }
-
- if (!CI) {
- return false;
- }
-
- unsigned funcNameIdx = 0;
- funcNameIdx = CI->getNumOperands() - 1;
- StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
- if (calleeName != "__amdil_image2d_read_norm"
- && calleeName != "__amdil_image2d_read_unnorm"
- && calleeName != "__amdil_image3d_read_norm"
- && calleeName != "__amdil_image3d_read_unnorm") {
- return false;
- }
-
- unsigned samplerIdx = 2;
- samplerIdx = 1;
- Value *sampler = CI->getOperand(samplerIdx);
- LoadInst *lInst = dyn_cast<LoadInst>(sampler);
- if (!lInst) {
- return false;
- }
-
- if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
- return false;
- }
-
- GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
- // If we are loading from what is not a global value, then we
- // fail and return.
- if (!gv) {
- return false;
- }
-
- // If we don't have an initializer or we have an initializer and
- // the initializer is not a 32bit integer, we fail.
- if (!gv->hasInitializer()
- || !gv->getInitializer()->getType()->isIntegerTy(32)) {
- return false;
- }
-
- // Now that we have the global variable initializer, let's replace
- // all uses of the load instruction with the samplerVal and
- // reparse the __amdil_is_constant() function.
- Constant *samplerVal = gv->getInitializer();
- lInst->replaceAllUsesWith(samplerVal);
- return true;
-}
-
-bool
-AMDGPUPeepholeOpt::doInitialization(Module &M) {
- return false;
-}
-
-bool
-AMDGPUPeepholeOpt::doFinalization(Module &M) {
- return false;
-}
-
-void
-AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<MachineFunctionAnalysis>();
- FunctionPass::getAnalysisUsage(AU);
- AU.setPreservesAll();
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) {
- size_t size = 0;
- if (!T) {
- return size;
- }
- switch (T->getTypeID()) {
- case Type::X86_FP80TyID:
- case Type::FP128TyID:
- case Type::PPC_FP128TyID:
- case Type::LabelTyID:
- assert(0 && "These types are not supported by this backend");
- default:
- case Type::FloatTyID:
- case Type::DoubleTyID:
- size = T->getPrimitiveSizeInBits() >> 3;
- break;
- case Type::PointerTyID:
- size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
- break;
- case Type::IntegerTyID:
- size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
- break;
- case Type::StructTyID:
- size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
- break;
- case Type::ArrayTyID:
- size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
- break;
- case Type::FunctionTyID:
- size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
- break;
- case Type::VectorTyID:
- size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
- break;
- };
- return size;
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST,
- bool dereferencePtr) {
- size_t size = 0;
- if (!ST) {
- return size;
- }
- Type *curType;
- StructType::element_iterator eib;
- StructType::element_iterator eie;
- for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
- curType = *eib;
- size += getTypeSize(curType, dereferencePtr);
- }
- return size;
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT,
- bool dereferencePtr) {
- return IT ? (IT->getBitWidth() >> 3) : 0;
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT,
- bool dereferencePtr) {
- assert(0 && "Should not be able to calculate the size of an function type");
- return 0;
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT,
- bool dereferencePtr) {
- return (size_t)(AT ? (getTypeSize(AT->getElementType(),
- dereferencePtr) * AT->getNumElements())
- : 0);
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT,
- bool dereferencePtr) {
- return VT ? (VT->getBitWidth() >> 3) : 0;
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT,
- bool dereferencePtr) {
- if (!PT) {
- return 0;
- }
- Type *CT = PT->getElementType();
- if (CT->getTypeID() == Type::StructTyID &&
- PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
- return getTypeSize(dyn_cast<StructType>(CT));
- } else if (dereferencePtr) {
- size_t size = 0;
- for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
- size += getTypeSize(PT->getContainedType(x), dereferencePtr);
- }
- return size;
- } else {
- return 4;
- }
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT,
- bool dereferencePtr) {
- //assert(0 && "Should not be able to calculate the size of an opaque type");
- return 4;
-}
diff --git a/lib/Target/R600/AMDILSIDevice.cpp b/lib/Target/R600/AMDILSIDevice.cpp
deleted file mode 100644
index 0d1de3d..0000000
--- a/lib/Target/R600/AMDILSIDevice.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-//===-- AMDILSIDevice.cpp - Device Info for Southern Islands GPUs ---------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-/// \file
-//==-----------------------------------------------------------------------===//
-#include "AMDILSIDevice.h"
-#include "AMDGPUSubtarget.h"
-#include "AMDILEvergreenDevice.h"
-#include "AMDILNIDevice.h"
-
-using namespace llvm;
-
-AMDGPUSIDevice::AMDGPUSIDevice(AMDGPUSubtarget *ST)
- : AMDGPUEvergreenDevice(ST) {
-}
-AMDGPUSIDevice::~AMDGPUSIDevice() {
-}
-
-size_t
-AMDGPUSIDevice::getMaxLDSSize() const {
- if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
- return MAX_LDS_SIZE_900;
- } else {
- return 0;
- }
-}
-
-uint32_t
-AMDGPUSIDevice::getGeneration() const {
- return AMDGPUDeviceInfo::HD7XXX;
-}
-
-std::string
-AMDGPUSIDevice::getDataLayout() const {
- return std::string(
- "e"
- "-p:64:64:64"
- "-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64"
- "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128"
- "-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
- "-v2048:2048:2048"
- "-n32:64"
- );
-}
diff --git a/lib/Target/R600/AMDILSIDevice.h b/lib/Target/R600/AMDILSIDevice.h
deleted file mode 100644
index 5b2cb25..0000000
--- a/lib/Target/R600/AMDILSIDevice.h
+++ /dev/null
@@ -1,39 +0,0 @@
-//===------- AMDILSIDevice.h - Define SI Device for AMDIL -*- C++ -*------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief Interface for the subtarget data classes.
-///
-/// This file will define the interface that each generation needs to
-/// implement in order to correctly answer queries on the capabilities of the
-/// specific hardware.
-//===---------------------------------------------------------------------===//
-#ifndef AMDILSIDEVICE_H
-#define AMDILSIDEVICE_H
-#include "AMDILEvergreenDevice.h"
-
-namespace llvm {
-class AMDGPUSubtarget;
-//===---------------------------------------------------------------------===//
-// SI generation of devices and their respective sub classes
-//===---------------------------------------------------------------------===//
-
-/// \brief The AMDGPUSIDevice is the base class for all Southern Island series
-/// of cards.
-class AMDGPUSIDevice : public AMDGPUEvergreenDevice {
-public:
- AMDGPUSIDevice(AMDGPUSubtarget*);
- virtual ~AMDGPUSIDevice();
- virtual size_t getMaxLDSSize() const;
- virtual uint32_t getGeneration() const;
- virtual std::string getDataLayout() const;
-};
-
-} // namespace llvm
-#endif // AMDILSIDEVICE_H
diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
index 2ad2047..1b79bf5 100644
--- a/lib/Target/R600/CMakeLists.txt
+++ b/lib/Target/R600/CMakeLists.txt
@@ -12,17 +12,10 @@ tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer)
add_public_tablegen_target(AMDGPUCommonTableGen)
add_llvm_target(R600CodeGen
- AMDIL7XXDevice.cpp
AMDILCFGStructurizer.cpp
- AMDILDevice.cpp
- AMDILDeviceInfo.cpp
- AMDILEvergreenDevice.cpp
AMDILIntrinsicInfo.cpp
AMDILISelDAGToDAG.cpp
AMDILISelLowering.cpp
- AMDILNIDevice.cpp
- AMDILPeepholeOptimizer.cpp
- AMDILSIDevice.cpp
AMDGPUAsmPrinter.cpp
AMDGPUFrameLowering.cpp
AMDGPUIndirectAddressing.cpp
@@ -42,8 +35,10 @@ add_llvm_target(R600CodeGen
R600ISelLowering.cpp
R600MachineFunctionInfo.cpp
R600MachineScheduler.cpp
+ R600OptimizeVectorRegisters.cpp
R600Packetizer.cpp
R600RegisterInfo.cpp
+ R600TextureIntrinsicsReplacer.cpp
SIAnnotateControlFlow.cpp
SIInsertWaits.cpp
SIInstrInfo.cpp
diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
index 10547a5..8c814e0 100644
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
@@ -10,13 +10,14 @@
#include "AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
using namespace llvm;
void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
StringRef Annot) {
+ OS.flush();
printInstruction(MI, OS);
printAnnotation(OS, Annot);
@@ -67,11 +68,14 @@ void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo,
}
void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo,
- raw_ostream &O, StringRef Asm) {
+ raw_ostream &O, StringRef Asm,
+ StringRef Default) {
const MCOperand &Op = MI->getOperand(OpNo);
assert(Op.isImm());
if (Op.getImm() == 1) {
O << Asm;
+ } else {
+ O << Default;
}
}
@@ -98,7 +102,7 @@ void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- printIfSet(MI, OpNo, O, " *");
+ printIfSet(MI, OpNo, O.indent(25 - O.GetNumBytesInBuffer()), "*", " ");
}
void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo,
@@ -169,4 +173,86 @@ void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo,
O << "." << chans[chan];
}
+void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ int BankSwizzle = MI->getOperand(OpNo).getImm();
+ switch (BankSwizzle) {
+ case 1:
+ O << "BS:VEC_021";
+ break;
+ case 2:
+ O << "BS:VEC_120";
+ break;
+ case 3:
+ O << "BS:VEC_102";
+ break;
+ case 4:
+ O << "BS:VEC_201";
+ break;
+ case 5:
+ O << "BS:VEC_210";
+ break;
+ default:
+ break;
+ }
+ return;
+}
+
+void AMDGPUInstPrinter::printRSel(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ unsigned Sel = MI->getOperand(OpNo).getImm();
+ switch (Sel) {
+ case 0:
+ O << "X";
+ break;
+ case 1:
+ O << "Y";
+ break;
+ case 2:
+ O << "Z";
+ break;
+ case 3:
+ O << "W";
+ break;
+ case 4:
+ O << "0";
+ break;
+ case 5:
+ O << "1";
+ break;
+ case 7:
+ O << "_";
+ break;
+ default:
+ break;
+ }
+}
+
+void AMDGPUInstPrinter::printCT(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ unsigned CT = MI->getOperand(OpNo).getImm();
+ switch (CT) {
+ case 0:
+ O << "U";
+ break;
+ case 1:
+ O << "N";
+ break;
+ default:
+ break;
+ }
+}
+
+void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ int KCacheMode = MI->getOperand(OpNo).getImm();
+ if (KCacheMode > 0) {
+ int KCacheBank = MI->getOperand(OpNo - 2).getImm();
+ O << "CB" << KCacheBank <<":";
+ int KCacheAddr = MI->getOperand(OpNo + 2).getImm();
+ int LineSize = (KCacheMode == 1)?16:32;
+ O << KCacheAddr * 16 << "-" << KCacheAddr * 16 + LineSize;
+ }
+}
+
#include "AMDGPUGenAsmWriter.inc"
diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
index 767a708..4c1dfa6 100644
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
@@ -35,7 +35,8 @@ private:
void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, StringRef Asm);
+ void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+ StringRef Asm, StringRef Default = "");
void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O);
@@ -47,6 +48,10 @@ private:
void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printBankSwizzle(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printRSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printCT(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream &O);
};
} // End namespace llvm
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
index a3397f3..9a36903 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -82,6 +82,8 @@ void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
// ELFAMDGPUAsmBackend class
//===----------------------------------------------------------------------===//
+namespace {
+
class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend {
public:
ELFAMDGPUAsmBackend(const Target &T) : AMDGPUAsmBackend(T) { }
@@ -91,6 +93,8 @@ public:
}
};
+} // end anonymous namespace
+
MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, StringRef TT,
StringRef CPU) {
return new ELFAMDGPUAsmBackend(T);
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 2aae26a..f1c44df 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -11,7 +11,7 @@
#include "AMDGPUMCAsmInfo.h"
using namespace llvm;
-AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Target &T, StringRef &TT) : MCAsmInfo() {
+AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() {
HasSingleParameterDotFile = false;
WeakDefDirective = 0;
//===------------------------------------------------------------------===//
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h
index 3ad0fa6..485167b 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h
@@ -17,12 +17,11 @@
#include "llvm/MC/MCAsmInfo.h"
namespace llvm {
-class Target;
class StringRef;
class AMDGPUMCAsmInfo : public MCAsmInfo {
public:
- explicit AMDGPUMCAsmInfo(const Target &T, StringRef &TT);
+ explicit AMDGPUMCAsmInfo(StringRef &TT);
const char* getDataASDirective(unsigned int Size, unsigned int AS) const;
const MCSection* getNonexecutableStackSection(MCContext &CTX) const;
};
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 45d009c..61d70bb 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -78,7 +78,7 @@ static MCCodeEmitter *createAMDGPUMCCodeEmitter(const MCInstrInfo &MCII,
if (STI.getFeatureBits() & AMDGPU::Feature64BitPtr) {
return createSIMCCodeEmitter(MCII, MRI, STI, Ctx);
} else {
- return createR600MCCodeEmitter(MCII, MRI, STI, Ctx);
+ return createR600MCCodeEmitter(MCII, MRI, STI);
}
}
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
index 09d0d5b..abb0320 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -33,8 +33,7 @@ extern Target TheAMDGPUTarget;
MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
- const MCSubtargetInfo &STI,
- MCContext &Ctx);
+ const MCSubtargetInfo &STI);
MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
index 7c83d86..4d6c25c 100644
--- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -9,12 +9,8 @@
//
/// \file
///
-/// This code emitter outputs bytecode that is understood by the r600g driver
-/// in the Mesa [1] project. The bytecode is very similar to the hardware's ISA,
-/// but it still needs to be run through a finalizer in order to be executed
-/// by the GPU.
-///
-/// [1] http://www.mesa3d.org/
+/// \brief The R600 code emitter produces machine code that can be executed
+/// directly on the GPU device.
//
//===----------------------------------------------------------------------===//
@@ -30,9 +26,6 @@
#include "llvm/Support/raw_ostream.h"
#include <stdio.h>
-#define SRC_BYTE_COUNT 11
-#define DST_BYTE_COUNT 5
-
using namespace llvm;
namespace {
@@ -43,13 +36,12 @@ class R600MCCodeEmitter : public AMDGPUMCCodeEmitter {
const MCInstrInfo &MCII;
const MCRegisterInfo &MRI;
const MCSubtargetInfo &STI;
- MCContext &Ctx;
public:
R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
- const MCSubtargetInfo &sti, MCContext &ctx)
- : MCII(mcii), MRI(mri), STI(sti), Ctx(ctx) { }
+ const MCSubtargetInfo &sti)
+ : MCII(mcii), MRI(mri), STI(sti) { }
/// \brief Encode the instruction and write it to the OS.
virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
@@ -60,30 +52,14 @@ public:
SmallVectorImpl<MCFixup> &Fixups) const;
private:
- void EmitALUInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups,
- raw_ostream &OS) const;
- void EmitSrc(const MCInst &MI, unsigned OpIdx, raw_ostream &OS) const;
- void EmitSrcISA(const MCInst &MI, unsigned RegOpIdx, unsigned SelOpIdx,
- raw_ostream &OS) const;
- void EmitDst(const MCInst &MI, raw_ostream &OS) const;
- void EmitFCInstr(const MCInst &MI, raw_ostream &OS) const;
-
- void EmitNullBytes(unsigned int byteCount, raw_ostream &OS) const;
-
void EmitByte(unsigned int byte, raw_ostream &OS) const;
- void EmitTwoBytes(uint32_t bytes, raw_ostream &OS) const;
-
void Emit(uint32_t value, raw_ostream &OS) const;
void Emit(uint64_t value, raw_ostream &OS) const;
unsigned getHWRegChan(unsigned reg) const;
unsigned getHWReg(unsigned regNo) const;
- bool isFCOp(unsigned opcode) const;
- bool isTexOp(unsigned opcode) const;
- bool isFlagSet(const MCInst &MI, unsigned Operand, unsigned Flag) const;
-
};
} // End anonymous namespace
@@ -95,16 +71,6 @@ enum RegElement {
ELEMENT_W
};
-enum InstrTypes {
- INSTR_ALU = 0,
- INSTR_TEX,
- INSTR_FC,
- INSTR_NATIVE,
- INSTR_VTX,
- INSTR_EXPORT,
- INSTR_CFALU
-};
-
enum FCInstr {
FC_IF_PREDICATE = 0,
FC_ELSE,
@@ -115,386 +81,63 @@ enum FCInstr {
FC_CONTINUE
};
-enum TextureTypes {
- TEXTURE_1D = 1,
- TEXTURE_2D,
- TEXTURE_3D,
- TEXTURE_CUBE,
- TEXTURE_RECT,
- TEXTURE_SHADOW1D,
- TEXTURE_SHADOW2D,
- TEXTURE_SHADOWRECT,
- TEXTURE_1D_ARRAY,
- TEXTURE_2D_ARRAY,
- TEXTURE_SHADOW1D_ARRAY,
- TEXTURE_SHADOW2D_ARRAY
-};
-
MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
- const MCSubtargetInfo &STI,
- MCContext &Ctx) {
- return new R600MCCodeEmitter(MCII, MRI, STI, Ctx);
+ const MCSubtargetInfo &STI) {
+ return new R600MCCodeEmitter(MCII, MRI, STI);
}
void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups) const {
- if (isFCOp(MI.getOpcode())){
- EmitFCInstr(MI, OS);
- } else if (MI.getOpcode() == AMDGPU::RETURN ||
+ const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+ if (MI.getOpcode() == AMDGPU::RETURN ||
MI.getOpcode() == AMDGPU::FETCH_CLAUSE ||
MI.getOpcode() == AMDGPU::ALU_CLAUSE ||
MI.getOpcode() == AMDGPU::BUNDLE ||
MI.getOpcode() == AMDGPU::KILL) {
return;
- } else {
- switch(MI.getOpcode()) {
- case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
- case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
- uint64_t inst = getBinaryCodeForInstr(MI, Fixups);
- EmitByte(INSTR_NATIVE, OS);
- Emit(inst, OS);
- break;
- }
- case AMDGPU::CONSTANT_LOAD_eg:
- case AMDGPU::VTX_READ_PARAM_8_eg:
- case AMDGPU::VTX_READ_PARAM_16_eg:
- case AMDGPU::VTX_READ_PARAM_32_eg:
- case AMDGPU::VTX_READ_PARAM_128_eg:
- case AMDGPU::VTX_READ_GLOBAL_8_eg:
- case AMDGPU::VTX_READ_GLOBAL_32_eg:
- case AMDGPU::VTX_READ_GLOBAL_128_eg:
- case AMDGPU::TEX_VTX_CONSTBUF:
- case AMDGPU::TEX_VTX_TEXBUF : {
- uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups);
- uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset
- InstWord2 |= 1 << 19;
-
- EmitByte(INSTR_NATIVE, OS);
- Emit(InstWord01, OS);
- EmitByte(INSTR_NATIVE, OS);
- Emit(InstWord2, OS);
- Emit((u_int32_t) 0, OS);
- break;
- }
- case AMDGPU::TEX_LD:
- case AMDGPU::TEX_GET_TEXTURE_RESINFO:
- case AMDGPU::TEX_SAMPLE:
- case AMDGPU::TEX_SAMPLE_C:
- case AMDGPU::TEX_SAMPLE_L:
- case AMDGPU::TEX_SAMPLE_C_L:
- case AMDGPU::TEX_SAMPLE_LB:
- case AMDGPU::TEX_SAMPLE_C_LB:
- case AMDGPU::TEX_SAMPLE_G:
- case AMDGPU::TEX_SAMPLE_C_G:
- case AMDGPU::TEX_GET_GRADIENTS_H:
- case AMDGPU::TEX_GET_GRADIENTS_V:
- case AMDGPU::TEX_SET_GRADIENTS_H:
- case AMDGPU::TEX_SET_GRADIENTS_V: {
- unsigned Opcode = MI.getOpcode();
- bool HasOffsets = (Opcode == AMDGPU::TEX_LD);
- unsigned OpOffset = HasOffsets ? 3 : 0;
- int64_t Sampler = MI.getOperand(OpOffset + 3).getImm();
- int64_t TextureType = MI.getOperand(OpOffset + 4).getImm();
-
- uint32_t SrcSelect[4] = {0, 1, 2, 3};
- uint32_t Offsets[3] = {0, 0, 0};
- uint64_t CoordType[4] = {1, 1, 1, 1};
-
- if (HasOffsets)
- for (unsigned i = 0; i < 3; i++) {
- int SignedOffset = MI.getOperand(i + 2).getImm();
- Offsets[i] = (SignedOffset & 0x1F);
- }
-
-
- if (TextureType == TEXTURE_RECT ||
- TextureType == TEXTURE_SHADOWRECT) {
- CoordType[ELEMENT_X] = 0;
- CoordType[ELEMENT_Y] = 0;
- }
-
- if (TextureType == TEXTURE_1D_ARRAY ||
- TextureType == TEXTURE_SHADOW1D_ARRAY) {
- if (Opcode == AMDGPU::TEX_SAMPLE_C_L ||
- Opcode == AMDGPU::TEX_SAMPLE_C_LB) {
- CoordType[ELEMENT_Y] = 0;
- } else {
- CoordType[ELEMENT_Z] = 0;
- SrcSelect[ELEMENT_Z] = ELEMENT_Y;
- }
- } else if (TextureType == TEXTURE_2D_ARRAY ||
- TextureType == TEXTURE_SHADOW2D_ARRAY) {
- CoordType[ELEMENT_Z] = 0;
- }
-
-
- if ((TextureType == TEXTURE_SHADOW1D ||
- TextureType == TEXTURE_SHADOW2D ||
- TextureType == TEXTURE_SHADOWRECT ||
- TextureType == TEXTURE_SHADOW1D_ARRAY) &&
- Opcode != AMDGPU::TEX_SAMPLE_C_L &&
- Opcode != AMDGPU::TEX_SAMPLE_C_LB) {
- SrcSelect[ELEMENT_W] = ELEMENT_Z;
- }
-
- uint64_t Word01 = getBinaryCodeForInstr(MI, Fixups) |
- CoordType[ELEMENT_X] << 60 | CoordType[ELEMENT_Y] << 61 |
- CoordType[ELEMENT_Z] << 62 | CoordType[ELEMENT_W] << 63;
+ } else if (IS_VTX(Desc)) {
+ uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups);
+ uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset
+ InstWord2 |= 1 << 19;
+
+ Emit(InstWord01, OS);
+ Emit(InstWord2, OS);
+ Emit((uint32_t) 0, OS);
+ } else if (IS_TEX(Desc)) {
+ int64_t Sampler = MI.getOperand(14).getImm();
+
+ int64_t SrcSelect[4] = {
+ MI.getOperand(2).getImm(),
+ MI.getOperand(3).getImm(),
+ MI.getOperand(4).getImm(),
+ MI.getOperand(5).getImm()
+ };
+ int64_t Offsets[3] = {
+ MI.getOperand(6).getImm() & 0x1F,
+ MI.getOperand(7).getImm() & 0x1F,
+ MI.getOperand(8).getImm() & 0x1F
+ };
+
+ uint64_t Word01 = getBinaryCodeForInstr(MI, Fixups);
uint32_t Word2 = Sampler << 15 | SrcSelect[ELEMENT_X] << 20 |
SrcSelect[ELEMENT_Y] << 23 | SrcSelect[ELEMENT_Z] << 26 |
SrcSelect[ELEMENT_W] << 29 | Offsets[0] << 0 | Offsets[1] << 5 |
Offsets[2] << 10;
- EmitByte(INSTR_NATIVE, OS);
Emit(Word01, OS);
- EmitByte(INSTR_NATIVE, OS);
Emit(Word2, OS);
- Emit((u_int32_t) 0, OS);
- break;
- }
- case AMDGPU::CF_ALU:
- case AMDGPU::CF_ALU_PUSH_BEFORE: {
- uint64_t Inst = getBinaryCodeForInstr(MI, Fixups);
- EmitByte(INSTR_NATIVE, OS);
- Emit(Inst, OS);
- break;
- }
- case AMDGPU::CF_CALL_FS_EG:
- case AMDGPU::CF_CALL_FS_R600:
- return;
- case AMDGPU::CF_TC_EG:
- case AMDGPU::CF_VC_EG:
- case AMDGPU::CF_TC_R600:
- case AMDGPU::CF_VC_R600:
- case AMDGPU::WHILE_LOOP_EG:
- case AMDGPU::END_LOOP_EG:
- case AMDGPU::LOOP_BREAK_EG:
- case AMDGPU::CF_CONTINUE_EG:
- case AMDGPU::CF_JUMP_EG:
- case AMDGPU::CF_ELSE_EG:
- case AMDGPU::POP_EG:
- case AMDGPU::WHILE_LOOP_R600:
- case AMDGPU::END_LOOP_R600:
- case AMDGPU::LOOP_BREAK_R600:
- case AMDGPU::CF_CONTINUE_R600:
- case AMDGPU::CF_JUMP_R600:
- case AMDGPU::CF_ELSE_R600:
- case AMDGPU::POP_R600:
- case AMDGPU::EG_ExportSwz:
- case AMDGPU::R600_ExportSwz:
- case AMDGPU::EG_ExportBuf:
- case AMDGPU::R600_ExportBuf:
- case AMDGPU::PAD:
- case AMDGPU::CF_END_R600:
- case AMDGPU::CF_END_EG:
- case AMDGPU::CF_END_CM: {
- uint64_t Inst = getBinaryCodeForInstr(MI, Fixups);
- EmitByte(INSTR_NATIVE, OS);
- Emit(Inst, OS);
- break;
- }
- default:
- uint64_t Inst = getBinaryCodeForInstr(MI, Fixups);
- EmitByte(INSTR_NATIVE, OS);
- Emit(Inst, OS);
- break;
- }
- }
-}
-
-void R600MCCodeEmitter::EmitALUInstr(const MCInst &MI,
- SmallVectorImpl<MCFixup> &Fixups,
- raw_ostream &OS) const {
- const MCInstrDesc &MCDesc = MCII.get(MI.getOpcode());
-
- // Emit instruction type
- EmitByte(INSTR_ALU, OS);
-
- uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups);
-
- //older alu have different encoding for instructions with one or two src
- //parameters.
- if ((STI.getFeatureBits() & AMDGPU::FeatureR600ALUInst) &&
- !(MCDesc.TSFlags & R600_InstFlag::OP3)) {
- uint64_t ISAOpCode = InstWord01 & (0x3FFULL << 39);
- InstWord01 &= ~(0x3FFULL << 39);
- InstWord01 |= ISAOpCode << 1;
- }
-
- unsigned SrcNum = MCDesc.TSFlags & R600_InstFlag::OP3 ? 3 :
- MCDesc.TSFlags & R600_InstFlag::OP2 ? 2 : 1;
-
- EmitByte(SrcNum, OS);
-
- const unsigned SrcOps[3][2] = {
- {R600Operands::SRC0, R600Operands::SRC0_SEL},
- {R600Operands::SRC1, R600Operands::SRC1_SEL},
- {R600Operands::SRC2, R600Operands::SRC2_SEL}
- };
-
- for (unsigned SrcIdx = 0; SrcIdx < SrcNum; ++SrcIdx) {
- unsigned RegOpIdx = R600Operands::ALUOpTable[SrcNum-1][SrcOps[SrcIdx][0]];
- unsigned SelOpIdx = R600Operands::ALUOpTable[SrcNum-1][SrcOps[SrcIdx][1]];
- EmitSrcISA(MI, RegOpIdx, SelOpIdx, OS);
- }
-
- Emit(InstWord01, OS);
- return;
-}
-
-void R600MCCodeEmitter::EmitSrc(const MCInst &MI, unsigned OpIdx,
- raw_ostream &OS) const {
- const MCOperand &MO = MI.getOperand(OpIdx);
- union {
- float f;
- uint32_t i;
- } Value;
- Value.i = 0;
- // Emit the source select (2 bytes). For GPRs, this is the register index.
- // For other potential instruction operands, (e.g. constant registers) the
- // value of the source select is defined in the r600isa docs.
- if (MO.isReg()) {
- unsigned reg = MO.getReg();
- EmitTwoBytes(getHWReg(reg), OS);
- if (reg == AMDGPU::ALU_LITERAL_X) {
- unsigned ImmOpIndex = MI.getNumOperands() - 1;
- MCOperand ImmOp = MI.getOperand(ImmOpIndex);
- if (ImmOp.isFPImm()) {
- Value.f = ImmOp.getFPImm();
- } else {
- assert(ImmOp.isImm());
- Value.i = ImmOp.getImm();
- }
- }
- } else {
- // XXX: Handle other operand types.
- EmitTwoBytes(0, OS);
- }
-
- // Emit the source channel (1 byte)
- if (MO.isReg()) {
- EmitByte(getHWRegChan(MO.getReg()), OS);
+ Emit((uint32_t) 0, OS);
} else {
- EmitByte(0, OS);
- }
-
- // XXX: Emit isNegated (1 byte)
- if ((!(isFlagSet(MI, OpIdx, MO_FLAG_ABS)))
- && (isFlagSet(MI, OpIdx, MO_FLAG_NEG) ||
- (MO.isReg() &&
- (MO.getReg() == AMDGPU::NEG_ONE || MO.getReg() == AMDGPU::NEG_HALF)))){
- EmitByte(1, OS);
- } else {
- EmitByte(0, OS);
- }
-
- // Emit isAbsolute (1 byte)
- if (isFlagSet(MI, OpIdx, MO_FLAG_ABS)) {
- EmitByte(1, OS);
- } else {
- EmitByte(0, OS);
- }
-
- // XXX: Emit relative addressing mode (1 byte)
- EmitByte(0, OS);
-
- // Emit kc_bank, This will be adjusted later by r600_asm
- EmitByte(0, OS);
-
- // Emit the literal value, if applicable (4 bytes).
- Emit(Value.i, OS);
-
-}
-
-void R600MCCodeEmitter::EmitSrcISA(const MCInst &MI, unsigned RegOpIdx,
- unsigned SelOpIdx, raw_ostream &OS) const {
- const MCOperand &RegMO = MI.getOperand(RegOpIdx);
- const MCOperand &SelMO = MI.getOperand(SelOpIdx);
-
- union {
- float f;
- uint32_t i;
- } InlineConstant;
- InlineConstant.i = 0;
- // Emit source type (1 byte) and source select (4 bytes). For GPRs type is 0
- // and select is 0 (GPR index is encoded in the instr encoding. For constants
- // type is 1 and select is the original const select passed from the driver.
- unsigned Reg = RegMO.getReg();
- if (Reg == AMDGPU::ALU_CONST) {
- EmitByte(1, OS);
- uint32_t Sel = SelMO.getImm();
- Emit(Sel, OS);
- } else {
- EmitByte(0, OS);
- Emit((uint32_t)0, OS);
- }
-
- if (Reg == AMDGPU::ALU_LITERAL_X) {
- unsigned ImmOpIndex = MI.getNumOperands() - 2;
- MCOperand ImmOp = MI.getOperand(ImmOpIndex);
- if (ImmOp.isFPImm()) {
- InlineConstant.f = ImmOp.getFPImm();
- } else {
- assert(ImmOp.isImm());
- InlineConstant.i = ImmOp.getImm();
+ uint64_t Inst = getBinaryCodeForInstr(MI, Fixups);
+ if ((STI.getFeatureBits() & AMDGPU::FeatureR600ALUInst) &&
+ ((Desc.TSFlags & R600_InstFlag::OP1) ||
+ Desc.TSFlags & R600_InstFlag::OP2)) {
+ uint64_t ISAOpCode = Inst & (0x3FFULL << 39);
+ Inst &= ~(0x3FFULL << 39);
+ Inst |= ISAOpCode << 1;
}
- }
-
- // Emit the literal value, if applicable (4 bytes).
- Emit(InlineConstant.i, OS);
-}
-
-void R600MCCodeEmitter::EmitFCInstr(const MCInst &MI, raw_ostream &OS) const {
-
- // Emit instruction type
- EmitByte(INSTR_FC, OS);
-
- // Emit SRC
- unsigned NumOperands = MI.getNumOperands();
- if (NumOperands > 0) {
- assert(NumOperands == 1);
- EmitSrc(MI, 0, OS);
- } else {
- EmitNullBytes(SRC_BYTE_COUNT, OS);
- }
-
- // Emit FC Instruction
- enum FCInstr instr;
- switch (MI.getOpcode()) {
- case AMDGPU::PREDICATED_BREAK:
- instr = FC_BREAK_PREDICATE;
- break;
- case AMDGPU::CONTINUE:
- instr = FC_CONTINUE;
- break;
- case AMDGPU::IF_PREDICATE_SET:
- instr = FC_IF_PREDICATE;
- break;
- case AMDGPU::ELSE:
- instr = FC_ELSE;
- break;
- case AMDGPU::ENDIF:
- instr = FC_ENDIF;
- break;
- case AMDGPU::ENDLOOP:
- instr = FC_ENDLOOP;
- break;
- case AMDGPU::WHILELOOP:
- instr = FC_BGNLOOP;
- break;
- default:
- abort();
- break;
- }
- EmitByte(instr, OS);
-}
-
-void R600MCCodeEmitter::EmitNullBytes(unsigned int ByteCount,
- raw_ostream &OS) const {
-
- for (unsigned int i = 0; i < ByteCount; i++) {
- EmitByte(0, OS);
+ Emit(Inst, OS);
}
}
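A standalone sketch (not part of this patch) of the opcode fixup kept in EncodeInstruction for the older R600 ALU encoding (FeatureR600ALUInst with OP1/OP2 forms): the 10-bit opcode field that the generated encoding places at bits [48:39] is moved up by one bit, per the mask used in the hunk above.

// Minimal model of the R600 ALU opcode re-encoding shown above.
#include <cassert>
#include <cstdint>

static uint64_t fixupR600ALUOpcode(uint64_t Inst) {
  uint64_t ISAOpCode = Inst & (0x3FFULL << 39);  // extract the opcode field
  Inst &= ~(0x3FFULL << 39);                     // clear it
  Inst |= ISAOpCode << 1;                        // re-insert one bit higher
  return Inst;
}

int main() {
  uint64_t Inst = (0x155ULL & 0x3FF) << 39;      // opcode-only encoding
  assert(fixupR600ALUOpcode(Inst) == (0x155ULL << 40));
  return 0;
}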
@@ -502,12 +145,6 @@ void R600MCCodeEmitter::EmitByte(unsigned int Byte, raw_ostream &OS) const {
OS.write((uint8_t) Byte & 0xff);
}
-void R600MCCodeEmitter::EmitTwoBytes(unsigned int Bytes,
- raw_ostream &OS) const {
- OS.write((uint8_t) (Bytes & 0xff));
- OS.write((uint8_t) ((Bytes >> 8) & 0xff));
-}
-
void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const {
for (unsigned i = 0; i < 4; i++) {
OS.write((uint8_t) ((Value >> (8 * i)) & 0xff));
@@ -545,55 +182,4 @@ uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI,
}
}
-//===----------------------------------------------------------------------===//
-// Encoding helper functions
-//===----------------------------------------------------------------------===//
-
-bool R600MCCodeEmitter::isFCOp(unsigned opcode) const {
- switch(opcode) {
- default: return false;
- case AMDGPU::PREDICATED_BREAK:
- case AMDGPU::CONTINUE:
- case AMDGPU::IF_PREDICATE_SET:
- case AMDGPU::ELSE:
- case AMDGPU::ENDIF:
- case AMDGPU::ENDLOOP:
- case AMDGPU::WHILELOOP:
- return true;
- }
-}
-
-bool R600MCCodeEmitter::isTexOp(unsigned opcode) const {
- switch(opcode) {
- default: return false;
- case AMDGPU::TEX_LD:
- case AMDGPU::TEX_GET_TEXTURE_RESINFO:
- case AMDGPU::TEX_SAMPLE:
- case AMDGPU::TEX_SAMPLE_C:
- case AMDGPU::TEX_SAMPLE_L:
- case AMDGPU::TEX_SAMPLE_C_L:
- case AMDGPU::TEX_SAMPLE_LB:
- case AMDGPU::TEX_SAMPLE_C_LB:
- case AMDGPU::TEX_SAMPLE_G:
- case AMDGPU::TEX_SAMPLE_C_G:
- case AMDGPU::TEX_GET_GRADIENTS_H:
- case AMDGPU::TEX_GET_GRADIENTS_V:
- case AMDGPU::TEX_SET_GRADIENTS_H:
- case AMDGPU::TEX_SET_GRADIENTS_V:
- return true;
- }
-}
-
-bool R600MCCodeEmitter::isFlagSet(const MCInst &MI, unsigned Operand,
- unsigned Flag) const {
- const MCInstrDesc &MCDesc = MCII.get(MI.getOpcode());
- unsigned FlagIndex = GET_FLAG_OPERAND_IDX(MCDesc.TSFlags);
- if (FlagIndex == 0) {
- return false;
- }
- assert(MI.getOperand(FlagIndex).isImm());
- return !!((MI.getOperand(FlagIndex).getImm() >>
- (NUM_MO_FLAGS * Operand)) & Flag);
-}
-
#include "AMDGPUGenMCCodeEmitter.inc"
diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td
index e024e66..81f407e 100644
--- a/lib/Target/R600/Processors.td
+++ b/lib/Target/R600/Processors.td
@@ -1,4 +1,4 @@
-//===-- Processors.td - TODO: Add brief description -------===//
+//===-- Processors.td - R600 Processor definitions ------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,46 +6,45 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-// AMDIL processors supported.
-//
-//===----------------------------------------------------------------------===//
class Proc<string Name, ProcessorItineraries itin, list<SubtargetFeature> Features>
: Processor<Name, itin, Features>;
def : Proc<"", R600_VLIW5_Itin,
- [FeatureR600ALUInst, FeatureVertexCache]>;
+ [FeatureR600, FeatureVertexCache]>;
def : Proc<"r600", R600_VLIW5_Itin,
- [FeatureR600ALUInst , FeatureVertexCache]>;
+ [FeatureR600 , FeatureVertexCache]>;
def : Proc<"rs880", R600_VLIW5_Itin,
- [FeatureR600ALUInst]>;
+ [FeatureR600]>;
def : Proc<"rv670", R600_VLIW5_Itin,
- [FeatureR600ALUInst, FeatureFP64, FeatureVertexCache]>;
+ [FeatureR600, FeatureFP64, FeatureVertexCache]>;
def : Proc<"rv710", R600_VLIW5_Itin,
- [FeatureVertexCache]>;
+ [FeatureR700, FeatureVertexCache]>;
def : Proc<"rv730", R600_VLIW5_Itin,
- [FeatureVertexCache]>;
+ [FeatureR700, FeatureVertexCache]>;
def : Proc<"rv770", R600_VLIW5_Itin,
- [FeatureFP64, FeatureVertexCache]>;
+ [FeatureR700, FeatureFP64, FeatureVertexCache]>;
def : Proc<"cedar", R600_VLIW5_Itin,
- [FeatureByteAddress, FeatureImages, FeatureVertexCache]>;
+ [FeatureEvergreen, FeatureVertexCache]>;
def : Proc<"redwood", R600_VLIW5_Itin,
- [FeatureByteAddress, FeatureImages, FeatureVertexCache]>;
+ [FeatureEvergreen, FeatureVertexCache]>;
def : Proc<"sumo", R600_VLIW5_Itin,
- [FeatureByteAddress, FeatureImages]>;
+ [FeatureEvergreen]>;
def : Proc<"juniper", R600_VLIW5_Itin,
- [FeatureByteAddress, FeatureImages, FeatureVertexCache]>;
+ [FeatureEvergreen, FeatureVertexCache]>;
def : Proc<"cypress", R600_VLIW5_Itin,
- [FeatureByteAddress, FeatureImages, FeatureFP64, FeatureVertexCache]>;
+ [FeatureEvergreen, FeatureFP64, FeatureVertexCache]>;
def : Proc<"barts", R600_VLIW5_Itin,
- [FeatureByteAddress, FeatureImages, FeatureVertexCache]>;
+ [FeatureNorthernIslands, FeatureVertexCache]>;
def : Proc<"turks", R600_VLIW5_Itin,
- [FeatureByteAddress, FeatureImages, FeatureVertexCache]>;
+ [FeatureNorthernIslands, FeatureVertexCache]>;
def : Proc<"caicos", R600_VLIW5_Itin,
- [FeatureByteAddress, FeatureImages]>;
+ [FeatureNorthernIslands]>;
def : Proc<"cayman", R600_VLIW4_Itin,
- [FeatureByteAddress, FeatureImages, FeatureFP64]>;def : Proc<"SI", SI_Itin, [Feature64BitPtr, FeatureFP64]>;
-def : Proc<"tahiti", SI_Itin, [Feature64BitPtr, FeatureFP64]>;
-def : Proc<"pitcairn", SI_Itin, [Feature64BitPtr, FeatureFP64]>;
-def : Proc<"verde", SI_Itin, [Feature64BitPtr, FeatureFP64]>;
-def : Proc<"oland", SI_Itin, [Feature64BitPtr, FeatureFP64]>;
+ [FeatureNorthernIslands, FeatureFP64, FeatureCaymanISA]>;
+
+def : Proc<"SI", SI_Itin, [FeatureSouthernIslands]>;
+def : Proc<"tahiti", SI_Itin, [FeatureSouthernIslands]>;
+def : Proc<"pitcairn", SI_Itin, [FeatureSouthernIslands]>;
+def : Proc<"verde", SI_Itin, [FeatureSouthernIslands]>;
+def : Proc<"oland", SI_Itin, [FeatureSouthernIslands]>;
+def : Proc<"hainan", SI_Itin, [FeatureSouthernIslands]>;
diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index 0995795..ab29d60 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -14,8 +14,6 @@
#define DEBUG_TYPE "r600cf"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-
#include "AMDGPU.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
@@ -24,8 +22,11 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
-namespace llvm {
+namespace {
class R600ControlFlowFinalizer : public MachineFunctionPass {
@@ -48,7 +49,7 @@ private:
static char ID;
const R600InstrInfo *TII;
- const R600RegisterInfo &TRI;
+ const R600RegisterInfo *TRI;
unsigned MaxFetchInst;
const AMDGPUSubtarget &ST;
@@ -64,7 +65,7 @@ private:
const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
unsigned Opcode = 0;
- bool isEg = (ST.device()->getGeneration() >= AMDGPUDeviceInfo::HD5XXX);
+ bool isEg = (ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN);
switch (CFI) {
case CF_TC:
Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
@@ -97,7 +98,7 @@ private:
Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
break;
case CF_END:
- if (ST.device()->getDeviceFlag() == OCL_DEVICE_CAYMAN) {
+ if (ST.hasCaymanISA()) {
Opcode = AMDGPU::CF_END_CM;
break;
}
@@ -109,28 +110,33 @@ private:
}
bool isCompatibleWithClause(const MachineInstr *MI,
- std::set<unsigned> &DstRegs, std::set<unsigned> &SrcRegs) const {
+ std::set<unsigned> &DstRegs) const {
unsigned DstMI, SrcMI;
for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
E = MI->operands_end(); I != E; ++I) {
const MachineOperand &MO = *I;
if (!MO.isReg())
continue;
- if (MO.isDef())
- DstMI = MO.getReg();
+ if (MO.isDef()) {
+ unsigned Reg = MO.getReg();
+ if (AMDGPU::R600_Reg128RegClass.contains(Reg))
+ DstMI = Reg;
+ else
+ DstMI = TRI->getMatchingSuperReg(Reg,
+ TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
+ &AMDGPU::R600_Reg128RegClass);
+ }
if (MO.isUse()) {
unsigned Reg = MO.getReg();
if (AMDGPU::R600_Reg128RegClass.contains(Reg))
SrcMI = Reg;
else
- SrcMI = TRI.getMatchingSuperReg(Reg,
- TRI.getSubRegFromChannel(TRI.getHWRegChan(Reg)),
+ SrcMI = TRI->getMatchingSuperReg(Reg,
+ TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
&AMDGPU::R600_Reg128RegClass);
}
}
- if ((DstRegs.find(SrcMI) == DstRegs.end()) &&
- (SrcRegs.find(DstMI) == SrcRegs.end())) {
- SrcRegs.insert(SrcMI);
+ if ((DstRegs.find(SrcMI) == DstRegs.end())) {
DstRegs.insert(DstMI);
return true;
} else
@@ -144,16 +150,16 @@ private:
std::vector<MachineInstr *> ClauseContent;
unsigned AluInstCount = 0;
bool IsTex = TII->usesTextureCache(ClauseHead);
- std::set<unsigned> DstRegs, SrcRegs;
+ std::set<unsigned> DstRegs;
for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
if (IsTrivialInst(I))
continue;
- if (AluInstCount > MaxFetchInst)
+ if (AluInstCount >= MaxFetchInst)
break;
if ((IsTex && !TII->usesTextureCache(I)) ||
(!IsTex && !TII->usesVertexCache(I)))
break;
- if (!isCompatibleWithClause(I, DstRegs, SrcRegs))
+ if (!isCompatibleWithClause(I, DstRegs))
break;
AluInstCount ++;
ClauseContent.push_back(I);
@@ -165,29 +171,27 @@ private:
return ClauseFile(MIb, ClauseContent);
}
- void getLiteral(MachineInstr *MI, std::vector<unsigned> &Lits) const {
+ void getLiteral(MachineInstr *MI, std::vector<int64_t> &Lits) const {
unsigned LiteralRegs[] = {
AMDGPU::ALU_LITERAL_X,
AMDGPU::ALU_LITERAL_Y,
AMDGPU::ALU_LITERAL_Z,
AMDGPU::ALU_LITERAL_W
};
- for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
- if (!MO.isReg())
+ const SmallVector<std::pair<MachineOperand *, int64_t>, 3 > Srcs =
+ TII->getSrcs(MI);
+ for (unsigned i = 0, e = Srcs.size(); i < e; ++i) {
+ if (Srcs[i].first->getReg() != AMDGPU::ALU_LITERAL_X)
continue;
- if (MO.getReg() != AMDGPU::ALU_LITERAL_X)
- continue;
- unsigned ImmIdx = TII->getOperandIdx(MI->getOpcode(), R600Operands::IMM);
- int64_t Imm = MI->getOperand(ImmIdx).getImm();
- std::vector<unsigned>::iterator It =
+ int64_t Imm = Srcs[i].second;
+ std::vector<int64_t>::iterator It =
std::find(Lits.begin(), Lits.end(), Imm);
if (It != Lits.end()) {
unsigned Index = It - Lits.begin();
- MO.setReg(LiteralRegs[Index]);
+ Srcs[i].first->setReg(LiteralRegs[Index]);
} else {
assert(Lits.size() < 4 && "Too many literals in Instruction Group");
- MO.setReg(LiteralRegs[Lits.size()]);
+ Srcs[i].first->setReg(LiteralRegs[Lits.size()]);
Lits.push_back(Imm);
}
}
@@ -221,7 +225,7 @@ private:
}
if (!I->isBundle() && !TII->isALUInstr(I->getOpcode()))
break;
- std::vector<unsigned> Literals;
+ std::vector<int64_t> Literals;
if (I->isBundle()) {
MachineInstr *DeleteMI = I;
MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
@@ -295,37 +299,38 @@ private:
}
unsigned getHWStackSize(unsigned StackSubEntry, bool hasPush) const {
- switch (ST.device()->getGeneration()) {
- case AMDGPUDeviceInfo::HD4XXX:
+ switch (ST.getGeneration()) {
+ case AMDGPUSubtarget::R600:
+ case AMDGPUSubtarget::R700:
if (hasPush)
StackSubEntry += 2;
break;
- case AMDGPUDeviceInfo::HD5XXX:
+ case AMDGPUSubtarget::EVERGREEN:
if (hasPush)
StackSubEntry ++;
- case AMDGPUDeviceInfo::HD6XXX:
+ case AMDGPUSubtarget::NORTHERN_ISLANDS:
StackSubEntry += 2;
break;
+ default: llvm_unreachable("Not a VLIW4/VLIW5 GPU");
}
return (StackSubEntry + 3)/4; // Need ceil value of StackSubEntry/4
}
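  // Editorial note, not part of this patch: the expression above is ordinary
  // integer ceiling division, e.g. StackSubEntry = 5 gives (5 + 3) / 4 == 2
  // hardware stack entries, while StackSubEntry = 4 gives exactly 1.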
public:
R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID),
- TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo())),
- TRI(TII->getRegisterInfo()),
+ TII (0), TRI(0),
ST(tm.getSubtarget<AMDGPUSubtarget>()) {
const AMDGPUSubtarget &ST = tm.getSubtarget<AMDGPUSubtarget>();
- if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD4XXX)
- MaxFetchInst = 8;
- else
- MaxFetchInst = 16;
+ MaxFetchInst = ST.getTexVTXClauseSize();
}
virtual bool runOnMachineFunction(MachineFunction &MF) {
+ TII=static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
+ TRI=static_cast<const R600RegisterInfo *>(MF.getTarget().getRegisterInfo());
+
unsigned MaxStack = 0;
unsigned CurrentStack = 0;
- bool hasPush;
+ bool HasPush = false;
for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
++MB) {
MachineBasicBlock &MBB = *MB;
@@ -337,6 +342,7 @@ public:
BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
getHWInstrDesc(CF_CALL_FS));
CfCount++;
+ MaxStack = 1;
}
std::vector<ClauseFile> FetchClauses, AluClauses;
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
@@ -354,7 +360,7 @@ public:
case AMDGPU::CF_ALU_PUSH_BEFORE:
CurrentStack++;
MaxStack = std::max(MaxStack, CurrentStack);
- hasPush = true;
+ HasPush = true;
case AMDGPU::CF_ALU:
I = MI;
AluClauses.push_back(MakeALUClause(MBB, I));
@@ -475,7 +481,7 @@ public:
break;
}
}
- MFI->StackSize = getHWStackSize(MaxStack, hasPush);
+ MFI->StackSize = getHWStackSize(MaxStack, HasPush);
}
return false;
@@ -488,7 +494,7 @@ public:
char R600ControlFlowFinalizer::ID = 0;
-}
+} // end anonymous namespace
llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) {
diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h
index 303ca73..aebe581 100644
--- a/lib/Target/R600/R600Defines.h
+++ b/lib/Target/R600/R600Defines.h
@@ -54,6 +54,9 @@ namespace R600_InstFlag {
#define GET_REG_CHAN(reg) ((reg) >> HW_CHAN_SHIFT)
#define GET_REG_INDEX(reg) ((reg) & HW_REG_MASK)
+#define IS_VTX(desc) ((desc).TSFlags & R600_InstFlag::VTX_INST)
+#define IS_TEX(desc) ((desc).TSFlags & R600_InstFlag::TEX_INST)
+
namespace R600Operands {
enum Ops {
DST,
@@ -95,6 +98,106 @@ namespace R600Operands {
{0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 6, 7, 8, 9,-1,10,11,12,13,14,15,16,17,18}
};
+ enum VecOps {
+ UPDATE_EXEC_MASK_X,
+ UPDATE_PREDICATE_X,
+ WRITE_X,
+ OMOD_X,
+ DST_REL_X,
+ CLAMP_X,
+ SRC0_X,
+ SRC0_NEG_X,
+ SRC0_REL_X,
+ SRC0_ABS_X,
+ SRC0_SEL_X,
+ SRC1_X,
+ SRC1_NEG_X,
+ SRC1_REL_X,
+ SRC1_ABS_X,
+ SRC1_SEL_X,
+ PRED_SEL_X,
+ UPDATE_EXEC_MASK_Y,
+ UPDATE_PREDICATE_Y,
+ WRITE_Y,
+ OMOD_Y,
+ DST_REL_Y,
+ CLAMP_Y,
+ SRC0_Y,
+ SRC0_NEG_Y,
+ SRC0_REL_Y,
+ SRC0_ABS_Y,
+ SRC0_SEL_Y,
+ SRC1_Y,
+ SRC1_NEG_Y,
+ SRC1_REL_Y,
+ SRC1_ABS_Y,
+ SRC1_SEL_Y,
+ PRED_SEL_Y,
+ UPDATE_EXEC_MASK_Z,
+ UPDATE_PREDICATE_Z,
+ WRITE_Z,
+ OMOD_Z,
+ DST_REL_Z,
+ CLAMP_Z,
+ SRC0_Z,
+ SRC0_NEG_Z,
+ SRC0_REL_Z,
+ SRC0_ABS_Z,
+ SRC0_SEL_Z,
+ SRC1_Z,
+ SRC1_NEG_Z,
+ SRC1_REL_Z,
+ SRC1_ABS_Z,
+ SRC1_SEL_Z,
+ PRED_SEL_Z,
+ UPDATE_EXEC_MASK_W,
+ UPDATE_PREDICATE_W,
+ WRITE_W,
+ OMOD_W,
+ DST_REL_W,
+ CLAMP_W,
+ SRC0_W,
+ SRC0_NEG_W,
+ SRC0_REL_W,
+ SRC0_ABS_W,
+ SRC0_SEL_W,
+ SRC1_W,
+ SRC1_NEG_W,
+ SRC1_REL_W,
+ SRC1_ABS_W,
+ SRC1_SEL_W,
+ PRED_SEL_W,
+ IMM_0,
+ IMM_1,
+ VEC_COUNT
+ };
+
}
+//===----------------------------------------------------------------------===//
+// Config register definitions
+//===----------------------------------------------------------------------===//
+
+#define R_02880C_DB_SHADER_CONTROL 0x02880C
+#define S_02880C_KILL_ENABLE(x) (((x) & 0x1) << 6)
+
+// These fields are the same for all shader types and families.
+#define S_NUM_GPRS(x) (((x) & 0xFF) << 0)
+#define S_STACK_SIZE(x) (((x) & 0xFF) << 8)
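// Illustrative use, not part of this patch: a shader needing 4 GPRs and a
// 2-entry stack would program its SQ_PGM_RESOURCES_* register with
//   S_NUM_GPRS(4) | S_STACK_SIZE(2)   // == (4 << 0) | (2 << 8) == 0x204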
+//===----------------------------------------------------------------------===//
+// R600, R700 Registers
+//===----------------------------------------------------------------------===//
+
+#define R_028850_SQ_PGM_RESOURCES_PS 0x028850
+#define R_028868_SQ_PGM_RESOURCES_VS 0x028868
+
+//===----------------------------------------------------------------------===//
+// Evergreen, Northern Islands Registers
+//===----------------------------------------------------------------------===//
+
+#define R_028844_SQ_PGM_RESOURCES_PS 0x028844
+#define R_028860_SQ_PGM_RESOURCES_VS 0x028860
+#define R_028878_SQ_PGM_RESOURCES_GS 0x028878
+#define R_0288D4_SQ_PGM_RESOURCES_LS 0x0288d4
+
#endif // R600DEFINES_H_
diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp
index 3fdc678..ff5ce5a 100644
--- a/lib/Target/R600/R600EmitClauseMarkers.cpp
+++ b/lib/Target/R600/R600EmitClauseMarkers.cpp
@@ -23,7 +23,9 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-namespace llvm {
+using namespace llvm;
+
+namespace {
class R600EmitClauseMarkersPass : public MachineFunctionPass {
@@ -36,8 +38,7 @@ private:
case AMDGPU::INTERP_PAIR_XY:
case AMDGPU::INTERP_PAIR_ZW:
case AMDGPU::INTERP_VEC_LOAD:
- case AMDGPU::DOT4_eg_pseudo:
- case AMDGPU::DOT4_r600_pseudo:
+ case AMDGPU::DOT_4:
return 4;
case AMDGPU::KILL:
return 0;
@@ -71,8 +72,7 @@ private:
case AMDGPU::INTERP_PAIR_ZW:
case AMDGPU::INTERP_VEC_LOAD:
case AMDGPU::COPY:
- case AMDGPU::DOT4_eg_pseudo:
- case AMDGPU::DOT4_r600_pseudo:
+ case AMDGPU::DOT_4:
return true;
default:
return false;
@@ -89,31 +89,6 @@ private:
}
}
- // Register Idx, then Const value
- std::vector<std::pair<unsigned, unsigned> > ExtractConstRead(MachineInstr *MI)
- const {
- const R600Operands::Ops OpTable[3][2] = {
- {R600Operands::SRC0, R600Operands::SRC0_SEL},
- {R600Operands::SRC1, R600Operands::SRC1_SEL},
- {R600Operands::SRC2, R600Operands::SRC2_SEL},
- };
- std::vector<std::pair<unsigned, unsigned> > Result;
-
- if (!TII->isALUInstr(MI->getOpcode()))
- return Result;
- for (unsigned j = 0; j < 3; j++) {
- int SrcIdx = TII->getOperandIdx(MI->getOpcode(), OpTable[j][0]);
- if (SrcIdx < 0)
- break;
- if (MI->getOperand(SrcIdx).getReg() == AMDGPU::ALU_CONST) {
- unsigned Const = MI->getOperand(
- TII->getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm();
- Result.push_back(std::pair<unsigned, unsigned>(SrcIdx, Const));
- }
- }
- return Result;
- }
-
std::pair<unsigned, unsigned> getAccessedBankLine(unsigned Sel) const {
// Sel is (512 + (kc_bank << 12) + ConstIndex) << 2
// (See also R600ISelLowering.cpp)
@@ -131,9 +106,13 @@ private:
bool SubstituteKCacheBank(MachineInstr *MI,
std::vector<std::pair<unsigned, unsigned> > &CachedConsts) const {
std::vector<std::pair<unsigned, unsigned> > UsedKCache;
- std::vector<std::pair<unsigned, unsigned> > Consts = ExtractConstRead(MI);
- assert(TII->isALUInstr(MI->getOpcode()) && "Can't assign Const");
+ const SmallVector<std::pair<MachineOperand *, int64_t>, 3> &Consts =
+ TII->getSrcs(MI);
+ assert((TII->isALUInstr(MI->getOpcode()) ||
+ MI->getOpcode() == AMDGPU::DOT_4) && "Can't assign Const");
for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
+ if (Consts[i].first->getReg() != AMDGPU::ALU_CONST)
+ continue;
unsigned Sel = Consts[i].second;
unsigned Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31;
unsigned KCacheIndex = Index * 4 + Chan;
@@ -159,19 +138,22 @@ private:
return false;
}
- for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
- switch(UsedKCache[i].first) {
+ for (unsigned i = 0, j = 0, n = Consts.size(); i < n; ++i) {
+ if (Consts[i].first->getReg() != AMDGPU::ALU_CONST)
+ continue;
+ switch(UsedKCache[j].first) {
case 0:
- MI->getOperand(Consts[i].first).setReg(
- AMDGPU::R600_KC0RegClass.getRegister(UsedKCache[i].second));
+ Consts[i].first->setReg(
+ AMDGPU::R600_KC0RegClass.getRegister(UsedKCache[j].second));
break;
case 1:
- MI->getOperand(Consts[i].first).setReg(
- AMDGPU::R600_KC1RegClass.getRegister(UsedKCache[i].second));
+ Consts[i].first->setReg(
+ AMDGPU::R600_KC1RegClass.getRegister(UsedKCache[j].second));
break;
default:
llvm_unreachable("Wrong Cache Line");
}
+ j++;
}
return true;
}
@@ -202,6 +184,9 @@ private:
if (TII->isALUInstr(I->getOpcode()) &&
!SubstituteKCacheBank(I, KCacheBanks))
break;
+ if (I->getOpcode() == AMDGPU::DOT_4 &&
+ !SubstituteKCacheBank(I, KCacheBanks))
+ break;
AluInstCount += OccupiedDwords(I);
}
unsigned Opcode = PushBeforeModifier ?
@@ -220,9 +205,11 @@ private:
public:
R600EmitClauseMarkersPass(TargetMachine &tm) : MachineFunctionPass(ID),
- TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo())) { }
+ TII(0) { }
virtual bool runOnMachineFunction(MachineFunction &MF) {
+ TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
+
for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
BB != BB_E; ++BB) {
MachineBasicBlock &MBB = *BB;
@@ -246,7 +233,7 @@ public:
char R600EmitClauseMarkersPass::ID = 0;
-}
+} // end anonymous namespace
llvm::FunctionPass *llvm::createR600EmitClauseMarkers(TargetMachine &TM) {
diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp
index f8c900f..40c058f 100644
--- a/lib/Target/R600/R600ExpandSpecialInstrs.cpp
+++ b/lib/Target/R600/R600ExpandSpecialInstrs.cpp
@@ -38,7 +38,7 @@ private:
public:
R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID),
- TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo())) { }
+ TII(0) { }
virtual bool runOnMachineFunction(MachineFunction &MF);
@@ -56,6 +56,7 @@ FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) {
}
bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
+ TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
const R600RegisterInfo &TRI = TII->getRegisterInfo();
@@ -182,6 +183,45 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
MI.eraseFromParent();
continue;
}
+ case AMDGPU::DOT_4: {
+
+ const R600RegisterInfo &TRI = TII->getRegisterInfo();
+
+ unsigned DstReg = MI.getOperand(0).getReg();
+ unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
+
+ for (unsigned Chan = 0; Chan < 4; ++Chan) {
+ bool Mask = (Chan != TRI.getHWRegChan(DstReg));
+ unsigned SubDstReg =
+ AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
+ MachineInstr *BMI =
+ TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg);
+ if (Chan > 0) {
+ BMI->bundleWithPred();
+ }
+ if (Mask) {
+ TII->addFlag(BMI, 0, MO_FLAG_MASK);
+ }
+ if (Chan != 3)
+ TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
+ unsigned Opcode = BMI->getOpcode();
+ // While not strictly necessary from hw point of view, we force
+ // all src operands of a dot4 inst to belong to the same slot.
+ unsigned Src0 = BMI->getOperand(
+ TII->getOperandIdx(Opcode, R600Operands::SRC0))
+ .getReg();
+ unsigned Src1 = BMI->getOperand(
+ TII->getOperandIdx(Opcode, R600Operands::SRC1))
+ .getReg();
+ (void) Src0;
+ (void) Src1;
+ if ((TRI.getEncodingValue(Src0) & 0xff) < 127 &&
+ (TRI.getEncodingValue(Src1) & 0xff) < 127)
+ assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1));
+ }
+ MI.eraseFromParent();
+ continue;
+ }
}
bool IsReduction = TII->isReductionOp(MI.getOpcode());
@@ -268,12 +308,6 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
case AMDGPU::CUBE_eg_pseudo:
Opcode = AMDGPU::CUBE_eg_real;
break;
- case AMDGPU::DOT4_r600_pseudo:
- Opcode = AMDGPU::DOT4_r600_real;
- break;
- case AMDGPU::DOT4_eg_pseudo:
- Opcode = AMDGPU::DOT4_eg_real;
- break;
default:
break;
}
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index a66baca..9cedadb 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -26,8 +26,7 @@
using namespace llvm;
R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
- AMDGPUTargetLowering(TM),
- TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo())) {
+ AMDGPUTargetLowering(TM) {
addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
@@ -43,11 +42,25 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
setOperationAction(ISD::AND, MVT::v4i32, Expand);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
+ setOperationAction(ISD::MUL, MVT::v2i32, Expand);
+ setOperationAction(ISD::MUL, MVT::v4i32, Expand);
+ setOperationAction(ISD::OR, MVT::v4i32, Expand);
+ setOperationAction(ISD::OR, MVT::v2i32, Expand);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
+ setOperationAction(ISD::SHL, MVT::v4i32, Expand);
+ setOperationAction(ISD::SHL, MVT::v2i32, Expand);
+ setOperationAction(ISD::SRL, MVT::v4i32, Expand);
+ setOperationAction(ISD::SRL, MVT::v2i32, Expand);
+ setOperationAction(ISD::SRA, MVT::v4i32, Expand);
+ setOperationAction(ISD::SRA, MVT::v2i32, Expand);
+ setOperationAction(ISD::SUB, MVT::v4i32, Expand);
+ setOperationAction(ISD::SUB, MVT::v2i32, Expand);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
setOperationAction(ISD::UREM, MVT::v4i32, Expand);
setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
+ setOperationAction(ISD::XOR, MVT::v4i32, Expand);
+ setOperationAction(ISD::XOR, MVT::v2i32, Expand);
setOperationAction(ISD::BR_CC, MVT::i32, Expand);
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
@@ -58,8 +71,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
- setOperationAction(ISD::ROTL, MVT::i32, Custom);
-
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
@@ -70,6 +81,9 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
setOperationAction(ISD::SELECT, MVT::i32, Custom);
setOperationAction(ISD::SELECT, MVT::f32, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v4i32, Expand);
+ setOperationAction(ISD::VSELECT, MVT::v2i32, Expand);
+
// Legalize loads and stores to the private address space.
setOperationAction(ISD::LOAD, MVT::i32, Custom);
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
@@ -102,6 +116,8 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
MachineFunction * MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
MachineBasicBlock::iterator I = *MI;
+ const R600InstrInfo *TII =
+ static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());
switch (MI->getOpcode()) {
default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
@@ -171,23 +187,99 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
case AMDGPU::TXD: {
unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
-
+ MachineOperand &RID = MI->getOperand(4);
+ MachineOperand &SID = MI->getOperand(5);
+ unsigned TextureId = MI->getOperand(6).getImm();
+ unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
+ unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
+
+ switch (TextureId) {
+ case 5: // Rect
+ CTX = CTY = 0;
+ break;
+ case 6: // Shadow1D
+ SrcW = SrcZ;
+ break;
+ case 7: // Shadow2D
+ SrcW = SrcZ;
+ break;
+ case 8: // ShadowRect
+ CTX = CTY = 0;
+ SrcW = SrcZ;
+ break;
+ case 9: // 1DArray
+ SrcZ = SrcY;
+ CTZ = 0;
+ break;
+ case 10: // 2DArray
+ CTZ = 0;
+ break;
+ case 11: // Shadow1DArray
+ SrcZ = SrcY;
+ CTZ = 0;
+ break;
+ case 12: // Shadow2DArray
+ CTZ = 0;
+ break;
+ }
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
.addOperand(MI->getOperand(3))
- .addOperand(MI->getOperand(4))
- .addOperand(MI->getOperand(5))
- .addOperand(MI->getOperand(6));
+ .addImm(SrcX)
+ .addImm(SrcY)
+ .addImm(SrcZ)
+ .addImm(SrcW)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(1)
+ .addImm(2)
+ .addImm(3)
+ .addOperand(RID)
+ .addOperand(SID)
+ .addImm(CTX)
+ .addImm(CTY)
+ .addImm(CTZ)
+ .addImm(CTW);
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
.addOperand(MI->getOperand(2))
- .addOperand(MI->getOperand(4))
- .addOperand(MI->getOperand(5))
- .addOperand(MI->getOperand(6));
+ .addImm(SrcX)
+ .addImm(SrcY)
+ .addImm(SrcZ)
+ .addImm(SrcW)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(1)
+ .addImm(2)
+ .addImm(3)
+ .addOperand(RID)
+ .addOperand(SID)
+ .addImm(CTX)
+ .addImm(CTY)
+ .addImm(CTZ)
+ .addImm(CTW);
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
.addOperand(MI->getOperand(0))
.addOperand(MI->getOperand(1))
- .addOperand(MI->getOperand(4))
- .addOperand(MI->getOperand(5))
- .addOperand(MI->getOperand(6))
+ .addImm(SrcX)
+ .addImm(SrcY)
+ .addImm(SrcZ)
+ .addImm(SrcW)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(1)
+ .addImm(2)
+ .addImm(3)
+ .addOperand(RID)
+ .addOperand(SID)
+ .addImm(CTX)
+ .addImm(CTY)
+ .addImm(CTZ)
+ .addImm(CTW)
.addReg(T0, RegState::Implicit)
.addReg(T1, RegState::Implicit);
break;
@@ -196,23 +288,100 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
case AMDGPU::TXD_SHADOW: {
unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
+ MachineOperand &RID = MI->getOperand(4);
+ MachineOperand &SID = MI->getOperand(5);
+ unsigned TextureId = MI->getOperand(6).getImm();
+ unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
+ unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
+
+ switch (TextureId) {
+ case 5: // Rect
+ CTX = CTY = 0;
+ break;
+ case 6: // Shadow1D
+ SrcW = SrcZ;
+ break;
+ case 7: // Shadow2D
+ SrcW = SrcZ;
+ break;
+ case 8: // ShadowRect
+ CTX = CTY = 0;
+ SrcW = SrcZ;
+ break;
+ case 9: // 1DArray
+ SrcZ = SrcY;
+ CTZ = 0;
+ break;
+ case 10: // 2DArray
+ CTZ = 0;
+ break;
+ case 11: // Shadow1DArray
+ SrcZ = SrcY;
+ CTZ = 0;
+ break;
+ case 12: // Shadow2DArray
+ CTZ = 0;
+ break;
+ }
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
.addOperand(MI->getOperand(3))
- .addOperand(MI->getOperand(4))
- .addOperand(MI->getOperand(5))
- .addOperand(MI->getOperand(6));
+ .addImm(SrcX)
+ .addImm(SrcY)
+ .addImm(SrcZ)
+ .addImm(SrcW)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(1)
+ .addImm(2)
+ .addImm(3)
+ .addOperand(RID)
+ .addOperand(SID)
+ .addImm(CTX)
+ .addImm(CTY)
+ .addImm(CTZ)
+ .addImm(CTW);
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
.addOperand(MI->getOperand(2))
- .addOperand(MI->getOperand(4))
- .addOperand(MI->getOperand(5))
- .addOperand(MI->getOperand(6));
+ .addImm(SrcX)
+ .addImm(SrcY)
+ .addImm(SrcZ)
+ .addImm(SrcW)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(1)
+ .addImm(2)
+ .addImm(3)
+ .addOperand(RID)
+ .addOperand(SID)
+ .addImm(CTX)
+ .addImm(CTY)
+ .addImm(CTZ)
+ .addImm(CTW);
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
.addOperand(MI->getOperand(0))
.addOperand(MI->getOperand(1))
- .addOperand(MI->getOperand(4))
- .addOperand(MI->getOperand(5))
- .addOperand(MI->getOperand(6))
+ .addImm(SrcX)
+ .addImm(SrcY)
+ .addImm(SrcZ)
+ .addImm(SrcW)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(0)
+ .addImm(1)
+ .addImm(2)
+ .addImm(3)
+ .addOperand(RID)
+ .addOperand(SID)
+ .addImm(CTX)
+ .addImm(CTY)
+ .addImm(CTZ)
+ .addImm(CTW)
.addReg(T0, RegState::Implicit)
.addReg(T1, RegState::Implicit);
break;
@@ -304,13 +473,9 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//
-using namespace llvm::Intrinsic;
-using namespace llvm::AMDGPUIntrinsic;
-
SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
- case ISD::ROTL: return LowerROTL(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
@@ -327,7 +492,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
MFI->LiveOuts.push_back(Reg);
- return DAG.getCopyToReg(Chain, Op.getDebugLoc(), Reg, Op.getOperand(2));
+ return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
}
case AMDGPUIntrinsic::R600_store_swizzle: {
const SDValue Args[8] = {
@@ -340,7 +505,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
DAG.getConstant(2, MVT::i32), // SWZ_Z
DAG.getConstant(3, MVT::i32) // SWZ_W
};
- return DAG.getNode(AMDGPUISD::EXPORT, Op.getDebugLoc(), Op.getValueType(),
+ return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(),
Args, 8);
}
@@ -354,13 +519,17 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
unsigned IntrinsicID =
cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
EVT VT = Op.getValueType();
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
switch(IntrinsicID) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
case AMDGPUIntrinsic::R600_load_input: {
int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
- return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, Reg, VT);
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MRI.addLiveIn(Reg);
+ return DAG.getCopyFromReg(DAG.getEntryNode(),
+ SDLoc(DAG.getEntryNode()), Reg, VT);
}
case AMDGPUIntrinsic::R600_interp_input: {
@@ -368,6 +537,9 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
MachineSDNode *interp;
if (ijb < 0) {
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const R600InstrInfo *TII =
+ static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
return DAG.getTargetExtractSubreg(
@@ -375,59 +547,153 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
DL, MVT::f32, SDValue(interp, 0));
}
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
+ unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
+ MRI.addLiveIn(RegisterI);
+ MRI.addLiveIn(RegisterJ);
+ SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
+ SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
+ SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
+ SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
+
if (slot % 4 < 2)
interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
- CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
- CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));
+ RegisterJNode, RegisterINode);
else
interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
- CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
- CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));
-
+ RegisterJNode, RegisterINode);
return SDValue(interp, slot % 2);
}
+ case AMDGPUIntrinsic::R600_tex:
+ case AMDGPUIntrinsic::R600_texc:
+ case AMDGPUIntrinsic::R600_txl:
+ case AMDGPUIntrinsic::R600_txlc:
+ case AMDGPUIntrinsic::R600_txb:
+ case AMDGPUIntrinsic::R600_txbc:
+ case AMDGPUIntrinsic::R600_txf:
+ case AMDGPUIntrinsic::R600_txq:
+ case AMDGPUIntrinsic::R600_ddx:
+ case AMDGPUIntrinsic::R600_ddy: {
+ unsigned TextureOp;
+ switch (IntrinsicID) {
+ case AMDGPUIntrinsic::R600_tex:
+ TextureOp = 0;
+ break;
+ case AMDGPUIntrinsic::R600_texc:
+ TextureOp = 1;
+ break;
+ case AMDGPUIntrinsic::R600_txl:
+ TextureOp = 2;
+ break;
+ case AMDGPUIntrinsic::R600_txlc:
+ TextureOp = 3;
+ break;
+ case AMDGPUIntrinsic::R600_txb:
+ TextureOp = 4;
+ break;
+ case AMDGPUIntrinsic::R600_txbc:
+ TextureOp = 5;
+ break;
+ case AMDGPUIntrinsic::R600_txf:
+ TextureOp = 6;
+ break;
+ case AMDGPUIntrinsic::R600_txq:
+ TextureOp = 7;
+ break;
+ case AMDGPUIntrinsic::R600_ddx:
+ TextureOp = 8;
+ break;
+ case AMDGPUIntrinsic::R600_ddy:
+ TextureOp = 9;
+ break;
+ default:
+        llvm_unreachable("Unknown Texture Operation");
+ }
- case r600_read_ngroups_x:
+ SDValue TexArgs[19] = {
+ DAG.getConstant(TextureOp, MVT::i32),
+ Op.getOperand(1),
+ DAG.getConstant(0, MVT::i32),
+ DAG.getConstant(1, MVT::i32),
+ DAG.getConstant(2, MVT::i32),
+ DAG.getConstant(3, MVT::i32),
+ Op.getOperand(2),
+ Op.getOperand(3),
+ Op.getOperand(4),
+ DAG.getConstant(0, MVT::i32),
+ DAG.getConstant(1, MVT::i32),
+ DAG.getConstant(2, MVT::i32),
+ DAG.getConstant(3, MVT::i32),
+ Op.getOperand(5),
+ Op.getOperand(6),
+ Op.getOperand(7),
+ Op.getOperand(8),
+ Op.getOperand(9),
+ Op.getOperand(10)
+ };
+ return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
+ }
+ case AMDGPUIntrinsic::AMDGPU_dp4: {
+ SDValue Args[8] = {
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
+ DAG.getConstant(0, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
+ DAG.getConstant(0, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
+ DAG.getConstant(1, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
+ DAG.getConstant(1, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
+ DAG.getConstant(2, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
+ DAG.getConstant(2, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
+ DAG.getConstant(3, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
+ DAG.getConstant(3, MVT::i32))
+ };
+ return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
+ }
+
+ case Intrinsic::r600_read_ngroups_x:
return LowerImplicitParameter(DAG, VT, DL, 0);
- case r600_read_ngroups_y:
+ case Intrinsic::r600_read_ngroups_y:
return LowerImplicitParameter(DAG, VT, DL, 1);
- case r600_read_ngroups_z:
+ case Intrinsic::r600_read_ngroups_z:
return LowerImplicitParameter(DAG, VT, DL, 2);
- case r600_read_global_size_x:
+ case Intrinsic::r600_read_global_size_x:
return LowerImplicitParameter(DAG, VT, DL, 3);
- case r600_read_global_size_y:
+ case Intrinsic::r600_read_global_size_y:
return LowerImplicitParameter(DAG, VT, DL, 4);
- case r600_read_global_size_z:
+ case Intrinsic::r600_read_global_size_z:
return LowerImplicitParameter(DAG, VT, DL, 5);
- case r600_read_local_size_x:
+ case Intrinsic::r600_read_local_size_x:
return LowerImplicitParameter(DAG, VT, DL, 6);
- case r600_read_local_size_y:
+ case Intrinsic::r600_read_local_size_y:
return LowerImplicitParameter(DAG, VT, DL, 7);
- case r600_read_local_size_z:
+ case Intrinsic::r600_read_local_size_z:
return LowerImplicitParameter(DAG, VT, DL, 8);
- case r600_read_tgid_x:
+ case Intrinsic::r600_read_tgid_x:
return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
AMDGPU::T1_X, VT);
- case r600_read_tgid_y:
+ case Intrinsic::r600_read_tgid_y:
return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
AMDGPU::T1_Y, VT);
- case r600_read_tgid_z:
+ case Intrinsic::r600_read_tgid_z:
return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
AMDGPU::T1_Z, VT);
- case r600_read_tidig_x:
+ case Intrinsic::r600_read_tidig_x:
return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
AMDGPU::T0_X, VT);
- case r600_read_tidig_y:
+ case Intrinsic::r600_read_tidig_y:
return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
AMDGPU::T0_Y, VT);
- case r600_read_tidig_z:
+ case Intrinsic::r600_read_tidig_z:
return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
AMDGPU::T0_Z, VT);
}
@@ -464,7 +730,7 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N,
SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(
ISD::SETCC,
- Op.getDebugLoc(),
+ SDLoc(Op),
MVT::i1,
Op, DAG.getConstantFP(0.0f, MVT::f32),
DAG.getCondCode(ISD::SETNE)
@@ -472,7 +738,7 @@ SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
}
SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
- DebugLoc DL,
+ SDLoc DL,
unsigned DwordOffset) const {
unsigned ByteOffset = DwordOffset * 4;
PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
@@ -501,18 +767,6 @@ SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const
return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
}
-SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
- DebugLoc DL = Op.getDebugLoc();
- EVT VT = Op.getValueType();
-
- return DAG.getNode(AMDGPUISD::BITALIGN, DL, VT,
- Op.getOperand(0),
- Op.getOperand(0),
- DAG.getNode(ISD::SUB, DL, VT,
- DAG.getConstant(32, MVT::i32),
- Op.getOperand(1)));
-}
-
bool R600TargetLowering::isZero(SDValue Op) const {
if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
return Cst->isNullValue();
@@ -524,7 +778,7 @@ bool R600TargetLowering::isZero(SDValue Op) const {
}
SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue LHS = Op.getOperand(0);
@@ -645,7 +899,7 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::SELECT_CC,
- Op.getDebugLoc(),
+ SDLoc(Op),
Op.getValueType(),
Op.getOperand(0),
DAG.getConstant(0, MVT::i32),
@@ -676,7 +930,7 @@ SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
default: llvm_unreachable("Invalid stack width");
}
- return DAG.getNode(ISD::SRL, Ptr.getDebugLoc(), Ptr.getValueType(), Ptr,
+ return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
DAG.getConstant(SRLPad, MVT::i32));
}
@@ -710,7 +964,7 @@ void R600TargetLowering::getStackAddress(unsigned StackWidth,
}
SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
SDValue Chain = Op.getOperand(0);
SDValue Value = Op.getOperand(1);
@@ -772,7 +1026,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
}
Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
- DAG.getTargetConstant(0, MVT::i32)); // Channel
+ DAG.getTargetConstant(0, MVT::i32)); // Channel
}
return Chain;
@@ -822,7 +1076,7 @@ ConstantAddressBlock(unsigned AddressSpace) {
SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
{
EVT VT = Op.getValueType();
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
SDValue Chain = Op.getOperand(0);
SDValue Ptr = Op.getOperand(1);
@@ -851,7 +1105,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
DAG.getConstant(LoadNode->getAddressSpace() -
- AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
+ AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
);
}
@@ -924,7 +1178,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc DL, SelectionDAG &DAG,
+ SDLoc DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
unsigned ParamOffsetBytes = 36;
Function::const_arg_iterator FuncArg =
@@ -955,11 +1209,105 @@ SDValue R600TargetLowering::LowerFormalArguments(
return Chain;
}
-EVT R600TargetLowering::getSetCCResultType(EVT VT) const {
+EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
if (!VT.isVector()) return MVT::i32;
return VT.changeVectorElementTypeToInteger();
}
+static SDValue
+CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry,
+ DenseMap<unsigned, unsigned> &RemapSwizzle) {
+ assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
+ assert(RemapSwizzle.empty());
+ SDValue NewBldVec[4] = {
+ VectorEntry.getOperand(0),
+ VectorEntry.getOperand(1),
+ VectorEntry.getOperand(2),
+ VectorEntry.getOperand(3)
+ };
+
+ for (unsigned i = 0; i < 4; i++) {
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
+ if (C->isZero()) {
+ RemapSwizzle[i] = 4; // SEL_0
+ NewBldVec[i] = DAG.getUNDEF(MVT::f32);
+ } else if (C->isExactlyValue(1.0)) {
+ RemapSwizzle[i] = 5; // SEL_1
+ NewBldVec[i] = DAG.getUNDEF(MVT::f32);
+ }
+ }
+
+ if (NewBldVec[i].getOpcode() == ISD::UNDEF)
+ continue;
+ for (unsigned j = 0; j < i; j++) {
+ if (NewBldVec[i] == NewBldVec[j]) {
+ NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
+ RemapSwizzle[i] = j;
+ break;
+ }
+ }
+ }
+
+ return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
+ VectorEntry.getValueType(), NewBldVec, 4);
+}
+
+static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
+ DenseMap<unsigned, unsigned> &RemapSwizzle) {
+ assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
+ assert(RemapSwizzle.empty());
+ SDValue NewBldVec[4] = {
+ VectorEntry.getOperand(0),
+ VectorEntry.getOperand(1),
+ VectorEntry.getOperand(2),
+ VectorEntry.getOperand(3)
+ };
+ bool isUnmovable[4] = { false, false, false, false };
+
+ for (unsigned i = 0; i < 4; i++) {
+ if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
+ ->getZExtValue();
+ if (!isUnmovable[Idx]) {
+ // Swap i and Idx
+ std::swap(NewBldVec[Idx], NewBldVec[i]);
+ RemapSwizzle[Idx] = i;
+ RemapSwizzle[i] = Idx;
+ }
+ isUnmovable[Idx] = true;
+ }
+ }
+
+ return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
+ VectorEntry.getValueType(), NewBldVec, 4);
+}
+
+
+SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
+SDValue Swz[4], SelectionDAG &DAG) const {
+ assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
+ // Old -> New swizzle values
+ DenseMap<unsigned, unsigned> SwizzleRemap;
+
+ BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
+ for (unsigned i = 0; i < 4; i++) {
+ unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
+ if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
+ Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
+ }
+
+ SwizzleRemap.clear();
+ BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
+ for (unsigned i = 0; i < 4; i++) {
+ unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
+ if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
+ Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
+ }
+
+ return BuildVector;
+}
+
+
//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//
@@ -973,7 +1321,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::FP_ROUND: {
SDValue Arg = N->getOperand(0);
if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
- return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), N->getValueType(0),
+ return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
Arg.getOperand(0));
}
break;
@@ -998,7 +1346,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
return SDValue();
}
- return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), N->getValueType(0),
+ return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
SelectCC.getOperand(0), // LHS
SelectCC.getOperand(1), // RHS
DAG.getConstant(-1, MVT::i32), // True
@@ -1021,7 +1369,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
unsigned Element = Const->getZExtValue();
- return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), N->getVTList(),
+ return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
Arg->getOperand(0).getOperand(Element));
}
}
@@ -1056,7 +1404,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
LHSCC = ISD::getSetCCInverse(LHSCC,
LHS.getOperand(0).getValueType().isInteger());
- return DAG.getSelectCC(N->getDebugLoc(),
+ return DAG.getSelectCC(SDLoc(N),
LHS.getOperand(0),
LHS.getOperand(1),
LHS.getOperand(2),
@@ -1069,12 +1417,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
SDValue Arg = N->getOperand(1);
if (Arg.getOpcode() != ISD::BUILD_VECTOR)
break;
- SDValue NewBldVec[4] = {
- DAG.getUNDEF(MVT::f32),
- DAG.getUNDEF(MVT::f32),
- DAG.getUNDEF(MVT::f32),
- DAG.getUNDEF(MVT::f32)
- };
+
SDValue NewArgs[8] = {
N->getOperand(0), // Chain
SDValue(),
@@ -1085,23 +1428,40 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
N->getOperand(6), // SWZ_Z
N->getOperand(7) // SWZ_W
};
- for (unsigned i = 0; i < Arg.getNumOperands(); i++) {
- if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Arg.getOperand(i))) {
- if (C->isZero()) {
- NewArgs[4 + i] = DAG.getConstant(4, MVT::i32); // SEL_0
- } else if (C->isExactlyValue(1.0)) {
- NewArgs[4 + i] = DAG.getConstant(5, MVT::i32); // SEL_0
- } else {
- NewBldVec[i] = Arg.getOperand(i);
- }
- } else {
- NewBldVec[i] = Arg.getOperand(i);
- }
- }
- DebugLoc DL = N->getDebugLoc();
- NewArgs[1] = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4f32, NewBldVec, 4);
+ SDLoc DL(N);
+ NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
}
+ case AMDGPUISD::TEXTURE_FETCH: {
+ SDValue Arg = N->getOperand(1);
+ if (Arg.getOpcode() != ISD::BUILD_VECTOR)
+ break;
+
+ SDValue NewArgs[19] = {
+ N->getOperand(0),
+ N->getOperand(1),
+ N->getOperand(2),
+ N->getOperand(3),
+ N->getOperand(4),
+ N->getOperand(5),
+ N->getOperand(6),
+ N->getOperand(7),
+ N->getOperand(8),
+ N->getOperand(9),
+ N->getOperand(10),
+ N->getOperand(11),
+ N->getOperand(12),
+ N->getOperand(13),
+ N->getOperand(14),
+ N->getOperand(15),
+ N->getOperand(16),
+ N->getOperand(17),
+ N->getOperand(18),
+ };
+ NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
+ return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
+ NewArgs, 19);
+ }
}
return SDValue();
}
diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h
index 2c09acb..d4ba4c8 100644
--- a/lib/Target/R600/R600ISelLowering.h
+++ b/lib/Target/R600/R600ISelLowering.h
@@ -36,21 +36,20 @@ public:
CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc DL, SelectionDAG &DAG,
+ SDLoc DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
- virtual EVT getSetCCResultType(EVT VT) const;
+ virtual EVT getSetCCResultType(LLVMContext &, EVT VT) const;
private:
- const R600InstrInfo * TII;
-
/// Each OpenCL kernel has nine implicit parameters that are stored in the
/// first nine dwords of a Vertex Buffer. These implicit parameters are
  /// lowered to load instructions which retrieve the values from the Vertex
/// Buffer.
SDValue LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
- DebugLoc DL, unsigned DwordOffset) const;
+ SDLoc DL, unsigned DwordOffset) const;
void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB,
MachineRegisterInfo & MRI, unsigned dword_offset) const;
+ SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG) const;
/// \brief Lower ROTL opcode to BITALIGN
SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
index 8fd8385..4f5cfcd 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@@ -19,8 +19,8 @@
#include "R600Defines.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#define GET_INSTRINFO_CTOR
@@ -30,7 +30,7 @@ using namespace llvm;
R600InstrInfo::R600InstrInfo(AMDGPUTargetMachine &tm)
: AMDGPUInstrInfo(tm),
- RI(tm, *this),
+ RI(tm),
ST(tm.getSubtarget<AMDGPUSubtarget>())
{ }
@@ -116,9 +116,6 @@ bool R600InstrInfo::isPlaceHolderOpcode(unsigned Opcode) const {
bool R600InstrInfo::isReductionOp(unsigned Opcode) const {
switch(Opcode) {
default: return false;
- case AMDGPU::DOT4_r600_pseudo:
- case AMDGPU::DOT4_eg_pseudo:
- return true;
}
}
@@ -150,7 +147,7 @@ bool R600InstrInfo::isTransOnly(const MachineInstr *MI) const {
}
bool R600InstrInfo::usesVertexCache(unsigned Opcode) const {
- return ST.hasVertexCache() && get(Opcode).TSFlags & R600_InstFlag::VTX_INST;
+ return ST.hasVertexCache() && IS_VTX(get(Opcode));
}
bool R600InstrInfo::usesVertexCache(const MachineInstr *MI) const {
@@ -159,8 +156,7 @@ bool R600InstrInfo::usesVertexCache(const MachineInstr *MI) const {
}
bool R600InstrInfo::usesTextureCache(unsigned Opcode) const {
- return (!ST.hasVertexCache() && get(Opcode).TSFlags & R600_InstFlag::VTX_INST) ||
- (get(Opcode).TSFlags & R600_InstFlag::TEX_INST);
+ return (!ST.hasVertexCache() && IS_VTX(get(Opcode))) || IS_TEX(get(Opcode));
}
bool R600InstrInfo::usesTextureCache(const MachineInstr *MI) const {
@@ -169,6 +165,181 @@ bool R600InstrInfo::usesTextureCache(const MachineInstr *MI) const {
usesTextureCache(MI->getOpcode());
}
+SmallVector<std::pair<MachineOperand *, int64_t>, 3>
+R600InstrInfo::getSrcs(MachineInstr *MI) const {
+ SmallVector<std::pair<MachineOperand *, int64_t>, 3> Result;
+
+ if (MI->getOpcode() == AMDGPU::DOT_4) {
+ static const R600Operands::VecOps OpTable[8][2] = {
+ {R600Operands::SRC0_X, R600Operands::SRC0_SEL_X},
+ {R600Operands::SRC0_Y, R600Operands::SRC0_SEL_Y},
+ {R600Operands::SRC0_Z, R600Operands::SRC0_SEL_Z},
+ {R600Operands::SRC0_W, R600Operands::SRC0_SEL_W},
+ {R600Operands::SRC1_X, R600Operands::SRC1_SEL_X},
+ {R600Operands::SRC1_Y, R600Operands::SRC1_SEL_Y},
+ {R600Operands::SRC1_Z, R600Operands::SRC1_SEL_Z},
+ {R600Operands::SRC1_W, R600Operands::SRC1_SEL_W},
+ };
+
+ for (unsigned j = 0; j < 8; j++) {
+ MachineOperand &MO = MI->getOperand(OpTable[j][0] + 1);
+ unsigned Reg = MO.getReg();
+ if (Reg == AMDGPU::ALU_CONST) {
+ unsigned Sel = MI->getOperand(OpTable[j][1] + 1).getImm();
+ Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, Sel));
+ continue;
+ }
+
+ }
+ return Result;
+ }
+
+ static const R600Operands::Ops OpTable[3][2] = {
+ {R600Operands::SRC0, R600Operands::SRC0_SEL},
+ {R600Operands::SRC1, R600Operands::SRC1_SEL},
+ {R600Operands::SRC2, R600Operands::SRC2_SEL},
+ };
+
+ for (unsigned j = 0; j < 3; j++) {
+ int SrcIdx = getOperandIdx(MI->getOpcode(), OpTable[j][0]);
+ if (SrcIdx < 0)
+ break;
+ MachineOperand &MO = MI->getOperand(SrcIdx);
+ unsigned Reg = MI->getOperand(SrcIdx).getReg();
+ if (Reg == AMDGPU::ALU_CONST) {
+ unsigned Sel = MI->getOperand(
+ getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm();
+ Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, Sel));
+ continue;
+ }
+ if (Reg == AMDGPU::ALU_LITERAL_X) {
+ unsigned Imm = MI->getOperand(
+ getOperandIdx(MI->getOpcode(), R600Operands::IMM)).getImm();
+ Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, Imm));
+ continue;
+ }
+ Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, 0));
+ }
+ return Result;
+}
+
+std::vector<std::pair<int, unsigned> >
+R600InstrInfo::ExtractSrcs(MachineInstr *MI,
+ const DenseMap<unsigned, unsigned> &PV)
+ const {
+ const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs = getSrcs(MI);
+ const std::pair<int, unsigned> DummyPair(-1, 0);
+ std::vector<std::pair<int, unsigned> > Result;
+ unsigned i = 0;
+ for (unsigned n = Srcs.size(); i < n; ++i) {
+ unsigned Reg = Srcs[i].first->getReg();
+ unsigned Index = RI.getEncodingValue(Reg) & 0xff;
+ unsigned Chan = RI.getHWRegChan(Reg);
+ if (Index > 127) {
+ Result.push_back(DummyPair);
+ continue;
+ }
+ if (PV.find(Index) != PV.end()) {
+ Result.push_back(DummyPair);
+ continue;
+ }
+ Result.push_back(std::pair<int, unsigned>(Index, Chan));
+ }
+ for (; i < 3; ++i)
+ Result.push_back(DummyPair);
+ return Result;
+}
+
+static std::vector<std::pair<int, unsigned> >
+Swizzle(std::vector<std::pair<int, unsigned> > Src,
+ R600InstrInfo::BankSwizzle Swz) {
+ switch (Swz) {
+ case R600InstrInfo::ALU_VEC_012:
+ break;
+ case R600InstrInfo::ALU_VEC_021:
+ std::swap(Src[1], Src[2]);
+ break;
+ case R600InstrInfo::ALU_VEC_102:
+ std::swap(Src[0], Src[1]);
+ break;
+ case R600InstrInfo::ALU_VEC_120:
+ std::swap(Src[0], Src[1]);
+ std::swap(Src[0], Src[2]);
+ break;
+ case R600InstrInfo::ALU_VEC_201:
+ std::swap(Src[0], Src[2]);
+ std::swap(Src[0], Src[1]);
+ break;
+ case R600InstrInfo::ALU_VEC_210:
+ std::swap(Src[0], Src[2]);
+ break;
+ }
+ return Src;
+}
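// Editorial example, not part of this patch: given the source vector {A, B, C},
// ALU_VEC_021 returns {A, C, B} and ALU_VEC_210 returns {C, B, A}; ALU_VEC_120
// and ALU_VEC_201 are the two three-element rotations built from the swaps above.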
+
+static bool
+isLegal(const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+ const std::vector<R600InstrInfo::BankSwizzle> &Swz,
+ unsigned CheckedSize) {
+ int Vector[4][3];
+ memset(Vector, -1, sizeof(Vector));
+ for (unsigned i = 0; i < CheckedSize; i++) {
+ const std::vector<std::pair<int, unsigned> > &Srcs =
+ Swizzle(IGSrcs[i], Swz[i]);
+ for (unsigned j = 0; j < 3; j++) {
+ const std::pair<int, unsigned> &Src = Srcs[j];
+ if (Src.first < 0)
+ continue;
+ if (Vector[Src.second][j] < 0)
+ Vector[Src.second][j] = Src.first;
+ if (Vector[Src.second][j] != Src.first)
+ return false;
+ }
+ }
+ return true;
+}
+
+static bool recursiveFitsFPLimitation(
+const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
+unsigned Depth = 0) {
+ if (!isLegal(IGSrcs, SwzCandidate, Depth))
+ return false;
+ if (IGSrcs.size() == Depth)
+ return true;
+ unsigned i = SwzCandidate[Depth];
+ for (; i < 6; i++) {
+ SwzCandidate[Depth] = (R600InstrInfo::BankSwizzle) i;
+ if (recursiveFitsFPLimitation(IGSrcs, SwzCandidate, Depth + 1))
+ return true;
+ }
+ SwzCandidate[Depth] = R600InstrInfo::ALU_VEC_012;
+ return false;
+}
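// Editorial note, not part of this patch: this is a depth-first backtracking
// search. Each instruction of the group tries the bank swizzles from its current
// candidate up to ALU_VEC_210, and isLegal() prunes a prefix as soon as two
// different register indices would have to be read through the same
// (channel, source slot) pair, which models the register file read port limit.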
+
+bool
+R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG,
+ const DenseMap<unsigned, unsigned> &PV,
+ std::vector<BankSwizzle> &ValidSwizzle)
+ const {
+  // TODO: support shared src0 - src1 operand
+
+ std::vector<std::vector<std::pair<int, unsigned> > > IGSrcs;
+ ValidSwizzle.clear();
+ for (unsigned i = 0, e = IG.size(); i < e; ++i) {
+ IGSrcs.push_back(ExtractSrcs(IG[i], PV));
+ unsigned Op = getOperandIdx(IG[i]->getOpcode(),
+ R600Operands::BANK_SWIZZLE);
+ ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle)
+ IG[i]->getOperand(Op).getImm());
+ }
+ bool Result = recursiveFitsFPLimitation(IGSrcs, ValidSwizzle);
+ if (!Result)
+ return false;
+ return true;
+}
+
+
bool
R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> &Consts)
const {
@@ -198,34 +369,22 @@ bool
R600InstrInfo::canBundle(const std::vector<MachineInstr *> &MIs) const {
std::vector<unsigned> Consts;
for (unsigned i = 0, n = MIs.size(); i < n; i++) {
- const MachineInstr *MI = MIs[i];
-
- const R600Operands::Ops OpTable[3][2] = {
- {R600Operands::SRC0, R600Operands::SRC0_SEL},
- {R600Operands::SRC1, R600Operands::SRC1_SEL},
- {R600Operands::SRC2, R600Operands::SRC2_SEL},
- };
-
+ MachineInstr *MI = MIs[i];
if (!isALUInstr(MI->getOpcode()))
continue;
- for (unsigned j = 0; j < 3; j++) {
- int SrcIdx = getOperandIdx(MI->getOpcode(), OpTable[j][0]);
- if (SrcIdx < 0)
- break;
- unsigned Reg = MI->getOperand(SrcIdx).getReg();
- if (Reg == AMDGPU::ALU_CONST) {
- unsigned Const = MI->getOperand(
- getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm();
- Consts.push_back(Const);
- continue;
- }
- if (AMDGPU::R600_KC0RegClass.contains(Reg) ||
- AMDGPU::R600_KC1RegClass.contains(Reg)) {
- unsigned Index = RI.getEncodingValue(Reg) & 0xff;
- unsigned Chan = RI.getHWRegChan(Reg);
+ const SmallVector<std::pair<MachineOperand *, int64_t>, 3> &Srcs =
+ getSrcs(MI);
+
+ for (unsigned j = 0, e = Srcs.size(); j < e; j++) {
+ std::pair<MachineOperand *, unsigned> Src = Srcs[j];
+ if (Src.first->getReg() == AMDGPU::ALU_CONST)
+ Consts.push_back(Src.second);
+ if (AMDGPU::R600_KC0RegClass.contains(Src.first->getReg()) ||
+ AMDGPU::R600_KC1RegClass.contains(Src.first->getReg())) {
+ unsigned Index = RI.getEncodingValue(Src.first->getReg()) & 0xff;
+ unsigned Chan = RI.getHWRegChan(Src.first->getReg());
Consts.push_back((Index << 2) | Chan);
- continue;
}
}
}
@@ -657,7 +816,8 @@ MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV,
AddrReg, ValueReg)
- .addReg(AMDGPU::AR_X, RegState::Implicit);
+ .addReg(AMDGPU::AR_X,
+ RegState::Implicit | RegState::Kill);
setImmOperand(Mov, R600Operands::DST_REL, 1);
return Mov;
}
@@ -674,7 +834,8 @@ MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB,
MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV,
ValueReg,
AddrReg)
- .addReg(AMDGPU::AR_X, RegState::Implicit);
+ .addReg(AMDGPU::AR_X,
+ RegState::Implicit | RegState::Kill);
setImmOperand(Mov, R600Operands::SRC0_REL, 1);
return Mov;
@@ -729,6 +890,95 @@ MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MB
return MIB;
}
+#define OPERAND_CASE(Label) \
+ case Label: { \
+ static const R600Operands::VecOps Ops[] = \
+ { \
+ Label##_X, \
+ Label##_Y, \
+ Label##_Z, \
+ Label##_W \
+ }; \
+ return Ops[Slot]; \
+ }
+
+static R600Operands::VecOps
+getSlotedOps(R600Operands::Ops Op, unsigned Slot) {
+ switch (Op) {
+ OPERAND_CASE(R600Operands::UPDATE_EXEC_MASK)
+ OPERAND_CASE(R600Operands::UPDATE_PREDICATE)
+ OPERAND_CASE(R600Operands::WRITE)
+ OPERAND_CASE(R600Operands::OMOD)
+ OPERAND_CASE(R600Operands::DST_REL)
+ OPERAND_CASE(R600Operands::CLAMP)
+ OPERAND_CASE(R600Operands::SRC0)
+ OPERAND_CASE(R600Operands::SRC0_NEG)
+ OPERAND_CASE(R600Operands::SRC0_REL)
+ OPERAND_CASE(R600Operands::SRC0_ABS)
+ OPERAND_CASE(R600Operands::SRC0_SEL)
+ OPERAND_CASE(R600Operands::SRC1)
+ OPERAND_CASE(R600Operands::SRC1_NEG)
+ OPERAND_CASE(R600Operands::SRC1_REL)
+ OPERAND_CASE(R600Operands::SRC1_ABS)
+ OPERAND_CASE(R600Operands::SRC1_SEL)
+ OPERAND_CASE(R600Operands::PRED_SEL)
+ default:
+ llvm_unreachable("Wrong Operand");
+ }
+}
+
+#undef OPERAND_CASE
+
+static int
+getVecOperandIdx(R600Operands::VecOps Op) {
+ return 1 + Op;
+}
+
+
+MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction(
+ MachineBasicBlock &MBB, MachineInstr *MI, unsigned Slot, unsigned DstReg)
+ const {
+ assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented");
+ unsigned Opcode;
+ const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+ if (ST.getGeneration() <= AMDGPUSubtarget::R700)
+ Opcode = AMDGPU::DOT4_r600;
+ else
+ Opcode = AMDGPU::DOT4_eg;
+ MachineBasicBlock::iterator I = MI;
+ MachineOperand &Src0 = MI->getOperand(
+ getVecOperandIdx(getSlotedOps(R600Operands::SRC0, Slot)));
+ MachineOperand &Src1 = MI->getOperand(
+ getVecOperandIdx(getSlotedOps(R600Operands::SRC1, Slot)));
+ MachineInstr *MIB = buildDefaultInstruction(
+ MBB, I, Opcode, DstReg, Src0.getReg(), Src1.getReg());
+ static const R600Operands::Ops Operands[14] = {
+ R600Operands::UPDATE_EXEC_MASK,
+ R600Operands::UPDATE_PREDICATE,
+ R600Operands::WRITE,
+ R600Operands::OMOD,
+ R600Operands::DST_REL,
+ R600Operands::CLAMP,
+ R600Operands::SRC0_NEG,
+ R600Operands::SRC0_REL,
+ R600Operands::SRC0_ABS,
+ R600Operands::SRC0_SEL,
+ R600Operands::SRC1_NEG,
+ R600Operands::SRC1_REL,
+ R600Operands::SRC1_ABS,
+ R600Operands::SRC1_SEL,
+ };
+
+ for (unsigned i = 0; i < 14; i++) {
+ MachineOperand &MO = MI->getOperand(
+ getVecOperandIdx(getSlotedOps(Operands[i], Slot)));
+ assert (MO.isImm());
+ setImmOperand(MIB, Operands[i], MO.getImm());
+ }
+ MIB->getOperand(20).setImm(0);
+ return MIB;
+}
+
MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB,
MachineBasicBlock::iterator I,
unsigned DstReg,
@@ -744,6 +994,11 @@ int R600InstrInfo::getOperandIdx(const MachineInstr &MI,
return getOperandIdx(MI.getOpcode(), Op);
}
+int R600InstrInfo::getOperandIdx(const MachineInstr &MI,
+ R600Operands::VecOps Op) const {
+ return getOperandIdx(MI.getOpcode(), Op);
+}
+
int R600InstrInfo::getOperandIdx(unsigned Opcode,
R600Operands::Ops Op) const {
unsigned TargetFlags = get(Opcode).TSFlags;
@@ -774,6 +1029,11 @@ int R600InstrInfo::getOperandIdx(unsigned Opcode,
return R600Operands::ALUOpTable[OpTableIdx][Op];
}
+int R600InstrInfo::getOperandIdx(unsigned Opcode,
+ R600Operands::VecOps Op) const {
+ return Op + 1;
+}
+
void R600InstrInfo::setImmOperand(MachineInstr *MI, R600Operands::Ops Op,
int64_t Imm) const {
int Idx = getOperandIdx(*MI, Op);
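
The getSlotedOps/getVecOperandIdx helpers added above rely on the DOT_4 pseudo listing its per-slot operands in the same order as the R600Operands::VecOps enum, with the destination register at machine-operand index 0, which is why the index is simply `1 + Op`. Below is a minimal standalone sketch of that mapping; the reduced enums are illustrative stand-ins, not the real R600Operands layout.

#include <cassert>

// Reduced stand-in enums; the real R600Operands::{Ops,VecOps} have many more
// entries, but the indexing idea is the same.
namespace sketch {
enum Ops { SRC0, SRC1 };
enum VecOps {
  SRC0_X, SRC0_Y, SRC0_Z, SRC0_W,
  SRC1_X, SRC1_Y, SRC1_Z, SRC1_W
};

// Pick the per-slot variant of a scalar operand kind (slot 0..3 = X..W).
inline VecOps getSlotedOps(Ops Op, unsigned Slot) {
  assert(Slot < 4 && "slot must be X, Y, Z or W");
  return static_cast<VecOps>((Op == SRC0 ? SRC0_X : SRC1_X) + Slot);
}

// Machine-operand index: operand 0 is $dst, then the VecOps in enum order.
inline int getVecOperandIdx(VecOps Op) { return 1 + Op; }
} // namespace sketch

int main() {
  // SRC1 of slot Z (slot 2) maps to enum value 6, i.e. operand index 7 here.
  assert(sketch::getVecOperandIdx(sketch::getSlotedOps(sketch::SRC1, 2)) == 7);
  return 0;
}
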
diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h
index babe4b8..6a11c63 100644
--- a/lib/Target/R600/R600InstrInfo.h
+++ b/lib/Target/R600/R600InstrInfo.h
@@ -16,7 +16,6 @@
#define R600INSTRUCTIONINFO_H_
#include "AMDGPUInstrInfo.h"
-#include "AMDIL.h"
#include "R600Defines.h"
#include "R600RegisterInfo.h"
#include <map>
@@ -36,8 +35,19 @@ namespace llvm {
const AMDGPUSubtarget &ST;
int getBranchInstr(const MachineOperand &op) const;
+ std::vector<std::pair<int, unsigned> >
+ ExtractSrcs(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PV) const;
public:
+ enum BankSwizzle {
+ ALU_VEC_012 = 0,
+ ALU_VEC_021,
+ ALU_VEC_120,
+ ALU_VEC_102,
+ ALU_VEC_201,
+ ALU_VEC_210
+ };
+
explicit R600InstrInfo(AMDGPUTargetMachine &tm);
const R600RegisterInfo &getRegisterInfo() const;
@@ -62,6 +72,23 @@ namespace llvm {
bool usesTextureCache(unsigned Opcode) const;
bool usesTextureCache(const MachineInstr *MI) const;
+ /// \returns a pair for each source operand of an ALU instruction.
+ /// The first member of a pair is the register operand.
+ /// If the register is ALU_CONST, the second member is its SEL value.
+ /// If the register is ALU_LITERAL, the second member is its immediate.
+ /// Otherwise, the second member is undefined.
+ SmallVector<std::pair<MachineOperand *, int64_t>, 3>
+ getSrcs(MachineInstr *MI) const;
+
+ /// Given the order VEC_012 < VEC_021 < VEC_120 < VEC_102 < VEC_201 < VEC_210,
+ /// returns true and stores in BS the first (in that order) BankSwizzle
+ /// assignment, starting from the one already provided in the Instruction
+ /// Group MIs, that fits the read port limitations. Otherwise returns false
+ /// and leaves the content of BS undefined.
+ /// PV holds the GPR to PV register mapping for the Instruction Group MIs.
+ bool fitsReadPortLimitations(const std::vector<MachineInstr *> &MIs,
+ const DenseMap<unsigned, unsigned> &PV,
+ std::vector<BankSwizzle> &BS) const;
bool fitsConstReadLimitations(const std::vector<unsigned>&) const;
bool canBundle(const std::vector<MachineInstr *> &) const;
@@ -170,6 +197,11 @@ namespace llvm {
unsigned Src0Reg,
unsigned Src1Reg = 0) const;
+ MachineInstr *buildSlotOfVectorInstruction(MachineBasicBlock &MBB,
+ MachineInstr *MI,
+ unsigned Slot,
+ unsigned DstReg) const;
+
MachineInstr *buildMovImm(MachineBasicBlock &BB,
MachineBasicBlock::iterator I,
unsigned DstReg,
@@ -179,11 +211,13 @@ namespace llvm {
///
/// \returns -1 if the Instruction does not contain the specified \p Op.
int getOperandIdx(const MachineInstr &MI, R600Operands::Ops Op) const;
+ int getOperandIdx(const MachineInstr &MI, R600Operands::VecOps Op) const;
/// \brief Get the index of \p Op for the given Opcode.
///
/// \returns -1 if the Instruction does not contain the specified \p Op.
int getOperandIdx(unsigned Opcode, R600Operands::Ops Op) const;
+ int getOperandIdx(unsigned Opcode, R600Operands::VecOps Op) const;
/// \brief Helper function for setting instruction flag values.
void setImmOperand(MachineInstr *MI, R600Operands::Ops Op, int64_t Imm) const;
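
The BankSwizzle enum declared above names the six possible orderings of an ALU slot's up-to-three source operands; fitsReadPortLimitations searches those orderings for a combination that respects the register-file read limits. The sketch below spells out the permutations the enum names encode; the read-port interpretation is an assumption based on the enum names, not something taken from the patch.

#include <cstdio>

int main() {
  // The digits of ALU_VEC_abc give the order in which src0/src1/src2 are read.
  static const unsigned Order[6][3] = {
    {0, 1, 2}, // ALU_VEC_012
    {0, 2, 1}, // ALU_VEC_021
    {1, 2, 0}, // ALU_VEC_120
    {1, 0, 2}, // ALU_VEC_102
    {2, 0, 1}, // ALU_VEC_201
    {2, 1, 0}  // ALU_VEC_210
  };
  // fitsReadPortLimitations tries swizzles in this declaration order and keeps
  // the first full assignment (one swizzle per instruction in the group) that
  // does not oversubscribe any read port.
  for (unsigned i = 0; i < 6; ++i)
    std::printf("swizzle %u reads src%u, src%u, src%u\n",
                i, Order[i][0], Order[i][1], Order[i][2]);
  return 0;
}
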
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 1060b0a..b4131be 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -78,7 +78,7 @@ def SEL : OperandWithDefaultOps <i32, (ops (i32 -1))> {
let PrintMethod = "printSel";
}
def BANK_SWIZZLE : OperandWithDefaultOps <i32, (ops (i32 0))> {
- let PrintMethod = "printSel";
+ let PrintMethod = "printBankSwizzle";
}
def LITERAL : InstFlag<"printLiteral">;
@@ -96,6 +96,12 @@ def UP : InstFlag <"printUpdatePred">;
// Once we start using the packetizer in this backend we should have this
// default to 0.
def LAST : InstFlag<"printLast", 1>;
+def RSel : Operand<i32> {
+ let PrintMethod = "printRSel";
+}
+def CT: Operand<i32> {
+ let PrintMethod = "printCT";
+}
def FRAMEri : Operand<iPTR> {
let MIOperandInfo = (ops R600_Reg32:$ptr, i32imm:$index);
@@ -358,9 +364,9 @@ class R600_1OP <bits<11> inst, string opName, list<dag> pattern,
LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal,
BANK_SWIZZLE:$bank_swizzle),
!strconcat(" ", opName,
- "$clamp $dst$write$dst_rel$omod, "
+ "$clamp $last $dst$write$dst_rel$omod, "
"$src0_neg$src0_abs$src0$src0_abs$src0_rel, "
- "$literal $pred_sel$last"),
+ "$pred_sel $bank_swizzle"),
pattern,
itin>,
R600ALU_Word0,
@@ -399,10 +405,10 @@ class R600_2OP <bits<11> inst, string opName, list<dag> pattern,
LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal,
BANK_SWIZZLE:$bank_swizzle),
!strconcat(" ", opName,
- "$clamp $update_exec_mask$update_pred$dst$write$dst_rel$omod, "
+ "$clamp $last $update_exec_mask$update_pred$dst$write$dst_rel$omod, "
"$src0_neg$src0_abs$src0$src0_abs$src0_rel, "
"$src1_neg$src1_abs$src1$src1_abs$src1_rel, "
- "$literal $pred_sel$last"),
+ "$pred_sel $bank_swizzle"),
pattern,
itin>,
R600ALU_Word0,
@@ -436,11 +442,12 @@ class R600_3OP <bits<5> inst, string opName, list<dag> pattern,
R600_Reg32:$src2, NEG:$src2_neg, REL:$src2_rel, SEL:$src2_sel,
LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal,
BANK_SWIZZLE:$bank_swizzle),
- !strconcat(" ", opName, "$clamp $dst$dst_rel, "
+ !strconcat(" ", opName, "$clamp $last $dst$dst_rel, "
"$src0_neg$src0$src0_rel, "
"$src1_neg$src1$src1_rel, "
"$src2_neg$src2$src2_rel, "
- "$literal $pred_sel$last"),
+ "$pred_sel"
+ "$bank_swizzle"),
pattern,
itin>,
R600ALU_Word0,
@@ -462,38 +469,7 @@ class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern,
pattern,
itin>;
-class R600_TEX <bits<11> inst, string opName, list<dag> pattern,
- InstrItinClass itin = AnyALU> :
- InstR600 <(outs R600_Reg128:$DST_GPR),
- (ins R600_Reg128:$SRC_GPR, i32imm:$RESOURCE_ID, i32imm:$SAMPLER_ID, i32imm:$textureTarget),
- !strconcat(opName, "$DST_GPR, $SRC_GPR, $RESOURCE_ID, $SAMPLER_ID, $textureTarget"),
- pattern,
- itin>, TEX_WORD0, TEX_WORD1, TEX_WORD2 {
- let Inst{31-0} = Word0;
- let Inst{63-32} = Word1;
-
- let TEX_INST = inst{4-0};
- let SRC_REL = 0;
- let DST_REL = 0;
- let DST_SEL_X = 0;
- let DST_SEL_Y = 1;
- let DST_SEL_Z = 2;
- let DST_SEL_W = 3;
- let LOD_BIAS = 0;
-
- let INST_MOD = 0;
- let FETCH_WHOLE_QUAD = 0;
- let ALT_CONST = 0;
- let SAMPLER_INDEX_MODE = 0;
- let RESOURCE_INDEX_MODE = 0;
-
- let COORD_TYPE_X = 0;
- let COORD_TYPE_Y = 0;
- let COORD_TYPE_Z = 0;
- let COORD_TYPE_W = 0;
-
- let TEXInst = 1;
- }
+
} // End mayLoad = 1, mayStore = 0, hasSideEffects = 0
@@ -575,26 +551,21 @@ def load_param : LoadParamFrag<load>;
def load_param_zexti8 : LoadParamFrag<zextloadi8>;
def load_param_zexti16 : LoadParamFrag<zextloadi16>;
-def isR600 : Predicate<"Subtarget.device()"
- "->getGeneration() == AMDGPUDeviceInfo::HD4XXX">;
-def isR700 : Predicate<"Subtarget.device()"
- "->getGeneration() == AMDGPUDeviceInfo::HD4XXX &&"
- "Subtarget.device()->getDeviceFlag()"
- ">= OCL_DEVICE_RV710">;
+def isR600 : Predicate<"Subtarget.getGeneration() <= AMDGPUSubtarget::R700">;
+def isR700 : Predicate<"Subtarget.getGeneration() == AMDGPUSubtarget::R700">;
def isEG : Predicate<
- "Subtarget.device()->getGeneration() >= AMDGPUDeviceInfo::HD5XXX && "
- "Subtarget.device()->getGeneration() < AMDGPUDeviceInfo::HD7XXX && "
- "Subtarget.device()->getDeviceFlag() != OCL_DEVICE_CAYMAN">;
+ "Subtarget.getGeneration() >= AMDGPUSubtarget::EVERGREEN && "
+ "Subtarget.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS && "
+ "!Subtarget.hasCaymanISA()">;
-def isCayman : Predicate<"Subtarget.device()"
- "->getDeviceFlag() == OCL_DEVICE_CAYMAN">;
-def isEGorCayman : Predicate<"Subtarget.device()"
- "->getGeneration() == AMDGPUDeviceInfo::HD5XXX"
- "|| Subtarget.device()->getGeneration() =="
- "AMDGPUDeviceInfo::HD6XXX">;
+def isCayman : Predicate<"Subtarget.hasCaymanISA()">;
+def isEGorCayman : Predicate<"Subtarget.getGeneration() == "
+ "AMDGPUSubtarget::EVERGREEN"
+ "|| Subtarget.getGeneration() =="
+ "AMDGPUSubtarget::NORTHERN_ISLANDS">;
def isR600toCayman : Predicate<
- "Subtarget.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX">;
+ "Subtarget.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS">;
//===----------------------------------------------------------------------===//
// R600 SDNodes
@@ -602,13 +573,13 @@ def isR600toCayman : Predicate<
def INTERP_PAIR_XY : AMDGPUShaderInst <
(outs R600_TReg32_X:$dst0, R600_TReg32_Y:$dst1),
- (ins i32imm:$src0, R600_Reg32:$src1, R600_Reg32:$src2),
+ (ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2),
"INTERP_PAIR_XY $src0 $src1 $src2 : $dst0 dst1",
[]>;
def INTERP_PAIR_ZW : AMDGPUShaderInst <
(outs R600_TReg32_Z:$dst0, R600_TReg32_W:$dst1),
- (ins i32imm:$src0, R600_Reg32:$src1, R600_Reg32:$src2),
+ (ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2),
"INTERP_PAIR_ZW $src0 $src1 $src2 : $dst0 dst1",
[]>;
@@ -617,6 +588,36 @@ def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS",
[SDNPVariadic]
>;
+def DOT4 : SDNode<"AMDGPUISD::DOT4",
+ SDTypeProfile<1, 8, [SDTCisFP<0>, SDTCisVT<1, f32>, SDTCisVT<2, f32>,
+ SDTCisVT<3, f32>, SDTCisVT<4, f32>, SDTCisVT<5, f32>,
+ SDTCisVT<6, f32>, SDTCisVT<7, f32>, SDTCisVT<8, f32>]>,
+ []
+>;
+
+def TEXTURE_FETCH_Type : SDTypeProfile<1, 19, [SDTCisFP<0>]>;
+
+def TEXTURE_FETCH: SDNode<"AMDGPUISD::TEXTURE_FETCH", TEXTURE_FETCH_Type, []>;
+
+multiclass TexPattern<bits<32> TextureOp, Instruction inst, ValueType vt = v4f32> {
+def : Pat<(TEXTURE_FETCH (i32 TextureOp), vt:$SRC_GPR,
+ (i32 imm:$srcx), (i32 imm:$srcy), (i32 imm:$srcz), (i32 imm:$srcw),
+ (i32 imm:$offsetx), (i32 imm:$offsety), (i32 imm:$offsetz),
+ (i32 imm:$DST_SEL_X), (i32 imm:$DST_SEL_Y), (i32 imm:$DST_SEL_Z),
+ (i32 imm:$DST_SEL_W),
+ (i32 imm:$RESOURCE_ID), (i32 imm:$SAMPLER_ID),
+ (i32 imm:$COORD_TYPE_X), (i32 imm:$COORD_TYPE_Y), (i32 imm:$COORD_TYPE_Z),
+ (i32 imm:$COORD_TYPE_W)),
+ (inst R600_Reg128:$SRC_GPR,
+ imm:$srcx, imm:$srcy, imm:$srcz, imm:$srcw,
+ imm:$offsetx, imm:$offsety, imm:$offsetz,
+ imm:$DST_SEL_X, imm:$DST_SEL_Y, imm:$DST_SEL_Z,
+ imm:$DST_SEL_W,
+ imm:$RESOURCE_ID, imm:$SAMPLER_ID,
+ imm:$COORD_TYPE_X, imm:$COORD_TYPE_Y, imm:$COORD_TYPE_Z,
+ imm:$COORD_TYPE_W)>;
+}
+
//===----------------------------------------------------------------------===//
// Interpolation Instructions
//===----------------------------------------------------------------------===//
@@ -814,12 +815,15 @@ class CF_ALU_WORD1 {
let Word1{31} = BARRIER;
}
+def KCACHE : InstFlag<"printKCache">;
+
class ALU_CLAUSE<bits<4> inst, string OpName> : AMDGPUInst <(outs),
-(ins i32imm:$ADDR, i32imm:$KCACHE_BANK0, i32imm:$KCACHE_BANK1, i32imm:$KCACHE_MODE0, i32imm:$KCACHE_MODE1,
-i32imm:$KCACHE_ADDR0, i32imm:$KCACHE_ADDR1, i32imm:$COUNT),
+(ins i32imm:$ADDR, i32imm:$KCACHE_BANK0, i32imm:$KCACHE_BANK1,
+KCACHE:$KCACHE_MODE0, KCACHE:$KCACHE_MODE1,
+i32imm:$KCACHE_ADDR0, i32imm:$KCACHE_ADDR1,
+i32imm:$COUNT),
!strconcat(OpName, " $COUNT, @$ADDR, "
-"KC0[CB$KCACHE_BANK0:$KCACHE_ADDR0-$KCACHE_ADDR0+32]"
-", KC1[CB$KCACHE_BANK1:$KCACHE_ADDR1-$KCACHE_ADDR1+32]"),
+"KC0[$KCACHE_MODE0], KC1[$KCACHE_MODE1]"),
[] >, CF_ALU_WORD0, CF_ALU_WORD1 {
field bits<64> Inst;
@@ -1128,92 +1132,70 @@ def CNDGT_INT : R600_3OP <
// Texture instructions
//===----------------------------------------------------------------------===//
-def TEX_LD : R600_TEX <
- 0x03, "TEX_LD",
- [(set v4f32:$DST_GPR, (int_AMDGPU_txf v4f32:$SRC_GPR,
- imm:$OFFSET_X, imm:$OFFSET_Y, imm:$OFFSET_Z, imm:$RESOURCE_ID,
- imm:$SAMPLER_ID, imm:$textureTarget))]
-> {
-let AsmString = "TEX_LD $DST_GPR, $SRC_GPR, $OFFSET_X, $OFFSET_Y, $OFFSET_Z,"
- "$RESOURCE_ID, $SAMPLER_ID, $textureTarget";
-let InOperandList = (ins R600_Reg128:$SRC_GPR, i32imm:$OFFSET_X,
- i32imm:$OFFSET_Y, i32imm:$OFFSET_Z, i32imm:$RESOURCE_ID, i32imm:$SAMPLER_ID,
- i32imm:$textureTarget);
-}
-
-def TEX_GET_TEXTURE_RESINFO : R600_TEX <
- 0x04, "TEX_GET_TEXTURE_RESINFO",
- [(set v4f32:$DST_GPR, (int_AMDGPU_txq v4f32:$SRC_GPR,
- imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))]
->;
-
-def TEX_GET_GRADIENTS_H : R600_TEX <
- 0x07, "TEX_GET_GRADIENTS_H",
- [(set v4f32:$DST_GPR, (int_AMDGPU_ddx v4f32:$SRC_GPR,
- imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))]
->;
-
-def TEX_GET_GRADIENTS_V : R600_TEX <
- 0x08, "TEX_GET_GRADIENTS_V",
- [(set v4f32:$DST_GPR, (int_AMDGPU_ddy v4f32:$SRC_GPR,
- imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))]
->;
-
-def TEX_SET_GRADIENTS_H : R600_TEX <
- 0x0B, "TEX_SET_GRADIENTS_H",
- []
->;
-
-def TEX_SET_GRADIENTS_V : R600_TEX <
- 0x0C, "TEX_SET_GRADIENTS_V",
- []
->;
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
-def TEX_SAMPLE : R600_TEX <
- 0x10, "TEX_SAMPLE",
- [(set v4f32:$DST_GPR, (int_AMDGPU_tex v4f32:$SRC_GPR,
- imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))]
->;
+class R600_TEX <bits<11> inst, string opName> :
+ InstR600 <(outs R600_Reg128:$DST_GPR),
+ (ins R600_Reg128:$SRC_GPR,
+ RSel:$srcx, RSel:$srcy, RSel:$srcz, RSel:$srcw,
+ i32imm:$offsetx, i32imm:$offsety, i32imm:$offsetz,
+ RSel:$DST_SEL_X, RSel:$DST_SEL_Y, RSel:$DST_SEL_Z, RSel:$DST_SEL_W,
+ i32imm:$RESOURCE_ID, i32imm:$SAMPLER_ID,
+ CT:$COORD_TYPE_X, CT:$COORD_TYPE_Y, CT:$COORD_TYPE_Z,
+ CT:$COORD_TYPE_W),
+ !strconcat(opName,
+ " $DST_GPR.$DST_SEL_X$DST_SEL_Y$DST_SEL_Z$DST_SEL_W, "
+ "$SRC_GPR.$srcx$srcy$srcz$srcw "
+ "RID:$RESOURCE_ID SID:$SAMPLER_ID "
+ "CT:$COORD_TYPE_X$COORD_TYPE_Y$COORD_TYPE_Z$COORD_TYPE_W"),
+ [],
+ NullALU>, TEX_WORD0, TEX_WORD1, TEX_WORD2 {
+ let Inst{31-0} = Word0;
+ let Inst{63-32} = Word1;
-def TEX_SAMPLE_C : R600_TEX <
- 0x18, "TEX_SAMPLE_C",
- [(set v4f32:$DST_GPR, (int_AMDGPU_tex v4f32:$SRC_GPR,
- imm:$RESOURCE_ID, imm:$SAMPLER_ID, TEX_SHADOW:$textureTarget))]
->;
+ let TEX_INST = inst{4-0};
+ let SRC_REL = 0;
+ let DST_REL = 0;
+ let LOD_BIAS = 0;
-def TEX_SAMPLE_L : R600_TEX <
- 0x11, "TEX_SAMPLE_L",
- [(set v4f32:$DST_GPR, (int_AMDGPU_txl v4f32:$SRC_GPR,
- imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))]
->;
+ let INST_MOD = 0;
+ let FETCH_WHOLE_QUAD = 0;
+ let ALT_CONST = 0;
+ let SAMPLER_INDEX_MODE = 0;
+ let RESOURCE_INDEX_MODE = 0;
-def TEX_SAMPLE_C_L : R600_TEX <
- 0x19, "TEX_SAMPLE_C_L",
- [(set v4f32:$DST_GPR, (int_AMDGPU_txl v4f32:$SRC_GPR,
- imm:$RESOURCE_ID, imm:$SAMPLER_ID, TEX_SHADOW:$textureTarget))]
->;
+ let TEXInst = 1;
+}
-def TEX_SAMPLE_LB : R600_TEX <
- 0x12, "TEX_SAMPLE_LB",
- [(set v4f32:$DST_GPR, (int_AMDGPU_txb v4f32:$SRC_GPR,
- imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))]
->;
+} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0
-def TEX_SAMPLE_C_LB : R600_TEX <
- 0x1A, "TEX_SAMPLE_C_LB",
- [(set v4f32:$DST_GPR, (int_AMDGPU_txb v4f32:$SRC_GPR,
- imm:$RESOURCE_ID, imm:$SAMPLER_ID, TEX_SHADOW:$textureTarget))]
->;
-def TEX_SAMPLE_G : R600_TEX <
- 0x14, "TEX_SAMPLE_G",
- []
->;
-def TEX_SAMPLE_C_G : R600_TEX <
- 0x1C, "TEX_SAMPLE_C_G",
- []
->;
+def TEX_SAMPLE : R600_TEX <0x10, "TEX_SAMPLE">;
+def TEX_SAMPLE_C : R600_TEX <0x18, "TEX_SAMPLE_C">;
+def TEX_SAMPLE_L : R600_TEX <0x11, "TEX_SAMPLE_L">;
+def TEX_SAMPLE_C_L : R600_TEX <0x19, "TEX_SAMPLE_C_L">;
+def TEX_SAMPLE_LB : R600_TEX <0x12, "TEX_SAMPLE_LB">;
+def TEX_SAMPLE_C_LB : R600_TEX <0x1A, "TEX_SAMPLE_C_LB">;
+def TEX_LD : R600_TEX <0x03, "TEX_LD">;
+def TEX_GET_TEXTURE_RESINFO : R600_TEX <0x04, "TEX_GET_TEXTURE_RESINFO">;
+def TEX_GET_GRADIENTS_H : R600_TEX <0x07, "TEX_GET_GRADIENTS_H">;
+def TEX_GET_GRADIENTS_V : R600_TEX <0x08, "TEX_GET_GRADIENTS_V">;
+def TEX_SET_GRADIENTS_H : R600_TEX <0x0B, "TEX_SET_GRADIENTS_H">;
+def TEX_SET_GRADIENTS_V : R600_TEX <0x0C, "TEX_SET_GRADIENTS_V">;
+def TEX_SAMPLE_G : R600_TEX <0x14, "TEX_SAMPLE_G">;
+def TEX_SAMPLE_C_G : R600_TEX <0x1C, "TEX_SAMPLE_C_G">;
+
+defm : TexPattern<0, TEX_SAMPLE>;
+defm : TexPattern<1, TEX_SAMPLE_C>;
+defm : TexPattern<2, TEX_SAMPLE_L>;
+defm : TexPattern<3, TEX_SAMPLE_C_L>;
+defm : TexPattern<4, TEX_SAMPLE_LB>;
+defm : TexPattern<5, TEX_SAMPLE_C_LB>;
+defm : TexPattern<6, TEX_LD, v4i32>;
+defm : TexPattern<7, TEX_GET_TEXTURE_RESINFO, v4i32>;
+defm : TexPattern<8, TEX_GET_GRADIENTS_H>;
+defm : TexPattern<9, TEX_GET_GRADIENTS_V>;
//===----------------------------------------------------------------------===//
// Helper classes for common instructions
@@ -1249,17 +1231,49 @@ class CNDGE_Common <bits<5> inst> : R600_3OP <
[(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_GE))]
>;
-multiclass DOT4_Common <bits<11> inst> {
-
- def _pseudo : R600_REDUCTION <inst,
- (ins R600_Reg128:$src0, R600_Reg128:$src1),
- "DOT4 $dst $src0, $src1",
- [(set f32:$dst, (int_AMDGPU_dp4 v4f32:$src0, v4f32:$src1))]
- >;
- def _real : R600_2OP <inst, "DOT4", []>;
+let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in {
+class R600_VEC2OP<list<dag> pattern> : InstR600 <(outs R600_Reg32:$dst), (ins
+// Slot X
+ UEM:$update_exec_mask_X, UP:$update_pred_X, WRITE:$write_X,
+ OMOD:$omod_X, REL:$dst_rel_X, CLAMP:$clamp_X,
+ R600_TReg32_X:$src0_X, NEG:$src0_neg_X, REL:$src0_rel_X, ABS:$src0_abs_X, SEL:$src0_sel_X,
+ R600_TReg32_X:$src1_X, NEG:$src1_neg_X, REL:$src1_rel_X, ABS:$src1_abs_X, SEL:$src1_sel_X,
+ R600_Pred:$pred_sel_X,
+// Slot Y
+ UEM:$update_exec_mask_Y, UP:$update_pred_Y, WRITE:$write_Y,
+ OMOD:$omod_Y, REL:$dst_rel_Y, CLAMP:$clamp_Y,
+ R600_TReg32_Y:$src0_Y, NEG:$src0_neg_Y, REL:$src0_rel_Y, ABS:$src0_abs_Y, SEL:$src0_sel_Y,
+ R600_TReg32_Y:$src1_Y, NEG:$src1_neg_Y, REL:$src1_rel_Y, ABS:$src1_abs_Y, SEL:$src1_sel_Y,
+ R600_Pred:$pred_sel_Y,
+// Slot Z
+ UEM:$update_exec_mask_Z, UP:$update_pred_Z, WRITE:$write_Z,
+ OMOD:$omod_Z, REL:$dst_rel_Z, CLAMP:$clamp_Z,
+ R600_TReg32_Z:$src0_Z, NEG:$src0_neg_Z, REL:$src0_rel_Z, ABS:$src0_abs_Z, SEL:$src0_sel_Z,
+ R600_TReg32_Z:$src1_Z, NEG:$src1_neg_Z, REL:$src1_rel_Z, ABS:$src1_abs_Z, SEL:$src1_sel_Z,
+ R600_Pred:$pred_sel_Z,
+// Slot W
+ UEM:$update_exec_mask_W, UP:$update_pred_W, WRITE:$write_W,
+ OMOD:$omod_W, REL:$dst_rel_W, CLAMP:$clamp_W,
+ R600_TReg32_W:$src0_W, NEG:$src0_neg_W, REL:$src0_rel_W, ABS:$src0_abs_W, SEL:$src0_sel_W,
+ R600_TReg32_W:$src1_W, NEG:$src1_neg_W, REL:$src1_rel_W, ABS:$src1_abs_W, SEL:$src1_sel_W,
+ R600_Pred:$pred_sel_W,
+ LITERAL:$literal0, LITERAL:$literal1),
+ "",
+ pattern,
+ AnyALU> {}
}
+def DOT_4 : R600_VEC2OP<[(set R600_Reg32:$dst, (DOT4
+ R600_TReg32_X:$src0_X, R600_TReg32_X:$src1_X,
+ R600_TReg32_Y:$src0_Y, R600_TReg32_Y:$src1_Y,
+ R600_TReg32_Z:$src0_Z, R600_TReg32_Z:$src1_Z,
+ R600_TReg32_W:$src0_W, R600_TReg32_W:$src1_W))]>;
+
+
+class DOT4_Common <bits<11> inst> : R600_2OP <inst, "DOT4", []>;
+
+
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
multiclass CUBE_Common <bits<11> inst> {
@@ -1432,7 +1446,7 @@ let Predicates = [isR600] in {
def CNDE_r600 : CNDE_Common<0x18>;
def CNDGT_r600 : CNDGT_Common<0x19>;
def CNDGE_r600 : CNDGE_Common<0x1A>;
- defm DOT4_r600 : DOT4_Common<0x50>;
+ def DOT4_r600 : DOT4_Common<0x50>;
defm CUBE_r600 : CUBE_Common<0x52>;
def EXP_IEEE_r600 : EXP_IEEE_Common<0x61>;
def LOG_CLAMPED_r600 : LOG_CLAMPED_Common<0x62>;
@@ -1611,14 +1625,13 @@ let Predicates = [isEGorCayman] in {
i32:$src2))],
VecALU
>;
+ def : BFEPattern <BFE_UINT_eg>;
- def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", []>;
+ def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", [], VecALU>;
defm : BFIPatterns <BFI_INT_eg>;
- def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT",
- [(set i32:$dst, (AMDGPUbitalign i32:$src0, i32:$src1, i32:$src2))],
- VecALU
- >;
+ def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>;
+ def : ROTRPattern <BIT_ALIGN_INT_eg>;
def MULADD_eg : MULADD_Common<0x14>;
def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>;
@@ -1630,7 +1643,7 @@ let Predicates = [isEGorCayman] in {
def CNDGE_eg : CNDGE_Common<0x1B>;
def MUL_LIT_eg : MUL_LIT_Common<0x1F>;
def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>;
- defm DOT4_eg : DOT4_Common<0xBE>;
+ def DOT4_eg : DOT4_Common<0xBE>;
defm CUBE_eg : CUBE_Common<0xC0>;
let hasSideEffects = 1 in {
@@ -1665,6 +1678,9 @@ let hasSideEffects = 1 in {
def : Pat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>;
+ // SHA-256 Patterns
+ def : SHA256MaPattern <BFI_INT_eg, XOR_INT>;
+
def EG_ExportSwz : ExportSwzInst {
let Word1{19-16} = 0; // BURST_COUNT
let Word1{20} = 1; // VALID_PIXEL_MODE
@@ -1743,8 +1759,7 @@ let usesCustomInserter = 1 in {
class RAT_WRITE_CACHELESS_eg <dag ins, bits<4> comp_mask, string name,
list<dag> pattern>
- : EG_CF_RAT <0x57, 0x2, 0, (outs), ins,
- !strconcat(name, " $rw_gpr, $index_gpr, $eop"), pattern> {
+ : EG_CF_RAT <0x57, 0x2, 0, (outs), ins, name, pattern> {
let RIM = 0;
// XXX: Have a separate instruction for non-indexed writes.
let TYPE = 1;
@@ -1764,19 +1779,19 @@ class RAT_WRITE_CACHELESS_eg <dag ins, bits<4> comp_mask, string name,
// 32-bit store
def RAT_WRITE_CACHELESS_32_eg : RAT_WRITE_CACHELESS_eg <
(ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
- 0x1, "RAT_WRITE_CACHELESS_32_eg",
+ 0x1, "RAT_WRITE_CACHELESS_32_eg $rw_gpr, $index_gpr, $eop",
[(global_store i32:$rw_gpr, i32:$index_gpr)]
>;
//128-bit store
def RAT_WRITE_CACHELESS_128_eg : RAT_WRITE_CACHELESS_eg <
(ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
- 0xf, "RAT_WRITE_CACHELESS_128",
+ 0xf, "RAT_WRITE_CACHELESS_128 $rw_gpr.XYZW, $index_gpr, $eop",
[(global_store v4i32:$rw_gpr, i32:$index_gpr)]
>;
class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
- : InstR600ISA <outs, (ins MEMxi:$ptr), name#" $dst, $ptr", pattern>,
+ : InstR600ISA <outs, (ins MEMxi:$ptr), name, pattern>,
VTX_WORD1_GPR, VTX_WORD0 {
// Static fields
@@ -1831,7 +1846,7 @@ class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
}
class VTX_READ_8_eg <bits<8> buffer_id, list<dag> pattern>
- : VTX_READ_eg <"VTX_READ_8", buffer_id, (outs R600_TReg32_X:$dst),
+ : VTX_READ_eg <"VTX_READ_8 $dst, $ptr", buffer_id, (outs R600_TReg32_X:$dst),
pattern> {
let MEGA_FETCH_COUNT = 1;
@@ -1843,7 +1858,7 @@ class VTX_READ_8_eg <bits<8> buffer_id, list<dag> pattern>
}
class VTX_READ_16_eg <bits<8> buffer_id, list<dag> pattern>
- : VTX_READ_eg <"VTX_READ_16", buffer_id, (outs R600_TReg32_X:$dst),
+ : VTX_READ_eg <"VTX_READ_16 $dst, $ptr", buffer_id, (outs R600_TReg32_X:$dst),
pattern> {
let MEGA_FETCH_COUNT = 2;
let DST_SEL_X = 0;
@@ -1855,7 +1870,7 @@ class VTX_READ_16_eg <bits<8> buffer_id, list<dag> pattern>
}
class VTX_READ_32_eg <bits<8> buffer_id, list<dag> pattern>
- : VTX_READ_eg <"VTX_READ_32", buffer_id, (outs R600_TReg32_X:$dst),
+ : VTX_READ_eg <"VTX_READ_32 $dst, $ptr", buffer_id, (outs R600_TReg32_X:$dst),
pattern> {
let MEGA_FETCH_COUNT = 4;
@@ -1876,7 +1891,7 @@ class VTX_READ_32_eg <bits<8> buffer_id, list<dag> pattern>
}
class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern>
- : VTX_READ_eg <"VTX_READ_128", buffer_id, (outs R600_Reg128:$dst),
+ : VTX_READ_eg <"VTX_READ_128 $dst.XYZW, $ptr", buffer_id, (outs R600_Reg128:$dst),
pattern> {
let MEGA_FETCH_COUNT = 16;
diff --git a/lib/Target/R600/R600Intrinsics.td b/lib/Target/R600/R600Intrinsics.td
index dc8980a..58d86b6 100644
--- a/lib/Target/R600/R600Intrinsics.td
+++ b/lib/Target/R600/R600Intrinsics.td
@@ -12,12 +12,49 @@
//===----------------------------------------------------------------------===//
let TargetPrefix = "R600", isTarget = 1 in {
+ class TextureIntrinsicFloatInput :
+ Intrinsic<[llvm_v4f32_ty], [
+ llvm_v4f32_ty, // Coord
+ llvm_i32_ty, // offset_x
+ llvm_i32_ty, // offset_y,
+ llvm_i32_ty, // offset_z,
+ llvm_i32_ty, // resource_id
+ llvm_i32_ty, // samplerid
+ llvm_i32_ty, // coord_type_x
+ llvm_i32_ty, // coord_type_y
+ llvm_i32_ty, // coord_type_z
+ llvm_i32_ty // coord_type_w
+ ], [IntrNoMem]>;
+ class TextureIntrinsicInt32Input :
+ Intrinsic<[llvm_v4i32_ty], [
+ llvm_v4i32_ty, // Coord
+ llvm_i32_ty, // offset_x
+ llvm_i32_ty, // offset_y,
+ llvm_i32_ty, // offset_z,
+ llvm_i32_ty, // resource_id
+ llvm_i32_ty, // samplerid
+ llvm_i32_ty, // coord_type_x
+ llvm_i32_ty, // coord_type_y
+ llvm_i32_ty, // coord_type_z
+ llvm_i32_ty // coord_type_w
+ ], [IntrNoMem]>;
+
def int_R600_load_input :
Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
def int_R600_interp_input :
Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_R600_load_texbuf :
Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_R600_tex : TextureIntrinsicFloatInput;
+ def int_R600_texc : TextureIntrinsicFloatInput;
+ def int_R600_txl : TextureIntrinsicFloatInput;
+ def int_R600_txlc : TextureIntrinsicFloatInput;
+ def int_R600_txb : TextureIntrinsicFloatInput;
+ def int_R600_txbc : TextureIntrinsicFloatInput;
+ def int_R600_txf : TextureIntrinsicInt32Input;
+ def int_R600_txq : TextureIntrinsicInt32Input;
+ def int_R600_ddx : TextureIntrinsicFloatInput;
+ def int_R600_ddy : TextureIntrinsicFloatInput;
def int_R600_store_swizzle :
Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
def int_R600_store_stream_output :
diff --git a/lib/Target/R600/R600MachineFunctionInfo.h b/lib/Target/R600/R600MachineFunctionInfo.h
index 70fddbb..f23d9b7 100644
--- a/lib/Target/R600/R600MachineFunctionInfo.h
+++ b/lib/Target/R600/R600MachineFunctionInfo.h
@@ -13,9 +13,9 @@
#ifndef R600MACHINEFUNCTIONINFO_H
#define R600MACHINEFUNCTIONINFO_H
+#include "AMDGPUMachineFunction.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/CodeGen/SelectionDAG.h"
-#include "AMDGPUMachineFunction.h"
#include <vector>
namespace llvm {
diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
index a777142..a330d88 100644
--- a/lib/Target/R600/R600MachineScheduler.cpp
+++ b/lib/Target/R600/R600MachineScheduler.cpp
@@ -16,12 +16,11 @@
#define DEBUG_TYPE "misched"
#include "R600MachineScheduler.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/PassManager.h"
#include "llvm/Support/raw_ostream.h"
-#include <set>
using namespace llvm;
@@ -31,53 +30,87 @@ void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
TII = static_cast<const R600InstrInfo*>(DAG->TII);
TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
MRI = &DAG->MRI;
- Available[IDAlu]->clear();
- Available[IDFetch]->clear();
- Available[IDOther]->clear();
CurInstKind = IDOther;
CurEmitted = 0;
OccupedSlotsMask = 15;
InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
-
+ InstKindLimit[IDOther] = 32;
const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>();
- if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD5XXX) {
- InstKindLimit[IDFetch] = 7; // 8 minus 1 for security
- } else {
- InstKindLimit[IDFetch] = 15; // 16 minus 1 for security
- }
+ InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
+ AluInstCount = 0;
+ FetchInstCount = 0;
}
-void R600SchedStrategy::MoveUnits(ReadyQueue *QSrc, ReadyQueue *QDst)
+void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
+ std::vector<SUnit *> &QDst)
{
- if (QSrc->empty())
- return;
- for (ReadyQueue::iterator I = QSrc->begin(),
- E = QSrc->end(); I != E; ++I) {
- (*I)->NodeQueueId &= ~QSrc->getID();
- QDst->push(*I);
- }
- QSrc->clear();
+ QDst.insert(QDst.end(), QSrc.begin(), QSrc.end());
+ QSrc.clear();
+}
+
+static
+unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
+ assert (GPRCount && "GPRCount cannot be 0");
+ return 248 / GPRCount;
}
SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
SUnit *SU = 0;
- IsTopNode = true;
NextInstKind = IDOther;
+ IsTopNode = false;
+
// check if we might want to switch current clause type
- bool AllowSwitchToAlu = (CurInstKind == IDOther) ||
- (CurEmitted > InstKindLimit[CurInstKind]) ||
- (Available[CurInstKind]->empty());
- bool AllowSwitchFromAlu = (CurEmitted > InstKindLimit[CurInstKind]) &&
- (!Available[IDFetch]->empty() || !Available[IDOther]->empty());
-
- if ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
- (!AllowSwitchFromAlu && CurInstKind == IDAlu)) {
+ bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) ||
+ (Available[CurInstKind].empty());
+ bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
+ (!Available[IDFetch].empty() || !Available[IDOther].empty());
+
+ if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
+ // We use the heuristic provided by AMD Accelerated Parallel Processing
+ // OpenCL Programming Guide :
+ // The approx. number of WF that allows TEX inst to hide ALU inst is :
+ // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
+ float ALUFetchRationEstimate =
+ (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
+ (FetchInstCount + Available[IDFetch].size());
+ unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
+ DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
+ // We assume the local GPR requirements to be "dominated" by the requirements
+ // of the TEX clause (which consumes 128-bit regs); ALU instructions before
+ // and after TEX are indeed likely to consume or generate values from/for the
+ // TEX clause.
+ // Available[IDFetch].size() * 2 : GPRs required by the Fetch clause.
+ // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (needing
+ // one GPR) or TmXYZW = TnXYZW (needing 2 GPRs).
+ // (TODO: use RegisterPressure)
+ // If we are going to use too many GPRs, we flush the Fetch instructions to
+ // lower register pressure on 128-bit regs.
+ unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
+ if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
+ AllowSwitchFromAlu = true;
+ }
+
+
+ // We want to schedule AR defs as soon as possible to make sure they aren't
+ // put in a different ALU clause from their uses.
+ if (!SU && !UnscheduledARDefs.empty()) {
+ SU = UnscheduledARDefs[0];
+ UnscheduledARDefs.erase(UnscheduledARDefs.begin());
+ NextInstKind = IDAlu;
+ }
+
+ if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
+ (!AllowSwitchFromAlu && CurInstKind == IDAlu))) {
// try to pick ALU
SU = pickAlu();
+ if (!SU && !PhysicalRegCopy.empty()) {
+ SU = PhysicalRegCopy.front();
+ PhysicalRegCopy.erase(PhysicalRegCopy.begin());
+ }
if (SU) {
- if (CurEmitted > InstKindLimit[IDAlu])
+ if (CurEmitted >= InstKindLimit[IDAlu])
CurEmitted = 0;
NextInstKind = IDAlu;
}
@@ -97,16 +130,21 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
NextInstKind = IDOther;
}
+ // We want to schedule the AR uses as late as possible to make sure that
+ // the AR defs have been released.
+ if (!SU && !UnscheduledARUses.empty()) {
+ SU = UnscheduledARUses[0];
+ UnscheduledARUses.erase(UnscheduledARUses.begin());
+ NextInstKind = IDAlu;
+ }
+
+
DEBUG(
if (SU) {
- dbgs() << "picked node: ";
+ dbgs() << " ** Pick node **\n";
SU->dump(DAG);
} else {
- dbgs() << "NO NODE ";
- for (int i = 0; i < IDLast; ++i) {
- Available[i]->dump();
- Pending[i]->dump();
- }
+ dbgs() << "NO NODE \n";
for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
const SUnit &S = DAG->SUnits[i];
if (!S.isScheduled)
@@ -119,10 +157,6 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
}
void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
-
- DEBUG(dbgs() << "scheduled: ");
- DEBUG(SU->dump(DAG));
-
if (NextInstKind != CurInstKind) {
DEBUG(dbgs() << "Instruction Type Switch\n");
if (NextInstKind != IDAlu)
@@ -132,6 +166,7 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
}
if (CurInstKind == IDAlu) {
+ AluInstCount ++;
switch (getAluKind(SU)) {
case AluT_XYZW:
CurEmitted += 4;
@@ -157,20 +192,51 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
if (CurInstKind != IDFetch) {
MoveUnits(Pending[IDFetch], Available[IDFetch]);
- }
- MoveUnits(Pending[IDOther], Available[IDOther]);
+ } else
+ FetchInstCount++;
}
-void R600SchedStrategy::releaseTopNode(SUnit *SU) {
- int IK = getInstKind(SU);
+static bool
+isPhysicalRegCopy(MachineInstr *MI) {
+ if (MI->getOpcode() != AMDGPU::COPY)
+ return false;
- DEBUG(dbgs() << IK << " <= ");
- DEBUG(SU->dump(DAG));
+ return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg());
+}
- Pending[IK]->push(SU);
+void R600SchedStrategy::releaseTopNode(SUnit *SU) {
+ DEBUG(dbgs() << "Top Releasing ";SU->dump(DAG););
}
void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
+ DEBUG(dbgs() << "Bottom Releasing ";SU->dump(DAG););
+ if (isPhysicalRegCopy(SU->getInstr())) {
+ PhysicalRegCopy.push_back(SU);
+ return;
+ }
+
+ int IK = getInstKind(SU);
+
+ // Check for AR register defines
+ for (MachineInstr::const_mop_iterator I = SU->getInstr()->operands_begin(),
+ E = SU->getInstr()->operands_end();
+ I != E; ++I) {
+ if (I->isReg() && I->getReg() == AMDGPU::AR_X) {
+ if (I->isDef()) {
+ UnscheduledARDefs.push_back(SU);
+ } else {
+ UnscheduledARUses.push_back(SU);
+ }
+ return;
+ }
+ }
+
+ // There is no export clause, so we can schedule one as soon as it's ready.
+ if (IK == IDOther)
+ Available[IDOther].push_back(SU);
+ else
+ Pending[IK].push_back(SU);
+
}
bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
@@ -186,17 +252,15 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
MachineInstr *MI = SU->getInstr();
switch (MI->getOpcode()) {
+ case AMDGPU::PRED_X:
+ return AluPredX;
case AMDGPU::INTERP_PAIR_XY:
case AMDGPU::INTERP_PAIR_ZW:
case AMDGPU::INTERP_VEC_LOAD:
+ case AMDGPU::DOT_4:
return AluT_XYZW;
case AMDGPU::COPY:
- if (TargetRegisterInfo::isPhysicalRegister(MI->getOperand(1).getReg())) {
- // %vregX = COPY Tn_X is likely to be discarded in favor of an
- // assignement of Tn_X to %vregX, don't considers it in scheduling
- return AluDiscarded;
- }
- else if (MI->getOperand(1).isUndef()) {
+ if (MI->getOperand(1).isUndef()) {
// MI will become a KILL, don't considers it in scheduling
return AluDiscarded;
}
@@ -246,57 +310,37 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
int R600SchedStrategy::getInstKind(SUnit* SU) {
int Opcode = SU->getInstr()->getOpcode();
+ if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode))
+ return IDFetch;
+
if (TII->isALUInstr(Opcode)) {
return IDAlu;
}
switch (Opcode) {
+ case AMDGPU::PRED_X:
case AMDGPU::COPY:
case AMDGPU::CONST_COPY:
case AMDGPU::INTERP_PAIR_XY:
case AMDGPU::INTERP_PAIR_ZW:
case AMDGPU::INTERP_VEC_LOAD:
- case AMDGPU::DOT4_eg_pseudo:
- case AMDGPU::DOT4_r600_pseudo:
+ case AMDGPU::DOT_4:
return IDAlu;
- case AMDGPU::TEX_VTX_CONSTBUF:
- case AMDGPU::TEX_VTX_TEXBUF:
- case AMDGPU::TEX_LD:
- case AMDGPU::TEX_GET_TEXTURE_RESINFO:
- case AMDGPU::TEX_GET_GRADIENTS_H:
- case AMDGPU::TEX_GET_GRADIENTS_V:
- case AMDGPU::TEX_SET_GRADIENTS_H:
- case AMDGPU::TEX_SET_GRADIENTS_V:
- case AMDGPU::TEX_SAMPLE:
- case AMDGPU::TEX_SAMPLE_C:
- case AMDGPU::TEX_SAMPLE_L:
- case AMDGPU::TEX_SAMPLE_C_L:
- case AMDGPU::TEX_SAMPLE_LB:
- case AMDGPU::TEX_SAMPLE_C_LB:
- case AMDGPU::TEX_SAMPLE_G:
- case AMDGPU::TEX_SAMPLE_C_G:
- case AMDGPU::TXD:
- case AMDGPU::TXD_SHADOW:
- return IDFetch;
default:
- DEBUG(
- dbgs() << "other inst: ";
- SU->dump(DAG);
- );
return IDOther;
}
}
-SUnit *R600SchedStrategy::PopInst(std::multiset<SUnit *, CompareSUnit> &Q) {
+SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q) {
if (Q.empty())
return NULL;
- for (std::set<SUnit *, CompareSUnit>::iterator It = Q.begin(), E = Q.end();
+ for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend();
It != E; ++It) {
SUnit *SU = *It;
InstructionsGroupCandidate.push_back(SU->getInstr());
if (TII->canBundle(InstructionsGroupCandidate)) {
InstructionsGroupCandidate.pop_back();
- Q.erase(It);
+ Q.erase((It + 1).base());
return SU;
} else {
InstructionsGroupCandidate.pop_back();
@@ -306,14 +350,12 @@ SUnit *R600SchedStrategy::PopInst(std::multiset<SUnit *, CompareSUnit> &Q) {
}
void R600SchedStrategy::LoadAlu() {
- ReadyQueue *QSrc = Pending[IDAlu];
- for (ReadyQueue::iterator I = QSrc->begin(),
- E = QSrc->end(); I != E; ++I) {
- (*I)->NodeQueueId &= ~QSrc->getID();
- AluKind AK = getAluKind(*I);
- AvailableAlus[AK].insert(*I);
- }
- QSrc->clear();
+ std::vector<SUnit *> &QSrc = Pending[IDAlu];
+ for (unsigned i = 0, e = QSrc.size(); i < e; ++i) {
+ AluKind AK = getAluKind(QSrc[i]);
+ AvailableAlus[AK].push_back(QSrc[i]);
+ }
+ QSrc.clear();
}
void R600SchedStrategy::PrepareNextSlot() {
@@ -355,35 +397,29 @@ void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot) {
static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]]);
- SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny]);
- if (!UnslotedSU) {
+ if (SlotedSU)
return SlotedSU;
- } else if (!SlotedSU) {
+ SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny]);
+ if (UnslotedSU)
AssignSlot(UnslotedSU->getInstr(), Slot);
- return UnslotedSU;
- } else {
- //Determine which one to pick (the lesser one)
- if (CompareSUnit()(SlotedSU, UnslotedSU)) {
- AvailableAlus[AluAny].insert(UnslotedSU);
- return SlotedSU;
- } else {
- AvailableAlus[IndexToID[Slot]].insert(SlotedSU);
- AssignSlot(UnslotedSU->getInstr(), Slot);
- return UnslotedSU;
- }
- }
+ return UnslotedSU;
}
-bool R600SchedStrategy::isAvailablesAluEmpty() const {
- return Pending[IDAlu]->empty() && AvailableAlus[AluAny].empty() &&
- AvailableAlus[AluT_XYZW].empty() && AvailableAlus[AluT_X].empty() &&
- AvailableAlus[AluT_Y].empty() && AvailableAlus[AluT_Z].empty() &&
- AvailableAlus[AluT_W].empty() && AvailableAlus[AluDiscarded].empty();
+unsigned R600SchedStrategy::AvailablesAluCount() const {
+ return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
+ AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
+ AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
+ AvailableAlus[AluDiscarded].size() + AvailableAlus[AluPredX].size();
}
SUnit* R600SchedStrategy::pickAlu() {
- while (!isAvailablesAluEmpty()) {
+ while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
if (!OccupedSlotsMask) {
+ // Bottom-up scheduling: PRED_X must come first.
+ if (!AvailableAlus[AluPredX].empty()) {
+ OccupedSlotsMask = 15;
+ return PopInst(AvailableAlus[AluPredX]);
+ }
// Flush physical reg copies (RA will discard them)
if (!AvailableAlus[AluDiscarded].empty()) {
OccupedSlotsMask = 15;
@@ -395,7 +431,7 @@ SUnit* R600SchedStrategy::pickAlu() {
return PopInst(AvailableAlus[AluT_XYZW]);
}
}
- for (unsigned Chan = 0; Chan < 4; ++Chan) {
+ for (int Chan = 3; Chan > -1; --Chan) {
bool isOccupied = OccupedSlotsMask & (1 << Chan);
if (!isOccupied) {
SUnit *SU = AttemptFillSlot(Chan);
@@ -413,14 +449,14 @@ SUnit* R600SchedStrategy::pickAlu() {
SUnit* R600SchedStrategy::pickOther(int QID) {
SUnit *SU = 0;
- ReadyQueue *AQ = Available[QID];
+ std::vector<SUnit *> &AQ = Available[QID];
- if (AQ->empty()) {
+ if (AQ.empty()) {
MoveUnits(Pending[QID], AQ);
}
- if (!AQ->empty()) {
- SU = *AQ->begin();
- AQ->remove(AQ->begin());
+ if (!AQ.empty()) {
+ SU = AQ.back();
+ AQ.resize(AQ.size() - 1);
}
return SU;
}
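
The clause-switch heuristic added to pickNode() above boils down to comparing two numbers: the wavefront count needed to hide TEX latency (62.5 divided by the ALU/fetch ratio, i.e. 500 TEX cycles over 8 ALU cycles) and the wavefront count the near-term 128-bit GPR requirement still allows (248 / GPRCount). A worked example with made-up instruction counts; the constants match the code above, everything else is illustrative.

#include <cstdio>

int main() {
  // Made-up snapshot of the scheduler state.
  unsigned AluInstCount   = 40, AvailableAlu   = 20, PendingAlu = 20;
  unsigned FetchInstCount = 4,  AvailableFetch = 12;

  float Ratio = float(AluInstCount + AvailableAlu + PendingAlu) /
                float(FetchInstCount + AvailableFetch);      // 80 / 16 = 5.0
  unsigned NeededWF = unsigned(62.5f / Ratio);               // 12 wavefronts
  unsigned FetchGPRs = 2 * AvailableFetch;                   // 24 128-bit GPRs
  unsigned WFLimitedByGPR = 248 / FetchGPRs;                 // 10 wavefronts

  // 12 > 10: too few wavefronts would fit, so the strategy sets
  // AllowSwitchFromAlu and flushes the fetch clause to relieve pressure.
  std::printf("NeededWF=%u, WFLimitedByGPR=%u -> flush fetch: %s\n",
              NeededWF, WFLimitedByGPR,
              NeededWF > WFLimitedByGPR ? "yes" : "no");
  return 0;
}
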
diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h
index 3d0367f..aae8b3f 100644
--- a/lib/Target/R600/R600MachineScheduler.h
+++ b/lib/Target/R600/R600MachineScheduler.h
@@ -16,21 +16,14 @@
#define R600MACHINESCHEDULER_H_
#include "R600InstrInfo.h"
+#include "llvm/ADT/PriorityQueue.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/Support/Debug.h"
-#include "llvm/ADT/PriorityQueue.h"
using namespace llvm;
namespace llvm {
-class CompareSUnit {
-public:
- bool operator()(const SUnit *S1, const SUnit *S2) {
- return S1->getDepth() > S2->getDepth();
- }
-};
-
class R600SchedStrategy : public MachineSchedStrategy {
const ScheduleDAGMI *DAG;
@@ -38,12 +31,6 @@ class R600SchedStrategy : public MachineSchedStrategy {
const R600RegisterInfo *TRI;
MachineRegisterInfo *MRI;
- enum InstQueue {
- QAlu = 1,
- QFetch = 2,
- QOther = 4
- };
-
enum InstKind {
IDAlu,
IDFetch,
@@ -58,17 +45,24 @@ class R600SchedStrategy : public MachineSchedStrategy {
AluT_Z,
AluT_W,
AluT_XYZW,
+ AluPredX,
AluDiscarded, // LLVM Instructions that are going to be eliminated
AluLast
};
- ReadyQueue *Available[IDLast], *Pending[IDLast];
- std::multiset<SUnit *, CompareSUnit> AvailableAlus[AluLast];
+ std::vector<SUnit *> Available[IDLast], Pending[IDLast];
+ std::vector<SUnit *> AvailableAlus[AluLast];
+ std::vector<SUnit *> UnscheduledARDefs;
+ std::vector<SUnit *> UnscheduledARUses;
+ std::vector<SUnit *> PhysicalRegCopy;
InstKind CurInstKind;
int CurEmitted;
InstKind NextInstKind;
+ unsigned AluInstCount;
+ unsigned FetchInstCount;
+
int InstKindLimit[IDLast];
int OccupedSlotsMask;
@@ -76,19 +70,9 @@ class R600SchedStrategy : public MachineSchedStrategy {
public:
R600SchedStrategy() :
DAG(0), TII(0), TRI(0), MRI(0) {
- Available[IDAlu] = new ReadyQueue(QAlu, "AAlu");
- Available[IDFetch] = new ReadyQueue(QFetch, "AFetch");
- Available[IDOther] = new ReadyQueue(QOther, "AOther");
- Pending[IDAlu] = new ReadyQueue(QAlu<<4, "PAlu");
- Pending[IDFetch] = new ReadyQueue(QFetch<<4, "PFetch");
- Pending[IDOther] = new ReadyQueue(QOther<<4, "POther");
}
virtual ~R600SchedStrategy() {
- for (unsigned I = 0; I < IDLast; ++I) {
- delete Available[I];
- delete Pending[I];
- }
}
virtual void initialize(ScheduleDAGMI *dag);
@@ -104,15 +88,15 @@ private:
bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const;
AluKind getAluKind(SUnit *SU) const;
void LoadAlu();
- bool isAvailablesAluEmpty() const;
+ unsigned AvailablesAluCount() const;
SUnit *AttemptFillSlot (unsigned Slot);
void PrepareNextSlot();
- SUnit *PopInst(std::multiset<SUnit *, CompareSUnit> &Q);
+ SUnit *PopInst(std::vector<SUnit*> &Q);
void AssignSlot(MachineInstr *MI, unsigned Slot);
SUnit* pickAlu();
SUnit* pickOther(int QID);
- void MoveUnits(ReadyQueue *QSrc, ReadyQueue *QDst);
+ void MoveUnits(std::vector<SUnit *> &QSrc, std::vector<SUnit *> &QDst);
};
} // namespace llvm
diff --git a/lib/Target/R600/R600OptimizeVectorRegisters.cpp b/lib/Target/R600/R600OptimizeVectorRegisters.cpp
new file mode 100644
index 0000000..4636426
--- /dev/null
+++ b/lib/Target/R600/R600OptimizeVectorRegisters.cpp
@@ -0,0 +1,372 @@
+//===----------------- R600OptimizeVectorRegisters.cpp --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass merges the inputs of swizzleable instructions into vectors that
+/// share common data and/or have enough undef subregs, using the instructions'
+/// swizzle abilities.
+///
+/// For instance, consider the following pseudo-code:
+/// vreg5<def> = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3
+/// ...
+/// vreg7<def> = REG_SEQ vreg1, sub0, vreg3, sub1, undef, sub2, vreg4, sub3
+/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub1, sub2, sub3
+///
+/// is turned into :
+/// vreg5<def> = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3
+/// ...
+/// vreg7<def> = INSERT_SUBREG vreg4, sub3
+/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub2, sub1, sub3
+///
+/// This allows regalloc to reduce register pressure for vector registers and
+/// to reduce the MOV count.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "vec-merger"
+#include "llvm/Support/Debug.h"
+#include "AMDGPU.h"
+#include "R600InstrInfo.h"
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+static bool
+isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) {
+ for (MachineRegisterInfo::def_iterator It = MRI.def_begin(Reg),
+ E = MRI.def_end(); It != E; ++It) {
+ return (*It).isImplicitDef();
+ }
+ llvm_unreachable("Reg without a def");
+ return false;
+}
+
+class RegSeqInfo {
+public:
+ MachineInstr *Instr;
+ DenseMap<unsigned, unsigned> RegToChan;
+ std::vector<unsigned> UndefReg;
+ RegSeqInfo(MachineRegisterInfo &MRI, MachineInstr *MI) : Instr(MI) {
+ assert (MI->getOpcode() == AMDGPU::REG_SEQUENCE);
+ for (unsigned i = 1, e = Instr->getNumOperands(); i < e; i+=2) {
+ MachineOperand &MO = Instr->getOperand(i);
+ unsigned Chan = Instr->getOperand(i + 1).getImm();
+ if (isImplicitlyDef(MRI, MO.getReg()))
+ UndefReg.push_back(Chan);
+ else
+ RegToChan[MO.getReg()] = Chan;
+ }
+ }
+ RegSeqInfo() {}
+
+ bool operator==(const RegSeqInfo &RSI) const {
+ return RSI.Instr == Instr;
+ }
+};
+
+class R600VectorRegMerger : public MachineFunctionPass {
+private:
+ MachineRegisterInfo *MRI;
+ const R600InstrInfo *TII;
+ bool canSwizzle(const MachineInstr &) const;
+ bool areAllUsesSwizzeable(unsigned Reg) const;
+ void SwizzleInput(MachineInstr &,
+ const std::vector<std::pair<unsigned, unsigned> > &) const;
+ bool tryMergeVector(const RegSeqInfo *, RegSeqInfo *,
+ std::vector<std::pair<unsigned, unsigned> > &Remap) const;
+ bool tryMergeUsingCommonSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI,
+ std::vector<std::pair<unsigned, unsigned> > &RemapChan);
+ bool tryMergeUsingFreeSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI,
+ std::vector<std::pair<unsigned, unsigned> > &RemapChan);
+ MachineInstr *RebuildVector(RegSeqInfo *MI,
+ const RegSeqInfo *BaseVec,
+ const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const;
+ void RemoveMI(MachineInstr *);
+ void trackRSI(const RegSeqInfo &RSI);
+
+ typedef DenseMap<unsigned, std::vector<MachineInstr *> > InstructionSetMap;
+ DenseMap<MachineInstr *, RegSeqInfo> PreviousRegSeq;
+ InstructionSetMap PreviousRegSeqByReg;
+ InstructionSetMap PreviousRegSeqByUndefCount;
+public:
+ static char ID;
+ R600VectorRegMerger(TargetMachine &tm) : MachineFunctionPass(ID),
+ TII(0) { }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ const char *getPassName() const {
+ return "R600 Vector Registers Merge Pass";
+ }
+
+ bool runOnMachineFunction(MachineFunction &Fn);
+};
+
+char R600VectorRegMerger::ID = 0;
+
+bool R600VectorRegMerger::canSwizzle(const MachineInstr &MI)
+ const {
+ if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST)
+ return true;
+ switch (MI.getOpcode()) {
+ case AMDGPU::R600_ExportSwz:
+ case AMDGPU::EG_ExportSwz:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched,
+ RegSeqInfo *ToMerge, std::vector< std::pair<unsigned, unsigned> > &Remap)
+ const {
+ unsigned CurrentUndexIdx = 0;
+ for (DenseMap<unsigned, unsigned>::iterator It = ToMerge->RegToChan.begin(),
+ E = ToMerge->RegToChan.end(); It != E; ++It) {
+ DenseMap<unsigned, unsigned>::const_iterator PosInUntouched =
+ Untouched->RegToChan.find((*It).first);
+ if (PosInUntouched != Untouched->RegToChan.end()) {
+ Remap.push_back(std::pair<unsigned, unsigned>
+ ((*It).second, (*PosInUntouched).second));
+ continue;
+ }
+ if (CurrentUndexIdx >= Untouched->UndefReg.size())
+ return false;
+ Remap.push_back(std::pair<unsigned, unsigned>
+ ((*It).second, Untouched->UndefReg[CurrentUndexIdx++]));
+ }
+
+ return true;
+}
+
+static
+unsigned getReassignedChan(
+ const std::vector<std::pair<unsigned, unsigned> > &RemapChan,
+ unsigned Chan) {
+ for (unsigned j = 0, je = RemapChan.size(); j < je; j++) {
+ if (RemapChan[j].first == Chan)
+ return RemapChan[j].second;
+ }
+ llvm_unreachable("Chan wasn't reassigned");
+}
+
+MachineInstr *R600VectorRegMerger::RebuildVector(
+ RegSeqInfo *RSI, const RegSeqInfo *BaseRSI,
+ const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const {
+ unsigned Reg = RSI->Instr->getOperand(0).getReg();
+ MachineBasicBlock::iterator Pos = RSI->Instr;
+ MachineBasicBlock &MBB = *Pos->getParent();
+ DebugLoc DL = Pos->getDebugLoc();
+
+ unsigned SrcVec = BaseRSI->Instr->getOperand(0).getReg();
+ DenseMap<unsigned, unsigned> UpdatedRegToChan = BaseRSI->RegToChan;
+ std::vector<unsigned> UpdatedUndef = BaseRSI->UndefReg;
+ for (DenseMap<unsigned, unsigned>::iterator It = RSI->RegToChan.begin(),
+ E = RSI->RegToChan.end(); It != E; ++It) {
+ if (BaseRSI->RegToChan.find((*It).first) != BaseRSI->RegToChan.end()) {
+ UpdatedRegToChan[(*It).first] = (*It).second;
+ continue;
+ }
+ unsigned DstReg = MRI->createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
+ unsigned SubReg = (*It).first;
+ unsigned Swizzle = (*It).second;
+ unsigned Chan = getReassignedChan(RemapChan, Swizzle);
+
+ MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::INSERT_SUBREG),
+ DstReg)
+ .addReg(SrcVec)
+ .addReg(SubReg)
+ .addImm(Chan);
+ UpdatedRegToChan[SubReg] = Chan;
+ std::vector<unsigned>::iterator ChanPos =
+ std::find(UpdatedUndef.begin(), UpdatedUndef.end(), Chan);
+ if (ChanPos != UpdatedUndef.end())
+ UpdatedUndef.erase(ChanPos);
+ assert(std::find(UpdatedUndef.begin(), UpdatedUndef.end(), Chan) ==
+ UpdatedUndef.end() &&
+ "UpdatedUndef shouldn't contain Chan more than once!");
+ DEBUG(dbgs() << " ->"; Tmp->dump(););
+ (void)Tmp;
+ SrcVec = DstReg;
+ }
+ Pos = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY), Reg)
+ .addReg(SrcVec);
+ DEBUG(dbgs() << " ->"; Pos->dump(););
+
+ DEBUG(dbgs() << " Updating Swizzle:\n");
+ for (MachineRegisterInfo::use_iterator It = MRI->use_begin(Reg),
+ E = MRI->use_end(); It != E; ++It) {
+ DEBUG(dbgs() << " ";(*It).dump(); dbgs() << " ->");
+ SwizzleInput(*It, RemapChan);
+ DEBUG((*It).dump());
+ }
+ RSI->Instr->eraseFromParent();
+
+ // Update RSI
+ RSI->Instr = Pos;
+ RSI->RegToChan = UpdatedRegToChan;
+ RSI->UndefReg = UpdatedUndef;
+
+ return Pos;
+}
+
+void R600VectorRegMerger::RemoveMI(MachineInstr *MI) {
+ for (InstructionSetMap::iterator It = PreviousRegSeqByReg.begin(),
+ E = PreviousRegSeqByReg.end(); It != E; ++It) {
+ std::vector<MachineInstr *> &MIs = (*It).second;
+ MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end());
+ }
+ for (InstructionSetMap::iterator It = PreviousRegSeqByUndefCount.begin(),
+ E = PreviousRegSeqByUndefCount.end(); It != E; ++It) {
+ std::vector<MachineInstr *> &MIs = (*It).second;
+ MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end());
+ }
+}
+
+void R600VectorRegMerger::SwizzleInput(MachineInstr &MI,
+ const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const {
+ unsigned Offset;
+ if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST)
+ Offset = 2;
+ else
+ Offset = 3;
+ for (unsigned i = 0; i < 4; i++) {
+ unsigned Swizzle = MI.getOperand(i + Offset).getImm() + 1;
+ for (unsigned j = 0, e = RemapChan.size(); j < e; j++) {
+ if (RemapChan[j].first == Swizzle) {
+ MI.getOperand(i + Offset).setImm(RemapChan[j].second - 1);
+ break;
+ }
+ }
+ }
+}
+
+bool R600VectorRegMerger::areAllUsesSwizzeable(unsigned Reg) const {
+ for (MachineRegisterInfo::use_iterator It = MRI->use_begin(Reg),
+ E = MRI->use_end(); It != E; ++It) {
+ if (!canSwizzle(*It))
+ return false;
+ }
+ return true;
+}
+
+bool R600VectorRegMerger::tryMergeUsingCommonSlot(RegSeqInfo &RSI,
+ RegSeqInfo &CompatibleRSI,
+ std::vector<std::pair<unsigned, unsigned> > &RemapChan) {
+ for (MachineInstr::mop_iterator MOp = RSI.Instr->operands_begin(),
+ MOE = RSI.Instr->operands_end(); MOp != MOE; ++MOp) {
+ if (!MOp->isReg())
+ continue;
+ if (PreviousRegSeqByReg[MOp->getReg()].empty())
+ continue;
+ std::vector<MachineInstr *> MIs = PreviousRegSeqByReg[MOp->getReg()];
+ for (unsigned i = 0, e = MIs.size(); i < e; i++) {
+ CompatibleRSI = PreviousRegSeq[MIs[i]];
+ if (RSI == CompatibleRSI)
+ continue;
+ if (tryMergeVector(&CompatibleRSI, &RSI, RemapChan))
+ return true;
+ }
+ }
+ return false;
+}
+
+bool R600VectorRegMerger::tryMergeUsingFreeSlot(RegSeqInfo &RSI,
+ RegSeqInfo &CompatibleRSI,
+ std::vector<std::pair<unsigned, unsigned> > &RemapChan) {
+ unsigned NeededUndefs = 4 - RSI.UndefReg.size();
+ if (PreviousRegSeqByUndefCount[NeededUndefs].empty())
+ return false;
+ std::vector<MachineInstr *> &MIs =
+ PreviousRegSeqByUndefCount[NeededUndefs];
+ CompatibleRSI = PreviousRegSeq[MIs.back()];
+ tryMergeVector(&CompatibleRSI, &RSI, RemapChan);
+ return true;
+}
+
+void R600VectorRegMerger::trackRSI(const RegSeqInfo &RSI) {
+ for (DenseMap<unsigned, unsigned>::const_iterator
+ It = RSI.RegToChan.begin(), E = RSI.RegToChan.end(); It != E; ++It) {
+ PreviousRegSeqByReg[(*It).first].push_back(RSI.Instr);
+ }
+ PreviousRegSeqByUndefCount[RSI.UndefReg.size()].push_back(RSI.Instr);
+ PreviousRegSeq[RSI.Instr] = RSI;
+}
+
+bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
+ TII = static_cast<const R600InstrInfo *>(Fn.getTarget().getInstrInfo());
+ MRI = &(Fn.getRegInfo());
+ for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
+ MBB != MBBe; ++MBB) {
+ MachineBasicBlock *MB = MBB;
+ PreviousRegSeq.clear();
+ PreviousRegSeqByReg.clear();
+ PreviousRegSeqByUndefCount.clear();
+
+ for (MachineBasicBlock::iterator MII = MB->begin(), MIIE = MB->end();
+ MII != MIIE; ++MII) {
+ MachineInstr *MI = MII;
+ if (MI->getOpcode() != AMDGPU::REG_SEQUENCE)
+ continue;
+
+ RegSeqInfo RSI(*MRI, MI);
+
+ // Are all uses of MI swizzleable?
+ unsigned Reg = MI->getOperand(0).getReg();
+ if (!areAllUsesSwizzeable(Reg))
+ continue;
+
+ DEBUG (dbgs() << "Trying to optimize ";
+ MI->dump();
+ );
+
+ RegSeqInfo CandidateRSI;
+ std::vector<std::pair<unsigned, unsigned> > RemapChan;
+ DEBUG(dbgs() << "Using common slots...\n";);
+ if (tryMergeUsingCommonSlot(RSI, CandidateRSI, RemapChan)) {
+ // Remove CandidateRSI mapping
+ RemoveMI(CandidateRSI.Instr);
+ MII = RebuildVector(&RSI, &CandidateRSI, RemapChan);
+ trackRSI(RSI);
+ continue;
+ }
+ DEBUG(dbgs() << "Using free slots...\n";);
+ RemapChan.clear();
+ if (tryMergeUsingFreeSlot(RSI, CandidateRSI, RemapChan)) {
+ RemoveMI(CandidateRSI.Instr);
+ MII = RebuildVector(&RSI, &CandidateRSI, RemapChan);
+ trackRSI(RSI);
+ continue;
+ }
+ // Failed to merge.
+ trackRSI(RSI);
+ }
+ }
+ return false;
+}
+
+}
+
+llvm::FunctionPass *llvm::createR600VectorRegMerger(TargetMachine &tm) {
+ return new R600VectorRegMerger(tm);
+}
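
The core of tryMergeVector() in the new pass above is a channel remap: every register in the REG_SEQUENCE being rebuilt must either already appear in the base vector (its lane is reused) or be assigned one of the base vector's undef lanes. A simplified sketch with plain STL containers, using the register numbers from the pseudo-code example in the file header; the types and names are illustrative, not the pass's actual data structures.

#include <cstdio>
#include <map>
#include <utility>
#include <vector>

int main() {
  // Base vector: vreg1 in lane 0, vreg2 in lane 1, vreg3 in lane 2, lane 3 undef.
  std::map<unsigned, unsigned> BaseRegToChan = {{1, 0}, {2, 1}, {3, 2}};
  std::vector<unsigned> BaseUndef = {3};

  // Vector to merge: vreg1 in lane 0, vreg3 in lane 1, vreg4 in lane 3.
  std::map<unsigned, unsigned> ToMergeRegToChan = {{1, 0}, {3, 1}, {4, 3}};
  std::vector<std::pair<unsigned, unsigned>> Remap; // (old lane, new lane)

  unsigned NextUndef = 0;
  bool Ok = true;
  for (auto &RC : ToMergeRegToChan) {
    auto It = BaseRegToChan.find(RC.first);
    if (It != BaseRegToChan.end()) {
      Remap.push_back({RC.second, It->second});            // shared value: reuse lane
    } else if (NextUndef < BaseUndef.size()) {
      Remap.push_back({RC.second, BaseUndef[NextUndef++]}); // new value: undef lane
    } else {
      Ok = false;                                           // no room, merge fails
      break;
    }
  }
  for (auto &P : Remap)
    std::printf("lane %u -> lane %u (merge %s)\n", P.first, P.second,
                Ok ? "ok" : "failed");
  return 0;
}
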
diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp
index 05e96f1..da614c7 100644
--- a/lib/Target/R600/R600Packetizer.cpp
+++ b/lib/Target/R600/R600Packetizer.cpp
@@ -14,22 +14,21 @@
//
//===----------------------------------------------------------------------===//
-#ifndef R600PACKETIZER_CPP
-#define R600PACKETIZER_CPP
-
#define DEBUG_TYPE "packets"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
+#include "AMDGPU.h"
+#include "R600InstrInfo.h"
#include "llvm/CodeGen/DFAPacketizer.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/ScheduleDAG.h"
-#include "AMDGPU.h"
-#include "R600InstrInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
-namespace llvm {
+namespace {
class R600Packetizer : public MachineFunctionPass {
@@ -60,37 +59,59 @@ private:
const R600InstrInfo *TII;
const R600RegisterInfo &TRI;
- enum BankSwizzle {
- ALU_VEC_012 = 0,
- ALU_VEC_021,
- ALU_VEC_120,
- ALU_VEC_102,
- ALU_VEC_201,
- ALU_VEC_210
- };
-
unsigned getSlot(const MachineInstr *MI) const {
return TRI.getHWRegChan(MI->getOperand(0).getReg());
}
- std::vector<unsigned> getPreviousVector(MachineBasicBlock::iterator I) const {
- std::vector<unsigned> Result;
+ /// \returns the register to PV channel mapping for the bundle/single
+ /// instruction that immediately precedes I.
+ DenseMap<unsigned, unsigned> getPreviousVector(MachineBasicBlock::iterator I)
+ const {
+ DenseMap<unsigned, unsigned> Result;
I--;
if (!TII->isALUInstr(I->getOpcode()) && !I->isBundle())
return Result;
MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
if (I->isBundle())
BI++;
- while (BI->isBundledWithPred() && !TII->isPredicated(BI)) {
+ do {
+ if (TII->isPredicated(BI))
+ continue;
+ if (TII->isTransOnly(BI))
+ continue;
int OperandIdx = TII->getOperandIdx(BI->getOpcode(), R600Operands::WRITE);
- if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm())
- Result.push_back(BI->getOperand(0).getReg());
- BI++;
- }
+ if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0)
+ continue;
+ unsigned Dst = BI->getOperand(0).getReg();
+ if (BI->getOpcode() == AMDGPU::DOT4_r600 ||
+ BI->getOpcode() == AMDGPU::DOT4_eg) {
+ Result[Dst] = AMDGPU::PV_X;
+ continue;
+ }
+ unsigned PVReg = 0;
+ switch (TRI.getHWRegChan(Dst)) {
+ case 0:
+ PVReg = AMDGPU::PV_X;
+ break;
+ case 1:
+ PVReg = AMDGPU::PV_Y;
+ break;
+ case 2:
+ PVReg = AMDGPU::PV_Z;
+ break;
+ case 3:
+ PVReg = AMDGPU::PV_W;
+ break;
+ default:
+ llvm_unreachable("Invalid Chan");
+ }
+ Result[Dst] = PVReg;
+ } while ((++BI)->isBundledWithPred());
return Result;
}
- void substitutePV(MachineInstr *MI, const std::vector<unsigned> &PV) const {
+ void substitutePV(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PVs)
+ const {
R600Operands::Ops Ops[] = {
R600Operands::SRC0,
R600Operands::SRC1,
@@ -101,30 +122,9 @@ private:
if (OperandIdx < 0)
continue;
unsigned Src = MI->getOperand(OperandIdx).getReg();
- for (unsigned j = 0, e = PV.size(); j < e; j++) {
- if (Src == PV[j]) {
- unsigned Chan = TRI.getHWRegChan(Src);
- unsigned PVReg;
- switch (Chan) {
- case 0:
- PVReg = AMDGPU::PV_X;
- break;
- case 1:
- PVReg = AMDGPU::PV_Y;
- break;
- case 2:
- PVReg = AMDGPU::PV_Z;
- break;
- case 3:
- PVReg = AMDGPU::PV_W;
- break;
- default:
- llvm_unreachable("Invalid Chan");
- }
- MI->getOperand(OperandIdx).setReg(PVReg);
- break;
- }
- }
+ const DenseMap<unsigned, unsigned>::const_iterator It = PVs.find(Src);
+ if (It != PVs.end())
+ MI->getOperand(OperandIdx).setReg(It->second);
}
}
public:
@@ -209,8 +209,11 @@ public:
}
dbgs() << "because of Consts read limitations\n";
});
- const std::vector<unsigned> &PV = getPreviousVector(MI);
- bool FitsReadPortLimits = fitsReadPortLimitation(CurrentPacketMIs, PV);
+ const DenseMap<unsigned, unsigned> &PV =
+ getPreviousVector(CurrentPacketMIs.front());
+ std::vector<R600InstrInfo::BankSwizzle> BS;
+ bool FitsReadPortLimits =
+ TII->fitsReadPortLimitations(CurrentPacketMIs, PV, BS);
DEBUG(
if (!FitsReadPortLimits) {
dbgs() << "Couldn't pack :\n";
@@ -223,6 +226,14 @@ public:
dbgs() << "because of Read port limitations\n";
});
bool isBundlable = FitsConstLimits && FitsReadPortLimits;
+ if (isBundlable) {
+ for (unsigned i = 0, e = CurrentPacketMIs.size(); i < e; i++) {
+ MachineInstr *MI = CurrentPacketMIs[i];
+ unsigned Op = TII->getOperandIdx(MI->getOpcode(),
+ R600Operands::BANK_SWIZZLE);
+ MI->getOperand(Op).setImm(BS[i]);
+ }
+ }
CurrentPacketMIs.pop_back();
if (!isBundlable) {
endPacket(MI->getParent(), MI);
@@ -234,133 +245,6 @@ public:
substitutePV(MI, PV);
return VLIWPacketizerList::addToPacket(MI);
}
-private:
- std::vector<std::pair<int, unsigned> >
- ExtractSrcs(const MachineInstr *MI, const std::vector<unsigned> &PV) const {
- R600Operands::Ops Ops[] = {
- R600Operands::SRC0,
- R600Operands::SRC1,
- R600Operands::SRC2
- };
- std::vector<std::pair<int, unsigned> > Result;
- for (unsigned i = 0; i < 3; i++) {
- int OperandIdx = TII->getOperandIdx(MI->getOpcode(), Ops[i]);
- if (OperandIdx < 0){
- Result.push_back(std::pair<int, unsigned>(-1,0));
- continue;
- }
- unsigned Src = MI->getOperand(OperandIdx).getReg();
- if (std::find(PV.begin(), PV.end(), Src) != PV.end()) {
- Result.push_back(std::pair<int, unsigned>(-1,0));
- continue;
- }
- unsigned Reg = TRI.getEncodingValue(Src) & 0xff;
- if (Reg > 127) {
- Result.push_back(std::pair<int, unsigned>(-1,0));
- continue;
- }
- unsigned Chan = TRI.getHWRegChan(Src);
- Result.push_back(std::pair<int, unsigned>(Reg, Chan));
- }
- return Result;
- }
-
- std::vector<std::pair<int, unsigned> >
- Swizzle(std::vector<std::pair<int, unsigned> > Src,
- BankSwizzle Swz) const {
- switch (Swz) {
- case ALU_VEC_012:
- break;
- case ALU_VEC_021:
- std::swap(Src[1], Src[2]);
- break;
- case ALU_VEC_102:
- std::swap(Src[0], Src[1]);
- break;
- case ALU_VEC_120:
- std::swap(Src[0], Src[1]);
- std::swap(Src[0], Src[2]);
- break;
- case ALU_VEC_201:
- std::swap(Src[0], Src[2]);
- std::swap(Src[0], Src[1]);
- break;
- case ALU_VEC_210:
- std::swap(Src[0], Src[2]);
- break;
- }
- return Src;
- }
-
- bool isLegal(const std::vector<MachineInstr *> &IG,
- const std::vector<BankSwizzle> &Swz,
- const std::vector<unsigned> &PV) const {
- assert (Swz.size() == IG.size());
- int Vector[4][3];
- memset(Vector, -1, sizeof(Vector));
- for (unsigned i = 0, e = IG.size(); i < e; i++) {
- const std::vector<std::pair<int, unsigned> > &Srcs =
- Swizzle(ExtractSrcs(IG[i], PV), Swz[i]);
- for (unsigned j = 0; j < 3; j++) {
- const std::pair<int, unsigned> &Src = Srcs[j];
- if (Src.first < 0)
- continue;
- if (Vector[Src.second][j] < 0)
- Vector[Src.second][j] = Src.first;
- if (Vector[Src.second][j] != Src.first)
- return false;
- }
- }
- return true;
- }
-
- bool recursiveFitsFPLimitation(
- std::vector<MachineInstr *> IG,
- const std::vector<unsigned> &PV,
- std::vector<BankSwizzle> &SwzCandidate,
- std::vector<MachineInstr *> CurrentlyChecked)
- const {
- if (!isLegal(CurrentlyChecked, SwzCandidate, PV))
- return false;
- if (IG.size() == CurrentlyChecked.size()) {
- return true;
- }
- BankSwizzle AvailableSwizzle[] = {
- ALU_VEC_012,
- ALU_VEC_021,
- ALU_VEC_120,
- ALU_VEC_102,
- ALU_VEC_201,
- ALU_VEC_210
- };
- CurrentlyChecked.push_back(IG[CurrentlyChecked.size()]);
- for (unsigned i = 0; i < 6; i++) {
- SwzCandidate.push_back(AvailableSwizzle[i]);
- if (recursiveFitsFPLimitation(IG, PV, SwzCandidate, CurrentlyChecked))
- return true;
- SwzCandidate.pop_back();
- }
- return false;
- }
-
- bool fitsReadPortLimitation(
- std::vector<MachineInstr *> IG,
- const std::vector<unsigned> &PV)
- const {
- //Todo : support shared src0 - src1 operand
- std::vector<BankSwizzle> SwzCandidate;
- bool Result = recursiveFitsFPLimitation(IG, PV, SwzCandidate,
- std::vector<MachineInstr *>());
- if (!Result)
- return false;
- for (unsigned i = 0, e = IG.size(); i < e; i++) {
- MachineInstr *MI = IG[i];
- unsigned Op = TII->getOperandIdx(MI->getOpcode(),
- R600Operands::BANK_SWIZZLE);
- MI->getOperand(Op).setImm(SwzCandidate[i]);
- }
- return true;
- }
};
bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
@@ -437,10 +321,8 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
}
-}
+} // end anonymous namespace
llvm::FunctionPass *llvm::createR600Packetizer(TargetMachine &tm) {
return new R600Packetizer(tm);
}
-
-#endif // R600PACKETIZER_CPP
diff --git a/lib/Target/R600/R600RegisterInfo.cpp b/lib/Target/R600/R600RegisterInfo.cpp
index bbd7995..a42043b 100644
--- a/lib/Target/R600/R600RegisterInfo.cpp
+++ b/lib/Target/R600/R600RegisterInfo.cpp
@@ -20,12 +20,10 @@
using namespace llvm;
-R600RegisterInfo::R600RegisterInfo(AMDGPUTargetMachine &tm,
- const TargetInstrInfo &tii)
-: AMDGPURegisterInfo(tm, tii),
- TM(tm),
- TII(tii)
- { }
+R600RegisterInfo::R600RegisterInfo(AMDGPUTargetMachine &tm)
+: AMDGPURegisterInfo(tm),
+ TM(tm)
+ { RCW.RegWeight = 0; RCW.WeightLimit = 0;}
BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
@@ -55,7 +53,8 @@ BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(*I);
}
- const R600InstrInfo *RII = static_cast<const R600InstrInfo*>(&TII);
+ const R600InstrInfo *RII =
+ static_cast<const R600InstrInfo*>(TM.getInstrInfo());
std::vector<unsigned> IndirectRegs = RII->getIndirectReservedRegs(MF);
for (std::vector<unsigned>::iterator I = IndirectRegs.begin(),
E = IndirectRegs.end();
@@ -97,3 +96,7 @@ unsigned R600RegisterInfo::getSubRegFromChannel(unsigned Channel) const {
}
}
+const RegClassWeight &R600RegisterInfo::getRegClassWeight(
+ const TargetRegisterClass *RC) const {
+ return RCW;
+}
diff --git a/lib/Target/R600/R600RegisterInfo.h b/lib/Target/R600/R600RegisterInfo.h
index f9ca918..9b286ee 100644
--- a/lib/Target/R600/R600RegisterInfo.h
+++ b/lib/Target/R600/R600RegisterInfo.h
@@ -21,13 +21,12 @@
namespace llvm {
class R600TargetMachine;
-class TargetInstrInfo;
struct R600RegisterInfo : public AMDGPURegisterInfo {
AMDGPUTargetMachine &TM;
- const TargetInstrInfo &TII;
+ RegClassWeight RCW;
- R600RegisterInfo(AMDGPUTargetMachine &tm, const TargetInstrInfo &tii);
+ R600RegisterInfo(AMDGPUTargetMachine &tm);
virtual BitVector getReservedRegs(const MachineFunction &MF) const;
@@ -48,6 +47,8 @@ struct R600RegisterInfo : public AMDGPURegisterInfo {
/// (e.g. getSubRegFromChannel(0) -> AMDGPU::sel_x)
unsigned getSubRegFromChannel(unsigned Channel) const;
+ virtual const RegClassWeight &getRegClassWeight(const TargetRegisterClass *RC) const;
+
};
} // End namespace llvm
diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td
index 5a2e65c..a8b9b70 100644
--- a/lib/Target/R600/R600RegisterInfo.td
+++ b/lib/Target/R600/R600RegisterInfo.td
@@ -35,7 +35,7 @@ foreach Index = 0-127 in {
Chan>;
}
// 128-bit Temporary Registers
- def T#Index#_XYZW : R600Reg_128 <"T"#Index#".XYZW",
+ def T#Index#_XYZW : R600Reg_128 <"T"#Index#"",
[!cast<Register>("T"#Index#"_X"),
!cast<Register>("T"#Index#"_Y"),
!cast<Register>("T"#Index#"_Z"),
@@ -89,13 +89,13 @@ def ONE_INT : R600Reg<"1", 250>;
def HALF : R600Reg<"0.5", 252>;
def NEG_HALF : R600Reg<"-0.5", 252>;
def ALU_LITERAL_X : R600RegWithChan<"literal.x", 253, "X">;
-def ALU_LITERAL_Y : R600RegWithChan<"literal.x", 253, "Y">;
-def ALU_LITERAL_Z : R600RegWithChan<"literal.x", 253, "Z">;
-def ALU_LITERAL_W : R600RegWithChan<"literal.x", 253, "W">;
-def PV_X : R600RegWithChan<"PV.x", 254, "X">;
-def PV_Y : R600RegWithChan<"PV.y", 254, "Y">;
-def PV_Z : R600RegWithChan<"PV.z", 254, "Z">;
-def PV_W : R600RegWithChan<"PV.w", 254, "W">;
+def ALU_LITERAL_Y : R600RegWithChan<"literal.y", 253, "Y">;
+def ALU_LITERAL_Z : R600RegWithChan<"literal.z", 253, "Z">;
+def ALU_LITERAL_W : R600RegWithChan<"literal.w", 253, "W">;
+def PV_X : R600RegWithChan<"PV.X", 254, "X">;
+def PV_Y : R600RegWithChan<"PV.Y", 254, "Y">;
+def PV_Z : R600RegWithChan<"PV.Z", 254, "Z">;
+def PV_W : R600RegWithChan<"PV.W", 254, "W">;
def PREDICATE_BIT : R600Reg<"PredicateBit", 0>;
def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>;
def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>;
diff --git a/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp b/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp
new file mode 100644
index 0000000..3768ba0
--- /dev/null
+++ b/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp
@@ -0,0 +1,301 @@
+//===-- R600TextureIntrinsicsReplacer.cpp ---------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass translates TGSI-like texture intrinsics into R600 texture
+/// intrinsics that are closer to the hardware.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/InstVisitor.h"
+
+using namespace llvm;
+
+namespace {
+class R600TextureIntrinsicsReplacer :
+ public FunctionPass, public InstVisitor<R600TextureIntrinsicsReplacer> {
+ static char ID;
+
+ Module *Mod;
+ Type *FloatType;
+ Type *Int32Type;
+ Type *V4f32Type;
+ Type *V4i32Type;
+ FunctionType *TexSign;
+ FunctionType *TexQSign;
+
+ void getAdjustementFromTextureTarget(unsigned TextureType, bool hasLOD,
+ unsigned SrcSelect[4], unsigned CT[4],
+ bool &useShadowVariant) {
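+ // Derive the coordinate swizzle (SrcSelect), the per-component coordinate
+ // type flags (CT) and whether the shadow variant of the intrinsic is needed
+ // from the texture target.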
+ enum TextureTypes {
+ TEXTURE_1D = 1,
+ TEXTURE_2D,
+ TEXTURE_3D,
+ TEXTURE_CUBE,
+ TEXTURE_RECT,
+ TEXTURE_SHADOW1D,
+ TEXTURE_SHADOW2D,
+ TEXTURE_SHADOWRECT,
+ TEXTURE_1D_ARRAY,
+ TEXTURE_2D_ARRAY,
+ TEXTURE_SHADOW1D_ARRAY,
+ TEXTURE_SHADOW2D_ARRAY,
+ TEXTURE_SHADOWCUBE,
+ TEXTURE_2D_MSAA,
+ TEXTURE_2D_ARRAY_MSAA,
+ TEXTURE_CUBE_ARRAY,
+ TEXTURE_SHADOWCUBE_ARRAY
+ };
+
+ switch (TextureType) {
+ case 0:
+ return;
+ case TEXTURE_RECT:
+ case TEXTURE_1D:
+ case TEXTURE_2D:
+ case TEXTURE_3D:
+ case TEXTURE_CUBE:
+ case TEXTURE_1D_ARRAY:
+ case TEXTURE_2D_ARRAY:
+ case TEXTURE_CUBE_ARRAY:
+ case TEXTURE_2D_MSAA:
+ case TEXTURE_2D_ARRAY_MSAA:
+ useShadowVariant = false;
+ break;
+ case TEXTURE_SHADOW1D:
+ case TEXTURE_SHADOW2D:
+ case TEXTURE_SHADOWRECT:
+ case TEXTURE_SHADOW1D_ARRAY:
+ case TEXTURE_SHADOW2D_ARRAY:
+ case TEXTURE_SHADOWCUBE:
+ case TEXTURE_SHADOWCUBE_ARRAY:
+ useShadowVariant = true;
+ break;
+ default:
+ llvm_unreachable("Unknow Texture Type");
+ }
+
+ if (TextureType == TEXTURE_RECT ||
+ TextureType == TEXTURE_SHADOWRECT) {
+ CT[0] = 0;
+ CT[1] = 0;
+ }
+
+ if (TextureType == TEXTURE_CUBE_ARRAY ||
+ TextureType == TEXTURE_SHADOWCUBE_ARRAY) {
+ CT[2] = 0;
+ }
+
+ if (TextureType == TEXTURE_1D_ARRAY ||
+ TextureType == TEXTURE_SHADOW1D_ARRAY) {
+ if (hasLOD && useShadowVariant) {
+ CT[1] = 0;
+ } else {
+ CT[2] = 0;
+ SrcSelect[2] = 1;
+ }
+ } else if (TextureType == TEXTURE_2D_ARRAY ||
+ TextureType == TEXTURE_SHADOW2D_ARRAY) {
+ CT[2] = 0;
+ }
+
+ if ((TextureType == TEXTURE_SHADOW1D ||
+ TextureType == TEXTURE_SHADOW2D ||
+ TextureType == TEXTURE_SHADOWRECT ||
+ TextureType == TEXTURE_SHADOW1D_ARRAY) &&
+ !(hasLOD && useShadowVariant)) {
+ SrcSelect[3] = 2;
+ }
+ }
+
+ void ReplaceCallInst(CallInst &I, FunctionType *FT, const char *Name,
+ unsigned SrcSelect[4], Value *Offset[3], Value *Resource,
+ Value *Sampler, unsigned CT[4], Value *Coord) {
+ IRBuilder<> Builder(&I);
+ Constant *Mask[] = {
+ ConstantInt::get(Int32Type, SrcSelect[0]),
+ ConstantInt::get(Int32Type, SrcSelect[1]),
+ ConstantInt::get(Int32Type, SrcSelect[2]),
+ ConstantInt::get(Int32Type, SrcSelect[3])
+ };
+ Value *SwizzleMask = ConstantVector::get(Mask);
+ Value *SwizzledCoord =
+ Builder.CreateShuffleVector(Coord, Coord, SwizzleMask);
+
+ Value *Args[] = {
+ SwizzledCoord,
+ Offset[0],
+ Offset[1],
+ Offset[2],
+ Resource,
+ Sampler,
+ ConstantInt::get(Int32Type, CT[0]),
+ ConstantInt::get(Int32Type, CT[1]),
+ ConstantInt::get(Int32Type, CT[2]),
+ ConstantInt::get(Int32Type, CT[3])
+ };
+
+ Function *F = Mod->getFunction(Name);
+ if (!F) {
+ F = Function::Create(FT, GlobalValue::ExternalLinkage, Name, Mod);
+ F->addFnAttr(Attribute::ReadNone);
+ }
+ I.replaceAllUsesWith(Builder.CreateCall(F, Args));
+ I.eraseFromParent();
+ }
+
+ void ReplaceTexIntrinsic(CallInst &I, bool hasLOD, FunctionType *FT,
+ const char *VanillaInt,
+ const char *ShadowInt) {
+ Value *Coord = I.getArgOperand(0);
+ Value *ResourceId = I.getArgOperand(1);
+ Value *SamplerId = I.getArgOperand(2);
+
+ unsigned TextureType =
+ dyn_cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
+
+ unsigned SrcSelect[4] = { 0, 1, 2, 3 };
+ unsigned CT[4] = {1, 1, 1, 1};
+ Value *Offset[3] = {
+ ConstantInt::get(Int32Type, 0),
+ ConstantInt::get(Int32Type, 0),
+ ConstantInt::get(Int32Type, 0)
+ };
+ bool useShadowVariant;
+
+ getAdjustementFromTextureTarget(TextureType, hasLOD, SrcSelect, CT,
+ useShadowVariant);
+
+ ReplaceCallInst(I, FT, useShadowVariant?ShadowInt:VanillaInt, SrcSelect,
+ Offset, ResourceId, SamplerId, CT, Coord);
+ }
+
+ void ReplaceTXF(CallInst &I) {
+ Value *Coord = I.getArgOperand(0);
+ Value *ResourceId = I.getArgOperand(4);
+ Value *SamplerId = I.getArgOperand(5);
+
+ unsigned TextureType =
+ dyn_cast<ConstantInt>(I.getArgOperand(6))->getZExtValue();
+
+ unsigned SrcSelect[4] = { 0, 1, 2, 3 };
+ unsigned CT[4] = {1, 1, 1, 1};
+ Value *Offset[3] = {
+ I.getArgOperand(1),
+ I.getArgOperand(2),
+ I.getArgOperand(3),
+ };
+ bool useShadowVariant;
+
+ getAdjustementFromTextureTarget(TextureType, false, SrcSelect, CT,
+ useShadowVariant);
+
+ ReplaceCallInst(I, TexQSign, "llvm.R600.txf", SrcSelect,
+ Offset, ResourceId, SamplerId, CT, Coord);
+ }
+
+public:
+ R600TextureIntrinsicsReplacer():
+ FunctionPass(ID) {
+ }
+
+ virtual bool doInitialization(Module &M) {
+ LLVMContext &Ctx = M.getContext();
+ Mod = &M;
+ FloatType = Type::getFloatTy(Ctx);
+ Int32Type = Type::getInt32Ty(Ctx);
+ V4f32Type = VectorType::get(FloatType, 4);
+ V4i32Type = VectorType::get(Int32Type, 4);
+ Type *ArgsType[] = {
+ V4f32Type,
+ Int32Type,
+ Int32Type,
+ Int32Type,
+ Int32Type,
+ Int32Type,
+ Int32Type,
+ Int32Type,
+ Int32Type,
+ Int32Type,
+ };
+ TexSign = FunctionType::get(V4f32Type, ArgsType, /*isVarArg=*/false);
+ Type *ArgsQType[] = {
+ V4i32Type,
+ Int32Type,
+ Int32Type,
+ Int32Type,
+ Int32Type,
+ Int32Type,
+ Int32Type,
+ Int32Type,
+ Int32Type,
+ Int32Type,
+ };
+ TexQSign = FunctionType::get(V4f32Type, ArgsQType, /*isVarArg=*/false);
+ return false;
+ }
+
+ virtual bool runOnFunction(Function &F) {
+ visit(F);
+ return false;
+ }
+
+ virtual const char *getPassName() const {
+ return "R600 Texture Intrinsics Replacer";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ }
+
+ void visitCallInst(CallInst &I) {
+ StringRef Name = I.getCalledFunction()->getName();
+ if (Name == "llvm.AMDGPU.tex") {
+ ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.tex", "llvm.R600.texc");
+ return;
+ }
+ if (Name == "llvm.AMDGPU.txl") {
+ ReplaceTexIntrinsic(I, true, TexSign, "llvm.R600.txl", "llvm.R600.txlc");
+ return;
+ }
+ if (Name == "llvm.AMDGPU.txb") {
+ ReplaceTexIntrinsic(I, true, TexSign, "llvm.R600.txb", "llvm.R600.txbc");
+ return;
+ }
+ if (Name == "llvm.AMDGPU.txf") {
+ ReplaceTXF(I);
+ return;
+ }
+ if (Name == "llvm.AMDGPU.txq") {
+ ReplaceTexIntrinsic(I, false, TexQSign, "llvm.R600.txq", "llvm.R600.txq");
+ return;
+ }
+ if (Name == "llvm.AMDGPU.ddx") {
+ ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.ddx", "llvm.R600.ddx");
+ return;
+ }
+ if (Name == "llvm.AMDGPU.ddy") {
+ ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.ddy", "llvm.R600.ddy");
+ return;
+ }
+ }
+
+};
+
+char R600TextureIntrinsicsReplacer::ID = 0;
+
+}
+
+FunctionPass *llvm::createR600TextureIntrinsicsReplacer() {
+ return new R600TextureIntrinsicsReplacer();
+}
diff --git a/lib/Target/R600/SIAnnotateControlFlow.cpp b/lib/Target/R600/SIAnnotateControlFlow.cpp
index 2477e2a..9791ef4 100644
--- a/lib/Target/R600/SIAnnotateControlFlow.cpp
+++ b/lib/Target/R600/SIAnnotateControlFlow.cpp
@@ -15,6 +15,8 @@
#include "AMDGPU.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/Analysis/Dominators.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 1a07aff..d74f401 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -13,24 +13,23 @@
//===----------------------------------------------------------------------===//
#include "SIISelLowering.h"
-#include "AMDIL.h"
#include "AMDGPU.h"
#include "AMDILIntrinsicInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
-#include "llvm/IR/Function.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/Function.h"
+
+const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
using namespace llvm;
SITargetLowering::SITargetLowering(TargetMachine &TM) :
- AMDGPUTargetLowering(TM),
- TII(static_cast<const SIInstrInfo*>(TM.getInstrInfo())),
- TRI(TM.getRegisterInfo()) {
+ AMDGPUTargetLowering(TM) {
addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass);
addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
@@ -72,8 +71,9 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
- setOperationAction(ISD::STORE, MVT::i32, Custom);
- setOperationAction(ISD::STORE, MVT::i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom);
+
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setTargetDAGCombine(ISD::SELECT_CC);
@@ -82,12 +82,29 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setSchedulingPreference(Sched::RegPressure);
}
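+// Loads a kernel argument of type VT from the constant address space, Offset
+// bytes past the argument base pointer passed in SGPR0_SGPR1.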
+SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT,
+ SDLoc DL, SDValue Chain,
+ unsigned Offset) const {
+ MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+ PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
+ AMDGPUAS::CONSTANT_ADDRESS);
+ EVT ArgVT = MVT::getIntegerVT(VT.getSizeInBits());
+ SDValue BasePtr = DAG.getCopyFromReg(Chain, DL,
+ MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64);
+ SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
+ DAG.getConstant(Offset, MVT::i64));
+ return DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, Chain, Ptr,
+ MachinePointerInfo(UndefValue::get(PtrTy)),
+ VT, false, false, ArgVT.getSizeInBits() >> 3);
+
+}
+
SDValue SITargetLowering::LowerFormalArguments(
SDValue Chain,
CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc DL, SelectionDAG &DAG,
+ SDLoc DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
@@ -103,8 +120,8 @@ SDValue SITargetLowering::LowerFormalArguments(
for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
const ISD::InputArg &Arg = Ins[i];
-
- // First check if it's a PS input addr
+
+ // First check if it's a PS input addr
if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg()) {
assert((PSInputNum <= 15) && "Too many PS inputs!");
@@ -120,7 +137,7 @@ SDValue SITargetLowering::LowerFormalArguments(
}
// Second split vertices into their elements
- if (Arg.VT.isVector()) {
+ if (Info->ShaderType != ShaderType::COMPUTE && Arg.VT.isVector()) {
ISD::InputArg NewArg = Arg;
NewArg.Flags.setSplit();
NewArg.VT = Arg.VT.getVectorElementType();
@@ -152,20 +169,37 @@ SDValue SITargetLowering::LowerFormalArguments(
CCInfo.AllocateReg(AMDGPU::VGPR1);
}
+ // The pointer to the list of arguments is stored in SGPR0, SGPR1
+ if (Info->ShaderType == ShaderType::COMPUTE) {
+ CCInfo.AllocateReg(AMDGPU::SGPR0);
+ CCInfo.AllocateReg(AMDGPU::SGPR1);
+ MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass);
+ }
+
AnalyzeFormalArguments(CCInfo, Splits);
for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
+ const ISD::InputArg &Arg = Ins[i];
if (Skipped & (1 << i)) {
- InVals.push_back(SDValue());
+ InVals.push_back(DAG.getUNDEF(Arg.VT));
continue;
}
CCValAssign &VA = ArgLocs[ArgIdx++];
+ EVT VT = VA.getLocVT();
+
+ if (VA.isMemLoc()) {
+ // The first 36 bytes of the input buffer contain information about the
+ // thread group and global sizes.
+ SDValue Arg = LowerParameter(DAG, VT, DL, DAG.getRoot(),
+ 36 + VA.getLocMemOffset());
+ InVals.push_back(Arg);
+ continue;
+ }
assert(VA.isRegLoc() && "Parameter must be in a register!");
unsigned Reg = VA.getLocReg();
- MVT VT = VA.getLocVT();
if (VT == MVT::i64) {
// For now assume it is a pointer
@@ -181,7 +215,6 @@ SDValue SITargetLowering::LowerFormalArguments(
Reg = MF.addLiveIn(Reg, RC);
SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
- const ISD::InputArg &Arg = Ins[i];
if (Arg.VT.isVector()) {
// Build a vector from the registers
@@ -200,7 +233,7 @@ SDValue SITargetLowering::LowerFormalArguments(
NumElements = Arg.VT.getVectorNumElements() - NumElements;
for (unsigned j = 0; j != NumElements; ++j)
Regs.push_back(DAG.getUNDEF(VT));
-
+
InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT,
Regs.data(), Regs.size()));
continue;
@@ -214,15 +247,45 @@ SDValue SITargetLowering::LowerFormalArguments(
MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
MachineInstr * MI, MachineBasicBlock * BB) const {
+ MachineBasicBlock::iterator I = *MI;
+
switch (MI->getOpcode()) {
default:
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
case AMDGPU::BRANCH: return BB;
+ case AMDGPU::SI_ADDR64_RSRC: {
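+ // Build a 128-bit buffer resource descriptor: the 64-bit base pointer goes
+ // in the low half, RSRC_DATA_FORMAT in the high half.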
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ unsigned SuperReg = MI->getOperand(0).getReg();
+ unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned SubRegHiHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ unsigned SubRegHiLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), SubRegLo)
+ .addOperand(MI->getOperand(1));
+ BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiLo)
+ .addImm(0);
+ BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiHi)
+ .addImm(RSRC_DATA_FORMAT >> 32);
+ BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SubRegHi)
+ .addReg(SubRegHiLo)
+ .addImm(AMDGPU::sub0)
+ .addReg(SubRegHiHi)
+ .addImm(AMDGPU::sub1);
+ BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SuperReg)
+ .addReg(SubRegLo)
+ .addImm(AMDGPU::sub0_sub1)
+ .addReg(SubRegHi)
+ .addImm(AMDGPU::sub2_sub3);
+ MI->eraseFromParent();
+ break;
+ }
}
return BB;
}
-EVT SITargetLowering::getSetCCResultType(EVT VT) const {
+EVT SITargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
return MVT::i1;
}
@@ -239,7 +302,55 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
- case ISD::STORE: return LowerSTORE(Op, DAG);
+ case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG);
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IntrinsicID =
+ cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ // XXX: Hardcoded for now; we only use two user SGPRs, which hold the
+ // pointer to the parameters.
+ unsigned NumUserSGPRs = 2;
+ switch (IntrinsicID) {
+ default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+ case Intrinsic::r600_read_ngroups_x:
+ return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 0);
+ case Intrinsic::r600_read_ngroups_y:
+ return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 4);
+ case Intrinsic::r600_read_ngroups_z:
+ return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 8);
+ case Intrinsic::r600_read_global_size_x:
+ return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 12);
+ case Intrinsic::r600_read_global_size_y:
+ return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 16);
+ case Intrinsic::r600_read_global_size_z:
+ return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 20);
+ case Intrinsic::r600_read_local_size_x:
+ return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 24);
+ case Intrinsic::r600_read_local_size_y:
+ return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 28);
+ case Intrinsic::r600_read_local_size_z:
+ return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 32);
+ case Intrinsic::r600_read_tgid_x:
+ return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+ AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT);
+ case Intrinsic::r600_read_tgid_y:
+ return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+ AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 1), VT);
+ case Intrinsic::r600_read_tgid_z:
+ return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+ AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 2), VT);
+ case Intrinsic::r600_read_tidig_x:
+ return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
+ AMDGPU::VGPR0, VT);
+ case Intrinsic::r600_read_tidig_y:
+ return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
+ AMDGPU::VGPR1, VT);
+ case Intrinsic::r600_read_tidig_z:
+ return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
+ AMDGPU::VGPR2, VT);
+
+ }
+ }
}
return SDValue();
}
@@ -265,7 +376,7 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) {
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
SelectionDAG &DAG) const {
- DebugLoc DL = BRCOND.getDebugLoc();
+ SDLoc DL(BRCOND);
SDNode *Intr = BRCOND.getOperand(1).getNode();
SDValue Target = BRCOND.getOperand(2);
@@ -338,32 +449,6 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
return Chain;
}
-#define RSRC_DATA_FORMAT 0xf00000000000
-
-SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
- StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
- SDValue Chain = Op.getOperand(0);
- SDValue Value = Op.getOperand(1);
- SDValue VirtualAddress = Op.getOperand(2);
- DebugLoc DL = Op.getDebugLoc();
-
- if (StoreNode->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS) {
- return SDValue();
- }
-
- SDValue SrcSrc = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
- DAG.getConstant(0, MVT::i64),
- DAG.getConstant(RSRC_DATA_FORMAT, MVT::i64));
-
- SDValue Ops[2];
- Ops[0] = DAG.getNode(AMDGPUISD::BUFFER_STORE, DL, MVT::Other, Chain,
- Value, SrcSrc, VirtualAddress);
- Ops[1] = Chain;
-
- return DAG.getMergeValues(Ops, 2, DL);
-
-}
-
SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
@@ -371,7 +456,7 @@ SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue False = Op.getOperand(3);
SDValue CC = Op.getOperand(4);
EVT VT = Op.getValueType();
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
// Possible Min/Max pattern
SDValue MinMax = LowerMinMax(Op, DAG);
@@ -383,6 +468,21 @@ SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False);
}
+SDValue SITargetLowering::LowerSIGN_EXTEND(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+
+ if (VT != MVT::i64) {
+ return SDValue();
+ }
+
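+ // i32 -> i64 sign extension: the high word is the sign of the source,
+ // produced with an arithmetic shift right by 31; BUILD_PAIR combines the
+ // original value with it.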
+ SDValue Hi = DAG.getNode(ISD::SRA, DL, MVT::i32, Op.getOperand(0),
+ DAG.getConstant(31, MVT::i32));
+
+ return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), Hi);
+}
+
//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//
@@ -390,7 +490,7 @@ SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
EVT VT = N->getValueType(0);
switch (N->getOpcode()) {
@@ -433,13 +533,13 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return SDValue();
}
-/// \brief Test if RegClass is one of the VSrc classes
+/// \brief Test if RegClass is one of the VSrc classes
static bool isVSrc(unsigned RegClass) {
return AMDGPU::VSrc_32RegClassID == RegClass ||
AMDGPU::VSrc_64RegClassID == RegClass;
}
-/// \brief Test if RegClass is one of the SSrc classes
+/// \brief Test if RegClass is one of the SSrc classes
static bool isSSrc(unsigned RegClass) {
return AMDGPU::SSrc_32RegClassID == RegClass ||
AMDGPU::SSrc_64RegClassID == RegClass;
@@ -481,6 +581,8 @@ bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate,
bool &ScalarSlotUsed) const {
MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand);
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
if (Mov == 0 || !TII->isMov(Mov->getMachineOpcode()))
return false;
@@ -513,20 +615,33 @@ bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate,
}
/// \brief Does "Op" fit into register class "RegClass" ?
-bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, SDValue &Op,
+bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op,
unsigned RegClass) const {
- MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+ MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
SDNode *Node = Op.getNode();
const TargetRegisterClass *OpClass;
+ const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
if (MachineSDNode *MN = dyn_cast<MachineSDNode>(Node)) {
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
const MCInstrDesc &Desc = TII->get(MN->getMachineOpcode());
int OpClassID = Desc.OpInfo[Op.getResNo()].RegClass;
- if (OpClassID == -1)
- OpClass = getRegClassFor(Op.getSimpleValueType());
- else
+ if (OpClassID == -1) {
+ switch (MN->getMachineOpcode()) {
+ case AMDGPU::REG_SEQUENCE:
+ // Operand 0 is the register class id for REG_SEQUENCE instructions.
+ OpClass = TRI->getRegClass(
+ cast<ConstantSDNode>(MN->getOperand(0))->getZExtValue());
+ break;
+ default:
+ OpClass = getRegClassFor(Op.getSimpleValueType());
+ break;
+ }
+ } else {
OpClass = TRI->getRegClass(OpClassID);
+ }
} else if (Node->getOpcode() == ISD::CopyFromReg) {
RegisterSDNode *Reg = cast<RegisterSDNode>(Node->getOperand(1).getNode());
@@ -564,17 +679,30 @@ void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
// This is a conservative aproach, it is possible that we can't determine
// the correct register class and copy too often, but better save than sorry.
SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
- SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DebugLoc(),
+ SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
Operand.getValueType(), Operand, RC);
Operand = SDValue(Node, 0);
}
+/// \returns true if \p Node's operands are different from the SDValue list
+/// \p Ops
+static bool isNodeChanged(const SDNode *Node, const std::vector<SDValue> &Ops) {
+ for (unsigned i = 0, e = Node->getNumOperands(); i < e; ++i) {
+ if (Ops[i].getNode() != Node->getOperand(i).getNode()) {
+ return true;
+ }
+ }
+ return false;
+}
+
/// \brief Try to fold the Nodes operands into the Node
SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
SelectionDAG &DAG) const {
// Original encoding (either e32 or e64)
int Opcode = Node->getMachineOpcode();
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
const MCInstrDesc *Desc = &TII->get(Opcode);
unsigned NumDefs = Desc->getNumDefs();
@@ -700,13 +828,19 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
for (unsigned i = NumOps - NumDefs, e = Node->getNumOperands(); i < e; ++i)
Ops.push_back(Node->getOperand(i));
+ // Nodes that have a glue result are not CSE'd by getMachineNode(), so in
+ // this case a brand new node is always created, even if the operands are
+ // the same as before. So, manually check if anything has been changed.
+ if (Desc->Opcode == Opcode && !isNodeChanged(Node, Ops)) {
+ return Node;
+ }
+
// Create a complete new instruction
- return DAG.getMachineNode(Desc->Opcode, Node->getDebugLoc(),
- Node->getVTList(), Ops);
+ return DAG.getMachineNode(Desc->Opcode, SDLoc(Node), Node->getVTList(), Ops);
}
/// \brief Helper function for adjustWritemask
-unsigned SubIdx2Lane(unsigned Idx) {
+static unsigned SubIdx2Lane(unsigned Idx) {
switch (Idx) {
default: return 0;
case AMDGPU::sub0: return 0;
@@ -756,7 +890,7 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
if (Writemask == (1U << Lane)) {
SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32);
SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
- DebugLoc(), MVT::f32,
+ SDLoc(), Users[Lane]->getValueType(0),
SDValue(Node, 0), RC);
DAG.ReplaceAllUsesWith(Users[Lane], Copy);
return;
@@ -784,6 +918,7 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
/// \brief Fold the instructions after slecting them
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
SelectionDAG &DAG) const {
+ Node = AdjustRegClass(Node, DAG);
if (AMDGPU::isMIMG(Node->getMachineOpcode()) != -1)
adjustWritemask(Node, DAG);
@@ -815,3 +950,62 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
MRI.setRegClass(VReg, RC);
}
+
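+// Adjust machine nodes after instruction selection: non-i128 REG_SEQUENCEs
+// are retargeted to VReg_64, and SMRD loads whose base pointer does not fit
+// in an SGPR pair are rewritten as MUBUF ADDR64 loads through an
+// SI_ADDR64_RSRC descriptor.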
+MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N,
+ SelectionDAG &DAG) const {
+
+ SDLoc DL(N);
+ unsigned NewOpcode = N->getMachineOpcode();
+
+ switch (N->getMachineOpcode()) {
+ default: return N;
+ case AMDGPU::REG_SEQUENCE: {
+ // MVT::i128 values only use SGPRs, so i128 REG_SEQUENCEs don't need to be
+ // rewritten.
+ if (N->getValueType(0) == MVT::i128) {
+ return N;
+ }
+ const SDValue Ops[] = {
+ DAG.getTargetConstant(AMDGPU::VReg_64RegClassID, MVT::i32),
+ N->getOperand(1) , N->getOperand(2),
+ N->getOperand(3), N->getOperand(4)
+ };
+ return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::i64, Ops);
+ }
+
+ case AMDGPU::S_LOAD_DWORD_IMM:
+ NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
+ // Fall-through
+ case AMDGPU::S_LOAD_DWORDX2_SGPR:
+ if (NewOpcode == N->getMachineOpcode()) {
+ NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
+ }
+ // Fall-through
+ case AMDGPU::S_LOAD_DWORDX4_IMM:
+ case AMDGPU::S_LOAD_DWORDX4_SGPR: {
+ if (NewOpcode == N->getMachineOpcode()) {
+ NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
+ }
+ if (fitsRegClass(DAG, N->getOperand(0), AMDGPU::SReg_64RegClassID)) {
+ return N;
+ }
+ ConstantSDNode *Offset = cast<ConstantSDNode>(N->getOperand(1));
+ SDValue Ops[] = {
+ SDValue(DAG.getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL, MVT::i128,
+ DAG.getConstant(0, MVT::i64)), 0),
+ N->getOperand(0),
+ DAG.getConstant(Offset->getSExtValue() << 2, MVT::i32)
+ };
+ return DAG.getMachineNode(NewOpcode, DL, N->getVTList(), Ops);
+ }
+ }
+}
+
+SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
+ const TargetRegisterClass *RC,
+ unsigned Reg, EVT VT) const {
+ SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);
+
+ return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()),
+ cast<RegisterSDNode>(VReg)->getReg(), VT);
+}
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index de637be..78ae6a1 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -21,21 +21,22 @@
namespace llvm {
class SITargetLowering : public AMDGPUTargetLowering {
- const SIInstrInfo * TII;
- const TargetRegisterInfo * TRI;
-
- SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerParameter(SelectionDAG &DAG, EVT VT, SDLoc DL,
+ SDValue Chain, unsigned Offset) const;
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
bool foldImm(SDValue &Operand, int32_t &Immediate,
bool &ScalarSlotUsed) const;
- bool fitsRegClass(SelectionDAG &DAG, SDValue &Op, unsigned RegClass) const;
- void ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
+ bool fitsRegClass(SelectionDAG &DAG, const SDValue &Op,
+ unsigned RegClass) const;
+ void ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
unsigned RegClass, bool &ScalarSlotUsed) const;
SDNode *foldOperands(MachineSDNode *N, SelectionDAG &DAG) const;
void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
+ MachineSDNode *AdjustRegClass(MachineSDNode *N, SelectionDAG &DAG) const;
public:
SITargetLowering(TargetMachine &tm);
@@ -43,12 +44,12 @@ public:
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc DL, SelectionDAG &DAG,
+ SDLoc DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,
MachineBasicBlock * BB) const;
- virtual EVT getSetCCResultType(EVT VT) const;
+ virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
virtual MVT getScalarShiftAmountTy(EVT VT) const;
virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -57,6 +58,8 @@ public:
SDNode *Node) const;
int32_t analyzeImmediate(const SDNode *N) const;
+ SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC,
+ unsigned Reg, EVT VT) const;
};
} // End namespace llvm
diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp
index 98bd3db..c36e1dc 100644
--- a/lib/Target/R600/SIInsertWaits.cpp
+++ b/lib/Target/R600/SIInsertWaits.cpp
@@ -47,7 +47,7 @@ class SIInsertWaits : public MachineFunctionPass {
private:
static char ID;
const SIInstrInfo *TII;
- const SIRegisterInfo &TRI;
+ const SIRegisterInfo *TRI;
const MachineRegisterInfo *MRI;
/// \brief Constant hardware limits
@@ -97,8 +97,8 @@ private:
public:
SIInsertWaits(TargetMachine &tm) :
MachineFunctionPass(ID),
- TII(static_cast<const SIInstrInfo*>(tm.getInstrInfo())),
- TRI(TII->getRegisterInfo()) { }
+ TII(0),
+ TRI(0) { }
virtual bool runOnMachineFunction(MachineFunction &MF);
@@ -137,7 +137,7 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
assert(Op.isReg() && "First LGKM operand must be a register!");
unsigned Reg = Op.getReg();
- unsigned Size = TRI.getMinimalPhysRegClass(Reg)->getSize();
+ unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize();
Result.Named.LGKM = Size > 4 ? 2 : 1;
} else {
@@ -182,12 +182,12 @@ RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) {
return std::make_pair(0, 0);
unsigned Reg = Op.getReg();
- unsigned Size = TRI.getMinimalPhysRegClass(Reg)->getSize();
+ unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize();
assert(Size >= 4);
RegInterval Result;
- Result.first = TRI.getEncodingValue(Reg);
+ Result.first = TRI->getEncodingValue(Reg);
Result.second = Result.first + Size / 4;
return Result;
@@ -328,9 +328,11 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
}
bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
-
bool Changes = false;
+ TII = static_cast<const SIInstrInfo*>(MF.getTarget().getInstrInfo());
+ TRI = static_cast<const SIRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
MRI = &MF.getRegInfo();
WaitedOn = ZeroCounts;
diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
index f737ddd..51f323d 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -185,25 +185,25 @@ class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
Enc64 <outs, ins, asm, pattern> {
- bits<8> VDST;
- bits<9> SRC0;
- bits<9> SRC1;
- bits<9> SRC2;
- bits<3> ABS;
- bits<1> CLAMP;
- bits<2> OMOD;
- bits<3> NEG;
-
- let Inst{7-0} = VDST;
- let Inst{10-8} = ABS;
- let Inst{11} = CLAMP;
+ bits<8> dst;
+ bits<9> src0;
+ bits<9> src1;
+ bits<9> src2;
+ bits<3> abs;
+ bits<1> clamp;
+ bits<2> omod;
+ bits<3> neg;
+
+ let Inst{7-0} = dst;
+ let Inst{10-8} = abs;
+ let Inst{11} = clamp;
let Inst{25-17} = op;
let Inst{31-26} = 0x34; //encoding
- let Inst{40-32} = SRC0;
- let Inst{49-41} = SRC1;
- let Inst{58-50} = SRC2;
- let Inst{60-59} = OMOD;
- let Inst{63-61} = NEG;
+ let Inst{40-32} = src0;
+ let Inst{49-41} = src1;
+ let Inst{58-50} = src2;
+ let Inst{60-59} = omod;
+ let Inst{63-61} = neg;
let mayLoad = 0;
let mayStore = 0;
@@ -213,23 +213,23 @@ class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
Enc64 <outs, ins, asm, pattern> {
- bits<8> VDST;
- bits<9> SRC0;
- bits<9> SRC1;
- bits<9> SRC2;
- bits<7> SDST;
- bits<2> OMOD;
- bits<3> NEG;
+ bits<8> dst;
+ bits<9> src0;
+ bits<9> src1;
+ bits<9> src2;
+ bits<7> sdst;
+ bits<2> omod;
+ bits<3> neg;
- let Inst{7-0} = VDST;
- let Inst{14-8} = SDST;
+ let Inst{7-0} = dst;
+ let Inst{14-8} = sdst;
let Inst{25-17} = op;
let Inst{31-26} = 0x34; //encoding
- let Inst{40-32} = SRC0;
- let Inst{49-41} = SRC1;
- let Inst{58-50} = SRC2;
- let Inst{60-59} = OMOD;
- let Inst{63-61} = NEG;
+ let Inst{40-32} = src0;
+ let Inst{49-41} = src1;
+ let Inst{58-50} = src2;
+ let Inst{60-59} = omod;
+ let Inst{63-61} = neg;
let mayLoad = 0;
let mayStore = 0;
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index 9a04c60..cb582a6 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -24,7 +24,7 @@ using namespace llvm;
SIInstrInfo::SIInstrInfo(AMDGPUTargetMachine &tm)
: AMDGPUInstrInfo(tm),
- RI(tm, *this)
+ RI(tm)
{ }
const SIRegisterInfo &SIInstrInfo::getRegisterInfo() const {
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index aafc331..42fa95f 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -26,10 +26,6 @@ def HI32 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue() >> 32, MVT::i32);
}]>;
-def SIbuffer_store : SDNode<"AMDGPUISD::BUFFER_STORE",
- SDTypeProfile<0, 3, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
- [SDNPHasChain, SDNPMayStore]>;
-
def IMM8bitDWORD : ImmLeaf <
i32, [{
return (Imm & ~0x3FC) == 0;
@@ -39,13 +35,16 @@ def IMM8bitDWORD : ImmLeaf <
}]>
>;
-def IMM12bit : ImmLeaf <
- i16,
- [{return isUInt<12>(Imm);}]
+def as_i16imm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i16);
+}]>;
+
+def IMM12bit : PatLeaf <(imm),
+ [{return isUInt<12>(N->getZExtValue());}]
>;
class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{
- return ((const SITargetLowering &)TLI).analyzeImmediate(N) == 0;
+ return (*(const SITargetLowering *)TLI).analyzeImmediate(N) == 0;
}]>;
//===----------------------------------------------------------------------===//
@@ -163,8 +162,8 @@ multiclass VOP1_Helper <bits<8> op, RegisterClass drc, RegisterClass src,
i32imm:$omod, i32imm:$neg),
opName#"_e64 $dst, $src0, $abs, $clamp, $omod, $neg", []
>, VOP <opName> {
- let SRC1 = SIOperand.ZERO;
- let SRC2 = SIOperand.ZERO;
+ let src1 = SIOperand.ZERO;
+ let src2 = SIOperand.ZERO;
}
}
@@ -189,7 +188,7 @@ multiclass VOP2_Helper <bits<6> op, RegisterClass vrc, RegisterClass arc,
i32imm:$omod, i32imm:$neg),
opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg", []
>, VOP <opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)> {
- let SRC2 = SIOperand.ZERO;
+ let src2 = SIOperand.ZERO;
}
}
@@ -217,11 +216,11 @@ multiclass VOP2b_32 <bits<6> op, string opName, list<dag> pattern,
i32imm:$omod, i32imm:$neg),
opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg", []
>, VOP <opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)> {
- let SRC2 = SIOperand.ZERO;
+ let src2 = SIOperand.ZERO;
/* the VOP2 variant puts the carry out into VCC, the VOP3 variant
can write it into any SGPR. We currently don't use the carry out,
so for now hardcode it to VCC as well */
- let SDST = SIOperand.VCC;
+ let sdst = SIOperand.VCC;
}
}
@@ -244,7 +243,7 @@ multiclass VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc,
[(set SReg_64:$dst, (i1 (setcc (vt arc:$src0), arc:$src1, cond)))]
)
>, VOP <opName> {
- let SRC2 = SIOperand.ZERO;
+ let src2 = SIOperand.ZERO;
}
}
@@ -263,6 +262,19 @@ class VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3 <
opName#" $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern
>, VOP <opName>;
+class VOP3_64_Shift <bits <9> op, string opName, list<dag> pattern> : VOP3 <
+ op, (outs VReg_64:$dst),
+ (ins VSrc_64:$src0, VSrc_32:$src1),
+ opName#" $dst, $src0, $src1", pattern
+>, VOP <opName> {
+
+ let src2 = SIOperand.ZERO;
+ let abs = 0;
+ let clamp = 0;
+ let omod = 0;
+ let neg = 0;
+}
+
class VOP3_64 <bits<9> op, string opName, list<dag> pattern> : VOP3 <
op, (outs VReg_64:$dst),
(ins VSrc_64:$src0, VSrc_64:$src1, VSrc_64:$src2,
@@ -287,31 +299,41 @@ class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBU
let mayLoad = 0;
}
-class MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : MUBUF <
- op,
- (outs regClass:$vdata),
- (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
- i1imm:$lds, VReg_32:$vaddr, SReg_128:$srsrc, i1imm:$slc,
- i1imm:$tfe, SSrc_32:$soffset),
- asm#" $vdata, $offset, $offen, $idxen, $glc, $addr64, "
- #"$lds, $vaddr, $srsrc, $slc, $tfe, $soffset",
- []> {
- let mayLoad = 1;
- let mayStore = 0;
+multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> {
+
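+ // Generate the three addressing variants of the load: register offset
+ // (_OFFEN), indexed (_IDXEN) and 64-bit address (_ADDR64).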
+ let glc = 0, lds = 0, slc = 0, tfe = 0, soffset = 128 /* ZERO */,
+ mayLoad = 1 in {
+
+ let offen = 1, idxen = 0, addr64 = 0, offset = 0 in {
+ def _OFFEN : MUBUF <op, (outs regClass:$vdata),
+ (ins SReg_128:$srsrc, VReg_32:$vaddr),
+ asm#" $vdata, $srsrc + $vaddr", []>;
+ }
+
+ let offen = 0, idxen = 1, addr64 = 0 in {
+ def _IDXEN : MUBUF <op, (outs regClass:$vdata),
+ (ins SReg_128:$srsrc, VReg_32:$vaddr, i16imm:$offset),
+ asm#" $vdata, $srsrc[$vaddr] + $offset", []>;
+ }
+
+ let offen = 0, idxen = 0, addr64 = 1 in {
+ def _ADDR64 : MUBUF <op, (outs regClass:$vdata),
+ (ins SReg_128:$srsrc, VReg_64:$vaddr, i16imm:$offset),
+ asm#" $vdata, $srsrc + $vaddr + $offset", []>;
+ }
+ }
}
class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass,
ValueType VT> :
- MUBUF <op, (outs), (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr),
- name#" $vdata, $srsrc + $vaddr",
- [(SIbuffer_store (VT vdataClass:$vdata), (i128 SReg_128:$srsrc),
- (i64 VReg_64:$vaddr))]> {
+ MUBUF <op, (outs), (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, i16imm:$offset),
+ name#" $vdata, $srsrc + $vaddr + $offset",
+ []> {
let mayLoad = 0;
let mayStore = 1;
// Encoding
- let offset = 0;
let offen = 0;
let idxen = 0;
let glc = 0;
@@ -335,7 +357,22 @@ class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF
let mayStore = 0;
}
-class MIMG_Load_Helper <bits<7> op, string asm> : MIMG <
+class MIMG_NoSampler_Helper <bits<7> op, string asm> : MIMG <
+ op,
+ (outs VReg_128:$vdata),
+ (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
+ i1imm:$tfe, i1imm:$lwe, i1imm:$slc, unknown:$vaddr,
+ SReg_256:$srsrc),
+ asm#" $vdata, $dmask, $unorm, $glc, $da, $r128,"
+ #" $tfe, $lwe, $slc, $vaddr, $srsrc",
+ []> {
+ let SSAMP = 0;
+ let mayLoad = 1;
+ let mayStore = 0;
+ let hasPostISelHook = 1;
+}
+
+class MIMG_Sampler_Helper <bits<7> op, string asm> : MIMG <
op,
(outs VReg_128:$vdata),
(ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
@@ -382,7 +419,7 @@ def getCommuteOrig : InstrMapping {
// Test if the supplied opcode is an MIMG instruction
def isMIMG : InstrMapping {
- let FilterClass = "MIMG_Load_Helper";
+ let FilterClass = "MIMG";
let RowFields = ["Inst"];
let ColFields = ["Size"];
let KeyCol = ["8"];
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 3ff4548..e8ed2dd 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -22,8 +22,8 @@ def InterpSlot : Operand<i32> {
let PrintMethod = "printInterpSlot";
}
-def isSI : Predicate<"Subtarget.device()"
- "->getGeneration() == AMDGPUDeviceInfo::HD7XXX">;
+def isSI : Predicate<"Subtarget.getGeneration() "
+ "== AMDGPUSubtarget::SOUTHERN_ISLANDS">;
let Predicates = [isSI] in {
@@ -394,18 +394,18 @@ defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">;
//def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>;
//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>;
//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "BUFFER_LOAD_FORMAT_XYZ", []>;
-def BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMAT_XYZW", VReg_128>;
+defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMAT_XYZW", VReg_128>;
//def BUFFER_STORE_FORMAT_X : MUBUF_ <0x00000004, "BUFFER_STORE_FORMAT_X", []>;
//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "BUFFER_STORE_FORMAT_XY", []>;
//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "BUFFER_STORE_FORMAT_XYZ", []>;
//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "BUFFER_STORE_FORMAT_XYZW", []>;
-//def BUFFER_LOAD_UBYTE : MUBUF_ <0x00000008, "BUFFER_LOAD_UBYTE", []>;
+defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper <0x00000008, "BUFFER_LOAD_UBYTE", VReg_32>;
//def BUFFER_LOAD_SBYTE : MUBUF_ <0x00000009, "BUFFER_LOAD_SBYTE", []>;
//def BUFFER_LOAD_USHORT : MUBUF_ <0x0000000a, "BUFFER_LOAD_USHORT", []>;
//def BUFFER_LOAD_SSHORT : MUBUF_ <0x0000000b, "BUFFER_LOAD_SSHORT", []>;
-def BUFFER_LOAD_DWORD : MUBUF_Load_Helper <0x0000000c, "BUFFER_LOAD_DWORD", VReg_32>;
-def BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <0x0000000d, "BUFFER_LOAD_DWORDX2", VReg_64>;
-def BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128>;
+defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper <0x0000000c, "BUFFER_LOAD_DWORD", VReg_32>;
+defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <0x0000000d, "BUFFER_LOAD_DWORDX2", VReg_64>;
+defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128>;
//def BUFFER_STORE_BYTE : MUBUF_ <0x00000018, "BUFFER_STORE_BYTE", []>;
//def BUFFER_STORE_SHORT : MUBUF_ <0x0000001a, "BUFFER_STORE_SHORT", []>;
@@ -416,7 +416,10 @@ def BUFFER_STORE_DWORD : MUBUF_Store_Helper <
def BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64, i64
>;
-//def BUFFER_STORE_DWORDX4 : MUBUF_DWORDX4 <0x0000001e, "BUFFER_STORE_DWORDX4", []>;
+
+def BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
+ 0x0000001e, "BUFFER_STORE_DWORDX4", VReg_128, v4i32
+>;
//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>;
//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "BUFFER_ATOMIC_CMPSWAP", []>;
//def BUFFER_ATOMIC_ADD : MUBUF_ <0x00000032, "BUFFER_ATOMIC_ADD", []>;
@@ -495,7 +498,7 @@ defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
//def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>;
//def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>;
//def IMAGE_LOAD : MIMG_NoPattern_ <"IMAGE_LOAD", 0x00000000>;
-//def IMAGE_LOAD_MIP : MIMG_NoPattern_ <"IMAGE_LOAD_MIP", 0x00000001>;
+def IMAGE_LOAD_MIP : MIMG_NoSampler_Helper <0x00000001, "IMAGE_LOAD_MIP">;
//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_PCK", 0x00000002>;
//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_PCK_SGN", 0x00000003>;
//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK", 0x00000004>;
@@ -504,7 +507,7 @@ defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"IMAGE_STORE_MIP", 0x00000009>;
//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"IMAGE_STORE_PCK", 0x0000000a>;
//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"IMAGE_STORE_MIP_PCK", 0x0000000b>;
-//def IMAGE_GET_RESINFO : MIMG_NoPattern_ <"IMAGE_GET_RESINFO", 0x0000000e>;
+def IMAGE_GET_RESINFO : MIMG_NoSampler_Helper <0x0000000e, "IMAGE_GET_RESINFO">;
//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_SWAP", 0x0000000f>;
//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_CMPSWAP", 0x00000010>;
//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"IMAGE_ATOMIC_ADD", 0x00000011>;
@@ -522,20 +525,20 @@ defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_FCMPSWAP", 0x0000001d>;
//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMIN", 0x0000001e>;
//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMAX", 0x0000001f>;
-def IMAGE_SAMPLE : MIMG_Load_Helper <0x00000020, "IMAGE_SAMPLE">;
+def IMAGE_SAMPLE : MIMG_Sampler_Helper <0x00000020, "IMAGE_SAMPLE">;
//def IMAGE_SAMPLE_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL", 0x00000021>;
-def IMAGE_SAMPLE_D : MIMG_Load_Helper <0x00000022, "IMAGE_SAMPLE_D">;
+def IMAGE_SAMPLE_D : MIMG_Sampler_Helper <0x00000022, "IMAGE_SAMPLE_D">;
//def IMAGE_SAMPLE_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL", 0x00000023>;
-def IMAGE_SAMPLE_L : MIMG_Load_Helper <0x00000024, "IMAGE_SAMPLE_L">;
-def IMAGE_SAMPLE_B : MIMG_Load_Helper <0x00000025, "IMAGE_SAMPLE_B">;
+def IMAGE_SAMPLE_L : MIMG_Sampler_Helper <0x00000024, "IMAGE_SAMPLE_L">;
+def IMAGE_SAMPLE_B : MIMG_Sampler_Helper <0x00000025, "IMAGE_SAMPLE_B">;
//def IMAGE_SAMPLE_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL", 0x00000026>;
//def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ", 0x00000027>;
-def IMAGE_SAMPLE_C : MIMG_Load_Helper <0x00000028, "IMAGE_SAMPLE_C">;
+def IMAGE_SAMPLE_C : MIMG_Sampler_Helper <0x00000028, "IMAGE_SAMPLE_C">;
//def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL", 0x00000029>;
//def IMAGE_SAMPLE_C_D : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D", 0x0000002a>;
//def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL", 0x0000002b>;
-def IMAGE_SAMPLE_C_L : MIMG_Load_Helper <0x0000002c, "IMAGE_SAMPLE_C_L">;
-def IMAGE_SAMPLE_C_B : MIMG_Load_Helper <0x0000002d, "IMAGE_SAMPLE_C_B">;
+def IMAGE_SAMPLE_C_L : MIMG_Sampler_Helper <0x0000002c, "IMAGE_SAMPLE_C_L">;
+def IMAGE_SAMPLE_C_B : MIMG_Sampler_Helper <0x0000002d, "IMAGE_SAMPLE_C_B">;
//def IMAGE_SAMPLE_C_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL", 0x0000002e>;
//def IMAGE_SAMPLE_C_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ", 0x0000002f>;
//def IMAGE_SAMPLE_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_O", 0x00000030>;
@@ -602,7 +605,9 @@ defm V_READFIRSTLANE_B32 : VOP1_32 <0x00000002, "V_READFIRSTLANE_B32", []>;
defm V_CVT_F32_I32 : VOP1_32 <0x00000005, "V_CVT_F32_I32",
[(set f32:$dst, (sint_to_fp i32:$src0))]
>;
-defm V_CVT_F32_U32 : VOP1_32 <0x00000006, "V_CVT_F32_U32", []>;
+defm V_CVT_F32_U32 : VOP1_32 <0x00000006, "V_CVT_F32_U32",
+ [(set f32:$dst, (uint_to_fp i32:$src0))]
+>;
defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32", []>;
defm V_CVT_I32_F32 : VOP1_32 <0x00000008, "V_CVT_I32_F32",
[(set i32:$dst, (fp_to_sint f32:$src0))]
@@ -624,7 +629,9 @@ defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>;
defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32",
[(set f32:$dst, (AMDGPUfract f32:$src0))]
>;
-defm V_TRUNC_F32 : VOP1_32 <0x00000021, "V_TRUNC_F32", []>;
+defm V_TRUNC_F32 : VOP1_32 <0x00000021, "V_TRUNC_F32",
+ [(set f32:$dst, (int_AMDGPU_trunc f32:$src0))]
+>;
defm V_CEIL_F32 : VOP1_32 <0x00000022, "V_CEIL_F32",
[(set f32:$dst, (fceil f32:$src0))]
>;
@@ -848,10 +855,18 @@ defm V_MAX_LEGACY_F32 : VOP2_32 <0x0000000e, "V_MAX_LEGACY_F32",
defm V_MIN_F32 : VOP2_32 <0x0000000f, "V_MIN_F32", []>;
defm V_MAX_F32 : VOP2_32 <0x00000010, "V_MAX_F32", []>;
-defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32", []>;
-defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32", []>;
-defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32", []>;
-defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32", []>;
+defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32",
+ [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))]
+>;
+defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32",
+ [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))]
+>;
+defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32",
+ [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))]
+>;
+defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32",
+ [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))]
+>;
defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32",
[(set i32:$dst, (srl i32:$src0, i32:$src1))]
@@ -952,6 +967,8 @@ def V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32", []>;
def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64", []>;
//def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>;
def V_ALIGNBIT_B32 : VOP3_32 <0x0000014e, "V_ALIGNBIT_B32", []>;
+def : ROTRPattern <V_ALIGNBIT_B32>;
+
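The ROTRPattern just added maps an i32 rotate-right onto V_ALIGNBIT_B32 with both data sources set to the same register. A minimal self-contained C++ sketch of why that works, assuming the usual alignbit semantics (dst = low 32 bits of the 64-bit concatenation {src0,src1} shifted right by src2 & 31); the helper names alignbit and rotr32 are illustrative only, not LLVM or GCN APIs.

#include <cassert>
#include <cstdint>

// Sketch of the hardware operation: select 32 bits out of {hi, lo}.
static uint32_t alignbit(uint32_t hi, uint32_t lo, unsigned shift) {
  uint64_t concat = (uint64_t(hi) << 32) | lo;
  return uint32_t(concat >> (shift & 31));
}

static uint32_t rotr32(uint32_t x, unsigned s) {
  s &= 31;
  return s ? (x >> s) | (x << (32 - s)) : x;
}

int main() {
  uint32_t x = 0x12345678u;
  for (unsigned s = 0; s < 32; ++s)
    assert(alignbit(x, x, s) == rotr32(x, s)); // alignbit(x, x, s) == rotr(x, s)
  return 0;
}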
def V_ALIGNBYTE_B32 : VOP3_32 <0x0000014f, "V_ALIGNBYTE_B32", []>;
def V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>;
////def V_MIN3_F32 : VOP3_MIN3 <0x00000151, "V_MIN3_F32", []>;
@@ -970,9 +987,15 @@ def V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>;
////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>;
def V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>;
def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", []>;
-def V_LSHL_B64 : VOP3_64 <0x00000161, "V_LSHL_B64", []>;
-def V_LSHR_B64 : VOP3_64 <0x00000162, "V_LSHR_B64", []>;
-def V_ASHR_I64 : VOP3_64 <0x00000163, "V_ASHR_I64", []>;
+
+def V_LSHL_B64 : VOP3_64_Shift <0x00000161, "V_LSHL_B64",
+ [(set i64:$dst, (shl i64:$src0, i32:$src1))]
+>;
+def V_LSHR_B64 : VOP3_64_Shift <0x00000162, "V_LSHR_B64",
+ [(set i64:$dst, (srl i64:$src0, i32:$src1))]
+>;
+def V_ASHR_I64 : VOP3_64_Shift <0x00000163, "V_ASHR_I64", []>;
+
def V_ADD_F64 : VOP3_64 <0x00000164, "V_ADD_F64", []>;
def V_MUL_F64 : VOP3_64 <0x00000165, "V_MUL_F64", []>;
def V_MIN_F64 : VOP3_64 <0x00000166, "V_MIN_F64", []>;
@@ -1180,6 +1203,19 @@ def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
} // Uses = [EXEC,VCC,M0], Defs = [EXEC,VCC,M0]
+// This pseudo instruction takes a pointer as input and outputs a resource
+// constant that can be used with the ADDR64 MUBUF instructions.
+
+let usesCustomInserter = 1 in {
+
+def SI_ADDR64_RSRC : InstSI <
+ (outs SReg_128:$srsrc),
+ (ins SReg_64:$ptr),
+ "", []
+>;
+
+} // end usesCustomInserter
+
} // end IsCodeGenOnly, isPseudo
def : Pat<
@@ -1194,10 +1230,8 @@ def : Pat <
/* int_SI_vs_load_input */
def : Pat<
- (int_SI_vs_load_input v16i8:$tlst, IMM12bit:$attr_offset,
- i32:$buf_idx_vgpr),
- (BUFFER_LOAD_FORMAT_XYZW imm:$attr_offset, 0, 1, 0, 0, 0,
- $buf_idx_vgpr, $tlst, 0, 0, 0)
+ (int_SI_vs_load_input v16i8:$tlst, IMM12bit:$attr_offset, i32:$buf_idx_vgpr),
+ (BUFFER_LOAD_FORMAT_XYZW_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset)
>;
/* int_SI_export */
@@ -1269,6 +1303,36 @@ defm : SamplePatterns<v4i32>;
defm : SamplePatterns<v8i32>;
defm : SamplePatterns<v16i32>;
+/* int_SI_imageload for texture fetches consuming varying address parameters */
+class ImageLoadPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat <
+ (name addr_type:$addr, v32i8:$rsrc, imm),
+ (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc)
+>;
+
+class ImageLoadArrayPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat <
+ (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY),
+ (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc)
+>;
+
+multiclass ImageLoadPatterns<ValueType addr_type> {
+ def : ImageLoadPattern <int_SI_imageload, IMAGE_LOAD_MIP, addr_type>;
+ def : ImageLoadArrayPattern <int_SI_imageload, IMAGE_LOAD_MIP, addr_type>;
+}
+
+defm : ImageLoadPatterns<v2i32>;
+defm : ImageLoadPatterns<v4i32>;
+
+/* Image resource information */
+def : Pat <
+ (int_SI_resinfo i32:$mipid, v32i8:$rsrc, imm),
+ (IMAGE_GET_RESINFO 0xf, 0, 0, 0, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc)
+>;
+
+def : Pat <
+ (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY),
+ (IMAGE_GET_RESINFO 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc)
+>;
+
/********** ============================================ **********/
/********** Extraction, Insertion, Building and Casting **********/
/********** ============================================ **********/
@@ -1492,7 +1556,7 @@ def : Pat <
// 3. Offset in a 32-bit VGPR
def : Pat <
(int_SI_load_const v16i8:$sbase, i32:$voff),
- (BUFFER_LOAD_DWORD 0, 1, 0, 0, 0, 0, $voff, $sbase, 0, 0, 0)
+ (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff)
>;
// The multiplication scales from [0,1] to the unsigned integer range
@@ -1539,9 +1603,59 @@ multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
+defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, i64>;
defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v16i8>;
defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
+//===----------------------------------------------------------------------===//
+// MUBUF Patterns
+//===----------------------------------------------------------------------===//
+
+multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt,
+ PatFrag global_ld, PatFrag constant_ld> {
+ def : Pat <
+ (vt (global_ld (add i64:$ptr, (i64 IMM12bit:$offset)))),
+ (Instr_ADDR64 (SI_ADDR64_RSRC (i64 0)), $ptr, (as_i16imm $offset))
+ >;
+
+ def : Pat <
+ (vt (global_ld i64:$ptr)),
+ (Instr_ADDR64 (SI_ADDR64_RSRC (i64 0)), $ptr, 0)
+ >;
+
+ def : Pat <
+ (vt (global_ld (add i64:$ptr, i64:$offset))),
+ (Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, 0)
+ >;
+
+ def : Pat <
+ (vt (constant_ld (add i64:$ptr, i64:$offset))),
+ (Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, 0)
+ >;
+}
+
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32,
+ global_load, constant_load>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32,
+ zextloadi8_global, zextloadi8_constant>;
+
+multiclass MUBUFStore_Pattern <MUBUF Instr, ValueType vt> {
+
+ def : Pat <
+ (global_store vt:$value, i64:$ptr),
+ (Instr $value, (SI_ADDR64_RSRC (i64 0)), $ptr, 0)
+ >;
+
+ def : Pat <
+ (global_store vt:$value, (add i64:$ptr, i64:$offset)),
+ (Instr $value, (SI_ADDR64_RSRC $ptr), $offset, 0)
+ >;
+}
+
+defm : MUBUFStore_Pattern <BUFFER_STORE_DWORD, i32>;
+defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2, i64>;
+defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX4, v4i32>;
+
/********** ====================== **********/
/********** Indirect addressing **********/
/********** ====================== **********/
@@ -1592,4 +1706,10 @@ def : Pat<
(V_CMP_U_F32_e64 $src0, $src1)
>;
+//============================================================================//
+// Miscellaneous Optimization Patterns
+//============================================================================//
+
+def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e32>;
+
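The SHA256MaPattern above rests on a bit-level identity: the SHA-256 majority function Ma(x,y,z) = (x&y)^(x&z)^(y&z) equals bfi(x^y, z, y), where bfi(mask,a,b) = (mask&a)|(~mask&b), so it lowers to one V_BFI_B32 plus one XOR. A small self-contained C++ check of the identity; the helpers maj and bfi are illustrative only.

#include <cassert>
#include <cstdint>

// SHA-256 majority function.
static uint32_t maj(uint32_t x, uint32_t y, uint32_t z) {
  return (x & y) ^ (x & z) ^ (y & z);
}

// Bitfield insert: take bits of a where mask is set, bits of b elsewhere.
static uint32_t bfi(uint32_t mask, uint32_t a, uint32_t b) {
  return (mask & a) | (~mask & b);
}

int main() {
  uint32_t x = 0xDEADBEEFu, y = 0x01234567u, z = 0x89ABCDEFu;
  assert(maj(x, y, z) == bfi(x ^ y, z, y)); // Ma(x,y,z) == bfi(x^y, z, y)
  return 0;
}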
} // End isSI predicate
diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td
index 16d9d81..224cd2f 100644
--- a/lib/Target/R600/SIIntrinsics.td
+++ b/lib/Target/R600/SIIntrinsics.td
@@ -25,6 +25,10 @@ let TargetPrefix = "SI", isTarget = 1 in {
def int_SI_sampleb : Sample;
def int_SI_samplel : Sample;
+ def int_SI_imageload : Intrinsic <[llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
+
+ def int_SI_resinfo : Intrinsic <[llvm_v4i32_ty], [llvm_i32_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
+
/* Interpolation Intrinsics */
def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp
index 2b60eb9..5b434fb 100644
--- a/lib/Target/R600/SILowerControlFlow.cpp
+++ b/lib/Target/R600/SILowerControlFlow.cpp
@@ -91,8 +91,7 @@ private:
public:
SILowerControlFlowPass(TargetMachine &tm) :
- MachineFunctionPass(ID), TRI(tm.getRegisterInfo()),
- TII(tm.getInstrInfo()) { }
+ MachineFunctionPass(ID), TRI(0), TII(0) { }
virtual bool runOnMachineFunction(MachineFunction &MF);
@@ -408,6 +407,8 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
}
bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
+ TII = MF.getTarget().getInstrInfo();
+ TRI = MF.getTarget().getRegisterInfo();
bool HaveKill = false;
bool NeedWQM = false;
diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp
index 99278ae..ddfc54e 100644
--- a/lib/Target/R600/SIRegisterInfo.cpp
+++ b/lib/Target/R600/SIRegisterInfo.cpp
@@ -18,11 +18,9 @@
using namespace llvm;
-SIRegisterInfo::SIRegisterInfo(AMDGPUTargetMachine &tm,
- const TargetInstrInfo &tii)
-: AMDGPURegisterInfo(tm, tii),
- TM(tm),
- TII(tii)
+SIRegisterInfo::SIRegisterInfo(AMDGPUTargetMachine &tm)
+: AMDGPURegisterInfo(tm),
+ TM(tm)
{ }
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h
index caec228..c322f94 100644
--- a/lib/Target/R600/SIRegisterInfo.h
+++ b/lib/Target/R600/SIRegisterInfo.h
@@ -21,13 +21,11 @@
namespace llvm {
class AMDGPUTargetMachine;
-class TargetInstrInfo;
struct SIRegisterInfo : public AMDGPURegisterInfo {
AMDGPUTargetMachine &TM;
- const TargetInstrInfo &TII;
- SIRegisterInfo(AMDGPUTargetMachine &tm, const TargetInstrInfo &tii);
+ SIRegisterInfo(AMDGPUTargetMachine &tm);
virtual BitVector getReservedRegs(const MachineFunction &MF) const;
diff --git a/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp b/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp
index 46b1f18..f437564 100644
--- a/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp
+++ b/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp
@@ -11,7 +11,7 @@
//
//===----------------------------------------------------------------------===//
-#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/lib/Target/Sparc/CMakeLists.txt b/lib/Target/Sparc/CMakeLists.txt
index efb10db..0ab7a1c 100644
--- a/lib/Target/Sparc/CMakeLists.txt
+++ b/lib/Target/Sparc/CMakeLists.txt
@@ -10,7 +10,6 @@ add_public_tablegen_target(SparcCommonTableGen)
add_llvm_target(SparcCodeGen
DelaySlotFiller.cpp
- FPMover.cpp
SparcAsmPrinter.cpp
SparcInstrInfo.cpp
SparcISelDAGToDAG.cpp
diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp
index 6123773..b93f5e4 100644
--- a/lib/Target/Sparc/DelaySlotFiller.cpp
+++ b/lib/Target/Sparc/DelaySlotFiller.cpp
@@ -39,11 +39,10 @@ namespace {
/// layout, etc.
///
TargetMachine &TM;
- const TargetInstrInfo *TII;
static char ID;
- Filler(TargetMachine &tm)
- : MachineFunctionPass(ID), TM(tm), TII(tm.getInstrInfo()) { }
+ Filler(TargetMachine &tm)
+ : MachineFunctionPass(ID), TM(tm) { }
virtual const char *getPassName() const {
return "SPARC Delay Slot Filler";
@@ -61,8 +60,9 @@ namespace {
bool isDelayFiller(MachineBasicBlock &MBB,
MachineBasicBlock::iterator candidate);
- void insertCallUses(MachineBasicBlock::iterator MI,
- SmallSet<unsigned, 32>& RegUses);
+ void insertCallDefsUses(MachineBasicBlock::iterator MI,
+ SmallSet<unsigned, 32>& RegDefs,
+ SmallSet<unsigned, 32>& RegUses);
void insertDefsUses(MachineBasicBlock::iterator MI,
SmallSet<unsigned, 32>& RegDefs,
@@ -81,6 +81,9 @@ namespace {
bool needsUnimp(MachineBasicBlock::iterator I, unsigned &StructSize);
+ bool tryCombineRestoreWithPrevInst(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI);
+
};
char Filler::ID = 0;
} // end of anonymous namespace
@@ -99,29 +102,45 @@ FunctionPass *llvm::createSparcDelaySlotFillerPass(TargetMachine &tm) {
bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
bool Changed = false;
- for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I)
- if (I->hasDelaySlot()) {
- MachineBasicBlock::iterator D = MBB.end();
- MachineBasicBlock::iterator J = I;
-
- if (!DisableDelaySlotFiller)
- D = findDelayInstr(MBB, I);
-
- ++FilledSlots;
- Changed = true;
-
- if (D == MBB.end())
- BuildMI(MBB, ++J, I->getDebugLoc(), TII->get(SP::NOP));
- else
- MBB.splice(++J, &MBB, D);
- unsigned structSize = 0;
- if (needsUnimp(I, structSize)) {
- MachineBasicBlock::iterator J = I;
- ++J; //skip the delay filler.
- BuildMI(MBB, ++J, I->getDebugLoc(),
- TII->get(SP::UNIMP)).addImm(structSize);
- }
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) {
+ MachineBasicBlock::iterator MI = I;
+ ++I;
+
+ // If MI is restore, try combining it with previous inst.
+ if (!DisableDelaySlotFiller &&
+ (MI->getOpcode() == SP::RESTORErr
+ || MI->getOpcode() == SP::RESTOREri)) {
+ Changed |= tryCombineRestoreWithPrevInst(MBB, MI);
+ continue;
+ }
+
+ // If MI has no delay slot, skip.
+ if (!MI->hasDelaySlot())
+ continue;
+
+ MachineBasicBlock::iterator D = MBB.end();
+
+ if (!DisableDelaySlotFiller)
+ D = findDelayInstr(MBB, MI);
+
+ ++FilledSlots;
+ Changed = true;
+
+ const TargetInstrInfo *TII = TM.getInstrInfo();
+ if (D == MBB.end())
+ BuildMI(MBB, I, MI->getDebugLoc(), TII->get(SP::NOP));
+ else
+ MBB.splice(I, &MBB, D);
+
+ unsigned structSize = 0;
+ if (needsUnimp(MI, structSize)) {
+ MachineBasicBlock::iterator J = MI;
+ ++J; // skip the delay filler.
+ assert (J != MBB.end() && "MI needs a delay instruction.");
+      BuildMI(MBB, ++J, MI->getDebugLoc(),
+ TII->get(SP::UNIMP)).addImm(structSize);
}
+ }
return Changed;
}
@@ -134,28 +153,34 @@ Filler::findDelayInstr(MachineBasicBlock &MBB,
bool sawLoad = false;
bool sawStore = false;
- MachineBasicBlock::iterator I = slot;
+ if (slot == MBB.begin())
+ return MBB.end();
if (slot->getOpcode() == SP::RET)
return MBB.end();
if (slot->getOpcode() == SP::RETL) {
- --I;
- if (I->getOpcode() != SP::RESTORErr)
- return MBB.end();
- //change retl to ret
- slot->setDesc(TII->get(SP::RET));
- return I;
+ MachineBasicBlock::iterator J = slot;
+ --J;
+
+ if (J->getOpcode() == SP::RESTORErr
+ || J->getOpcode() == SP::RESTOREri) {
+ // change retl to ret.
+ slot->setDesc(TM.getInstrInfo()->get(SP::RET));
+ return J;
+ }
}
- //Call's delay filler can def some of call's uses.
+ // Call's delay filler can def some of call's uses.
if (slot->isCall())
- insertCallUses(slot, RegUses);
+ insertCallDefsUses(slot, RegDefs, RegUses);
else
insertDefsUses(slot, RegDefs, RegUses);
bool done = false;
+ MachineBasicBlock::iterator I = slot;
+
while (!done) {
done = (I == MBB.begin());
@@ -216,12 +241,12 @@ bool Filler::delayHasHazard(MachineBasicBlock::iterator candidate,
unsigned Reg = MO.getReg();
if (MO.isDef()) {
- //check whether Reg is defined or used before delay slot.
+ // check whether Reg is defined or used before delay slot.
if (IsRegInSet(RegDefs, Reg) || IsRegInSet(RegUses, Reg))
return true;
}
if (MO.isUse()) {
- //check whether Reg is defined before delay slot.
+ // check whether Reg is defined before delay slot.
if (IsRegInSet(RegDefs, Reg))
return true;
}
@@ -230,9 +255,12 @@ bool Filler::delayHasHazard(MachineBasicBlock::iterator candidate,
}
-void Filler::insertCallUses(MachineBasicBlock::iterator MI,
- SmallSet<unsigned, 32>& RegUses)
+void Filler::insertCallDefsUses(MachineBasicBlock::iterator MI,
+ SmallSet<unsigned, 32>& RegDefs,
+ SmallSet<unsigned, 32>& RegUses)
{
+  // Call defines o7, which is visible to the instruction in the delay slot.
+ RegDefs.insert(SP::O7);
switch(MI->getOpcode()) {
default: llvm_unreachable("Unknown opcode.");
@@ -255,7 +283,7 @@ void Filler::insertCallUses(MachineBasicBlock::iterator MI,
}
}
-//Insert Defs and Uses of MI into the sets RegDefs and RegUses.
+// Insert Defs and Uses of MI into the sets RegDefs and RegUses.
void Filler::insertDefsUses(MachineBasicBlock::iterator MI,
SmallSet<unsigned, 32>& RegDefs,
SmallSet<unsigned, 32>& RegUses)
@@ -270,13 +298,17 @@ void Filler::insertDefsUses(MachineBasicBlock::iterator MI,
continue;
if (MO.isDef())
RegDefs.insert(Reg);
- if (MO.isUse())
+ if (MO.isUse()) {
+      // The implicit register uses of retl are the return values;
+      // retl itself does not read them.
+ if (MO.isImplicit() && MI->getOpcode() == SP::RETL)
+ continue;
RegUses.insert(Reg);
-
+ }
}
}
-//returns true if the Reg or its alias is in the RegSet.
+// returns true if the Reg or its alias is in the RegSet.
bool Filler::IsRegInSet(SmallSet<unsigned, 32>& RegSet, unsigned Reg)
{
// Check Reg and all aliased Registers.
@@ -318,3 +350,142 @@ bool Filler::needsUnimp(MachineBasicBlock::iterator I, unsigned &StructSize)
StructSize = MO.getImm();
return true;
}
+
+static bool combineRestoreADD(MachineBasicBlock::iterator RestoreMI,
+ MachineBasicBlock::iterator AddMI,
+ const TargetInstrInfo *TII)
+{
+ // Before: add <op0>, <op1>, %i[0-7]
+ // restore %g0, %g0, %i[0-7]
+ //
+ // After : restore <op0>, <op1>, %o[0-7]
+
+ unsigned reg = AddMI->getOperand(0).getReg();
+ if (reg < SP::I0 || reg > SP::I7)
+ return false;
+
+ // Erase RESTORE.
+ RestoreMI->eraseFromParent();
+
+ // Change ADD to RESTORE.
+ AddMI->setDesc(TII->get((AddMI->getOpcode() == SP::ADDrr)
+ ? SP::RESTORErr
+ : SP::RESTOREri));
+
+ // Map the destination register.
+ AddMI->getOperand(0).setReg(reg - SP::I0 + SP::O0);
+
+ return true;
+}
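The destination rewrite above, reg - SP::I0 + SP::O0, assumes that %i0-%i7 and %o0-%o7 form contiguous ranges in the generated SP register enum, so an in-register maps to the matching out-register by plain index arithmetic. A minimal standalone sketch with made-up enum values; SPSketch and mapInToOut are hypothetical names, not LLVM APIs.

#include <cassert>

namespace SPSketch {
  // Hypothetical enum values; only the contiguity of each range matters.
  enum Reg { I0 = 100, I1, I2, I3, I4, I5, I6, I7,
             O0 = 200, O1, O2, O3, O4, O5, O6, O7 };
}

static unsigned mapInToOut(unsigned reg) {
  assert(reg >= SPSketch::I0 && reg <= SPSketch::I7 && "not an %i register");
  return reg - SPSketch::I0 + SPSketch::O0;   // same arithmetic as the pass
}

int main() {
  assert(mapInToOut(SPSketch::I3) == SPSketch::O3);  // %i3 -> %o3
  return 0;
}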
+
+static bool combineRestoreOR(MachineBasicBlock::iterator RestoreMI,
+ MachineBasicBlock::iterator OrMI,
+ const TargetInstrInfo *TII)
+{
+ // Before: or <op0>, <op1>, %i[0-7]
+ // restore %g0, %g0, %i[0-7]
+  //   where <op0> or <op1> is zero (i.e. the OR is just a copy),
+ //
+ // After : restore <op0>, <op1>, %o[0-7]
+
+ unsigned reg = OrMI->getOperand(0).getReg();
+ if (reg < SP::I0 || reg > SP::I7)
+ return false;
+
+ // check whether it is a copy.
+ if (OrMI->getOpcode() == SP::ORrr
+ && OrMI->getOperand(1).getReg() != SP::G0
+ && OrMI->getOperand(2).getReg() != SP::G0)
+ return false;
+
+ if (OrMI->getOpcode() == SP::ORri
+ && OrMI->getOperand(1).getReg() != SP::G0
+ && (!OrMI->getOperand(2).isImm() || OrMI->getOperand(2).getImm() != 0))
+ return false;
+
+ // Erase RESTORE.
+ RestoreMI->eraseFromParent();
+
+ // Change OR to RESTORE.
+ OrMI->setDesc(TII->get((OrMI->getOpcode() == SP::ORrr)
+ ? SP::RESTORErr
+ : SP::RESTOREri));
+
+ // Map the destination register.
+ OrMI->getOperand(0).setReg(reg - SP::I0 + SP::O0);
+
+ return true;
+}
+
+static bool combineRestoreSETHIi(MachineBasicBlock::iterator RestoreMI,
+ MachineBasicBlock::iterator SetHiMI,
+ const TargetInstrInfo *TII)
+{
+ // Before: sethi imm3, %i[0-7]
+ // restore %g0, %g0, %g0
+ //
+ // After : restore %g0, (imm3<<10), %o[0-7]
+
+ unsigned reg = SetHiMI->getOperand(0).getReg();
+ if (reg < SP::I0 || reg > SP::I7)
+ return false;
+
+ if (!SetHiMI->getOperand(1).isImm())
+ return false;
+
+ int64_t imm = SetHiMI->getOperand(1).getImm();
+
+ // Is it a 3 bit immediate?
+ if (!isInt<3>(imm))
+ return false;
+
+ // Make it a 13 bit immediate.
+ imm = (imm << 10) & 0x1FFF;
+
+ assert(RestoreMI->getOpcode() == SP::RESTORErr);
+
+ RestoreMI->setDesc(TII->get(SP::RESTOREri));
+
+ RestoreMI->getOperand(0).setReg(reg - SP::I0 + SP::O0);
+ RestoreMI->getOperand(1).setReg(SP::G0);
+ RestoreMI->getOperand(2).ChangeToImmediate(imm);
+
+
+ // Erase the original SETHI.
+ SetHiMI->eraseFromParent();
+
+ return true;
+}
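A worked example of the immediate fold above: SETHI places its operand in bits [31:10], so a value that passes the isInt<3> check becomes, after the shift and the 0x1FFF mask, a value that still fits the 13-bit immediate field of RESTOREri. A standalone sketch in plain C++, with no LLVM APIs.

#include <cassert>
#include <cstdint>
#include <iostream>

int main() {
  int64_t imm = 3;                        // immediate of the original "sethi 3, %i0"
  assert(imm >= -4 && imm <= 3);          // the isInt<3>(imm) check
  int64_t folded = (imm << 10) & 0x1FFF;  // same shift and mask as the pass
  std::cout << "restore %g0, " << folded << ", %o0\n";  // prints 3072
  return 0;
}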
+
+bool Filler::tryCombineRestoreWithPrevInst(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI)
+{
+ // No previous instruction.
+ if (MBBI == MBB.begin())
+ return false;
+
+ // assert that MBBI is a "restore %g0, %g0, %g0".
+ assert(MBBI->getOpcode() == SP::RESTORErr
+ && MBBI->getOperand(0).getReg() == SP::G0
+ && MBBI->getOperand(1).getReg() == SP::G0
+ && MBBI->getOperand(2).getReg() == SP::G0);
+
+ MachineBasicBlock::iterator PrevInst = MBBI; --PrevInst;
+
+ // It cannot combine with a delay filler.
+ if (isDelayFiller(MBB, PrevInst))
+ return false;
+
+ const TargetInstrInfo *TII = TM.getInstrInfo();
+
+ switch (PrevInst->getOpcode()) {
+ default: break;
+ case SP::ADDrr:
+ case SP::ADDri: return combineRestoreADD(MBBI, PrevInst, TII); break;
+ case SP::ORrr:
+ case SP::ORri: return combineRestoreOR(MBBI, PrevInst, TII); break;
+ case SP::SETHIi: return combineRestoreSETHIi(MBBI, PrevInst, TII); break;
+ }
+ // It cannot combine with the previous instruction.
+ return false;
+}
diff --git a/lib/Target/Sparc/FPMover.cpp b/lib/Target/Sparc/FPMover.cpp
deleted file mode 100644
index 1325b98..0000000
--- a/lib/Target/Sparc/FPMover.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-//===-- FPMover.cpp - Sparc double-precision floating point move fixer ----===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Expand FpMOVD/FpABSD/FpNEGD instructions into their single-precision pieces.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "fpmover"
-#include "Sparc.h"
-#include "SparcSubtarget.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
-using namespace llvm;
-
-STATISTIC(NumFpDs , "Number of instructions translated");
-STATISTIC(NoopFpDs, "Number of noop instructions removed");
-
-namespace {
- struct FPMover : public MachineFunctionPass {
- /// Target machine description which we query for reg. names, data
- /// layout, etc.
- ///
- TargetMachine &TM;
-
- static char ID;
- explicit FPMover(TargetMachine &tm)
- : MachineFunctionPass(ID), TM(tm) { }
-
- virtual const char *getPassName() const {
- return "Sparc Double-FP Move Fixer";
- }
-
- bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
- bool runOnMachineFunction(MachineFunction &F);
- };
- char FPMover::ID = 0;
-} // end of anonymous namespace
-
-/// createSparcFPMoverPass - Returns a pass that turns FpMOVD
-/// instructions into FMOVS instructions
-///
-FunctionPass *llvm::createSparcFPMoverPass(TargetMachine &tm) {
- return new FPMover(tm);
-}
-
-/// getDoubleRegPair - Given a DFP register, return the even and odd FP
-/// registers that correspond to it.
-static void getDoubleRegPair(unsigned DoubleReg, unsigned &EvenReg,
- unsigned &OddReg) {
- static const uint16_t EvenHalvesOfPairs[] = {
- SP::F0, SP::F2, SP::F4, SP::F6, SP::F8, SP::F10, SP::F12, SP::F14,
- SP::F16, SP::F18, SP::F20, SP::F22, SP::F24, SP::F26, SP::F28, SP::F30
- };
- static const uint16_t OddHalvesOfPairs[] = {
- SP::F1, SP::F3, SP::F5, SP::F7, SP::F9, SP::F11, SP::F13, SP::F15,
- SP::F17, SP::F19, SP::F21, SP::F23, SP::F25, SP::F27, SP::F29, SP::F31
- };
- static const uint16_t DoubleRegsInOrder[] = {
- SP::D0, SP::D1, SP::D2, SP::D3, SP::D4, SP::D5, SP::D6, SP::D7, SP::D8,
- SP::D9, SP::D10, SP::D11, SP::D12, SP::D13, SP::D14, SP::D15
- };
- for (unsigned i = 0; i < array_lengthof(DoubleRegsInOrder); ++i)
- if (DoubleRegsInOrder[i] == DoubleReg) {
- EvenReg = EvenHalvesOfPairs[i];
- OddReg = OddHalvesOfPairs[i];
- return;
- }
- llvm_unreachable("Can't find reg");
-}
-
-/// runOnMachineBasicBlock - Fixup FpMOVD instructions in this MBB.
-///
-bool FPMover::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
- bool Changed = false;
- for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) {
- MachineInstr *MI = I++;
- DebugLoc dl = MI->getDebugLoc();
- if (MI->getOpcode() == SP::FpMOVD || MI->getOpcode() == SP::FpABSD ||
- MI->getOpcode() == SP::FpNEGD) {
- Changed = true;
- unsigned DestDReg = MI->getOperand(0).getReg();
- unsigned SrcDReg = MI->getOperand(1).getReg();
- if (DestDReg == SrcDReg && MI->getOpcode() == SP::FpMOVD) {
- MBB.erase(MI); // Eliminate the noop copy.
- ++NoopFpDs;
- continue;
- }
-
- unsigned EvenSrcReg = 0, OddSrcReg = 0, EvenDestReg = 0, OddDestReg = 0;
- getDoubleRegPair(DestDReg, EvenDestReg, OddDestReg);
- getDoubleRegPair(SrcDReg, EvenSrcReg, OddSrcReg);
-
- const TargetInstrInfo *TII = TM.getInstrInfo();
- if (MI->getOpcode() == SP::FpMOVD)
- MI->setDesc(TII->get(SP::FMOVS));
- else if (MI->getOpcode() == SP::FpNEGD)
- MI->setDesc(TII->get(SP::FNEGS));
- else if (MI->getOpcode() == SP::FpABSD)
- MI->setDesc(TII->get(SP::FABSS));
- else
- llvm_unreachable("Unknown opcode!");
-
- MI->getOperand(0).setReg(EvenDestReg);
- MI->getOperand(1).setReg(EvenSrcReg);
- DEBUG(errs() << "FPMover: the modified instr is: " << *MI);
- // Insert copy for the other half of the double.
- if (DestDReg != SrcDReg) {
- MI = BuildMI(MBB, I, dl, TM.getInstrInfo()->get(SP::FMOVS), OddDestReg)
- .addReg(OddSrcReg);
- DEBUG(errs() << "FPMover: the inserted instr is: " << *MI);
- }
- ++NumFpDs;
- }
- }
- return Changed;
-}
-
-bool FPMover::runOnMachineFunction(MachineFunction &F) {
- // If the target has V9 instructions, the fp-mover pseudos will never be
- // emitted. Avoid a scan of the instructions to improve compile time.
- if (TM.getSubtarget<SparcSubtarget>().isV9())
- return false;
-
- bool Changed = false;
- for (MachineFunction::iterator FI = F.begin(), FE = F.end();
- FI != FE; ++FI)
- Changed |= runOnMachineBasicBlock(*FI);
- return Changed;
-}
diff --git a/lib/Target/Sparc/LLVMBuild.txt b/lib/Target/Sparc/LLVMBuild.txt
index fe20d2f..7d54d32 100644
--- a/lib/Target/Sparc/LLVMBuild.txt
+++ b/lib/Target/Sparc/LLVMBuild.txt
@@ -28,5 +28,6 @@ has_asmprinter = 1
type = Library
name = SparcCodeGen
parent = Sparc
-required_libraries = AsmPrinter CodeGen Core MC SelectionDAG SparcDesc SparcInfo Support Target
+required_libraries = AsmPrinter CodeGen Core MC SelectionDAG SparcDesc
+ SparcInfo Support Target
add_to_library_groups = Sparc
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
index 3d4bfdc..5a52abe 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
@@ -18,7 +18,7 @@ using namespace llvm;
void SparcELFMCAsmInfo::anchor() { }
-SparcELFMCAsmInfo::SparcELFMCAsmInfo(const Target &T, StringRef TT) {
+SparcELFMCAsmInfo::SparcELFMCAsmInfo(StringRef TT) {
IsLittleEndian = false;
Triple TheTriple(TT);
if (TheTriple.getArch() == Triple::sparcv9) {
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
index f0e1354..621e8ff 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
@@ -18,12 +18,11 @@
namespace llvm {
class StringRef;
- class Target;
class SparcELFMCAsmInfo : public MCAsmInfo {
virtual void anchor();
public:
- explicit SparcELFMCAsmInfo(const Target &T, StringRef TT);
+ explicit SparcELFMCAsmInfo(StringRef TT);
};
} // namespace llvm
diff --git a/lib/Target/Sparc/README.txt b/lib/Target/Sparc/README.txt
index b4991fe..34e68cf 100644
--- a/lib/Target/Sparc/README.txt
+++ b/lib/Target/Sparc/README.txt
@@ -38,7 +38,7 @@ t1:
1) should be replaced with a brz in V9 mode.
-* Same as above, but emit conditional move on register zero (p192) in V9
+* Same as above, but emit conditional move on register zero (p192) in V9
mode. Testcase:
int %t1(int %a, int %b) {
@@ -47,13 +47,15 @@ int %t1(int %a, int %b) {
ret int %D
}
-* Emit MULX/[SU]DIVX instructions in V9 mode instead of fiddling
+* Emit MULX/[SU]DIVX instructions in V9 mode instead of fiddling
with the Y register, if they are faster.
* Codegen bswap(load)/store(bswap) -> load/store ASI
-* Implement frame pointer elimination, e.g. eliminate save/restore for
+* Implement frame pointer elimination, e.g. eliminate save/restore for
leaf fns.
* Fill delay slots
* Implement JIT support
+
+* Use %g0 directly to materialize 0. No instruction is required.
diff --git a/lib/Target/Sparc/Sparc.h b/lib/Target/Sparc/Sparc.h
index ce6ae17..98563db 100644
--- a/lib/Target/Sparc/Sparc.h
+++ b/lib/Target/Sparc/Sparc.h
@@ -26,7 +26,6 @@ namespace llvm {
FunctionPass *createSparcISelDag(SparcTargetMachine &TM);
FunctionPass *createSparcDelaySlotFillerPass(TargetMachine &TM);
- FunctionPass *createSparcFPMoverPass(TargetMachine &TM);
} // end namespace llvm;
@@ -51,7 +50,7 @@ namespace llvm {
ICC_NEG = 6 , // Negative
ICC_VC = 15 , // Overflow Clear
ICC_VS = 7 , // Overflow Set
-
+
//FCC_A = 8+16, // Always
//FCC_N = 0+16, // Never
FCC_U = 7+16, // Unordered
@@ -70,7 +69,7 @@ namespace llvm {
FCC_O = 15+16 // Ordered
};
}
-
+
inline static const char *SPARCCondCodeToString(SPCC::CondCodes CC) {
switch (CC) {
case SPCC::ICC_NE: return "ne";
diff --git a/lib/Target/Sparc/Sparc.td b/lib/Target/Sparc/Sparc.td
index 611f8e8..d42c40f 100644
--- a/lib/Target/Sparc/Sparc.td
+++ b/lib/Target/Sparc/Sparc.td
@@ -19,7 +19,7 @@ include "llvm/Target/Target.td"
//===----------------------------------------------------------------------===//
// SPARC Subtarget features.
//
-
+
def FeatureV9
: SubtargetFeature<"v9", "IsV9", "true",
"Enable SPARC-V9 instructions">;
diff --git a/lib/Target/Sparc/SparcAsmPrinter.cpp b/lib/Target/Sparc/SparcAsmPrinter.cpp
index 108eb90..b538d5c 100644
--- a/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -60,7 +60,7 @@ namespace {
raw_ostream &O);
bool printGetPCX(const MachineInstr *MI, unsigned OpNo, raw_ostream &OS);
-
+
virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB)
const;
@@ -120,6 +120,9 @@ void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
case MachineOperand::MO_GlobalAddress:
O << *Mang->getSymbol(MO.getGlobal());
break;
+ case MachineOperand::MO_BlockAddress:
+ O << GetBlockAddressSymbol(MO.getBlockAddress())->getName();
+ break;
case MachineOperand::MO_ExternalSymbol:
O << MO.getSymbolName();
break;
@@ -164,7 +167,7 @@ bool SparcAsmPrinter::printGetPCX(const MachineInstr *MI, unsigned opNum,
case MachineOperand::MO_Register:
assert(TargetRegisterInfo::isPhysicalRegister(MO.getReg()) &&
"Operand is not a physical register ");
- assert(MO.getReg() != SP::O7 &&
+ assert(MO.getReg() != SP::O7 &&
"%o7 is assigned as destination for getpcx!");
operand = "%" + StringRef(getRegisterName(MO.getReg())).lower();
break;
@@ -177,15 +180,15 @@ bool SparcAsmPrinter::printGetPCX(const MachineInstr *MI, unsigned opNum,
O << "\tcall\t.LLGETPC" << mfNum << '_' << bbNum << '\n' ;
O << "\t sethi\t"
- << "%hi(_GLOBAL_OFFSET_TABLE_+(.-.LLGETPCH" << mfNum << '_' << bbNum
+ << "%hi(_GLOBAL_OFFSET_TABLE_+(.-.LLGETPCH" << mfNum << '_' << bbNum
<< ")), " << operand << '\n' ;
O << ".LLGETPC" << mfNum << '_' << bbNum << ":\n" ;
- O << "\tor\t" << operand
+ O << "\tor\t" << operand
<< ", %lo(_GLOBAL_OFFSET_TABLE_+(.-.LLGETPCH" << mfNum << '_' << bbNum
<< ")), " << operand << '\n';
- O << "\tadd\t" << operand << ", %o7, " << operand << '\n';
-
+ O << "\tadd\t" << operand << ", %o7, " << operand << '\n';
+
return true;
}
@@ -243,19 +246,19 @@ isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
// then nothing falls through to it.
if (MBB->isLandingPad() || MBB->pred_empty())
return false;
-
+
// If there isn't exactly one predecessor, it can't be a fall through.
MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(), PI2 = PI;
++PI2;
if (PI2 != MBB->pred_end())
return false;
-
+
// The predecessor has to be immediately before this block.
const MachineBasicBlock *Pred = *PI;
-
+
if (!Pred->isLayoutSuccessor(MBB))
return false;
-
+
// Check if the last terminator is an unconditional branch.
MachineBasicBlock::const_iterator I = Pred->end();
while (I != Pred->begin() && !(--I)->isTerminator())
@@ -273,7 +276,7 @@ getDebugValueLocation(const MachineInstr *MI) const {
}
// Force static initialization.
-extern "C" void LLVMInitializeSparcAsmPrinter() {
+extern "C" void LLVMInitializeSparcAsmPrinter() {
RegisterAsmPrinter<SparcAsmPrinter> X(TheSparcTarget);
RegisterAsmPrinter<SparcAsmPrinter> Y(TheSparcV9Target);
}
diff --git a/lib/Target/Sparc/SparcCallingConv.td b/lib/Target/Sparc/SparcCallingConv.td
index 54784e0..a181bcf 100644
--- a/lib/Target/Sparc/SparcCallingConv.td
+++ b/lib/Target/Sparc/SparcCallingConv.td
@@ -16,7 +16,7 @@
//===----------------------------------------------------------------------===//
def CC_Sparc32 : CallingConv<[
- //Custom assign SRet to [sp+64].
+ // Custom assign SRet to [sp+64].
CCIfSRet<CCCustom<"CC_Sparc_Assign_SRet">>,
// i32 f32 arguments get passed in integer registers if there is space.
CCIfType<[i32, f32], CCAssignToReg<[I0, I1, I2, I3, I4, I5]>>,
diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp
index 7874240..7e91bc3 100644
--- a/lib/Target/Sparc/SparcFrameLowering.cpp
+++ b/lib/Target/Sparc/SparcFrameLowering.cpp
@@ -26,7 +26,16 @@
using namespace llvm;
+static cl::opt<bool>
+DisableLeafProc("disable-sparc-leaf-proc",
+ cl::init(false),
+ cl::desc("Disable Sparc leaf procedure optimization."),
+ cl::Hidden);
+
+
void SparcFrameLowering::emitPrologue(MachineFunction &MF) const {
+ SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
+
MachineBasicBlock &MBB = MF.front();
MachineFrameInfo *MFI = MF.getFrameInfo();
const SparcInstrInfo &TII =
@@ -37,31 +46,18 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF) const {
// Get the number of bytes to allocate from the FrameInfo
int NumBytes = (int) MFI->getStackSize();
- if (SubTarget.is64Bit()) {
- // All 64-bit stack frames must be 16-byte aligned, and must reserve space
- // for spilling the 16 window registers at %sp+BIAS..%sp+BIAS+128.
- NumBytes += 128;
- // Frames with calls must also reserve space for 6 outgoing arguments
- // whether they are used or not. LowerCall_64 takes care of that.
- assert(NumBytes % 16 == 0 && "Stack size not 16-byte aligned");
- } else {
- // Emit the correct save instruction based on the number of bytes in
- // the frame. Minimum stack frame size according to V8 ABI is:
- // 16 words for register window spill
- // 1 word for address of returned aggregate-value
- // + 6 words for passing parameters on the stack
- // ----------
- // 23 words * 4 bytes per word = 92 bytes
- NumBytes += 92;
-
- // Round up to next doubleword boundary -- a double-word boundary
- // is required by the ABI.
- NumBytes = RoundUpToAlignment(NumBytes, 8);
+ unsigned SAVEri = SP::SAVEri;
+ unsigned SAVErr = SP::SAVErr;
+ if (FuncInfo->isLeafProc()) {
+ if (NumBytes == 0)
+ return;
+ SAVEri = SP::ADDri;
+ SAVErr = SP::ADDrr;
}
- NumBytes = -NumBytes;
+ NumBytes = - SubTarget.getAdjustedFrameSize(NumBytes);
if (NumBytes >= -4096) {
- BuildMI(MBB, MBBI, dl, TII.get(SP::SAVEri), SP::O6)
+ BuildMI(MBB, MBBI, dl, TII.get(SAVEri), SP::O6)
.addReg(SP::O6).addImm(NumBytes);
} else {
// Emit this the hard way. This clobbers G1 which we always know is
@@ -71,7 +67,7 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF) const {
// Emit G1 = G1 + I6
BuildMI(MBB, MBBI, dl, TII.get(SP::ORri), SP::G1)
.addReg(SP::G1).addImm(NumBytes & ((1 << 10)-1));
- BuildMI(MBB, MBBI, dl, TII.get(SP::SAVErr), SP::O6)
+ BuildMI(MBB, MBBI, dl, TII.get(SAVErr), SP::O6)
.addReg(SP::O6).addReg(SP::G1);
}
}
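Both this prologue and the epilogue below now delegate the frame-size math to SubTarget.getAdjustedFrameSize(), whose implementation is not part of this diff. The sketch below only reconstructs, from the comments deleted above, what it presumably folds in: on V8, +92 bytes (16 window-spill words, one aggregate-return word and 6 argument words, 4 bytes each) rounded up to 8; on V9, +128 bytes of window-spill area with 16-byte alignment. The names roundUpTo and adjustedFrameSizeSketch are hypothetical.

#include <cstdint>

// Hypothetical reconstruction; the real SparcSubtarget::getAdjustedFrameSize
// may differ in detail.
static int64_t roundUpTo(int64_t v, int64_t align) {
  return (v + align - 1) / align * align;
}

static int64_t adjustedFrameSizeSketch(int64_t frameSize, bool is64Bit) {
  if (is64Bit)
    return roundUpTo(frameSize + 128, 16);  // window-spill area, 16-byte aligned
  return roundUpTo(frameSize + 92, 8);      // 23 words * 4 bytes, 8-byte aligned
}

int main() { return adjustedFrameSizeSketch(40, false) == 136 ? 0 : 1; }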
@@ -97,12 +93,115 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
void SparcFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
+ SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
const SparcInstrInfo &TII =
*static_cast<const SparcInstrInfo*>(MF.getTarget().getInstrInfo());
DebugLoc dl = MBBI->getDebugLoc();
assert(MBBI->getOpcode() == SP::RETL &&
"Can only put epilog before 'retl' instruction!");
- BuildMI(MBB, MBBI, dl, TII.get(SP::RESTORErr), SP::G0).addReg(SP::G0)
- .addReg(SP::G0);
+ if (!FuncInfo->isLeafProc()) {
+ BuildMI(MBB, MBBI, dl, TII.get(SP::RESTORErr), SP::G0).addReg(SP::G0)
+ .addReg(SP::G0);
+ return;
+ }
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ int NumBytes = (int) MFI->getStackSize();
+ if (NumBytes == 0)
+ return;
+
+ NumBytes = SubTarget.getAdjustedFrameSize(NumBytes);
+
+ if (NumBytes < 4096) {
+ BuildMI(MBB, MBBI, dl, TII.get(SP::ADDri), SP::O6)
+ .addReg(SP::O6).addImm(NumBytes);
+ } else {
+ // Emit this the hard way. This clobbers G1 which we always know is
+ // available here.
+ unsigned OffHi = (unsigned)NumBytes >> 10U;
+ BuildMI(MBB, MBBI, dl, TII.get(SP::SETHIi), SP::G1).addImm(OffHi);
+ // Emit G1 = G1 + I6
+ BuildMI(MBB, MBBI, dl, TII.get(SP::ORri), SP::G1)
+ .addReg(SP::G1).addImm(NumBytes & ((1 << 10)-1));
+ BuildMI(MBB, MBBI, dl, TII.get(SP::ADDrr), SP::O6)
+ .addReg(SP::O6).addReg(SP::G1);
+ }
+}
+
+bool SparcFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+ // Reserve call frame if there are no variable sized objects on the stack.
+ return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+// hasFP - Return true if the specified function should have a dedicated frame
+// pointer register. This is true if the function has variable sized allocas or
+// if frame pointer elimination is disabled.
+bool SparcFrameLowering::hasFP(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ return MF.getTarget().Options.DisableFramePointerElim(MF) ||
+ MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken();
+}
+
+
+static bool LLVM_ATTRIBUTE_UNUSED verifyLeafProcRegUse(MachineRegisterInfo *MRI)
+{
+
+ for (unsigned reg = SP::I0; reg <= SP::I7; ++reg)
+ if (MRI->isPhysRegUsed(reg))
+ return false;
+
+ for (unsigned reg = SP::L0; reg <= SP::L7; ++reg)
+ if (MRI->isPhysRegUsed(reg))
+ return false;
+
+ return true;
+}
+
+bool SparcFrameLowering::isLeafProc(MachineFunction &MF) const
+{
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ return !(MFI->hasCalls() // has calls
+ || MRI.isPhysRegUsed(SP::L0) // Too many registers needed
+ || MRI.isPhysRegUsed(SP::O6) // %SP is used
+ || hasFP(MF)); // need %FP
+}
+
+void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const {
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // Remap %i[0-7] to %o[0-7].
+ for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) {
+ if (!MRI.isPhysRegUsed(reg))
+ continue;
+ unsigned mapped_reg = (reg - SP::I0 + SP::O0);
+ assert(!MRI.isPhysRegUsed(mapped_reg));
+
+ // Replace I register with O register.
+ MRI.replaceRegWith(reg, mapped_reg);
+
+ // Mark the reg unused.
+ MRI.setPhysRegUnused(reg);
+ }
+
+ assert(verifyLeafProcRegUse(&MRI));
+#ifdef XDEBUG
+ MF.verify(0, "After LeafProc Remapping");
+#endif
+}
+
+void SparcFrameLowering::processFunctionBeforeCalleeSavedScan
+ (MachineFunction &MF, RegScavenger *RS) const {
+
+ if (!DisableLeafProc && isLeafProc(MF)) {
+ SparcMachineFunctionInfo *MFI = MF.getInfo<SparcMachineFunctionInfo>();
+ MFI->setLeafProc(true);
+
+ remapRegsForLeafProc(MF);
+ }
+
}
diff --git a/lib/Target/Sparc/SparcFrameLowering.h b/lib/Target/Sparc/SparcFrameLowering.h
index c375662..8eaef59 100644
--- a/lib/Target/Sparc/SparcFrameLowering.h
+++ b/lib/Target/Sparc/SparcFrameLowering.h
@@ -38,7 +38,17 @@ public:
MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const;
- bool hasFP(const MachineFunction &MF) const { return false; }
+ bool hasReservedCallFrame(const MachineFunction &MF) const;
+ bool hasFP(const MachineFunction &MF) const;
+ void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS = NULL) const;
+
+private:
+ // Remap input registers to output registers for leaf procedure.
+ void remapRegsForLeafProc(MachineFunction &MF) const;
+
+ // Returns true if MF is a leaf procedure.
+ bool isLeafProc(MachineFunction &MF) const;
};
} // End llvm namespace
diff --git a/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
index a709685..e85cf74 100644
--- a/lib/Target/Sparc/SparcISelDAGToDAG.cpp
+++ b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
@@ -67,13 +67,13 @@ private:
SDNode* SparcDAGToDAGISel::getGlobalBaseReg() {
unsigned GlobalBaseReg = TM.getInstrInfo()->getGlobalBaseReg(MF);
- return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode();
+ return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy()).getNode();
}
bool SparcDAGToDAGISel::SelectADDRri(SDValue Addr,
SDValue &Base, SDValue &Offset) {
if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
- Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), TLI.getPointerTy());
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), TLI->getPointerTy());
Offset = CurDAG->getTargetConstant(0, MVT::i32);
return true;
}
@@ -88,7 +88,7 @@ bool SparcDAGToDAGISel::SelectADDRri(SDValue Addr,
dyn_cast<FrameIndexSDNode>(Addr.getOperand(0))) {
// Constant offset from frame ref.
Base = CurDAG->getTargetFrameIndex(FIN->getIndex(),
- TLI.getPointerTy());
+ TLI->getPointerTy());
} else {
Base = Addr.getOperand(0);
}
@@ -131,12 +131,12 @@ bool SparcDAGToDAGISel::SelectADDRrr(SDValue Addr, SDValue &R1, SDValue &R2) {
}
R1 = Addr;
- R2 = CurDAG->getRegister(SP::G0, TLI.getPointerTy());
+ R2 = CurDAG->getRegister(SP::G0, TLI->getPointerTy());
return true;
}
SDNode *SparcDAGToDAGISel::Select(SDNode *N) {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
if (N->isMachineOpcode())
return NULL; // Already selected.
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index 3863e2c..1d765f2 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -40,7 +40,7 @@ static bool CC_Sparc_Assign_SRet(unsigned &ValNo, MVT &ValVT,
{
assert (ArgFlags.isSRet());
- //Assign SRet argument
+ // Assign SRet argument.
State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
0,
LocVT, LocInfo));
@@ -54,18 +54,18 @@ static bool CC_Sparc_Assign_f64(unsigned &ValNo, MVT &ValVT,
static const uint16_t RegList[] = {
SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
};
- //Try to get first reg
+ // Try to get first reg.
if (unsigned Reg = State.AllocateReg(RegList, 6)) {
State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
} else {
- //Assign whole thing in stack
+ // Assign whole thing in stack.
State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
State.AllocateStack(8,4),
LocVT, LocInfo));
return true;
}
- //Try to get second reg
+ // Try to get second reg.
if (unsigned Reg = State.AllocateReg(RegList, 6))
State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
else
@@ -164,7 +164,7 @@ SparcTargetLowering::LowerReturn(SDValue Chain,
CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc DL, SelectionDAG &DAG) const {
+ SDLoc DL, SelectionDAG &DAG) const {
if (Subtarget->is64Bit())
return LowerReturn_64(Chain, CallConv, IsVarArg, Outs, OutVals, DL, DAG);
return LowerReturn_32(Chain, CallConv, IsVarArg, Outs, OutVals, DL, DAG);
@@ -175,7 +175,7 @@ SparcTargetLowering::LowerReturn_32(SDValue Chain,
CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc DL, SelectionDAG &DAG) const {
+ SDLoc DL, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
// CCValAssign - represent the assignment of the return value to locations.
@@ -206,7 +206,7 @@ SparcTargetLowering::LowerReturn_32(SDValue Chain,
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
- unsigned RetAddrOffset = 8; //Call Inst + Delay Slot
+ unsigned RetAddrOffset = 8; // Call Inst + Delay Slot
// If the function returns a struct, copy the SRetReturnReg to I0
if (MF.getFunction()->hasStructRetAttr()) {
SparcMachineFunctionInfo *SFI = MF.getInfo<SparcMachineFunctionInfo>();
@@ -238,7 +238,7 @@ SparcTargetLowering::LowerReturn_64(SDValue Chain,
CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc DL, SelectionDAG &DAG) const {
+ SDLoc DL, SelectionDAG &DAG) const {
// CCValAssign - represent the assignment of the return value to locations.
SmallVector<CCValAssign, 16> RVLocs;
@@ -314,7 +314,7 @@ LowerFormalArguments(SDValue Chain,
CallingConv::ID CallConv,
bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc DL,
+ SDLoc DL,
SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
if (Subtarget->is64Bit())
@@ -332,7 +332,7 @@ LowerFormalArguments_32(SDValue Chain,
CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl,
+ SDLoc dl,
SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
@@ -351,7 +351,7 @@ LowerFormalArguments_32(SDValue Chain,
CCValAssign &VA = ArgLocs[i];
if (i == 0 && Ins[i].Flags.isSRet()) {
- //Get SRet from [%fp+64]
+ // Get SRet from [%fp+64].
int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, 64, true);
SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
SDValue Arg = DAG.getLoad(MVT::i32, dl, Chain, FIPtr,
@@ -410,7 +410,7 @@ LowerFormalArguments_32(SDValue Chain,
if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::f64);
- //If it is double-word aligned, just load.
+ // If it is double-word aligned, just load.
if (Offset % 8 == 0) {
int FI = MF.getFrameInfo()->CreateFixedObject(8,
Offset,
@@ -470,7 +470,7 @@ LowerFormalArguments_32(SDValue Chain,
}
if (MF.getFunction()->hasStructRetAttr()) {
- //Copy the SRet Argument to SRetReturnReg
+ // Copy the SRet Argument to SRetReturnReg.
SparcMachineFunctionInfo *SFI = MF.getInfo<SparcMachineFunctionInfo>();
unsigned Reg = SFI->getSRetReturnReg();
if (!Reg) {
@@ -532,7 +532,7 @@ LowerFormalArguments_64(SDValue Chain,
CallingConv::ID CallConv,
bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc DL,
+ SDLoc DL,
SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
@@ -653,7 +653,7 @@ SDValue
SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
- DebugLoc &dl = CLI.DL;
+ SDLoc &dl = CLI.DL;
SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
@@ -680,7 +680,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
- //Create local copies for byval args.
+ // Create local copies for byval args.
SmallVector<SDValue, 8> ByValArgs;
for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
ISD::ArgFlagsTy Flags = Outs[i].Flags;
@@ -696,13 +696,14 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
SDValue SizeNode = DAG.getConstant(Size, MVT::i32);
Chain = DAG.getMemcpy(Chain, dl, FIPtr, Arg, SizeNode, Align,
- false, //isVolatile,
- (Size <= 32), //AlwaysInline if size <= 32
+ false, // isVolatile,
+ (Size <= 32), // AlwaysInline if size <= 32
MachinePointerInfo(), MachinePointerInfo());
ByValArgs.push_back(FIPtr);
}
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, true));
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, true),
+ dl);
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
@@ -718,7 +719,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
- //Use local copy if it is a byval arg.
+ // Use local copy if it is a byval arg.
if (Flags.isByVal())
Arg = ByValArgs[byvalArgIdx++];
@@ -758,7 +759,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
if (VA.isMemLoc()) {
unsigned Offset = VA.getLocMemOffset() + StackOffset;
- //if it is double-word aligned, just store.
+ // if it is double-word aligned, just store.
if (Offset % 8 == 0) {
SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
SDValue PtrOff = DAG.getIntPtrConstant(Offset);
@@ -791,7 +792,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
if (NextVA.isRegLoc()) {
RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Lo));
} else {
- //Store the low part in stack.
+ // Store the low part in stack.
unsigned Offset = NextVA.getLocMemOffset() + StackOffset;
SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
SDValue PtrOff = DAG.getIntPtrConstant(Offset);
@@ -886,7 +887,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
InFlag = Chain.getValue(1);
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, true),
- DAG.getIntPtrConstant(0, true), InFlag);
+ DAG.getIntPtrConstant(0, true), InFlag, dl);
InFlag = Chain.getValue(1);
// Assign locations to each value returned by this call.
@@ -979,7 +980,7 @@ SDValue
SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
- DebugLoc DL = CLI.DL;
+ SDLoc DL = CLI.DL;
SDValue Chain = CLI.Chain;
// Analyze operands of the call, assigning locations to each operand.
@@ -1004,7 +1005,8 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
// Adjust the stack pointer to make room for the arguments.
// FIXME: Use hasReservedCallFrame to avoid %sp adjustments around all calls
// with more than 6 arguments.
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, true));
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, true),
+ DL);
// Collect the set of registers to pass to the function and their values.
// This will be emitted as a sequence of CopyToReg nodes glued to the call
@@ -1122,7 +1124,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
// Revert the stack pointer immediately after the call.
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, true),
- DAG.getIntPtrConstant(0, true), InGlue);
+ DAG.getIntPtrConstant(0, true), InGlue, DL);
InGlue = Chain.getValue(1);
// Now extract the return values. This is more or less the same as
@@ -1256,6 +1258,7 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
setOperationAction(ISD::GlobalAddress, getPointerTy(), Custom);
setOperationAction(ISD::GlobalTLSAddress, getPointerTy(), Custom);
setOperationAction(ISD::ConstantPool, getPointerTy(), Custom);
+ setOperationAction(ISD::BlockAddress, getPointerTy(), Custom);
// Sparc doesn't have sext_inreg, replace them with shl/sra
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
@@ -1300,6 +1303,10 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::BITCAST, MVT::f64, Expand);
+ setOperationAction(ISD::BITCAST, MVT::i64, Expand);
+ setOperationAction(ISD::SELECT, MVT::i64, Expand);
+ setOperationAction(ISD::SETCC, MVT::i64, Expand);
setOperationAction(ISD::BR_CC, MVT::i64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
}
@@ -1308,6 +1315,12 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
// on SparcV8 and later.
setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand);
+ if (!Subtarget->isV9()) {
+ // SparcV8 does not have FNEGD and FABSD.
+ setOperationAction(ISD::FNEG, MVT::f64, Custom);
+ setOperationAction(ISD::FABS, MVT::f64, Custom);
+ }
+
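Pre-V9 f64 FNEG/FABS need custom handling because V8 lacks FNEGD and FABSD; the FPMover pass removed elsewhere in this patch used to expand these by operating on the even half of the double register pair. What makes that possible is a bit-layout fact: the sign of an IEEE double lives in the high word, which on SPARC sits in the even register of the pair. A small standalone illustration of that fact in plain C++, not the actual lowering code.

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  double d = 1.5;
  uint64_t bits;
  std::memcpy(&bits, &d, sizeof d);
  bits ^= 1ull << 63;                  // flip only the sign bit in the high word
  double neg;
  std::memcpy(&neg, &bits, sizeof neg);
  assert(neg == -1.5);                 // negation touched nothing but the high word
  return 0;
}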
setOperationAction(ISD::FSIN , MVT::f64, Expand);
setOperationAction(ISD::FCOS , MVT::f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
@@ -1358,7 +1371,7 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
setStackPointerRegisterToSaveRestore(SP::O6);
- if (TM.getSubtarget<SparcSubtarget>().isV9())
+ if (Subtarget->isV9())
setOperationAction(ISD::CTPOP, MVT::i32, Legal);
setMinFunctionAlignment(2);
@@ -1391,11 +1404,12 @@ const char *SparcTargetLowering::getTargetNodeName(unsigned Opcode) const {
/// isMaskedValueZeroForTargetNode - Return true if 'Op & Mask' is known to
/// be zero. Op is expected to be a target specific node. Used by DAG
/// combiner.
-void SparcTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
- APInt &KnownZero,
- APInt &KnownOne,
- const SelectionDAG &DAG,
- unsigned Depth) const {
+void SparcTargetLowering::computeMaskedBitsForTargetNode
+ (const SDValue Op,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth) const {
APInt KnownZero2, KnownOne2;
KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0);
@@ -1444,7 +1458,7 @@ SDValue SparcTargetLowering::withTargetFlags(SDValue Op, unsigned TF,
SelectionDAG &DAG) const {
if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op))
return DAG.getTargetGlobalAddress(GA->getGlobal(),
- GA->getDebugLoc(),
+ SDLoc(GA),
GA->getValueType(0),
GA->getOffset(), TF);
@@ -1454,6 +1468,12 @@ SDValue SparcTargetLowering::withTargetFlags(SDValue Op, unsigned TF,
CP->getAlignment(),
CP->getOffset(), TF);
+ if (const BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(Op))
+ return DAG.getTargetBlockAddress(BA->getBlockAddress(),
+ Op.getValueType(),
+ 0,
+ TF);
+
if (const ExternalSymbolSDNode *ES = dyn_cast<ExternalSymbolSDNode>(Op))
return DAG.getTargetExternalSymbol(ES->getSymbol(),
ES->getValueType(0), TF);
@@ -1466,7 +1486,7 @@ SDValue SparcTargetLowering::withTargetFlags(SDValue Op, unsigned TF,
SDValue SparcTargetLowering::makeHiLoPair(SDValue Op,
unsigned HiTF, unsigned LoTF,
SelectionDAG &DAG) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue Hi = DAG.getNode(SPISD::Hi, DL, VT, withTargetFlags(Op, HiTF, DAG));
SDValue Lo = DAG.getNode(SPISD::Lo, DL, VT, withTargetFlags(Op, LoTF, DAG));
@@ -1476,7 +1496,7 @@ SDValue SparcTargetLowering::makeHiLoPair(SDValue Op,
// Build SDNodes for producing an address from a GlobalAddress, ConstantPool,
// or ExternalSymbol SDNode.
SDValue SparcTargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
EVT VT = getPointerTy();
// Handle PIC mode first.
@@ -1524,8 +1544,13 @@ SDValue SparcTargetLowering::LowerConstantPool(SDValue Op,
return makeAddress(Op, DAG);
}
+SDValue SparcTargetLowering::LowerBlockAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ return makeAddress(Op, DAG);
+}
+
static SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
// Convert the fp value to integer in an FP register.
assert(Op.getValueType() == MVT::i32);
Op = DAG.getNode(SPISD::FTOI, dl, MVT::f32, Op.getOperand(0));
@@ -1533,7 +1558,7 @@ static SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) {
}
static SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
assert(Op.getOperand(0).getValueType() == MVT::i32);
SDValue Tmp = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Op.getOperand(0));
// Convert the int value to FP in an FP register.
@@ -1546,7 +1571,7 @@ static SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) {
SDValue LHS = Op.getOperand(2);
SDValue RHS = Op.getOperand(3);
SDValue Dest = Op.getOperand(4);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
unsigned Opc, SPCC = ~0U;
// If this is a br_cc of a "setcc", and if the setcc got lowered into
@@ -1556,9 +1581,7 @@ static SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) {
// Get the condition flag.
SDValue CompareFlag;
if (LHS.getValueType().isInteger()) {
- EVT VTs[] = { LHS.getValueType(), MVT::Glue };
- SDValue Ops[2] = { LHS, RHS };
- CompareFlag = DAG.getNode(SPISD::CMPICC, dl, VTs, Ops, 2).getValue(1);
+ CompareFlag = DAG.getNode(SPISD::CMPICC, dl, MVT::Glue, LHS, RHS);
if (SPCC == ~0U) SPCC = IntCondCCodeToICC(CC);
// 32-bit compares use the icc flags, 64-bit uses the xcc flags.
Opc = LHS.getValueType() == MVT::i32 ? SPISD::BRICC : SPISD::BRXCC;
@@ -1577,7 +1600,7 @@ static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
SDValue TrueVal = Op.getOperand(2);
SDValue FalseVal = Op.getOperand(3);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
unsigned Opc, SPCC = ~0U;
// If this is a select_cc of a "setcc", and if the setcc got lowered into
@@ -1586,10 +1609,7 @@ static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
SDValue CompareFlag;
if (LHS.getValueType().isInteger()) {
- // subcc returns a value
- EVT VTs[] = { LHS.getValueType(), MVT::Glue };
- SDValue Ops[2] = { LHS, RHS };
- CompareFlag = DAG.getNode(SPISD::CMPICC, dl, VTs, Ops, 2).getValue(1);
+ CompareFlag = DAG.getNode(SPISD::CMPICC, dl, MVT::Glue, LHS, RHS);
Opc = LHS.getValueType() == MVT::i32 ?
SPISD::SELECT_ICC : SPISD::SELECT_XCC;
if (SPCC == ~0U) SPCC = IntCondCCodeToICC(CC);
@@ -1607,9 +1627,12 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
MachineFunction &MF = DAG.getMachineFunction();
SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
+ // Need the frame address to find the address of VarArgsFrameIndex.
+ MF.getFrameInfo()->setFrameAddressIsTaken(true);
+
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
SDValue Offset =
DAG.getNode(ISD::ADD, DL, TLI.getPointerTy(),
DAG.getRegister(SP::I6, TLI.getPointerTy()),
@@ -1626,7 +1649,7 @@ static SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) {
SDValue VAListPtr = Node->getOperand(1);
EVT PtrVT = VAListPtr.getValueType();
const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
- DebugLoc DL = Node->getDebugLoc();
+ SDLoc DL(Node);
SDValue VAList = DAG.getLoad(PtrVT, DL, InChain, VAListPtr,
MachinePointerInfo(SV), false, false, false, 0);
// Increment the pointer, VAList, to the next vaarg.
@@ -1645,7 +1668,7 @@ static SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) {
static SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) {
SDValue Chain = Op.getOperand(0); // Legalize the chain.
SDValue Size = Op.getOperand(1); // Legalize the size.
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
unsigned SPReg = SP::O6;
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, MVT::i32);
@@ -1662,7 +1685,7 @@ static SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) {
static SDValue getFLUSHW(SDValue Op, SelectionDAG &DAG) {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue Chain = DAG.getNode(SPISD::FLUSHW,
dl, MVT::Other, DAG.getEntryNode());
return Chain;
@@ -1673,7 +1696,7 @@ static SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
MFI->setFrameAddressIsTaken(true);
EVT VT = Op.getValueType();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
unsigned FrameReg = SP::I6;
uint64_t depth = Op.getConstantOperandVal(0);
@@ -1704,7 +1727,7 @@ static SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) {
MFI->setReturnAddressIsTaken(true);
EVT VT = Op.getValueType();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
unsigned RetReg = SP::I7;
uint64_t depth = Op.getConstantOperandVal(0);
@@ -1713,6 +1736,9 @@ static SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) {
if (depth == 0)
RetAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, RetReg, VT);
else {
+ // Need the frame address to find the caller's return address.
+ MFI->setFrameAddressIsTaken(true);
+
// flush first to make sure the windowed registers' values are in stack
SDValue Chain = getFLUSHW(Op, DAG);
RetAddr = DAG.getCopyFromReg(Chain, dl, SP::I6, VT);
@@ -1731,15 +1757,48 @@ static SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) {
return RetAddr;
}
+static SDValue LowerF64Op(SDValue Op, SelectionDAG &DAG)
+{
+ SDLoc dl(Op);
+
+ assert(Op.getValueType() == MVT::f64 && "LowerF64Op called on non-double!");
+ assert(Op.getOpcode() == ISD::FNEG || Op.getOpcode() == ISD::FABS);
+
+ // Lower fneg/fabs on f64 to fneg/fabs on f32.
+ // fneg f64 => fneg f32:sub_even, fmov f32:sub_odd.
+ // fabs f64 => fabs f32:sub_even, fmov f32:sub_odd.
+
+ SDValue SrcReg64 = Op.getOperand(0);
+ SDValue Hi32 = DAG.getTargetExtractSubreg(SP::sub_even, dl, MVT::f32,
+ SrcReg64);
+ SDValue Lo32 = DAG.getTargetExtractSubreg(SP::sub_odd, dl, MVT::f32,
+ SrcReg64);
+
+ Hi32 = DAG.getNode(Op.getOpcode(), dl, MVT::f32, Hi32);
+
+ SDValue DstReg64 = SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
+ dl, MVT::f64), 0);
+ DstReg64 = DAG.getTargetInsertSubreg(SP::sub_even, dl, MVT::f64,
+ DstReg64, Hi32);
+ DstReg64 = DAG.getTargetInsertSubreg(SP::sub_odd, dl, MVT::f64,
+ DstReg64, Lo32);
+ return DstReg64;
+}
+
SDValue SparcTargetLowering::
LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Should not custom lower this!");
+
+ case ISD::FNEG:
+ case ISD::FABS: return LowerF64Op(Op, DAG);
+
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
case ISD::GlobalTLSAddress:
llvm_unreachable("TLS not implemented for Sparc.");
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
+ case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
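The FNEG/FABS custom lowering above applies the operation only to the sub_even half of the f64 pair; on big-endian SPARC that even single register holds the sign and exponent word, while the odd half is simply copied. A standalone sketch of the even/odd aliasing this relies on (my illustration, assuming only the D0..D15 definitions in SparcRegisterInfo.td; not part of the patch):

    // Illustrative only: a double register Dn aliases F(2n) (SP::sub_even)
    // and F(2n+1) (SP::sub_odd); the sign bit of the f64 value lives in the
    // even single, so one FNEGS/FABSS plus an FMOVS reproduces the f64 op.
    static unsigned evenSingleOf(unsigned N) { return 2 * N; }      // D15 -> F30
    static unsigned oddSingleOf(unsigned N)  { return 2 * N + 1; }  // D15 -> F31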
diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h
index fd706be..7137171 100644
--- a/lib/Target/Sparc/SparcISelLowering.h
+++ b/lib/Target/Sparc/SparcISelLowering.h
@@ -78,19 +78,19 @@ namespace llvm {
CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
SDValue LowerFormalArguments_32(SDValue Chain,
CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
SDValue LowerFormalArguments_64(SDValue Chain,
CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
virtual SDValue
@@ -106,20 +106,21 @@ namespace llvm {
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc dl, SelectionDAG &DAG) const;
+ SDLoc dl, SelectionDAG &DAG) const;
SDValue LowerReturn_32(SDValue Chain,
CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc DL, SelectionDAG &DAG) const;
+ SDLoc DL, SelectionDAG &DAG) const;
SDValue LowerReturn_64(SDValue Chain,
CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc DL, SelectionDAG &DAG) const;
+ SDLoc DL, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
unsigned getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const;
SDValue withTargetFlags(SDValue Op, unsigned TF, SelectionDAG &DAG) const;
diff --git a/lib/Target/Sparc/SparcInstr64Bit.td b/lib/Target/Sparc/SparcInstr64Bit.td
index 91805f9..47658ee 100644
--- a/lib/Target/Sparc/SparcInstr64Bit.td
+++ b/lib/Target/Sparc/SparcInstr64Bit.td
@@ -59,10 +59,6 @@ defm SRAX : F3_S<"srax", 0b100111, 1, sra, i64, I64Regs>;
// preferable to use a constant pool load instead, depending on the
// microarchitecture.
-// The %g0 register is constant 0.
-// This is useful for stx %g0, [...], for example.
-def : Pat<(i64 0), (i64 G0)>, Requires<[Is64Bit]>;
-
// Single-instruction patterns.
// The ALU instructions want their simm13 operands as i32 immediates.
@@ -164,7 +160,7 @@ def : Pat<(sube i64:$a, i64:$b), (SUBXrr $a, $b)>;
def : Pat<(addc i64:$a, i64:$b), (ADDCCrr $a, $b)>;
def : Pat<(subc i64:$a, i64:$b), (SUBCCrr $a, $b)>;
-def : Pat<(SPcmpicc i64:$a, i64:$b), (SUBCCrr $a, $b)>;
+def : Pat<(SPcmpicc i64:$a, i64:$b), (CMPrr $a, $b)>;
// Register-immediate instructions.
@@ -175,7 +171,7 @@ def : Pat<(xor i64:$a, (i64 simm13:$b)), (XORri $a, (as_i32imm $b))>;
def : Pat<(add i64:$a, (i64 simm13:$b)), (ADDri $a, (as_i32imm $b))>;
def : Pat<(sub i64:$a, (i64 simm13:$b)), (SUBri $a, (as_i32imm $b))>;
-def : Pat<(SPcmpicc i64:$a, (i64 simm13:$b)), (SUBCCri $a, (as_i32imm $b))>;
+def : Pat<(SPcmpicc i64:$a, (i64 simm13:$b)), (CMPri $a, (as_i32imm $b))>;
} // Predicates = [Is64Bit]
@@ -243,6 +239,11 @@ def LDXri : F3_2<3, 0b001011,
[(set i64:$dst, (load ADDRri:$addr))]>;
// Extending loads to i64.
+def : Pat<(i64 (zextloadi1 ADDRrr:$addr)), (LDUBrr ADDRrr:$addr)>;
+def : Pat<(i64 (zextloadi1 ADDRri:$addr)), (LDUBri ADDRri:$addr)>;
+def : Pat<(i64 (extloadi1 ADDRrr:$addr)), (LDUBrr ADDRrr:$addr)>;
+def : Pat<(i64 (extloadi1 ADDRri:$addr)), (LDUBri ADDRri:$addr)>;
+
def : Pat<(i64 (zextloadi8 ADDRrr:$addr)), (LDUBrr ADDRrr:$addr)>;
def : Pat<(i64 (zextloadi8 ADDRri:$addr)), (LDUBri ADDRri:$addr)>;
def : Pat<(i64 (extloadi8 ADDRrr:$addr)), (LDUBrr ADDRrr:$addr)>;
@@ -290,6 +291,10 @@ def : Pat<(truncstorei16 i64:$src, ADDRri:$addr), (STHri ADDRri:$addr, $src)>;
def : Pat<(truncstorei32 i64:$src, ADDRrr:$addr), (STrr ADDRrr:$addr, $src)>;
def : Pat<(truncstorei32 i64:$src, ADDRri:$addr), (STri ADDRri:$addr, $src)>;
+// store 0, addr -> store %g0, addr
+def : Pat<(store (i64 0), ADDRrr:$dst), (STXrr ADDRrr:$dst, (i64 G0))>;
+def : Pat<(store (i64 0), ADDRri:$dst), (STXri ADDRri:$dst, (i64 G0))>;
+
} // Predicates = [Is64Bit]
@@ -308,7 +313,7 @@ let Predicates = [Is64Bit] in {
let Uses = [ICC] in
def BPXCC : BranchSP<0, (ins brtarget:$dst, CCOp:$cc),
- "bp$cc %xcc, $dst",
+ "b$cc %xcc, $dst",
[(SPbrxcc bb:$dst, imm:$cc)]>;
// Conditional moves on %xcc.
@@ -322,7 +327,17 @@ def MOVXCCri : Pseudo<(outs IntRegs:$rd),
(ins i32imm:$i, IntRegs:$f, CCOp:$cond),
"mov$cond %xcc, $i, $rd",
[(set i32:$rd,
- (SPselecticc simm11:$i, i32:$f, imm:$cond))]>;
+ (SPselectxcc simm11:$i, i32:$f, imm:$cond))]>;
+def FMOVS_XCC : Pseudo<(outs FPRegs:$rd),
+ (ins FPRegs:$rs2, FPRegs:$f, CCOp:$cond),
+ "fmovs$cond %xcc, $rs2, $rd",
+ [(set f32:$rd,
+ (SPselectxcc f32:$rs2, f32:$f, imm:$cond))]>;
+def FMOVD_XCC : Pseudo<(outs DFPRegs:$rd),
+ (ins DFPRegs:$rs2, DFPRegs:$f, CCOp:$cond),
+ "fmovd$cond %xcc, $rs2, $rd",
+ [(set f64:$rd,
+ (SPselectxcc f64:$rs2, f64:$f, imm:$cond))]>;
} // Uses, Constraints
def : Pat<(SPselectxcc i64:$t, i64:$f, imm:$cond),
@@ -330,4 +345,14 @@ def : Pat<(SPselectxcc i64:$t, i64:$f, imm:$cond),
def : Pat<(SPselectxcc (i64 simm11:$t), i64:$f, imm:$cond),
(MOVXCCri (as_i32imm $t), $f, imm:$cond)>;
+def : Pat<(SPselecticc i64:$t, i64:$f, imm:$cond),
+ (MOVICCrr $t, $f, imm:$cond)>;
+def : Pat<(SPselecticc (i64 simm11:$t), i64:$f, imm:$cond),
+ (MOVICCri (as_i32imm $t), $f, imm:$cond)>;
+
+def : Pat<(SPselectfcc i64:$t, i64:$f, imm:$cond),
+ (MOVFCCrr $t, $f, imm:$cond)>;
+def : Pat<(SPselectfcc (i64 simm11:$t), i64:$f, imm:$cond),
+ (MOVFCCri (as_i32imm $t), $f, imm:$cond)>;
+
} // Predicates = [Is64Bit]
diff --git a/lib/Target/Sparc/SparcInstrFormats.td b/lib/Target/Sparc/SparcInstrFormats.td
index e7fde08..6cdf6bc 100644
--- a/lib/Target/Sparc/SparcInstrFormats.td
+++ b/lib/Target/Sparc/SparcInstrFormats.td
@@ -7,14 +7,15 @@
//
//===----------------------------------------------------------------------===//
-class InstSP<dag outs, dag ins, string asmstr, list<dag> pattern> : Instruction {
+class InstSP<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : Instruction {
field bits<32> Inst;
let Namespace = "SP";
bits<2> op;
let Inst{31-30} = op; // Top two bits are the 'op' field
-
+
dag OutOperandList = outs;
dag InOperandList = ins;
let AsmString = asmstr;
@@ -46,7 +47,7 @@ class F2_1<bits<3> op2Val, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{29-25} = rd;
}
-class F2_2<bits<4> condVal, bits<3> op2Val, dag outs, dag ins, string asmstr,
+class F2_2<bits<4> condVal, bits<3> op2Val, dag outs, dag ins, string asmstr,
list<dag> pattern> : F2<outs, ins, asmstr, pattern> {
bits<4> cond;
bit annul = 0; // currently unused
@@ -88,7 +89,7 @@ class F3_1<bits<2> opVal, bits<6> op3val, dag outs, dag ins,
let Inst{4-0} = rs2;
}
-class F3_2<bits<2> opVal, bits<6> op3val, dag outs, dag ins,
+class F3_2<bits<2> opVal, bits<6> op3val, dag outs, dag ins,
string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
bits<13> simm13;
diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp
index 39d7329..626bc40 100644
--- a/lib/Target/Sparc/SparcInstrInfo.cpp
+++ b/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -29,7 +29,7 @@ using namespace llvm;
SparcInstrInfo::SparcInstrInfo(SparcSubtarget &ST)
: SparcGenInstrInfo(SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP),
- RI(ST, *this), Subtarget(ST) {
+ RI(ST), Subtarget(ST) {
}
/// isLoadFromStackSlot - If the specified machine instruction is a direct
@@ -40,6 +40,7 @@ SparcInstrInfo::SparcInstrInfo(SparcSubtarget &ST)
unsigned SparcInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
int &FrameIndex) const {
if (MI->getOpcode() == SP::LDri ||
+ MI->getOpcode() == SP::LDXri ||
MI->getOpcode() == SP::LDFri ||
MI->getOpcode() == SP::LDDFri) {
if (MI->getOperand(1).isFI() && MI->getOperand(2).isImm() &&
@@ -59,6 +60,7 @@ unsigned SparcInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
unsigned SparcInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
int &FrameIndex) const {
if (MI->getOpcode() == SP::STri ||
+ MI->getOpcode() == SP::STXri ||
MI->getOpcode() == SP::STFri ||
MI->getOpcode() == SP::STDFri) {
if (MI->getOperand(0).isFI() && MI->getOperand(1).isImm() &&
@@ -139,15 +141,15 @@ bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
if (I->isDebugValue())
continue;
- //When we see a non-terminator, we are done
+ // When we see a non-terminator, we are done.
if (!isUnpredicatedTerminator(I))
break;
- //Terminator is not a branch
+ // Terminator is not a branch.
if (!I->isBranch())
return true;
- //Handle Unconditional branches
+ // Handle Unconditional branches.
if (I->getOpcode() == SP::BA) {
UnCondBrIter = I;
@@ -176,7 +178,7 @@ bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
unsigned Opcode = I->getOpcode();
if (Opcode != SP::BCOND && Opcode != SP::FBCOND)
- return true; //Unknown Opcode
+ return true; // Unknown Opcode.
SPCC::CondCodes BranchCode = (SPCC::CondCodes)I->getOperand(1).getImm();
@@ -185,7 +187,7 @@ bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
if (AllowModify && UnCondBrIter != MBB.end() &&
MBB.isLayoutSuccessor(TargetBB)) {
- //Transform the code
+ // Transform the code
//
// brCC L1
// ba L2
@@ -219,8 +221,8 @@ bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
Cond.push_back(MachineOperand::CreateImm(BranchCode));
continue;
}
- //FIXME: Handle subsequent conditional branches
- //For now, we can't handle multiple conditional branches
+ // FIXME: Handle subsequent conditional branches.
+ // For now, we can't handle multiple conditional branches.
return true;
}
return false;
@@ -241,7 +243,7 @@ SparcInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
return 1;
}
- //Conditional branch
+ // Conditional branch
unsigned CC = Cond[0].getImm();
if (IsIntegerCC(CC))
@@ -287,10 +289,28 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
else if (SP::FPRegsRegClass.contains(DestReg, SrcReg))
BuildMI(MBB, I, DL, get(SP::FMOVS), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
- else if (SP::DFPRegsRegClass.contains(DestReg, SrcReg))
- BuildMI(MBB, I, DL, get(Subtarget.isV9() ? SP::FMOVD : SP::FpMOVD), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- else
+ else if (SP::DFPRegsRegClass.contains(DestReg, SrcReg)) {
+ if (Subtarget.isV9()) {
+ BuildMI(MBB, I, DL, get(SP::FMOVD), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ } else {
+ // Use two FMOVS instructions.
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MachineInstr *MovMI = 0;
+ unsigned subRegIdx[] = {SP::sub_even, SP::sub_odd};
+ for (unsigned i = 0; i != 2; ++i) {
+ unsigned Dst = TRI->getSubReg(DestReg, subRegIdx[i]);
+ unsigned Src = TRI->getSubReg(SrcReg, subRegIdx[i]);
+ assert(Dst && Src && "Bad sub-register");
+
+ MovMI = BuildMI(MBB, I, DL, get(SP::FMOVS), Dst).addReg(Src);
+ }
+ // Add implicit super-register defs and kills to the last MovMI.
+ MovMI->addRegisterDefined(DestReg, TRI);
+ if (KillSrc)
+ MovMI->addRegisterKilled(SrcReg, TRI);
+ }
+ } else
llvm_unreachable("Impossible reg-to-reg copy");
}
@@ -303,7 +323,10 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
if (I != MBB.end()) DL = I->getDebugLoc();
// On the order of operands here: think "[FrameIdx + 0] = SrcReg".
- if (RC == &SP::IntRegsRegClass)
+ if (RC == &SP::I64RegsRegClass)
+ BuildMI(MBB, I, DL, get(SP::STXri)).addFrameIndex(FI).addImm(0)
+ .addReg(SrcReg, getKillRegState(isKill));
+ else if (RC == &SP::IntRegsRegClass)
BuildMI(MBB, I, DL, get(SP::STri)).addFrameIndex(FI).addImm(0)
.addReg(SrcReg, getKillRegState(isKill));
else if (RC == &SP::FPRegsRegClass)
@@ -324,7 +347,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
DebugLoc DL;
if (I != MBB.end()) DL = I->getDebugLoc();
- if (RC == &SP::IntRegsRegClass)
+ if (RC == &SP::I64RegsRegClass)
+ BuildMI(MBB, I, DL, get(SP::LDXri), DestReg).addFrameIndex(FI).addImm(0);
+ else if (RC == &SP::IntRegsRegClass)
BuildMI(MBB, I, DL, get(SP::LDri), DestReg).addFrameIndex(FI).addImm(0);
else if (RC == &SP::FPRegsRegClass)
BuildMI(MBB, I, DL, get(SP::LDFri), DestReg).addFrameIndex(FI).addImm(0);
diff --git a/lib/Target/Sparc/SparcInstrInfo.h b/lib/Target/Sparc/SparcInstrInfo.h
index 204f698..a0a0ffd8 100644
--- a/lib/Target/Sparc/SparcInstrInfo.h
+++ b/lib/Target/Sparc/SparcInstrInfo.h
@@ -53,7 +53,7 @@ public:
/// any side effects other than loading from the stack slot.
virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
int &FrameIndex) const;
-
+
/// isStoreToStackSlot - If the specified machine instruction is a direct
/// store to a stack slot, return the virtual or physical register number of
/// the source reg along with the FrameIndex of the loaded stack slot. If
@@ -86,7 +86,7 @@ public:
MachineBasicBlock::iterator I, DebugLoc DL,
unsigned DestReg, unsigned SrcReg,
bool KillSrc) const;
-
+
virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned SrcReg, bool isKill, int FrameIndex,
@@ -98,7 +98,7 @@ public:
unsigned DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const;
-
+
unsigned getGlobalBaseReg(MachineFunction *MF) const;
};
diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td
index baefb06..d4cac4d 100644
--- a/lib/Target/Sparc/SparcInstrInfo.td
+++ b/lib/Target/Sparc/SparcInstrInfo.td
@@ -89,9 +89,11 @@ def calltarget : Operand<i32>;
let PrintMethod = "printCCOperand" in
def CCOp : Operand<i32>;
-def SDTSPcmpfcc :
+def SDTSPcmpicc :
+SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>]>;
+def SDTSPcmpfcc :
SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisSameAs<0, 1>]>;
-def SDTSPbrcc :
+def SDTSPbrcc :
SDTypeProfile<0, 2, [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>]>;
def SDTSPselectcc :
SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i32>]>;
@@ -100,7 +102,7 @@ SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisFP<1>]>;
def SDTSPITOF :
SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f32>]>;
-def SPcmpicc : SDNode<"SPISD::CMPICC", SDTIntBinOp, [SDNPOutGlue]>;
+def SPcmpicc : SDNode<"SPISD::CMPICC", SDTSPcmpicc, [SDNPOutGlue]>;
def SPcmpfcc : SDNode<"SPISD::CMPFCC", SDTSPcmpfcc, [SDNPOutGlue]>;
def SPbricc : SDNode<"SPISD::BRICC", SDTSPbrcc, [SDNPHasChain, SDNPInGlue]>;
def SPbrxcc : SDNode<"SPISD::BRXCC", SDTSPbrcc, [SDNPHasChain, SDNPInGlue]>;
@@ -186,7 +188,7 @@ def FCC_O : FCC_VAL<29>; // Ordered
/// F3_12 multiclass - Define a normal F3_1/F3_2 pattern in one shot.
multiclass F3_12<string OpcStr, bits<6> Op3Val, SDNode OpNode> {
- def rr : F3_1<2, Op3Val,
+ def rr : F3_1<2, Op3Val,
(outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
!strconcat(OpcStr, " $b, $c, $dst"),
[(set i32:$dst, (OpNode i32:$b, i32:$c))]>;
@@ -199,7 +201,7 @@ multiclass F3_12<string OpcStr, bits<6> Op3Val, SDNode OpNode> {
/// F3_12np multiclass - Define a normal F3_1/F3_2 pattern in one shot, with no
/// pattern.
multiclass F3_12np<string OpcStr, bits<6> Op3Val> {
- def rr : F3_1<2, Op3Val,
+ def rr : F3_1<2, Op3Val,
(outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
!strconcat(OpcStr, " $b, $c, $dst"), []>;
def ri : F3_2<2, Op3Val,
@@ -243,24 +245,11 @@ let hasSideEffects = 1, mayStore = 1 in {
def UNIMP : F2_1<0b000, (outs), (ins i32imm:$val),
"unimp $val", []>;
-// FpMOVD/FpNEGD/FpABSD - These are lowered to single-precision ops by the
-// fpmover pass.
-let Predicates = [HasNoV9] in { // Only emit these in V8 mode.
- def FpMOVD : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$src),
- "!FpMOVD $src, $dst", []>;
- def FpNEGD : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$src),
- "!FpNEGD $src, $dst",
- [(set f64:$dst, (fneg f64:$src))]>;
- def FpABSD : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$src),
- "!FpABSD $src, $dst",
- [(set f64:$dst, (fabs f64:$src))]>;
-}
-
// SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after
// instruction selection into a branch sequence. This has to handle all
// permutations of selection between i32/f32/f64 on ICC and FCC.
- // Expanded after instruction selection.
-let Uses = [ICC], usesCustomInserter = 1 in {
+// Expanded after instruction selection.
+let Uses = [ICC], usesCustomInserter = 1 in {
def SELECT_CC_Int_ICC
: Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, IntRegs:$F, i32imm:$Cond),
"; SELECT_CC_Int_ICC PSEUDO!",
@@ -463,9 +452,9 @@ defm ADD : F3_12<"add", 0b000000, add>;
def LEA_ADDri : F3_2<2, 0b000000,
(outs IntRegs:$dst), (ins MEMri:$addr),
"add ${addr:arith}, $dst",
- [(set i32:$dst, ADDRri:$addr)]>;
+ [(set iPTR:$dst, ADDRri:$addr)]>;
-let Defs = [ICC] in
+let Defs = [ICC] in
defm ADDCC : F3_12<"addcc", 0b010000, addc>;
let Uses = [ICC] in
@@ -473,14 +462,24 @@ let Uses = [ICC] in
// Section B.15 - Subtract Instructions, p. 110
defm SUB : F3_12 <"sub" , 0b000100, sub>;
-let Uses = [ICC] in
+let Uses = [ICC] in
defm SUBX : F3_12 <"subx" , 0b001100, sube>;
-let Defs = [ICC] in
- defm SUBCC : F3_12 <"subcc", 0b010100, SPcmpicc>;
+let Defs = [ICC] in {
+ defm SUBCC : F3_12 <"subcc", 0b010100, subc>;
+
+ def CMPrr : F3_1<2, 0b010100,
+ (outs), (ins IntRegs:$b, IntRegs:$c),
+ "cmp $b, $c",
+ [(SPcmpicc i32:$b, i32:$c)]>;
+ def CMPri : F3_1<2, 0b010100,
+ (outs), (ins IntRegs:$b, i32imm:$c),
+ "cmp $b, $c",
+ [(SPcmpicc i32:$b, (i32 simm13:$c))]>;
+}
let Uses = [ICC], Defs = [ICC] in
- def SUBXCCrr: F3_1<2, 0b011100,
+ def SUBXCCrr: F3_1<2, 0b011100,
(outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
"subxcc $b, $c, $dst", []>;
@@ -516,6 +515,20 @@ let isBarrier = 1 in
"ba $dst",
[(br bb:$dst)]>;
+// Indirect branch instructions.
+let isTerminator = 1, isBarrier = 1,
+ hasDelaySlot = 1, isBranch = 1,
+ isIndirectBranch = 1 in {
+ def BINDrr : F3_1<2, 0b111000,
+ (outs), (ins MEMrr:$ptr),
+ "jmp $ptr",
+ [(brind ADDRrr:$ptr)]>;
+ def BINDri : F3_2<2, 0b111000,
+ (outs), (ins MEMri:$ptr),
+ "jmp $ptr",
+ [(brind ADDRri:$ptr)]>;
+}
+
// FIXME: the encoding for the JIT should look at the condition field.
let Uses = [ICC] in
def BCOND : BranchSP<0, (ins brtarget:$dst, CCOp:$cc),
@@ -553,7 +566,7 @@ let Uses = [O6],
let op = 1;
let Inst{29-0} = disp;
}
-
+
// indirect calls
def JMPLrr : F3_1<2, 0b111000,
(outs), (ins MEMrr:$ptr, variable_ops),
@@ -566,7 +579,7 @@ let Uses = [O6],
}
// Section B.28 - Read State Register Instructions
-let Uses = [Y] in
+let Uses = [Y] in
def RDY : F3_1<2, 0b101000,
(outs IntRegs:$dst), (ins),
"rd %y, $dst", []>;
@@ -585,7 +598,7 @@ def FITOS : F3_3<2, 0b110100, 0b011000100,
(outs FPRegs:$dst), (ins FPRegs:$src),
"fitos $src, $dst",
[(set FPRegs:$dst, (SPitof FPRegs:$src))]>;
-def FITOD : F3_3<2, 0b110100, 0b011001000,
+def FITOD : F3_3<2, 0b110100, 0b011001000,
(outs DFPRegs:$dst), (ins FPRegs:$src),
"fitod $src, $dst",
[(set DFPRegs:$dst, (SPitof FPRegs:$src))]>;
@@ -601,7 +614,7 @@ def FDTOI : F3_3<2, 0b110100, 0b011010010,
[(set FPRegs:$dst, (SPftoi DFPRegs:$src))]>;
// Convert between Floating-point Formats Instructions, p. 143
-def FSTOD : F3_3<2, 0b110100, 0b011001001,
+def FSTOD : F3_3<2, 0b110100, 0b011001001,
(outs DFPRegs:$dst), (ins FPRegs:$src),
"fstod $src, $dst",
[(set f64:$dst, (fextend f32:$src))]>;
@@ -614,22 +627,22 @@ def FDTOS : F3_3<2, 0b110100, 0b011000110,
def FMOVS : F3_3<2, 0b110100, 0b000000001,
(outs FPRegs:$dst), (ins FPRegs:$src),
"fmovs $src, $dst", []>;
-def FNEGS : F3_3<2, 0b110100, 0b000000101,
+def FNEGS : F3_3<2, 0b110100, 0b000000101,
(outs FPRegs:$dst), (ins FPRegs:$src),
"fnegs $src, $dst",
[(set f32:$dst, (fneg f32:$src))]>;
-def FABSS : F3_3<2, 0b110100, 0b000001001,
+def FABSS : F3_3<2, 0b110100, 0b000001001,
(outs FPRegs:$dst), (ins FPRegs:$src),
"fabss $src, $dst",
[(set f32:$dst, (fabs f32:$src))]>;
// Floating-point Square Root Instructions, p.145
-def FSQRTS : F3_3<2, 0b110100, 0b000101001,
+def FSQRTS : F3_3<2, 0b110100, 0b000101001,
(outs FPRegs:$dst), (ins FPRegs:$src),
"fsqrts $src, $dst",
[(set f32:$dst, (fsqrt f32:$src))]>;
-def FSQRTD : F3_3<2, 0b110100, 0b000101010,
+def FSQRTD : F3_3<2, 0b110100, 0b000101010,
(outs DFPRegs:$dst), (ins DFPRegs:$src),
"fsqrtd $src, $dst",
[(set f64:$dst, (fsqrt f64:$src))]>;
@@ -698,52 +711,51 @@ let Defs = [FCC] in {
//===----------------------------------------------------------------------===//
// V9 Conditional Moves.
-let Predicates = [HasV9], Constraints = "$T = $dst" in {
+let Predicates = [HasV9], Constraints = "$f = $rd" in {
// Move Integer Register on Condition (MOVcc) p. 194 of the V9 manual.
// FIXME: Add instruction encodings for the JIT some day.
let Uses = [ICC] in {
def MOVICCrr
- : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, IntRegs:$F, CCOp:$cc),
- "mov$cc %icc, $F, $dst",
- [(set i32:$dst, (SPselecticc i32:$F, i32:$T, imm:$cc))]>;
+ : Pseudo<(outs IntRegs:$rd), (ins IntRegs:$rs2, IntRegs:$f, CCOp:$cc),
+ "mov$cc %icc, $rs2, $rd",
+ [(set i32:$rd, (SPselecticc i32:$rs2, i32:$f, imm:$cc))]>;
def MOVICCri
- : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, i32imm:$F, CCOp:$cc),
- "mov$cc %icc, $F, $dst",
- [(set i32:$dst, (SPselecticc simm11:$F, i32:$T, imm:$cc))]>;
+ : Pseudo<(outs IntRegs:$rd), (ins i32imm:$i, IntRegs:$f, CCOp:$cc),
+ "mov$cc %icc, $i, $rd",
+ [(set i32:$rd, (SPselecticc simm11:$i, i32:$f, imm:$cc))]>;
}
let Uses = [FCC] in {
def MOVFCCrr
- : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, IntRegs:$F, CCOp:$cc),
- "mov$cc %fcc0, $F, $dst",
- [(set i32:$dst, (SPselectfcc i32:$F, i32:$T, imm:$cc))]>;
+ : Pseudo<(outs IntRegs:$rd), (ins IntRegs:$rs2, IntRegs:$f, CCOp:$cc),
+ "mov$cc %fcc0, $rs2, $rd",
+ [(set i32:$rd, (SPselectfcc i32:$rs2, i32:$f, imm:$cc))]>;
def MOVFCCri
- : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, i32imm:$F, CCOp:$cc),
- "mov$cc %fcc0, $F, $dst",
- [(set i32:$dst, (SPselectfcc simm11:$F, i32:$T, imm:$cc))]>;
+ : Pseudo<(outs IntRegs:$rd), (ins i32imm:$i, IntRegs:$f, CCOp:$cc),
+ "mov$cc %fcc0, $i, $rd",
+ [(set i32:$rd, (SPselectfcc simm11:$i, i32:$f, imm:$cc))]>;
}
let Uses = [ICC] in {
def FMOVS_ICC
- : Pseudo<(outs FPRegs:$dst), (ins FPRegs:$T, FPRegs:$F, CCOp:$cc),
- "fmovs$cc %icc, $F, $dst",
- [(set f32:$dst,
- (SPselecticc f32:$F, f32:$T, imm:$cc))]>;
+ : Pseudo<(outs FPRegs:$rd), (ins FPRegs:$rs2, FPRegs:$f, CCOp:$cc),
+ "fmovs$cc %icc, $rs2, $rd",
+ [(set f32:$rd, (SPselecticc f32:$rs2, f32:$f, imm:$cc))]>;
def FMOVD_ICC
- : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$T, DFPRegs:$F, CCOp:$cc),
- "fmovd$cc %icc, $F, $dst",
- [(set f64:$dst, (SPselecticc f64:$F, f64:$T, imm:$cc))]>;
+ : Pseudo<(outs DFPRegs:$rd), (ins DFPRegs:$rs2, DFPRegs:$f, CCOp:$cc),
+ "fmovd$cc %icc, $rs2, $rd",
+ [(set f64:$rd, (SPselecticc f64:$rs2, f64:$f, imm:$cc))]>;
}
let Uses = [FCC] in {
def FMOVS_FCC
- : Pseudo<(outs FPRegs:$dst), (ins FPRegs:$T, FPRegs:$F, CCOp:$cc),
- "fmovs$cc %fcc0, $F, $dst",
- [(set f32:$dst, (SPselectfcc f32:$F, f32:$T, imm:$cc))]>;
+ : Pseudo<(outs FPRegs:$rd), (ins FPRegs:$rs2, FPRegs:$f, CCOp:$cc),
+ "fmovs$cc %fcc0, $rs2, $rd",
+ [(set f32:$rd, (SPselectfcc f32:$rs2, f32:$f, imm:$cc))]>;
def FMOVD_FCC
- : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$T, DFPRegs:$F, CCOp:$cc),
- "fmovd$cc %fcc0, $F, $dst",
- [(set f64:$dst, (SPselectfcc f64:$F, f64:$T, imm:$cc))]>;
+ : Pseudo<(outs DFPRegs:$rd), (ins DFPRegs:$rs2, DFPRegs:$f, CCOp:$cc),
+ "fmovd$cc %fcc0, $rs2, $rd",
+ [(set f64:$rd, (SPselectfcc f64:$rs2, f64:$f, imm:$cc))]>;
}
}
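A note on the tied-operand constraint introduced above ("$f = $rd"): the V9 conditional moves write $rd only when the condition holds, so the register carrying the "false" value has to be the destination itself. A minimal sketch of the resulting semantics (illustrative, not backend code):

    // movcc-style semantics with the "$f = $rd" tie: rd is pre-loaded with
    // the false value and only overwritten by rs2 when the condition is true.
    static int movcc(bool cond, int rs2, int rd) {
      return cond ? rs2 : rd;
    }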
@@ -753,11 +765,11 @@ let Predicates = [HasV9] in {
def FMOVD : F3_3<2, 0b110100, 0b000000010,
(outs DFPRegs:$dst), (ins DFPRegs:$src),
"fmovd $src, $dst", []>;
- def FNEGD : F3_3<2, 0b110100, 0b000000110,
+ def FNEGD : F3_3<2, 0b110100, 0b000000110,
(outs DFPRegs:$dst), (ins DFPRegs:$src),
"fnegd $src, $dst",
[(set f64:$dst, (fneg f64:$src))]>;
- def FABSD : F3_3<2, 0b110100, 0b000001010,
+ def FABSD : F3_3<2, 0b110100, 0b000001010,
(outs DFPRegs:$dst), (ins DFPRegs:$src),
"fabsd $src, $dst",
[(set f64:$dst, (fabs f64:$src))]>;
@@ -765,7 +777,7 @@ let Predicates = [HasV9] in {
// POPCrr - This does a ctpop of a 64-bit register. As such, we have to clear
// the top 32-bits before using it. To do this clearing, we use a SLLri X,0.
-def POPCrr : F3_1<2, 0b101110,
+def POPCrr : F3_1<2, 0b101110,
(outs IntRegs:$dst), (ins IntRegs:$src),
"popc $src, $dst", []>, Requires<[HasV9]>;
def : Pat<(ctpop i32:$src),
@@ -782,11 +794,6 @@ def : Pat<(i32 simm13:$val),
def : Pat<(i32 imm:$val),
(ORri (SETHIi (HI22 imm:$val)), (LO10 imm:$val))>;
-// subc
-def : Pat<(subc i32:$b, i32:$c),
- (SUBCCrr $b, $c)>;
-def : Pat<(subc i32:$b, simm13:$val),
- (SUBCCri $b, imm:$val)>;
// Global addresses, constant pool entries
def : Pat<(SPhi tglobaladdr:$in), (SETHIi tglobaladdr:$in)>;
@@ -794,11 +801,17 @@ def : Pat<(SPlo tglobaladdr:$in), (ORri (i32 G0), tglobaladdr:$in)>;
def : Pat<(SPhi tconstpool:$in), (SETHIi tconstpool:$in)>;
def : Pat<(SPlo tconstpool:$in), (ORri (i32 G0), tconstpool:$in)>;
+// Blockaddress
+def : Pat<(SPhi tblockaddress:$in), (SETHIi tblockaddress:$in)>;
+def : Pat<(SPlo tblockaddress:$in), (ORri (i32 G0), tblockaddress:$in)>;
+
// Add reg, lo. This is used when taking the addr of a global/constpool entry.
def : Pat<(add iPTR:$r, (SPlo tglobaladdr:$in)), (ADDri $r, tglobaladdr:$in)>;
def : Pat<(add iPTR:$r, (SPlo tconstpool:$in)), (ADDri $r, tconstpool:$in)>;
+def : Pat<(add iPTR:$r, (SPlo tblockaddress:$in)),
+ (ADDri $r, tblockaddress:$in)>;
-// Calls:
+// Calls:
def : Pat<(call tglobaladdr:$dst),
(CALL tglobaladdr:$dst)>;
def : Pat<(call texternalsym:$dst),
@@ -816,4 +829,8 @@ def : Pat<(i32 (extloadi16 ADDRri:$src)), (LDUHri ADDRri:$src)>;
def : Pat<(i32 (zextloadi1 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
def : Pat<(i32 (zextloadi1 ADDRri:$src)), (LDUBri ADDRri:$src)>;
+// store 0, addr -> store %g0, addr
+def : Pat<(store (i32 0), ADDRrr:$dst), (STrr ADDRrr:$dst, (i32 G0))>;
+def : Pat<(store (i32 0), ADDRri:$dst), (STri ADDRri:$dst, (i32 G0))>;
+
include "SparcInstr64Bit.td"
diff --git a/lib/Target/Sparc/SparcMachineFunctionInfo.h b/lib/Target/Sparc/SparcMachineFunctionInfo.h
index 90c27a4..3783c16 100644
--- a/lib/Target/Sparc/SparcMachineFunctionInfo.h
+++ b/lib/Target/Sparc/SparcMachineFunctionInfo.h
@@ -28,11 +28,16 @@ namespace llvm {
/// SRetReturnReg - Holds the virtual register into which the sret
/// argument is passed.
unsigned SRetReturnReg;
+
+ /// IsLeafProc - True if the function is a leaf procedure.
+ bool IsLeafProc;
public:
SparcMachineFunctionInfo()
- : GlobalBaseReg(0), VarArgsFrameOffset(0), SRetReturnReg(0) {}
+ : GlobalBaseReg(0), VarArgsFrameOffset(0), SRetReturnReg(0),
+ IsLeafProc(false) {}
explicit SparcMachineFunctionInfo(MachineFunction &MF)
- : GlobalBaseReg(0), VarArgsFrameOffset(0), SRetReturnReg(0) {}
+ : GlobalBaseReg(0), VarArgsFrameOffset(0), SRetReturnReg(0),
+ IsLeafProc(false) {}
unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
@@ -42,6 +47,9 @@ namespace llvm {
unsigned getSRetReturnReg() const { return SRetReturnReg; }
void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
+ void setLeafProc(bool rhs) { IsLeafProc = rhs; }
+ bool isLeafProc() const { return IsLeafProc; }
};
}
diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp
index 3af4c61..dc97f06 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.cpp
+++ b/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -13,6 +13,7 @@
#include "SparcRegisterInfo.h"
#include "Sparc.h"
+#include "SparcMachineFunctionInfo.h"
#include "SparcSubtarget.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
@@ -20,6 +21,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetInstrInfo.h"
@@ -28,9 +30,12 @@
using namespace llvm;
-SparcRegisterInfo::SparcRegisterInfo(SparcSubtarget &st,
- const TargetInstrInfo &tii)
- : SparcGenRegisterInfo(SP::I7), Subtarget(st), TII(tii) {
+static cl::opt<bool>
+ReserveAppRegisters("sparc-reserve-app-registers", cl::Hidden, cl::init(false),
+ cl::desc("Reserve application registers (%g2-%g4)"));
+
+SparcRegisterInfo::SparcRegisterInfo(SparcSubtarget &st)
+ : SparcGenRegisterInfo(SP::I7), Subtarget(st) {
}
const uint16_t* SparcRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
@@ -43,14 +48,21 @@ BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
// FIXME: G1 reserved for now for large imm generation by frame code.
Reserved.set(SP::G1);
- Reserved.set(SP::G2);
- Reserved.set(SP::G3);
- Reserved.set(SP::G4);
+
+ // G2-G4 are application registers; reserve them only when requested.
+ if (ReserveAppRegisters) {
+ Reserved.set(SP::G2);
+ Reserved.set(SP::G3);
+ Reserved.set(SP::G4);
+ }
+ // G5 is not reserved in 64-bit mode.
+ if (!Subtarget.is64Bit())
+ Reserved.set(SP::G5);
+
Reserved.set(SP::O6);
Reserved.set(SP::I6);
Reserved.set(SP::I7);
Reserved.set(SP::G0);
- Reserved.set(SP::G5);
Reserved.set(SP::G6);
Reserved.set(SP::G7);
return Reserved;
@@ -77,21 +89,30 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int64_t Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
MI.getOperand(FIOperandNum + 1).getImm() +
Subtarget.getStackPointerBias();
+ SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
+ unsigned FramePtr = SP::I6;
+ if (FuncInfo->isLeafProc()) {
+ // Use %sp and adjust offset if needed.
+ FramePtr = SP::O6;
+ int stackSize = MF.getFrameInfo()->getStackSize();
+ Offset += (stackSize) ? Subtarget.getAdjustedFrameSize(stackSize) : 0;
+ }
// Replace frame index with a frame pointer reference.
if (Offset >= -4096 && Offset <= 4095) {
// If the offset is small enough to fit in the immediate field, directly
// encode it.
- MI.getOperand(FIOperandNum).ChangeToRegister(SP::I6, false);
+ MI.getOperand(FIOperandNum).ChangeToRegister(FramePtr, false);
MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
} else {
- // Otherwise, emit a G1 = SETHI %hi(offset). FIXME: it would be better to
+ // Otherwise, emit a G1 = SETHI %hi(offset). FIXME: it would be better to
// scavenge a register here instead of reserving G1 all of the time.
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
unsigned OffHi = (unsigned)Offset >> 10U;
BuildMI(*MI.getParent(), II, dl, TII.get(SP::SETHIi), SP::G1).addImm(OffHi);
// Emit G1 = G1 + I6
BuildMI(*MI.getParent(), II, dl, TII.get(SP::ADDrr), SP::G1).addReg(SP::G1)
- .addReg(SP::I6);
+ .addReg(FramePtr);
// Insert: G1+%lo(offset) into the user.
MI.getOperand(FIOperandNum).ChangeToRegister(SP::G1, false);
MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset & ((1 << 10)-1));
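The leaf-procedure path added to eliminateFrameIndex above rebases %fp-relative offsets onto %sp, on the assumption that a leaf procedure never sets up %fp, so %sp sits one adjusted frame size below where %fp would have pointed. A tiny sketch of that rebasing with made-up numbers (not part of the patch):

    // Illustrative only: rebasing an %fp-relative slot onto %sp for a leaf
    // procedure (values assumed for the example).
    int fpRelative   = -8;    // slot would be at [%fp - 8]
    int adjustedSize = 104;   // getAdjustedFrameSize(stackSize) for this frame
    int spRelative   = fpRelative + adjustedSize;  // same slot at [%sp + 96]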
diff --git a/lib/Target/Sparc/SparcRegisterInfo.h b/lib/Target/Sparc/SparcRegisterInfo.h
index f91df53..6b77d4e 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.h
+++ b/lib/Target/Sparc/SparcRegisterInfo.h
@@ -27,9 +27,8 @@ class Type;
struct SparcRegisterInfo : public SparcGenRegisterInfo {
SparcSubtarget &Subtarget;
- const TargetInstrInfo &TII;
- SparcRegisterInfo(SparcSubtarget &st, const TargetInstrInfo &tii);
+ SparcRegisterInfo(SparcSubtarget &st);
/// Code Generation virtual methods...
const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
diff --git a/lib/Target/Sparc/SparcRegisterInfo.td b/lib/Target/Sparc/SparcRegisterInfo.td
index 497e7c5..a59c442 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.td
+++ b/lib/Target/Sparc/SparcRegisterInfo.td
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// Declarations that describe the Sparc register file
+// Declarations that describe the Sparc register file
//===----------------------------------------------------------------------===//
class SparcReg<string n> : Register<n> {
@@ -21,8 +21,8 @@ class SparcCtrlReg<string n>: Register<n> {
}
let Namespace = "SP" in {
-def sub_even : SubRegIndex;
-def sub_odd : SubRegIndex;
+def sub_even : SubRegIndex<32>;
+def sub_odd : SubRegIndex<32, 32>;
}
// Registers are identified with 5-bit ID numbers.
@@ -52,68 +52,68 @@ def Y : SparcCtrlReg<"Y">;
// Integer registers
def G0 : Ri< 0, "G0">, DwarfRegNum<[0]>;
def G1 : Ri< 1, "G1">, DwarfRegNum<[1]>;
-def G2 : Ri< 2, "G2">, DwarfRegNum<[2]>;
+def G2 : Ri< 2, "G2">, DwarfRegNum<[2]>;
def G3 : Ri< 3, "G3">, DwarfRegNum<[3]>;
def G4 : Ri< 4, "G4">, DwarfRegNum<[4]>;
-def G5 : Ri< 5, "G5">, DwarfRegNum<[5]>;
+def G5 : Ri< 5, "G5">, DwarfRegNum<[5]>;
def G6 : Ri< 6, "G6">, DwarfRegNum<[6]>;
def G7 : Ri< 7, "G7">, DwarfRegNum<[7]>;
def O0 : Ri< 8, "O0">, DwarfRegNum<[8]>;
def O1 : Ri< 9, "O1">, DwarfRegNum<[9]>;
-def O2 : Ri<10, "O2">, DwarfRegNum<[10]>;
+def O2 : Ri<10, "O2">, DwarfRegNum<[10]>;
def O3 : Ri<11, "O3">, DwarfRegNum<[11]>;
def O4 : Ri<12, "O4">, DwarfRegNum<[12]>;
-def O5 : Ri<13, "O5">, DwarfRegNum<[13]>;
+def O5 : Ri<13, "O5">, DwarfRegNum<[13]>;
def O6 : Ri<14, "SP">, DwarfRegNum<[14]>;
def O7 : Ri<15, "O7">, DwarfRegNum<[15]>;
def L0 : Ri<16, "L0">, DwarfRegNum<[16]>;
def L1 : Ri<17, "L1">, DwarfRegNum<[17]>;
-def L2 : Ri<18, "L2">, DwarfRegNum<[18]>;
+def L2 : Ri<18, "L2">, DwarfRegNum<[18]>;
def L3 : Ri<19, "L3">, DwarfRegNum<[19]>;
def L4 : Ri<20, "L4">, DwarfRegNum<[20]>;
-def L5 : Ri<21, "L5">, DwarfRegNum<[21]>;
+def L5 : Ri<21, "L5">, DwarfRegNum<[21]>;
def L6 : Ri<22, "L6">, DwarfRegNum<[22]>;
def L7 : Ri<23, "L7">, DwarfRegNum<[23]>;
def I0 : Ri<24, "I0">, DwarfRegNum<[24]>;
def I1 : Ri<25, "I1">, DwarfRegNum<[25]>;
-def I2 : Ri<26, "I2">, DwarfRegNum<[26]>;
+def I2 : Ri<26, "I2">, DwarfRegNum<[26]>;
def I3 : Ri<27, "I3">, DwarfRegNum<[27]>;
def I4 : Ri<28, "I4">, DwarfRegNum<[28]>;
-def I5 : Ri<29, "I5">, DwarfRegNum<[29]>;
+def I5 : Ri<29, "I5">, DwarfRegNum<[29]>;
def I6 : Ri<30, "FP">, DwarfRegNum<[30]>;
def I7 : Ri<31, "I7">, DwarfRegNum<[31]>;
// Floating-point registers
def F0 : Rf< 0, "F0">, DwarfRegNum<[32]>;
def F1 : Rf< 1, "F1">, DwarfRegNum<[33]>;
-def F2 : Rf< 2, "F2">, DwarfRegNum<[34]>;
+def F2 : Rf< 2, "F2">, DwarfRegNum<[34]>;
def F3 : Rf< 3, "F3">, DwarfRegNum<[35]>;
def F4 : Rf< 4, "F4">, DwarfRegNum<[36]>;
-def F5 : Rf< 5, "F5">, DwarfRegNum<[37]>;
+def F5 : Rf< 5, "F5">, DwarfRegNum<[37]>;
def F6 : Rf< 6, "F6">, DwarfRegNum<[38]>;
def F7 : Rf< 7, "F7">, DwarfRegNum<[39]>;
-def F8 : Rf< 8, "F8">, DwarfRegNum<[40]>;
+def F8 : Rf< 8, "F8">, DwarfRegNum<[40]>;
def F9 : Rf< 9, "F9">, DwarfRegNum<[41]>;
def F10 : Rf<10, "F10">, DwarfRegNum<[42]>;
-def F11 : Rf<11, "F11">, DwarfRegNum<[43]>;
+def F11 : Rf<11, "F11">, DwarfRegNum<[43]>;
def F12 : Rf<12, "F12">, DwarfRegNum<[44]>;
def F13 : Rf<13, "F13">, DwarfRegNum<[45]>;
-def F14 : Rf<14, "F14">, DwarfRegNum<[46]>;
+def F14 : Rf<14, "F14">, DwarfRegNum<[46]>;
def F15 : Rf<15, "F15">, DwarfRegNum<[47]>;
def F16 : Rf<16, "F16">, DwarfRegNum<[48]>;
-def F17 : Rf<17, "F17">, DwarfRegNum<[49]>;
+def F17 : Rf<17, "F17">, DwarfRegNum<[49]>;
def F18 : Rf<18, "F18">, DwarfRegNum<[50]>;
def F19 : Rf<19, "F19">, DwarfRegNum<[51]>;
-def F20 : Rf<20, "F20">, DwarfRegNum<[52]>;
+def F20 : Rf<20, "F20">, DwarfRegNum<[52]>;
def F21 : Rf<21, "F21">, DwarfRegNum<[53]>;
def F22 : Rf<22, "F22">, DwarfRegNum<[54]>;
def F23 : Rf<23, "F23">, DwarfRegNum<[55]>;
def F24 : Rf<24, "F24">, DwarfRegNum<[56]>;
def F25 : Rf<25, "F25">, DwarfRegNum<[57]>;
-def F26 : Rf<26, "F26">, DwarfRegNum<[58]>;
+def F26 : Rf<26, "F26">, DwarfRegNum<[58]>;
def F27 : Rf<27, "F27">, DwarfRegNum<[59]>;
def F28 : Rf<28, "F28">, DwarfRegNum<[60]>;
-def F29 : Rf<29, "F29">, DwarfRegNum<[61]>;
+def F29 : Rf<29, "F29">, DwarfRegNum<[61]>;
def F30 : Rf<30, "F30">, DwarfRegNum<[62]>;
def F31 : Rf<31, "F31">, DwarfRegNum<[63]>;
@@ -144,19 +144,10 @@ def D15 : Rd<30, "F30", [F30, F31]>, DwarfRegNum<[87]>;
// register class for that. The i64 type is included here to allow i64 patterns
// using the integer instructions.
def IntRegs : RegisterClass<"SP", [i32, i64], 32,
- (add L0, L1, L2, L3, L4, L5, L6,
- L7, I0, I1, I2, I3, I4, I5,
- O0, O1, O2, O3, O4, O5, O7,
- G1,
- // Non-allocatable regs:
- G2, G3, G4, // FIXME: OK for use only in
- // applications, not libraries.
- O6, // stack ptr
- I6, // frame ptr
- I7, // return address
- G0, // constant zero
- G5, G6, G7 // reserved for kernel
- )>;
+ (add (sequence "I%u", 0, 7),
+ (sequence "G%u", 0, 7),
+ (sequence "L%u", 0, 7),
+ (sequence "O%u", 0, 7))>;
// Register class for 64-bit mode, with a 64-bit spill slot size.
// These are the same as the 32-bit registers, so TableGen will consider this
diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp
index e5b2aeb..f9ce098 100644
--- a/lib/Target/Sparc/SparcSubtarget.cpp
+++ b/lib/Target/Sparc/SparcSubtarget.cpp
@@ -13,6 +13,7 @@
#include "SparcSubtarget.h"
#include "Sparc.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
#define GET_SUBTARGETINFO_TARGET_DESC
@@ -30,7 +31,7 @@ SparcSubtarget::SparcSubtarget(const std::string &TT, const std::string &CPU,
V8DeprecatedInsts(false),
IsVIS(false),
Is64Bit(is64Bit) {
-
+
// Determine default and user specified characteristics
std::string CPUName = CPU;
if (CPUName.empty()) {
@@ -44,3 +45,30 @@ SparcSubtarget::SparcSubtarget(const std::string &TT, const std::string &CPU,
// Parse features string.
ParseSubtargetFeatures(CPUName, FS);
}
+
+
+int SparcSubtarget::getAdjustedFrameSize(int frameSize) const {
+
+ if (is64Bit()) {
+ // All 64-bit stack frames must be 16-byte aligned, and must reserve space
+ // for spilling the 16 window registers at %sp+BIAS..%sp+BIAS+128.
+ frameSize += 128;
+ // Frames with calls must also reserve space for 6 outgoing arguments
+ // whether they are used or not. LowerCall_64 takes care of that.
+ assert(frameSize % 16 == 0 && "Stack size not 16-byte aligned");
+ } else {
+ // Reserve space for the minimum stack frame required by the V8 ABI,
+ // which is:
+ // 16 words for register window spill
+ // 1 word for address of returned aggregate-value
+ // + 6 words for passing parameters on the stack
+ // ----------
+ // 23 words * 4 bytes per word = 92 bytes
+ frameSize += 92;
+
+ // Round up to the next double-word boundary; the ABI requires
+ // the frame size to be double-word aligned.
+ frameSize = RoundUpToAlignment(frameSize, 8);
+ }
+ return frameSize;
+}
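To make the V8 arithmetic above concrete, a worked example (purely illustrative, not part of the patch): a function with 40 bytes of locals gets 40 + 92 = 132 bytes, rounded up to a 136-byte frame.

    // Standalone illustration of the V8 path of getAdjustedFrameSize().
    static int adjustedV8FrameSize(int frameSize) {
      frameSize += 92;               // ABI minimum: 23 words * 4 bytes
      return (frameSize + 7) & ~7;   // round up to a double-word boundary
    }
    // adjustedV8FrameSize(40) == 136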
diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h
index b94dd11..2bf599d 100644
--- a/lib/Target/Sparc/SparcSubtarget.h
+++ b/lib/Target/Sparc/SparcSubtarget.h
@@ -29,7 +29,7 @@ class SparcSubtarget : public SparcGenSubtargetInfo {
bool V8DeprecatedInsts;
bool IsVIS;
bool Is64Bit;
-
+
public:
SparcSubtarget(const std::string &TT, const std::string &CPU,
const std::string &FS, bool is64bit);
@@ -37,11 +37,11 @@ public:
bool isV9() const { return IsV9; }
bool isVIS() const { return IsVIS; }
bool useDeprecatedV8Instructions() const { return V8DeprecatedInsts; }
-
- /// ParseSubtargetFeatures - Parses features string setting specified
+
+ /// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
-
+
bool is64Bit() const { return Is64Bit; }
std::string getDataLayout() const {
const char *p;
@@ -58,6 +58,12 @@ public:
int64_t getStackPointerBias() const {
return is64Bit() ? 2047 : 0;
}
+
+ /// Given the actual stack size as determined by FrameInfo, this function
+ /// returns the adjusted frame size, which includes space for register window
+ /// spills and arguments.
+ int getAdjustedFrameSize(int stackSize) const;
+
};
} // end namespace llvm
diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp
index 60bceb7..a7355f4 100644
--- a/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -37,6 +37,7 @@ SparcTargetMachine::SparcTargetMachine(const Target &T, StringRef TT,
InstrInfo(Subtarget),
TLInfo(*this), TSInfo(*this),
FrameLowering(Subtarget) {
+ initAsmInfo();
}
namespace {
@@ -68,7 +69,6 @@ bool SparcPassConfig::addInstSelector() {
/// passes immediately before machine code is emitted. This should return
/// true if -print-machineinstrs should print out the code after the passes.
bool SparcPassConfig::addPreEmitPass(){
- addPass(createSparcFPMoverPass(getSparcTargetMachine()));
addPass(createSparcDelaySlotFillerPass(getSparcTargetMachine()));
return true;
}
diff --git a/lib/Target/SystemZ/AsmParser/CMakeLists.txt b/lib/Target/SystemZ/AsmParser/CMakeLists.txt
new file mode 100644
index 0000000..78a5714
--- /dev/null
+++ b/lib/Target/SystemZ/AsmParser/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMSystemZAsmParser
+ SystemZAsmParser.cpp
+ )
+
+add_dependencies(LLVMSystemZAsmParser SystemZCommonTableGen)
diff --git a/lib/Target/SystemZ/AsmParser/LLVMBuild.txt b/lib/Target/SystemZ/AsmParser/LLVMBuild.txt
new file mode 100644
index 0000000..0b97e71
--- /dev/null
+++ b/lib/Target/SystemZ/AsmParser/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===- ./lib/Target/SystemZ/AsmParser/LLVMBuild.txt -------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = SystemZAsmParser
+parent = SystemZ
+required_libraries = SystemZDesc SystemZInfo MC MCParser Support
+add_to_library_groups = SystemZ
diff --git a/lib/Target/SystemZ/AsmParser/Makefile b/lib/Target/SystemZ/AsmParser/Makefile
new file mode 100644
index 0000000..623ae2c
--- /dev/null
+++ b/lib/Target/SystemZ/AsmParser/Makefile
@@ -0,0 +1,16 @@
+##===- lib/Target/SystemZ/AsmParser/Makefile ---------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../../..
+LIBRARYNAME = LLVMSystemZAsmParser
+
+# Hack: we need to include 'main' SystemZ target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
new file mode 100644
index 0000000..7c28abd
--- /dev/null
+++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -0,0 +1,758 @@
+//===-- SystemZAsmParser.cpp - Parse SystemZ assembly instructions --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+// Return true if Expr is in the range [MinValue, MaxValue].
+static bool inRange(const MCExpr *Expr, int64_t MinValue, int64_t MaxValue) {
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr)) {
+ int64_t Value = CE->getValue();
+ return Value >= MinValue && Value <= MaxValue;
+ }
+ return false;
+}
+
+namespace {
+class SystemZOperand : public MCParsedAsmOperand {
+public:
+ enum RegisterKind {
+ GR32Reg,
+ GR64Reg,
+ GR128Reg,
+ ADDR32Reg,
+ ADDR64Reg,
+ FP32Reg,
+ FP64Reg,
+ FP128Reg
+ };
+
+private:
+ enum OperandKind {
+ KindInvalid,
+ KindToken,
+ KindReg,
+ KindAccessReg,
+ KindImm,
+ KindMem
+ };
+
+ OperandKind Kind;
+ SMLoc StartLoc, EndLoc;
+
+ // A string of length Length, starting at Data.
+ struct TokenOp {
+ const char *Data;
+ unsigned Length;
+ };
+
+ // LLVM register Num, which has kind Kind. In some ways it might be
+ // easier for this class to have a register bank (general, floating-point
+ // or access) and a raw register number (0-15). This would postpone the
+ // interpretation of the operand to the add*() methods and avoid the need
+ // for context-dependent parsing. However, we do things the current way
+ // because of the virtual getReg() method, which needs to distinguish
+ // between (say) %r0 used as a single register and %r0 used as a pair.
+ // Context-dependent parsing can also give us slightly better error
+ // messages when invalid pairs like %r1 are used.
+ struct RegOp {
+ RegisterKind Kind;
+ unsigned Num;
+ };
+
+ // Base + Disp + Index, where Base and Index are LLVM registers or 0.
+ // RegKind says what type the registers have (ADDR32Reg or ADDR64Reg).
+ struct MemOp {
+ unsigned Base : 8;
+ unsigned Index : 8;
+ unsigned RegKind : 8;
+ unsigned Unused : 8;
+ const MCExpr *Disp;
+ };
+
+ union {
+ TokenOp Token;
+ RegOp Reg;
+ unsigned AccessReg;
+ const MCExpr *Imm;
+ MemOp Mem;
+ };
+
+ SystemZOperand(OperandKind kind, SMLoc startLoc, SMLoc endLoc)
+ : Kind(kind), StartLoc(startLoc), EndLoc(endLoc)
+ {}
+
+ void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+ // Add as immediates when possible. Null MCExpr = 0.
+ if (Expr == 0)
+ Inst.addOperand(MCOperand::CreateImm(0));
+ else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+ Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+ else
+ Inst.addOperand(MCOperand::CreateExpr(Expr));
+ }
+
+public:
+ // Create particular kinds of operand.
+ static SystemZOperand *createInvalid(SMLoc StartLoc, SMLoc EndLoc) {
+ return new SystemZOperand(KindInvalid, StartLoc, EndLoc);
+ }
+ static SystemZOperand *createToken(StringRef Str, SMLoc Loc) {
+ SystemZOperand *Op = new SystemZOperand(KindToken, Loc, Loc);
+ Op->Token.Data = Str.data();
+ Op->Token.Length = Str.size();
+ return Op;
+ }
+ static SystemZOperand *createReg(RegisterKind Kind, unsigned Num,
+ SMLoc StartLoc, SMLoc EndLoc) {
+ SystemZOperand *Op = new SystemZOperand(KindReg, StartLoc, EndLoc);
+ Op->Reg.Kind = Kind;
+ Op->Reg.Num = Num;
+ return Op;
+ }
+ static SystemZOperand *createAccessReg(unsigned Num, SMLoc StartLoc,
+ SMLoc EndLoc) {
+ SystemZOperand *Op = new SystemZOperand(KindAccessReg, StartLoc, EndLoc);
+ Op->AccessReg = Num;
+ return Op;
+ }
+ static SystemZOperand *createImm(const MCExpr *Expr, SMLoc StartLoc,
+ SMLoc EndLoc) {
+ SystemZOperand *Op = new SystemZOperand(KindImm, StartLoc, EndLoc);
+ Op->Imm = Expr;
+ return Op;
+ }
+ static SystemZOperand *createMem(RegisterKind RegKind, unsigned Base,
+ const MCExpr *Disp, unsigned Index,
+ SMLoc StartLoc, SMLoc EndLoc) {
+ SystemZOperand *Op = new SystemZOperand(KindMem, StartLoc, EndLoc);
+ Op->Mem.RegKind = RegKind;
+ Op->Mem.Base = Base;
+ Op->Mem.Index = Index;
+ Op->Mem.Disp = Disp;
+ return Op;
+ }
+
+ // Token operands
+ virtual bool isToken() const LLVM_OVERRIDE {
+ return Kind == KindToken;
+ }
+ StringRef getToken() const {
+ assert(Kind == KindToken && "Not a token");
+ return StringRef(Token.Data, Token.Length);
+ }
+
+ // Register operands.
+ virtual bool isReg() const LLVM_OVERRIDE {
+ return Kind == KindReg;
+ }
+ bool isReg(RegisterKind RegKind) const {
+ return Kind == KindReg && Reg.Kind == RegKind;
+ }
+ virtual unsigned getReg() const LLVM_OVERRIDE {
+ assert(Kind == KindReg && "Not a register");
+ return Reg.Num;
+ }
+
+ // Access register operands. Access registers aren't exposed to LLVM
+ // as registers.
+ bool isAccessReg() const {
+ return Kind == KindAccessReg;
+ }
+
+ // Immediate operands.
+ virtual bool isImm() const LLVM_OVERRIDE {
+ return Kind == KindImm;
+ }
+ bool isImm(int64_t MinValue, int64_t MaxValue) const {
+ return Kind == KindImm && inRange(Imm, MinValue, MaxValue);
+ }
+ const MCExpr *getImm() const {
+ assert(Kind == KindImm && "Not an immediate");
+ return Imm;
+ }
+
+ // Memory operands.
+ virtual bool isMem() const LLVM_OVERRIDE {
+ return Kind == KindMem;
+ }
+ bool isMem(RegisterKind RegKind, bool HasIndex) const {
+ return (Kind == KindMem &&
+ Mem.RegKind == RegKind &&
+ (HasIndex || !Mem.Index));
+ }
+ bool isMemDisp12(RegisterKind RegKind, bool HasIndex) const {
+ return isMem(RegKind, HasIndex) && inRange(Mem.Disp, 0, 0xfff);
+ }
+ bool isMemDisp20(RegisterKind RegKind, bool HasIndex) const {
+ return isMem(RegKind, HasIndex) && inRange(Mem.Disp, -524288, 524287);
+ }
+
+ // Override MCParsedAsmOperand.
+ virtual SMLoc getStartLoc() const LLVM_OVERRIDE { return StartLoc; }
+ virtual SMLoc getEndLoc() const LLVM_OVERRIDE { return EndLoc; }
+ virtual void print(raw_ostream &OS) const LLVM_OVERRIDE;
+
+ // Used by the TableGen code to add particular types of operand
+ // to an instruction.
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands");
+ Inst.addOperand(MCOperand::CreateReg(getReg()));
+ }
+ void addAccessRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands");
+ assert(Kind == KindAccessReg && "Invalid operand type");
+ Inst.addOperand(MCOperand::CreateImm(AccessReg));
+ }
+ void addImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands");
+ addExpr(Inst, getImm());
+ }
+ void addBDAddrOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands");
+ assert(Kind == KindMem && Mem.Index == 0 && "Invalid operand type");
+ Inst.addOperand(MCOperand::CreateReg(Mem.Base));
+ addExpr(Inst, Mem.Disp);
+ }
+ void addBDXAddrOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 3 && "Invalid number of operands");
+ assert(Kind == KindMem && "Invalid operand type");
+ Inst.addOperand(MCOperand::CreateReg(Mem.Base));
+ addExpr(Inst, Mem.Disp);
+ Inst.addOperand(MCOperand::CreateReg(Mem.Index));
+ }
+
+ // Used by the TableGen code to check for particular operand types.
+ bool isGR32() const { return isReg(GR32Reg); }
+ bool isGR64() const { return isReg(GR64Reg); }
+ bool isGR128() const { return isReg(GR128Reg); }
+ bool isADDR32() const { return isReg(ADDR32Reg); }
+ bool isADDR64() const { return isReg(ADDR64Reg); }
+ bool isADDR128() const { return false; }
+ bool isFP32() const { return isReg(FP32Reg); }
+ bool isFP64() const { return isReg(FP64Reg); }
+ bool isFP128() const { return isReg(FP128Reg); }
+ bool isBDAddr32Disp12() const { return isMemDisp12(ADDR32Reg, false); }
+ bool isBDAddr32Disp20() const { return isMemDisp20(ADDR32Reg, false); }
+ bool isBDAddr64Disp12() const { return isMemDisp12(ADDR64Reg, false); }
+ bool isBDAddr64Disp20() const { return isMemDisp20(ADDR64Reg, false); }
+ bool isBDXAddr64Disp12() const { return isMemDisp12(ADDR64Reg, true); }
+ bool isBDXAddr64Disp20() const { return isMemDisp20(ADDR64Reg, true); }
+ bool isU4Imm() const { return isImm(0, 15); }
+ bool isU6Imm() const { return isImm(0, 63); }
+ bool isU8Imm() const { return isImm(0, 255); }
+ bool isS8Imm() const { return isImm(-128, 127); }
+ bool isU16Imm() const { return isImm(0, 65535); }
+ bool isS16Imm() const { return isImm(-32768, 32767); }
+ bool isU32Imm() const { return isImm(0, (1LL << 32) - 1); }
+ bool isS32Imm() const { return isImm(-(1LL << 31), (1LL << 31) - 1); }
+};
+
+class SystemZAsmParser : public MCTargetAsmParser {
+#define GET_ASSEMBLER_HEADER
+#include "SystemZGenAsmMatcher.inc"
+
+private:
+ MCSubtargetInfo &STI;
+ MCAsmParser &Parser;
+ enum RegisterGroup {
+ RegGR,
+ RegFP,
+ RegAccess
+ };
+ struct Register {
+ RegisterGroup Group;
+ unsigned Num;
+ SMLoc StartLoc, EndLoc;
+ };
+
+ bool parseRegister(Register &Reg);
+
+ bool parseRegister(Register &Reg, RegisterGroup Group, const unsigned *Regs,
+ bool IsAddress = false);
+
+ OperandMatchResultTy
+ parseRegister(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ RegisterGroup Group, const unsigned *Regs,
+ SystemZOperand::RegisterKind Kind,
+ bool IsAddress = false);
+
+ bool parseAddress(unsigned &Base, const MCExpr *&Disp,
+ unsigned &Index, const unsigned *Regs,
+ SystemZOperand::RegisterKind RegKind,
+ bool HasIndex);
+
+ OperandMatchResultTy
+ parseAddress(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ const unsigned *Regs, SystemZOperand::RegisterKind RegKind,
+ bool HasIndex);
+
+ bool parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ StringRef Mnemonic);
+
+public:
+ SystemZAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser)
+ : MCTargetAsmParser(), STI(sti), Parser(parser) {
+ MCAsmParserExtension::Initialize(Parser);
+
+ // Initialize the set of available features.
+ setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+ }
+
+ // Override MCTargetAsmParser.
+ virtual bool ParseDirective(AsmToken DirectiveID) LLVM_OVERRIDE;
+ virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) LLVM_OVERRIDE;
+ virtual bool ParseInstruction(ParseInstructionInfo &Info,
+ StringRef Name, SMLoc NameLoc,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands)
+ LLVM_OVERRIDE;
+ virtual bool
+ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ MCStreamer &Out, unsigned &ErrorInfo,
+ bool MatchingInlineAsm) LLVM_OVERRIDE;
+
+ // Used by the TableGen code to parse particular operand types.
+ OperandMatchResultTy
+ parseGR32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ return parseRegister(Operands, RegGR, SystemZMC::GR32Regs,
+ SystemZOperand::GR32Reg);
+ }
+ OperandMatchResultTy
+ parseGR64(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ return parseRegister(Operands, RegGR, SystemZMC::GR64Regs,
+ SystemZOperand::GR64Reg);
+ }
+ OperandMatchResultTy
+ parseGR128(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ return parseRegister(Operands, RegGR, SystemZMC::GR128Regs,
+ SystemZOperand::GR128Reg);
+ }
+ OperandMatchResultTy
+ parseADDR32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ return parseRegister(Operands, RegGR, SystemZMC::GR32Regs,
+ SystemZOperand::ADDR32Reg, true);
+ }
+ OperandMatchResultTy
+ parseADDR64(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ return parseRegister(Operands, RegGR, SystemZMC::GR64Regs,
+ SystemZOperand::ADDR64Reg, true);
+ }
+ OperandMatchResultTy
+ parseADDR128(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ llvm_unreachable("Shouldn't be used as an operand");
+ }
+ OperandMatchResultTy
+ parseFP32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ return parseRegister(Operands, RegFP, SystemZMC::FP32Regs,
+ SystemZOperand::FP32Reg);
+ }
+ OperandMatchResultTy
+ parseFP64(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ return parseRegister(Operands, RegFP, SystemZMC::FP64Regs,
+ SystemZOperand::FP64Reg);
+ }
+ OperandMatchResultTy
+ parseFP128(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ return parseRegister(Operands, RegFP, SystemZMC::FP128Regs,
+ SystemZOperand::FP128Reg);
+ }
+ OperandMatchResultTy
+ parseBDAddr32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ return parseAddress(Operands, SystemZMC::GR32Regs,
+ SystemZOperand::ADDR32Reg, false);
+ }
+ OperandMatchResultTy
+ parseBDAddr64(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ return parseAddress(Operands, SystemZMC::GR64Regs,
+ SystemZOperand::ADDR64Reg, false);
+ }
+ OperandMatchResultTy
+ parseBDXAddr64(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ return parseAddress(Operands, SystemZMC::GR64Regs,
+ SystemZOperand::ADDR64Reg, true);
+ }
+ OperandMatchResultTy
+ parseAccessReg(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+ OperandMatchResultTy
+ parsePCRel(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ int64_t MinVal, int64_t MaxVal);
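+ // Note (assumed convention): the ranges passed below are byte distances;
+ // the assembled fields count halfwords, so a 16-bit field spans roughly
+ // +/-2^16 bytes and a 32-bit field roughly +/-2^32 bytes.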
+ OperandMatchResultTy
+ parsePCRel16(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ return parsePCRel(Operands, -(1LL << 16), (1LL << 16) - 1);
+ }
+ OperandMatchResultTy
+ parsePCRel32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ return parsePCRel(Operands, -(1LL << 32), (1LL << 32) - 1);
+ }
+};
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_SUBTARGET_FEATURE_NAME
+#define GET_MATCHER_IMPLEMENTATION
+#include "SystemZGenAsmMatcher.inc"
+
+void SystemZOperand::print(raw_ostream &OS) const {
+ llvm_unreachable("Not implemented");
+}
+
+// Parse one register of the form %<prefix><number>.
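+// Examples: %r15 (general-purpose), %f0 (floating-point), %a1 (access register).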
+bool SystemZAsmParser::parseRegister(Register &Reg) {
+ Reg.StartLoc = Parser.getTok().getLoc();
+
+ // Eat the % prefix.
+ if (Parser.getTok().isNot(AsmToken::Percent))
+ return Error(Parser.getTok().getLoc(), "register expected");
+ Parser.Lex();
+
+ // Expect a register name.
+ if (Parser.getTok().isNot(AsmToken::Identifier))
+ return Error(Reg.StartLoc, "invalid register");
+
+ // Check that there's a prefix.
+ StringRef Name = Parser.getTok().getString();
+ if (Name.size() < 2)
+ return Error(Reg.StartLoc, "invalid register");
+ char Prefix = Name[0];
+
+ // Treat the rest of the register name as a register number.
+ if (Name.substr(1).getAsInteger(10, Reg.Num))
+ return Error(Reg.StartLoc, "invalid register");
+
+ // Look for valid combinations of prefix and number.
+ if (Prefix == 'r' && Reg.Num < 16)
+ Reg.Group = RegGR;
+ else if (Prefix == 'f' && Reg.Num < 16)
+ Reg.Group = RegFP;
+ else if (Prefix == 'a' && Reg.Num < 16)
+ Reg.Group = RegAccess;
+ else
+ return Error(Reg.StartLoc, "invalid register");
+
+ Reg.EndLoc = Parser.getTok().getLoc();
+ Parser.Lex();
+ return false;
+}
+
+// Parse a register of group Group. If Regs is nonnull, use it to map
+// the raw register number to LLVM numbering, with zero entries indicating
+// an invalid register. IsAddress says whether the register appears in an
+// address context.
+bool SystemZAsmParser::parseRegister(Register &Reg, RegisterGroup Group,
+ const unsigned *Regs, bool IsAddress) {
+ if (parseRegister(Reg))
+ return true;
+ if (Reg.Group != Group)
+ return Error(Reg.StartLoc, "invalid operand for instruction");
+ if (Regs && Regs[Reg.Num] == 0)
+ return Error(Reg.StartLoc, "invalid register pair");
+ if (Reg.Num == 0 && IsAddress)
+ return Error(Reg.StartLoc, "%r0 used in an address");
+ if (Regs)
+ Reg.Num = Regs[Reg.Num];
+ return false;
+}
+
+// Parse a register and add it to Operands. The other arguments are as above.
+SystemZAsmParser::OperandMatchResultTy
+SystemZAsmParser::parseRegister(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ RegisterGroup Group, const unsigned *Regs,
+ SystemZOperand::RegisterKind Kind,
+ bool IsAddress) {
+ if (Parser.getTok().isNot(AsmToken::Percent))
+ return MatchOperand_NoMatch;
+
+ Register Reg;
+ if (parseRegister(Reg, Group, Regs, IsAddress))
+ return MatchOperand_ParseFail;
+
+ Operands.push_back(SystemZOperand::createReg(Kind, Reg.Num,
+ Reg.StartLoc, Reg.EndLoc));
+ return MatchOperand_Success;
+}
+
+// Parse a memory operand into Base, Disp and Index. Regs maps asm
+// register numbers to LLVM register numbers and RegKind says what kind
+// of address register we're using (ADDR32Reg or ADDR64Reg). HasIndex
+// says whether the address allows index registers.
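+// For example, "4095(%r3,%r15)" gives Disp=4095, Index=%r3 and Base=%r15,
+// while "8(%r15)" gives Disp=8, Base=%r15 and no index.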
+bool SystemZAsmParser::parseAddress(unsigned &Base, const MCExpr *&Disp,
+ unsigned &Index, const unsigned *Regs,
+ SystemZOperand::RegisterKind RegKind,
+ bool HasIndex) {
+ // Parse the displacement, which must always be present.
+ if (getParser().parseExpression(Disp))
+ return true;
+
+ // Parse the optional base and index.
+ Index = 0;
+ Base = 0;
+ if (getLexer().is(AsmToken::LParen)) {
+ Parser.Lex();
+
+ // Parse the first register.
+ Register Reg;
+ if (parseRegister(Reg, RegGR, Regs, RegKind))
+ return true;
+
+ // Check whether there's a second register. If so, the one that we
+ // just parsed was the index.
+ if (getLexer().is(AsmToken::Comma)) {
+ Parser.Lex();
+
+ if (!HasIndex)
+ return Error(Reg.StartLoc, "invalid use of indexed addressing");
+
+ Index = Reg.Num;
+ if (parseRegister(Reg, RegGR, Regs, RegKind))
+ return true;
+ }
+ Base = Reg.Num;
+
+ // Consume the closing parenthesis.
+ if (getLexer().isNot(AsmToken::RParen))
+ return Error(Parser.getTok().getLoc(), "unexpected token in address");
+ Parser.Lex();
+ }
+ return false;
+}
+
+// Parse a memory operand and add it to Operands. The other arguments
+// are as above.
+SystemZAsmParser::OperandMatchResultTy
+SystemZAsmParser::parseAddress(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ const unsigned *Regs,
+ SystemZOperand::RegisterKind RegKind,
+ bool HasIndex) {
+ SMLoc StartLoc = Parser.getTok().getLoc();
+ unsigned Base, Index;
+ const MCExpr *Disp;
+ if (parseAddress(Base, Disp, Index, Regs, RegKind, HasIndex))
+ return MatchOperand_ParseFail;
+
+ SMLoc EndLoc =
+ SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+ Operands.push_back(SystemZOperand::createMem(RegKind, Base, Disp, Index,
+ StartLoc, EndLoc));
+ return MatchOperand_Success;
+}
+
+bool SystemZAsmParser::ParseDirective(AsmToken DirectiveID) {
+ return true;
+}
+
+bool SystemZAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) {
+ Register Reg;
+ if (parseRegister(Reg))
+ return true;
+ if (Reg.Group == RegGR)
+ RegNo = SystemZMC::GR64Regs[Reg.Num];
+ else if (Reg.Group == RegFP)
+ RegNo = SystemZMC::FP64Regs[Reg.Num];
+ else
+ // FIXME: Access registers aren't modelled as LLVM registers yet.
+ return Error(Reg.StartLoc, "invalid operand for instruction");
+ StartLoc = Reg.StartLoc;
+ EndLoc = Reg.EndLoc;
+ return false;
+}
+
+bool SystemZAsmParser::
+ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ Operands.push_back(SystemZOperand::createToken(Name, NameLoc));
+
+ // Read the remaining operands.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ // Read the first operand.
+ if (parseOperand(Operands, Name)) {
+ Parser.eatToEndOfStatement();
+ return true;
+ }
+
+ // Read any subsequent operands.
+ while (getLexer().is(AsmToken::Comma)) {
+ Parser.Lex();
+ if (parseOperand(Operands, Name)) {
+ Parser.eatToEndOfStatement();
+ return true;
+ }
+ }
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ SMLoc Loc = getLexer().getLoc();
+ Parser.eatToEndOfStatement();
+ return Error(Loc, "unexpected token in argument list");
+ }
+ }
+
+ // Consume the EndOfStatement.
+ Parser.Lex();
+ return false;
+}
+
+bool SystemZAsmParser::
+parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ StringRef Mnemonic) {
+ // Check if the current operand has a custom associated parser; if so, try to
+ // custom parse the operand, or fall back to the general approach.
+ OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+ if (ResTy == MatchOperand_Success)
+ return false;
+
+ // If there was no custom match, fall through to the generic handling below.
+ // Otherwise a custom parser matched but hit an error, so just report that
+ // operand parsing failed.
+ if (ResTy == MatchOperand_ParseFail)
+ return true;
+
+ // Check for a register. All real register operands should have used
+ // a context-dependent parse routine, which gives the required register
+ // class. The code is here to mop up other cases, like those where
+ // the instruction isn't recognized.
+ if (Parser.getTok().is(AsmToken::Percent)) {
+ Register Reg;
+ if (parseRegister(Reg))
+ return true;
+ Operands.push_back(SystemZOperand::createInvalid(Reg.StartLoc, Reg.EndLoc));
+ return false;
+ }
+
+ // The only other type of operand is an immediate or address. As above,
+ // real address operands should have used a context-dependent parse routine,
+ // so we treat any plain expression as an immediate.
+ SMLoc StartLoc = Parser.getTok().getLoc();
+ unsigned Base, Index;
+ const MCExpr *Expr;
+ if (parseAddress(Base, Expr, Index, SystemZMC::GR64Regs,
+ SystemZOperand::ADDR64Reg, true))
+ return true;
+
+ SMLoc EndLoc =
+ SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+ if (Base || Index)
+ Operands.push_back(SystemZOperand::createInvalid(StartLoc, EndLoc));
+ else
+ Operands.push_back(SystemZOperand::createImm(Expr, StartLoc, EndLoc));
+ return false;
+}
+
+bool SystemZAsmParser::
+MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ MCStreamer &Out, unsigned &ErrorInfo,
+ bool MatchingInlineAsm) {
+ MCInst Inst;
+ unsigned MatchResult;
+
+ MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo,
+ MatchingInlineAsm);
+ switch (MatchResult) {
+ default: break;
+ case Match_Success:
+ Inst.setLoc(IDLoc);
+ Out.EmitInstruction(Inst);
+ return false;
+
+ case Match_MissingFeature: {
+ assert(ErrorInfo && "Unknown missing feature!");
+ // Special-case the error message for the very common case where only
+ // a single subtarget feature is missing.
+ std::string Msg = "instruction requires:";
+ unsigned Mask = 1;
+ for (unsigned I = 0; I < sizeof(ErrorInfo) * 8 - 1; ++I) {
+ if (ErrorInfo & Mask) {
+ Msg += " ";
+ Msg += getSubtargetFeatureName(ErrorInfo & Mask);
+ }
+ Mask <<= 1;
+ }
+ return Error(IDLoc, Msg);
+ }
+
+ case Match_InvalidOperand: {
+ SMLoc ErrorLoc = IDLoc;
+ if (ErrorInfo != ~0U) {
+ if (ErrorInfo >= Operands.size())
+ return Error(IDLoc, "too few operands for instruction");
+
+ ErrorLoc = ((SystemZOperand*)Operands[ErrorInfo])->getStartLoc();
+ if (ErrorLoc == SMLoc())
+ ErrorLoc = IDLoc;
+ }
+ return Error(ErrorLoc, "invalid operand for instruction");
+ }
+
+ case Match_MnemonicFail:
+ return Error(IDLoc, "invalid instruction");
+ }
+
+ llvm_unreachable("Unexpected match type");
+}
+
+SystemZAsmParser::OperandMatchResultTy SystemZAsmParser::
+parseAccessReg(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ if (Parser.getTok().isNot(AsmToken::Percent))
+ return MatchOperand_NoMatch;
+
+ Register Reg;
+ if (parseRegister(Reg, RegAccess, 0))
+ return MatchOperand_ParseFail;
+
+ Operands.push_back(SystemZOperand::createAccessReg(Reg.Num,
+ Reg.StartLoc,
+ Reg.EndLoc));
+ return MatchOperand_Success;
+}
+
+SystemZAsmParser::OperandMatchResultTy SystemZAsmParser::
+parsePCRel(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ int64_t MinVal, int64_t MaxVal) {
+ MCContext &Ctx = getContext();
+ MCStreamer &Out = getStreamer();
+ const MCExpr *Expr;
+ SMLoc StartLoc = Parser.getTok().getLoc();
+ if (getParser().parseExpression(Expr))
+ return MatchOperand_NoMatch;
+
+ // For consistency with the GNU assembler, treat immediates as offsets
+ // from ".".
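+ // For example, a plain operand of "10" is treated as ".+10", i.e. 10 bytes
+ // past the point at which the instruction starts.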
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr)) {
+ int64_t Value = CE->getValue();
+ if ((Value & 1) || Value < MinVal || Value > MaxVal) {
+ Error(StartLoc, "offset out of range");
+ return MatchOperand_ParseFail;
+ }
+ MCSymbol *Sym = Ctx.CreateTempSymbol();
+ Out.EmitLabel(Sym);
+ const MCExpr *Base = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None,
+ Ctx);
+ Expr = Value == 0 ? Base : MCBinaryExpr::CreateAdd(Base, Expr, Ctx);
+ }
+
+ SMLoc EndLoc =
+ SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+ Operands.push_back(SystemZOperand::createImm(Expr, StartLoc, EndLoc));
+ return MatchOperand_Success;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeSystemZAsmParser() {
+ RegisterMCAsmParser<SystemZAsmParser> X(TheSystemZTarget);
+}
diff --git a/lib/Target/SystemZ/CMakeLists.txt b/lib/Target/SystemZ/CMakeLists.txt
new file mode 100644
index 0000000..edb679d
--- /dev/null
+++ b/lib/Target/SystemZ/CMakeLists.txt
@@ -0,0 +1,35 @@
+set(LLVM_TARGET_DEFINITIONS SystemZ.td)
+
+tablegen(LLVM SystemZGenAsmMatcher.inc -gen-asm-matcher)
+tablegen(LLVM SystemZGenAsmWriter.inc -gen-asm-writer)
+tablegen(LLVM SystemZGenCallingConv.inc -gen-callingconv)
+tablegen(LLVM SystemZGenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM SystemZGenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM SystemZGenMCCodeEmitter.inc -gen-emitter -mc-emitter)
+tablegen(LLVM SystemZGenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM SystemZGenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM SystemZGenSubtargetInfo.inc -gen-subtarget)
+add_public_tablegen_target(SystemZCommonTableGen)
+
+add_llvm_target(SystemZCodeGen
+ SystemZAsmPrinter.cpp
+ SystemZCallingConv.cpp
+ SystemZConstantPoolValue.cpp
+ SystemZFrameLowering.cpp
+ SystemZISelDAGToDAG.cpp
+ SystemZISelLowering.cpp
+ SystemZInstrInfo.cpp
+ SystemZLongBranch.cpp
+ SystemZMCInstLower.cpp
+ SystemZRegisterInfo.cpp
+ SystemZSubtarget.cpp
+ SystemZTargetMachine.cpp
+ )
+
+add_dependencies(LLVMSystemZCodeGen intrinsics_gen)
+
+add_subdirectory(AsmParser)
+add_subdirectory(Disassembler)
+add_subdirectory(InstPrinter)
+add_subdirectory(TargetInfo)
+add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/SystemZ/Disassembler/CMakeLists.txt b/lib/Target/SystemZ/Disassembler/CMakeLists.txt
new file mode 100644
index 0000000..5bc1859
--- /dev/null
+++ b/lib/Target/SystemZ/Disassembler/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMSystemZDisassembler
+ SystemZDisassembler.cpp
+ )
+
+add_dependencies(LLVMSystemZDisassembler SystemZCommonTableGen)
diff --git a/lib/Target/SystemZ/Disassembler/LLVMBuild.txt b/lib/Target/SystemZ/Disassembler/LLVMBuild.txt
new file mode 100644
index 0000000..c3081f5
--- /dev/null
+++ b/lib/Target/SystemZ/Disassembler/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===-- ./lib/Target/SystemZ/Disassembler/LLVMBuild.txt ---------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = SystemZDisassembler
+parent = SystemZ
+required_libraries = MC Support SystemZDesc SystemZInfo
+add_to_library_groups = SystemZ
diff --git a/lib/Target/SystemZ/Disassembler/Makefile b/lib/Target/SystemZ/Disassembler/Makefile
new file mode 100644
index 0000000..efc4cc8
--- /dev/null
+++ b/lib/Target/SystemZ/Disassembler/Makefile
@@ -0,0 +1,16 @@
+##===-- lib/Target/SystemZ/Disassembler/Makefile -----------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../../..
+LIBRARYNAME = LLVMSystemZDisassembler
+
+# Hack: we need to include 'main' SystemZ target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
new file mode 100644
index 0000000..4e4816b
--- /dev/null
+++ b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
@@ -0,0 +1,301 @@
+//===-- SystemZDisassembler.cpp - Disassembler for SystemZ ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZ.h"
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+class SystemZDisassembler : public MCDisassembler {
+public:
+ SystemZDisassembler(const MCSubtargetInfo &STI)
+ : MCDisassembler(STI) {}
+ virtual ~SystemZDisassembler() {}
+
+ // Override MCDisassembler.
+ virtual DecodeStatus getInstruction(MCInst &instr,
+ uint64_t &size,
+ const MemoryObject &region,
+ uint64_t address,
+ raw_ostream &vStream,
+ raw_ostream &cStream) const LLVM_OVERRIDE;
+};
+} // end anonymous namespace
+
+static MCDisassembler *createSystemZDisassembler(const Target &T,
+ const MCSubtargetInfo &STI) {
+ return new SystemZDisassembler(STI);
+}
+
+extern "C" void LLVMInitializeSystemZDisassembler() {
+ // Register the disassembler.
+ TargetRegistry::RegisterMCDisassembler(TheSystemZTarget,
+ createSystemZDisassembler);
+}
+
+static DecodeStatus decodeRegisterClass(MCInst &Inst, uint64_t RegNo,
+ const unsigned *Regs,
+ bool isAddress = false) {
+ assert(RegNo < 16 && "Invalid register");
+ if (!isAddress || RegNo) {
+ RegNo = Regs[RegNo];
+ if (RegNo == 0)
+ return MCDisassembler::Fail;
+ }
+ Inst.addOperand(MCOperand::CreateReg(RegNo));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeGR32BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, SystemZMC::GR32Regs);
+}
+
+static DecodeStatus DecodeGR64BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, SystemZMC::GR64Regs);
+}
+
+static DecodeStatus DecodeGR128BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, SystemZMC::GR128Regs);
+}
+
+static DecodeStatus DecodeADDR64BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, SystemZMC::GR64Regs, true);
+}
+
+static DecodeStatus DecodeFP32BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, SystemZMC::FP32Regs);
+}
+
+static DecodeStatus DecodeFP64BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, SystemZMC::FP64Regs);
+}
+
+static DecodeStatus DecodeFP128BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, SystemZMC::FP128Regs);
+}
+
+template<unsigned N>
+static DecodeStatus decodeUImmOperand(MCInst &Inst, uint64_t Imm) {
+ assert(isUInt<N>(Imm) && "Invalid immediate");
+ Inst.addOperand(MCOperand::CreateImm(Imm));
+ return MCDisassembler::Success;
+}
+
+template<unsigned N>
+static DecodeStatus decodeSImmOperand(MCInst &Inst, uint64_t Imm) {
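+ // Imm holds the raw N-bit field and is therefore unsigned at this point;
+ // it is sign-extended when added to the instruction.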
+ assert(isUInt<N>(Imm) && "Invalid immediate");
+ Inst.addOperand(MCOperand::CreateImm(SignExtend64<N>(Imm)));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeAccessRegOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeUImmOperand<4>(Inst, Imm);
+}
+
+static DecodeStatus decodeU4ImmOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address, const void *Decoder) {
+ return decodeUImmOperand<4>(Inst, Imm);
+}
+
+static DecodeStatus decodeU6ImmOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address, const void *Decoder) {
+ return decodeUImmOperand<6>(Inst, Imm);
+}
+
+static DecodeStatus decodeU8ImmOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address, const void *Decoder) {
+ return decodeUImmOperand<8>(Inst, Imm);
+}
+
+static DecodeStatus decodeU16ImmOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address, const void *Decoder) {
+ return decodeUImmOperand<16>(Inst, Imm);
+}
+
+static DecodeStatus decodeU32ImmOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address, const void *Decoder) {
+ return decodeUImmOperand<32>(Inst, Imm);
+}
+
+static DecodeStatus decodeS8ImmOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address, const void *Decoder) {
+ return decodeSImmOperand<8>(Inst, Imm);
+}
+
+static DecodeStatus decodeS16ImmOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address, const void *Decoder) {
+ return decodeSImmOperand<16>(Inst, Imm);
+}
+
+static DecodeStatus decodeS32ImmOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address, const void *Decoder) {
+ return decodeSImmOperand<32>(Inst, Imm);
+}
+
+template<unsigned N>
+static DecodeStatus decodePCDBLOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address) {
+ assert(isUInt<N>(Imm) && "Invalid PC-relative offset");
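+ // The field is a signed offset in halfwords from the instruction address,
+ // so scale it back up to a byte address.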
+ Inst.addOperand(MCOperand::CreateImm(SignExtend64<N>(Imm) * 2 + Address));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodePC16DBLOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodePCDBLOperand<16>(Inst, Imm, Address);
+}
+
+static DecodeStatus decodePC32DBLOperand(MCInst &Inst, uint64_t Imm,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodePCDBLOperand<32>(Inst, Imm, Address);
+}
+
+static DecodeStatus decodeBDAddr12Operand(MCInst &Inst, uint64_t Field,
+ const unsigned *Regs) {
+ uint64_t Base = Field >> 12;
+ uint64_t Disp = Field & 0xfff;
+ assert(Base < 16 && "Invalid BDAddr12");
+ Inst.addOperand(MCOperand::CreateReg(Base == 0 ? 0 : Regs[Base]));
+ Inst.addOperand(MCOperand::CreateImm(Disp));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeBDAddr20Operand(MCInst &Inst, uint64_t Field,
+ const unsigned *Regs) {
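+ // Field layout: bits [23:20] hold the base register, bits [19:8] the low
+ // 12 bits of the displacement and bits [7:0] its high 8 bits.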
+ uint64_t Base = Field >> 20;
+ uint64_t Disp = ((Field << 12) & 0xff000) | ((Field >> 8) & 0xfff);
+ assert(Base < 16 && "Invalid BDAddr20");
+ Inst.addOperand(MCOperand::CreateReg(Base == 0 ? 0 : Regs[Base]));
+ Inst.addOperand(MCOperand::CreateImm(SignExtend64<20>(Disp)));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeBDXAddr12Operand(MCInst &Inst, uint64_t Field,
+ const unsigned *Regs) {
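+ // Field layout: bits [19:16] index, bits [15:12] base, bits [11:0] displacement.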
+ uint64_t Index = Field >> 16;
+ uint64_t Base = (Field >> 12) & 0xf;
+ uint64_t Disp = Field & 0xfff;
+ assert(Index < 16 && "Invalid BDXAddr12");
+ Inst.addOperand(MCOperand::CreateReg(Base == 0 ? 0 : Regs[Base]));
+ Inst.addOperand(MCOperand::CreateImm(Disp));
+ Inst.addOperand(MCOperand::CreateReg(Index == 0 ? 0 : Regs[Index]));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeBDXAddr20Operand(MCInst &Inst, uint64_t Field,
+ const unsigned *Regs) {
+ uint64_t Index = Field >> 24;
+ uint64_t Base = (Field >> 20) & 0xf;
+ uint64_t Disp = ((Field & 0xfff00) >> 8) | ((Field & 0xff) << 12);
+ assert(Index < 16 && "Invalid BDXAddr20");
+ Inst.addOperand(MCOperand::CreateReg(Base == 0 ? 0 : Regs[Base]));
+ Inst.addOperand(MCOperand::CreateImm(SignExtend64<20>(Disp)));
+ Inst.addOperand(MCOperand::CreateReg(Index == 0 ? 0 : Regs[Index]));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeBDAddr32Disp12Operand(MCInst &Inst, uint64_t Field,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeBDAddr12Operand(Inst, Field, SystemZMC::GR32Regs);
+}
+
+static DecodeStatus decodeBDAddr32Disp20Operand(MCInst &Inst, uint64_t Field,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeBDAddr20Operand(Inst, Field, SystemZMC::GR32Regs);
+}
+
+static DecodeStatus decodeBDAddr64Disp12Operand(MCInst &Inst, uint64_t Field,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeBDAddr12Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
+static DecodeStatus decodeBDAddr64Disp20Operand(MCInst &Inst, uint64_t Field,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeBDAddr20Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
+static DecodeStatus decodeBDXAddr64Disp12Operand(MCInst &Inst, uint64_t Field,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeBDXAddr12Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
+static DecodeStatus decodeBDXAddr64Disp20Operand(MCInst &Inst, uint64_t Field,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeBDXAddr20Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
+#include "SystemZGenDisassemblerTables.inc"
+
+DecodeStatus SystemZDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+ const MemoryObject &Region,
+ uint64_t Address,
+ raw_ostream &os,
+ raw_ostream &cs) const {
+ // Get the first two bytes of the instruction.
+ uint8_t Bytes[6];
+ Size = 0;
+ if (Region.readBytes(Address, 2, Bytes) == -1)
+ return MCDisassembler::Fail;
+
+ // The top 2 bits of the first byte specify the size.
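+ // 0x00-0x3f -> 2 bytes, 0x40-0xbf -> 4 bytes, 0xc0-0xff -> 6 bytes.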
+ const uint8_t *Table;
+ if (Bytes[0] < 0x40) {
+ Size = 2;
+ Table = DecoderTable16;
+ } else if (Bytes[0] < 0xc0) {
+ Size = 4;
+ Table = DecoderTable32;
+ } else {
+ Size = 6;
+ Table = DecoderTable48;
+ }
+
+ // Read any remaining bytes.
+ if (Size > 2 && Region.readBytes(Address + 2, Size - 2, Bytes + 2) == -1)
+ return MCDisassembler::Fail;
+
+ // Construct the instruction.
+ uint64_t Inst = 0;
+ for (uint64_t I = 0; I < Size; ++I)
+ Inst = (Inst << 8) | Bytes[I];
+
+ return decodeInstruction(Table, MI, Inst, Address, this, STI);
+}
diff --git a/lib/Target/SystemZ/InstPrinter/CMakeLists.txt b/lib/Target/SystemZ/InstPrinter/CMakeLists.txt
new file mode 100644
index 0000000..ddbf82f
--- /dev/null
+++ b/lib/Target/SystemZ/InstPrinter/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMSystemZAsmPrinter
+ SystemZInstPrinter.cpp
+ )
+
+add_dependencies(LLVMSystemZAsmPrinter SystemZCommonTableGen)
diff --git a/lib/Target/SystemZ/InstPrinter/LLVMBuild.txt b/lib/Target/SystemZ/InstPrinter/LLVMBuild.txt
new file mode 100644
index 0000000..fdfd738
--- /dev/null
+++ b/lib/Target/SystemZ/InstPrinter/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===- ./lib/Target/SystemZ/InstPrinter/LLVMBuild.txt -----------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = SystemZAsmPrinter
+parent = SystemZ
+required_libraries = MC Support
+add_to_library_groups = SystemZ
diff --git a/lib/Target/SystemZ/InstPrinter/Makefile b/lib/Target/SystemZ/InstPrinter/Makefile
new file mode 100644
index 0000000..3ba8126
--- /dev/null
+++ b/lib/Target/SystemZ/InstPrinter/Makefile
@@ -0,0 +1,16 @@
+##===- lib/Target/SystemZ/InstPrinter/Makefile -------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../../..
+LIBRARYNAME = LLVMSystemZAsmPrinter
+
+# Hack: we need to include 'main' SystemZ target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
new file mode 100644
index 0000000..369802b
--- /dev/null
+++ b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
@@ -0,0 +1,166 @@
+//===-- SystemZInstPrinter.cpp - Convert SystemZ MCInst to assembly syntax ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+
+#include "SystemZInstPrinter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#include "SystemZGenAsmWriter.inc"
+
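+// Print an address in "disp(index,base)" form, e.g. "160(%r2,%r15)";
+// the parenthesized part is omitted when there is no base register.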
+void SystemZInstPrinter::printAddress(unsigned Base, int64_t Disp,
+ unsigned Index, raw_ostream &O) {
+ O << Disp;
+ if (Base) {
+ O << '(';
+ if (Index)
+ O << '%' << getRegisterName(Index) << ',';
+ O << '%' << getRegisterName(Base) << ')';
+ } else
+ assert(!Index && "Shouldn't have an index without a base");
+}
+
+void SystemZInstPrinter::printOperand(const MCOperand &MO, raw_ostream &O) {
+ if (MO.isReg())
+ O << '%' << getRegisterName(MO.getReg());
+ else if (MO.isImm())
+ O << MO.getImm();
+ else if (MO.isExpr())
+ O << *MO.getExpr();
+ else
+ llvm_unreachable("Invalid operand");
+}
+
+void SystemZInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+ StringRef Annot) {
+ printInstruction(MI, O);
+ printAnnotation(O, Annot);
+}
+
+void SystemZInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const {
+ O << '%' << getRegisterName(RegNo);
+}
+
+void SystemZInstPrinter::printU4ImmOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ int64_t Value = MI->getOperand(OpNum).getImm();
+ assert(isUInt<4>(Value) && "Invalid u4imm argument");
+ O << Value;
+}
+
+void SystemZInstPrinter::printU6ImmOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ int64_t Value = MI->getOperand(OpNum).getImm();
+ assert(isUInt<6>(Value) && "Invalid u6imm argument");
+ O << Value;
+}
+
+void SystemZInstPrinter::printS8ImmOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ int64_t Value = MI->getOperand(OpNum).getImm();
+ assert(isInt<8>(Value) && "Invalid s8imm argument");
+ O << Value;
+}
+
+void SystemZInstPrinter::printU8ImmOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ int64_t Value = MI->getOperand(OpNum).getImm();
+ assert(isUInt<8>(Value) && "Invalid u8imm argument");
+ O << Value;
+}
+
+void SystemZInstPrinter::printS16ImmOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ int64_t Value = MI->getOperand(OpNum).getImm();
+ assert(isInt<16>(Value) && "Invalid s16imm argument");
+ O << Value;
+}
+
+void SystemZInstPrinter::printU16ImmOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ int64_t Value = MI->getOperand(OpNum).getImm();
+ assert(isUInt<16>(Value) && "Invalid u16imm argument");
+ O << Value;
+}
+
+void SystemZInstPrinter::printS32ImmOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ int64_t Value = MI->getOperand(OpNum).getImm();
+ assert(isInt<32>(Value) && "Invalid s32imm argument");
+ O << Value;
+}
+
+void SystemZInstPrinter::printU32ImmOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ int64_t Value = MI->getOperand(OpNum).getImm();
+ assert(isUInt<32>(Value) && "Invalid u32imm argument");
+ O << Value;
+}
+
+void SystemZInstPrinter::printAccessRegOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ uint64_t Value = MI->getOperand(OpNum).getImm();
+ assert(Value < 16 && "Invalid access register number");
+ O << "%a" << (unsigned int)Value;
+}
+
+void SystemZInstPrinter::printPCRelOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ const MCOperand &MO = MI->getOperand(OpNum);
+ if (MO.isImm()) {
+ O << "0x";
+ O.write_hex(MO.getImm());
+ } else
+ O << *MO.getExpr();
+}
+
+void SystemZInstPrinter::printCallOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ const MCOperand &MO = MI->getOperand(OpNum);
+ if (MO.isImm()) {
+ O << "0x";
+ O.write_hex(MO.getImm());
+ } else {
+ O << *MO.getExpr();
+ O << "@PLT";
+ }
+}
+
+void SystemZInstPrinter::printOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ printOperand(MI->getOperand(OpNum), O);
+}
+
+void SystemZInstPrinter::printBDAddrOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ printAddress(MI->getOperand(OpNum).getReg(),
+ MI->getOperand(OpNum + 1).getImm(), 0, O);
+}
+
+void SystemZInstPrinter::printBDXAddrOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ printAddress(MI->getOperand(OpNum).getReg(),
+ MI->getOperand(OpNum + 1).getImm(),
+ MI->getOperand(OpNum + 2).getReg(), O);
+}
+
+void SystemZInstPrinter::printCond4Operand(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ static const char *const CondNames[] = {
+ "o", "h", "nle", "l", "nhe", "lh", "ne",
+ "e", "nlh", "he", "nl", "le", "nh", "no"
+ };
+ uint64_t Imm = MI->getOperand(OpNum).getImm();
+ assert(Imm > 0 && Imm < 15 && "Invalid condition");
+ O << CondNames[Imm - 1];
+}
diff --git a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
new file mode 100644
index 0000000..f77282e
--- /dev/null
+++ b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
@@ -0,0 +1,69 @@
+//==- SystemZInstPrinter.h - Convert SystemZ MCInst to assembly --*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a SystemZ MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SYSTEMZINSTPRINTER_H
+#define LLVM_SYSTEMZINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+class MCOperand;
+
+class SystemZInstPrinter : public MCInstPrinter {
+public:
+ SystemZInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI)
+ : MCInstPrinter(MAI, MII, MRI) {}
+
+ // Automatically generated by tblgen.
+ void printInstruction(const MCInst *MI, raw_ostream &O);
+ static const char *getRegisterName(unsigned RegNo);
+
+ // Print an address with the given base, displacement and index.
+ static void printAddress(unsigned Base, int64_t Disp, unsigned Index,
+ raw_ostream &O);
+
+ // Print the given operand.
+ static void printOperand(const MCOperand &MO, raw_ostream &O);
+
+ // Override MCInstPrinter.
+ virtual void printRegName(raw_ostream &O, unsigned RegNo) const
+ LLVM_OVERRIDE;
+ virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot)
+ LLVM_OVERRIDE;
+
+private:
+ // Print various types of operand.
+ void printOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printBDAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printBDXAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printU4ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printU6ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printS8ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printU8ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printS16ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printU16ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printS32ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printU32ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printPCRelOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printCallOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printAccessRegOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+
+ // Print the mnemonic for a condition-code mask ("ne", "lh", etc.).
+ // This forms part of the instruction name rather than the operand list.
+ void printCond4Operand(const MCInst *MI, int OpNum, raw_ostream &O);
+};
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/SystemZ/LLVMBuild.txt b/lib/Target/SystemZ/LLVMBuild.txt
new file mode 100644
index 0000000..95e657f
--- /dev/null
+++ b/lib/Target/SystemZ/LLVMBuild.txt
@@ -0,0 +1,35 @@
+;===- ./lib/Target/SystemZ/LLVMBuild.txt -----------------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[common]
+subdirectories = AsmParser Disassembler InstPrinter MCTargetDesc TargetInfo
+
+[component_0]
+type = TargetGroup
+name = SystemZ
+parent = Target
+has_asmparser = 1
+has_asmprinter = 1
+has_disassembler = 1
+has_jit = 1
+
+[component_1]
+type = Library
+name = SystemZCodeGen
+parent = SystemZ
+required_libraries = AsmPrinter CodeGen Core MC SelectionDAG SystemZDesc SystemZInfo Support Target
+add_to_library_groups = SystemZ
diff --git a/lib/Target/SystemZ/MCTargetDesc/CMakeLists.txt b/lib/Target/SystemZ/MCTargetDesc/CMakeLists.txt
new file mode 100644
index 0000000..3d13128
--- /dev/null
+++ b/lib/Target/SystemZ/MCTargetDesc/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_llvm_library(LLVMSystemZDesc
+ SystemZMCAsmBackend.cpp
+ SystemZMCAsmInfo.cpp
+ SystemZMCCodeEmitter.cpp
+ SystemZMCObjectWriter.cpp
+ SystemZMCTargetDesc.cpp
+ )
+
+add_dependencies(LLVMSystemZDesc SystemZCommonTableGen)
diff --git a/lib/Target/SystemZ/MCTargetDesc/LLVMBuild.txt b/lib/Target/SystemZ/MCTargetDesc/LLVMBuild.txt
new file mode 100644
index 0000000..cbdb59c
--- /dev/null
+++ b/lib/Target/SystemZ/MCTargetDesc/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===- ./lib/Target/SystemZ/MCTargetDesc/LLVMBuild.txt ----------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = SystemZDesc
+parent = SystemZ
+required_libraries = MC SystemZAsmPrinter SystemZInfo Support
+add_to_library_groups = SystemZ
diff --git a/lib/Target/SystemZ/MCTargetDesc/Makefile b/lib/Target/SystemZ/MCTargetDesc/Makefile
new file mode 100644
index 0000000..08f1a9d
--- /dev/null
+++ b/lib/Target/SystemZ/MCTargetDesc/Makefile
@@ -0,0 +1,16 @@
+##===- lib/Target/SystemZ/MCTargetDesc/Makefile ------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../../..
+LIBRARYNAME = LLVMSystemZDesc
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
new file mode 100644
index 0000000..027db44
--- /dev/null
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
@@ -0,0 +1,150 @@
+//===-- SystemZMCAsmBackend.cpp - SystemZ assembler backend ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "MCTargetDesc/SystemZMCFixups.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectWriter.h"
+
+using namespace llvm;
+
+// Value is a fully-resolved relocation value: Symbol + Addend [- Pivot].
+// Return the bits that should be installed in a relocation field for
+// fixup kind Kind.
+static uint64_t extractBitsForFixup(MCFixupKind Kind, uint64_t Value) {
+ if (Kind < FirstTargetFixupKind)
+ return Value;
+
+ switch (unsigned(Kind)) {
+ case SystemZ::FK_390_PC16DBL:
+ case SystemZ::FK_390_PC32DBL:
+ case SystemZ::FK_390_PLT16DBL:
+ case SystemZ::FK_390_PLT32DBL:
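+ // These fixups store halfword (2-byte) offsets, so convert the byte
+ // distance to halfwords.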
+ return (int64_t)Value / 2;
+ }
+
+ llvm_unreachable("Unknown fixup kind!");
+}
+
+// If Opcode is a relaxable interprocedural reference, return the relaxed form;
+// otherwise return 0.
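+// At present the only case is relaxing BRAS (16-bit PC-relative offset)
+// to BRASL (32-bit PC-relative offset).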
+static unsigned getRelaxedOpcode(unsigned Opcode) {
+ switch (Opcode) {
+ case SystemZ::BRAS: return SystemZ::BRASL;
+ }
+ return 0;
+}
+
+namespace {
+class SystemZMCAsmBackend : public MCAsmBackend {
+ uint8_t OSABI;
+public:
+ SystemZMCAsmBackend(uint8_t osABI)
+ : OSABI(osABI) {}
+
+ // Override MCAsmBackend.
+ virtual unsigned getNumFixupKinds() const LLVM_OVERRIDE {
+ return SystemZ::NumTargetFixupKinds;
+ }
+ virtual const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const
+ LLVM_OVERRIDE;
+ virtual void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+ uint64_t Value) const LLVM_OVERRIDE;
+ virtual bool mayNeedRelaxation(const MCInst &Inst) const LLVM_OVERRIDE;
+ virtual bool fixupNeedsRelaxation(const MCFixup &Fixup,
+ uint64_t Value,
+ const MCRelaxableFragment *Fragment,
+ const MCAsmLayout &Layout) const
+ LLVM_OVERRIDE;
+ virtual void relaxInstruction(const MCInst &Inst,
+ MCInst &Res) const LLVM_OVERRIDE;
+ virtual bool writeNopData(uint64_t Count,
+ MCObjectWriter *OW) const LLVM_OVERRIDE;
+ virtual MCObjectWriter *createObjectWriter(raw_ostream &OS) const
+ LLVM_OVERRIDE {
+ return createSystemZObjectWriter(OS, OSABI);
+ }
+ virtual bool doesSectionRequireSymbols(const MCSection &Section) const
+ LLVM_OVERRIDE {
+ return false;
+ }
+};
+} // end anonymous namespace
+
+const MCFixupKindInfo &
+SystemZMCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
+ static const MCFixupKindInfo Infos[SystemZ::NumTargetFixupKinds] = {
+ { "FK_390_PC16DBL", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
+ { "FK_390_PC32DBL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "FK_390_PLT16DBL", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
+ { "FK_390_PLT32DBL", 0, 32, MCFixupKindInfo::FKF_IsPCRel }
+ };
+
+ if (Kind < FirstTargetFixupKind)
+ return MCAsmBackend::getFixupKindInfo(Kind);
+
+ assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+ "Invalid kind!");
+ return Infos[Kind - FirstTargetFixupKind];
+}
+
+void SystemZMCAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+ unsigned DataSize, uint64_t Value) const {
+ MCFixupKind Kind = Fixup.getKind();
+ unsigned Offset = Fixup.getOffset();
+ unsigned Size = (getFixupKindInfo(Kind).TargetSize + 7) / 8;
+
+ assert(Offset + Size <= DataSize && "Invalid fixup offset!");
+
+ // Big-endian insertion of Size bytes.
+ Value = extractBitsForFixup(Kind, Value);
+ unsigned ShiftValue = (Size * 8) - 8;
+ for (unsigned I = 0; I != Size; ++I) {
+ Data[Offset + I] |= uint8_t(Value >> ShiftValue);
+ ShiftValue -= 8;
+ }
+}
+
+bool SystemZMCAsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
+ return getRelaxedOpcode(Inst.getOpcode()) != 0;
+}
+
+bool
+SystemZMCAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
+ uint64_t Value,
+ const MCRelaxableFragment *Fragment,
+ const MCAsmLayout &Layout) const {
+ // At the moment we just need to relax 16-bit fields to wider fields.
+ Value = extractBitsForFixup(Fixup.getKind(), Value);
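+ // Relax when the (already halfword-scaled) value no longer fits in a
+ // signed 16-bit field.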
+ return (int16_t)Value != (int64_t)Value;
+}
+
+void SystemZMCAsmBackend::relaxInstruction(const MCInst &Inst,
+ MCInst &Res) const {
+ unsigned Opcode = getRelaxedOpcode(Inst.getOpcode());
+ assert(Opcode && "Unexpected insn to relax");
+ Res = Inst;
+ Res.setOpcode(Opcode);
+}
+
+bool SystemZMCAsmBackend::writeNopData(uint64_t Count,
+ MCObjectWriter *OW) const {
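+ // 0x07 is the first byte of BCR, so a run of 0x07 bytes decodes as
+ // "nopr %r7" no-ops.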
+ for (uint64_t I = 0; I != Count; ++I)
+ OW->Write8(7);
+ return true;
+}
+
+MCAsmBackend *llvm::createSystemZMCAsmBackend(const Target &T, StringRef TT,
+ StringRef CPU) {
+ uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(Triple(TT).getOS());
+ return new SystemZMCAsmBackend(OSABI);
+}
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
new file mode 100644
index 0000000..9e27aa0
--- /dev/null
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
@@ -0,0 +1,38 @@
+//===-- SystemZMCAsmInfo.cpp - SystemZ asm properties ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZMCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+
+using namespace llvm;
+
+SystemZMCAsmInfo::SystemZMCAsmInfo(StringRef TT) {
+ PointerSize = 8;
+ CalleeSaveStackSlotSize = 8;
+ IsLittleEndian = false;
+
+ CommentString = "#";
+ PCSymbol = ".";
+ GlobalPrefix = "";
+ PrivateGlobalPrefix = ".L";
+ WeakRefDirective = "\t.weak\t";
+ ZeroDirective = "\t.space\t";
+ Data64bitsDirective = "\t.quad\t";
+ UsesELFSectionDirectiveForBSS = true;
+ SupportsDebugInformation = true;
+ HasLEB128 = true;
+ ExceptionsType = ExceptionHandling::DwarfCFI;
+}
+
+const MCSection *
+SystemZMCAsmInfo::getNonexecutableStackSection(MCContext &Ctx) const {
+ return Ctx.getELFSection(".note.GNU-stack", ELF::SHT_PROGBITS,
+ 0, SectionKind::getMetadata());
+}
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
new file mode 100644
index 0000000..d440787
--- /dev/null
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
@@ -0,0 +1,30 @@
+//====-- SystemZMCAsmInfo.h - SystemZ asm properties -----------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SystemZTARGETASMINFO_H
+#define SystemZTARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+class StringRef;
+
+class SystemZMCAsmInfo : public MCAsmInfo {
+public:
+ explicit SystemZMCAsmInfo(StringRef TT);
+
+ // Override MCAsmInfo.
+ virtual const MCSection *getNonexecutableStackSection(MCContext &Ctx) const
+ LLVM_OVERRIDE;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
new file mode 100644
index 0000000..7721b1f
--- /dev/null
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
@@ -0,0 +1,183 @@
+//===-- SystemZMCCodeEmitter.cpp - Convert SystemZ code to machine code ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SystemZMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mccodeemitter"
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "MCTargetDesc/SystemZMCFixups.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInstrInfo.h"
+
+using namespace llvm;
+
+namespace {
+class SystemZMCCodeEmitter : public MCCodeEmitter {
+ const MCInstrInfo &MCII;
+ MCContext &Ctx;
+
+public:
+ SystemZMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
+ : MCII(mcii), Ctx(ctx) {
+ }
+
+ ~SystemZMCCodeEmitter() {}
+
+ // Override MCCodeEmitter.
+ virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups) const
+ LLVM_OVERRIDE;
+
+private:
+ // Automatically generated by TableGen.
+ uint64_t getBinaryCodeForInstr(const MCInst &MI,
+ SmallVectorImpl<MCFixup> &Fixups) const;
+
+ // Called by the TableGen code to get the binary encoding of operand
+ // MO in MI. Fixups is the list of fixups against MI.
+ uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups) const;
+
+ // Called by the TableGen code to get the binary encoding of an address.
+ // The index, if any, is encoded first, followed by the base,
+ // followed by the displacement. In a 20-bit displacement,
+ // the low 12 bits are encoded before the high 8 bits.
+ uint64_t getBDAddr12Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups) const;
+ uint64_t getBDAddr20Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups) const;
+ uint64_t getBDXAddr12Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups) const;
+ uint64_t getBDXAddr20Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups) const;
+
+ // Operand OpNum of MI needs a PC-relative fixup of kind Kind at
+ // Offset bytes from the start of MI. Add the fixup to Fixups
+ // and return the in-place addend, which is always 0 since we're a
+ // RELA target.
+ uint64_t getPCRelEncoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ unsigned Kind, int64_t Offset) const;
+
+ uint64_t getPC16DBLEncoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups) const {
+ return getPCRelEncoding(MI, OpNum, Fixups, SystemZ::FK_390_PC16DBL, 2);
+ }
+ uint64_t getPC32DBLEncoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups) const {
+ return getPCRelEncoding(MI, OpNum, Fixups, SystemZ::FK_390_PC32DBL, 2);
+ }
+ uint64_t getPLT16DBLEncoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups) const {
+ return getPCRelEncoding(MI, OpNum, Fixups, SystemZ::FK_390_PLT16DBL, 2);
+ }
+ uint64_t getPLT32DBLEncoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups) const {
+ return getPCRelEncoding(MI, OpNum, Fixups, SystemZ::FK_390_PLT32DBL, 2);
+ }
+};
+}
+
+MCCodeEmitter *llvm::createSystemZMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &MCSTI,
+ MCContext &Ctx) {
+ return new SystemZMCCodeEmitter(MCII, Ctx);
+}
+
+void SystemZMCCodeEmitter::
+EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups) const {
+ uint64_t Bits = getBinaryCodeForInstr(MI, Fixups);
+ unsigned Size = MCII.get(MI.getOpcode()).getSize();
+ // Big-endian insertion of Size bytes.
+ unsigned ShiftValue = (Size * 8) - 8;
+ for (unsigned I = 0; I != Size; ++I) {
+ OS << uint8_t(Bits >> ShiftValue);
+ ShiftValue -= 8;
+ }
+}
+
+uint64_t SystemZMCCodeEmitter::
+getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups) const {
+ if (MO.isReg())
+ return Ctx.getRegisterInfo().getEncodingValue(MO.getReg());
+ if (MO.isImm())
+ return static_cast<uint64_t>(MO.getImm());
+ llvm_unreachable("Unexpected operand type!");
+}
+
+uint64_t SystemZMCCodeEmitter::
+getBDAddr12Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups) const {
+ uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups);
+ uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups);
+ assert(isUInt<4>(Base) && isUInt<12>(Disp));
+ return (Base << 12) | Disp;
+}
+
+uint64_t SystemZMCCodeEmitter::
+getBDAddr20Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups) const {
+ uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups);
+ uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups);
+ assert(isUInt<4>(Base) && isInt<20>(Disp));
+ return (Base << 20) | ((Disp & 0xfff) << 8) | ((Disp & 0xff000) >> 12);
+}
+
+uint64_t SystemZMCCodeEmitter::
+getBDXAddr12Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups) const {
+ uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups);
+ uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups);
+ uint64_t Index = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups);
+ assert(isUInt<4>(Base) && isUInt<12>(Disp) && isUInt<4>(Index));
+ return (Index << 16) | (Base << 12) | Disp;
+}
+
+uint64_t SystemZMCCodeEmitter::
+getBDXAddr20Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups) const {
+ uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups);
+ uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups);
+ uint64_t Index = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups);
+ assert(isUInt<4>(Base) && isInt<20>(Disp) && isUInt<4>(Index));
+ return (Index << 24) | (Base << 20) | ((Disp & 0xfff) << 8)
+ | ((Disp & 0xff000) >> 12);
+}
+
+uint64_t
+SystemZMCCodeEmitter::getPCRelEncoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ unsigned Kind, int64_t Offset) const {
+ const MCOperand &MO = MI.getOperand(OpNum);
+ const MCExpr *Expr;
+ if (MO.isImm())
+ Expr = MCConstantExpr::Create(MO.getImm() + Offset, Ctx);
+ else {
+ Expr = MO.getExpr();
+ if (Offset) {
+ // The operand value is relative to the start of MI, but the fixup
+ // is relative to the operand field itself, which is Offset bytes
+ // into MI. Add Offset to the relocation value to cancel out
+ // this difference.
+ const MCExpr *OffsetExpr = MCConstantExpr::Create(Offset, Ctx);
+ Expr = MCBinaryExpr::CreateAdd(Expr, OffsetExpr, Ctx);
+ }
+ }
+ Fixups.push_back(MCFixup::Create(Offset, Expr, (MCFixupKind)Kind));
+ return 0;
+}
+
+#include "SystemZGenMCCodeEmitter.inc"
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h
new file mode 100644
index 0000000..9c94ebb
--- /dev/null
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h
@@ -0,0 +1,31 @@
+//===-- SystemZMCFixups.h - SystemZ-specific fixup entries ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SYSTEMZMCFIXUPS_H
+#define LLVM_SYSTEMZMCFIXUPS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace SystemZ {
+ enum FixupKind {
+ // These correspond directly to R_390_* relocations.
+ FK_390_PC16DBL = FirstTargetFixupKind,
+ FK_390_PC32DBL,
+ FK_390_PLT16DBL,
+ FK_390_PLT32DBL,
+
+ // Marker
+ LastTargetFixupKind,
+ NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+ };
+}
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
new file mode 100644
index 0000000..36e3d83
--- /dev/null
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
@@ -0,0 +1,140 @@
+//===-- SystemZMCObjectWriter.cpp - SystemZ ELF writer --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "MCTargetDesc/SystemZMCFixups.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+
+using namespace llvm;
+
+namespace {
+class SystemZObjectWriter : public MCELFObjectTargetWriter {
+public:
+ SystemZObjectWriter(uint8_t OSABI);
+
+ virtual ~SystemZObjectWriter();
+
+protected:
+ // Override MCELFObjectTargetWriter.
+ virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+ bool IsPCRel, bool IsRelocWithSymbol,
+ int64_t Addend) const LLVM_OVERRIDE;
+ virtual const MCSymbol *ExplicitRelSym(const MCAssembler &Asm,
+ const MCValue &Target,
+ const MCFragment &F,
+ const MCFixup &Fixup,
+ bool IsPCRel) const LLVM_OVERRIDE;
+};
+} // end anonymous namespace
+
+SystemZObjectWriter::SystemZObjectWriter(uint8_t OSABI)
+ : MCELFObjectTargetWriter(/*Is64Bit=*/true, OSABI, ELF::EM_S390,
+ /*HasRelocationAddend=*/ true) {}
+
+SystemZObjectWriter::~SystemZObjectWriter() {
+}
+
+// Return the relocation type for an absolute value of MCFixupKind Kind.
+static unsigned getAbsoluteReloc(unsigned Kind) {
+ switch (Kind) {
+ case FK_Data_1: return ELF::R_390_8;
+ case FK_Data_2: return ELF::R_390_16;
+ case FK_Data_4: return ELF::R_390_32;
+ case FK_Data_8: return ELF::R_390_64;
+ }
+ llvm_unreachable("Unsupported absolute address");
+}
+
+// Return the relocation type for a PC-relative value of MCFixupKind Kind.
+static unsigned getPCRelReloc(unsigned Kind) {
+ switch (Kind) {
+ case FK_Data_2: return ELF::R_390_PC16;
+ case FK_Data_4: return ELF::R_390_PC32;
+ case FK_Data_8: return ELF::R_390_PC64;
+ case SystemZ::FK_390_PC16DBL: return ELF::R_390_PC16DBL;
+ case SystemZ::FK_390_PC32DBL: return ELF::R_390_PC32DBL;
+ case SystemZ::FK_390_PLT16DBL: return ELF::R_390_PLT16DBL;
+ case SystemZ::FK_390_PLT32DBL: return ELF::R_390_PLT32DBL;
+ }
+ llvm_unreachable("Unsupported PC-relative address");
+}
+
+// Return the R_390_TLS_LE* relocation type for MCFixupKind Kind.
+static unsigned getTLSLEReloc(unsigned Kind) {
+ switch (Kind) {
+ case FK_Data_4: return ELF::R_390_TLS_LE32;
+ case FK_Data_8: return ELF::R_390_TLS_LE64;
+ }
+ llvm_unreachable("Unsupported absolute address");
+}
+
+// Return the PLT relocation counterpart of MCFixupKind Kind.
+static unsigned getPLTReloc(unsigned Kind) {
+ switch (Kind) {
+ case SystemZ::FK_390_PC16DBL: return ELF::R_390_PLT16DBL;
+ case SystemZ::FK_390_PC32DBL: return ELF::R_390_PLT32DBL;
+ }
+ llvm_unreachable("Unsupported absolute address");
+}
+
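+// Map the combination of symbol modifier, PC-relativity and fixup kind
+// described by Target and Fixup to an ELF relocation type. For example,
+// a PC-relative FK_Data_4 against a plain symbol becomes R_390_PC32,
+// while a PC-relative FK_390_PC32DBL against sym@GOT becomes R_390_GOTENT.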
+unsigned SystemZObjectWriter::GetRelocType(const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel,
+ bool IsRelocWithSymbol,
+ int64_t Addend) const {
+ MCSymbolRefExpr::VariantKind Modifier = (Target.isAbsolute() ?
+ MCSymbolRefExpr::VK_None :
+ Target.getSymA()->getKind());
+ unsigned Kind = Fixup.getKind();
+ switch (Modifier) {
+ case MCSymbolRefExpr::VK_None:
+ if (IsPCRel)
+ return getPCRelReloc(Kind);
+ return getAbsoluteReloc(Kind);
+
+ case MCSymbolRefExpr::VK_NTPOFF:
+ assert(!IsPCRel && "NTPOFF shouldn't be PC-relative");
+ return getTLSLEReloc(Kind);
+
+ case MCSymbolRefExpr::VK_GOT:
+ if (IsPCRel && Kind == SystemZ::FK_390_PC32DBL)
+ return ELF::R_390_GOTENT;
+ llvm_unreachable("Only PC-relative GOT accesses are supported for now");
+
+ case MCSymbolRefExpr::VK_PLT:
+ assert(IsPCRel && "@PLT shouldn't be PC-relative");
+ return getPLTReloc(Kind);
+
+ default:
+ llvm_unreachable("Modifier not supported");
+ }
+}
+
+const MCSymbol *SystemZObjectWriter::ExplicitRelSym(const MCAssembler &Asm,
+ const MCValue &Target,
+ const MCFragment &F,
+ const MCFixup &Fixup,
+ bool IsPCRel) const {
+ // The addend in a PC-relative R_390_* relocation is always applied to
+ // the PC-relative part of the address. If some kind of indirection
+ // is applied to the symbol first, we can't use an addend there too.
+ if (!Target.isAbsolute() &&
+ Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None &&
+ IsPCRel)
+ return &Target.getSymA()->getSymbol().AliasedSymbol();
+ return NULL;
+}
+
+MCObjectWriter *llvm::createSystemZObjectWriter(raw_ostream &OS,
+ uint8_t OSABI) {
+ MCELFObjectTargetWriter *MOTW = new SystemZObjectWriter(OSABI);
+ return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/false);
+}
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
new file mode 100644
index 0000000..3653192
--- /dev/null
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -0,0 +1,204 @@
+//===-- SystemZMCTargetDesc.cpp - SystemZ target descriptions -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZMCTargetDesc.h"
+#include "InstPrinter/SystemZInstPrinter.h"
+#include "SystemZMCAsmInfo.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_MC_DESC
+#include "SystemZGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "SystemZGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "SystemZGenRegisterInfo.inc"
+
+using namespace llvm;
+
+const unsigned SystemZMC::GR32Regs[16] = {
+ SystemZ::R0W, SystemZ::R1W, SystemZ::R2W, SystemZ::R3W,
+ SystemZ::R4W, SystemZ::R5W, SystemZ::R6W, SystemZ::R7W,
+ SystemZ::R8W, SystemZ::R9W, SystemZ::R10W, SystemZ::R11W,
+ SystemZ::R12W, SystemZ::R13W, SystemZ::R14W, SystemZ::R15W
+};
+
+const unsigned SystemZMC::GR64Regs[16] = {
+ SystemZ::R0D, SystemZ::R1D, SystemZ::R2D, SystemZ::R3D,
+ SystemZ::R4D, SystemZ::R5D, SystemZ::R6D, SystemZ::R7D,
+ SystemZ::R8D, SystemZ::R9D, SystemZ::R10D, SystemZ::R11D,
+ SystemZ::R12D, SystemZ::R13D, SystemZ::R14D, SystemZ::R15D
+};
+
+const unsigned SystemZMC::GR128Regs[16] = {
+ SystemZ::R0Q, 0, SystemZ::R2Q, 0,
+ SystemZ::R4Q, 0, SystemZ::R6Q, 0,
+ SystemZ::R8Q, 0, SystemZ::R10Q, 0,
+ SystemZ::R12Q, 0, SystemZ::R14Q, 0
+};
+
+const unsigned SystemZMC::FP32Regs[16] = {
+ SystemZ::F0S, SystemZ::F1S, SystemZ::F2S, SystemZ::F3S,
+ SystemZ::F4S, SystemZ::F5S, SystemZ::F6S, SystemZ::F7S,
+ SystemZ::F8S, SystemZ::F9S, SystemZ::F10S, SystemZ::F11S,
+ SystemZ::F12S, SystemZ::F13S, SystemZ::F14S, SystemZ::F15S
+};
+
+const unsigned SystemZMC::FP64Regs[16] = {
+ SystemZ::F0D, SystemZ::F1D, SystemZ::F2D, SystemZ::F3D,
+ SystemZ::F4D, SystemZ::F5D, SystemZ::F6D, SystemZ::F7D,
+ SystemZ::F8D, SystemZ::F9D, SystemZ::F10D, SystemZ::F11D,
+ SystemZ::F12D, SystemZ::F13D, SystemZ::F14D, SystemZ::F15D
+};
+
+const unsigned SystemZMC::FP128Regs[16] = {
+ SystemZ::F0Q, SystemZ::F1Q, 0, 0,
+ SystemZ::F4Q, SystemZ::F5Q, 0, 0,
+ SystemZ::F8Q, SystemZ::F9Q, 0, 0,
+ SystemZ::F12Q, SystemZ::F13Q, 0, 0
+};
+
+static MCAsmInfo *createSystemZMCAsmInfo(const MCRegisterInfo &MRI,
+ StringRef TT) {
+ MCAsmInfo *MAI = new SystemZMCAsmInfo(TT);
+ MCCFIInstruction Inst =
+ MCCFIInstruction::createDefCfa(0, MRI.getDwarfRegNum(SystemZ::R15D, true),
+ SystemZMC::CFAOffsetFromInitialSP);
+ MAI->addInitialFrameState(Inst);
+ return MAI;
+}
+
+static MCInstrInfo *createSystemZMCInstrInfo() {
+ MCInstrInfo *X = new MCInstrInfo();
+ InitSystemZMCInstrInfo(X);
+ return X;
+}
+
+static MCRegisterInfo *createSystemZMCRegisterInfo(StringRef TT) {
+ MCRegisterInfo *X = new MCRegisterInfo();
+ InitSystemZMCRegisterInfo(X, SystemZ::R14D);
+ return X;
+}
+
+static MCSubtargetInfo *createSystemZMCSubtargetInfo(StringRef TT,
+ StringRef CPU,
+ StringRef FS) {
+ MCSubtargetInfo *X = new MCSubtargetInfo();
+ InitSystemZMCSubtargetInfo(X, TT, CPU, FS);
+ return X;
+}
+
+static MCCodeGenInfo *createSystemZMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL) {
+ MCCodeGenInfo *X = new MCCodeGenInfo();
+
+ // Static code is suitable for use in a dynamic executable; there is no
+ // separate DynamicNoPIC model.
+ if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC)
+ RM = Reloc::Static;
+
+ // For SystemZ we define the models as follows:
+ //
+ // Small: BRASL can call any function and will use a stub if necessary.
+ // Locally-binding symbols will always be in range of LARL.
+ //
+ // Medium: BRASL can call any function and will use a stub if necessary.
+ // GOT slots and locally-defined text will always be in range
+ // of LARL, but other symbols might not be.
+ //
+ // Large: Equivalent to Medium for now.
+ //
+ // Kernel: Equivalent to Medium for now.
+ //
+ // This means that any PIC module smaller than 4GB meets the
+ // requirements of Small, so Small seems like the best default there.
+ //
+ // All symbols bind locally in a non-PIC module, so the choice is less
+ // obvious. There are two cases:
+ //
+ // - When creating an executable, PLTs and copy relocations allow
+ // us to treat external symbols as part of the executable.
+ // Any executable smaller than 4GB meets the requirements of Small,
+ // so that seems like the best default.
+ //
+ // - When creating JIT code, stubs will be in range of BRASL if the
+ // image is less than 4GB in size. GOT entries will likewise be
+ // in range of LARL. However, the JIT environment has no equivalent
+ // of copy relocs, so locally-binding data symbols might not be in
+ // the range of LARL. We need the Medium model in that case.
+ if (CM == CodeModel::Default)
+ CM = CodeModel::Small;
+ else if (CM == CodeModel::JITDefault)
+ CM = RM == Reloc::PIC_ ? CodeModel::Small : CodeModel::Medium;
+ X->InitMCCodeGenInfo(RM, CM, OL);
+ return X;
+}
+
+static MCInstPrinter *createSystemZMCInstPrinter(const Target &T,
+ unsigned SyntaxVariant,
+ const MCAsmInfo &MAI,
+ const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI) {
+ return new SystemZInstPrinter(MAI, MII, MRI);
+}
+
+static MCStreamer *createSystemZMCObjectStreamer(const Target &T, StringRef TT,
+ MCContext &Ctx,
+ MCAsmBackend &MAB,
+ raw_ostream &OS,
+ MCCodeEmitter *Emitter,
+ bool RelaxAll,
+ bool NoExecStack) {
+ return createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll, NoExecStack);
+}
+
+extern "C" void LLVMInitializeSystemZTargetMC() {
+ // Register the MCAsmInfo.
+ TargetRegistry::RegisterMCAsmInfo(TheSystemZTarget,
+ createSystemZMCAsmInfo);
+
+ // Register the MCCodeGenInfo.
+ TargetRegistry::RegisterMCCodeGenInfo(TheSystemZTarget,
+ createSystemZMCCodeGenInfo);
+
+ // Register the MCCodeEmitter.
+ TargetRegistry::RegisterMCCodeEmitter(TheSystemZTarget,
+ createSystemZMCCodeEmitter);
+
+ // Register the MCInstrInfo.
+ TargetRegistry::RegisterMCInstrInfo(TheSystemZTarget,
+ createSystemZMCInstrInfo);
+
+ // Register the MCRegisterInfo.
+ TargetRegistry::RegisterMCRegInfo(TheSystemZTarget,
+ createSystemZMCRegisterInfo);
+
+ // Register the MCSubtargetInfo.
+ TargetRegistry::RegisterMCSubtargetInfo(TheSystemZTarget,
+ createSystemZMCSubtargetInfo);
+
+ // Register the MCAsmBackend.
+ TargetRegistry::RegisterMCAsmBackend(TheSystemZTarget,
+ createSystemZMCAsmBackend);
+
+ // Register the MCInstPrinter.
+ TargetRegistry::RegisterMCInstPrinter(TheSystemZTarget,
+ createSystemZMCInstPrinter);
+
+ // Register the MCObjectStreamer.
+ TargetRegistry::RegisterMCObjectStreamer(TheSystemZTarget,
+ createSystemZMCObjectStreamer);
+}
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
new file mode 100644
index 0000000..3c9f0cb
--- /dev/null
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
@@ -0,0 +1,75 @@
+//===-- SystemZMCTargetDesc.h - SystemZ target descriptions -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SYSTEMZMCTARGETDESC_H
+#define SYSTEMZMCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class StringRef;
+class Target;
+class raw_ostream;
+
+extern Target TheSystemZTarget;
+
+namespace SystemZMC {
+ // How many bytes are in the ABI-defined, caller-allocated part of
+ // a stack frame.
+ const int64_t CallFrameSize = 160;
+
+ // The offset of the DWARF CFA from the incoming stack pointer.
+ const int64_t CFAOffsetFromInitialSP = CallFrameSize;
+
+ // Maps of asm register numbers to LLVM register numbers, with 0 indicating
+ // an invalid register. In principle we could use 32-bit and 64-bit register
+ // classes directly, provided that we relegated the GPR allocation order
+ // in SystemZRegisterInfo.td to an AltOrder and left the default order
+ // as %r0-%r15. It seems better to provide the same interface for
+ // all classes though.
+ extern const unsigned GR32Regs[16];
+ extern const unsigned GR64Regs[16];
+ extern const unsigned GR128Regs[16];
+ extern const unsigned FP32Regs[16];
+ extern const unsigned FP64Regs[16];
+ extern const unsigned FP128Regs[16];
+}
+
+MCCodeEmitter *createSystemZMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx);
+
+MCAsmBackend *createSystemZMCAsmBackend(const Target &T, StringRef TT,
+ StringRef CPU);
+
+MCObjectWriter *createSystemZObjectWriter(raw_ostream &OS, uint8_t OSABI);
+} // end namespace llvm
+
+// Defines symbolic names for SystemZ registers.
+// This defines a mapping from register name to register number.
+#define GET_REGINFO_ENUM
+#include "SystemZGenRegisterInfo.inc"
+
+// Defines symbolic names for the SystemZ instructions.
+#define GET_INSTRINFO_ENUM
+#include "SystemZGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "SystemZGenSubtargetInfo.inc"
+
+#endif
diff --git a/lib/Target/SystemZ/Makefile b/lib/Target/SystemZ/Makefile
new file mode 100644
index 0000000..445725b
--- /dev/null
+++ b/lib/Target/SystemZ/Makefile
@@ -0,0 +1,29 @@
+##===- lib/Target/SystemZ/Makefile -------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMSystemZCodeGen
+TARGET = SystemZ
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = SystemZGenRegisterInfo.inc \
+ SystemZGenAsmWriter.inc \
+ SystemZGenAsmMatcher.inc \
+ SystemZGenCodeEmitter.inc \
+ SystemZGenDisassemblerTables.inc \
+ SystemZGenInstrInfo.inc \
+ SystemZGenDAGISel.inc \
+ SystemZGenSubtargetInfo.inc \
+ SystemZGenCallingConv.inc \
+ SystemZGenMCCodeEmitter.inc
+
+DIRS = InstPrinter AsmParser Disassembler TargetInfo MCTargetDesc
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Target/SystemZ/README.txt b/lib/Target/SystemZ/README.txt
new file mode 100644
index 0000000..55e9fc0
--- /dev/null
+++ b/lib/Target/SystemZ/README.txt
@@ -0,0 +1,216 @@
+//===---------------------------------------------------------------------===//
+// Random notes about and ideas for the SystemZ backend.
+//===---------------------------------------------------------------------===//
+
+The initial backend is deliberately restricted to z10. We should add support
+for later architectures at some point.
+
+--
+
+SystemZDAGToDAGISel::SelectInlineAsmMemoryOperand() is passed "m" for all
+inline asm memory constraints; it doesn't get to see the original constraint.
+This means that it must conservatively treat all inline asm constraints
+as the most restricted type, "R".
+
+--
+
+If an inline asm ties an i32 "r" result to an i64 input, the input
+will be treated as an i32, leaving the upper bits uninitialised.
+For example:
+
+define void @f4(i32 *%dst) {
+ %val = call i32 asm "blah $0", "=r,0" (i64 103)
+ store i32 %val, i32 *%dst
+ ret void
+}
+
+from CodeGen/SystemZ/asm-09.ll will use LHI rather than LGHI
+to load 103. This seems to be a general target-independent problem.
+
+--
+
+The tuning of the choice between LOAD ADDRESS (LA) and addition in
+SystemZISelDAGToDAG.cpp is suspect. It should be tweaked based on
+performance measurements.
+
+--
+
+We don't support tail calls at present.
+
+--
+
+We don't support prefetching yet.
+
+--
+
+There is no scheduling support.
+
+--
+
+We don't use the BRANCH ON COUNT or BRANCH ON INDEX families of instructions.
+
+--
+
+We might want to use BRANCH ON CONDITION for conditional indirect calls
+and conditional returns.
+
+--
+
+We don't use the condition code results of anything except comparisons.
+
+Implementing this may need something more finely grained than the z_cmp
+and z_ucmp that we have now. It might (or might not) also be useful to
+have a mask of "don't care" values in conditional branches. For example,
+integer comparisons never set CC to 3, so the bottom bit of the CC mask
+isn't particularly relevant. JNLH and JE are equally good for testing
+equality after an integer comparison, etc.
+
+--
+
+We don't use the LOAD AND TEST or TEST DATA CLASS instructions.
+
+--
+
+We could use the generic floating-point forms of LOAD COMPLEMENT,
+LOAD NEGATIVE and LOAD POSITIVE in cases where we don't need the
+condition codes. For example, we could use LCDFR instead of LCDBR.
+
+--
+
+We don't optimize block memory operations.
+
+It's definitely worth using things like MVC, CLC, NC, XC and OC with
+constant lengths. MVCIN may be worthwhile too.
+
+We should probably implement things like memcpy using MVC with EXECUTE.
+Likewise memcmp and CLC. MVCLE and CLCLE could be useful too.
+
+--
+
+We don't optimize string operations.
+
+MVST, CLST, SRST and CUSE could be useful here. Some of the TRANSLATE
+family might be too, although they are probably more difficult to exploit.
+
+--
+
+We don't take full advantage of builtins like fabsl because the calling
+conventions require f128s to be returned by invisible reference.
+
+--
+
+ADD LOGICAL WITH SIGNED IMMEDIATE could be useful when we need to
+produce a carry. SUBTRACT LOGICAL IMMEDIATE could be useful when we
+need to produce a borrow. (Note that there are no memory forms of
+ADD LOGICAL WITH CARRY and SUBTRACT LOGICAL WITH BORROW, so the high
+part of 128-bit memory operations would probably need to be done
+via a register.)
+
+--
+
+We don't use the halfword forms of LOAD REVERSED and STORE REVERSED
+(LRVH and STRVH).
+
+--
+
+We could take advantage of the various ... UNDER MASK instructions,
+such as ICM and STCM.
+
+--
+
+We could make more use of the ROTATE AND ... SELECTED BITS instructions.
+At the moment we only use RISBG, and only then for subword atomic operations.
+
+--
+
+DAGCombiner can detect integer absolute, but there's not yet an associated
+ISD opcode. We could add one and implement it using LOAD POSITIVE.
+Negated absolutes could use LOAD NEGATIVE.
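+
+For example (an illustrative function, not taken from the testsuite):
+
+define i64 @f3(i64 %a) {
+  %neg = sub i64 0, %a
+  %cmp = icmp sgt i64 %a, -1
+  %abs = select i1 %cmp, i64 %a, i64 %neg
+  ret i64 %abs
+}
+
+could then be selected to a single LOAD POSITIVE once such an opcode exists.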
+
+--
+
+DAGCombiner doesn't yet fold truncations of extended loads. Functions like:
+
+ unsigned long f (unsigned long x, unsigned short *y)
+ {
+ return (x << 32) | *y;
+ }
+
+therefore end up as:
+
+ sllg %r2, %r2, 32
+ llgh %r0, 0(%r3)
+ lr %r2, %r0
+ br %r14
+
+but truncating the load would give:
+
+ sllg %r2, %r2, 32
+ lh %r2, 0(%r3)
+ br %r14
+
+--
+
+Functions like:
+
+define i64 @f1(i64 %a) {
+ %and = and i64 %a, 1
+ ret i64 %and
+}
+
+ought to be implemented as:
+
+ lhi %r0, 1
+ ngr %r2, %r0
+ br %r14
+
+but two-address optimisations reverse the order of the AND and force:
+
+ lhi %r0, 1
+ ngr %r0, %r2
+ lgr %r2, %r0
+ br %r14
+
+CodeGen/SystemZ/and-04.ll has several examples of this.
+
+--
+
+Out-of-range displacements are usually handled by loading the full
+address into a register. In many cases it would be better to create
+an anchor point instead. E.g. for:
+
+define void @f4a(i128 *%aptr, i64 %base) {
+ %addr = add i64 %base, 524288
+ %bptr = inttoptr i64 %addr to i128 *
+ %a = load volatile i128 *%aptr
+ %b = load i128 *%bptr
+ %add = add i128 %a, %b
+ store i128 %add, i128 *%aptr
+ ret void
+}
+
+(from CodeGen/SystemZ/int-add-08.ll) we load %base+524288 and %base+524296
+into separate registers, rather than using %base+524288 as a base for both.
+
+--
+
+Dynamic stack allocations round the size to 8 bytes and then allocate
+that rounded amount. It would be simpler to subtract the unrounded
+size from the copy of the stack pointer and then align the result.
+See CodeGen/SystemZ/alloca-01.ll for an example.
+
+--
+
+Atomic loads and stores use the default compare-and-swap based implementation.
+This is much too conservative in practice, since the architecture guarantees
+that 1-, 2-, 4- and 8-byte loads and stores to aligned addresses are
+inherently atomic.
+
+--
+
+If needed, we can support 16-byte atomics using LPQ, STPQ and CSDG.
+
+--
+
+We might want to model all access registers and use them to spill
+32-bit values.
diff --git a/lib/Target/SystemZ/SystemZ.h b/lib/Target/SystemZ/SystemZ.h
new file mode 100644
index 0000000..24612bb
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZ.h
@@ -0,0 +1,78 @@
+//==- SystemZ.h - Top-Level Interface for SystemZ representation -*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in
+// the LLVM SystemZ backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SYSTEMZ_H
+#define SYSTEMZ_H
+
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "llvm/Support/CodeGen.h"
+
+namespace llvm {
+ class SystemZTargetMachine;
+ class FunctionPass;
+
+ namespace SystemZ {
+ // Condition-code mask values.
+ const unsigned CCMASK_0 = 1 << 3;
+ const unsigned CCMASK_1 = 1 << 2;
+ const unsigned CCMASK_2 = 1 << 1;
+ const unsigned CCMASK_3 = 1 << 0;
+ const unsigned CCMASK_ANY = CCMASK_0 | CCMASK_1 | CCMASK_2 | CCMASK_3;
+
+ // Condition-code mask assignments for floating-point comparisons.
+ const unsigned CCMASK_CMP_EQ = CCMASK_0;
+ const unsigned CCMASK_CMP_LT = CCMASK_1;
+ const unsigned CCMASK_CMP_GT = CCMASK_2;
+ const unsigned CCMASK_CMP_UO = CCMASK_3;
+ const unsigned CCMASK_CMP_NE = CCMASK_CMP_LT | CCMASK_CMP_GT;
+ const unsigned CCMASK_CMP_LE = CCMASK_CMP_EQ | CCMASK_CMP_LT;
+ const unsigned CCMASK_CMP_GE = CCMASK_CMP_EQ | CCMASK_CMP_GT;
+ const unsigned CCMASK_CMP_O = CCMASK_ANY ^ CCMASK_CMP_UO;
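+
+ // For example, CCMASK_CMP_LE is CCMASK_0 | CCMASK_1 (binary 1100, i.e. 12),
+ // which as a branch mask accepts condition codes 0 and 1.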
+
+ // Return true if Val fits an LLILL operand.
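+ // For example, isImmLL(0x1234) is true but isImmLL(0x12340000) is not,
+ // since the latter has bits set above the low 16.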
+ static inline bool isImmLL(uint64_t Val) {
+ return (Val & ~0x000000000000ffffULL) == 0;
+ }
+
+ // Return true if Val fits an LLILH operand.
+ static inline bool isImmLH(uint64_t Val) {
+ return (Val & ~0x00000000ffff0000ULL) == 0;
+ }
+
+ // Return true if Val fits an LLIHL operand.
+ static inline bool isImmHL(uint64_t Val) {
+ return (Val & ~0x00000ffff00000000ULL) == 0;
+ }
+
+ // Return true if Val fits an LLIHH operand.
+ static inline bool isImmHH(uint64_t Val) {
+ return (Val & ~0xffff000000000000ULL) == 0;
+ }
+
+ // Return true if Val fits an LLILF operand.
+ static inline bool isImmLF(uint64_t Val) {
+ return (Val & ~0x00000000ffffffffULL) == 0;
+ }
+
+ // Return true if Val fits an LLIHF operand.
+ static inline bool isImmHF(uint64_t Val) {
+ return (Val & ~0xffffffff00000000ULL) == 0;
+ }
+ }
+
+ FunctionPass *createSystemZISelDag(SystemZTargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
+ FunctionPass *createSystemZLongBranchPass(SystemZTargetMachine &TM);
+} // end namespace llvm
+#endif
diff --git a/lib/Target/SystemZ/SystemZ.td b/lib/Target/SystemZ/SystemZ.td
new file mode 100644
index 0000000..e03c32f
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZ.td
@@ -0,0 +1,75 @@
+//===-- SystemZ.td - Describe the SystemZ target machine -----*- tblgen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// SystemZ supported processors
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"z10", []>;
+
+//===----------------------------------------------------------------------===//
+// Register file description
+//===----------------------------------------------------------------------===//
+
+include "SystemZRegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Calling convention description
+//===----------------------------------------------------------------------===//
+
+include "SystemZCallingConv.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction descriptions
+//===----------------------------------------------------------------------===//
+
+include "SystemZOperators.td"
+include "SystemZOperands.td"
+include "SystemZPatterns.td"
+include "SystemZInstrFormats.td"
+include "SystemZInstrInfo.td"
+include "SystemZInstrFP.td"
+
+def SystemZInstrInfo : InstrInfo {}
+
+//===----------------------------------------------------------------------===//
+// Assembly parser
+//===----------------------------------------------------------------------===//
+
+def SystemZAsmParser : AsmParser {
+ let ShouldEmitMatchRegisterName = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Assembly writer
+//===----------------------------------------------------------------------===//
+
+def SystemZAsmWriter : AsmWriter {
+ string AsmWriterClassName = "InstPrinter";
+ bit isMCAsmWriter = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Top-level target declaration
+//===----------------------------------------------------------------------===//
+
+def SystemZ : Target {
+ let InstructionSet = SystemZInstrInfo;
+ let AssemblyParsers = [SystemZAsmParser];
+ let AssemblyWriters = [SystemZAsmWriter];
+}
diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
new file mode 100644
index 0000000..1e15ab1
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -0,0 +1,113 @@
+//===-- SystemZAsmPrinter.cpp - SystemZ LLVM assembly printer -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Streams SystemZ assembly language and associated data, in the form of
+// MCInsts and MCExprs respectively.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZAsmPrinter.h"
+#include "InstPrinter/SystemZInstPrinter.h"
+#include "SystemZConstantPoolValue.h"
+#include "SystemZMCInstLower.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/Mangler.h"
+
+using namespace llvm;
+
+void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ SystemZMCInstLower Lower(Mang, MF->getContext(), *this);
+ MCInst LoweredMI;
+ Lower.lower(MI, LoweredMI);
+ OutStreamer.EmitInstruction(LoweredMI);
+}
+
+// Convert a SystemZ-specific constant pool modifier into the associated
+// MCSymbolRefExpr variant kind.
+static MCSymbolRefExpr::VariantKind
+getModifierVariantKind(SystemZCP::SystemZCPModifier Modifier) {
+ switch (Modifier) {
+ case SystemZCP::NTPOFF: return MCSymbolRefExpr::VK_NTPOFF;
+ }
+ llvm_unreachable("Invalid SystemCPModifier!");
+}
+
+void SystemZAsmPrinter::
+EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
+ SystemZConstantPoolValue *ZCPV =
+ static_cast<SystemZConstantPoolValue*>(MCPV);
+
+ const MCExpr *Expr =
+ MCSymbolRefExpr::Create(Mang->getSymbol(ZCPV->getGlobalValue()),
+ getModifierVariantKind(ZCPV->getModifier()),
+ OutContext);
+ uint64_t Size = TM.getDataLayout()->getTypeAllocSize(ZCPV->getType());
+
+ OutStreamer.EmitValue(Expr, Size);
+}
+
+bool SystemZAsmPrinter::PrintAsmOperand(const MachineInstr *MI,
+ unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode,
+ raw_ostream &OS) {
+ if (ExtraCode && *ExtraCode == 'n') {
+ if (!MI->getOperand(OpNo).isImm())
+ return true;
+ OS << -int64_t(MI->getOperand(OpNo).getImm());
+ } else {
+ SystemZMCInstLower Lower(Mang, MF->getContext(), *this);
+ MCOperand MO(Lower.lowerOperand(MI->getOperand(OpNo)));
+ SystemZInstPrinter::printOperand(MO, OS);
+ }
+ return false;
+}
+
+bool SystemZAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode,
+ raw_ostream &OS) {
+ SystemZInstPrinter::printAddress(MI->getOperand(OpNo).getReg(),
+ MI->getOperand(OpNo + 1).getImm(),
+ MI->getOperand(OpNo + 2).getReg(), OS);
+ return false;
+}
+
+void SystemZAsmPrinter::EmitEndOfAsmFile(Module &M) {
+ if (Subtarget->isTargetELF()) {
+ const TargetLoweringObjectFileELF &TLOFELF =
+ static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering());
+
+ MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo<MachineModuleInfoELF>();
+
+ // Output stubs for external and common global variables.
+ MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
+ if (!Stubs.empty()) {
+ OutStreamer.SwitchSection(TLOFELF.getDataRelSection());
+ const DataLayout *TD = TM.getDataLayout();
+
+ for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+ OutStreamer.EmitLabel(Stubs[i].first);
+ OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(),
+ TD->getPointerSize(0), 0);
+ }
+ Stubs.clear();
+ }
+ }
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeSystemZAsmPrinter() {
+ RegisterAsmPrinter<SystemZAsmPrinter> X(TheSystemZTarget);
+}
diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.h b/lib/Target/SystemZ/SystemZAsmPrinter.h
new file mode 100644
index 0000000..4b6c51b
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZAsmPrinter.h
@@ -0,0 +1,52 @@
+//===-- SystemZAsmPrinter.h - SystemZ LLVM assembly printer ----*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SYSTEMZASMPRINTER_H
+#define SYSTEMZASMPRINTER_H
+
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+class MCStreamer;
+class MachineBasicBlock;
+class MachineInstr;
+class Module;
+class raw_ostream;
+
+class LLVM_LIBRARY_VISIBILITY SystemZAsmPrinter : public AsmPrinter {
+private:
+ const SystemZSubtarget *Subtarget;
+
+public:
+ SystemZAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+ : AsmPrinter(TM, Streamer) {
+ Subtarget = &TM.getSubtarget<SystemZSubtarget>();
+ }
+
+ // Override AsmPrinter.
+ virtual const char *getPassName() const LLVM_OVERRIDE {
+ return "SystemZ Assembly Printer";
+ }
+ virtual void EmitInstruction(const MachineInstr *MI) LLVM_OVERRIDE;
+ virtual void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV)
+ LLVM_OVERRIDE;
+ virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &OS) LLVM_OVERRIDE;
+ virtual bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode,
+ raw_ostream &OS) LLVM_OVERRIDE;
+ virtual void EmitEndOfAsmFile(Module &M) LLVM_OVERRIDE;
+};
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/SystemZ/SystemZCallingConv.cpp b/lib/Target/SystemZ/SystemZCallingConv.cpp
new file mode 100644
index 0000000..cc9c84b
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZCallingConv.cpp
@@ -0,0 +1,21 @@
+//===-- SystemZCallingConv.cpp - Calling conventions for SystemZ ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZCallingConv.h"
+#include "SystemZRegisterInfo.h"
+
+using namespace llvm;
+
+const unsigned SystemZ::ArgGPRs[SystemZ::NumArgGPRs] = {
+ SystemZ::R2D, SystemZ::R3D, SystemZ::R4D, SystemZ::R5D, SystemZ::R6D
+};
+
+const unsigned SystemZ::ArgFPRs[SystemZ::NumArgFPRs] = {
+ SystemZ::F0D, SystemZ::F2D, SystemZ::F4D, SystemZ::F6D
+};
diff --git a/lib/Target/SystemZ/SystemZCallingConv.h b/lib/Target/SystemZ/SystemZCallingConv.h
new file mode 100644
index 0000000..298985e
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZCallingConv.h
@@ -0,0 +1,23 @@
+//===-- SystemZCallingConv.h - Calling conventions for SystemZ --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SYSTEMZCALLINGCONV_H
+#define SYSTEMZCALLINGCONV_H
+
+namespace llvm {
+ namespace SystemZ {
+ const unsigned NumArgGPRs = 5;
+ extern const unsigned ArgGPRs[NumArgGPRs];
+
+ const unsigned NumArgFPRs = 4;
+ extern const unsigned ArgFPRs[NumArgFPRs];
+ }
+}
+
+#endif
diff --git a/lib/Target/SystemZ/SystemZCallingConv.td b/lib/Target/SystemZ/SystemZCallingConv.td
new file mode 100644
index 0000000..c2d727f
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZCallingConv.td
@@ -0,0 +1,65 @@
+//=- SystemZCallingConv.td - Calling conventions for SystemZ -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for the SystemZ ABI.
+//===----------------------------------------------------------------------===//
+
+class CCIfExtend<CCAction A>
+ : CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>;
+
+//===----------------------------------------------------------------------===//
+// SVR4 return value calling convention
+//===----------------------------------------------------------------------===//
+def RetCC_SystemZ : CallingConv<[
+ // Promote i32 to i64 if it has an explicit extension type.
+ CCIfType<[i32], CCIfExtend<CCPromoteToType<i64>>>,
+
+ // ABI-compliant code returns 64-bit integers in R2. Make the other
+ // call-clobbered argument registers available for code that doesn't
+ // care about the ABI. (R6 is an argument register too, but is
+ // call-saved and therefore not suitable for return values.)
+ CCIfType<[i32], CCAssignToReg<[R2W, R3W, R4W, R5W]>>,
+ CCIfType<[i64], CCAssignToReg<[R2D, R3D, R4D, R5D]>>,
+
+ // ABI-compliant code returns float and double in F0. Make the
+ // other floating-point argument registers available for code that
+ // doesn't care about the ABI. All floating-point argument registers
+ // are call-clobbered, so we can use all of them here.
+ CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
+ CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>
+
+ // ABI-compliant code returns long double by reference, but that conversion
+ // is left to higher-level code. Perhaps we could add an f128 definition
+ // here for code that doesn't care about the ABI?
+]>;
+
+//===----------------------------------------------------------------------===//
+// SVR4 argument calling conventions
+//===----------------------------------------------------------------------===//
+def CC_SystemZ : CallingConv<[
+ // Promote i32 to i64 if it has an explicit extension type.
+ // The convention is that true integer arguments that are smaller
+ // than 64 bits should be marked as extended, but structures that
+ // are smaller than 64 bits shouldn't.
+ CCIfType<[i32], CCIfExtend<CCPromoteToType<i64>>>,
+
+ // Force long double values to the stack and pass i64 pointers to them.
+ CCIfType<[f128], CCPassIndirect<i64>>,
+
+ // The first 5 integer arguments are passed in R2-R6. Note that R6
+ // is call-saved.
+ CCIfType<[i32], CCAssignToReg<[R2W, R3W, R4W, R5W, R6W]>>,
+ CCIfType<[i64], CCAssignToReg<[R2D, R3D, R4D, R5D, R6D]>>,
+
+ // The first 4 float and double arguments are passed in even registers F0-F6.
+ CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
+ CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
+
+ // Other arguments are passed in 8-byte-aligned 8-byte stack slots.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>
+]>;
diff --git a/lib/Target/SystemZ/SystemZConstantPoolValue.cpp b/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
new file mode 100644
index 0000000..e9c4f6d
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
@@ -0,0 +1,62 @@
+//===-- SystemZConstantPoolValue.cpp - SystemZ constant-pool value --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZConstantPoolValue.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+SystemZConstantPoolValue::
+SystemZConstantPoolValue(const GlobalValue *gv,
+ SystemZCP::SystemZCPModifier modifier)
+ : MachineConstantPoolValue(gv->getType()), GV(gv), Modifier(modifier) {}
+
+SystemZConstantPoolValue *
+SystemZConstantPoolValue::Create(const GlobalValue *GV,
+ SystemZCP::SystemZCPModifier Modifier) {
+ return new SystemZConstantPoolValue(GV, Modifier);
+}
+
+unsigned SystemZConstantPoolValue::getRelocationInfo() const {
+ switch (Modifier) {
+ case SystemZCP::NTPOFF:
+ // May require a relocation, but the relocations are always resolved
+ // by the static linker.
+ return 1;
+ }
+ llvm_unreachable("Unknown modifier");
+}
+
+int SystemZConstantPoolValue::
+getExistingMachineCPValue(MachineConstantPool *CP, unsigned Alignment) {
+ unsigned AlignMask = Alignment - 1;
+ const std::vector<MachineConstantPoolEntry> Constants = CP->getConstants();
+ for (unsigned I = 0, E = Constants.size(); I != E; ++I) {
+ if (Constants[I].isMachineConstantPoolEntry() &&
+ (Constants[I].getAlignment() & AlignMask) == 0) {
+ SystemZConstantPoolValue *ZCPV =
+ static_cast<SystemZConstantPoolValue *>(Constants[I].Val.MachineCPVal);
+ if (ZCPV->GV == GV && ZCPV->Modifier == Modifier)
+ return I;
+ }
+ }
+ return -1;
+}
+
+void SystemZConstantPoolValue::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+ ID.AddPointer(GV);
+ ID.AddInteger(Modifier);
+}
+
+void SystemZConstantPoolValue::print(raw_ostream &O) const {
+ O << GV << "@" << int(Modifier);
+}
diff --git a/lib/Target/SystemZ/SystemZConstantPoolValue.h b/lib/Target/SystemZ/SystemZConstantPoolValue.h
new file mode 100644
index 0000000..9927bdb
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZConstantPoolValue.h
@@ -0,0 +1,55 @@
+//===- SystemZConstantPoolValue.h - SystemZ constant-pool value -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SYSTEMZCONSTANTPOOLVALUE_H
+#define SYSTEMZCONSTANTPOOLVALUE_H
+
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+class GlobalValue;
+
+namespace SystemZCP {
+ enum SystemZCPModifier {
+ NTPOFF
+ };
+}
+
+/// A SystemZ-specific constant pool value. At present, the only
+/// defined constant pool values are offsets of thread-local variables
+/// (written x@NTPOFF).
+class SystemZConstantPoolValue : public MachineConstantPoolValue {
+ const GlobalValue *GV;
+ SystemZCP::SystemZCPModifier Modifier;
+
+protected:
+ SystemZConstantPoolValue(const GlobalValue *GV,
+ SystemZCP::SystemZCPModifier Modifier);
+
+public:
+ static SystemZConstantPoolValue *
+ Create(const GlobalValue *GV, SystemZCP::SystemZCPModifier Modifier);
+
+ // Override MachineConstantPoolValue.
+ virtual unsigned getRelocationInfo() const LLVM_OVERRIDE;
+ virtual int getExistingMachineCPValue(MachineConstantPool *CP,
+ unsigned Alignment) LLVM_OVERRIDE;
+ virtual void addSelectionDAGCSEId(FoldingSetNodeID &ID) LLVM_OVERRIDE;
+ virtual void print(raw_ostream &O) const LLVM_OVERRIDE;
+
+ // Access SystemZ-specific fields.
+ const GlobalValue *getGlobalValue() const { return GV; }
+ SystemZCP::SystemZCPModifier getModifier() const { return Modifier; }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/SystemZ/SystemZFrameLowering.cpp b/lib/Target/SystemZ/SystemZFrameLowering.cpp
new file mode 100644
index 0000000..c0d72c3
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -0,0 +1,531 @@
+//===-- SystemZFrameLowering.cpp - Frame lowering for SystemZ -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZFrameLowering.h"
+#include "SystemZCallingConv.h"
+#include "SystemZInstrBuilder.h"
+#include "SystemZMachineFunctionInfo.h"
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+
+using namespace llvm;
+
+SystemZFrameLowering::SystemZFrameLowering(const SystemZTargetMachine &tm,
+ const SystemZSubtarget &sti)
+ : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8,
+ -SystemZMC::CallFrameSize),
+ TM(tm),
+ STI(sti) {
+ // The ABI-defined register save slots, relative to the incoming stack
+ // pointer.
+ static const unsigned SpillOffsetTable[][2] = {
+ { SystemZ::R2D, 0x10 },
+ { SystemZ::R3D, 0x18 },
+ { SystemZ::R4D, 0x20 },
+ { SystemZ::R5D, 0x28 },
+ { SystemZ::R6D, 0x30 },
+ { SystemZ::R7D, 0x38 },
+ { SystemZ::R8D, 0x40 },
+ { SystemZ::R9D, 0x48 },
+ { SystemZ::R10D, 0x50 },
+ { SystemZ::R11D, 0x58 },
+ { SystemZ::R12D, 0x60 },
+ { SystemZ::R13D, 0x68 },
+ { SystemZ::R14D, 0x70 },
+ { SystemZ::R15D, 0x78 },
+ { SystemZ::F0D, 0x80 },
+ { SystemZ::F2D, 0x88 },
+ { SystemZ::F4D, 0x90 },
+ { SystemZ::F6D, 0x98 }
+ };
+
+ // Create a mapping from register number to save slot offset.
+ RegSpillOffsets.grow(SystemZ::NUM_TARGET_REGS);
+ for (unsigned I = 0, E = array_lengthof(SpillOffsetTable); I != E; ++I)
+ RegSpillOffsets[SpillOffsetTable[I][0]] = SpillOffsetTable[I][1];
+}
+
+void SystemZFrameLowering::
+processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS) const {
+ MachineFrameInfo *MFFrame = MF.getFrameInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+ bool HasFP = hasFP(MF);
+ SystemZMachineFunctionInfo *MFI = MF.getInfo<SystemZMachineFunctionInfo>();
+ bool IsVarArg = MF.getFunction()->isVarArg();
+
+ // va_start stores incoming FPR varargs in the normal way, but delegates
+ // the saving of incoming GPR varargs to spillCalleeSavedRegisters().
+ // Record these pending uses, which typically include the call-saved
+ // argument register R6D.
+ if (IsVarArg)
+ for (unsigned I = MFI->getVarArgsFirstGPR(); I < SystemZ::NumArgGPRs; ++I)
+ MRI.setPhysRegUsed(SystemZ::ArgGPRs[I]);
+
+ // If the function requires a frame pointer, record that the hard
+ // frame pointer will be clobbered.
+ if (HasFP)
+ MRI.setPhysRegUsed(SystemZ::R11D);
+
+ // If the function calls other functions, record that the return
+ // address register will be clobbered.
+ if (MFFrame->hasCalls())
+ MRI.setPhysRegUsed(SystemZ::R14D);
+
+ // If we are saving GPRs other than the stack pointer, we might as well
+ // save and restore the stack pointer at the same time, via STMG and LMG.
+ // This allows the deallocation to be done by the LMG, rather than needing
+ // a separate %r15 addition.
+ const uint16_t *CSRegs = TRI->getCalleeSavedRegs(&MF);
+ for (unsigned I = 0; CSRegs[I]; ++I) {
+ unsigned Reg = CSRegs[I];
+ if (SystemZ::GR64BitRegClass.contains(Reg) && MRI.isPhysRegUsed(Reg)) {
+ MRI.setPhysRegUsed(SystemZ::R15D);
+ break;
+ }
+ }
+}
+
+// Add GPR64 to the save instruction being built by MIB, which is in basic
+// block MBB. IsImplicit says whether this is an explicit operand to the
+// instruction, or an implicit one that comes between the explicit start
+// and end registers.
+static void addSavedGPR(MachineBasicBlock &MBB, MachineInstrBuilder &MIB,
+ const SystemZTargetMachine &TM,
+ unsigned GPR64, bool IsImplicit) {
+ const SystemZRegisterInfo *RI = TM.getRegisterInfo();
+ unsigned GPR32 = RI->getSubReg(GPR64, SystemZ::subreg_32bit);
+ bool IsLive = MBB.isLiveIn(GPR64) || MBB.isLiveIn(GPR32);
+ if (!IsLive || !IsImplicit) {
+ MIB.addReg(GPR64, getImplRegState(IsImplicit) | getKillRegState(!IsLive));
+ if (!IsLive)
+ MBB.addLiveIn(GPR64);
+ }
+}
+
+bool SystemZFrameLowering::
+spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+ if (CSI.empty())
+ return false;
+
+ MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo *TII = MF.getTarget().getInstrInfo();
+ SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
+ bool IsVarArg = MF.getFunction()->isVarArg();
+ DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+ // Scan the call-saved GPRs and find the bounds of the register spill area.
+ unsigned SavedGPRFrameSize = 0;
+ unsigned LowGPR = 0;
+ unsigned HighGPR = SystemZ::R15D;
+ unsigned StartOffset = -1U;
+ for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+ unsigned Reg = CSI[I].getReg();
+ if (SystemZ::GR64BitRegClass.contains(Reg)) {
+ SavedGPRFrameSize += 8;
+ unsigned Offset = RegSpillOffsets[Reg];
+ assert(Offset && "Unexpected GPR save");
+ if (StartOffset > Offset) {
+ LowGPR = Reg;
+ StartOffset = Offset;
+ }
+ }
+ }
+
+ // Save information about the range and location of the call-saved
+ // registers, for use by the epilogue inserter.
+ ZFI->setSavedGPRFrameSize(SavedGPRFrameSize);
+ ZFI->setLowSavedGPR(LowGPR);
+ ZFI->setHighSavedGPR(HighGPR);
+
+ // Include the GPR varargs, if any. R6D is call-saved, so would
+ // be included by the loop above, but we also need to handle the
+ // call-clobbered argument registers.
+ if (IsVarArg) {
+ unsigned FirstGPR = ZFI->getVarArgsFirstGPR();
+ if (FirstGPR < SystemZ::NumArgGPRs) {
+ unsigned Reg = SystemZ::ArgGPRs[FirstGPR];
+ unsigned Offset = RegSpillOffsets[Reg];
+ if (StartOffset > Offset) {
+ LowGPR = Reg; StartOffset = Offset;
+ }
+ }
+ }
+
+ // Save GPRs
+ if (LowGPR) {
+ assert(LowGPR != HighGPR && "Should be saving %r15 and something else");
+
+ // Build an STMG instruction.
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(SystemZ::STMG));
+
+ // Add the explicit register operands.
+ addSavedGPR(MBB, MIB, TM, LowGPR, false);
+ addSavedGPR(MBB, MIB, TM, HighGPR, false);
+
+ // Add the address.
+ MIB.addReg(SystemZ::R15D).addImm(StartOffset);
+
+ // Make sure all call-saved GPRs are included as operands and are
+ // marked as live on entry.
+ for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+ unsigned Reg = CSI[I].getReg();
+ if (SystemZ::GR64BitRegClass.contains(Reg))
+ addSavedGPR(MBB, MIB, TM, Reg, true);
+ }
+
+ // ...likewise GPR varargs.
+ if (IsVarArg)
+ for (unsigned I = ZFI->getVarArgsFirstGPR(); I < SystemZ::NumArgGPRs; ++I)
+ addSavedGPR(MBB, MIB, TM, SystemZ::ArgGPRs[I], true);
+ }
+
+ // Save FPRs in the normal TargetInstrInfo way.
+ for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+ unsigned Reg = CSI[I].getReg();
+ if (SystemZ::FP64BitRegClass.contains(Reg)) {
+ MBB.addLiveIn(Reg);
+ TII->storeRegToStackSlot(MBB, MBBI, Reg, true, CSI[I].getFrameIdx(),
+ &SystemZ::FP64BitRegClass, TRI);
+ }
+ }
+
+ return true;
+}
+
+bool SystemZFrameLowering::
+restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+ if (CSI.empty())
+ return false;
+
+ MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo *TII = MF.getTarget().getInstrInfo();
+ SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
+ bool HasFP = hasFP(MF);
+ DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+ // Restore FPRs in the normal TargetInstrInfo way.
+ for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+ unsigned Reg = CSI[I].getReg();
+ if (SystemZ::FP64BitRegClass.contains(Reg))
+ TII->loadRegFromStackSlot(MBB, MBBI, Reg, CSI[I].getFrameIdx(),
+ &SystemZ::FP64BitRegClass, TRI);
+ }
+
+ // Restore call-saved GPRs (but not call-clobbered varargs, which at
+ // this point might hold return values).
+ unsigned LowGPR = ZFI->getLowSavedGPR();
+ unsigned HighGPR = ZFI->getHighSavedGPR();
+ unsigned StartOffset = RegSpillOffsets[LowGPR];
+ if (LowGPR) {
+ // If we saved any of %r2-%r5 as varargs, we should also be saving
+ // and restoring %r6. If we're saving %r6 or above, we should be
+ // restoring it too.
+ assert(LowGPR != HighGPR && "Should be loading %r15 and something else");
+
+ // Build an LMG instruction.
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(SystemZ::LMG));
+
+ // Add the explicit register operands.
+ MIB.addReg(LowGPR, RegState::Define);
+ MIB.addReg(HighGPR, RegState::Define);
+
+ // Add the address.
+ MIB.addReg(HasFP ? SystemZ::R11D : SystemZ::R15D);
+ MIB.addImm(StartOffset);
+
+ // Do a second scan, adding the remaining saved registers as implicit
+ // definitions of the LMG.
+ for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+ unsigned Reg = CSI[I].getReg();
+ if (Reg != LowGPR && Reg != HighGPR)
+ MIB.addReg(Reg, RegState::ImplicitDefine);
+ }
+ }
+
+ return true;
+}
+
+// Emit instructions before MBBI (in MBB) to add NumBytes to Reg.
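+// For example, adding -160 uses a single AGHI, while -40000 does not fit
+// in a signed 16-bit immediate and falls through to AGFI instead.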
+static void emitIncrement(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ const DebugLoc &DL,
+ unsigned Reg, int64_t NumBytes,
+ const TargetInstrInfo *TII) {
+ while (NumBytes) {
+ unsigned Opcode;
+ int64_t ThisVal = NumBytes;
+ if (isInt<16>(NumBytes))
+ Opcode = SystemZ::AGHI;
+ else {
+ Opcode = SystemZ::AGFI;
+ // Make sure we maintain 8-byte stack alignment.
+ int64_t MinVal = -int64_t(1) << 31;
+ int64_t MaxVal = (int64_t(1) << 31) - 8;
+ if (ThisVal < MinVal)
+ ThisVal = MinVal;
+ else if (ThisVal > MaxVal)
+ ThisVal = MaxVal;
+ }
+ MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII->get(Opcode), Reg)
+ .addReg(Reg).addImm(ThisVal);
+ // The CC implicit def is dead.
+ MI->getOperand(3).setIsDead();
+ NumBytes -= ThisVal;
+ }
+}
+
+void SystemZFrameLowering::emitPrologue(MachineFunction &MF) const {
+ MachineBasicBlock &MBB = MF.front();
+ MachineFrameInfo *MFFrame = MF.getFrameInfo();
+ const SystemZInstrInfo *ZII =
+ static_cast<const SystemZInstrInfo*>(MF.getTarget().getInstrInfo());
+ SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ MachineModuleInfo &MMI = MF.getMMI();
+ const MCRegisterInfo &MRI = MMI.getContext().getRegisterInfo();
+ const std::vector<CalleeSavedInfo> &CSI = MFFrame->getCalleeSavedInfo();
+ bool HasFP = hasFP(MF);
+ DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+ // The current offset of the stack pointer from the CFA.
+ int64_t SPOffsetFromCFA = -SystemZMC::CFAOffsetFromInitialSP;
+
+ if (ZFI->getLowSavedGPR()) {
+ // Skip over the GPR saves.
+ if (MBBI != MBB.end() && MBBI->getOpcode() == SystemZ::STMG)
+ ++MBBI;
+ else
+ llvm_unreachable("Couldn't skip over GPR saves");
+
+ // Add CFI for the GPR saves.
+ MCSymbol *GPRSaveLabel = MMI.getContext().CreateTempSymbol();
+ BuildMI(MBB, MBBI, DL,
+ ZII->get(TargetOpcode::PROLOG_LABEL)).addSym(GPRSaveLabel);
+ for (std::vector<CalleeSavedInfo>::const_iterator
+ I = CSI.begin(), E = CSI.end(); I != E; ++I) {
+ unsigned Reg = I->getReg();
+ if (SystemZ::GR64BitRegClass.contains(Reg)) {
+ int64_t Offset = SPOffsetFromCFA + RegSpillOffsets[Reg];
+ MMI.addFrameInst(MCCFIInstruction::createOffset(
+ GPRSaveLabel, MRI.getDwarfRegNum(Reg, true), Offset));
+ }
+ }
+ }
+
+ uint64_t StackSize = getAllocatedStackSize(MF);
+ if (StackSize) {
+ // Allocate StackSize bytes.
+ int64_t Delta = -int64_t(StackSize);
+ emitIncrement(MBB, MBBI, DL, SystemZ::R15D, Delta, ZII);
+
+ // Add CFI for the allocation.
+ MCSymbol *AdjustSPLabel = MMI.getContext().CreateTempSymbol();
+ BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::PROLOG_LABEL))
+ .addSym(AdjustSPLabel);
+ MMI.addFrameInst(MCCFIInstruction::createDefCfaOffset(
+ AdjustSPLabel, SPOffsetFromCFA + Delta));
+ SPOffsetFromCFA += Delta;
+ }
+
+ if (HasFP) {
+ // Copy the base of the frame to R11.
+ BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::LGR), SystemZ::R11D)
+ .addReg(SystemZ::R15D);
+
+ // Add CFI for the new frame location.
+ MCSymbol *SetFPLabel = MMI.getContext().CreateTempSymbol();
+ BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::PROLOG_LABEL))
+ .addSym(SetFPLabel);
+ unsigned HardFP = MRI.getDwarfRegNum(SystemZ::R11D, true);
+ MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaRegister(SetFPLabel, HardFP));
+
+ // Mark the FramePtr as live at the beginning of every block except
+ // the entry block. (We'll have marked R11 as live on entry when
+ // saving the GPRs.)
+ for (MachineFunction::iterator
+ I = llvm::next(MF.begin()), E = MF.end(); I != E; ++I)
+ I->addLiveIn(SystemZ::R11D);
+ }
+
+ // Skip over the FPR saves.
+ MCSymbol *FPRSaveLabel = 0;
+ for (std::vector<CalleeSavedInfo>::const_iterator
+ I = CSI.begin(), E = CSI.end(); I != E; ++I) {
+ unsigned Reg = I->getReg();
+ if (SystemZ::FP64BitRegClass.contains(Reg)) {
+ if (MBBI != MBB.end() &&
+ (MBBI->getOpcode() == SystemZ::STD ||
+ MBBI->getOpcode() == SystemZ::STDY))
+ ++MBBI;
+ else
+ llvm_unreachable("Couldn't skip over FPR save");
+
+ // Add CFI for this save.
+ if (!FPRSaveLabel)
+ FPRSaveLabel = MMI.getContext().CreateTempSymbol();
+ unsigned Reg = MRI.getDwarfRegNum(I->getReg(), true);
+ int64_t Offset = getFrameIndexOffset(MF, I->getFrameIdx());
+ MMI.addFrameInst(MCCFIInstruction::createOffset(
+ FPRSaveLabel, Reg, SPOffsetFromCFA + Offset));
+ }
+ }
+ // Complete the CFI for the FPR saves, modelling them as taking effect
+ // after the last save.
+ if (FPRSaveLabel)
+ BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::PROLOG_LABEL))
+ .addSym(FPRSaveLabel);
+}
+
+void SystemZFrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+ const SystemZInstrInfo *ZII =
+ static_cast<const SystemZInstrInfo*>(MF.getTarget().getInstrInfo());
+ SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
+
+ // Skip the return instruction.
+ assert(MBBI->getOpcode() == SystemZ::RET &&
+ "Can only insert epilogue into returning blocks");
+
+ uint64_t StackSize = getAllocatedStackSize(MF);
+ if (ZFI->getLowSavedGPR()) {
+ --MBBI;
+ unsigned Opcode = MBBI->getOpcode();
+ if (Opcode != SystemZ::LMG)
+ llvm_unreachable("Expected to see callee-save register restore code");
+
+ unsigned AddrOpNo = 2;
+ DebugLoc DL = MBBI->getDebugLoc();
+ uint64_t Offset = StackSize + MBBI->getOperand(AddrOpNo + 1).getImm();
+ unsigned NewOpcode = ZII->getOpcodeForOffset(Opcode, Offset);
+
+ // If the offset is too large, use the largest stack-aligned offset
+ // and add the rest to the base register (the stack or frame pointer).
+ if (!NewOpcode) {
+ uint64_t NumBytes = Offset - 0x7fff8;
+ emitIncrement(MBB, MBBI, DL, MBBI->getOperand(AddrOpNo).getReg(),
+ NumBytes, ZII);
+ Offset -= NumBytes;
+ NewOpcode = ZII->getOpcodeForOffset(Opcode, Offset);
+ assert(NewOpcode && "No restore instruction available");
+ }
+
+ MBBI->setDesc(ZII->get(NewOpcode));
+ MBBI->getOperand(AddrOpNo + 1).ChangeToImmediate(Offset);
+ } else if (StackSize) {
+ DebugLoc DL = MBBI->getDebugLoc();
+ emitIncrement(MBB, MBBI, DL, SystemZ::R15D, StackSize, ZII);
+ }
+}
+
+bool SystemZFrameLowering::hasFP(const MachineFunction &MF) const {
+ return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
+ MF.getFrameInfo()->hasVarSizedObjects() ||
+ MF.getInfo<SystemZMachineFunctionInfo>()->getManipulatesSP());
+}
+
+int SystemZFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
+ int FI) const {
+ const MachineFrameInfo *MFFrame = MF.getFrameInfo();
+
+ // Start with the offset of FI from the top of the caller-allocated frame
+ // (i.e. the top of the 160 bytes allocated by the caller). This initial
+ // offset is therefore negative.
+ int64_t Offset = (MFFrame->getObjectOffset(FI) +
+ MFFrame->getOffsetAdjustment());
+ if (FI >= 0)
+ // Non-fixed objects are allocated below the incoming stack pointer.
+ // Account for the space at the top of the frame that we choose not
+ // to allocate.
+ Offset += getUnallocatedTopBytes(MF);
+
+ // Make the offset relative to the incoming stack pointer.
+ Offset -= getOffsetOfLocalArea();
+
+ // Make the offset relative to the bottom of the frame.
+ Offset += getAllocatedStackSize(MF);
+
+ return Offset;
+}
+
+uint64_t SystemZFrameLowering::
+getUnallocatedTopBytes(const MachineFunction &MF) const {
+ return MF.getInfo<SystemZMachineFunctionInfo>()->getSavedGPRFrameSize();
+}
+
+uint64_t SystemZFrameLowering::
+getAllocatedStackSize(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFFrame = MF.getFrameInfo();
+
+ // Start with the size of the local variables and spill slots.
+ uint64_t StackSize = MFFrame->getStackSize();
+
+ // Remove any bytes that we choose not to allocate.
+ StackSize -= getUnallocatedTopBytes(MF);
+
+ // Include space for an emergency spill slot, if one might be needed.
+ StackSize += getEmergencySpillSlotSize(MF);
+
+ // We need to allocate the ABI-defined 160-byte base area whenever
+ // we allocate stack space for our own use and whenever we call another
+ // function.
+ if (StackSize || MFFrame->hasVarSizedObjects() || MFFrame->hasCalls())
+ StackSize += SystemZMC::CallFrameSize;
+
+ return StackSize;
+}
+
+unsigned SystemZFrameLowering::
+getEmergencySpillSlotSize(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFFrame = MF.getFrameInfo();
+ uint64_t MaxReach = MFFrame->getStackSize() + SystemZMC::CallFrameSize * 2;
+ return isUInt<12>(MaxReach) ? 0 : 8;
+}
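+
+// For example, a function with an 8000-byte frame has a MaxReach of
+// 8000 + 2 * 160 = 8320, which exceeds the 4095-byte unsigned displacement
+// limit, so 8 bytes are reserved for the scavenger; a 3000-byte frame
+// (MaxReach 3320) needs none.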
+
+unsigned SystemZFrameLowering::
+getEmergencySpillSlotOffset(const MachineFunction &MF) const {
+ assert(getEmergencySpillSlotSize(MF) && "No emergency spill slot");
+ return SystemZMC::CallFrameSize;
+}
+
+bool
+SystemZFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+ // The ABI requires us to allocate 160 bytes of stack space for the callee,
+ // with any outgoing stack arguments being placed above that. It seems
+ // better to make that area a permanent feature of the frame even if
+ // we're using a frame pointer.
+ return true;
+}
+
+void SystemZFrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const {
+ switch (MI->getOpcode()) {
+ case SystemZ::ADJCALLSTACKDOWN:
+ case SystemZ::ADJCALLSTACKUP:
+ assert(hasReservedCallFrame(MF) &&
+ "ADJSTACKDOWN and ADJSTACKUP should be no-ops");
+ MBB.erase(MI);
+ break;
+
+ default:
+ llvm_unreachable("Unexpected call frame instruction");
+ }
+}
diff --git a/lib/Target/SystemZ/SystemZFrameLowering.h b/lib/Target/SystemZ/SystemZFrameLowering.h
new file mode 100644
index 0000000..5ca049c
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -0,0 +1,93 @@
+//===-- SystemZFrameLowering.h - Frame lowering for SystemZ -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SYSTEMZFRAMELOWERING_H
+#define SYSTEMZFRAMELOWERING_H
+
+#include "SystemZSubtarget.h"
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+class SystemZTargetMachine;
+class SystemZSubtarget;
+
+class SystemZFrameLowering : public TargetFrameLowering {
+ IndexedMap<unsigned> RegSpillOffsets;
+
+protected:
+ const SystemZTargetMachine &TM;
+ const SystemZSubtarget &STI;
+
+public:
+ SystemZFrameLowering(const SystemZTargetMachine &tm,
+ const SystemZSubtarget &sti);
+
+ // Override FrameLowering.
+ virtual void
+ processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS) const LLVM_OVERRIDE;
+ virtual bool
+ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const
+ LLVM_OVERRIDE;
+ virtual bool
+ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const
+ LLVM_OVERRIDE;
+ virtual void emitPrologue(MachineFunction &MF) const LLVM_OVERRIDE;
+ virtual void emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const LLVM_OVERRIDE;
+ virtual bool hasFP(const MachineFunction &MF) const LLVM_OVERRIDE;
+ virtual int getFrameIndexOffset(const MachineFunction &MF,
+ int FI) const LLVM_OVERRIDE;
+ virtual bool hasReservedCallFrame(const MachineFunction &MF) const
+ LLVM_OVERRIDE;
+ virtual void
+ eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const
+ LLVM_OVERRIDE;
+
+ // The target-independent code automatically allocates save slots for
+ // call-saved GPRs. However, we don't need those slots for SystemZ,
+ // because the ABI sets aside GPR save slots in the caller-allocated part
+ // of the frame. Since the target-independent code puts this unneeded
+// area at the top of the callee-allocated part of the frame, we choose not
+ // to allocate it and adjust the offsets accordingly. Return the
+ // size of this unallocated area.
+ // FIXME: seems a bit hackish.
+ uint64_t getUnallocatedTopBytes(const MachineFunction &MF) const;
+
+ // Return the number of bytes in the callee-allocated part of the frame.
+ uint64_t getAllocatedStackSize(const MachineFunction &MF) const;
+
+ // Return the number of frame bytes that should be reserved for
+ // an emergency spill slot, for use by the register scavenger.
+ // Return 0 if register scavenging won't be needed.
+ unsigned getEmergencySpillSlotSize(const MachineFunction &MF) const;
+
+ // Return the offset from the frame pointer of the emergency spill slot,
+ // which always fits within a 12-bit unsigned displacement field.
+ // Only valid if getEmergencySpillSlotSize(MF) returns nonzero.
+ unsigned getEmergencySpillSlotOffset(const MachineFunction &MF) const;
+
+ // Return the byte offset from the incoming stack pointer of Reg's
+ // ABI-defined save slot. Return 0 if no slot is defined for Reg.
+ unsigned getRegSpillOffset(unsigned Reg) const {
+ return RegSpillOffsets[Reg];
+ }
+};
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
new file mode 100644
index 0000000..f10ba23
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -0,0 +1,616 @@
+//===-- SystemZISelDAGToDAG.cpp - A dag to dag inst selector for SystemZ --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the SystemZ target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+// Used to build addressing modes.
+struct SystemZAddressingMode {
+ // The shape of the address.
+ enum AddrForm {
+ // base+displacement
+ FormBD,
+
+ // base+displacement+index for load and store operands
+ FormBDXNormal,
+
+ // base+displacement+index for load address operands
+ FormBDXLA,
+
+ // base+displacement+index+ADJDYNALLOC
+ FormBDXDynAlloc
+ };
+ AddrForm Form;
+
+ // The type of displacement. The enum names here correspond directly
+ // to the definitions in SystemZOperand.td. We could split them into
+ // flags -- single/pair, 128-bit, etc. -- but it hardly seems worth it.
+ enum DispRange {
+ Disp12Only,
+ Disp12Pair,
+ Disp20Only,
+ Disp20Only128,
+ Disp20Pair
+ };
+ DispRange DR;
+
+ // The parts of the address. The address is equivalent to:
+ //
+ // Base + Disp + Index + (IncludesDynAlloc ? ADJDYNALLOC : 0)
+ SDValue Base;
+ int64_t Disp;
+ SDValue Index;
+ bool IncludesDynAlloc;
+
+ SystemZAddressingMode(AddrForm form, DispRange dr)
+ : Form(form), DR(dr), Base(), Disp(0), Index(),
+ IncludesDynAlloc(false) {}
+
+ // True if the address can have an index register.
+ bool hasIndexField() { return Form != FormBD; }
+
+ // True if the address can (and must) include ADJDYNALLOC.
+ bool isDynAlloc() { return Form == FormBDXDynAlloc; }
+
+ void dump() {
+ errs() << "SystemZAddressingMode " << this << '\n';
+
+ errs() << " Base ";
+ if (Base.getNode() != 0)
+ Base.getNode()->dump();
+ else
+ errs() << "null\n";
+
+ if (hasIndexField()) {
+ errs() << " Index ";
+ if (Index.getNode() != 0)
+ Index.getNode()->dump();
+ else
+ errs() << "null\n";
+ }
+
+ errs() << " Disp " << Disp;
+ if (IncludesDynAlloc)
+ errs() << " + ADJDYNALLOC";
+ errs() << '\n';
+ }
+};
+
+class SystemZDAGToDAGISel : public SelectionDAGISel {
+ const SystemZTargetLowering &Lowering;
+ const SystemZSubtarget &Subtarget;
+
+ // Used by SystemZOperands.td to create integer constants.
+ inline SDValue getImm(const SDNode *Node, uint64_t Imm) {
+ return CurDAG->getTargetConstant(Imm, Node->getValueType(0));
+ }
+
+ // Try to fold more of the base or index of AM into AM, where IsBase
+ // selects between the base and index.
+ bool expandAddress(SystemZAddressingMode &AM, bool IsBase);
+
+ // Try to describe N in AM, returning true on success.
+ bool selectAddress(SDValue N, SystemZAddressingMode &AM);
+
+ // Extract individual target operands from matched address AM.
+ void getAddressOperands(const SystemZAddressingMode &AM, EVT VT,
+ SDValue &Base, SDValue &Disp);
+ void getAddressOperands(const SystemZAddressingMode &AM, EVT VT,
+ SDValue &Base, SDValue &Disp, SDValue &Index);
+
+ // Try to match Addr as a FormBD address with displacement type DR.
+ // Return true on success, storing the base and displacement in
+ // Base and Disp respectively.
+ bool selectBDAddr(SystemZAddressingMode::DispRange DR, SDValue Addr,
+ SDValue &Base, SDValue &Disp);
+
+ // Try to match Addr as a FormBDX* address of form Form with
+ // displacement type DR. Return true on success, storing the base,
+ // displacement and index in Base, Disp and Index respectively.
+ bool selectBDXAddr(SystemZAddressingMode::AddrForm Form,
+ SystemZAddressingMode::DispRange DR, SDValue Addr,
+ SDValue &Base, SDValue &Disp, SDValue &Index);
+
+ // PC-relative address matching routines used by SystemZOperands.td.
+ bool selectPCRelAddress(SDValue Addr, SDValue &Target) {
+ if (Addr.getOpcode() == SystemZISD::PCREL_WRAPPER) {
+ Target = Addr.getOperand(0);
+ return true;
+ }
+ return false;
+ }
+
+ // BD matching routines used by SystemZOperands.td.
+ bool selectBDAddr12Only(SDValue Addr, SDValue &Base, SDValue &Disp) {
+ return selectBDAddr(SystemZAddressingMode::Disp12Only, Addr, Base, Disp);
+ }
+ bool selectBDAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp) {
+ return selectBDAddr(SystemZAddressingMode::Disp12Pair, Addr, Base, Disp);
+ }
+ bool selectBDAddr20Only(SDValue Addr, SDValue &Base, SDValue &Disp) {
+ return selectBDAddr(SystemZAddressingMode::Disp20Only, Addr, Base, Disp);
+ }
+ bool selectBDAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp) {
+ return selectBDAddr(SystemZAddressingMode::Disp20Pair, Addr, Base, Disp);
+ }
+
+ // BDX matching routines used by SystemZOperands.td.
+ bool selectBDXAddr12Only(SDValue Addr, SDValue &Base, SDValue &Disp,
+ SDValue &Index) {
+ return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
+ SystemZAddressingMode::Disp12Only,
+ Addr, Base, Disp, Index);
+ }
+ bool selectBDXAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp,
+ SDValue &Index) {
+ return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
+ SystemZAddressingMode::Disp12Pair,
+ Addr, Base, Disp, Index);
+ }
+ bool selectDynAlloc12Only(SDValue Addr, SDValue &Base, SDValue &Disp,
+ SDValue &Index) {
+ return selectBDXAddr(SystemZAddressingMode::FormBDXDynAlloc,
+ SystemZAddressingMode::Disp12Only,
+ Addr, Base, Disp, Index);
+ }
+ bool selectBDXAddr20Only(SDValue Addr, SDValue &Base, SDValue &Disp,
+ SDValue &Index) {
+ return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
+ SystemZAddressingMode::Disp20Only,
+ Addr, Base, Disp, Index);
+ }
+ bool selectBDXAddr20Only128(SDValue Addr, SDValue &Base, SDValue &Disp,
+ SDValue &Index) {
+ return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
+ SystemZAddressingMode::Disp20Only128,
+ Addr, Base, Disp, Index);
+ }
+ bool selectBDXAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp,
+ SDValue &Index) {
+ return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
+ SystemZAddressingMode::Disp20Pair,
+ Addr, Base, Disp, Index);
+ }
+ bool selectLAAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp,
+ SDValue &Index) {
+ return selectBDXAddr(SystemZAddressingMode::FormBDXLA,
+ SystemZAddressingMode::Disp12Pair,
+ Addr, Base, Disp, Index);
+ }
+ bool selectLAAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp,
+ SDValue &Index) {
+ return selectBDXAddr(SystemZAddressingMode::FormBDXLA,
+ SystemZAddressingMode::Disp20Pair,
+ Addr, Base, Disp, Index);
+ }
+
+ // If Op0 is null, then Node is a constant that can be loaded using:
+ //
+ // (Opcode UpperVal LowerVal)
+ //
+ // If Op0 is nonnull, then Node can be implemented using:
+ //
+ // (Opcode (Opcode Op0 UpperVal) LowerVal)
+ SDNode *splitLargeImmediate(unsigned Opcode, SDNode *Node, SDValue Op0,
+ uint64_t UpperVal, uint64_t LowerVal);
+
+public:
+ SystemZDAGToDAGISel(SystemZTargetMachine &TM, CodeGenOpt::Level OptLevel)
+ : SelectionDAGISel(TM, OptLevel),
+ Lowering(*TM.getTargetLowering()),
+ Subtarget(*TM.getSubtargetImpl()) { }
+
+ // Override MachineFunctionPass.
+ virtual const char *getPassName() const LLVM_OVERRIDE {
+ return "SystemZ DAG->DAG Pattern Instruction Selection";
+ }
+
+ // Override SelectionDAGISel.
+ virtual SDNode *Select(SDNode *Node) LLVM_OVERRIDE;
+ virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+ char ConstraintCode,
+ std::vector<SDValue> &OutOps)
+ LLVM_OVERRIDE;
+
+ // Include the pieces autogenerated from the target description.
+ #include "SystemZGenDAGISel.inc"
+};
+} // end anonymous namespace
+
+FunctionPass *llvm::createSystemZISelDag(SystemZTargetMachine &TM,
+ CodeGenOpt::Level OptLevel) {
+ return new SystemZDAGToDAGISel(TM, OptLevel);
+}
+
+// Return true if Val should be selected as a displacement for an address
+// with range DR. Here we're interested in the range of both the instruction
+// described by DR and of any pairing instruction.
+static bool selectDisp(SystemZAddressingMode::DispRange DR, int64_t Val) {
+ switch (DR) {
+ case SystemZAddressingMode::Disp12Only:
+ return isUInt<12>(Val);
+
+ case SystemZAddressingMode::Disp12Pair:
+ case SystemZAddressingMode::Disp20Only:
+ case SystemZAddressingMode::Disp20Pair:
+ return isInt<20>(Val);
+
+ case SystemZAddressingMode::Disp20Only128:
+ return isInt<20>(Val) && isInt<20>(Val + 8);
+ }
+ llvm_unreachable("Unhandled displacement range");
+}
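+
+// For example, with Disp20Only128 a displacement of 524280 is rejected even
+// though it fits in 20 bits, because the high word of the 128-bit access
+// would be at 524288, which does not.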
+
+// Change the base or index in AM to Value, where IsBase selects
+// between the base and index.
+static void changeComponent(SystemZAddressingMode &AM, bool IsBase,
+ SDValue Value) {
+ if (IsBase)
+ AM.Base = Value;
+ else
+ AM.Index = Value;
+}
+
+// The base or index of AM is equivalent to Value + ADJDYNALLOC,
+// where IsBase selects between the base and index. Try to fold the
+// ADJDYNALLOC into AM.
+static bool expandAdjDynAlloc(SystemZAddressingMode &AM, bool IsBase,
+ SDValue Value) {
+ if (AM.isDynAlloc() && !AM.IncludesDynAlloc) {
+ changeComponent(AM, IsBase, Value);
+ AM.IncludesDynAlloc = true;
+ return true;
+ }
+ return false;
+}
+
+// The base of AM is equivalent to Base + Index. Try to use Index as
+// the index register.
+static bool expandIndex(SystemZAddressingMode &AM, SDValue Base,
+ SDValue Index) {
+ if (AM.hasIndexField() && !AM.Index.getNode()) {
+ AM.Base = Base;
+ AM.Index = Index;
+ return true;
+ }
+ return false;
+}
+
+// The base or index of AM is equivalent to Op0 + Op1, where IsBase selects
+// between the base and index. Try to fold Op1 into AM's displacement.
+static bool expandDisp(SystemZAddressingMode &AM, bool IsBase,
+ SDValue Op0, ConstantSDNode *Op1) {
+ // First try adjusting the displacement.
+ int64_t TestDisp = AM.Disp + Op1->getSExtValue();
+ if (selectDisp(AM.DR, TestDisp)) {
+ changeComponent(AM, IsBase, Op0);
+ AM.Disp = TestDisp;
+ return true;
+ }
+
+ // We could consider forcing the displacement into a register and
+ // using it as an index, but it would need to be carefully tuned.
+ return false;
+}
+
+bool SystemZDAGToDAGISel::expandAddress(SystemZAddressingMode &AM,
+ bool IsBase) {
+ SDValue N = IsBase ? AM.Base : AM.Index;
+ unsigned Opcode = N.getOpcode();
+ if (Opcode == ISD::TRUNCATE) {
+ N = N.getOperand(0);
+ Opcode = N.getOpcode();
+ }
+ if (Opcode == ISD::ADD || CurDAG->isBaseWithConstantOffset(N)) {
+ SDValue Op0 = N.getOperand(0);
+ SDValue Op1 = N.getOperand(1);
+
+ unsigned Op0Code = Op0->getOpcode();
+ unsigned Op1Code = Op1->getOpcode();
+
+ if (Op0Code == SystemZISD::ADJDYNALLOC)
+ return expandAdjDynAlloc(AM, IsBase, Op1);
+ if (Op1Code == SystemZISD::ADJDYNALLOC)
+ return expandAdjDynAlloc(AM, IsBase, Op0);
+
+ if (Op0Code == ISD::Constant)
+ return expandDisp(AM, IsBase, Op1, cast<ConstantSDNode>(Op0));
+ if (Op1Code == ISD::Constant)
+ return expandDisp(AM, IsBase, Op0, cast<ConstantSDNode>(Op1));
+
+ if (IsBase && expandIndex(AM, Op0, Op1))
+ return true;
+ }
+ return false;
+}
+
+// Return true if an instruction with displacement range DR should be
+// used for displacement value Val. selectDisp(DR, Val) must already hold.
+static bool isValidDisp(SystemZAddressingMode::DispRange DR, int64_t Val) {
+ assert(selectDisp(DR, Val) && "Invalid displacement");
+ switch (DR) {
+ case SystemZAddressingMode::Disp12Only:
+ case SystemZAddressingMode::Disp20Only:
+ case SystemZAddressingMode::Disp20Only128:
+ return true;
+
+ case SystemZAddressingMode::Disp12Pair:
+ // Use the other instruction if the displacement is too large.
+ return isUInt<12>(Val);
+
+ case SystemZAddressingMode::Disp20Pair:
+ // Use the other instruction if the displacement is small enough.
+ return !isUInt<12>(Val);
+ }
+ llvm_unreachable("Unhandled displacement range");
+}
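+
+// For example, with Disp12Pair a displacement of 5000 passes selectDisp
+// (it fits the 20-bit instruction of the pair) but fails isValidDisp, so
+// the paired 20-bit form is chosen instead.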
+
+// Return true if Base + Disp + Index should be performed by LA(Y).
+static bool shouldUseLA(SDNode *Base, int64_t Disp, SDNode *Index) {
+ // Don't use LA(Y) for constants.
+ if (!Base)
+ return false;
+
+ // Always use LA(Y) for frame addresses, since we know that the destination
+ // register is almost always (perhaps always) going to be different from
+ // the frame register.
+ if (Base->getOpcode() == ISD::FrameIndex)
+ return true;
+
+ if (Disp) {
+ // Always use LA(Y) if there is a base, displacement and index.
+ if (Index)
+ return true;
+
+ // Always use LA if the displacement is small enough. It should always
+ // be no worse than AGHI (and better if it avoids a move).
+ if (isUInt<12>(Disp))
+ return true;
+
+ // For similar reasons, always use LAY if the constant is too big for AGHI.
+ // LAY should be no worse than AGFI.
+ if (!isInt<16>(Disp))
+ return true;
+ } else {
+ // Don't use LA for plain registers.
+ if (!Index)
+ return false;
+
+ // Don't use LA for plain addition if the index operand is only used
+ // once. It should be a natural two-operand addition in that case.
+ if (Index->hasOneUse())
+ return false;
+
+ // Prefer addition if the index operand is a sign extension, in the
+ // hope of using AGF.
+ unsigned IndexOpcode = Index->getOpcode();
+ if (IndexOpcode == ISD::SIGN_EXTEND ||
+ IndexOpcode == ISD::SIGN_EXTEND_INREG)
+ return false;
+ }
+
+ // Don't use LA for two-operand addition if either operand is only
+ // used once. The addition instructions are better in that case.
+ if (Base->hasOneUse())
+ return false;
+
+ return true;
+}
+
+// Return true if Addr is suitable for AM, updating AM if so.
+bool SystemZDAGToDAGISel::selectAddress(SDValue Addr,
+ SystemZAddressingMode &AM) {
+ // Start out assuming that the address will need to be loaded separately,
+ // then try to extend it as much as we can.
+ AM.Base = Addr;
+
+ // First try treating the address as a constant.
+ if (Addr.getOpcode() == ISD::Constant &&
+ expandDisp(AM, true, SDValue(), cast<ConstantSDNode>(Addr)))
+ ;
+ else
+ // Otherwise try expanding each component.
+ while (expandAddress(AM, true) ||
+ (AM.Index.getNode() && expandAddress(AM, false)))
+ continue;
+
+ // Reject cases where it isn't profitable to use LA(Y).
+ if (AM.Form == SystemZAddressingMode::FormBDXLA &&
+ !shouldUseLA(AM.Base.getNode(), AM.Disp, AM.Index.getNode()))
+ return false;
+
+ // Reject cases where the other instruction in a pair should be used.
+ if (!isValidDisp(AM.DR, AM.Disp))
+ return false;
+
+ // Make sure that ADJDYNALLOC is included where necessary.
+ if (AM.isDynAlloc() && !AM.IncludesDynAlloc)
+ return false;
+
+ DEBUG(AM.dump());
+ return true;
+}
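+
+// For example, (add (add %base, %index), 4000) with a Disp12Pair range is
+// matched as Base = %base, Index = %index, Disp = 4000: the first successful
+// expandAddress call folds the constant into the displacement and the next
+// one splits the remaining addition into base and index.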
+
+// Insert a node into the DAG at least before Pos. This will reposition
+// the node as needed, and will assign it a node ID that is <= Pos's ID.
+// Note that this does *not* preserve the uniqueness of node IDs!
+// The selection DAG must no longer depend on their uniqueness when this
+// function is used.
+static void insertDAGNode(SelectionDAG *DAG, SDNode *Pos, SDValue N) {
+ if (N.getNode()->getNodeId() == -1 ||
+ N.getNode()->getNodeId() > Pos->getNodeId()) {
+ DAG->RepositionNode(Pos, N.getNode());
+ N.getNode()->setNodeId(Pos->getNodeId());
+ }
+}
+
+void SystemZDAGToDAGISel::getAddressOperands(const SystemZAddressingMode &AM,
+ EVT VT, SDValue &Base,
+ SDValue &Disp) {
+ Base = AM.Base;
+ if (!Base.getNode())
+ // Register 0 means "no base". This is mostly useful for shifts.
+ Base = CurDAG->getRegister(0, VT);
+ else if (Base.getOpcode() == ISD::FrameIndex) {
+ // Lower a FrameIndex to a TargetFrameIndex.
+ int64_t FrameIndex = cast<FrameIndexSDNode>(Base)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FrameIndex, VT);
+ } else if (Base.getValueType() != VT) {
+ // Truncate values from i64 to i32, for shifts.
+ assert(VT == MVT::i32 && Base.getValueType() == MVT::i64 &&
+ "Unexpected truncation");
+ SDLoc DL(Base);
+ SDValue Trunc = CurDAG->getNode(ISD::TRUNCATE, DL, VT, Base);
+ insertDAGNode(CurDAG, Base.getNode(), Trunc);
+ Base = Trunc;
+ }
+
+ // Lower the displacement to a TargetConstant.
+ Disp = CurDAG->getTargetConstant(AM.Disp, VT);
+}
+
+void SystemZDAGToDAGISel::getAddressOperands(const SystemZAddressingMode &AM,
+ EVT VT, SDValue &Base,
+ SDValue &Disp, SDValue &Index) {
+ getAddressOperands(AM, VT, Base, Disp);
+
+ Index = AM.Index;
+ if (!Index.getNode())
+ // Register 0 means "no index".
+ Index = CurDAG->getRegister(0, VT);
+}
+
+bool SystemZDAGToDAGISel::selectBDAddr(SystemZAddressingMode::DispRange DR,
+ SDValue Addr, SDValue &Base,
+ SDValue &Disp) {
+ SystemZAddressingMode AM(SystemZAddressingMode::FormBD, DR);
+ if (!selectAddress(Addr, AM))
+ return false;
+
+ getAddressOperands(AM, Addr.getValueType(), Base, Disp);
+ return true;
+}
+
+bool SystemZDAGToDAGISel::selectBDXAddr(SystemZAddressingMode::AddrForm Form,
+ SystemZAddressingMode::DispRange DR,
+ SDValue Addr, SDValue &Base,
+ SDValue &Disp, SDValue &Index) {
+ SystemZAddressingMode AM(Form, DR);
+ if (!selectAddress(Addr, AM))
+ return false;
+
+ getAddressOperands(AM, Addr.getValueType(), Base, Disp, Index);
+ return true;
+}
+
+SDNode *SystemZDAGToDAGISel::splitLargeImmediate(unsigned Opcode, SDNode *Node,
+ SDValue Op0, uint64_t UpperVal,
+ uint64_t LowerVal) {
+ EVT VT = Node->getValueType(0);
+ SDLoc DL(Node);
+ SDValue Upper = CurDAG->getConstant(UpperVal, VT);
+ if (Op0.getNode())
+ Upper = CurDAG->getNode(Opcode, DL, VT, Op0, Upper);
+ Upper = SDValue(Select(Upper.getNode()), 0);
+
+ SDValue Lower = CurDAG->getConstant(LowerVal, VT);
+ SDValue Or = CurDAG->getNode(Opcode, DL, VT, Upper, Lower);
+ return Or.getNode();
+}
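+
+// For example, Select() uses this to build the 64-bit constant
+// 0x123456789ABCDEF0 as (OR 0x1234567800000000, 0x9ABCDEF0): the upper
+// piece is selected first and the lower piece is then OR'd in.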
+
+SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) {
+ // Dump information about the Node being selected
+ DEBUG(errs() << "Selecting: "; Node->dump(CurDAG); errs() << "\n");
+
+ // If we have a custom node, we already have selected!
+ if (Node->isMachineOpcode()) {
+ DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+ return 0;
+ }
+
+ unsigned Opcode = Node->getOpcode();
+ switch (Opcode) {
+ case ISD::OR:
+ case ISD::XOR:
+ // If this is a 64-bit operation in which both 32-bit halves are nonzero,
+ // split the operation into two.
+ if (Node->getValueType(0) == MVT::i64)
+ if (ConstantSDNode *Op1 = dyn_cast<ConstantSDNode>(Node->getOperand(1))) {
+ uint64_t Val = Op1->getZExtValue();
+ if (!SystemZ::isImmLF(Val) && !SystemZ::isImmHF(Val))
+ Node = splitLargeImmediate(Opcode, Node, Node->getOperand(0),
+ Val - uint32_t(Val), uint32_t(Val));
+ }
+ break;
+
+ case ISD::Constant:
+ // If this is a 64-bit constant that is out of the range of LLILF,
+ // LLIHF and LGFI, split it into two 32-bit pieces.
+ if (Node->getValueType(0) == MVT::i64) {
+ uint64_t Val = cast<ConstantSDNode>(Node)->getZExtValue();
+ if (!SystemZ::isImmLF(Val) && !SystemZ::isImmHF(Val) && !isInt<32>(Val))
+ Node = splitLargeImmediate(ISD::OR, Node, SDValue(),
+ Val - uint32_t(Val), uint32_t(Val));
+ }
+ break;
+
+ case ISD::ATOMIC_LOAD_SUB:
+ // Try to convert subtractions of constants to additions.
+ if (ConstantSDNode *Op2 = dyn_cast<ConstantSDNode>(Node->getOperand(2))) {
+ uint64_t Value = -Op2->getZExtValue();
+ EVT VT = Node->getValueType(0);
+ if (VT == MVT::i32 || isInt<32>(Value)) {
+ SDValue Ops[] = { Node->getOperand(0), Node->getOperand(1),
+ CurDAG->getConstant(int32_t(Value), VT) };
+ Node = CurDAG->MorphNodeTo(Node, ISD::ATOMIC_LOAD_ADD,
+ Node->getVTList(), Ops, array_lengthof(Ops));
+ }
+ }
+ break;
+ }
+
+ // Select the default instruction
+ SDNode *ResNode = SelectCode(Node);
+
+ DEBUG(errs() << "=> ";
+ if (ResNode == NULL || ResNode == Node)
+ Node->dump(CurDAG);
+ else
+ ResNode->dump(CurDAG);
+ errs() << "\n";
+ );
+ return ResNode;
+}
+
+bool SystemZDAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op,
+ char ConstraintCode,
+ std::vector<SDValue> &OutOps) {
+ assert(ConstraintCode == 'm' && "Unexpected constraint code");
+ // Accept addresses with short displacements, which are compatible
+ // with Q, R, S and T. But keep the index operand for future expansion.
+ SDValue Base, Disp, Index;
+ if (!selectBDXAddr(SystemZAddressingMode::FormBD,
+ SystemZAddressingMode::Disp12Only,
+ Op, Base, Disp, Index))
+ return true;
+ OutOps.push_back(Base);
+ OutOps.push_back(Disp);
+ OutOps.push_back(Index);
+ return false;
+}
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
new file mode 100644
index 0000000..1dc187f
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -0,0 +1,2292 @@
+//===-- SystemZISelLowering.cpp - SystemZ DAG lowering implementation -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SystemZTargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "systemz-lower"
+
+#include "SystemZISelLowering.h"
+#include "SystemZCallingConv.h"
+#include "SystemZConstantPoolValue.h"
+#include "SystemZMachineFunctionInfo.h"
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+using namespace llvm;
+
+// Classify VT as either 32 or 64 bit.
+static bool is32Bit(EVT VT) {
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::i32:
+ return true;
+ case MVT::i64:
+ return false;
+ default:
+ llvm_unreachable("Unsupported type");
+ }
+}
+
+// Return a version of MachineOperand that can be safely used before the
+// final use.
+static MachineOperand earlyUseOperand(MachineOperand Op) {
+ if (Op.isReg())
+ Op.setIsKill(false);
+ return Op;
+}
+
+SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
+ : TargetLowering(tm, new TargetLoweringObjectFileELF()),
+ Subtarget(*tm.getSubtargetImpl()), TM(tm) {
+ MVT PtrVT = getPointerTy();
+
+ // Set up the register classes.
+ addRegisterClass(MVT::i32, &SystemZ::GR32BitRegClass);
+ addRegisterClass(MVT::i64, &SystemZ::GR64BitRegClass);
+ addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass);
+ addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass);
+ addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass);
+
+ // Compute derived properties from the register classes
+ computeRegisterProperties();
+
+ // Set up special registers.
+ setExceptionPointerRegister(SystemZ::R6D);
+ setExceptionSelectorRegister(SystemZ::R7D);
+ setStackPointerRegisterToSaveRestore(SystemZ::R15D);
+
+ // TODO: It may be better to default to latency-oriented scheduling; however,
+ // LLVM's current latency-oriented scheduler can't handle physreg definitions
+ // such as SystemZ's CC register, so use the register-pressure scheduler,
+ // which can.
+ setSchedulingPreference(Sched::RegPressure);
+
+ setBooleanContents(ZeroOrOneBooleanContent);
+ setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
+
+ // Instructions are strings of 2-byte aligned 2-byte values.
+ setMinFunctionAlignment(2);
+
+ // Handle operations that are handled in a similar way for all types.
+ for (unsigned I = MVT::FIRST_INTEGER_VALUETYPE;
+ I <= MVT::LAST_FP_VALUETYPE;
+ ++I) {
+ MVT VT = MVT::SimpleValueType(I);
+ if (isTypeLegal(VT)) {
+ // Expand SETCC(X, Y, COND) into SELECT_CC(X, Y, 1, 0, COND).
+ setOperationAction(ISD::SETCC, VT, Expand);
+
+ // Expand SELECT(C, A, B) into SELECT_CC(C, 0, A, B, NE).
+ setOperationAction(ISD::SELECT, VT, Expand);
+
+ // Lower SELECT_CC and BR_CC into separate comparisons and branches.
+ setOperationAction(ISD::SELECT_CC, VT, Custom);
+ setOperationAction(ISD::BR_CC, VT, Custom);
+ }
+ }
+
+ // Expand jump table branches as address arithmetic followed by an
+ // indirect jump.
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+
+ // Expand BRCOND into a BR_CC (see above).
+ setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+
+ // Handle integer types.
+ for (unsigned I = MVT::FIRST_INTEGER_VALUETYPE;
+ I <= MVT::LAST_INTEGER_VALUETYPE;
+ ++I) {
+ MVT VT = MVT::SimpleValueType(I);
+ if (isTypeLegal(VT)) {
+ // Expand individual DIV and REMs into DIVREMs.
+ setOperationAction(ISD::SDIV, VT, Expand);
+ setOperationAction(ISD::UDIV, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::UREM, VT, Expand);
+ setOperationAction(ISD::SDIVREM, VT, Custom);
+ setOperationAction(ISD::UDIVREM, VT, Custom);
+
+ // Expand ATOMIC_LOAD and ATOMIC_STORE using ATOMIC_CMP_SWAP.
+ // FIXME: probably much too conservative.
+ setOperationAction(ISD::ATOMIC_LOAD, VT, Expand);
+ setOperationAction(ISD::ATOMIC_STORE, VT, Expand);
+
+ // No special instructions for these.
+ setOperationAction(ISD::CTPOP, VT, Expand);
+ setOperationAction(ISD::CTTZ, VT, Expand);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
+ setOperationAction(ISD::ROTR, VT, Expand);
+
+ // Use *MUL_LOHI where possible and a wider multiplication otherwise.
+ setOperationAction(ISD::MULHS, VT, Expand);
+ setOperationAction(ISD::MULHU, VT, Expand);
+
+ // We have instructions for signed but not unsigned FP conversion.
+ setOperationAction(ISD::FP_TO_UINT, VT, Expand);
+ }
+ }
+
+ // Type legalization will convert 8- and 16-bit atomic operations into
+ // forms that operate on i32s (but still keeping the original memory VT).
+ // Lower them into full i32 operations.
+ setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
+
+ // We have instructions for signed but not unsigned FP conversion.
+ // Handle unsigned 32-bit types as signed 64-bit types.
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
+
+ // We have native support for a 64-bit CTLZ, via FLOGR.
+ setOperationAction(ISD::CTLZ, MVT::i32, Promote);
+ setOperationAction(ISD::CTLZ, MVT::i64, Legal);
+
+ // Give LowerOperation the chance to replace 64-bit ORs with subregs.
+ setOperationAction(ISD::OR, MVT::i64, Custom);
+
+ // The architecture has 32-bit SMUL_LOHI and UMUL_LOHI (MR and MLR),
+ // but they aren't really worth using. There is no 64-bit SMUL_LOHI,
+ // but there is a 64-bit UMUL_LOHI: MLGR.
+ setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+ setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i64, Custom);
+
+ // FIXME: Can we support these natively?
+ setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
+ setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
+ setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
+
+ // We have native instructions for i8, i16 and i32 extensions, but not i1.
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
+ setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+ // Handle the various types of symbolic address.
+ setOperationAction(ISD::ConstantPool, PtrVT, Custom);
+ setOperationAction(ISD::GlobalAddress, PtrVT, Custom);
+ setOperationAction(ISD::GlobalTLSAddress, PtrVT, Custom);
+ setOperationAction(ISD::BlockAddress, PtrVT, Custom);
+ setOperationAction(ISD::JumpTable, PtrVT, Custom);
+
+ // We need to handle dynamic allocations specially because of the
+ // 160-byte area at the bottom of the stack.
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
+
+ // Use custom expanders so that we can force the function to use
+ // a frame pointer.
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Custom);
+
+ // Expand these using getExceptionSelectorRegister() and
+ // getExceptionPointerRegister().
+ setOperationAction(ISD::EXCEPTIONADDR, PtrVT, Expand);
+ setOperationAction(ISD::EHSELECTION, PtrVT, Expand);
+
+ // Handle floating-point types.
+ for (unsigned I = MVT::FIRST_FP_VALUETYPE;
+ I <= MVT::LAST_FP_VALUETYPE;
+ ++I) {
+ MVT VT = MVT::SimpleValueType(I);
+ if (isTypeLegal(VT)) {
+ // We can use FI for FRINT.
+ setOperationAction(ISD::FRINT, VT, Legal);
+
+ // No special instructions for these.
+ setOperationAction(ISD::FSIN, VT, Expand);
+ setOperationAction(ISD::FCOS, VT, Expand);
+ setOperationAction(ISD::FREM, VT, Expand);
+ }
+ }
+
+ // We have fused multiply-addition for f32 and f64 but not f128.
+ setOperationAction(ISD::FMA, MVT::f32, Legal);
+ setOperationAction(ISD::FMA, MVT::f64, Legal);
+ setOperationAction(ISD::FMA, MVT::f128, Expand);
+
+ // Needed so that we don't try to implement f128 constant loads using
+ // a load-and-extend of an f80 constant (in cases where the constant
+ // would fit in an f80).
+ setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand);
+
+ // Floating-point truncation and stores need to be done separately.
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f64, Expand);
+
+ // We have 64-bit FPR<->GPR moves, but need special handling for
+ // 32-bit forms.
+ setOperationAction(ISD::BITCAST, MVT::i32, Custom);
+ setOperationAction(ISD::BITCAST, MVT::f32, Custom);
+
+ // VASTART and VACOPY need to deal with the SystemZ-specific varargs
+ // structure, but VAEND is a no-op.
+ setOperationAction(ISD::VASTART, MVT::Other, Custom);
+ setOperationAction(ISD::VACOPY, MVT::Other, Custom);
+ setOperationAction(ISD::VAEND, MVT::Other, Expand);
+}
+
+bool SystemZTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+ // We can load zero using LZ?R and negative zero using LZ?R;LC?BR.
+ return Imm.isZero() || Imm.isNegZero();
+}
+
+bool SystemZTargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
+ bool *Fast) const {
+ // Unaligned accesses should never be slower than the expanded version.
+ // We check specifically for aligned accesses in the few cases where
+ // they are required.
+ if (Fast)
+ *Fast = true;
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Inline asm support
+//===----------------------------------------------------------------------===//
+
+TargetLowering::ConstraintType
+SystemZTargetLowering::getConstraintType(const std::string &Constraint) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ case 'a': // Address register
+ case 'd': // Data register (equivalent to 'r')
+ case 'f': // Floating-point register
+ case 'r': // General-purpose register
+ return C_RegisterClass;
+
+ case 'Q': // Memory with base and unsigned 12-bit displacement
+ case 'R': // Likewise, plus an index
+ case 'S': // Memory with base and signed 20-bit displacement
+ case 'T': // Likewise, plus an index
+ case 'm': // Equivalent to 'T'.
+ return C_Memory;
+
+ case 'I': // Unsigned 8-bit constant
+ case 'J': // Unsigned 12-bit constant
+ case 'K': // Signed 16-bit constant
+ case 'L': // Signed 20-bit displacement (on all targets we support)
+ case 'M': // 0x7fffffff
+ return C_Other;
+
+ default:
+ break;
+ }
+ }
+ return TargetLowering::getConstraintType(Constraint);
+}
+
+TargetLowering::ConstraintWeight SystemZTargetLowering::
+getSingleConstraintMatchWeight(AsmOperandInfo &info,
+ const char *constraint) const {
+ ConstraintWeight weight = CW_Invalid;
+ Value *CallOperandVal = info.CallOperandVal;
+ // If we don't have a value, we can't do a match,
+ // but allow it at the lowest weight.
+ if (CallOperandVal == NULL)
+ return CW_Default;
+ Type *type = CallOperandVal->getType();
+ // Look at the constraint type.
+ switch (*constraint) {
+ default:
+ weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+ break;
+
+ case 'a': // Address register
+ case 'd': // Data register (equivalent to 'r')
+ case 'r': // General-purpose register
+ if (CallOperandVal->getType()->isIntegerTy())
+ weight = CW_Register;
+ break;
+
+ case 'f': // Floating-point register
+ if (type->isFloatingPointTy())
+ weight = CW_Register;
+ break;
+
+ case 'I': // Unsigned 8-bit constant
+ if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal))
+ if (isUInt<8>(C->getZExtValue()))
+ weight = CW_Constant;
+ break;
+
+ case 'J': // Unsigned 12-bit constant
+ if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal))
+ if (isUInt<12>(C->getZExtValue()))
+ weight = CW_Constant;
+ break;
+
+ case 'K': // Signed 16-bit constant
+ if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal))
+ if (isInt<16>(C->getSExtValue()))
+ weight = CW_Constant;
+ break;
+
+ case 'L': // Signed 20-bit displacement (on all targets we support)
+ if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal))
+ if (isInt<20>(C->getSExtValue()))
+ weight = CW_Constant;
+ break;
+
+ case 'M': // 0x7fffffff
+ if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal))
+ if (C->getZExtValue() == 0x7fffffff)
+ weight = CW_Constant;
+ break;
+ }
+ return weight;
+}
+
+std::pair<unsigned, const TargetRegisterClass *> SystemZTargetLowering::
+getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const {
+ if (Constraint.size() == 1) {
+ // GCC Constraint Letters
+ switch (Constraint[0]) {
+ default: break;
+ case 'd': // Data register (equivalent to 'r')
+ case 'r': // General-purpose register
+ if (VT == MVT::i64)
+ return std::make_pair(0U, &SystemZ::GR64BitRegClass);
+ else if (VT == MVT::i128)
+ return std::make_pair(0U, &SystemZ::GR128BitRegClass);
+ return std::make_pair(0U, &SystemZ::GR32BitRegClass);
+
+ case 'a': // Address register
+ if (VT == MVT::i64)
+ return std::make_pair(0U, &SystemZ::ADDR64BitRegClass);
+ else if (VT == MVT::i128)
+ return std::make_pair(0U, &SystemZ::ADDR128BitRegClass);
+ return std::make_pair(0U, &SystemZ::ADDR32BitRegClass);
+
+ case 'f': // Floating-point register
+ if (VT == MVT::f64)
+ return std::make_pair(0U, &SystemZ::FP64BitRegClass);
+ else if (VT == MVT::f128)
+ return std::make_pair(0U, &SystemZ::FP128BitRegClass);
+ return std::make_pair(0U, &SystemZ::FP32BitRegClass);
+ }
+ }
+ return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+void SystemZTargetLowering::
+LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const {
+ // Only support length 1 constraints for now.
+ if (Constraint.length() == 1) {
+ switch (Constraint[0]) {
+ case 'I': // Unsigned 8-bit constant
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
+ if (isUInt<8>(C->getZExtValue()))
+ Ops.push_back(DAG.getTargetConstant(C->getZExtValue(),
+ Op.getValueType()));
+ return;
+
+ case 'J': // Unsigned 12-bit constant
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
+ if (isUInt<12>(C->getZExtValue()))
+ Ops.push_back(DAG.getTargetConstant(C->getZExtValue(),
+ Op.getValueType()));
+ return;
+
+ case 'K': // Signed 16-bit constant
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
+ if (isInt<16>(C->getSExtValue()))
+ Ops.push_back(DAG.getTargetConstant(C->getSExtValue(),
+ Op.getValueType()));
+ return;
+
+ case 'L': // Signed 20-bit displacement (on all targets we support)
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
+ if (isInt<20>(C->getSExtValue()))
+ Ops.push_back(DAG.getTargetConstant(C->getSExtValue(),
+ Op.getValueType()));
+ return;
+
+ case 'M': // 0x7fffffff
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
+ if (C->getZExtValue() == 0x7fffffff)
+ Ops.push_back(DAG.getTargetConstant(C->getZExtValue(),
+ Op.getValueType()));
+ return;
+ }
+ }
+ TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+//===----------------------------------------------------------------------===//
+// Calling conventions
+//===----------------------------------------------------------------------===//
+
+#include "SystemZGenCallingConv.inc"
+
+// Value is a value that has been passed to us in the location described by VA
+// (and so has type VA.getLocVT()). Convert Value to VA.getValVT(), chaining
+// any loads onto Chain.
+static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDLoc DL,
+ CCValAssign &VA, SDValue Chain,
+ SDValue Value) {
+ // If the argument has been promoted from a smaller type, insert an
+ // assertion to capture this.
+ if (VA.getLocInfo() == CCValAssign::SExt)
+ Value = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Value,
+ DAG.getValueType(VA.getValVT()));
+ else if (VA.getLocInfo() == CCValAssign::ZExt)
+ Value = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Value,
+ DAG.getValueType(VA.getValVT()));
+
+ if (VA.isExtInLoc())
+ Value = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Value);
+ else if (VA.getLocInfo() == CCValAssign::Indirect)
+ Value = DAG.getLoad(VA.getValVT(), DL, Chain, Value,
+ MachinePointerInfo(), false, false, false, 0);
+ else
+ assert(VA.getLocInfo() == CCValAssign::Full && "Unsupported getLocInfo");
+ return Value;
+}
+
+// Value is a value of type VA.getValVT() that we need to copy into
+// the location described by VA. Return a copy of Value converted to
+// VA.getLocVT(). The caller is responsible for handling indirect values.
+static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDLoc DL,
+ CCValAssign &VA, SDValue Value) {
+ switch (VA.getLocInfo()) {
+ case CCValAssign::SExt:
+ return DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Value);
+ case CCValAssign::ZExt:
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Value);
+ case CCValAssign::AExt:
+ return DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Value);
+ case CCValAssign::Full:
+ return Value;
+ default:
+ llvm_unreachable("Unhandled getLocInfo()");
+ }
+}
+
+SDValue SystemZTargetLowering::
+LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SDLoc DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ SystemZMachineFunctionInfo *FuncInfo =
+ MF.getInfo<SystemZMachineFunctionInfo>();
+ const SystemZFrameLowering *TFL =
+ static_cast<const SystemZFrameLowering *>(TM.getFrameLowering());
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, IsVarArg, MF, TM, ArgLocs, *DAG.getContext());
+ CCInfo.AnalyzeFormalArguments(Ins, CC_SystemZ);
+
+ unsigned NumFixedGPRs = 0;
+ unsigned NumFixedFPRs = 0;
+ for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+ SDValue ArgValue;
+ CCValAssign &VA = ArgLocs[I];
+ EVT LocVT = VA.getLocVT();
+ if (VA.isRegLoc()) {
+ // Arguments passed in registers
+ const TargetRegisterClass *RC;
+ switch (LocVT.getSimpleVT().SimpleTy) {
+ default:
+ // Integers smaller than i64 should be promoted to i64.
+ llvm_unreachable("Unexpected argument type");
+ case MVT::i32:
+ NumFixedGPRs += 1;
+ RC = &SystemZ::GR32BitRegClass;
+ break;
+ case MVT::i64:
+ NumFixedGPRs += 1;
+ RC = &SystemZ::GR64BitRegClass;
+ break;
+ case MVT::f32:
+ NumFixedFPRs += 1;
+ RC = &SystemZ::FP32BitRegClass;
+ break;
+ case MVT::f64:
+ NumFixedFPRs += 1;
+ RC = &SystemZ::FP64BitRegClass;
+ break;
+ }
+
+ unsigned VReg = MRI.createVirtualRegister(RC);
+ MRI.addLiveIn(VA.getLocReg(), VReg);
+ ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, LocVT);
+ } else {
+ assert(VA.isMemLoc() && "Argument not register or memory");
+
+ // Create the frame index object for this incoming parameter.
+ int FI = MFI->CreateFixedObject(LocVT.getSizeInBits() / 8,
+ VA.getLocMemOffset(), true);
+
+ // Create the SelectionDAG nodes corresponding to a load
+ // from this parameter. Unpromoted ints and floats are
+ // passed as right-justified 8-byte values.
+ EVT PtrVT = getPointerTy();
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
+ FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4));
+ ArgValue = DAG.getLoad(LocVT, DL, Chain, FIN,
+ MachinePointerInfo::getFixedStack(FI),
+ false, false, false, 0);
+ }
+
+ // Convert the value of the argument register into the value that's
+ // being passed.
+ InVals.push_back(convertLocVTToValVT(DAG, DL, VA, Chain, ArgValue));
+ }
+
+ if (IsVarArg) {
+ // Save the number of non-varargs registers for later use by va_start, etc.
+ FuncInfo->setVarArgsFirstGPR(NumFixedGPRs);
+ FuncInfo->setVarArgsFirstFPR(NumFixedFPRs);
+
+ // Likewise the address (in the form of a frame index) of where the
+ // first stack vararg would be. The 1-byte size here is arbitrary.
+ int64_t StackSize = CCInfo.getNextStackOffset();
+ FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize, true));
+
+ // ...and a similar frame index for the caller-allocated save area
+ // that will be used to store the incoming registers.
+ int64_t RegSaveOffset = TFL->getOffsetOfLocalArea();
+ unsigned RegSaveIndex = MFI->CreateFixedObject(1, RegSaveOffset, true);
+ FuncInfo->setRegSaveFrameIndex(RegSaveIndex);
+
+ // Store the FPR varargs in the reserved frame slots. (We store the
+ // GPRs as part of the prologue.)
+ if (NumFixedFPRs < SystemZ::NumArgFPRs) {
+ SDValue MemOps[SystemZ::NumArgFPRs];
+ for (unsigned I = NumFixedFPRs; I < SystemZ::NumArgFPRs; ++I) {
+ unsigned Offset = TFL->getRegSpillOffset(SystemZ::ArgFPRs[I]);
+ int FI = MFI->CreateFixedObject(8, RegSaveOffset + Offset, true);
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+ unsigned VReg = MF.addLiveIn(SystemZ::ArgFPRs[I],
+ &SystemZ::FP64BitRegClass);
+ SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f64);
+ MemOps[I] = DAG.getStore(ArgValue.getValue(1), DL, ArgValue, FIN,
+ MachinePointerInfo::getFixedStack(FI),
+ false, false, 0);
+
+ }
+ // Join the stores, which are independent of one another.
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+ &MemOps[NumFixedFPRs],
+ SystemZ::NumArgFPRs - NumFixedFPRs);
+ }
+ }
+
+ return Chain;
+}
+
+SDValue
+SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ SDLoc &DL = CLI.DL;
+ SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+ SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+ SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &isTailCall = CLI.IsTailCall;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool IsVarArg = CLI.IsVarArg;
+ MachineFunction &MF = DAG.getMachineFunction();
+ EVT PtrVT = getPointerTy();
+
+ // SystemZ target does not yet support tail call optimization.
+ isTailCall = false;
+
+ // Analyze the operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState ArgCCInfo(CallConv, IsVarArg, MF, TM, ArgLocs, *DAG.getContext());
+ ArgCCInfo.AnalyzeCallOperands(Outs, CC_SystemZ);
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = ArgCCInfo.getNextStackOffset();
+
+ // Mark the start of the call.
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getConstant(NumBytes, PtrVT, true),
+ DL);
+
+ // Copy argument values to their designated locations.
+ SmallVector<std::pair<unsigned, SDValue>, 9> RegsToPass;
+ SmallVector<SDValue, 8> MemOpChains;
+ SDValue StackPtr;
+ for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+ CCValAssign &VA = ArgLocs[I];
+ SDValue ArgValue = OutVals[I];
+
+ if (VA.getLocInfo() == CCValAssign::Indirect) {
+ // Store the argument in a stack slot and pass its address.
+ SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
+ int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
+ MemOpChains.push_back(DAG.getStore(Chain, DL, ArgValue, SpillSlot,
+ MachinePointerInfo::getFixedStack(FI),
+ false, false, 0));
+ ArgValue = SpillSlot;
+ } else
+ ArgValue = convertValVTToLocVT(DAG, DL, VA, ArgValue);
+
+ if (VA.isRegLoc())
+ // Queue up the argument copies and emit them at the end.
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue));
+ else {
+ assert(VA.isMemLoc() && "Argument not register or memory");
+
+ // Work out the address of the stack slot. Unpromoted ints and
+ // floats are passed as right-justified 8-byte values.
+ if (!StackPtr.getNode())
+ StackPtr = DAG.getCopyFromReg(Chain, DL, SystemZ::R15D, PtrVT);
+ unsigned Offset = SystemZMC::CallFrameSize + VA.getLocMemOffset();
+ if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
+ Offset += 4;
+ SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr,
+ DAG.getIntPtrConstant(Offset));
+
+ // Emit the store.
+ MemOpChains.push_back(DAG.getStore(Chain, DL, ArgValue, Address,
+ MachinePointerInfo(),
+ false, false, 0));
+ }
+ }
+
+ // Join the stores, which are independent of one another.
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+ &MemOpChains[0], MemOpChains.size());
+
+ // Build a sequence of copy-to-reg nodes, chained and glued together.
+ SDValue Glue;
+ for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I) {
+ Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[I].first,
+ RegsToPass[I].second, Glue);
+ Glue = Chain.getValue(1);
+ }
+
+ // Accept direct calls by converting symbolic call addresses to the
+ // associated Target* opcodes.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, PtrVT);
+ Callee = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Callee);
+ } else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT);
+ Callee = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Callee);
+ }
+
+ // The first call operand is the chain and the second is the target address.
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ // Add argument registers to the end of the list so that they are
+ // known live into the call.
+ for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I)
+ Ops.push_back(DAG.getRegister(RegsToPass[I].first,
+ RegsToPass[I].second.getValueType()));
+
+ // Glue the call to the argument copies, if any.
+ if (Glue.getNode())
+ Ops.push_back(Glue);
+
+ // Emit the call.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ Chain = DAG.getNode(SystemZISD::CALL, DL, NodeTys, &Ops[0], Ops.size());
+ Glue = Chain.getValue(1);
+
+ // Mark the end of the call, which is glued to the call itself.
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getConstant(NumBytes, PtrVT, true),
+ DAG.getConstant(0, PtrVT, true),
+ Glue, DL);
+ Glue = Chain.getValue(1);
+
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RetLocs;
+ CCState RetCCInfo(CallConv, IsVarArg, MF, TM, RetLocs, *DAG.getContext());
+ RetCCInfo.AnalyzeCallResult(Ins, RetCC_SystemZ);
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned I = 0, E = RetLocs.size(); I != E; ++I) {
+ CCValAssign &VA = RetLocs[I];
+
+ // Copy the value out, gluing the copy to the end of the call sequence.
+ SDValue RetValue = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(),
+ VA.getLocVT(), Glue);
+ Chain = RetValue.getValue(1);
+ Glue = RetValue.getValue(2);
+
+ // Convert the value of the return register into the value that's
+ // being returned.
+ InVals.push_back(convertLocVTToValVT(DAG, DL, VA, Chain, RetValue));
+ }
+
+ return Chain;
+}
+
+SDValue
+SystemZTargetLowering::LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ SDLoc DL, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ // Assign locations to each returned value.
+ SmallVector<CCValAssign, 16> RetLocs;
+ CCState RetCCInfo(CallConv, IsVarArg, MF, TM, RetLocs, *DAG.getContext());
+ RetCCInfo.AnalyzeReturn(Outs, RetCC_SystemZ);
+
+ // Quick exit for void returns
+ if (RetLocs.empty())
+ return DAG.getNode(SystemZISD::RET_FLAG, DL, MVT::Other, Chain);
+
+ // Copy the result values into the output registers.
+ SDValue Glue;
+ SmallVector<SDValue, 4> RetOps;
+ RetOps.push_back(Chain);
+ for (unsigned I = 0, E = RetLocs.size(); I != E; ++I) {
+ CCValAssign &VA = RetLocs[I];
+ SDValue RetValue = OutVals[I];
+
+ // Make the return register live on exit.
+ assert(VA.isRegLoc() && "Can only return in registers!");
+
+ // Promote the value as required.
+ RetValue = convertValVTToLocVT(DAG, DL, VA, RetValue);
+
+ // Chain and glue the copies together.
+ unsigned Reg = VA.getLocReg();
+ Chain = DAG.getCopyToReg(Chain, DL, Reg, RetValue, Glue);
+ Glue = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(Reg, VA.getLocVT()));
+ }
+
+ // Update chain and glue.
+ RetOps[0] = Chain;
+ if (Glue.getNode())
+ RetOps.push_back(Glue);
+
+ return DAG.getNode(SystemZISD::RET_FLAG, DL, MVT::Other,
+ RetOps.data(), RetOps.size());
+}
+
+// CC is a comparison that will be implemented using an integer or
+// floating-point comparison. Return the condition code mask for
+// a branch on true. In the integer case, CCMASK_CMP_UO is set for
+// unsigned comparisons and clear for signed ones. In the floating-point
+// case, CCMASK_CMP_UO has its normal mask meaning (unordered).
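+// For example, SETLT and SETOLT both map to CCMASK_CMP_LT, while SETULT
+// maps to CCMASK_CMP_UO | CCMASK_CMP_LT.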
+static unsigned CCMaskForCondCode(ISD::CondCode CC) {
+#define CONV(X) \
+ case ISD::SET##X: return SystemZ::CCMASK_CMP_##X; \
+ case ISD::SETO##X: return SystemZ::CCMASK_CMP_##X; \
+ case ISD::SETU##X: return SystemZ::CCMASK_CMP_UO | SystemZ::CCMASK_CMP_##X
+
+ switch (CC) {
+ default:
+ llvm_unreachable("Invalid integer condition!");
+
+ CONV(EQ);
+ CONV(NE);
+ CONV(GT);
+ CONV(GE);
+ CONV(LT);
+ CONV(LE);
+
+ case ISD::SETO: return SystemZ::CCMASK_CMP_O;
+ case ISD::SETUO: return SystemZ::CCMASK_CMP_UO;
+ }
+#undef CONV
+}
+
+// If a comparison described by IsUnsigned, CCMask, CmpOp0 and CmpOp1
+// is suitable for CLI(Y), CHHSI or CLHHSI, adjust the operands as necessary.
+static void adjustSubwordCmp(SelectionDAG &DAG, bool &IsUnsigned,
+ SDValue &CmpOp0, SDValue &CmpOp1,
+ unsigned &CCMask) {
+  // For us to make any changes, it must be a comparison between a single-use
+ // load and a constant.
+ if (!CmpOp0.hasOneUse() ||
+ CmpOp0.getOpcode() != ISD::LOAD ||
+ CmpOp1.getOpcode() != ISD::Constant)
+ return;
+
+ // We must have an 8- or 16-bit load.
+ LoadSDNode *Load = cast<LoadSDNode>(CmpOp0);
+ unsigned NumBits = Load->getMemoryVT().getStoreSizeInBits();
+ if (NumBits != 8 && NumBits != 16)
+ return;
+
+ // The load must be an extending one and the constant must be within the
+ // range of the unextended value.
+ ConstantSDNode *Constant = cast<ConstantSDNode>(CmpOp1);
+ uint64_t Value = Constant->getZExtValue();
+ uint64_t Mask = (1 << NumBits) - 1;
+ if (Load->getExtensionType() == ISD::SEXTLOAD) {
+ int64_t SignedValue = Constant->getSExtValue();
+ if (uint64_t(SignedValue) + (1ULL << (NumBits - 1)) > Mask)
+ return;
+ // Unsigned comparison between two sign-extended values is equivalent
+ // to unsigned comparison between two zero-extended values.
+ if (IsUnsigned)
+ Value &= Mask;
+ else if (CCMask == SystemZ::CCMASK_CMP_EQ ||
+ CCMask == SystemZ::CCMASK_CMP_NE)
+ // Any choice of IsUnsigned is OK for equality comparisons.
+ // We could use either CHHSI or CLHHSI for 16-bit comparisons,
+ // but since we use CLHHSI for zero extensions, it seems better
+ // to be consistent and do the same here.
+ Value &= Mask, IsUnsigned = true;
+ else if (NumBits == 8) {
+ // Try to treat the comparison as unsigned, so that we can use CLI.
+ // Adjust CCMask and Value as necessary.
+ if (Value == 0 && CCMask == SystemZ::CCMASK_CMP_LT)
+ // Test whether the high bit of the byte is set.
+ Value = 127, CCMask = SystemZ::CCMASK_CMP_GT, IsUnsigned = true;
+ else if (SignedValue == -1 && CCMask == SystemZ::CCMASK_CMP_GT)
+ // Test whether the high bit of the byte is clear.
+ Value = 128, CCMask = SystemZ::CCMASK_CMP_LT, IsUnsigned = true;
+ else
+ // No instruction exists for this combination.
+ return;
+ }
+ } else if (Load->getExtensionType() == ISD::ZEXTLOAD) {
+ if (Value > Mask)
+ return;
+ // Signed comparison between two zero-extended values is equivalent
+ // to unsigned comparison.
+ IsUnsigned = true;
+ } else
+ return;
+
+ // Make sure that the first operand is an i32 of the right extension type.
+ ISD::LoadExtType ExtType = IsUnsigned ? ISD::ZEXTLOAD : ISD::SEXTLOAD;
+ if (CmpOp0.getValueType() != MVT::i32 ||
+ Load->getExtensionType() != ExtType)
+ CmpOp0 = DAG.getExtLoad(ExtType, SDLoc(Load), MVT::i32,
+ Load->getChain(), Load->getBasePtr(),
+ Load->getPointerInfo(), Load->getMemoryVT(),
+ Load->isVolatile(), Load->isNonTemporal(),
+ Load->getAlignment());
+
+ // Make sure that the second operand is an i32 with the right value.
+ if (CmpOp1.getValueType() != MVT::i32 ||
+ Value != Constant->getZExtValue())
+ CmpOp1 = DAG.getConstant(Value, MVT::i32);
+}
+
+// Return true if a comparison described by CCMask, CmpOp0 and CmpOp1
+// is an equality comparison that is better implemented using unsigned
+// rather than signed comparison instructions.
+static bool preferUnsignedComparison(SelectionDAG &DAG, SDValue CmpOp0,
+ SDValue CmpOp1, unsigned CCMask) {
+ // The test must be for equality or inequality.
+ if (CCMask != SystemZ::CCMASK_CMP_EQ && CCMask != SystemZ::CCMASK_CMP_NE)
+ return false;
+
+ if (CmpOp1.getOpcode() == ISD::Constant) {
+ uint64_t Value = cast<ConstantSDNode>(CmpOp1)->getSExtValue();
+
+ // If we're comparing with memory, prefer unsigned comparisons for
+ // values that are in the unsigned 16-bit range but not the signed
+ // 16-bit range. We want to use CLFHSI and CLGHSI.
+ if (CmpOp0.hasOneUse() &&
+ ISD::isNormalLoad(CmpOp0.getNode()) &&
+ (Value >= 32768 && Value < 65536))
+ return true;
+
+ // Use unsigned comparisons for values that are in the CLGFI range
+ // but not in the CGFI range.
+ if (CmpOp0.getValueType() == MVT::i64 && (Value >> 31) == 1)
+ return true;
+
+ return false;
+ }
+
+ // Prefer CL for zero-extended loads.
+ if (CmpOp1.getOpcode() == ISD::ZERO_EXTEND ||
+ ISD::isZEXTLoad(CmpOp1.getNode()))
+ return true;
+
+ // ...and for "in-register" zero extensions.
+ if (CmpOp1.getOpcode() == ISD::AND && CmpOp1.getValueType() == MVT::i64) {
+ SDValue Mask = CmpOp1.getOperand(1);
+ if (Mask.getOpcode() == ISD::Constant &&
+ cast<ConstantSDNode>(Mask)->getZExtValue() == 0xffffffff)
+ return true;
+ }
+
+ return false;
+}
+
+// Return a target node that compares CmpOp0 and CmpOp1. Set CCMask to the
+// 4-bit condition-code mask for CC.
+static SDValue emitCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,
+ ISD::CondCode CC, unsigned &CCMask) {
+ bool IsUnsigned = false;
+ CCMask = CCMaskForCondCode(CC);
+ if (!CmpOp0.getValueType().isFloatingPoint()) {
+ IsUnsigned = CCMask & SystemZ::CCMASK_CMP_UO;
+ CCMask &= ~SystemZ::CCMASK_CMP_UO;
+ adjustSubwordCmp(DAG, IsUnsigned, CmpOp0, CmpOp1, CCMask);
+ if (preferUnsignedComparison(DAG, CmpOp0, CmpOp1, CCMask))
+ IsUnsigned = true;
+ }
+
+ SDLoc DL(CmpOp0);
+ return DAG.getNode((IsUnsigned ? SystemZISD::UCMP : SystemZISD::CMP),
+ DL, MVT::Glue, CmpOp0, CmpOp1);
+}
+
+// Lower a binary operation that produces two VT results, one in each
+// half of a GR128 pair. Op0 and Op1 are the VT operands to the operation,
+// Extend extends Op0 to a GR128, and Opcode performs the GR128 operation
+// on the extended Op0 and (unextended) Op1. Store the even register result
+// in Even and the odd register result in Odd.
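+// The callers below use the even register for the high multiplication
+// result or the remainder, and the odd register for the low result or
+// the quotient.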
+static void lowerGR128Binary(SelectionDAG &DAG, SDLoc DL, EVT VT,
+ unsigned Extend, unsigned Opcode,
+ SDValue Op0, SDValue Op1,
+ SDValue &Even, SDValue &Odd) {
+ SDNode *In128 = DAG.getMachineNode(Extend, DL, MVT::Untyped, Op0);
+ SDValue Result = DAG.getNode(Opcode, DL, MVT::Untyped,
+ SDValue(In128, 0), Op1);
+ bool Is32Bit = is32Bit(VT);
+ SDValue SubReg0 = DAG.getTargetConstant(SystemZ::even128(Is32Bit), VT);
+ SDValue SubReg1 = DAG.getTargetConstant(SystemZ::odd128(Is32Bit), VT);
+ SDNode *Reg0 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
+ VT, Result, SubReg0);
+ SDNode *Reg1 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
+ VT, Result, SubReg1);
+ Even = SDValue(Reg0, 0);
+ Odd = SDValue(Reg1, 0);
+}
+
+SDValue SystemZTargetLowering::lowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+ SDValue Chain = Op.getOperand(0);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+ SDValue CmpOp0 = Op.getOperand(2);
+ SDValue CmpOp1 = Op.getOperand(3);
+ SDValue Dest = Op.getOperand(4);
+ SDLoc DL(Op);
+
+ unsigned CCMask;
+ SDValue Flags = emitCmp(DAG, CmpOp0, CmpOp1, CC, CCMask);
+ return DAG.getNode(SystemZISD::BR_CCMASK, DL, Op.getValueType(),
+ Chain, DAG.getConstant(CCMask, MVT::i32), Dest, Flags);
+}
+
+SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue CmpOp0 = Op.getOperand(0);
+ SDValue CmpOp1 = Op.getOperand(1);
+ SDValue TrueOp = Op.getOperand(2);
+ SDValue FalseOp = Op.getOperand(3);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+ SDLoc DL(Op);
+
+ unsigned CCMask;
+ SDValue Flags = emitCmp(DAG, CmpOp0, CmpOp1, CC, CCMask);
+
+ SmallVector<SDValue, 4> Ops;
+ Ops.push_back(TrueOp);
+ Ops.push_back(FalseOp);
+ Ops.push_back(DAG.getConstant(CCMask, MVT::i32));
+ Ops.push_back(Flags);
+
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+ return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VTs, &Ops[0], Ops.size());
+}
+
+SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Node);
+ const GlobalValue *GV = Node->getGlobal();
+ int64_t Offset = Node->getOffset();
+ EVT PtrVT = getPointerTy();
+ Reloc::Model RM = TM.getRelocationModel();
+ CodeModel::Model CM = TM.getCodeModel();
+
+ SDValue Result;
+ if (Subtarget.isPC32DBLSymbol(GV, RM, CM)) {
+ // Make sure that the offset is aligned to a halfword. If it isn't,
+ // create an "anchor" at the previous 12-bit boundary.
+ // FIXME check whether there is a better way of handling this.
+ if (Offset & 1) {
+ Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT,
+ Offset & ~uint64_t(0xfff));
+ Offset &= 0xfff;
+ } else {
+ Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, Offset);
+ Offset = 0;
+ }
+ Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
+ } else {
+ Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, SystemZII::MO_GOT);
+ Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
+ Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
+ MachinePointerInfo::getGOT(), false, false, false, 0);
+ }
+
+ // If there was a non-zero offset that we didn't fold, create an explicit
+ // addition for it.
+ if (Offset != 0)
+ Result = DAG.getNode(ISD::ADD, DL, PtrVT, Result,
+ DAG.getConstant(Offset, PtrVT));
+
+ return Result;
+}
+
+SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Node);
+ const GlobalValue *GV = Node->getGlobal();
+ EVT PtrVT = getPointerTy();
+ TLSModel::Model model = TM.getTLSModel(GV);
+
+ if (model != TLSModel::LocalExec)
+ llvm_unreachable("only local-exec TLS mode supported");
+
+ // The high part of the thread pointer is in access register 0.
+ SDValue TPHi = DAG.getNode(SystemZISD::EXTRACT_ACCESS, DL, MVT::i32,
+ DAG.getConstant(0, MVT::i32));
+ TPHi = DAG.getNode(ISD::ANY_EXTEND, DL, PtrVT, TPHi);
+
+ // The low part of the thread pointer is in access register 1.
+ SDValue TPLo = DAG.getNode(SystemZISD::EXTRACT_ACCESS, DL, MVT::i32,
+ DAG.getConstant(1, MVT::i32));
+ TPLo = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TPLo);
+
+ // Merge them into a single 64-bit address.
+ SDValue TPHiShifted = DAG.getNode(ISD::SHL, DL, PtrVT, TPHi,
+ DAG.getConstant(32, PtrVT));
+ SDValue TP = DAG.getNode(ISD::OR, DL, PtrVT, TPHiShifted, TPLo);
+
+ // Get the offset of GA from the thread pointer.
+ SystemZConstantPoolValue *CPV =
+ SystemZConstantPoolValue::Create(GV, SystemZCP::NTPOFF);
+
+ // Force the offset into the constant pool and load it from there.
+ SDValue CPAddr = DAG.getConstantPool(CPV, PtrVT, 8);
+ SDValue Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
+ CPAddr, MachinePointerInfo::getConstantPool(),
+ false, false, false, 0);
+
+ // Add the base and offset together.
+ return DAG.getNode(ISD::ADD, DL, PtrVT, TP, Offset);
+}
+
+SDValue SystemZTargetLowering::lowerBlockAddress(BlockAddressSDNode *Node,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Node);
+ const BlockAddress *BA = Node->getBlockAddress();
+ int64_t Offset = Node->getOffset();
+ EVT PtrVT = getPointerTy();
+
+ SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset);
+ Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
+ return Result;
+}
+
+SDValue SystemZTargetLowering::lowerJumpTable(JumpTableSDNode *JT,
+ SelectionDAG &DAG) const {
+ SDLoc DL(JT);
+ EVT PtrVT = getPointerTy();
+ SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
+
+ // Use LARL to load the address of the table.
+ return DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
+}
+
+SDValue SystemZTargetLowering::lowerConstantPool(ConstantPoolSDNode *CP,
+ SelectionDAG &DAG) const {
+ SDLoc DL(CP);
+ EVT PtrVT = getPointerTy();
+
+ SDValue Result;
+ if (CP->isMachineConstantPoolEntry())
+ Result = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
+ CP->getAlignment());
+ else
+ Result = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
+ CP->getAlignment(), CP->getOffset());
+
+ // Use LARL to load the address of the constant pool entry.
+ return DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
+}
+
+SDValue SystemZTargetLowering::lowerBITCAST(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ SDValue In = Op.getOperand(0);
+ EVT InVT = In.getValueType();
+ EVT ResVT = Op.getValueType();
+
+ SDValue SubReg32 = DAG.getTargetConstant(SystemZ::subreg_32bit, MVT::i64);
+ SDValue Shift32 = DAG.getConstant(32, MVT::i64);
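+  // In both directions the 32-bit value travels in the high word of a
+  // 64-bit value: the i32 is shifted up before the 64-bit bitcast, and the
+  // f32 is inserted or extracted as the 32-bit subregister of an f64.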
+ if (InVT == MVT::i32 && ResVT == MVT::f32) {
+ SDValue In64 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, In);
+ SDValue Shift = DAG.getNode(ISD::SHL, DL, MVT::i64, In64, Shift32);
+ SDValue Out64 = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Shift);
+ SDNode *Out = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
+ MVT::f32, Out64, SubReg32);
+ return SDValue(Out, 0);
+ }
+ if (InVT == MVT::f32 && ResVT == MVT::i32) {
+ SDNode *U64 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f64);
+ SDNode *In64 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
+ MVT::f64, SDValue(U64, 0), In, SubReg32);
+ SDValue Out64 = DAG.getNode(ISD::BITCAST, DL, MVT::i64, SDValue(In64, 0));
+ SDValue Shift = DAG.getNode(ISD::SRL, DL, MVT::i64, Out64, Shift32);
+ SDValue Out = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Shift);
+ return Out;
+ }
+ llvm_unreachable("Unexpected bitcast combination");
+}
+
+SDValue SystemZTargetLowering::lowerVASTART(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ SystemZMachineFunctionInfo *FuncInfo =
+ MF.getInfo<SystemZMachineFunctionInfo>();
+ EVT PtrVT = getPointerTy();
+
+ SDValue Chain = Op.getOperand(0);
+ SDValue Addr = Op.getOperand(1);
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ SDLoc DL(Op);
+
+ // The initial values of each field.
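+  // These appear to correspond to the four 8-byte fields of the s390x
+  // va_list (__gpr, __fpr, __overflow_arg_area and __reg_save_area).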
+ const unsigned NumFields = 4;
+ SDValue Fields[NumFields] = {
+ DAG.getConstant(FuncInfo->getVarArgsFirstGPR(), PtrVT),
+ DAG.getConstant(FuncInfo->getVarArgsFirstFPR(), PtrVT),
+ DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT),
+ DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT)
+ };
+
+ // Store each field into its respective slot.
+ SDValue MemOps[NumFields];
+ unsigned Offset = 0;
+ for (unsigned I = 0; I < NumFields; ++I) {
+ SDValue FieldAddr = Addr;
+ if (Offset != 0)
+ FieldAddr = DAG.getNode(ISD::ADD, DL, PtrVT, FieldAddr,
+ DAG.getIntPtrConstant(Offset));
+ MemOps[I] = DAG.getStore(Chain, DL, Fields[I], FieldAddr,
+ MachinePointerInfo(SV, Offset),
+ false, false, 0);
+ Offset += 8;
+ }
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps, NumFields);
+}
+
+SDValue SystemZTargetLowering::lowerVACOPY(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Chain = Op.getOperand(0);
+ SDValue DstPtr = Op.getOperand(1);
+ SDValue SrcPtr = Op.getOperand(2);
+ const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
+ const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+ SDLoc DL(Op);
+
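+  // The va_list built by lowerVASTART is four 8-byte fields, so copying
+  // 32 bytes copies the whole structure.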
+ return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(32),
+ /*Align*/8, /*isVolatile*/false, /*AlwaysInline*/false,
+ MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
+}
+
+SDValue SystemZTargetLowering::
+lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
+ SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+ SDLoc DL(Op);
+
+ unsigned SPReg = getStackPointerRegisterToSaveRestore();
+
+ // Get a reference to the stack pointer.
+ SDValue OldSP = DAG.getCopyFromReg(Chain, DL, SPReg, MVT::i64);
+
+ // Get the new stack pointer value.
+ SDValue NewSP = DAG.getNode(ISD::SUB, DL, MVT::i64, OldSP, Size);
+
+ // Copy the new stack pointer back.
+ Chain = DAG.getCopyToReg(Chain, DL, SPReg, NewSP);
+
+ // The allocated data lives above the 160 bytes allocated for the standard
+ // frame, plus any outgoing stack arguments. We don't know how much that
+ // amounts to yet, so emit a special ADJDYNALLOC placeholder.
+ SDValue ArgAdjust = DAG.getNode(SystemZISD::ADJDYNALLOC, DL, MVT::i64);
+ SDValue Result = DAG.getNode(ISD::ADD, DL, MVT::i64, NewSP, ArgAdjust);
+
+ SDValue Ops[2] = { Result, Chain };
+ return DAG.getMergeValues(Ops, 2, DL);
+}
+
+SDValue SystemZTargetLowering::lowerUMUL_LOHI(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ assert(!is32Bit(VT) && "Only support 64-bit UMUL_LOHI");
+
+ // UMUL_LOHI64 returns the low result in the odd register and the high
+ // result in the even register. UMUL_LOHI is defined to return the
+ // low half first, so the results are in reverse order.
+ SDValue Ops[2];
+ lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, SystemZISD::UMUL_LOHI64,
+ Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
+ return DAG.getMergeValues(Ops, 2, DL);
+}
+
+SDValue SystemZTargetLowering::lowerSDIVREM(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+
+ // We use DSGF for 32-bit division.
+ if (is32Bit(VT)) {
+ Op0 = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Op0);
+ Op1 = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Op1);
+ }
+
+ // DSG(F) takes a 64-bit dividend, so the even register in the GR128
+ // input is "don't care". The instruction returns the remainder in
+ // the even register and the quotient in the odd register.
+ SDValue Ops[2];
+ lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, SystemZISD::SDIVREM64,
+ Op0, Op1, Ops[1], Ops[0]);
+ return DAG.getMergeValues(Ops, 2, DL);
+}
+
+SDValue SystemZTargetLowering::lowerUDIVREM(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+
+ // DL(G) uses a double-width dividend, so we need to clear the even
+ // register in the GR128 input. The instruction returns the remainder
+ // in the even register and the quotient in the odd register.
+ SDValue Ops[2];
+ if (is32Bit(VT))
+ lowerGR128Binary(DAG, DL, VT, SystemZ::ZEXT128_32, SystemZISD::UDIVREM32,
+ Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
+ else
+ lowerGR128Binary(DAG, DL, VT, SystemZ::ZEXT128_64, SystemZISD::UDIVREM64,
+ Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
+ return DAG.getMergeValues(Ops, 2, DL);
+}
+
+SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const {
+ assert(Op.getValueType() == MVT::i64 && "Should be 64-bit operation");
+
+ // Get the known-zero masks for each operand.
+ SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) };
+ APInt KnownZero[2], KnownOne[2];
+ DAG.ComputeMaskedBits(Ops[0], KnownZero[0], KnownOne[0]);
+ DAG.ComputeMaskedBits(Ops[1], KnownZero[1], KnownOne[1]);
+
+ // See if the upper 32 bits of one operand and the lower 32 bits of the
+ // other are known zero. They are the low and high operands respectively.
+ uint64_t Masks[] = { KnownZero[0].getZExtValue(),
+ KnownZero[1].getZExtValue() };
+ unsigned High, Low;
+ if ((Masks[0] >> 32) == 0xffffffff && uint32_t(Masks[1]) == 0xffffffff)
+ High = 1, Low = 0;
+ else if ((Masks[1] >> 32) == 0xffffffff && uint32_t(Masks[0]) == 0xffffffff)
+ High = 0, Low = 1;
+ else
+ return Op;
+
+ SDValue LowOp = Ops[Low];
+ SDValue HighOp = Ops[High];
+
+ // If the high part is a constant, we're better off using IILH.
+ if (HighOp.getOpcode() == ISD::Constant)
+ return Op;
+
+ // If the low part is a constant that is outside the range of LHI,
+ // then we're better off using IILF.
+ if (LowOp.getOpcode() == ISD::Constant) {
+ int64_t Value = int32_t(cast<ConstantSDNode>(LowOp)->getZExtValue());
+ if (!isInt<16>(Value))
+ return Op;
+ }
+
+ // Check whether the high part is an AND that doesn't change the
+ // high 32 bits and just masks out low bits. We can skip it if so.
+ if (HighOp.getOpcode() == ISD::AND &&
+ HighOp.getOperand(1).getOpcode() == ISD::Constant) {
+ ConstantSDNode *MaskNode = cast<ConstantSDNode>(HighOp.getOperand(1));
+ uint64_t Mask = MaskNode->getZExtValue() | Masks[High];
+ if ((Mask >> 32) == 0xffffffff)
+ HighOp = HighOp.getOperand(0);
+ }
+
+ // Take advantage of the fact that all GR32 operations only change the
+ // low 32 bits by truncating Low to an i32 and inserting it directly
+ // using a subreg. The interesting cases are those where the truncation
+ // can be folded.
+ SDLoc DL(Op);
+ SDValue Low32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, LowOp);
+ SDValue SubReg32 = DAG.getTargetConstant(SystemZ::subreg_32bit, MVT::i64);
+ SDNode *Result = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
+ MVT::i64, HighOp, Low32, SubReg32);
+ return SDValue(Result, 0);
+}
+
+// Op is an 8-, 16- or 32-bit ATOMIC_LOAD_* operation.  Lower the 8- and
+// 16-bit forms into the fullword ATOMIC_LOADW_* operation given by Opcode.
+SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
+ SelectionDAG &DAG,
+ unsigned Opcode) const {
+ AtomicSDNode *Node = cast<AtomicSDNode>(Op.getNode());
+
+ // 32-bit operations need no code outside the main loop.
+ EVT NarrowVT = Node->getMemoryVT();
+ EVT WideVT = MVT::i32;
+ if (NarrowVT == WideVT)
+ return Op;
+
+ int64_t BitSize = NarrowVT.getSizeInBits();
+ SDValue ChainIn = Node->getChain();
+ SDValue Addr = Node->getBasePtr();
+ SDValue Src2 = Node->getVal();
+ MachineMemOperand *MMO = Node->getMemOperand();
+ SDLoc DL(Node);
+ EVT PtrVT = Addr.getValueType();
+
+ // Convert atomic subtracts of constants into additions.
+ if (Opcode == SystemZISD::ATOMIC_LOADW_SUB)
+ if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Src2)) {
+ Opcode = SystemZISD::ATOMIC_LOADW_ADD;
+ Src2 = DAG.getConstant(-Const->getSExtValue(), Src2.getValueType());
+ }
+
+ // Get the address of the containing word.
+ SDValue AlignedAddr = DAG.getNode(ISD::AND, DL, PtrVT, Addr,
+ DAG.getConstant(-4, PtrVT));
+
+ // Get the number of bits that the word must be rotated left in order
+ // to bring the field to the top bits of a GR32.
+ SDValue BitShift = DAG.getNode(ISD::SHL, DL, PtrVT, Addr,
+ DAG.getConstant(3, PtrVT));
+ BitShift = DAG.getNode(ISD::TRUNCATE, DL, WideVT, BitShift);
+
+ // Get the complementing shift amount, for rotating a field in the top
+ // bits back to its proper position.
+ SDValue NegBitShift = DAG.getNode(ISD::SUB, DL, WideVT,
+ DAG.getConstant(0, WideVT), BitShift);
+
+ // Extend the source operand to 32 bits and prepare it for the inner loop.
+ // ATOMIC_SWAPW uses RISBG to rotate the field left, but all other
+ // operations require the source to be shifted in advance. (This shift
+ // can be folded if the source is constant.) For AND and NAND, the lower
+ // bits must be set, while for other opcodes they should be left clear.
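+  // For example, with an 8-bit field the source is shifted left by 24 so
+  // that it occupies the top byte, and for AND/NAND the constant
+  // uint32_t(-1) >> 8 == 0x00ffffff is ORed in so that the low bits of the
+  // containing word are left unchanged by the operation.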
+ if (Opcode != SystemZISD::ATOMIC_SWAPW)
+ Src2 = DAG.getNode(ISD::SHL, DL, WideVT, Src2,
+ DAG.getConstant(32 - BitSize, WideVT));
+ if (Opcode == SystemZISD::ATOMIC_LOADW_AND ||
+ Opcode == SystemZISD::ATOMIC_LOADW_NAND)
+ Src2 = DAG.getNode(ISD::OR, DL, WideVT, Src2,
+ DAG.getConstant(uint32_t(-1) >> BitSize, WideVT));
+
+ // Construct the ATOMIC_LOADW_* node.
+ SDVTList VTList = DAG.getVTList(WideVT, MVT::Other);
+ SDValue Ops[] = { ChainIn, AlignedAddr, Src2, BitShift, NegBitShift,
+ DAG.getConstant(BitSize, WideVT) };
+ SDValue AtomicOp = DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops,
+ array_lengthof(Ops),
+ NarrowVT, MMO);
+
+ // Rotate the result of the final CS so that the field is in the lower
+ // bits of a GR32, then truncate it.
+ SDValue ResultShift = DAG.getNode(ISD::ADD, DL, WideVT, BitShift,
+ DAG.getConstant(BitSize, WideVT));
+ SDValue Result = DAG.getNode(ISD::ROTL, DL, WideVT, AtomicOp, ResultShift);
+
+ SDValue RetOps[2] = { Result, AtomicOp.getValue(1) };
+ return DAG.getMergeValues(RetOps, 2, DL);
+}
+
+// Op is an 8-, 16- or 32-bit ATOMIC_CMP_SWAP operation.  Lower the 8- and
+// 16-bit forms into a fullword ATOMIC_CMP_SWAPW operation.
+SDValue SystemZTargetLowering::lowerATOMIC_CMP_SWAP(SDValue Op,
+ SelectionDAG &DAG) const {
+ AtomicSDNode *Node = cast<AtomicSDNode>(Op.getNode());
+
+ // We have native support for 32-bit compare and swap.
+ EVT NarrowVT = Node->getMemoryVT();
+ EVT WideVT = MVT::i32;
+ if (NarrowVT == WideVT)
+ return Op;
+
+ int64_t BitSize = NarrowVT.getSizeInBits();
+ SDValue ChainIn = Node->getOperand(0);
+ SDValue Addr = Node->getOperand(1);
+ SDValue CmpVal = Node->getOperand(2);
+ SDValue SwapVal = Node->getOperand(3);
+ MachineMemOperand *MMO = Node->getMemOperand();
+ SDLoc DL(Node);
+ EVT PtrVT = Addr.getValueType();
+
+ // Get the address of the containing word.
+ SDValue AlignedAddr = DAG.getNode(ISD::AND, DL, PtrVT, Addr,
+ DAG.getConstant(-4, PtrVT));
+
+ // Get the number of bits that the word must be rotated left in order
+ // to bring the field to the top bits of a GR32.
+ SDValue BitShift = DAG.getNode(ISD::SHL, DL, PtrVT, Addr,
+ DAG.getConstant(3, PtrVT));
+ BitShift = DAG.getNode(ISD::TRUNCATE, DL, WideVT, BitShift);
+
+ // Get the complementing shift amount, for rotating a field in the top
+ // bits back to its proper position.
+ SDValue NegBitShift = DAG.getNode(ISD::SUB, DL, WideVT,
+ DAG.getConstant(0, WideVT), BitShift);
+
+ // Construct the ATOMIC_CMP_SWAPW node.
+ SDVTList VTList = DAG.getVTList(WideVT, MVT::Other);
+ SDValue Ops[] = { ChainIn, AlignedAddr, CmpVal, SwapVal, BitShift,
+ NegBitShift, DAG.getConstant(BitSize, WideVT) };
+ SDValue AtomicOp = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_CMP_SWAPW, DL,
+ VTList, Ops, array_lengthof(Ops),
+ NarrowVT, MMO);
+ return AtomicOp;
+}
+
+SDValue SystemZTargetLowering::lowerSTACKSAVE(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MF.getInfo<SystemZMachineFunctionInfo>()->setManipulatesSP(true);
+ return DAG.getCopyFromReg(Op.getOperand(0), SDLoc(Op),
+ SystemZ::R15D, Op.getValueType());
+}
+
+SDValue SystemZTargetLowering::lowerSTACKRESTORE(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MF.getInfo<SystemZMachineFunctionInfo>()->setManipulatesSP(true);
+ return DAG.getCopyToReg(Op.getOperand(0), SDLoc(Op),
+ SystemZ::R15D, Op.getOperand(1));
+}
+
+SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
+ SelectionDAG &DAG) const {
+ switch (Op.getOpcode()) {
+ case ISD::BR_CC:
+ return lowerBR_CC(Op, DAG);
+ case ISD::SELECT_CC:
+ return lowerSELECT_CC(Op, DAG);
+ case ISD::GlobalAddress:
+ return lowerGlobalAddress(cast<GlobalAddressSDNode>(Op), DAG);
+ case ISD::GlobalTLSAddress:
+ return lowerGlobalTLSAddress(cast<GlobalAddressSDNode>(Op), DAG);
+ case ISD::BlockAddress:
+ return lowerBlockAddress(cast<BlockAddressSDNode>(Op), DAG);
+ case ISD::JumpTable:
+ return lowerJumpTable(cast<JumpTableSDNode>(Op), DAG);
+ case ISD::ConstantPool:
+ return lowerConstantPool(cast<ConstantPoolSDNode>(Op), DAG);
+ case ISD::BITCAST:
+ return lowerBITCAST(Op, DAG);
+ case ISD::VASTART:
+ return lowerVASTART(Op, DAG);
+ case ISD::VACOPY:
+ return lowerVACOPY(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC:
+ return lowerDYNAMIC_STACKALLOC(Op, DAG);
+ case ISD::UMUL_LOHI:
+ return lowerUMUL_LOHI(Op, DAG);
+ case ISD::SDIVREM:
+ return lowerSDIVREM(Op, DAG);
+ case ISD::UDIVREM:
+ return lowerUDIVREM(Op, DAG);
+ case ISD::OR:
+ return lowerOR(Op, DAG);
+ case ISD::ATOMIC_SWAP:
+ return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_SWAPW);
+ case ISD::ATOMIC_LOAD_ADD:
+ return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_ADD);
+ case ISD::ATOMIC_LOAD_SUB:
+ return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_SUB);
+ case ISD::ATOMIC_LOAD_AND:
+ return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_AND);
+ case ISD::ATOMIC_LOAD_OR:
+ return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_OR);
+ case ISD::ATOMIC_LOAD_XOR:
+ return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_XOR);
+ case ISD::ATOMIC_LOAD_NAND:
+ return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_NAND);
+ case ISD::ATOMIC_LOAD_MIN:
+ return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_MIN);
+ case ISD::ATOMIC_LOAD_MAX:
+ return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_MAX);
+ case ISD::ATOMIC_LOAD_UMIN:
+ return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_UMIN);
+ case ISD::ATOMIC_LOAD_UMAX:
+ return lowerATOMIC_LOAD(Op, DAG, SystemZISD::ATOMIC_LOADW_UMAX);
+ case ISD::ATOMIC_CMP_SWAP:
+ return lowerATOMIC_CMP_SWAP(Op, DAG);
+ case ISD::STACKSAVE:
+ return lowerSTACKSAVE(Op, DAG);
+ case ISD::STACKRESTORE:
+ return lowerSTACKRESTORE(Op, DAG);
+ default:
+ llvm_unreachable("Unexpected node to lower");
+ }
+}
+
+const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
+#define OPCODE(NAME) case SystemZISD::NAME: return "SystemZISD::" #NAME
+ switch (Opcode) {
+ OPCODE(RET_FLAG);
+ OPCODE(CALL);
+ OPCODE(PCREL_WRAPPER);
+ OPCODE(CMP);
+ OPCODE(UCMP);
+ OPCODE(BR_CCMASK);
+ OPCODE(SELECT_CCMASK);
+ OPCODE(ADJDYNALLOC);
+ OPCODE(EXTRACT_ACCESS);
+ OPCODE(UMUL_LOHI64);
+ OPCODE(SDIVREM64);
+ OPCODE(UDIVREM32);
+ OPCODE(UDIVREM64);
+ OPCODE(ATOMIC_SWAPW);
+ OPCODE(ATOMIC_LOADW_ADD);
+ OPCODE(ATOMIC_LOADW_SUB);
+ OPCODE(ATOMIC_LOADW_AND);
+ OPCODE(ATOMIC_LOADW_OR);
+ OPCODE(ATOMIC_LOADW_XOR);
+ OPCODE(ATOMIC_LOADW_NAND);
+ OPCODE(ATOMIC_LOADW_MIN);
+ OPCODE(ATOMIC_LOADW_MAX);
+ OPCODE(ATOMIC_LOADW_UMIN);
+ OPCODE(ATOMIC_LOADW_UMAX);
+ OPCODE(ATOMIC_CMP_SWAPW);
+ }
+ return NULL;
+#undef OPCODE
+}
+
+//===----------------------------------------------------------------------===//
+// Custom insertion
+//===----------------------------------------------------------------------===//
+
+// Create a new basic block after MBB.
+static MachineBasicBlock *emitBlockAfter(MachineBasicBlock *MBB) {
+ MachineFunction &MF = *MBB->getParent();
+ MachineBasicBlock *NewMBB = MF.CreateMachineBasicBlock(MBB->getBasicBlock());
+ MF.insert(llvm::next(MachineFunction::iterator(MBB)), NewMBB);
+ return NewMBB;
+}
+
+// Split MBB after MI and return the new block (the one that contains
+// instructions after MI).
+static MachineBasicBlock *splitBlockAfter(MachineInstr *MI,
+ MachineBasicBlock *MBB) {
+ MachineBasicBlock *NewMBB = emitBlockAfter(MBB);
+ NewMBB->splice(NewMBB->begin(), MBB,
+ llvm::next(MachineBasicBlock::iterator(MI)),
+ MBB->end());
+ NewMBB->transferSuccessorsAndUpdatePHIs(MBB);
+ return NewMBB;
+}
+
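+// Look backwards from MBBI (skipping any debug values) for the compare
+// instruction that produced the CC value being branched on.  If the target
+// has a fused compare-and-branch form of that compare, replace the compare
+// with a compare-and-branch to Target under CCMask and return true;
+// otherwise return false and leave the block unchanged.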
+bool SystemZTargetLowering::
+convertPrevCompareToBranch(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned CCMask, MachineBasicBlock *Target) const {
+ MachineBasicBlock::iterator Compare = MBBI;
+ MachineBasicBlock::iterator Begin = MBB->begin();
+  do {
+    if (Compare == Begin)
+      return false;
+    --Compare;
+  } while (Compare->isDebugValue());
+
+ const SystemZInstrInfo *TII = TM.getInstrInfo();
+ unsigned FusedOpcode = TII->getCompareAndBranch(Compare->getOpcode(),
+ Compare);
+ if (!FusedOpcode)
+ return false;
+
+ DebugLoc DL = Compare->getDebugLoc();
+ BuildMI(*MBB, MBBI, DL, TII->get(FusedOpcode))
+ .addOperand(Compare->getOperand(0)).addOperand(Compare->getOperand(1))
+ .addImm(CCMask).addMBB(Target);
+ Compare->removeFromParent();
+ return true;
+}
+
+// Implement EmitInstrWithCustomInserter for pseudo Select* instruction MI.
+MachineBasicBlock *
+SystemZTargetLowering::emitSelect(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
+ const SystemZInstrInfo *TII = TM.getInstrInfo();
+
+ unsigned DestReg = MI->getOperand(0).getReg();
+ unsigned TrueReg = MI->getOperand(1).getReg();
+ unsigned FalseReg = MI->getOperand(2).getReg();
+ unsigned CCMask = MI->getOperand(3).getImm();
+ DebugLoc DL = MI->getDebugLoc();
+
+ MachineBasicBlock *StartMBB = MBB;
+ MachineBasicBlock *JoinMBB = splitBlockAfter(MI, MBB);
+ MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB);
+
+ // StartMBB:
+ // BRC CCMask, JoinMBB
+ // # fallthrough to FalseMBB
+ //
+ // The original DAG glues comparisons to their uses, both to ensure
+ // that no CC-clobbering instructions are inserted between them, and
+ // to ensure that comparison results are not reused. This means that
+ // this Select is the sole user of any preceding comparison instruction
+ // and that we can try to use a fused compare and branch instead.
+ MBB = StartMBB;
+ if (!convertPrevCompareToBranch(MBB, MI, CCMask, JoinMBB))
+ BuildMI(MBB, DL, TII->get(SystemZ::BRC)).addImm(CCMask).addMBB(JoinMBB);
+ MBB->addSuccessor(JoinMBB);
+ MBB->addSuccessor(FalseMBB);
+
+ // FalseMBB:
+ // # fallthrough to JoinMBB
+ MBB = FalseMBB;
+ MBB->addSuccessor(JoinMBB);
+
+ // JoinMBB:
+ // %Result = phi [ %FalseReg, FalseMBB ], [ %TrueReg, StartMBB ]
+ // ...
+ MBB = JoinMBB;
+ BuildMI(*MBB, MBB->begin(), DL, TII->get(SystemZ::PHI), DestReg)
+ .addReg(TrueReg).addMBB(StartMBB)
+ .addReg(FalseReg).addMBB(FalseMBB);
+
+ MI->eraseFromParent();
+ return JoinMBB;
+}
+
+// Implement EmitInstrWithCustomInserter for pseudo ATOMIC_LOAD{,W}_*
+// or ATOMIC_SWAP{,W} instruction MI. BinOpcode is the instruction that
+// performs the binary operation elided by "*", or 0 for ATOMIC_SWAP{,W}.
+// BitSize is the width of the field in bits, or 0 if this is a partword
+// ATOMIC_LOADW_* or ATOMIC_SWAPW instruction, in which case the bitsize
+// is one of the operands. Invert says whether the field should be
+// inverted after performing BinOpcode (e.g. for NAND).
+MachineBasicBlock *
+SystemZTargetLowering::emitAtomicLoadBinary(MachineInstr *MI,
+ MachineBasicBlock *MBB,
+ unsigned BinOpcode,
+ unsigned BitSize,
+ bool Invert) const {
+ const SystemZInstrInfo *TII = TM.getInstrInfo();
+ MachineFunction &MF = *MBB->getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ unsigned MaskNE = CCMaskForCondCode(ISD::SETNE);
+ bool IsSubWord = (BitSize < 32);
+
+ // Extract the operands. Base can be a register or a frame index.
+ // Src2 can be a register or immediate.
+ unsigned Dest = MI->getOperand(0).getReg();
+ MachineOperand Base = earlyUseOperand(MI->getOperand(1));
+ int64_t Disp = MI->getOperand(2).getImm();
+ MachineOperand Src2 = earlyUseOperand(MI->getOperand(3));
+ unsigned BitShift = (IsSubWord ? MI->getOperand(4).getReg() : 0);
+ unsigned NegBitShift = (IsSubWord ? MI->getOperand(5).getReg() : 0);
+ DebugLoc DL = MI->getDebugLoc();
+ if (IsSubWord)
+ BitSize = MI->getOperand(6).getImm();
+
+ // Subword operations use 32-bit registers.
+ const TargetRegisterClass *RC = (BitSize <= 32 ?
+ &SystemZ::GR32BitRegClass :
+ &SystemZ::GR64BitRegClass);
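+  // L/LG loads the original value; CS/CSG performs the compare-and-swap
+  // that retries the update (see the conditional branch back to LoopMBB
+  // below).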
+ unsigned LOpcode = BitSize <= 32 ? SystemZ::L : SystemZ::LG;
+ unsigned CSOpcode = BitSize <= 32 ? SystemZ::CS : SystemZ::CSG;
+
+ // Get the right opcodes for the displacement.
+ LOpcode = TII->getOpcodeForOffset(LOpcode, Disp);
+ CSOpcode = TII->getOpcodeForOffset(CSOpcode, Disp);
+ assert(LOpcode && CSOpcode && "Displacement out of range");
+
+ // Create virtual registers for temporary results.
+ unsigned OrigVal = MRI.createVirtualRegister(RC);
+ unsigned OldVal = MRI.createVirtualRegister(RC);
+ unsigned NewVal = (BinOpcode || IsSubWord ?
+ MRI.createVirtualRegister(RC) : Src2.getReg());
+ unsigned RotatedOldVal = (IsSubWord ? MRI.createVirtualRegister(RC) : OldVal);
+ unsigned RotatedNewVal = (IsSubWord ? MRI.createVirtualRegister(RC) : NewVal);
+
+ // Insert a basic block for the main loop.
+ MachineBasicBlock *StartMBB = MBB;
+ MachineBasicBlock *DoneMBB = splitBlockAfter(MI, MBB);
+ MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
+
+ // StartMBB:
+ // ...
+ // %OrigVal = L Disp(%Base)
+  //   # fall through to LoopMBB
+ MBB = StartMBB;
+ BuildMI(MBB, DL, TII->get(LOpcode), OrigVal)
+ .addOperand(Base).addImm(Disp).addReg(0);
+ MBB->addSuccessor(LoopMBB);
+
+ // LoopMBB:
+ // %OldVal = phi [ %OrigVal, StartMBB ], [ %Dest, LoopMBB ]
+ // %RotatedOldVal = RLL %OldVal, 0(%BitShift)
+ // %RotatedNewVal = OP %RotatedOldVal, %Src2
+ // %NewVal = RLL %RotatedNewVal, 0(%NegBitShift)
+ // %Dest = CS %OldVal, %NewVal, Disp(%Base)
+ // JNE LoopMBB
+  //   # fall through to DoneMBB
+ MBB = LoopMBB;
+ BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal)
+ .addReg(OrigVal).addMBB(StartMBB)
+ .addReg(Dest).addMBB(LoopMBB);
+ if (IsSubWord)
+ BuildMI(MBB, DL, TII->get(SystemZ::RLL), RotatedOldVal)
+ .addReg(OldVal).addReg(BitShift).addImm(0);
+ if (Invert) {
+ // Perform the operation normally and then invert every bit of the field.
+ unsigned Tmp = MRI.createVirtualRegister(RC);
+ BuildMI(MBB, DL, TII->get(BinOpcode), Tmp)
+ .addReg(RotatedOldVal).addOperand(Src2);
+ if (BitSize < 32)
+ // XILF with the upper BitSize bits set.
+ BuildMI(MBB, DL, TII->get(SystemZ::XILF32), RotatedNewVal)
+ .addReg(Tmp).addImm(uint32_t(~0 << (32 - BitSize)));
+ else if (BitSize == 32)
+ // XILF with every bit set.
+ BuildMI(MBB, DL, TII->get(SystemZ::XILF32), RotatedNewVal)
+ .addReg(Tmp).addImm(~uint32_t(0));
+ else {
+ // Use LCGR and add -1 to the result, which is more compact than
+ // an XILF, XILH pair.
+ unsigned Tmp2 = MRI.createVirtualRegister(RC);
+ BuildMI(MBB, DL, TII->get(SystemZ::LCGR), Tmp2).addReg(Tmp);
+ BuildMI(MBB, DL, TII->get(SystemZ::AGHI), RotatedNewVal)
+ .addReg(Tmp2).addImm(-1);
+ }
+ } else if (BinOpcode)
+    // A simple binary operation.
+ BuildMI(MBB, DL, TII->get(BinOpcode), RotatedNewVal)
+ .addReg(RotatedOldVal).addOperand(Src2);
+ else if (IsSubWord)
+ // Use RISBG to rotate Src2 into position and use it to replace the
+ // field in RotatedOldVal.
+ BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RotatedNewVal)
+ .addReg(RotatedOldVal).addReg(Src2.getReg())
+ .addImm(32).addImm(31 + BitSize).addImm(32 - BitSize);
+ if (IsSubWord)
+ BuildMI(MBB, DL, TII->get(SystemZ::RLL), NewVal)
+ .addReg(RotatedNewVal).addReg(NegBitShift).addImm(0);
+ BuildMI(MBB, DL, TII->get(CSOpcode), Dest)
+ .addReg(OldVal).addReg(NewVal).addOperand(Base).addImm(Disp);
+ BuildMI(MBB, DL, TII->get(SystemZ::BRC)).addImm(MaskNE).addMBB(LoopMBB);
+ MBB->addSuccessor(LoopMBB);
+ MBB->addSuccessor(DoneMBB);
+
+ MI->eraseFromParent();
+ return DoneMBB;
+}
+
+// Implement EmitInstrWithCustomInserter for pseudo
+// ATOMIC_LOAD{,W}_{,U}{MIN,MAX} instruction MI. CompareOpcode is the
+// instruction that should be used to compare the current field with the
+// minimum or maximum value. KeepOldMask is the BRC condition-code mask
+// for when the current field should be kept. BitSize is the width of
+// the field in bits, or 0 if this is a partword ATOMIC_LOADW_* instruction.
+MachineBasicBlock *
+SystemZTargetLowering::emitAtomicLoadMinMax(MachineInstr *MI,
+ MachineBasicBlock *MBB,
+ unsigned CompareOpcode,
+ unsigned KeepOldMask,
+ unsigned BitSize) const {
+ const SystemZInstrInfo *TII = TM.getInstrInfo();
+ MachineFunction &MF = *MBB->getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ unsigned MaskNE = CCMaskForCondCode(ISD::SETNE);
+ bool IsSubWord = (BitSize < 32);
+
+ // Extract the operands. Base can be a register or a frame index.
+ unsigned Dest = MI->getOperand(0).getReg();
+ MachineOperand Base = earlyUseOperand(MI->getOperand(1));
+ int64_t Disp = MI->getOperand(2).getImm();
+ unsigned Src2 = MI->getOperand(3).getReg();
+ unsigned BitShift = (IsSubWord ? MI->getOperand(4).getReg() : 0);
+ unsigned NegBitShift = (IsSubWord ? MI->getOperand(5).getReg() : 0);
+ DebugLoc DL = MI->getDebugLoc();
+ if (IsSubWord)
+ BitSize = MI->getOperand(6).getImm();
+
+ // Subword operations use 32-bit registers.
+ const TargetRegisterClass *RC = (BitSize <= 32 ?
+ &SystemZ::GR32BitRegClass :
+ &SystemZ::GR64BitRegClass);
+ unsigned LOpcode = BitSize <= 32 ? SystemZ::L : SystemZ::LG;
+ unsigned CSOpcode = BitSize <= 32 ? SystemZ::CS : SystemZ::CSG;
+
+ // Get the right opcodes for the displacement.
+ LOpcode = TII->getOpcodeForOffset(LOpcode, Disp);
+ CSOpcode = TII->getOpcodeForOffset(CSOpcode, Disp);
+ assert(LOpcode && CSOpcode && "Displacement out of range");
+
+ // Create virtual registers for temporary results.
+ unsigned OrigVal = MRI.createVirtualRegister(RC);
+ unsigned OldVal = MRI.createVirtualRegister(RC);
+ unsigned NewVal = MRI.createVirtualRegister(RC);
+ unsigned RotatedOldVal = (IsSubWord ? MRI.createVirtualRegister(RC) : OldVal);
+ unsigned RotatedAltVal = (IsSubWord ? MRI.createVirtualRegister(RC) : Src2);
+ unsigned RotatedNewVal = (IsSubWord ? MRI.createVirtualRegister(RC) : NewVal);
+
+ // Insert 3 basic blocks for the loop.
+ MachineBasicBlock *StartMBB = MBB;
+ MachineBasicBlock *DoneMBB = splitBlockAfter(MI, MBB);
+ MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
+ MachineBasicBlock *UseAltMBB = emitBlockAfter(LoopMBB);
+ MachineBasicBlock *UpdateMBB = emitBlockAfter(UseAltMBB);
+
+ // StartMBB:
+ // ...
+ // %OrigVal = L Disp(%Base)
+  //   # fall through to LoopMBB
+ MBB = StartMBB;
+ BuildMI(MBB, DL, TII->get(LOpcode), OrigVal)
+ .addOperand(Base).addImm(Disp).addReg(0);
+ MBB->addSuccessor(LoopMBB);
+
+ // LoopMBB:
+ // %OldVal = phi [ %OrigVal, StartMBB ], [ %Dest, UpdateMBB ]
+ // %RotatedOldVal = RLL %OldVal, 0(%BitShift)
+ // CompareOpcode %RotatedOldVal, %Src2
+ // BRC KeepOldMask, UpdateMBB
+ MBB = LoopMBB;
+ BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal)
+ .addReg(OrigVal).addMBB(StartMBB)
+ .addReg(Dest).addMBB(UpdateMBB);
+ if (IsSubWord)
+ BuildMI(MBB, DL, TII->get(SystemZ::RLL), RotatedOldVal)
+ .addReg(OldVal).addReg(BitShift).addImm(0);
+ unsigned FusedOpcode = TII->getCompareAndBranch(CompareOpcode);
+ if (FusedOpcode)
+ BuildMI(MBB, DL, TII->get(FusedOpcode))
+ .addReg(RotatedOldVal).addReg(Src2)
+ .addImm(KeepOldMask).addMBB(UpdateMBB);
+ else {
+ BuildMI(MBB, DL, TII->get(CompareOpcode))
+ .addReg(RotatedOldVal).addReg(Src2);
+ BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+ .addImm(KeepOldMask).addMBB(UpdateMBB);
+ }
+ MBB->addSuccessor(UpdateMBB);
+ MBB->addSuccessor(UseAltMBB);
+
+ // UseAltMBB:
+ // %RotatedAltVal = RISBG %RotatedOldVal, %Src2, 32, 31 + BitSize, 0
+  //   # fall through to UpdateMBB
+ MBB = UseAltMBB;
+ if (IsSubWord)
+ BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RotatedAltVal)
+ .addReg(RotatedOldVal).addReg(Src2)
+ .addImm(32).addImm(31 + BitSize).addImm(0);
+ MBB->addSuccessor(UpdateMBB);
+
+ // UpdateMBB:
+ // %RotatedNewVal = PHI [ %RotatedOldVal, LoopMBB ],
+ // [ %RotatedAltVal, UseAltMBB ]
+ // %NewVal = RLL %RotatedNewVal, 0(%NegBitShift)
+ // %Dest = CS %OldVal, %NewVal, Disp(%Base)
+ // JNE LoopMBB
+  //   # fall through to DoneMBB
+ MBB = UpdateMBB;
+ BuildMI(MBB, DL, TII->get(SystemZ::PHI), RotatedNewVal)
+ .addReg(RotatedOldVal).addMBB(LoopMBB)
+ .addReg(RotatedAltVal).addMBB(UseAltMBB);
+ if (IsSubWord)
+ BuildMI(MBB, DL, TII->get(SystemZ::RLL), NewVal)
+ .addReg(RotatedNewVal).addReg(NegBitShift).addImm(0);
+ BuildMI(MBB, DL, TII->get(CSOpcode), Dest)
+ .addReg(OldVal).addReg(NewVal).addOperand(Base).addImm(Disp);
+ BuildMI(MBB, DL, TII->get(SystemZ::BRC)).addImm(MaskNE).addMBB(LoopMBB);
+ MBB->addSuccessor(LoopMBB);
+ MBB->addSuccessor(DoneMBB);
+
+ MI->eraseFromParent();
+ return DoneMBB;
+}
+
+// Implement EmitInstrWithCustomInserter for pseudo ATOMIC_CMP_SWAPW
+// instruction MI.
+MachineBasicBlock *
+SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
+ const SystemZInstrInfo *TII = TM.getInstrInfo();
+ MachineFunction &MF = *MBB->getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ unsigned MaskNE = CCMaskForCondCode(ISD::SETNE);
+
+ // Extract the operands. Base can be a register or a frame index.
+ unsigned Dest = MI->getOperand(0).getReg();
+ MachineOperand Base = earlyUseOperand(MI->getOperand(1));
+ int64_t Disp = MI->getOperand(2).getImm();
+ unsigned OrigCmpVal = MI->getOperand(3).getReg();
+ unsigned OrigSwapVal = MI->getOperand(4).getReg();
+ unsigned BitShift = MI->getOperand(5).getReg();
+ unsigned NegBitShift = MI->getOperand(6).getReg();
+ int64_t BitSize = MI->getOperand(7).getImm();
+ DebugLoc DL = MI->getDebugLoc();
+
+ const TargetRegisterClass *RC = &SystemZ::GR32BitRegClass;
+
+ // Get the right opcodes for the displacement.
+ unsigned LOpcode = TII->getOpcodeForOffset(SystemZ::L, Disp);
+ unsigned CSOpcode = TII->getOpcodeForOffset(SystemZ::CS, Disp);
+ assert(LOpcode && CSOpcode && "Displacement out of range");
+
+ // Create virtual registers for temporary results.
+ unsigned OrigOldVal = MRI.createVirtualRegister(RC);
+ unsigned OldVal = MRI.createVirtualRegister(RC);
+ unsigned CmpVal = MRI.createVirtualRegister(RC);
+ unsigned SwapVal = MRI.createVirtualRegister(RC);
+ unsigned StoreVal = MRI.createVirtualRegister(RC);
+ unsigned RetryOldVal = MRI.createVirtualRegister(RC);
+ unsigned RetryCmpVal = MRI.createVirtualRegister(RC);
+ unsigned RetrySwapVal = MRI.createVirtualRegister(RC);
+
+ // Insert 2 basic blocks for the loop.
+ MachineBasicBlock *StartMBB = MBB;
+ MachineBasicBlock *DoneMBB = splitBlockAfter(MI, MBB);
+ MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
+ MachineBasicBlock *SetMBB = emitBlockAfter(LoopMBB);
+
+ // StartMBB:
+ // ...
+ // %OrigOldVal = L Disp(%Base)
+  //   # fall through to LoopMBB
+ MBB = StartMBB;
+ BuildMI(MBB, DL, TII->get(LOpcode), OrigOldVal)
+ .addOperand(Base).addImm(Disp).addReg(0);
+ MBB->addSuccessor(LoopMBB);
+
+ // LoopMBB:
+  //   %OldVal = phi [ %OrigOldVal, StartMBB ], [ %RetryOldVal, SetMBB ]
+  //   %CmpVal = phi [ %OrigCmpVal, StartMBB ], [ %RetryCmpVal, SetMBB ]
+  //   %SwapVal = phi [ %OrigSwapVal, StartMBB ], [ %RetrySwapVal, SetMBB ]
+ // %Dest = RLL %OldVal, BitSize(%BitShift)
+ // ^^ The low BitSize bits contain the field
+ // of interest.
+ // %RetryCmpVal = RISBG32 %CmpVal, %Dest, 32, 63-BitSize, 0
+ // ^^ Replace the upper 32-BitSize bits of the
+ // comparison value with those that we loaded,
+ // so that we can use a full word comparison.
+ // CRJNE %Dest, %RetryCmpVal, DoneMBB
+ // # Fall through to SetMBB
+ MBB = LoopMBB;
+ BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal)
+ .addReg(OrigOldVal).addMBB(StartMBB)
+ .addReg(RetryOldVal).addMBB(SetMBB);
+ BuildMI(MBB, DL, TII->get(SystemZ::PHI), CmpVal)
+ .addReg(OrigCmpVal).addMBB(StartMBB)
+ .addReg(RetryCmpVal).addMBB(SetMBB);
+ BuildMI(MBB, DL, TII->get(SystemZ::PHI), SwapVal)
+ .addReg(OrigSwapVal).addMBB(StartMBB)
+ .addReg(RetrySwapVal).addMBB(SetMBB);
+ BuildMI(MBB, DL, TII->get(SystemZ::RLL), Dest)
+ .addReg(OldVal).addReg(BitShift).addImm(BitSize);
+ BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RetryCmpVal)
+ .addReg(CmpVal).addReg(Dest).addImm(32).addImm(63 - BitSize).addImm(0);
+ BuildMI(MBB, DL, TII->get(SystemZ::CRJ))
+ .addReg(Dest).addReg(RetryCmpVal)
+ .addImm(MaskNE).addMBB(DoneMBB);
+ MBB->addSuccessor(DoneMBB);
+ MBB->addSuccessor(SetMBB);
+
+ // SetMBB:
+ // %RetrySwapVal = RISBG32 %SwapVal, %Dest, 32, 63-BitSize, 0
+ // ^^ Replace the upper 32-BitSize bits of the new
+ // value with those that we loaded.
+ // %StoreVal = RLL %RetrySwapVal, -BitSize(%NegBitShift)
+ // ^^ Rotate the new field to its proper position.
+ // %RetryOldVal = CS %Dest, %StoreVal, Disp(%Base)
+ // JNE LoopMBB
+  //   # fall through to DoneMBB
+ MBB = SetMBB;
+ BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RetrySwapVal)
+ .addReg(SwapVal).addReg(Dest).addImm(32).addImm(63 - BitSize).addImm(0);
+ BuildMI(MBB, DL, TII->get(SystemZ::RLL), StoreVal)
+ .addReg(RetrySwapVal).addReg(NegBitShift).addImm(-BitSize);
+ BuildMI(MBB, DL, TII->get(CSOpcode), RetryOldVal)
+ .addReg(OldVal).addReg(StoreVal).addOperand(Base).addImm(Disp);
+ BuildMI(MBB, DL, TII->get(SystemZ::BRC)).addImm(MaskNE).addMBB(LoopMBB);
+ MBB->addSuccessor(LoopMBB);
+ MBB->addSuccessor(DoneMBB);
+
+ MI->eraseFromParent();
+ return DoneMBB;
+}
+
+// Emit an extension from a GR32 or GR64 to a GR128. ClearEven is true
+// if the high register of the GR128 value must be cleared or false if
+// it's "don't care".  SubReg is subreg_low32 when extending a GR32
+// and subreg_low when extending a GR64.
+MachineBasicBlock *
+SystemZTargetLowering::emitExt128(MachineInstr *MI,
+ MachineBasicBlock *MBB,
+ bool ClearEven, unsigned SubReg) const {
+ const SystemZInstrInfo *TII = TM.getInstrInfo();
+ MachineFunction &MF = *MBB->getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ DebugLoc DL = MI->getDebugLoc();
+
+ unsigned Dest = MI->getOperand(0).getReg();
+ unsigned Src = MI->getOperand(1).getReg();
+ unsigned In128 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass);
+
+ BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::IMPLICIT_DEF), In128);
+ if (ClearEven) {
+ unsigned NewIn128 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass);
+ unsigned Zero64 = MRI.createVirtualRegister(&SystemZ::GR64BitRegClass);
+
+ BuildMI(*MBB, MI, DL, TII->get(SystemZ::LLILL), Zero64)
+ .addImm(0);
+ BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewIn128)
+ .addReg(In128).addReg(Zero64).addImm(SystemZ::subreg_high);
+ In128 = NewIn128;
+ }
+ BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dest)
+ .addReg(In128).addReg(Src).addImm(SubReg);
+
+ MI->eraseFromParent();
+ return MBB;
+}
+
+MachineBasicBlock *SystemZTargetLowering::
+EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const {
+ switch (MI->getOpcode()) {
+ case SystemZ::Select32:
+ case SystemZ::SelectF32:
+ case SystemZ::Select64:
+ case SystemZ::SelectF64:
+ case SystemZ::SelectF128:
+ return emitSelect(MI, MBB);
+
+ case SystemZ::AEXT128_64:
+ return emitExt128(MI, MBB, false, SystemZ::subreg_low);
+ case SystemZ::ZEXT128_32:
+ return emitExt128(MI, MBB, true, SystemZ::subreg_low32);
+ case SystemZ::ZEXT128_64:
+ return emitExt128(MI, MBB, true, SystemZ::subreg_low);
+
+ case SystemZ::ATOMIC_SWAPW:
+ return emitAtomicLoadBinary(MI, MBB, 0, 0);
+ case SystemZ::ATOMIC_SWAP_32:
+ return emitAtomicLoadBinary(MI, MBB, 0, 32);
+ case SystemZ::ATOMIC_SWAP_64:
+ return emitAtomicLoadBinary(MI, MBB, 0, 64);
+
+ case SystemZ::ATOMIC_LOADW_AR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::AR, 0);
+ case SystemZ::ATOMIC_LOADW_AFI:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::AFI, 0);
+ case SystemZ::ATOMIC_LOAD_AR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::AR, 32);
+ case SystemZ::ATOMIC_LOAD_AHI:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::AHI, 32);
+ case SystemZ::ATOMIC_LOAD_AFI:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::AFI, 32);
+ case SystemZ::ATOMIC_LOAD_AGR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::AGR, 64);
+ case SystemZ::ATOMIC_LOAD_AGHI:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::AGHI, 64);
+ case SystemZ::ATOMIC_LOAD_AGFI:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::AGFI, 64);
+
+ case SystemZ::ATOMIC_LOADW_SR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::SR, 0);
+ case SystemZ::ATOMIC_LOAD_SR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::SR, 32);
+ case SystemZ::ATOMIC_LOAD_SGR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::SGR, 64);
+
+ case SystemZ::ATOMIC_LOADW_NR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 0);
+ case SystemZ::ATOMIC_LOADW_NILH:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH32, 0);
+ case SystemZ::ATOMIC_LOAD_NR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 32);
+ case SystemZ::ATOMIC_LOAD_NILL32:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL32, 32);
+ case SystemZ::ATOMIC_LOAD_NILH32:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH32, 32);
+ case SystemZ::ATOMIC_LOAD_NILF32:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF32, 32);
+ case SystemZ::ATOMIC_LOAD_NGR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64);
+ case SystemZ::ATOMIC_LOAD_NILL:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 64);
+ case SystemZ::ATOMIC_LOAD_NILH:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 64);
+ case SystemZ::ATOMIC_LOAD_NIHL:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL, 64);
+ case SystemZ::ATOMIC_LOAD_NIHH:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH, 64);
+ case SystemZ::ATOMIC_LOAD_NILF:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 64);
+ case SystemZ::ATOMIC_LOAD_NIHF:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF, 64);
+
+ case SystemZ::ATOMIC_LOADW_OR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OR, 0);
+ case SystemZ::ATOMIC_LOADW_OILH:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH32, 0);
+ case SystemZ::ATOMIC_LOAD_OR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OR, 32);
+ case SystemZ::ATOMIC_LOAD_OILL32:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL32, 32);
+ case SystemZ::ATOMIC_LOAD_OILH32:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH32, 32);
+ case SystemZ::ATOMIC_LOAD_OILF32:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF32, 32);
+ case SystemZ::ATOMIC_LOAD_OGR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OGR, 64);
+ case SystemZ::ATOMIC_LOAD_OILL:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL, 64);
+ case SystemZ::ATOMIC_LOAD_OILH:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH, 64);
+ case SystemZ::ATOMIC_LOAD_OIHL:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHL, 64);
+ case SystemZ::ATOMIC_LOAD_OIHH:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHH, 64);
+ case SystemZ::ATOMIC_LOAD_OILF:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF, 64);
+ case SystemZ::ATOMIC_LOAD_OIHF:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHF, 64);
+
+ case SystemZ::ATOMIC_LOADW_XR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::XR, 0);
+ case SystemZ::ATOMIC_LOADW_XILF:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF32, 0);
+ case SystemZ::ATOMIC_LOAD_XR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::XR, 32);
+ case SystemZ::ATOMIC_LOAD_XILF32:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF32, 32);
+ case SystemZ::ATOMIC_LOAD_XGR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::XGR, 64);
+ case SystemZ::ATOMIC_LOAD_XILF:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF, 64);
+ case SystemZ::ATOMIC_LOAD_XIHF:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::XIHF, 64);
+
+ case SystemZ::ATOMIC_LOADW_NRi:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 0, true);
+ case SystemZ::ATOMIC_LOADW_NILHi:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH32, 0, true);
+ case SystemZ::ATOMIC_LOAD_NRi:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 32, true);
+ case SystemZ::ATOMIC_LOAD_NILL32i:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL32, 32, true);
+ case SystemZ::ATOMIC_LOAD_NILH32i:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH32, 32, true);
+ case SystemZ::ATOMIC_LOAD_NILF32i:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF32, 32, true);
+ case SystemZ::ATOMIC_LOAD_NGRi:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64, true);
+ case SystemZ::ATOMIC_LOAD_NILLi:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 64, true);
+ case SystemZ::ATOMIC_LOAD_NILHi:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 64, true);
+ case SystemZ::ATOMIC_LOAD_NIHLi:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL, 64, true);
+ case SystemZ::ATOMIC_LOAD_NIHHi:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH, 64, true);
+ case SystemZ::ATOMIC_LOAD_NILFi:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 64, true);
+ case SystemZ::ATOMIC_LOAD_NIHFi:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF, 64, true);
+
+ case SystemZ::ATOMIC_LOADW_MIN:
+ return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
+ SystemZ::CCMASK_CMP_LE, 0);
+ case SystemZ::ATOMIC_LOAD_MIN_32:
+ return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
+ SystemZ::CCMASK_CMP_LE, 32);
+ case SystemZ::ATOMIC_LOAD_MIN_64:
+ return emitAtomicLoadMinMax(MI, MBB, SystemZ::CGR,
+ SystemZ::CCMASK_CMP_LE, 64);
+
+ case SystemZ::ATOMIC_LOADW_MAX:
+ return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
+ SystemZ::CCMASK_CMP_GE, 0);
+ case SystemZ::ATOMIC_LOAD_MAX_32:
+ return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
+ SystemZ::CCMASK_CMP_GE, 32);
+ case SystemZ::ATOMIC_LOAD_MAX_64:
+ return emitAtomicLoadMinMax(MI, MBB, SystemZ::CGR,
+ SystemZ::CCMASK_CMP_GE, 64);
+
+ case SystemZ::ATOMIC_LOADW_UMIN:
+ return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
+ SystemZ::CCMASK_CMP_LE, 0);
+ case SystemZ::ATOMIC_LOAD_UMIN_32:
+ return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
+ SystemZ::CCMASK_CMP_LE, 32);
+ case SystemZ::ATOMIC_LOAD_UMIN_64:
+ return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLGR,
+ SystemZ::CCMASK_CMP_LE, 64);
+
+ case SystemZ::ATOMIC_LOADW_UMAX:
+ return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
+ SystemZ::CCMASK_CMP_GE, 0);
+ case SystemZ::ATOMIC_LOAD_UMAX_32:
+ return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
+ SystemZ::CCMASK_CMP_GE, 32);
+ case SystemZ::ATOMIC_LOAD_UMAX_64:
+ return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLGR,
+ SystemZ::CCMASK_CMP_GE, 64);
+
+ case SystemZ::ATOMIC_CMP_SWAPW:
+ return emitAtomicCmpSwapW(MI, MBB);
+ case SystemZ::BRC:
+ // The original DAG glues comparisons to their uses, both to ensure
+ // that no CC-clobbering instructions are inserted between them, and
+ // to ensure that comparison results are not reused. This means that
+ // a BRC is the sole user of a preceding comparison and that we can
+ // try to use a fused compare and branch instead.
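+    // For example, a CR (compare register) immediately before this BRC can
+    // be folded into a single CRJ compare-and-branch when the branch target
+    // is in range.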
+ if (convertPrevCompareToBranch(MBB, MI, MI->getOperand(0).getImm(),
+ MI->getOperand(1).getMBB()))
+ MI->eraseFromParent();
+ return MBB;
+ default:
+ llvm_unreachable("Unexpected instr type to insert");
+ }
+}
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
new file mode 100644
index 0000000..f17e9e4
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -0,0 +1,223 @@
+//===-- SystemZISelLowering.h - SystemZ DAG lowering interface --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that SystemZ uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_SystemZ_ISELLOWERING_H
+#define LLVM_TARGET_SystemZ_ISELLOWERING_H
+
+#include "SystemZ.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+namespace SystemZISD {
+ enum {
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+ // Return with a flag operand. Operand 0 is the chain operand.
+ RET_FLAG,
+
+ // Calls a function. Operand 0 is the chain operand and operand 1
+ // is the target address. The arguments start at operand 2.
+ // There is an optional glue operand at the end.
+ CALL,
+
+ // Wraps a TargetGlobalAddress that should be loaded using PC-relative
+ // accesses (LARL). Operand 0 is the address.
+ PCREL_WRAPPER,
+
+ // Signed integer and floating-point comparisons. The operands are the
+ // two values to compare.
+ CMP,
+
+ // Likewise unsigned integer comparison.
+ UCMP,
+
+ // Branches if a condition is true. Operand 0 is the chain operand;
+ // operand 1 is the 4-bit condition-code mask, with bit N in
+ // big-endian order meaning "branch if CC=N"; operand 2 is the
+ // target block and operand 3 is the flag operand.
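+    // For example, a mask of 8 (binary 1000) means "branch if CC=0" and a
+    // mask of 6 (binary 0110) means "branch if CC=1 or CC=2".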
+ BR_CCMASK,
+
+ // Selects between operand 0 and operand 1. Operand 2 is the
+ // mask of condition-code values for which operand 0 should be
+ // chosen over operand 1; it has the same form as BR_CCMASK.
+ // Operand 3 is the flag operand.
+ SELECT_CCMASK,
+
+ // Evaluates to the gap between the stack pointer and the
+ // base of the dynamically-allocatable area.
+ ADJDYNALLOC,
+
+ // Extracts the value of a 32-bit access register. Operand 0 is
+ // the number of the register.
+ EXTRACT_ACCESS,
+
+ // Wrappers around the ISD opcodes of the same name. The output and
+ // first input operands are GR128s. The trailing numbers are the
+ // widths of the second operand in bits.
+ UMUL_LOHI64,
+ SDIVREM64,
+ UDIVREM32,
+ UDIVREM64,
+
+ // Wrappers around the inner loop of an 8- or 16-bit ATOMIC_SWAP or
+ // ATOMIC_LOAD_<op>.
+ //
+ // Operand 0: the address of the containing 32-bit-aligned field
+ // Operand 1: the second operand of <op>, in the high bits of an i32
+ // for everything except ATOMIC_SWAPW
+ // Operand 2: how many bits to rotate the i32 left to bring the first
+ // operand into the high bits
+ // Operand 3: the negative of operand 2, for rotating the other way
+ // Operand 4: the width of the field in bits (8 or 16)
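+  //
+  // For example, for a halfword field in the low half of its aligned
+  // containing word, operand 2 would be 16 and operand 3 would be -16.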
+ ATOMIC_SWAPW = ISD::FIRST_TARGET_MEMORY_OPCODE,
+ ATOMIC_LOADW_ADD,
+ ATOMIC_LOADW_SUB,
+ ATOMIC_LOADW_AND,
+ ATOMIC_LOADW_OR,
+ ATOMIC_LOADW_XOR,
+ ATOMIC_LOADW_NAND,
+ ATOMIC_LOADW_MIN,
+ ATOMIC_LOADW_MAX,
+ ATOMIC_LOADW_UMIN,
+ ATOMIC_LOADW_UMAX,
+
+ // A wrapper around the inner loop of an ATOMIC_CMP_SWAP.
+ //
+ // Operand 0: the address of the containing 32-bit-aligned field
+ // Operand 1: the compare value, in the low bits of an i32
+ // Operand 2: the swap value, in the low bits of an i32
+ // Operand 3: how many bits to rotate the i32 left to bring the first
+ // operand into the high bits
+  // Operand 4: the negative of operand 3, for rotating the other way
+ // Operand 5: the width of the field in bits (8 or 16)
+ ATOMIC_CMP_SWAPW
+ };
+}
+
+class SystemZSubtarget;
+class SystemZTargetMachine;
+
+class SystemZTargetLowering : public TargetLowering {
+public:
+ explicit SystemZTargetLowering(SystemZTargetMachine &TM);
+
+ // Override TargetLowering.
+ virtual MVT getScalarShiftAmountTy(EVT LHSTy) const LLVM_OVERRIDE {
+ return MVT::i32;
+ }
+ virtual EVT getSetCCResultType(LLVMContext &, EVT) const {
+ return MVT::i32;
+ }
+ virtual bool isFMAFasterThanMulAndAdd(EVT) const LLVM_OVERRIDE {
+ return true;
+ }
+ virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
+ virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const;
+ virtual const char *getTargetNodeName(unsigned Opcode) const LLVM_OVERRIDE;
+ virtual std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const std::string &Constraint,
+ EVT VT) const LLVM_OVERRIDE;
+ virtual TargetLowering::ConstraintType
+ getConstraintType(const std::string &Constraint) const LLVM_OVERRIDE;
+ virtual TargetLowering::ConstraintWeight
+ getSingleConstraintMatchWeight(AsmOperandInfo &info,
+ const char *constraint) const LLVM_OVERRIDE;
+ virtual void
+ LowerAsmOperandForConstraint(SDValue Op,
+ std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const LLVM_OVERRIDE;
+ virtual MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *BB) const LLVM_OVERRIDE;
+ virtual SDValue LowerOperation(SDValue Op,
+ SelectionDAG &DAG) const LLVM_OVERRIDE;
+ virtual SDValue
+ LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SDLoc DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const LLVM_OVERRIDE;
+ virtual SDValue
+ LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const LLVM_OVERRIDE;
+
+ virtual SDValue
+ LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ SDLoc DL, SelectionDAG &DAG) const LLVM_OVERRIDE;
+
+private:
+ const SystemZSubtarget &Subtarget;
+ const SystemZTargetMachine &TM;
+
+ // Implement LowerOperation for individual opcodes.
+ SDValue lowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerGlobalAddress(GlobalAddressSDNode *Node,
+ SelectionDAG &DAG) const;
+ SDValue lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
+ SelectionDAG &DAG) const;
+ SDValue lowerBlockAddress(BlockAddressSDNode *Node,
+ SelectionDAG &DAG) const;
+ SDValue lowerJumpTable(JumpTableSDNode *JT, SelectionDAG &DAG) const;
+ SDValue lowerConstantPool(ConstantPoolSDNode *CP, SelectionDAG &DAG) const;
+ SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG,
+ unsigned Opcode) const;
+ SDValue lowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const;
+
+ // If the last instruction before MBBI in MBB was some form of COMPARE,
+ // try to replace it with a COMPARE AND BRANCH just before MBBI.
+ // CCMask and Target are the BRC-like operands for the branch.
+ // Return true if the change was made.
+ bool convertPrevCompareToBranch(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned CCMask,
+ MachineBasicBlock *Target) const;
+
+ // Implement EmitInstrWithCustomInserter for individual operation types.
+ MachineBasicBlock *emitSelect(MachineInstr *MI,
+ MachineBasicBlock *BB) const;
+ MachineBasicBlock *emitExt128(MachineInstr *MI,
+ MachineBasicBlock *MBB,
+ bool ClearEven, unsigned SubReg) const;
+ MachineBasicBlock *emitAtomicLoadBinary(MachineInstr *MI,
+ MachineBasicBlock *BB,
+ unsigned BinOpcode, unsigned BitSize,
+ bool Invert = false) const;
+ MachineBasicBlock *emitAtomicLoadMinMax(MachineInstr *MI,
+ MachineBasicBlock *MBB,
+ unsigned CompareOpcode,
+ unsigned KeepOldMask,
+ unsigned BitSize) const;
+ MachineBasicBlock *emitAtomicCmpSwapW(MachineInstr *MI,
+ MachineBasicBlock *BB) const;
+};
+} // end namespace llvm
+
+#endif // LLVM_TARGET_SystemZ_ISELLOWERING_H
diff --git a/lib/Target/SystemZ/SystemZInstrBuilder.h b/lib/Target/SystemZ/SystemZInstrBuilder.h
new file mode 100644
index 0000000..fb699b9
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZInstrBuilder.h
@@ -0,0 +1,48 @@
+//===-- SystemZInstrBuilder.h - Functions to aid building insts -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to handle SystemZ'isms in a clean way.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SYSTEMZINSTRBUILDER_H
+#define SYSTEMZINSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+
+namespace llvm {
+
+/// Add a BDX memory reference for frame object FI to MIB.
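+///
+/// A typical use (for illustration) folds a frame slot into a load, e.g.:
+///   addFrameReference(BuildMI(MBB, MI, DL, TII->get(SystemZ::LG), Reg), FI);
+/// which appends the frame index, a zero displacement, no index register
+/// and the MachineMemOperand built below.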
+static inline const MachineInstrBuilder &
+addFrameReference(const MachineInstrBuilder &MIB, int FI) {
+ MachineInstr *MI = MIB;
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MachineFrameInfo *MFFrame = MF.getFrameInfo();
+ const MCInstrDesc &MCID = MI->getDesc();
+ unsigned Flags = 0;
+ if (MCID.mayLoad())
+ Flags |= MachineMemOperand::MOLoad;
+ if (MCID.mayStore())
+ Flags |= MachineMemOperand::MOStore;
+ int64_t Offset = 0;
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(MachinePointerInfo(
+ PseudoSourceValue::getFixedStack(FI), Offset),
+ Flags, MFFrame->getObjectSize(FI),
+ MFFrame->getObjectAlignment(FI));
+ return MIB.addFrameIndex(FI).addImm(Offset).addReg(0).addMemOperand(MMO);
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td
new file mode 100644
index 0000000..86ef14c
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZInstrFP.td
@@ -0,0 +1,315 @@
+//==- SystemZInstrFP.td - Floating-point SystemZ instructions --*- tblgen-*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Control-flow instructions
+//===----------------------------------------------------------------------===//
+
+// C's ?: operator for floating-point operands.
+def SelectF32 : SelectWrapper<FP32>;
+def SelectF64 : SelectWrapper<FP64>;
+def SelectF128 : SelectWrapper<FP128>;
+
+//===----------------------------------------------------------------------===//
+// Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load zero.
+let neverHasSideEffects = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
+ def LZER : InherentRRE<"lzer", 0xB374, FP32, (fpimm0)>;
+ def LZDR : InherentRRE<"lzdr", 0xB375, FP64, (fpimm0)>;
+ def LZXR : InherentRRE<"lzxr", 0xB376, FP128, (fpimm0)>;
+}
+
+// Moves between two floating-point registers.
+let neverHasSideEffects = 1 in {
+ def LER : UnaryRR <"ler", 0x38, null_frag, FP32, FP32>;
+ def LDR : UnaryRR <"ldr", 0x28, null_frag, FP64, FP64>;
+ def LXR : UnaryRRE<"lxr", 0xB365, null_frag, FP128, FP128>;
+}
+
+// Moves between 64-bit integer and floating-point registers.
+def LGDR : UnaryRRE<"lgdr", 0xB3CD, bitconvert, GR64, FP64>;
+def LDGR : UnaryRRE<"ldgr", 0xB3C1, bitconvert, FP64, GR64>;
+
+// fcopysign with an FP32 result.
+let isCodeGenOnly = 1 in {
+ def CPSDRss : BinaryRRF<"cpsdr", 0xB372, fcopysign, FP32, FP32>;
+ def CPSDRsd : BinaryRRF<"cpsdr", 0xB372, fcopysign, FP32, FP64>;
+}
+
+// The sign of an FP128 is in the high register.
+def : Pat<(fcopysign FP32:$src1, FP128:$src2),
+ (CPSDRsd FP32:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_high))>;
+
+// fcopysign with an FP64 result.
+let isCodeGenOnly = 1 in
+ def CPSDRds : BinaryRRF<"cpsdr", 0xB372, fcopysign, FP64, FP32>;
+def CPSDRdd : BinaryRRF<"cpsdr", 0xB372, fcopysign, FP64, FP64>;
+
+// The sign of an FP128 is in the high register.
+def : Pat<(fcopysign FP64:$src1, FP128:$src2),
+ (CPSDRdd FP64:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_high))>;
+
+// fcopysign with an FP128 result. Use "upper" as the high half and leave
+// the low half as-is.
+class CopySign128<RegisterOperand cls, dag upper>
+ : Pat<(fcopysign FP128:$src1, cls:$src2),
+ (INSERT_SUBREG FP128:$src1, upper, subreg_high)>;
+
+def : CopySign128<FP32, (CPSDRds (EXTRACT_SUBREG FP128:$src1, subreg_high),
+ FP32:$src2)>;
+def : CopySign128<FP64, (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_high),
+ FP64:$src2)>;
+def : CopySign128<FP128, (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_high),
+ (EXTRACT_SUBREG FP128:$src2, subreg_high))>;
+
+//===----------------------------------------------------------------------===//
+// Load instructions
+//===----------------------------------------------------------------------===//
+
+let canFoldAsLoad = 1, SimpleBDXLoad = 1 in {
+ defm LE : UnaryRXPair<"le", 0x78, 0xED64, load, FP32>;
+ defm LD : UnaryRXPair<"ld", 0x68, 0xED65, load, FP64>;
+
+ // These instructions are split after register allocation, so we don't
+ // want a custom inserter.
+ let Has20BitOffset = 1, HasIndex = 1, Is128Bit = 1 in {
+ def LX : Pseudo<(outs FP128:$dst), (ins bdxaddr20only128:$src),
+ [(set FP128:$dst, (load bdxaddr20only128:$src))]>;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Store instructions
+//===----------------------------------------------------------------------===//
+
+let SimpleBDXStore = 1 in {
+ defm STE : StoreRXPair<"ste", 0x70, 0xED66, store, FP32>;
+ defm STD : StoreRXPair<"std", 0x60, 0xED67, store, FP64>;
+
+ // These instructions are split after register allocation, so we don't
+ // want a custom inserter.
+ let Has20BitOffset = 1, HasIndex = 1, Is128Bit = 1 in {
+ def STX : Pseudo<(outs), (ins FP128:$src, bdxaddr20only128:$dst),
+ [(store FP128:$src, bdxaddr20only128:$dst)]>;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Convert floating-point values to narrower representations, rounding
+// according to the current mode. The destination of LEXBR and LDXBR
+// is a 128-bit value, but only the first register of the pair is used.
+def LEDBR : UnaryRRE<"ledbr", 0xB344, fround, FP32, FP64>;
+def LEXBR : UnaryRRE<"lexbr", 0xB346, null_frag, FP128, FP128>;
+def LDXBR : UnaryRRE<"ldxbr", 0xB345, null_frag, FP128, FP128>;
+
+def : Pat<(f32 (fround FP128:$src)),
+ (EXTRACT_SUBREG (LEXBR FP128:$src), subreg_32bit)>;
+def : Pat<(f64 (fround FP128:$src)),
+ (EXTRACT_SUBREG (LDXBR FP128:$src), subreg_high)>;
+
+// Extend register floating-point values to wider representations.
+def LDEBR : UnaryRRE<"ldebr", 0xB304, fextend, FP64, FP32>;
+def LXEBR : UnaryRRE<"lxebr", 0xB306, fextend, FP128, FP32>;
+def LXDBR : UnaryRRE<"lxdbr", 0xB305, fextend, FP128, FP64>;
+
+// Extend memory floating-point values to wider representations.
+def LDEB : UnaryRXE<"ldeb", 0xED04, extloadf32, FP64>;
+def LXEB : UnaryRXE<"lxeb", 0xED06, extloadf32, FP128>;
+def LXDB : UnaryRXE<"lxdb", 0xED05, extloadf64, FP128>;
+
+// Convert a signed integer register value to a floating-point one.
+let Defs = [CC] in {
+ def CEFBR : UnaryRRE<"cefbr", 0xB394, sint_to_fp, FP32, GR32>;
+ def CDFBR : UnaryRRE<"cdfbr", 0xB395, sint_to_fp, FP64, GR32>;
+ def CXFBR : UnaryRRE<"cxfbr", 0xB396, sint_to_fp, FP128, GR32>;
+
+ def CEGBR : UnaryRRE<"cegbr", 0xB3A4, sint_to_fp, FP32, GR64>;
+ def CDGBR : UnaryRRE<"cdgbr", 0xB3A5, sint_to_fp, FP64, GR64>;
+ def CXGBR : UnaryRRE<"cxgbr", 0xB3A6, sint_to_fp, FP128, GR64>;
+}
+
+// Convert a floating-point register value to a signed integer value,
+// with the second operand (modifier M3) specifying the rounding mode.
+let Defs = [CC] in {
+ def CFEBR : UnaryRRF<"cfebr", 0xB398, GR32, FP32>;
+ def CFDBR : UnaryRRF<"cfdbr", 0xB399, GR32, FP64>;
+ def CFXBR : UnaryRRF<"cfxbr", 0xB39A, GR32, FP128>;
+
+ def CGEBR : UnaryRRF<"cgebr", 0xB3A8, GR64, FP32>;
+ def CGDBR : UnaryRRF<"cgdbr", 0xB3A9, GR64, FP64>;
+ def CGXBR : UnaryRRF<"cgxbr", 0xB3AA, GR64, FP128>;
+}
+
+// fp_to_sint always rounds towards zero, which is modifier value 5.
+def : Pat<(i32 (fp_to_sint FP32:$src)), (CFEBR 5, FP32:$src)>;
+def : Pat<(i32 (fp_to_sint FP64:$src)), (CFDBR 5, FP64:$src)>;
+def : Pat<(i32 (fp_to_sint FP128:$src)), (CFXBR 5, FP128:$src)>;
+
+def : Pat<(i64 (fp_to_sint FP32:$src)), (CGEBR 5, FP32:$src)>;
+def : Pat<(i64 (fp_to_sint FP64:$src)), (CGDBR 5, FP64:$src)>;
+def : Pat<(i64 (fp_to_sint FP128:$src)), (CGXBR 5, FP128:$src)>;
+
+//===----------------------------------------------------------------------===//
+// Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Negation (Load Complement).
+let Defs = [CC] in {
+ def LCEBR : UnaryRRE<"lcebr", 0xB303, fneg, FP32, FP32>;
+ def LCDBR : UnaryRRE<"lcdbr", 0xB313, fneg, FP64, FP64>;
+ def LCXBR : UnaryRRE<"lcxbr", 0xB343, fneg, FP128, FP128>;
+}
+
+// Absolute value (Load Positive).
+let Defs = [CC] in {
+ def LPEBR : UnaryRRE<"lpebr", 0xB300, fabs, FP32, FP32>;
+ def LPDBR : UnaryRRE<"lpdbr", 0xB310, fabs, FP64, FP64>;
+ def LPXBR : UnaryRRE<"lpxbr", 0xB340, fabs, FP128, FP128>;
+}
+
+// Negative absolute value (Load Negative).
+let Defs = [CC] in {
+ def LNEBR : UnaryRRE<"lnebr", 0xB301, fnabs, FP32, FP32>;
+ def LNDBR : UnaryRRE<"lndbr", 0xB311, fnabs, FP64, FP64>;
+ def LNXBR : UnaryRRE<"lnxbr", 0xB341, fnabs, FP128, FP128>;
+}
+
+// Square root.
+def SQEBR : UnaryRRE<"sqebr", 0xB314, fsqrt, FP32, FP32>;
+def SQDBR : UnaryRRE<"sqdbr", 0xB315, fsqrt, FP64, FP64>;
+def SQXBR : UnaryRRE<"sqxbr", 0xB316, fsqrt, FP128, FP128>;
+
+def SQEB : UnaryRXE<"sqeb", 0xED14, loadu<fsqrt>, FP32>;
+def SQDB : UnaryRXE<"sqdb", 0xED15, loadu<fsqrt>, FP64>;
+
+// Round to an integer, with the second operand (modifier M3) specifying
+// the rounding mode.
+//
+// These forms always check for inexact conditions. z196 added versions
+// that allow this to be suppressed (as for fnearbyint), but we don't yet
+// support -march=z196.
+let Defs = [CC] in {
+ def FIEBR : UnaryRRF<"fiebr", 0xB357, FP32, FP32>;
+ def FIDBR : UnaryRRF<"fidbr", 0xB35F, FP64, FP64>;
+ def FIXBR : UnaryRRF<"fixbr", 0xB347, FP128, FP128>;
+}
+
+// frint rounds according to the current mode (modifier 0) and detects
+// inexact conditions.
+def : Pat<(frint FP32:$src), (FIEBR 0, FP32:$src)>;
+def : Pat<(frint FP64:$src), (FIDBR 0, FP64:$src)>;
+def : Pat<(frint FP128:$src), (FIXBR 0, FP128:$src)>;
+
+//===----------------------------------------------------------------------===//
+// Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition.
+let Defs = [CC] in {
+ let isCommutable = 1 in {
+ def AEBR : BinaryRRE<"aebr", 0xB30A, fadd, FP32, FP32>;
+ def ADBR : BinaryRRE<"adbr", 0xB31A, fadd, FP64, FP64>;
+ def AXBR : BinaryRRE<"axbr", 0xB34A, fadd, FP128, FP128>;
+ }
+ def AEB : BinaryRXE<"aeb", 0xED0A, fadd, FP32, load>;
+ def ADB : BinaryRXE<"adb", 0xED1A, fadd, FP64, load>;
+}
+
+// Subtraction.
+let Defs = [CC] in {
+ def SEBR : BinaryRRE<"sebr", 0xB30B, fsub, FP32, FP32>;
+ def SDBR : BinaryRRE<"sdbr", 0xB31B, fsub, FP64, FP64>;
+ def SXBR : BinaryRRE<"sxbr", 0xB34B, fsub, FP128, FP128>;
+
+ def SEB : BinaryRXE<"seb", 0xED0B, fsub, FP32, load>;
+ def SDB : BinaryRXE<"sdb", 0xED1B, fsub, FP64, load>;
+}
+
+// Multiplication.
+let isCommutable = 1 in {
+ def MEEBR : BinaryRRE<"meebr", 0xB317, fmul, FP32, FP32>;
+ def MDBR : BinaryRRE<"mdbr", 0xB31C, fmul, FP64, FP64>;
+ def MXBR : BinaryRRE<"mxbr", 0xB34C, fmul, FP128, FP128>;
+}
+def MEEB : BinaryRXE<"meeb", 0xED17, fmul, FP32, load>;
+def MDB : BinaryRXE<"mdb", 0xED1C, fmul, FP64, load>;
+
+// f64 multiplication of two FP32 registers.
+def MDEBR : BinaryRRE<"mdebr", 0xB30C, null_frag, FP64, FP32>;
+def : Pat<(fmul (f64 (fextend FP32:$src1)), (f64 (fextend FP32:$src2))),
+ (MDEBR (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ FP32:$src1, subreg_32bit), FP32:$src2)>;
+
+// f64 multiplication of an FP32 register and an f32 memory.
+def MDEB : BinaryRXE<"mdeb", 0xED0C, null_frag, FP64, load>;
+def : Pat<(fmul (f64 (fextend FP32:$src1)),
+ (f64 (extloadf32 bdxaddr12only:$addr))),
+ (MDEB (INSERT_SUBREG (f64 (IMPLICIT_DEF)), FP32:$src1, subreg_32bit),
+ bdxaddr12only:$addr)>;
+
+// f128 multiplication of two FP64 registers.
+def MXDBR : BinaryRRE<"mxdbr", 0xB307, null_frag, FP128, FP64>;
+def : Pat<(fmul (f128 (fextend FP64:$src1)), (f128 (fextend FP64:$src2))),
+ (MXDBR (INSERT_SUBREG (f128 (IMPLICIT_DEF)),
+ FP64:$src1, subreg_high), FP64:$src2)>;
+
+// f128 multiplication of an FP64 register and an f64 memory.
+def MXDB : BinaryRXE<"mxdb", 0xED07, null_frag, FP128, load>;
+def : Pat<(fmul (f128 (fextend FP64:$src1)),
+ (f128 (extloadf64 bdxaddr12only:$addr))),
+ (MXDB (INSERT_SUBREG (f128 (IMPLICIT_DEF)), FP64:$src1, subreg_high),
+ bdxaddr12only:$addr)>;
+
+// Fused multiply-add.
+def MAEBR : TernaryRRD<"maebr", 0xB30E, z_fma, FP32>;
+def MADBR : TernaryRRD<"madbr", 0xB31E, z_fma, FP64>;
+
+def MAEB : TernaryRXF<"maeb", 0xED0E, z_fma, FP32, load>;
+def MADB : TernaryRXF<"madb", 0xED1E, z_fma, FP64, load>;
+
+// Fused multiply-subtract.
+def MSEBR : TernaryRRD<"msebr", 0xB30F, z_fms, FP32>;
+def MSDBR : TernaryRRD<"msdbr", 0xB31F, z_fms, FP64>;
+
+def MSEB : TernaryRXF<"mseb", 0xED0F, z_fms, FP32, load>;
+def MSDB : TernaryRXF<"msdb", 0xED1F, z_fms, FP64, load>;
+
+// Division.
+def DEBR : BinaryRRE<"debr", 0xB30D, fdiv, FP32, FP32>;
+def DDBR : BinaryRRE<"ddbr", 0xB31D, fdiv, FP64, FP64>;
+def DXBR : BinaryRRE<"dxbr", 0xB34D, fdiv, FP128, FP128>;
+
+def DEB : BinaryRXE<"deb", 0xED0D, fdiv, FP32, load>;
+def DDB : BinaryRXE<"ddb", 0xED1D, fdiv, FP64, load>;
+
+//===----------------------------------------------------------------------===//
+// Comparisons
+//===----------------------------------------------------------------------===//
+
+let Defs = [CC] in {
+ def CEBR : CompareRRE<"cebr", 0xB309, z_cmp, FP32, FP32>;
+ def CDBR : CompareRRE<"cdbr", 0xB319, z_cmp, FP64, FP64>;
+ def CXBR : CompareRRE<"cxbr", 0xB349, z_cmp, FP128, FP128>;
+
+ def CEB : CompareRXE<"ceb", 0xED09, z_cmp, FP32, load>;
+ def CDB : CompareRXE<"cdb", 0xED19, z_cmp, FP64, load>;
+}
+
+//===----------------------------------------------------------------------===//
+// Peepholes
+//===----------------------------------------------------------------------===//
+
+def : Pat<(f32 fpimmneg0), (LCEBR (LZER))>;
+def : Pat<(f64 fpimmneg0), (LCDBR (LZDR))>;
+def : Pat<(f128 fpimmneg0), (LCXBR (LZXR))>;
diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td
new file mode 100644
index 0000000..ad050fd
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -0,0 +1,1002 @@
+//==- SystemZInstrFormats.td - SystemZ Instruction Formats --*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Basic SystemZ instruction definition
+//===----------------------------------------------------------------------===//
+
+class InstSystemZ<int size, dag outs, dag ins, string asmstr,
+ list<dag> pattern> : Instruction {
+ let Namespace = "SystemZ";
+
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ let Size = size;
+ let Pattern = pattern;
+ let AsmString = asmstr;
+
+ // Used to identify a group of related instructions, such as ST and STY.
+ string Function = "";
+
+ // "12" for an instruction that has a ...Y equivalent, "20" for that
+ // ...Y equivalent.
+ string PairType = "none";
+
+ // True if this instruction is a simple D(X,B) load of a register
+ // (with no sign or zero extension).
+ bit SimpleBDXLoad = 0;
+
+ // True if this instruction is a simple D(X,B) store of a register
+ // (with no truncation).
+ bit SimpleBDXStore = 0;
+
+ // True if this instruction has a 20-bit displacement field.
+ bit Has20BitOffset = 0;
+
+ // True if addresses in this instruction have an index register.
+ bit HasIndex = 0;
+
+ // True if this is a 128-bit pseudo instruction that combines two 64-bit
+ // operations.
+ bit Is128Bit = 0;
+
+ let TSFlags{0} = SimpleBDXLoad;
+ let TSFlags{1} = SimpleBDXStore;
+ let TSFlags{2} = Has20BitOffset;
+ let TSFlags{3} = HasIndex;
+ let TSFlags{4} = Is128Bit;
+}
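+
+// The five TSFlags bits above are visible from C++ through MCInstrDesc;
+// for example (illustrative), "MI->getDesc().TSFlags & 0x1" tests the
+// SimpleBDXLoad bit.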
+
+//===----------------------------------------------------------------------===//
+// Mappings between instructions
+//===----------------------------------------------------------------------===//
+
+// Return the version of an instruction that has an unsigned 12-bit
+// displacement.
+def getDisp12Opcode : InstrMapping {
+ let FilterClass = "InstSystemZ";
+ let RowFields = ["Function"];
+ let ColFields = ["PairType"];
+ let KeyCol = ["20"];
+ let ValueCols = [["12"]];
+}
+
+// Return the version of an instruction that has a signed 20-bit displacement.
+def getDisp20Opcode : InstrMapping {
+ let FilterClass = "InstSystemZ";
+ let RowFields = ["Function"];
+ let ColFields = ["PairType"];
+ let KeyCol = ["12"];
+ let ValueCols = [["20"]];
+}
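+
+// TableGen expands each InstrMapping into a lookup function in the generated
+// SystemZGenInstrInfo.inc, roughly of the form (sketch):
+//   int getDisp12Opcode(uint16_t Opcode); // returns -1 if there is no pair
+// so callers can switch between displacement forms without hard-coding the
+// opcode correspondence.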
+
+//===----------------------------------------------------------------------===//
+// Instruction formats
+//===----------------------------------------------------------------------===//
+//
+// Formats are specified using operand field declarations of the form:
+//
+// bits<4> Rn : register input or output for operand n
+// bits<m> In : immediate value of width m for operand n
+// bits<4> BDn : address operand n, which has a base and a displacement
+// bits<m> XBDn : address operand n, which has an index, a base and a
+// displacement
+// bits<4> Xn : index register for address operand n
+// bits<4> Mn : mode value for operand n
+//
+// The operand numbers ("n" in the list above) follow the architecture manual.
+// Assembly operands sometimes have a different order; in particular, R3 is
+// often written between operands 1 and 2.
+//
+//===----------------------------------------------------------------------===//
+
+class InstRI<bits<12> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<4, outs, ins, asmstr, pattern> {
+ field bits<32> Inst;
+ field bits<32> SoftFail = 0;
+
+ bits<4> R1;
+ bits<16> I2;
+
+ let Inst{31-24} = op{11-4};
+ let Inst{23-20} = R1;
+ let Inst{19-16} = op{3-0};
+ let Inst{15-0} = I2;
+}
+
+class InstRIEb<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> R2;
+ bits<4> M3;
+ bits<16> RI4;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = R1;
+ let Inst{35-32} = R2;
+ let Inst{31-16} = RI4;
+ let Inst{15-12} = M3;
+ let Inst{11-8} = 0;
+ let Inst{7-0} = op{7-0};
+}
+
+class InstRIEc<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<8> I2;
+ bits<4> M3;
+ bits<16> RI4;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = R1;
+ let Inst{35-32} = M3;
+ let Inst{31-16} = RI4;
+ let Inst{15-8} = I2;
+ let Inst{7-0} = op{7-0};
+}
+
+class InstRIEf<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> R2;
+ bits<8> I3;
+ bits<8> I4;
+ bits<8> I5;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = R1;
+ let Inst{35-32} = R2;
+ let Inst{31-24} = I3;
+ let Inst{23-16} = I4;
+ let Inst{15-8} = I5;
+ let Inst{7-0} = op{7-0};
+}
+
+class InstRIL<bits<12> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<32> I2;
+
+ let Inst{47-40} = op{11-4};
+ let Inst{39-36} = R1;
+ let Inst{35-32} = op{3-0};
+ let Inst{31-0} = I2;
+}
+
+class InstRR<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<2, outs, ins, asmstr, pattern> {
+ field bits<16> Inst;
+ field bits<16> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> R2;
+
+ let Inst{15-8} = op;
+ let Inst{7-4} = R1;
+ let Inst{3-0} = R2;
+}
+
+class InstRRD<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<4, outs, ins, asmstr, pattern> {
+ field bits<32> Inst;
+ field bits<32> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> R3;
+ bits<4> R2;
+
+ let Inst{31-16} = op;
+ let Inst{15-12} = R1;
+ let Inst{11-8} = 0;
+ let Inst{7-4} = R3;
+ let Inst{3-0} = R2;
+}
+
+class InstRRE<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<4, outs, ins, asmstr, pattern> {
+ field bits<32> Inst;
+ field bits<32> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> R2;
+
+ let Inst{31-16} = op;
+ let Inst{15-8} = 0;
+ let Inst{7-4} = R1;
+ let Inst{3-0} = R2;
+}
+
+class InstRRF<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<4, outs, ins, asmstr, pattern> {
+ field bits<32> Inst;
+ field bits<32> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> R2;
+ bits<4> R3;
+
+ let Inst{31-16} = op;
+ let Inst{15-12} = R3;
+ let Inst{11-8} = 0;
+ let Inst{7-4} = R1;
+ let Inst{3-0} = R2;
+}
+
+class InstRX<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<4, outs, ins, asmstr, pattern> {
+ field bits<32> Inst;
+ field bits<32> SoftFail = 0;
+
+ bits<4> R1;
+ bits<20> XBD2;
+
+ let Inst{31-24} = op;
+ let Inst{23-20} = R1;
+ let Inst{19-0} = XBD2;
+
+ let HasIndex = 1;
+}
+
+class InstRXE<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<20> XBD2;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = R1;
+ let Inst{35-16} = XBD2;
+ let Inst{15-8} = 0;
+ let Inst{7-0} = op{7-0};
+
+ let HasIndex = 1;
+}
+
+class InstRXF<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> R3;
+ bits<20> XBD2;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = R3;
+ let Inst{35-16} = XBD2;
+ let Inst{15-12} = R1;
+ let Inst{11-8} = 0;
+ let Inst{7-0} = op{7-0};
+
+ let HasIndex = 1;
+}
+
+class InstRXY<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<28> XBD2;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = R1;
+ let Inst{35-8} = XBD2;
+ let Inst{7-0} = op{7-0};
+
+ let Has20BitOffset = 1;
+ let HasIndex = 1;
+}
+
+class InstRS<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<4, outs, ins, asmstr, pattern> {
+ field bits<32> Inst;
+ field bits<32> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> R3;
+ bits<16> BD2;
+
+ let Inst{31-24} = op;
+ let Inst{23-20} = R1;
+ let Inst{19-16} = R3;
+ let Inst{15-0} = BD2;
+}
+
+class InstRSY<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> R3;
+ bits<24> BD2;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = R1;
+ let Inst{35-32} = R3;
+ let Inst{31-8} = BD2;
+ let Inst{7-0} = op{7-0};
+
+ let Has20BitOffset = 1;
+}
+
+class InstSI<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<4, outs, ins, asmstr, pattern> {
+ field bits<32> Inst;
+ field bits<32> SoftFail = 0;
+
+ bits<16> BD1;
+ bits<8> I2;
+
+ let Inst{31-24} = op;
+ let Inst{23-16} = I2;
+ let Inst{15-0} = BD1;
+}
+
+class InstSIL<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<16> BD1;
+ bits<16> I2;
+
+ let Inst{47-32} = op;
+ let Inst{31-16} = BD1;
+ let Inst{15-0} = I2;
+}
+
+class InstSIY<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<24> BD1;
+ bits<8> I2;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-32} = I2;
+ let Inst{31-8} = BD1;
+ let Inst{7-0} = op{7-0};
+
+ let Has20BitOffset = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction definitions with semantics
+//===----------------------------------------------------------------------===//
+//
+// These classes have the form <Category><Format>, where <Format> is one
+// of the formats defined above and where <Category> describes the inputs
+// and outputs. <Category> can be one of:
+//
+// Inherent:
+// One register output operand and no input operands.
+//
+// Store:
+// One register or immediate input operand and one address input operand.
+// The instruction stores the first operand to the address.
+//
+// This category is used for both pure and truncating stores.
+//
+// LoadMultiple:
+// One address input operand and two explicit output operands.
+// The instruction loads a range of registers from the address,
+// with the explicit operands giving the first and last register
+// to load. Other loaded registers are added as implicit definitions.
+//
+// StoreMultiple:
+// Two explicit input register operands and an address operand.
+// The instruction stores a range of registers to the address,
+// with the explicit operands giving the first and last register
+// to store. Other stored registers are added as implicit uses.
+//
+// Unary:
+// One register output operand and one input operand. The input
+// operand may be a register, immediate or memory.
+//
+// Binary:
+// One register output operand and two input operands. The first
+//   input operand is always a register and the second may be a register,
+// immediate or memory.
+//
+// Shift:
+// One register output operand and two input operands. The first
+// input operand is a register and the second has the same form as
+// an address (although it isn't actually used to address memory).
+//
+// Compare:
+// Two input operands. The first operand is always a register,
+// the second may be a register, immediate or memory.
+//
+// Ternary:
+// One register output operand and three register input operands.
+//
+// CmpSwap:
+// One output operand and three input operands. The first two
+// operands are registers and the third is an address. The instruction
+// both reads from and writes to the address.
+//
+// RotateSelect:
+// One output operand and five input operands. The first two operands
+// are registers and the other three are immediates.
+//
+// The format determines which input operands are tied to output operands,
+// and also determines the shape of any address operand.
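+//
+// For example, BinaryRXY below is the Binary category in RXY format: one
+// register output tied to the first input, plus a base+index+20-bit
+// displacement memory operand, as used by 64-bit operations such as AG
+// in SystemZInstrInfo.td.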
+//
+// Multiclasses of the form <Category><Format>Pair define two instructions,
+// one with <Category><Format> and one with <Category><Format>Y. The name
+// of the first instruction has no suffix, the name of the second has
+// an extra "y".
+//
+//===----------------------------------------------------------------------===//
+
+class InherentRRE<string mnemonic, bits<16> opcode, RegisterOperand cls,
+ dag src>
+ : InstRRE<opcode, (outs cls:$R1), (ins),
+ mnemonic#"\t$R1",
+ [(set cls:$R1, src)]> {
+ let R2 = 0;
+}
+
+class LoadMultipleRSY<string mnemonic, bits<16> opcode, RegisterOperand cls>
+ : InstRSY<opcode, (outs cls:$R1, cls:$R3), (ins bdaddr20only:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2", []> {
+ let mayLoad = 1;
+}
+
+class StoreRILPC<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+ RegisterOperand cls>
+ : InstRIL<opcode, (outs), (ins cls:$R1, pcrel32:$I2),
+ mnemonic#"\t$R1, $I2",
+ [(operator cls:$R1, pcrel32:$I2)]> {
+ let mayStore = 1;
+ // We want PC-relative addresses to be tried ahead of BD and BDX addresses.
+ // However, BDXs have two extra operands and are therefore 6 units more
+ // complex.
+ let AddedComplexity = 7;
+}
+
+class StoreRX<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+ RegisterOperand cls, AddressingMode mode = bdxaddr12only>
+ : InstRX<opcode, (outs), (ins cls:$R1, mode:$XBD2),
+ mnemonic#"\t$R1, $XBD2",
+ [(operator cls:$R1, mode:$XBD2)]> {
+ let mayStore = 1;
+}
+
+class StoreRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ RegisterOperand cls, AddressingMode mode = bdxaddr20only>
+ : InstRXY<opcode, (outs), (ins cls:$R1, mode:$XBD2),
+ mnemonic#"\t$R1, $XBD2",
+ [(operator cls:$R1, mode:$XBD2)]> {
+ let mayStore = 1;
+}
+
+multiclass StoreRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
+ SDPatternOperator operator, RegisterOperand cls> {
+ let Function = mnemonic ## #cls in {
+ let PairType = "12" in
+ def "" : StoreRX<mnemonic, rxOpcode, operator, cls, bdxaddr12pair>;
+ let PairType = "20" in
+ def Y : StoreRXY<mnemonic#"y", rxyOpcode, operator, cls, bdxaddr20pair>;
+ }
+}
+
+class StoreMultipleRSY<string mnemonic, bits<16> opcode, RegisterOperand cls>
+ : InstRSY<opcode, (outs), (ins cls:$R1, cls:$R3, bdaddr20only:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2", []> {
+ let mayStore = 1;
+}
+
+class StoreSI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+ Immediate imm, AddressingMode mode = bdaddr12only>
+ : InstSI<opcode, (outs), (ins mode:$BD1, imm:$I2),
+ mnemonic#"\t$BD1, $I2",
+ [(operator imm:$I2, mode:$BD1)]> {
+ let mayStore = 1;
+}
+
+class StoreSIY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ Immediate imm, AddressingMode mode = bdaddr20only>
+ : InstSIY<opcode, (outs), (ins mode:$BD1, imm:$I2),
+ mnemonic#"\t$BD1, $I2",
+ [(operator imm:$I2, mode:$BD1)]> {
+ let mayStore = 1;
+}
+
+class StoreSIL<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ Immediate imm>
+ : InstSIL<opcode, (outs), (ins bdaddr12only:$BD1, imm:$I2),
+ mnemonic#"\t$BD1, $I2",
+ [(operator imm:$I2, bdaddr12only:$BD1)]> {
+ let mayStore = 1;
+}
+
+multiclass StoreSIPair<string mnemonic, bits<8> siOpcode, bits<16> siyOpcode,
+ SDPatternOperator operator, Immediate imm> {
+ let Function = mnemonic in {
+ let PairType = "12" in
+ def "" : StoreSI<mnemonic, siOpcode, operator, imm, bdaddr12pair>;
+ let PairType = "20" in
+ def Y : StoreSIY<mnemonic#"y", siyOpcode, operator, imm, bdaddr20pair>;
+ }
+}
+
+class UnaryRR<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRR<opcode, (outs cls1:$R1), (ins cls2:$R2),
+ mnemonic#"\t$R1, $R2",
+ [(set cls1:$R1, (operator cls2:$R2))]>;
+
+class UnaryRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRE<opcode, (outs cls1:$R1), (ins cls2:$R2),
+ mnemonic#"\t$R1, $R2",
+ [(set cls1:$R1, (operator cls2:$R2))]>;
+
+class UnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
+ RegisterOperand cls2>
+ : InstRRF<opcode, (outs cls1:$R1), (ins uimm8zx4:$R3, cls2:$R2),
+ mnemonic#"\t$R1, $R3, $R2", []>;
+
+class UnaryRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+ RegisterOperand cls, Immediate imm>
+ : InstRI<opcode, (outs cls:$R1), (ins imm:$I2),
+ mnemonic#"\t$R1, $I2",
+ [(set cls:$R1, (operator imm:$I2))]>;
+
+class UnaryRIL<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+ RegisterOperand cls, Immediate imm>
+ : InstRIL<opcode, (outs cls:$R1), (ins imm:$I2),
+ mnemonic#"\t$R1, $I2",
+ [(set cls:$R1, (operator imm:$I2))]>;
+
+class UnaryRILPC<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+ RegisterOperand cls>
+ : InstRIL<opcode, (outs cls:$R1), (ins pcrel32:$I2),
+ mnemonic#"\t$R1, $I2",
+ [(set cls:$R1, (operator pcrel32:$I2))]> {
+ let mayLoad = 1;
+ // We want PC-relative addresses to be tried ahead of BD and BDX addresses.
+ // However, BDXs have two extra operands and are therefore 6 units more
+ // complex.
+ let AddedComplexity = 7;
+}
+
+class UnaryRX<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+ RegisterOperand cls, AddressingMode mode = bdxaddr12only>
+ : InstRX<opcode, (outs cls:$R1), (ins mode:$XBD2),
+ mnemonic#"\t$R1, $XBD2",
+ [(set cls:$R1, (operator mode:$XBD2))]> {
+ let mayLoad = 1;
+}
+
+class UnaryRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ RegisterOperand cls>
+ : InstRXE<opcode, (outs cls:$R1), (ins bdxaddr12only:$XBD2),
+ mnemonic#"\t$R1, $XBD2",
+ [(set cls:$R1, (operator bdxaddr12only:$XBD2))]> {
+ let mayLoad = 1;
+}
+
+class UnaryRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ RegisterOperand cls, AddressingMode mode = bdxaddr20only>
+ : InstRXY<opcode, (outs cls:$R1), (ins mode:$XBD2),
+ mnemonic#"\t$R1, $XBD2",
+ [(set cls:$R1, (operator mode:$XBD2))]> {
+ let mayLoad = 1;
+}
+
+multiclass UnaryRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
+ SDPatternOperator operator, RegisterOperand cls> {
+ let Function = mnemonic ## #cls in {
+ let PairType = "12" in
+ def "" : UnaryRX<mnemonic, rxOpcode, operator, cls, bdxaddr12pair>;
+ let PairType = "20" in
+ def Y : UnaryRXY<mnemonic#"y", rxyOpcode, operator, cls, bdxaddr20pair>;
+ }
+}
+
+class BinaryRR<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRR<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2),
+ mnemonic#"\t$R1, $R2",
+ [(set cls1:$R1, (operator cls1:$R1src, cls2:$R2))]> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
+class BinaryRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRE<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2),
+ mnemonic#"\t$R1, $R2",
+ [(set cls1:$R1, (operator cls1:$R1src, cls2:$R2))]> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
+class BinaryRRF<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRF<opcode, (outs cls1:$R1), (ins cls1:$R3, cls2:$R2),
+ mnemonic#"\t$R1, $R3, $R2",
+ [(set cls1:$R1, (operator cls1:$R3, cls2:$R2))]>;
+
+class BinaryRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+ RegisterOperand cls, Immediate imm>
+ : InstRI<opcode, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
+ mnemonic#"\t$R1, $I2",
+ [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
+class BinaryRIL<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+ RegisterOperand cls, Immediate imm>
+ : InstRIL<opcode, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
+ mnemonic#"\t$R1, $I2",
+ [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
+class BinaryRX<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+ RegisterOperand cls, SDPatternOperator load,
+ AddressingMode mode = bdxaddr12only>
+ : InstRX<opcode, (outs cls:$R1), (ins cls:$R1src, mode:$XBD2),
+ mnemonic#"\t$R1, $XBD2",
+ [(set cls:$R1, (operator cls:$R1src, (load mode:$XBD2)))]> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+ let mayLoad = 1;
+}
+
+class BinaryRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ RegisterOperand cls, SDPatternOperator load>
+ : InstRXE<opcode, (outs cls:$R1), (ins cls:$R1src, bdxaddr12only:$XBD2),
+ mnemonic#"\t$R1, $XBD2",
+ [(set cls:$R1, (operator cls:$R1src,
+ (load bdxaddr12only:$XBD2)))]> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+ let mayLoad = 1;
+}
+
+class BinaryRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ RegisterOperand cls, SDPatternOperator load,
+ AddressingMode mode = bdxaddr20only>
+ : InstRXY<opcode, (outs cls:$R1), (ins cls:$R1src, mode:$XBD2),
+ mnemonic#"\t$R1, $XBD2",
+ [(set cls:$R1, (operator cls:$R1src, (load mode:$XBD2)))]> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+ let mayLoad = 1;
+}
+
+multiclass BinaryRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
+ SDPatternOperator operator, RegisterOperand cls,
+ SDPatternOperator load> {
+ let Function = mnemonic ## #cls in {
+ let PairType = "12" in
+ def "" : BinaryRX<mnemonic, rxOpcode, operator, cls, load, bdxaddr12pair>;
+ let PairType = "20" in
+ def Y : BinaryRXY<mnemonic#"y", rxyOpcode, operator, cls, load,
+ bdxaddr20pair>;
+ }
+}
+
+class BinarySI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+ Operand imm, AddressingMode mode = bdaddr12only>
+ : InstSI<opcode, (outs), (ins mode:$BD1, imm:$I2),
+ mnemonic#"\t$BD1, $I2",
+ [(store (operator (load mode:$BD1), imm:$I2), mode:$BD1)]> {
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+class BinarySIY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ Operand imm, AddressingMode mode = bdaddr20only>
+ : InstSIY<opcode, (outs), (ins mode:$BD1, imm:$I2),
+ mnemonic#"\t$BD1, $I2",
+ [(store (operator (load mode:$BD1), imm:$I2), mode:$BD1)]> {
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+multiclass BinarySIPair<string mnemonic, bits<8> siOpcode,
+ bits<16> siyOpcode, SDPatternOperator operator,
+ Operand imm> {
+  let Function = mnemonic in {
+ let PairType = "12" in
+ def "" : BinarySI<mnemonic, siOpcode, operator, imm, bdaddr12pair>;
+ let PairType = "20" in
+ def Y : BinarySIY<mnemonic#"y", siyOpcode, operator, imm, bdaddr20pair>;
+ }
+}
+
+class ShiftRS<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+ RegisterOperand cls, AddressingMode mode>
+ : InstRS<opcode, (outs cls:$R1), (ins cls:$R1src, mode:$BD2),
+ mnemonic#"\t$R1, $BD2",
+ [(set cls:$R1, (operator cls:$R1src, mode:$BD2))]> {
+ let R3 = 0;
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
+class ShiftRSY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ RegisterOperand cls, AddressingMode mode>
+ : InstRSY<opcode, (outs cls:$R1), (ins cls:$R3, mode:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2",
+ [(set cls:$R1, (operator cls:$R3, mode:$BD2))]>;
+
+class CompareRR<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRR<opcode, (outs), (ins cls1:$R1, cls2:$R2),
+ mnemonic#"\t$R1, $R2",
+ [(operator cls1:$R1, cls2:$R2)]>;
+
+class CompareRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRE<opcode, (outs), (ins cls1:$R1, cls2:$R2),
+ mnemonic#"\t$R1, $R2",
+ [(operator cls1:$R1, cls2:$R2)]>;
+
+class CompareRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+ RegisterOperand cls, Immediate imm>
+ : InstRI<opcode, (outs), (ins cls:$R1, imm:$I2),
+ mnemonic#"\t$R1, $I2",
+ [(operator cls:$R1, imm:$I2)]>;
+
+class CompareRIL<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+ RegisterOperand cls, Immediate imm>
+ : InstRIL<opcode, (outs), (ins cls:$R1, imm:$I2),
+ mnemonic#"\t$R1, $I2",
+ [(operator cls:$R1, imm:$I2)]>;
+
+class CompareRILPC<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+ RegisterOperand cls, SDPatternOperator load>
+ : InstRIL<opcode, (outs), (ins cls:$R1, pcrel32:$I2),
+ mnemonic#"\t$R1, $I2",
+ [(operator cls:$R1, (load pcrel32:$I2))]> {
+ let mayLoad = 1;
+ // We want PC-relative addresses to be tried ahead of BD and BDX addresses.
+ // However, BDXs have two extra operands and are therefore 6 units more
+ // complex.
+ let AddedComplexity = 7;
+}
+
+class CompareRX<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+ RegisterOperand cls, SDPatternOperator load,
+ AddressingMode mode = bdxaddr12only>
+ : InstRX<opcode, (outs), (ins cls:$R1, mode:$XBD2),
+ mnemonic#"\t$R1, $XBD2",
+ [(operator cls:$R1, (load mode:$XBD2))]> {
+ let mayLoad = 1;
+}
+
+class CompareRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ RegisterOperand cls, SDPatternOperator load>
+ : InstRXE<opcode, (outs), (ins cls:$R1, bdxaddr12only:$XBD2),
+ mnemonic#"\t$R1, $XBD2",
+ [(operator cls:$R1, (load bdxaddr12only:$XBD2))]> {
+ let mayLoad = 1;
+}
+
+class CompareRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ RegisterOperand cls, SDPatternOperator load,
+ AddressingMode mode = bdxaddr20only>
+ : InstRXY<opcode, (outs), (ins cls:$R1, mode:$XBD2),
+ mnemonic#"\t$R1, $XBD2",
+ [(operator cls:$R1, (load mode:$XBD2))]> {
+ let mayLoad = 1;
+}
+
+multiclass CompareRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
+ SDPatternOperator operator, RegisterOperand cls,
+ SDPatternOperator load> {
+ let Function = mnemonic ## #cls in {
+ let PairType = "12" in
+ def "" : CompareRX<mnemonic, rxOpcode, operator, cls,
+ load, bdxaddr12pair>;
+ let PairType = "20" in
+ def Y : CompareRXY<mnemonic#"y", rxyOpcode, operator, cls,
+ load, bdxaddr20pair>;
+ }
+}
+
+class CompareSI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+ SDPatternOperator load, Immediate imm,
+ AddressingMode mode = bdaddr12only>
+ : InstSI<opcode, (outs), (ins mode:$BD1, imm:$I2),
+ mnemonic#"\t$BD1, $I2",
+ [(operator (load mode:$BD1), imm:$I2)]> {
+ let mayLoad = 1;
+}
+
+class CompareSIL<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ SDPatternOperator load, Immediate imm>
+ : InstSIL<opcode, (outs), (ins bdaddr12only:$BD1, imm:$I2),
+ mnemonic#"\t$BD1, $I2",
+ [(operator (load bdaddr12only:$BD1), imm:$I2)]> {
+ let mayLoad = 1;
+}
+
+class CompareSIY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ SDPatternOperator load, Immediate imm,
+ AddressingMode mode = bdaddr20only>
+ : InstSIY<opcode, (outs), (ins mode:$BD1, imm:$I2),
+ mnemonic#"\t$BD1, $I2",
+ [(operator (load mode:$BD1), imm:$I2)]> {
+ let mayLoad = 1;
+}
+
+multiclass CompareSIPair<string mnemonic, bits<8> siOpcode, bits<16> siyOpcode,
+ SDPatternOperator operator, SDPatternOperator load,
+ Immediate imm> {
+ let Function = mnemonic in {
+ let PairType = "12" in
+ def "" : CompareSI<mnemonic, siOpcode, operator, load, imm, bdaddr12pair>;
+ let PairType = "20" in
+ def Y : CompareSIY<mnemonic#"y", siyOpcode, operator, load, imm,
+ bdaddr20pair>;
+ }
+}
+
+class TernaryRRD<string mnemonic, bits<16> opcode,
+ SDPatternOperator operator, RegisterOperand cls>
+ : InstRRD<opcode, (outs cls:$R1), (ins cls:$R1src, cls:$R3, cls:$R2),
+ mnemonic#"\t$R1, $R3, $R2",
+ [(set cls:$R1, (operator cls:$R1src, cls:$R3, cls:$R2))]> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
+class TernaryRXF<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ RegisterOperand cls, SDPatternOperator load>
+ : InstRXF<opcode, (outs cls:$R1),
+ (ins cls:$R1src, cls:$R3, bdxaddr12only:$XBD2),
+ mnemonic#"\t$R1, $R3, $XBD2",
+ [(set cls:$R1, (operator cls:$R1src, cls:$R3,
+ (load bdxaddr12only:$XBD2)))]> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+ let mayLoad = 1;
+}
+
+class CmpSwapRS<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+ RegisterOperand cls, AddressingMode mode = bdaddr12only>
+ : InstRS<opcode, (outs cls:$R1), (ins cls:$R1src, cls:$R3, mode:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2",
+ [(set cls:$R1, (operator mode:$BD2, cls:$R1src, cls:$R3))]> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+class CmpSwapRSY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ RegisterOperand cls, AddressingMode mode = bdaddr20only>
+ : InstRSY<opcode, (outs cls:$R1), (ins cls:$R1src, cls:$R3, mode:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2",
+ [(set cls:$R1, (operator mode:$BD2, cls:$R1src, cls:$R3))]> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+multiclass CmpSwapRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode,
+ SDPatternOperator operator, RegisterOperand cls> {
+ let Function = mnemonic ## #cls in {
+ let PairType = "12" in
+ def "" : CmpSwapRS<mnemonic, rsOpcode, operator, cls, bdaddr12pair>;
+ let PairType = "20" in
+ def Y : CmpSwapRSY<mnemonic#"y", rsyOpcode, operator, cls, bdaddr20pair>;
+ }
+}
+
+class RotateSelectRIEf<string mnemonic, bits<16> opcode, RegisterOperand cls1,
+ RegisterOperand cls2>
+ : InstRIEf<opcode, (outs cls1:$R1),
+ (ins cls1:$R1src, cls2:$R2,
+ uimm8zx6:$I3, uimm8zx6:$I4, uimm8zx6:$I5),
+ mnemonic#"\t$R1, $R2, $I3, $I4, $I5", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions
+//===----------------------------------------------------------------------===//
+//
+// Convenience instructions that get lowered to real instructions
+// by either SystemZTargetLowering::EmitInstrWithCustomInserter()
+// or SystemZInstrInfo::expandPostRAPseudo().
+//
+//===----------------------------------------------------------------------===//
+
+class Pseudo<dag outs, dag ins, list<dag> pattern>
+ : InstSystemZ<0, outs, ins, "", pattern> {
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+}
+
+// Implements "$dst = $cc & (8 >> CC) ? $src1 : $src2", where CC is
+// the value of the PSW's 2-bit condition code field.
+class SelectWrapper<RegisterOperand cls>
+ : Pseudo<(outs cls:$dst), (ins cls:$src1, cls:$src2, i8imm:$cc),
+ [(set cls:$dst, (z_select_ccmask cls:$src1, cls:$src2, imm:$cc))]> {
+ let usesCustomInserter = 1;
+ // Although the instructions used by these nodes do not in themselves
+ // change CC, the insertion requires new blocks, and CC cannot be live
+ // across them.
+ let Defs = [CC];
+ let Uses = [CC];
+}
+
+// OPERATOR is ATOMIC_SWAP or an ATOMIC_LOAD_* operation. PAT and OPERAND
+// describe the second (non-memory) operand.
+class AtomicLoadBinary<SDPatternOperator operator, RegisterOperand cls,
+ dag pat, DAGOperand operand>
+ : Pseudo<(outs cls:$dst), (ins bdaddr20only:$ptr, operand:$src2),
+ [(set cls:$dst, (operator bdaddr20only:$ptr, pat))]> {
+ let Defs = [CC];
+ let Has20BitOffset = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
+ let usesCustomInserter = 1;
+}
+
+// Specializations of AtomicLoadBinary.
+class AtomicLoadBinaryReg32<SDPatternOperator operator>
+ : AtomicLoadBinary<operator, GR32, (i32 GR32:$src2), GR32>;
+class AtomicLoadBinaryImm32<SDPatternOperator operator, Immediate imm>
+ : AtomicLoadBinary<operator, GR32, (i32 imm:$src2), imm>;
+class AtomicLoadBinaryReg64<SDPatternOperator operator>
+ : AtomicLoadBinary<operator, GR64, (i64 GR64:$src2), GR64>;
+class AtomicLoadBinaryImm64<SDPatternOperator operator, Immediate imm>
+ : AtomicLoadBinary<operator, GR64, (i64 imm:$src2), imm>;
+
+// OPERATOR is ATOMIC_SWAPW or an ATOMIC_LOADW_* operation. PAT and OPERAND
+// describe the second (non-memory) operand.
+class AtomicLoadWBinary<SDPatternOperator operator, dag pat,
+ DAGOperand operand>
+ : Pseudo<(outs GR32:$dst),
+ (ins bdaddr20only:$ptr, operand:$src2, ADDR32:$bitshift,
+ ADDR32:$negbitshift, uimm32:$bitsize),
+ [(set GR32:$dst, (operator bdaddr20only:$ptr, pat, ADDR32:$bitshift,
+ ADDR32:$negbitshift, uimm32:$bitsize))]> {
+ let Defs = [CC];
+ let Has20BitOffset = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
+ let usesCustomInserter = 1;
+}
+
+// Specializations of AtomicLoadWBinary.
+class AtomicLoadWBinaryReg<SDPatternOperator operator>
+ : AtomicLoadWBinary<operator, (i32 GR32:$src2), GR32>;
+class AtomicLoadWBinaryImm<SDPatternOperator operator, Immediate imm>
+ : AtomicLoadWBinary<operator, (i32 imm:$src2), imm>;
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
new file mode 100644
index 0000000..0d30432
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -0,0 +1,478 @@
+//===-- SystemZInstrInfo.cpp - SystemZ instruction information ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SystemZ implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZInstrInfo.h"
+#include "SystemZInstrBuilder.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define GET_INSTRINFO_CTOR
+#define GET_INSTRMAP_INFO
+#include "SystemZGenInstrInfo.inc"
+
+using namespace llvm;
+
+SystemZInstrInfo::SystemZInstrInfo(SystemZTargetMachine &tm)
+ : SystemZGenInstrInfo(SystemZ::ADJCALLSTACKDOWN, SystemZ::ADJCALLSTACKUP),
+ RI(tm) {
+}
+
+// MI is a 128-bit load or store. Split it into two 64-bit loads or stores,
+// each having the opcode given by NewOpcode.
+void SystemZInstrInfo::splitMove(MachineBasicBlock::iterator MI,
+ unsigned NewOpcode) const {
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineFunction &MF = *MBB->getParent();
+
+ // Get two load or store instructions. Use the original instruction for one
+ // of them (arbitrarily the second here) and create a clone for the other.
+ MachineInstr *EarlierMI = MF.CloneMachineInstr(MI);
+ MBB->insert(MI, EarlierMI);
+
+ // Set up the two 64-bit registers.
+ MachineOperand &HighRegOp = EarlierMI->getOperand(0);
+ MachineOperand &LowRegOp = MI->getOperand(0);
+ HighRegOp.setReg(RI.getSubReg(HighRegOp.getReg(), SystemZ::subreg_high));
+ LowRegOp.setReg(RI.getSubReg(LowRegOp.getReg(), SystemZ::subreg_low));
+
+ // The address in the first (high) instruction is already correct.
+ // Adjust the offset in the second (low) instruction.
+ MachineOperand &HighOffsetOp = EarlierMI->getOperand(2);
+ MachineOperand &LowOffsetOp = MI->getOperand(2);
+ LowOffsetOp.setImm(LowOffsetOp.getImm() + 8);
+
+ // Set the opcodes.
+ unsigned HighOpcode = getOpcodeForOffset(NewOpcode, HighOffsetOp.getImm());
+ unsigned LowOpcode = getOpcodeForOffset(NewOpcode, LowOffsetOp.getImm());
+ assert(HighOpcode && LowOpcode && "Both offsets should be in range");
+
+ EarlierMI->setDesc(get(HighOpcode));
+ MI->setDesc(get(LowOpcode));
+}
+
+// Split ADJDYNALLOC instruction MI.
+void SystemZInstrInfo::splitAdjDynAlloc(MachineBasicBlock::iterator MI) const {
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineFunction &MF = *MBB->getParent();
+ MachineFrameInfo *MFFrame = MF.getFrameInfo();
+ MachineOperand &OffsetMO = MI->getOperand(2);
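+ // The result is the sum of the maximum call frame size, the ABI-defined
+ // call frame size (SystemZMC::CallFrameSize) and the offset carried by the
+ // ADJDYNALLOC pseudo, i.e. the start of the dynamically-allocated area.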
+
+ uint64_t Offset = (MFFrame->getMaxCallFrameSize() +
+ SystemZMC::CallFrameSize +
+ OffsetMO.getImm());
+ unsigned NewOpcode = getOpcodeForOffset(SystemZ::LA, Offset);
+ assert(NewOpcode && "No support for huge argument lists yet");
+ MI->setDesc(get(NewOpcode));
+ OffsetMO.setImm(Offset);
+}
+
+// If MI is a simple load or store for a frame object, return the register
+// it loads or stores and set FrameIndex to the index of the frame object.
+// Return 0 otherwise.
+//
+// Flag is SimpleBDXLoad for loads and SimpleBDXStore for stores.
+static int isSimpleMove(const MachineInstr *MI, int &FrameIndex, int Flag) {
+ const MCInstrDesc &MCID = MI->getDesc();
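+ // A simple move has the frame index as operand 1 (the base), a zero
+ // displacement in operand 2 and no index register in operand 3.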
+ if ((MCID.TSFlags & Flag) &&
+ MI->getOperand(1).isFI() &&
+ MI->getOperand(2).getImm() == 0 &&
+ MI->getOperand(3).getReg() == 0) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ return 0;
+}
+
+unsigned SystemZInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ return isSimpleMove(MI, FrameIndex, SystemZII::SimpleBDXLoad);
+}
+
+unsigned SystemZInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ return isSimpleMove(MI, FrameIndex, SystemZII::SimpleBDXStore);
+}
+
+bool SystemZInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ // Most of the code and comments here are boilerplate.
+
+ // Start from the bottom of the block and work up, examining the
+ // terminator instructions.
+ MachineBasicBlock::iterator I = MBB.end();
+ while (I != MBB.begin()) {
+ --I;
+ if (I->isDebugValue())
+ continue;
+
+ // Working from the bottom, when we see a non-terminator instruction, we're
+ // done.
+ if (!isUnpredicatedTerminator(I))
+ break;
+
+ // A terminator that isn't a branch can't easily be handled by this
+ // analysis.
+ if (!I->isBranch())
+ return true;
+
+ // Can't handle indirect branches.
+ SystemZII::Branch Branch(getBranchInfo(I));
+ if (!Branch.Target->isMBB())
+ return true;
+
+ // Punt on compound branches.
+ if (Branch.Type != SystemZII::BranchNormal)
+ return true;
+
+ if (Branch.CCMask == SystemZ::CCMASK_ANY) {
+ // Handle unconditional branches.
+ if (!AllowModify) {
+ TBB = Branch.Target->getMBB();
+ continue;
+ }
+
+ // If the block has any instructions after a JMP, delete them.
+ while (llvm::next(I) != MBB.end())
+ llvm::next(I)->eraseFromParent();
+
+ Cond.clear();
+ FBB = 0;
+
+ // Delete the JMP if it's equivalent to a fall-through.
+ if (MBB.isLayoutSuccessor(Branch.Target->getMBB())) {
+ TBB = 0;
+ I->eraseFromParent();
+ I = MBB.end();
+ continue;
+ }
+
+ // TBB is used to indicate the unconditional destination.
+ TBB = Branch.Target->getMBB();
+ continue;
+ }
+
+ // Working from the bottom, handle the first conditional branch.
+ if (Cond.empty()) {
+ // FIXME: add X86-style branch swap
+ FBB = TBB;
+ TBB = Branch.Target->getMBB();
+ Cond.push_back(MachineOperand::CreateImm(Branch.CCMask));
+ continue;
+ }
+
+ // Handle subsequent conditional branches.
+ assert(Cond.size() == 1);
+ assert(TBB);
+
+ // Only handle the case where all conditional branches branch to the same
+ // destination.
+ if (TBB != Branch.Target->getMBB())
+ return true;
+
+ // If the conditions are the same, we can leave them alone.
+ unsigned OldCond = Cond[0].getImm();
+ if (OldCond == Branch.CCMask)
+ continue;
+
+ // FIXME: Try combining conditions like X86 does. Should be easy on Z!
+ }
+
+ return false;
+}
+
+unsigned SystemZInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+ // Most of the code and comments here are boilerplate.
+ MachineBasicBlock::iterator I = MBB.end();
+ unsigned Count = 0;
+
+ while (I != MBB.begin()) {
+ --I;
+ if (I->isDebugValue())
+ continue;
+ if (!I->isBranch())
+ break;
+ if (!getBranchInfo(I).Target->isMBB())
+ break;
+ // Remove the branch.
+ I->eraseFromParent();
+ I = MBB.end();
+ ++Count;
+ }
+
+ return Count;
+}
+
+unsigned
+SystemZInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ DebugLoc DL) const {
+ // In this function we output 32-bit branches, which should always
+ // have enough range. They can be shortened and relaxed by later code
+ // in the pipeline, if desired.
+
+ // Shouldn't be a fall through.
+ assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+ assert((Cond.size() == 1 || Cond.size() == 0) &&
+ "SystemZ branch conditions have one component!");
+
+ if (Cond.empty()) {
+ // Unconditional branch?
+ assert(!FBB && "Unconditional branch with multiple successors!");
+ BuildMI(&MBB, DL, get(SystemZ::J)).addMBB(TBB);
+ return 1;
+ }
+
+ // Conditional branch.
+ unsigned Count = 0;
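+ // Cond[0] holds the 4-bit condition-code mask recorded by AnalyzeBranch.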
+ unsigned CC = Cond[0].getImm();
+ BuildMI(&MBB, DL, get(SystemZ::BRC)).addImm(CC).addMBB(TBB);
+ ++Count;
+
+ if (FBB) {
+ // Two-way Conditional branch. Insert the second branch.
+ BuildMI(&MBB, DL, get(SystemZ::J)).addMBB(FBB);
+ ++Count;
+ }
+ return Count;
+}
+
+void
+SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL,
+ unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const {
+ // Split 128-bit GPR moves into two 64-bit moves. This handles ADDR128 too.
+ if (SystemZ::GR128BitRegClass.contains(DestReg, SrcReg)) {
+ copyPhysReg(MBB, MBBI, DL, RI.getSubReg(DestReg, SystemZ::subreg_high),
+ RI.getSubReg(SrcReg, SystemZ::subreg_high), KillSrc);
+ copyPhysReg(MBB, MBBI, DL, RI.getSubReg(DestReg, SystemZ::subreg_low),
+ RI.getSubReg(SrcReg, SystemZ::subreg_low), KillSrc);
+ return;
+ }
+
+ // Everything else needs only one instruction.
+ unsigned Opcode;
+ if (SystemZ::GR32BitRegClass.contains(DestReg, SrcReg))
+ Opcode = SystemZ::LR;
+ else if (SystemZ::GR64BitRegClass.contains(DestReg, SrcReg))
+ Opcode = SystemZ::LGR;
+ else if (SystemZ::FP32BitRegClass.contains(DestReg, SrcReg))
+ Opcode = SystemZ::LER;
+ else if (SystemZ::FP64BitRegClass.contains(DestReg, SrcReg))
+ Opcode = SystemZ::LDR;
+ else if (SystemZ::FP128BitRegClass.contains(DestReg, SrcReg))
+ Opcode = SystemZ::LXR;
+ else
+ llvm_unreachable("Impossible reg-to-reg copy");
+
+ BuildMI(MBB, MBBI, DL, get(Opcode), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+}
+
+void
+SystemZInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill,
+ int FrameIdx,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+ // Callers may expect a single instruction, so keep 128-bit moves
+ // together for now and lower them after register allocation.
+ unsigned LoadOpcode, StoreOpcode;
+ getLoadStoreOpcodes(RC, LoadOpcode, StoreOpcode);
+ addFrameReference(BuildMI(MBB, MBBI, DL, get(StoreOpcode))
+ .addReg(SrcReg, getKillRegState(isKill)), FrameIdx);
+}
+
+void
+SystemZInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FrameIdx,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+ // Callers may expect a single instruction, so keep 128-bit moves
+ // together for now and lower them after register allocation.
+ unsigned LoadOpcode, StoreOpcode;
+ getLoadStoreOpcodes(RC, LoadOpcode, StoreOpcode);
+ addFrameReference(BuildMI(MBB, MBBI, DL, get(LoadOpcode), DestReg),
+ FrameIdx);
+}
+
+bool
+SystemZInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+ switch (MI->getOpcode()) {
+ case SystemZ::L128:
+ splitMove(MI, SystemZ::LG);
+ return true;
+
+ case SystemZ::ST128:
+ splitMove(MI, SystemZ::STG);
+ return true;
+
+ case SystemZ::LX:
+ splitMove(MI, SystemZ::LD);
+ return true;
+
+ case SystemZ::STX:
+ splitMove(MI, SystemZ::STD);
+ return true;
+
+ case SystemZ::ADJDYNALLOC:
+ splitAdjDynAlloc(MI);
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+bool SystemZInstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+ assert(Cond.size() == 1 && "Invalid branch condition!");
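+ // Flipping all four mask bits selects exactly the complementary set of
+ // condition-code values, e.g. a mask of 8 (branch if CC == 0) becomes 7
+ // (branch if CC is 1, 2 or 3).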
+ Cond[0].setImm(Cond[0].getImm() ^ SystemZ::CCMASK_ANY);
+ return false;
+}
+
+uint64_t SystemZInstrInfo::getInstSizeInBytes(const MachineInstr *MI) const {
+ if (MI->getOpcode() == TargetOpcode::INLINEASM) {
+ const MachineFunction *MF = MI->getParent()->getParent();
+ const char *AsmStr = MI->getOperand(0).getSymbolName();
+ return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
+ }
+ return MI->getDesc().getSize();
+}
+
+SystemZII::Branch
+SystemZInstrInfo::getBranchInfo(const MachineInstr *MI) const {
+ switch (MI->getOpcode()) {
+ case SystemZ::BR:
+ case SystemZ::J:
+ case SystemZ::JG:
+ return SystemZII::Branch(SystemZII::BranchNormal, SystemZ::CCMASK_ANY,
+ &MI->getOperand(0));
+
+ case SystemZ::BRC:
+ case SystemZ::BRCL:
+ return SystemZII::Branch(SystemZII::BranchNormal,
+ MI->getOperand(0).getImm(), &MI->getOperand(1));
+
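+ // For the compare-and-branch forms, operand 2 is the condition-code mask
+ // (M3) and operand 3 is the branch target.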
+ case SystemZ::CIJ:
+ case SystemZ::CRJ:
+ return SystemZII::Branch(SystemZII::BranchC, MI->getOperand(2).getImm(),
+ &MI->getOperand(3));
+
+ case SystemZ::CGIJ:
+ case SystemZ::CGRJ:
+ return SystemZII::Branch(SystemZII::BranchCG, MI->getOperand(2).getImm(),
+ &MI->getOperand(3));
+
+ default:
+ llvm_unreachable("Unrecognized branch opcode");
+ }
+}
+
+void SystemZInstrInfo::getLoadStoreOpcodes(const TargetRegisterClass *RC,
+ unsigned &LoadOpcode,
+ unsigned &StoreOpcode) const {
+ if (RC == &SystemZ::GR32BitRegClass || RC == &SystemZ::ADDR32BitRegClass) {
+ LoadOpcode = SystemZ::L;
+ StoreOpcode = SystemZ::ST32;
+ } else if (RC == &SystemZ::GR64BitRegClass ||
+ RC == &SystemZ::ADDR64BitRegClass) {
+ LoadOpcode = SystemZ::LG;
+ StoreOpcode = SystemZ::STG;
+ } else if (RC == &SystemZ::GR128BitRegClass ||
+ RC == &SystemZ::ADDR128BitRegClass) {
+ LoadOpcode = SystemZ::L128;
+ StoreOpcode = SystemZ::ST128;
+ } else if (RC == &SystemZ::FP32BitRegClass) {
+ LoadOpcode = SystemZ::LE;
+ StoreOpcode = SystemZ::STE;
+ } else if (RC == &SystemZ::FP64BitRegClass) {
+ LoadOpcode = SystemZ::LD;
+ StoreOpcode = SystemZ::STD;
+ } else if (RC == &SystemZ::FP128BitRegClass) {
+ LoadOpcode = SystemZ::LX;
+ StoreOpcode = SystemZ::STX;
+ } else
+ llvm_unreachable("Unsupported regclass to load or store");
+}
+
+unsigned SystemZInstrInfo::getOpcodeForOffset(unsigned Opcode,
+ int64_t Offset) const {
+ const MCInstrDesc &MCID = get(Opcode);
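+ // 128-bit accesses are split into two 64-bit halves (see splitMove); the
+ // low half uses a displacement of Offset + 8, so both values must fit.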
+ int64_t Offset2 = (MCID.TSFlags & SystemZII::Is128Bit ? Offset + 8 : Offset);
+ if (isUInt<12>(Offset) && isUInt<12>(Offset2)) {
+ // Get the instruction to use for unsigned 12-bit displacements.
+ int Disp12Opcode = SystemZ::getDisp12Opcode(Opcode);
+ if (Disp12Opcode >= 0)
+ return Disp12Opcode;
+
+ // All address-related instructions can use unsigned 12-bit
+ // displacements.
+ return Opcode;
+ }
+ if (isInt<20>(Offset) && isInt<20>(Offset2)) {
+ // Get the instruction to use for signed 20-bit displacements.
+ int Disp20Opcode = SystemZ::getDisp20Opcode(Opcode);
+ if (Disp20Opcode >= 0)
+ return Disp20Opcode;
+
+ // Check whether Opcode allows signed 20-bit displacements.
+ if (MCID.TSFlags & SystemZII::Has20BitOffset)
+ return Opcode;
+ }
+ return 0;
+}
+
+unsigned SystemZInstrInfo::getCompareAndBranch(unsigned Opcode,
+ const MachineInstr *MI) const {
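+ // CIJ and CGIJ only take signed 8-bit immediates, so an immediate compare
+ // can be fused only if its operand is in that range.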
+ switch (Opcode) {
+ case SystemZ::CR:
+ return SystemZ::CRJ;
+ case SystemZ::CGR:
+ return SystemZ::CGRJ;
+ case SystemZ::CHI:
+ return MI && isInt<8>(MI->getOperand(1).getImm()) ? SystemZ::CIJ : 0;
+ case SystemZ::CGHI:
+ return MI && isInt<8>(MI->getOperand(1).getImm()) ? SystemZ::CGIJ : 0;
+ default:
+ return 0;
+ }
+}
+
+void SystemZInstrInfo::loadImmediate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned Reg, uint64_t Value) const {
+ DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+ unsigned Opcode;
+ if (isInt<16>(Value))
+ Opcode = SystemZ::LGHI;
+ else if (SystemZ::isImmLL(Value))
+ Opcode = SystemZ::LLILL;
+ else if (SystemZ::isImmLH(Value)) {
+ Opcode = SystemZ::LLILH;
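+ // LLILH supplies the second-lowest halfword of the register, so the
+ // immediate must be shifted right by 16 before being emitted.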
+ Value >>= 16;
+ } else {
+ assert(isInt<32>(Value) && "Huge values not handled yet");
+ Opcode = SystemZ::LGFI;
+ }
+ BuildMI(MBB, MBBI, DL, get(Opcode), Reg).addImm(Value);
+}
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h
new file mode 100644
index 0000000..d6980f7
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -0,0 +1,158 @@
+//===-- SystemZInstrInfo.h - SystemZ instruction information ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SystemZ implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_SYSTEMZINSTRINFO_H
+#define LLVM_TARGET_SYSTEMZINSTRINFO_H
+
+#include "SystemZ.h"
+#include "SystemZRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "SystemZGenInstrInfo.inc"
+
+namespace llvm {
+
+class SystemZTargetMachine;
+
+namespace SystemZII {
+ enum {
+ // See comments in SystemZInstrFormats.td.
+ SimpleBDXLoad = (1 << 0),
+ SimpleBDXStore = (1 << 1),
+ Has20BitOffset = (1 << 2),
+ HasIndex = (1 << 3),
+ Is128Bit = (1 << 4)
+ };
+ // SystemZ MachineOperand target flags.
+ enum {
+ // Masks out the bits for the access model.
+ MO_SYMBOL_MODIFIER = (1 << 0),
+
+ // @GOT (aka @GOTENT)
+ MO_GOT = (1 << 0)
+ };
+ // Classifies a branch.
+ enum BranchType {
+ // An instruction that branches on the current value of CC.
+ BranchNormal,
+
+ // An instruction that performs a 32-bit signed comparison and branches
+ // on the result.
+ BranchC,
+
+ // An instruction that performs a 64-bit signed comparison and branches
+ // on the result.
+ BranchCG
+ };
+ // Information about a branch instruction.
+ struct Branch {
+ // The type of the branch.
+ BranchType Type;
+
+ // CCMASK_<N> is set if the branch should be taken when CC == N.
+ unsigned CCMask;
+
+ // The target of the branch.
+ const MachineOperand *Target;
+
+ Branch(BranchType type, unsigned ccMask, const MachineOperand *target)
+ : Type(type), CCMask(ccMask), Target(target) {}
+ };
+}
+
+class SystemZInstrInfo : public SystemZGenInstrInfo {
+ const SystemZRegisterInfo RI;
+
+ void splitMove(MachineBasicBlock::iterator MI, unsigned NewOpcode) const;
+ void splitAdjDynAlloc(MachineBasicBlock::iterator MI) const;
+
+public:
+ explicit SystemZInstrInfo(SystemZTargetMachine &TM);
+
+ // Override TargetInstrInfo.
+ virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const LLVM_OVERRIDE;
+ virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const LLVM_OVERRIDE;
+ virtual bool AnalyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const LLVM_OVERRIDE;
+ virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const LLVM_OVERRIDE;
+ virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond,
+ DebugLoc DL) const LLVM_OVERRIDE;
+ virtual void copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL,
+ unsigned DestReg, unsigned SrcReg,
+ bool KillSrc) const LLVM_OVERRIDE;
+ virtual void
+ storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const LLVM_OVERRIDE;
+ virtual void
+ loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FrameIdx,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const LLVM_OVERRIDE;
+ virtual bool
+ expandPostRAPseudo(MachineBasicBlock::iterator MBBI) const LLVM_OVERRIDE;
+ virtual bool
+ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const
+ LLVM_OVERRIDE;
+
+ // Return the SystemZRegisterInfo, which this class owns.
+ const SystemZRegisterInfo &getRegisterInfo() const { return RI; }
+
+ // Return the size in bytes of MI.
+ uint64_t getInstSizeInBytes(const MachineInstr *MI) const;
+
+ // Return true if MI is a conditional or unconditional branch.
+ // When returning true, set Cond to the mask of condition-code
+ // values on which the instruction will branch, and set Target
+ // to the operand that contains the branch target. This target
+ // can be a register or a basic block.
+ SystemZII::Branch getBranchInfo(const MachineInstr *MI) const;
+
+ // Get the load and store opcodes for a given register class.
+ void getLoadStoreOpcodes(const TargetRegisterClass *RC,
+ unsigned &LoadOpcode, unsigned &StoreOpcode) const;
+
+ // Opcode is the opcode of an instruction that has an address operand,
+ // and the caller wants to perform that instruction's operation on an
+ // address that has displacement Offset. Return the opcode of a suitable
+ // instruction (which might be Opcode itself) or 0 if no such instruction
+ // exists.
+ unsigned getOpcodeForOffset(unsigned Opcode, int64_t Offset) const;
+
+ // If Opcode is a COMPARE opcode for which an associated COMPARE AND
+ // BRANCH exists, return the opcode for the latter, otherwise return 0.
+ // MI, if nonnull, is the compare instruction.
+ unsigned getCompareAndBranch(unsigned Opcode,
+ const MachineInstr *MI = 0) const;
+
+ // Emit code before MBBI in MBB to move immediate value Value into
+ // physical register Reg.
+ void loadImmediate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned Reg, uint64_t Value) const;
+};
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td
new file mode 100644
index 0000000..c9ec6bc
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -0,0 +1,1017 @@
+//===-- SystemZInstrInfo.td - General SystemZ instructions ----*- tblgen-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Stack allocation
+//===----------------------------------------------------------------------===//
+
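+// These pseudos delimit a call sequence and carry the argument-area sizes;
+// they are removed when call frame pseudo instructions are eliminated.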
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt),
+ [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
+ [(callseq_end timm:$amt1, timm:$amt2)]>;
+
+let neverHasSideEffects = 1 in {
+ // Takes as input the value of the stack pointer after a dynamic allocation
+ // has been made. Sets the output to the address of the dynamically-
+ // allocated area itself, skipping the outgoing arguments.
+ //
+ // This expands to an LA or LAY instruction. We restrict the offset
+ // to the range of LA and keep the LAY range in reserve for when
+ // the size of the outgoing arguments is added.
+ def ADJDYNALLOC : Pseudo<(outs GR64:$dst), (ins dynalloc12only:$src),
+ [(set GR64:$dst, dynalloc12only:$src)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Control flow instructions
+//===----------------------------------------------------------------------===//
+
+// A return instruction. R1 is the condition-code mask (all 1s)
+// and R2 is the target address, which is always stored in %r14.
+let isReturn = 1, isTerminator = 1, isBarrier = 1, hasCtrlDep = 1,
+ R1 = 15, R2 = 14, isCodeGenOnly = 1 in {
+ def RET : InstRR<0x07, (outs), (ins), "br\t%r14", [(z_retflag)]>;
+}
+
+// Unconditional branches. R1 is the condition-code mask (all 1s).
+let isBranch = 1, isTerminator = 1, isBarrier = 1, R1 = 15 in {
+ let isIndirectBranch = 1 in
+ def BR : InstRR<0x07, (outs), (ins ADDR64:$R2),
+ "br\t$R2", [(brind ADDR64:$R2)]>;
+
+ // An assembler extended mnemonic for BRC.
+ def J : InstRI<0xA74, (outs), (ins brtarget16:$I2), "j\t$I2",
+ [(br bb:$I2)]>;
+
+ // An assembler extended mnemonic for BRCL. (The extension is "G"
+ // rather than "L" because "JL" is "Jump if Less".)
+ def JG : InstRIL<0xC04, (outs), (ins brtarget32:$I2), "jg\t$I2", []>;
+}
+
+// Conditional branches. It's easier for LLVM to handle these branches
+// in their raw BRC/BRCL form, with the 4-bit condition-code mask being
+// the first operand. It seems friendlier to use mnemonic forms like
+// JE and JLH when writing out the assembly though.
+//
+// Using a custom inserter for BRC gives us a chance to convert the BRC
+// and a preceding compare into a single compare-and-branch instruction.
+// The inserter makes no change in cases where a separate branch really
+// is needed.
+multiclass CondBranches<Operand ccmask, string short, string long> {
+ let isBranch = 1, isTerminator = 1, Uses = [CC] in {
+ def "" : InstRI<0xA74, (outs), (ins ccmask:$R1, brtarget16:$I2), short, []>;
+ def L : InstRIL<0xC04, (outs), (ins ccmask:$R1, brtarget32:$I2), long, []>;
+ }
+}
+let isCodeGenOnly = 1, usesCustomInserter = 1 in
+ defm BRC : CondBranches<cond4, "j$R1\t$I2", "jg$R1\t$I2">;
+defm AsmBRC : CondBranches<uimm8zx4, "brc\t$R1, $I2", "brcl\t$R1, $I2">;
+
+def : Pat<(z_br_ccmask cond4:$cond, bb:$dst), (BRC cond4:$cond, bb:$dst)>;
+
+// Fused compare-and-branch instructions. As for normal branches,
+// we handle these instructions internally in their raw CRJ-like form,
+// but use assembly macros like CRJE when writing them out.
+//
+// These instructions do not use or clobber the condition codes.
+// We nevertheless pretend that they clobber CC, so that we can lower
+// them to separate comparisons and BRCLs if the branch ends up being
+// out of range.
+multiclass CompareBranches<Operand ccmask, string pos1, string pos2> {
+ let isBranch = 1, isTerminator = 1, Defs = [CC] in {
+ def RJ : InstRIEb<0xEC76, (outs), (ins GR32:$R1, GR32:$R2, ccmask:$M3,
+ brtarget16:$RI4),
+ "crj"##pos1##"\t$R1, $R2, "##pos2##"$RI4", []>;
+ def GRJ : InstRIEb<0xEC64, (outs), (ins GR64:$R1, GR64:$R2, ccmask:$M3,
+ brtarget16:$RI4),
+ "cgrj"##pos1##"\t$R1, $R2, "##pos2##"$RI4", []>;
+ def IJ : InstRIEc<0xEC7E, (outs), (ins GR32:$R1, imm32sx8:$I2, ccmask:$M3,
+ brtarget16:$RI4),
+ "cij"##pos1##"\t$R1, $I2, "##pos2##"$RI4", []>;
+ def GIJ : InstRIEc<0xEC7C, (outs), (ins GR64:$R1, imm64sx8:$I2, ccmask:$M3,
+ brtarget16:$RI4),
+ "cgij"##pos1##"\t$R1, $I2, "##pos2##"$RI4", []>;
+ }
+}
+let isCodeGenOnly = 1 in
+ defm C : CompareBranches<cond4, "$M3", "">;
+defm AsmC : CompareBranches<uimm8zx4, "", "$M3, ">;
+
+// Define AsmParser mnemonics for each general condition-code mask
+// (integer or floating-point)
+multiclass CondExtendedMnemonic<bits<4> ccmask, string name> {
+ let R1 = ccmask in {
+ def "" : InstRI<0xA74, (outs), (ins brtarget16:$I2),
+ "j"##name##"\t$I2", []>;
+ def L : InstRIL<0xC04, (outs), (ins brtarget32:$I2),
+ "jg"##name##"\t$I2", []>;
+ }
+}
+defm AsmJO : CondExtendedMnemonic<1, "o">;
+defm AsmJH : CondExtendedMnemonic<2, "h">;
+defm AsmJNLE : CondExtendedMnemonic<3, "nle">;
+defm AsmJL : CondExtendedMnemonic<4, "l">;
+defm AsmJNHE : CondExtendedMnemonic<5, "nhe">;
+defm AsmJLH : CondExtendedMnemonic<6, "lh">;
+defm AsmJNE : CondExtendedMnemonic<7, "ne">;
+defm AsmJE : CondExtendedMnemonic<8, "e">;
+defm AsmJNLH : CondExtendedMnemonic<9, "nlh">;
+defm AsmJHE : CondExtendedMnemonic<10, "he">;
+defm AsmJNL : CondExtendedMnemonic<11, "nl">;
+defm AsmJLE : CondExtendedMnemonic<12, "le">;
+defm AsmJNH : CondExtendedMnemonic<13, "nh">;
+defm AsmJNO : CondExtendedMnemonic<14, "no">;
+
+// Define AsmParser mnemonics for each integer condition-code mask.
+// This is like the list above, except that condition 3 is not possible
+// and that the low bit of the mask is therefore always 0. This means
+// that each condition has two names. Conditions "o" and "no" are not used.
+//
+// We don't make one of the two names an alias of the other because
+// we need the custom parsing routines to select the correct register class.
+multiclass IntCondExtendedMnemonicA<bits<4> ccmask, string name> {
+ let M3 = ccmask in {
+ def CR : InstRIEb<0xEC76, (outs), (ins GR32:$R1, GR32:$R2,
+ brtarget16:$RI4),
+ "crj"##name##"\t$R1, $R2, $RI4", []>;
+ def CGR : InstRIEb<0xEC64, (outs), (ins GR64:$R1, GR64:$R2,
+ brtarget16:$RI4),
+ "cgrj"##name##"\t$R1, $R2, $RI4", []>;
+ def CI : InstRIEc<0xEC7E, (outs), (ins GR32:$R1, imm32sx8:$I2,
+ brtarget16:$RI4),
+ "cij"##name##"\t$R1, $I2, $RI4", []>;
+ def CGI : InstRIEc<0xEC7C, (outs), (ins GR64:$R1, imm64sx8:$I2,
+ brtarget16:$RI4),
+ "cgij"##name##"\t$R1, $I2, $RI4", []>;
+ }
+}
+multiclass IntCondExtendedMnemonic<bits<4> ccmask, string name1, string name2>
+ : IntCondExtendedMnemonicA<ccmask, name1> {
+ let isAsmParserOnly = 1 in
+ defm Alt : IntCondExtendedMnemonicA<ccmask, name2>;
+}
+defm AsmJH : IntCondExtendedMnemonic<2, "h", "nle">;
+defm AsmJL : IntCondExtendedMnemonic<4, "l", "nhe">;
+defm AsmJLH : IntCondExtendedMnemonic<6, "lh", "ne">;
+defm AsmJE : IntCondExtendedMnemonic<8, "e", "nlh">;
+defm AsmJHE : IntCondExtendedMnemonic<10, "he", "nl">;
+defm AsmJLE : IntCondExtendedMnemonic<12, "le", "nh">;
+
+def Select32 : SelectWrapper<GR32>;
+def Select64 : SelectWrapper<GR64>;
+
+//===----------------------------------------------------------------------===//
+// Call instructions
+//===----------------------------------------------------------------------===//
+
+// The definitions here are for the call-clobbered registers.
+let isCall = 1, Defs = [R0D, R1D, R2D, R3D, R4D, R5D, R14D,
+ F0D, F1D, F2D, F3D, F4D, F5D, F6D, F7D],
+ R1 = 14, isCodeGenOnly = 1 in {
+ def BRAS : InstRI<0xA75, (outs), (ins pcrel16call:$I2, variable_ops),
+ "bras\t%r14, $I2", []>;
+ def BRASL : InstRIL<0xC05, (outs), (ins pcrel32call:$I2, variable_ops),
+ "brasl\t%r14, $I2", [(z_call pcrel32call:$I2)]>;
+ def BASR : InstRR<0x0D, (outs), (ins ADDR64:$R2, variable_ops),
+ "basr\t%r14, $R2", [(z_call ADDR64:$R2)]>;
+}
+
+// Define the general form of the call instructions for the asm parser.
+// These instructions don't hard-code %r14 as the return address register.
+def AsmBRAS : InstRI<0xA75, (outs), (ins GR64:$R1, brtarget16:$I2),
+ "bras\t$R1, $I2", []>;
+def AsmBRASL : InstRIL<0xC05, (outs), (ins GR64:$R1, brtarget32:$I2),
+ "brasl\t$R1, $I2", []>;
+def AsmBASR : InstRR<0x0D, (outs), (ins GR64:$R1, ADDR64:$R2),
+ "basr\t$R1, $R2", []>;
+
+//===----------------------------------------------------------------------===//
+// Move instructions
+//===----------------------------------------------------------------------===//
+
+// Register moves.
+let neverHasSideEffects = 1 in {
+ def LR : UnaryRR <"lr", 0x18, null_frag, GR32, GR32>;
+ def LGR : UnaryRRE<"lgr", 0xB904, null_frag, GR64, GR64>;
+}
+
+// Immediate moves.
+let neverHasSideEffects = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
+ // 16-bit sign-extended immediates.
+ def LHI : UnaryRI<"lhi", 0xA78, bitconvert, GR32, imm32sx16>;
+ def LGHI : UnaryRI<"lghi", 0xA79, bitconvert, GR64, imm64sx16>;
+
+ // Other 16-bit immediates.
+ def LLILL : UnaryRI<"llill", 0xA5F, bitconvert, GR64, imm64ll16>;
+ def LLILH : UnaryRI<"llilh", 0xA5E, bitconvert, GR64, imm64lh16>;
+ def LLIHL : UnaryRI<"llihl", 0xA5D, bitconvert, GR64, imm64hl16>;
+ def LLIHH : UnaryRI<"llihh", 0xA5C, bitconvert, GR64, imm64hh16>;
+
+ // 32-bit immediates.
+ def LGFI : UnaryRIL<"lgfi", 0xC01, bitconvert, GR64, imm64sx32>;
+ def LLILF : UnaryRIL<"llilf", 0xC0F, bitconvert, GR64, imm64lf32>;
+ def LLIHF : UnaryRIL<"llihf", 0xC0E, bitconvert, GR64, imm64hf32>;
+}
+
+// Register loads.
+let canFoldAsLoad = 1, SimpleBDXLoad = 1 in {
+ defm L : UnaryRXPair<"l", 0x58, 0xE358, load, GR32>;
+ def LRL : UnaryRILPC<"lrl", 0xC4D, aligned_load, GR32>;
+
+ def LG : UnaryRXY<"lg", 0xE304, load, GR64>;
+ def LGRL : UnaryRILPC<"lgrl", 0xC48, aligned_load, GR64>;
+
+ // These instructions are split after register allocation, so we don't
+ // want a custom inserter.
+ let Has20BitOffset = 1, HasIndex = 1, Is128Bit = 1 in {
+ def L128 : Pseudo<(outs GR128:$dst), (ins bdxaddr20only128:$src),
+ [(set GR128:$dst, (load bdxaddr20only128:$src))]>;
+ }
+}
+
+// Register stores.
+let SimpleBDXStore = 1 in {
+ let isCodeGenOnly = 1 in {
+ defm ST32 : StoreRXPair<"st", 0x50, 0xE350, store, GR32>;
+ def STRL32 : StoreRILPC<"strl", 0xC4F, aligned_store, GR32>;
+ }
+
+ def STG : StoreRXY<"stg", 0xE324, store, GR64>;
+ def STGRL : StoreRILPC<"stgrl", 0xC4B, aligned_store, GR64>;
+
+ // These instructions are split after register allocation, so we don't
+ // want a custom inserter.
+ let Has20BitOffset = 1, HasIndex = 1, Is128Bit = 1 in {
+ def ST128 : Pseudo<(outs), (ins GR128:$src, bdxaddr20only128:$dst),
+ [(store GR128:$src, bdxaddr20only128:$dst)]>;
+ }
+}
+
+// 8-bit immediate stores to 8-bit fields.
+defm MVI : StoreSIPair<"mvi", 0x92, 0xEB52, truncstorei8, imm32zx8trunc>;
+
+// 16-bit immediate stores to 16-, 32- or 64-bit fields.
+def MVHHI : StoreSIL<"mvhhi", 0xE544, truncstorei16, imm32sx16trunc>;
+def MVHI : StoreSIL<"mvhi", 0xE54C, store, imm32sx16>;
+def MVGHI : StoreSIL<"mvghi", 0xE548, store, imm64sx16>;
+
+//===----------------------------------------------------------------------===//
+// Sign extensions
+//===----------------------------------------------------------------------===//
+
+// 32-bit extensions from registers.
+let neverHasSideEffects = 1 in {
+ def LBR : UnaryRRE<"lbr", 0xB926, sext8, GR32, GR32>;
+ def LHR : UnaryRRE<"lhr", 0xB927, sext16, GR32, GR32>;
+}
+
+// 64-bit extensions from registers.
+let neverHasSideEffects = 1 in {
+ def LGBR : UnaryRRE<"lgbr", 0xB906, sext8, GR64, GR64>;
+ def LGHR : UnaryRRE<"lghr", 0xB907, sext16, GR64, GR64>;
+ def LGFR : UnaryRRE<"lgfr", 0xB914, sext32, GR64, GR32>;
+}
+
+// Match 32-to-64-bit sign extensions in which the source is already
+// in a 64-bit register.
+def : Pat<(sext_inreg GR64:$src, i32),
+ (LGFR (EXTRACT_SUBREG GR64:$src, subreg_32bit))>;
+
+// 32-bit extensions from memory.
+def LB : UnaryRXY<"lb", 0xE376, sextloadi8, GR32>;
+defm LH : UnaryRXPair<"lh", 0x48, 0xE378, sextloadi16, GR32>;
+def LHRL : UnaryRILPC<"lhrl", 0xC45, aligned_sextloadi16, GR32>;
+
+// 64-bit extensions from memory.
+def LGB : UnaryRXY<"lgb", 0xE377, sextloadi8, GR64>;
+def LGH : UnaryRXY<"lgh", 0xE315, sextloadi16, GR64>;
+def LGF : UnaryRXY<"lgf", 0xE314, sextloadi32, GR64>;
+def LGHRL : UnaryRILPC<"lghrl", 0xC44, aligned_sextloadi16, GR64>;
+def LGFRL : UnaryRILPC<"lgfrl", 0xC4C, aligned_sextloadi32, GR64>;
+
+// If the sign of a load-extend operation doesn't matter, use the signed ones.
+// There's not really much to choose between the sign and zero extensions,
+// but LH is more compact than LLH for small offsets.
+def : Pat<(i32 (extloadi8 bdxaddr20only:$src)), (LB bdxaddr20only:$src)>;
+def : Pat<(i32 (extloadi16 bdxaddr12pair:$src)), (LH bdxaddr12pair:$src)>;
+def : Pat<(i32 (extloadi16 bdxaddr20pair:$src)), (LHY bdxaddr20pair:$src)>;
+
+def : Pat<(i64 (extloadi8 bdxaddr20only:$src)), (LGB bdxaddr20only:$src)>;
+def : Pat<(i64 (extloadi16 bdxaddr20only:$src)), (LGH bdxaddr20only:$src)>;
+def : Pat<(i64 (extloadi32 bdxaddr20only:$src)), (LGF bdxaddr20only:$src)>;
+
+//===----------------------------------------------------------------------===//
+// Zero extensions
+//===----------------------------------------------------------------------===//
+
+// 32-bit extensions from registers.
+let neverHasSideEffects = 1 in {
+ def LLCR : UnaryRRE<"llcr", 0xB994, zext8, GR32, GR32>;
+ def LLHR : UnaryRRE<"llhr", 0xB995, zext16, GR32, GR32>;
+}
+
+// 64-bit extensions from registers.
+let neverHasSideEffects = 1 in {
+ def LLGCR : UnaryRRE<"llgcr", 0xB984, zext8, GR64, GR64>;
+ def LLGHR : UnaryRRE<"llghr", 0xB985, zext16, GR64, GR64>;
+ def LLGFR : UnaryRRE<"llgfr", 0xB916, zext32, GR64, GR32>;
+}
+
+// Match 32-to-64-bit zero extensions in which the source is already
+// in a 64-bit register.
+def : Pat<(and GR64:$src, 0xffffffff),
+ (LLGFR (EXTRACT_SUBREG GR64:$src, subreg_32bit))>;
+
+// 32-bit extensions from memory.
+def LLC : UnaryRXY<"llc", 0xE394, zextloadi8, GR32>;
+def LLH : UnaryRXY<"llh", 0xE395, zextloadi16, GR32>;
+def LLHRL : UnaryRILPC<"llhrl", 0xC42, aligned_zextloadi16, GR32>;
+
+// 64-bit extensions from memory.
+def LLGC : UnaryRXY<"llgc", 0xE390, zextloadi8, GR64>;
+def LLGH : UnaryRXY<"llgh", 0xE391, zextloadi16, GR64>;
+def LLGF : UnaryRXY<"llgf", 0xE316, zextloadi32, GR64>;
+def LLGHRL : UnaryRILPC<"llghrl", 0xC46, aligned_zextloadi16, GR64>;
+def LLGFRL : UnaryRILPC<"llgfrl", 0xC4E, aligned_zextloadi32, GR64>;
+
+//===----------------------------------------------------------------------===//
+// Truncations
+//===----------------------------------------------------------------------===//
+
+// Truncations of 64-bit registers to 32-bit registers.
+def : Pat<(i32 (trunc GR64:$src)),
+ (EXTRACT_SUBREG GR64:$src, subreg_32bit)>;
+
+// Truncations of 32-bit registers to memory.
+let isCodeGenOnly = 1 in {
+ defm STC32 : StoreRXPair<"stc", 0x42, 0xE372, truncstorei8, GR32>;
+ defm STH32 : StoreRXPair<"sth", 0x40, 0xE370, truncstorei16, GR32>;
+ def STHRL32 : StoreRILPC<"sthrl", 0xC47, aligned_truncstorei16, GR32>;
+}
+
+// Truncations of 64-bit registers to memory.
+defm STC : StoreRXPair<"stc", 0x42, 0xE372, truncstorei8, GR64>;
+defm STH : StoreRXPair<"sth", 0x40, 0xE370, truncstorei16, GR64>;
+def STHRL : StoreRILPC<"sthrl", 0xC47, aligned_truncstorei16, GR64>;
+defm ST : StoreRXPair<"st", 0x50, 0xE350, truncstorei32, GR64>;
+def STRL : StoreRILPC<"strl", 0xC4F, aligned_truncstorei32, GR64>;
+
+//===----------------------------------------------------------------------===//
+// Multi-register moves
+//===----------------------------------------------------------------------===//
+
+// Multi-register loads.
+def LMG : LoadMultipleRSY<"lmg", 0xEB04, GR64>;
+
+// Multi-register stores.
+def STMG : StoreMultipleRSY<"stmg", 0xEB24, GR64>;
+
+//===----------------------------------------------------------------------===//
+// Byte swaps
+//===----------------------------------------------------------------------===//
+
+// Byte-swapping register moves.
+let neverHasSideEffects = 1 in {
+ def LRVR : UnaryRRE<"lrvr", 0xB91F, bswap, GR32, GR32>;
+ def LRVGR : UnaryRRE<"lrvgr", 0xB90F, bswap, GR64, GR64>;
+}
+
+// Byte-swapping loads. Unlike normal loads, these instructions are
+// allowed to access storage more than once.
+def LRV : UnaryRXY<"lrv", 0xE31E, loadu<bswap, nonvolatile_load>, GR32>;
+def LRVG : UnaryRXY<"lrvg", 0xE30F, loadu<bswap, nonvolatile_load>, GR64>;
+
+// Likewise byte-swapping stores.
+def STRV : StoreRXY<"strv", 0xE33E, storeu<bswap, nonvolatile_store>, GR32>;
+def STRVG : StoreRXY<"strvg", 0xE32F, storeu<bswap, nonvolatile_store>, GR64>;
+
+//===----------------------------------------------------------------------===//
+// Load address instructions
+//===----------------------------------------------------------------------===//
+
+// Load BDX-style addresses.
+let neverHasSideEffects = 1, Function = "la" in {
+ let PairType = "12" in
+ def LA : InstRX<0x41, (outs GR64:$R1), (ins laaddr12pair:$XBD2),
+ "la\t$R1, $XBD2",
+ [(set GR64:$R1, laaddr12pair:$XBD2)]>;
+ let PairType = "20" in
+ def LAY : InstRXY<0xE371, (outs GR64:$R1), (ins laaddr20pair:$XBD2),
+ "lay\t$R1, $XBD2",
+ [(set GR64:$R1, laaddr20pair:$XBD2)]>;
+}
+
+// Load a PC-relative address. There's no version of this instruction
+// with a 16-bit offset, so there's no relaxation.
+let neverHasSideEffects = 1 in {
+ def LARL : InstRIL<0xC00, (outs GR64:$R1), (ins pcrel32:$I2),
+ "larl\t$R1, $I2",
+ [(set GR64:$R1, pcrel32:$I2)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Negation
+//===----------------------------------------------------------------------===//
+
+let Defs = [CC] in {
+ def LCR : UnaryRR <"lcr", 0x13, ineg, GR32, GR32>;
+ def LCGR : UnaryRRE<"lcgr", 0xB903, ineg, GR64, GR64>;
+ def LCGFR : UnaryRRE<"lcgfr", 0xB913, null_frag, GR64, GR32>;
+}
+defm : SXU<ineg, LCGFR>;
+
+//===----------------------------------------------------------------------===//
+// Insertion
+//===----------------------------------------------------------------------===//
+
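+// INSERT CHARACTER replaces the low 8 bits of the first operand with a byte
+// loaded from memory, leaving the other bits unchanged.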
+let isCodeGenOnly = 1 in
+ defm IC32 : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR32, zextloadi8>;
+defm IC : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR64, zextloadi8>;
+
+defm : InsertMem<"inserti8", IC32, GR32, zextloadi8, bdxaddr12pair>;
+defm : InsertMem<"inserti8", IC32Y, GR32, zextloadi8, bdxaddr20pair>;
+
+defm : InsertMem<"inserti8", IC, GR64, zextloadi8, bdxaddr12pair>;
+defm : InsertMem<"inserti8", ICY, GR64, zextloadi8, bdxaddr20pair>;
+
+// Insertions of a 16-bit immediate, leaving other bits unaffected.
+// We don't have or_as_insert equivalents of these operations because
+// OI is available instead.
+let isCodeGenOnly = 1 in {
+ def IILL32 : BinaryRI<"iill", 0xA53, insertll, GR32, imm32ll16>;
+ def IILH32 : BinaryRI<"iilh", 0xA52, insertlh, GR32, imm32lh16>;
+}
+def IILL : BinaryRI<"iill", 0xA53, insertll, GR64, imm64ll16>;
+def IILH : BinaryRI<"iilh", 0xA52, insertlh, GR64, imm64lh16>;
+def IIHL : BinaryRI<"iihl", 0xA51, inserthl, GR64, imm64hl16>;
+def IIHH : BinaryRI<"iihh", 0xA50, inserthh, GR64, imm64hh16>;
+
+// ...likewise for 32-bit immediates. For GR32s this is a general
+// full-width move. (We use IILF rather than something like LLILF
+// for 32-bit moves because IILF leaves the upper 32 bits of the
+// GR64 unchanged.)
+let isCodeGenOnly = 1 in {
+ def IILF32 : UnaryRIL<"iilf", 0xC09, bitconvert, GR32, uimm32>;
+}
+def IILF : BinaryRIL<"iilf", 0xC09, insertlf, GR64, imm64lf32>;
+def IIHF : BinaryRIL<"iihf", 0xC08, inserthf, GR64, imm64hf32>;
+
+// An alternative model of inserthf, with the first operand being
+// a zero-extended value.
+def : Pat<(or (zext32 GR32:$src), imm64hf32:$imm),
+ (IIHF (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_32bit),
+ imm64hf32:$imm)>;
+
+//===----------------------------------------------------------------------===//
+// Addition
+//===----------------------------------------------------------------------===//
+
+// Plain addition.
+let Defs = [CC] in {
+ // Addition of a register.
+ let isCommutable = 1 in {
+ def AR : BinaryRR <"ar", 0x1A, add, GR32, GR32>;
+ def AGR : BinaryRRE<"agr", 0xB908, add, GR64, GR64>;
+ }
+ def AGFR : BinaryRRE<"agfr", 0xB918, null_frag, GR64, GR32>;
+
+ // Addition of signed 16-bit immediates.
+ def AHI : BinaryRI<"ahi", 0xA7A, add, GR32, imm32sx16>;
+ def AGHI : BinaryRI<"aghi", 0xA7B, add, GR64, imm64sx16>;
+
+ // Addition of signed 32-bit immediates.
+ def AFI : BinaryRIL<"afi", 0xC29, add, GR32, simm32>;
+ def AGFI : BinaryRIL<"agfi", 0xC28, add, GR64, imm64sx32>;
+
+ // Addition of memory.
+ defm AH : BinaryRXPair<"ah", 0x4A, 0xE37A, add, GR32, sextloadi16>;
+ defm A : BinaryRXPair<"a", 0x5A, 0xE35A, add, GR32, load>;
+ def AGF : BinaryRXY<"agf", 0xE318, add, GR64, sextloadi32>;
+ def AG : BinaryRXY<"ag", 0xE308, add, GR64, load>;
+
+ // Addition to memory.
+ def ASI : BinarySIY<"asi", 0xEB6A, add, imm32sx8>;
+ def AGSI : BinarySIY<"agsi", 0xEB7A, add, imm64sx8>;
+}
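+// AGFR sign-extends its second operand from 32 bits, so SXB adds the
+// patterns that fold an explicit sign extension into the addition.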
+defm : SXB<add, GR64, AGFR>;
+
+// Addition producing a carry.
+let Defs = [CC] in {
+ // Addition of a register.
+ let isCommutable = 1 in {
+ def ALR : BinaryRR <"alr", 0x1E, addc, GR32, GR32>;
+ def ALGR : BinaryRRE<"algr", 0xB90A, addc, GR64, GR64>;
+ }
+ def ALGFR : BinaryRRE<"algfr", 0xB91A, null_frag, GR64, GR32>;
+
+ // Addition of unsigned 32-bit immediates.
+ def ALFI : BinaryRIL<"alfi", 0xC2B, addc, GR32, uimm32>;
+ def ALGFI : BinaryRIL<"algfi", 0xC2A, addc, GR64, imm64zx32>;
+
+ // Addition of memory.
+ defm AL : BinaryRXPair<"al", 0x5E, 0xE35E, addc, GR32, load>;
+ def ALGF : BinaryRXY<"algf", 0xE31A, addc, GR64, zextloadi32>;
+ def ALG : BinaryRXY<"alg", 0xE30A, addc, GR64, load>;
+}
+defm : ZXB<addc, GR64, ALGFR>;
+
+// Addition producing and using a carry.
+let Defs = [CC], Uses = [CC] in {
+ // Addition of a register.
+ def ALCR : BinaryRRE<"alcr", 0xB998, adde, GR32, GR32>;
+ def ALCGR : BinaryRRE<"alcgr", 0xB988, adde, GR64, GR64>;
+
+ // Addition of memory.
+ def ALC : BinaryRXY<"alc", 0xE398, adde, GR32, load>;
+ def ALCG : BinaryRXY<"alcg", 0xE388, adde, GR64, load>;
+}
+
+//===----------------------------------------------------------------------===//
+// Subtraction
+//===----------------------------------------------------------------------===//
+
+// Plain subtraction. Although immediate forms exist, we use the
+// add-immediate instruction instead.
+let Defs = [CC] in {
+ // Subtraction of a register.
+ def SR : BinaryRR <"sr", 0x1B, sub, GR32, GR32>;
+ def SGFR : BinaryRRE<"sgfr", 0xB919, null_frag, GR64, GR32>;
+ def SGR : BinaryRRE<"sgr", 0xB909, sub, GR64, GR64>;
+
+ // Subtraction of memory.
+ defm SH : BinaryRXPair<"sh", 0x4B, 0xE37B, sub, GR32, sextloadi16>;
+ defm S : BinaryRXPair<"s", 0x5B, 0xE35B, sub, GR32, load>;
+ def SGF : BinaryRXY<"sgf", 0xE319, sub, GR64, sextloadi32>;
+ def SG : BinaryRXY<"sg", 0xE309, sub, GR64, load>;
+}
+defm : SXB<sub, GR64, SGFR>;
+
+// Subtraction producing a carry.
+let Defs = [CC] in {
+ // Subtraction of a register.
+ def SLR : BinaryRR <"slr", 0x1F, subc, GR32, GR32>;
+ def SLGFR : BinaryRRE<"slgfr", 0xB91B, null_frag, GR64, GR32>;
+ def SLGR : BinaryRRE<"slgr", 0xB90B, subc, GR64, GR64>;
+
+ // Subtraction of unsigned 32-bit immediates. These don't match
+ // subc because we prefer addc for constants.
+ def SLFI : BinaryRIL<"slfi", 0xC25, null_frag, GR32, uimm32>;
+ def SLGFI : BinaryRIL<"slgfi", 0xC24, null_frag, GR64, imm64zx32>;
+
+ // Subtraction of memory.
+ defm SL : BinaryRXPair<"sl", 0x5F, 0xE35F, subc, GR32, load>;
+ def SLGF : BinaryRXY<"slgf", 0xE31B, subc, GR64, zextloadi32>;
+ def SLG : BinaryRXY<"slg", 0xE30B, subc, GR64, load>;
+}
+defm : ZXB<subc, GR64, SLGFR>;
+
+// Subtraction producing and using a carry.
+let Defs = [CC], Uses = [CC] in {
+ // Subtraction of a register.
+ def SLBR : BinaryRRE<"slbr", 0xB999, sube, GR32, GR32>;
+ def SLGBR : BinaryRRE<"slbgr", 0xB989, sube, GR64, GR64>;
+
+ // Subtraction of memory.
+ def SLB : BinaryRXY<"slb", 0xE399, sube, GR32, load>;
+ def SLBG : BinaryRXY<"slbg", 0xE389, sube, GR64, load>;
+}
+
+//===----------------------------------------------------------------------===//
+// AND
+//===----------------------------------------------------------------------===//
+
+let Defs = [CC] in {
+ // ANDs of a register.
+ let isCommutable = 1 in {
+ def NR : BinaryRR <"nr", 0x14, and, GR32, GR32>;
+ def NGR : BinaryRRE<"ngr", 0xB980, and, GR64, GR64>;
+ }
+
+ // ANDs of a 16-bit immediate, leaving other bits unaffected.
+ let isCodeGenOnly = 1 in {
+ def NILL32 : BinaryRI<"nill", 0xA57, and, GR32, imm32ll16c>;
+ def NILH32 : BinaryRI<"nilh", 0xA56, and, GR32, imm32lh16c>;
+ }
+ def NILL : BinaryRI<"nill", 0xA57, and, GR64, imm64ll16c>;
+ def NILH : BinaryRI<"nilh", 0xA56, and, GR64, imm64lh16c>;
+ def NIHL : BinaryRI<"nihl", 0xA55, and, GR64, imm64hl16c>;
+ def NIHH : BinaryRI<"nihh", 0xA54, and, GR64, imm64hh16c>;
+
+ // ANDs of a 32-bit immediate, leaving other bits unaffected.
+ let isCodeGenOnly = 1 in
+ def NILF32 : BinaryRIL<"nilf", 0xC0B, and, GR32, uimm32>;
+ def NILF : BinaryRIL<"nilf", 0xC0B, and, GR64, imm64lf32c>;
+ def NIHF : BinaryRIL<"nihf", 0xC0A, and, GR64, imm64hf32c>;
+
+ // ANDs of memory.
+ defm N : BinaryRXPair<"n", 0x54, 0xE354, and, GR32, load>;
+ def NG : BinaryRXY<"ng", 0xE380, and, GR64, load>;
+
+ // AND to memory
+ defm NI : BinarySIPair<"ni", 0x94, 0xEB54, null_frag, uimm8>;
+}
+defm : RMWIByte<and, bdaddr12pair, NI>;
+defm : RMWIByte<and, bdaddr20pair, NIY>;
+
+//===----------------------------------------------------------------------===//
+// OR
+//===----------------------------------------------------------------------===//
+
+let Defs = [CC] in {
+ // ORs of a register.
+ let isCommutable = 1 in {
+ def OR : BinaryRR <"or", 0x16, or, GR32, GR32>;
+ def OGR : BinaryRRE<"ogr", 0xB981, or, GR64, GR64>;
+ }
+
+ // ORs of a 16-bit immediate, leaving other bits unaffected.
+ let isCodeGenOnly = 1 in {
+ def OILL32 : BinaryRI<"oill", 0xA5B, or, GR32, imm32ll16>;
+ def OILH32 : BinaryRI<"oilh", 0xA5A, or, GR32, imm32lh16>;
+ }
+ def OILL : BinaryRI<"oill", 0xA5B, or, GR64, imm64ll16>;
+ def OILH : BinaryRI<"oilh", 0xA5A, or, GR64, imm64lh16>;
+ def OIHL : BinaryRI<"oihl", 0xA59, or, GR64, imm64hl16>;
+ def OIHH : BinaryRI<"oihh", 0xA58, or, GR64, imm64hh16>;
+
+ // ORs of a 32-bit immediate, leaving other bits unaffected.
+ let isCodeGenOnly = 1 in
+ def OILF32 : BinaryRIL<"oilf", 0xC0D, or, GR32, uimm32>;
+ def OILF : BinaryRIL<"oilf", 0xC0D, or, GR64, imm64lf32>;
+ def OIHF : BinaryRIL<"oihf", 0xC0C, or, GR64, imm64hf32>;
+
+ // ORs of memory.
+ defm O : BinaryRXPair<"o", 0x56, 0xE356, or, GR32, load>;
+ def OG : BinaryRXY<"og", 0xE381, or, GR64, load>;
+
+ // OR to memory
+ defm OI : BinarySIPair<"oi", 0x96, 0xEB56, null_frag, uimm8>;
+}
+defm : RMWIByte<or, bdaddr12pair, OI>;
+defm : RMWIByte<or, bdaddr20pair, OIY>;
+
+//===----------------------------------------------------------------------===//
+// XOR
+//===----------------------------------------------------------------------===//
+
+let Defs = [CC] in {
+ // XORs of a register.
+ let isCommutable = 1 in {
+ def XR : BinaryRR <"xr", 0x17, xor, GR32, GR32>;
+ def XGR : BinaryRRE<"xgr", 0xB982, xor, GR64, GR64>;
+ }
+
+ // XORs of a 32-bit immediate, leaving other bits unaffected.
+ let isCodeGenOnly = 1 in
+ def XILF32 : BinaryRIL<"xilf", 0xC07, xor, GR32, uimm32>;
+ def XILF : BinaryRIL<"xilf", 0xC07, xor, GR64, imm64lf32>;
+ def XIHF : BinaryRIL<"xihf", 0xC06, xor, GR64, imm64hf32>;
+
+ // XORs of memory.
+ defm X : BinaryRXPair<"x", 0x57, 0xE357, xor, GR32, load>;
+ def XG : BinaryRXY<"xg", 0xE382, xor, GR64, load>;
+
+ // XOR to memory
+ defm XI : BinarySIPair<"xi", 0x97, 0xEB57, null_frag, uimm8>;
+}
+defm : RMWIByte<xor, bdaddr12pair, XI>;
+defm : RMWIByte<xor, bdaddr20pair, XIY>;
+
+//===----------------------------------------------------------------------===//
+// Multiplication
+//===----------------------------------------------------------------------===//
+
+// Multiplication of a register.
+let isCommutable = 1 in {
+ def MSR : BinaryRRE<"msr", 0xB252, mul, GR32, GR32>;
+ def MSGR : BinaryRRE<"msgr", 0xB90C, mul, GR64, GR64>;
+}
+def MSGFR : BinaryRRE<"msgfr", 0xB91C, null_frag, GR64, GR32>;
+defm : SXB<mul, GR64, MSGFR>;
+
+// Multiplication of a signed 16-bit immediate.
+def MHI : BinaryRI<"mhi", 0xA7C, mul, GR32, imm32sx16>;
+def MGHI : BinaryRI<"mghi", 0xA7D, mul, GR64, imm64sx16>;
+
+// Multiplication of a signed 32-bit immediate.
+def MSFI : BinaryRIL<"msfi", 0xC21, mul, GR32, simm32>;
+def MSGFI : BinaryRIL<"msgfi", 0xC20, mul, GR64, imm64sx32>;
+
+// Multiplication of memory.
+defm MH : BinaryRXPair<"mh", 0x4C, 0xE37C, mul, GR32, sextloadi16>;
+defm MS : BinaryRXPair<"ms", 0x71, 0xE351, mul, GR32, load>;
+def MSGF : BinaryRXY<"msgf", 0xE31C, mul, GR64, sextloadi32>;
+def MSG : BinaryRXY<"msg", 0xE30C, mul, GR64, load>;
+
+// Multiplication of a register, producing two results.
+def MLGR : BinaryRRE<"mlgr", 0xB986, z_umul_lohi64, GR128, GR64>;
+
+// Multiplication of memory, producing two results.
+def MLG : BinaryRXY<"mlg", 0xE386, z_umul_lohi64, GR128, load>;
+
+//===----------------------------------------------------------------------===//
+// Division and remainder
+//===----------------------------------------------------------------------===//
+
+// Division and remainder, from registers.
+def DSGFR : BinaryRRE<"dsgfr", 0xB91D, null_frag, GR128, GR32>;
+def DSGR : BinaryRRE<"dsgr", 0xB90D, z_sdivrem64, GR128, GR64>;
+def DLR : BinaryRRE<"dlr", 0xB997, z_udivrem32, GR128, GR32>;
+def DLGR : BinaryRRE<"dlgr", 0xB987, z_udivrem64, GR128, GR64>;
+defm : SXB<z_sdivrem64, GR128, DSGFR>;
+
+// Division and remainder, from memory.
+def DSGF : BinaryRXY<"dsgf", 0xE31D, z_sdivrem64, GR128, sextloadi32>;
+def DSG : BinaryRXY<"dsg", 0xE30D, z_sdivrem64, GR128, load>;
+def DL : BinaryRXY<"dl", 0xE397, z_udivrem32, GR128, load>;
+def DLG : BinaryRXY<"dlg", 0xE387, z_udivrem64, GR128, load>;
+
+//===----------------------------------------------------------------------===//
+// Shifts
+//===----------------------------------------------------------------------===//
+
+// Shift left.
+let neverHasSideEffects = 1 in {
+ def SLL : ShiftRS <"sll", 0x89, shl, GR32, shift12only>;
+ def SLLG : ShiftRSY<"sllg", 0xEB0D, shl, GR64, shift20only>;
+}
+
+// Logical shift right.
+let neverHasSideEffects = 1 in {
+ def SRL : ShiftRS <"srl", 0x88, srl, GR32, shift12only>;
+ def SRLG : ShiftRSY<"srlg", 0xEB0C, srl, GR64, shift20only>;
+}
+
+// Arithmetic shift right.
+let Defs = [CC] in {
+ def SRA : ShiftRS <"sra", 0x8A, sra, GR32, shift12only>;
+ def SRAG : ShiftRSY<"srag", 0xEB0A, sra, GR64, shift20only>;
+}
+
+// Rotate left.
+let neverHasSideEffects = 1 in {
+ def RLL : ShiftRSY<"rll", 0xEB1D, rotl, GR32, shift20only>;
+ def RLLG : ShiftRSY<"rllg", 0xEB1C, rotl, GR64, shift20only>;
+}
+
+// Rotate second operand left and insert selected bits into first operand.
+// These can act like 32-bit operands provided that the constant start and
+// end bits (operands 2 and 3) are in the range [32, 64).
+let Defs = [CC] in {
+ let isCodeGenOnly = 1 in
+ def RISBG32 : RotateSelectRIEf<"risbg", 0xEC55, GR32, GR32>;
+ def RISBG : RotateSelectRIEf<"risbg", 0xEC55, GR64, GR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// Comparison
+//===----------------------------------------------------------------------===//
+
+// Signed comparisons.
+let Defs = [CC] in {
+ // Comparison with a register.
+ def CR : CompareRR <"cr", 0x19, z_cmp, GR32, GR32>;
+ def CGFR : CompareRRE<"cgfr", 0xB930, null_frag, GR64, GR32>;
+ def CGR : CompareRRE<"cgr", 0xB920, z_cmp, GR64, GR64>;
+
+ // Comparison with a signed 16-bit immediate.
+ def CHI : CompareRI<"chi", 0xA7E, z_cmp, GR32, imm32sx16>;
+ def CGHI : CompareRI<"cghi", 0xA7F, z_cmp, GR64, imm64sx16>;
+
+ // Comparison with a signed 32-bit immediate.
+ def CFI : CompareRIL<"cfi", 0xC2D, z_cmp, GR32, simm32>;
+ def CGFI : CompareRIL<"cgfi", 0xC2C, z_cmp, GR64, imm64sx32>;
+
+ // Comparison with memory.
+ defm CH : CompareRXPair<"ch", 0x49, 0xE379, z_cmp, GR32, sextloadi16>;
+ defm C : CompareRXPair<"c", 0x59, 0xE359, z_cmp, GR32, load>;
+ def CGH : CompareRXY<"cgh", 0xE334, z_cmp, GR64, sextloadi16>;
+ def CGF : CompareRXY<"cgf", 0xE330, z_cmp, GR64, sextloadi32>;
+ def CG : CompareRXY<"cg", 0xE320, z_cmp, GR64, load>;
+ def CHRL : CompareRILPC<"chrl", 0xC65, z_cmp, GR32, aligned_sextloadi16>;
+ def CRL : CompareRILPC<"crl", 0xC6D, z_cmp, GR32, aligned_load>;
+ def CGHRL : CompareRILPC<"cghrl", 0xC64, z_cmp, GR64, aligned_sextloadi16>;
+ def CGFRL : CompareRILPC<"cgfrl", 0xC6C, z_cmp, GR64, aligned_sextloadi32>;
+ def CGRL : CompareRILPC<"cgrl", 0xC68, z_cmp, GR64, aligned_load>;
+
+ // Comparison between memory and a signed 16-bit immediate.
+ def CHHSI : CompareSIL<"chhsi", 0xE554, z_cmp, sextloadi16, imm32sx16>;
+ def CHSI : CompareSIL<"chsi", 0xE55C, z_cmp, load, imm32sx16>;
+ def CGHSI : CompareSIL<"cghsi", 0xE558, z_cmp, load, imm64sx16>;
+}
+defm : SXB<z_cmp, GR64, CGFR>;
+
+// Unsigned comparisons.
+let Defs = [CC] in {
+ // Comparison with a register.
+ def CLR : CompareRR <"clr", 0x15, z_ucmp, GR32, GR32>;
+ def CLGFR : CompareRRE<"clgfr", 0xB931, null_frag, GR64, GR32>;
+ def CLGR : CompareRRE<"clgr", 0xB921, z_ucmp, GR64, GR64>;
+
+  // Comparison with an unsigned 32-bit immediate.
+ def CLFI : CompareRIL<"clfi", 0xC2F, z_ucmp, GR32, uimm32>;
+ def CLGFI : CompareRIL<"clgfi", 0xC2E, z_ucmp, GR64, imm64zx32>;
+
+ // Comparison with memory.
+ defm CL : CompareRXPair<"cl", 0x55, 0xE355, z_ucmp, GR32, load>;
+ def CLGF : CompareRXY<"clgf", 0xE331, z_ucmp, GR64, zextloadi32>;
+ def CLG : CompareRXY<"clg", 0xE321, z_ucmp, GR64, load>;
+ def CLHRL : CompareRILPC<"clhrl", 0xC67, z_ucmp, GR32,
+ aligned_zextloadi16>;
+ def CLRL : CompareRILPC<"clrl", 0xC6F, z_ucmp, GR32,
+ aligned_load>;
+ def CLGHRL : CompareRILPC<"clghrl", 0xC66, z_ucmp, GR64,
+ aligned_zextloadi16>;
+ def CLGFRL : CompareRILPC<"clgfrl", 0xC6E, z_ucmp, GR64,
+ aligned_zextloadi32>;
+ def CLGRL : CompareRILPC<"clgrl", 0xC6A, z_ucmp, GR64,
+ aligned_load>;
+
+ // Comparison between memory and an unsigned 8-bit immediate.
+ defm CLI : CompareSIPair<"cli", 0x95, 0xEB55, z_ucmp, zextloadi8, imm32zx8>;
+
+ // Comparison between memory and an unsigned 16-bit immediate.
+ def CLHHSI : CompareSIL<"clhhsi", 0xE555, z_ucmp, zextloadi16, imm32zx16>;
+ def CLFHSI : CompareSIL<"clfhsi", 0xE55D, z_ucmp, load, imm32zx16>;
+ def CLGHSI : CompareSIL<"clghsi", 0xE559, z_ucmp, load, imm64zx16>;
+}
+defm : ZXB<z_ucmp, GR64, CLGFR>;
+
+//===----------------------------------------------------------------------===//
+// Atomic operations
+//===----------------------------------------------------------------------===//
+
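+// The "...W" operations below act on a value held within an aligned 32-bit
+// word and are used to implement the narrower (8- and 16-bit) atomic
+// operations; the shift and bit-size operands of the corresponding
+// z_atomic_* nodes (see SystemZOperators.td) say where the value lives
+// within that word.  The remaining operations act on full GR32 or GR64
+// values.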
+def ATOMIC_SWAPW : AtomicLoadWBinaryReg<z_atomic_swapw>;
+def ATOMIC_SWAP_32 : AtomicLoadBinaryReg32<atomic_swap_32>;
+def ATOMIC_SWAP_64 : AtomicLoadBinaryReg64<atomic_swap_64>;
+
+def ATOMIC_LOADW_AR : AtomicLoadWBinaryReg<z_atomic_loadw_add>;
+def ATOMIC_LOADW_AFI : AtomicLoadWBinaryImm<z_atomic_loadw_add, simm32>;
+def ATOMIC_LOAD_AR : AtomicLoadBinaryReg32<atomic_load_add_32>;
+def ATOMIC_LOAD_AHI : AtomicLoadBinaryImm32<atomic_load_add_32, imm32sx16>;
+def ATOMIC_LOAD_AFI : AtomicLoadBinaryImm32<atomic_load_add_32, simm32>;
+def ATOMIC_LOAD_AGR : AtomicLoadBinaryReg64<atomic_load_add_64>;
+def ATOMIC_LOAD_AGHI : AtomicLoadBinaryImm64<atomic_load_add_64, imm64sx16>;
+def ATOMIC_LOAD_AGFI : AtomicLoadBinaryImm64<atomic_load_add_64, imm64sx32>;
+
+def ATOMIC_LOADW_SR : AtomicLoadWBinaryReg<z_atomic_loadw_sub>;
+def ATOMIC_LOAD_SR : AtomicLoadBinaryReg32<atomic_load_sub_32>;
+def ATOMIC_LOAD_SGR : AtomicLoadBinaryReg64<atomic_load_sub_64>;
+
+def ATOMIC_LOADW_NR : AtomicLoadWBinaryReg<z_atomic_loadw_and>;
+def ATOMIC_LOADW_NILH : AtomicLoadWBinaryImm<z_atomic_loadw_and, imm32lh16c>;
+def ATOMIC_LOAD_NR : AtomicLoadBinaryReg32<atomic_load_and_32>;
+def ATOMIC_LOAD_NILL32 : AtomicLoadBinaryImm32<atomic_load_and_32, imm32ll16c>;
+def ATOMIC_LOAD_NILH32 : AtomicLoadBinaryImm32<atomic_load_and_32, imm32lh16c>;
+def ATOMIC_LOAD_NILF32 : AtomicLoadBinaryImm32<atomic_load_and_32, uimm32>;
+def ATOMIC_LOAD_NGR : AtomicLoadBinaryReg64<atomic_load_and_64>;
+def ATOMIC_LOAD_NILL : AtomicLoadBinaryImm64<atomic_load_and_64, imm64ll16c>;
+def ATOMIC_LOAD_NILH : AtomicLoadBinaryImm64<atomic_load_and_64, imm64lh16c>;
+def ATOMIC_LOAD_NIHL : AtomicLoadBinaryImm64<atomic_load_and_64, imm64hl16c>;
+def ATOMIC_LOAD_NIHH : AtomicLoadBinaryImm64<atomic_load_and_64, imm64hh16c>;
+def ATOMIC_LOAD_NILF : AtomicLoadBinaryImm64<atomic_load_and_64, imm64lf32c>;
+def ATOMIC_LOAD_NIHF : AtomicLoadBinaryImm64<atomic_load_and_64, imm64hf32c>;
+
+def ATOMIC_LOADW_OR : AtomicLoadWBinaryReg<z_atomic_loadw_or>;
+def ATOMIC_LOADW_OILH : AtomicLoadWBinaryImm<z_atomic_loadw_or, imm32lh16>;
+def ATOMIC_LOAD_OR : AtomicLoadBinaryReg32<atomic_load_or_32>;
+def ATOMIC_LOAD_OILL32 : AtomicLoadBinaryImm32<atomic_load_or_32, imm32ll16>;
+def ATOMIC_LOAD_OILH32 : AtomicLoadBinaryImm32<atomic_load_or_32, imm32lh16>;
+def ATOMIC_LOAD_OILF32 : AtomicLoadBinaryImm32<atomic_load_or_32, uimm32>;
+def ATOMIC_LOAD_OGR : AtomicLoadBinaryReg64<atomic_load_or_64>;
+def ATOMIC_LOAD_OILL : AtomicLoadBinaryImm64<atomic_load_or_64, imm64ll16>;
+def ATOMIC_LOAD_OILH : AtomicLoadBinaryImm64<atomic_load_or_64, imm64lh16>;
+def ATOMIC_LOAD_OIHL : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hl16>;
+def ATOMIC_LOAD_OIHH : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hh16>;
+def ATOMIC_LOAD_OILF : AtomicLoadBinaryImm64<atomic_load_or_64, imm64lf32>;
+def ATOMIC_LOAD_OIHF : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hf32>;
+
+def ATOMIC_LOADW_XR : AtomicLoadWBinaryReg<z_atomic_loadw_xor>;
+def ATOMIC_LOADW_XILF : AtomicLoadWBinaryImm<z_atomic_loadw_xor, uimm32>;
+def ATOMIC_LOAD_XR : AtomicLoadBinaryReg32<atomic_load_xor_32>;
+def ATOMIC_LOAD_XILF32 : AtomicLoadBinaryImm32<atomic_load_xor_32, uimm32>;
+def ATOMIC_LOAD_XGR : AtomicLoadBinaryReg64<atomic_load_xor_64>;
+def ATOMIC_LOAD_XILF : AtomicLoadBinaryImm64<atomic_load_xor_64, imm64lf32>;
+def ATOMIC_LOAD_XIHF : AtomicLoadBinaryImm64<atomic_load_xor_64, imm64hf32>;
+
+def ATOMIC_LOADW_NRi : AtomicLoadWBinaryReg<z_atomic_loadw_nand>;
+def ATOMIC_LOADW_NILHi : AtomicLoadWBinaryImm<z_atomic_loadw_nand,
+ imm32lh16c>;
+def ATOMIC_LOAD_NRi : AtomicLoadBinaryReg32<atomic_load_nand_32>;
+def ATOMIC_LOAD_NILL32i : AtomicLoadBinaryImm32<atomic_load_nand_32,
+ imm32ll16c>;
+def ATOMIC_LOAD_NILH32i : AtomicLoadBinaryImm32<atomic_load_nand_32,
+ imm32lh16c>;
+def ATOMIC_LOAD_NILF32i : AtomicLoadBinaryImm32<atomic_load_nand_32, uimm32>;
+def ATOMIC_LOAD_NGRi : AtomicLoadBinaryReg64<atomic_load_nand_64>;
+def ATOMIC_LOAD_NILLi : AtomicLoadBinaryImm64<atomic_load_nand_64,
+ imm64ll16c>;
+def ATOMIC_LOAD_NILHi : AtomicLoadBinaryImm64<atomic_load_nand_64,
+ imm64lh16c>;
+def ATOMIC_LOAD_NIHLi : AtomicLoadBinaryImm64<atomic_load_nand_64,
+ imm64hl16c>;
+def ATOMIC_LOAD_NIHHi : AtomicLoadBinaryImm64<atomic_load_nand_64,
+ imm64hh16c>;
+def ATOMIC_LOAD_NILFi : AtomicLoadBinaryImm64<atomic_load_nand_64,
+ imm64lf32c>;
+def ATOMIC_LOAD_NIHFi : AtomicLoadBinaryImm64<atomic_load_nand_64,
+ imm64hf32c>;
+
+def ATOMIC_LOADW_MIN : AtomicLoadWBinaryReg<z_atomic_loadw_min>;
+def ATOMIC_LOAD_MIN_32 : AtomicLoadBinaryReg32<atomic_load_min_32>;
+def ATOMIC_LOAD_MIN_64 : AtomicLoadBinaryReg64<atomic_load_min_64>;
+
+def ATOMIC_LOADW_MAX : AtomicLoadWBinaryReg<z_atomic_loadw_max>;
+def ATOMIC_LOAD_MAX_32 : AtomicLoadBinaryReg32<atomic_load_max_32>;
+def ATOMIC_LOAD_MAX_64 : AtomicLoadBinaryReg64<atomic_load_max_64>;
+
+def ATOMIC_LOADW_UMIN : AtomicLoadWBinaryReg<z_atomic_loadw_umin>;
+def ATOMIC_LOAD_UMIN_32 : AtomicLoadBinaryReg32<atomic_load_umin_32>;
+def ATOMIC_LOAD_UMIN_64 : AtomicLoadBinaryReg64<atomic_load_umin_64>;
+
+def ATOMIC_LOADW_UMAX : AtomicLoadWBinaryReg<z_atomic_loadw_umax>;
+def ATOMIC_LOAD_UMAX_32 : AtomicLoadBinaryReg32<atomic_load_umax_32>;
+def ATOMIC_LOAD_UMAX_64 : AtomicLoadBinaryReg64<atomic_load_umax_64>;
+
+def ATOMIC_CMP_SWAPW
+ : Pseudo<(outs GR32:$dst), (ins bdaddr20only:$addr, GR32:$cmp, GR32:$swap,
+ ADDR32:$bitshift, ADDR32:$negbitshift,
+ uimm32:$bitsize),
+ [(set GR32:$dst,
+ (z_atomic_cmp_swapw bdaddr20only:$addr, GR32:$cmp, GR32:$swap,
+ ADDR32:$bitshift, ADDR32:$negbitshift,
+ uimm32:$bitsize))]> {
+ let Defs = [CC];
+ let mayLoad = 1;
+ let mayStore = 1;
+ let usesCustomInserter = 1;
+}
+
+let Defs = [CC] in {
+ defm CS : CmpSwapRSPair<"cs", 0xBA, 0xEB14, atomic_cmp_swap_32, GR32>;
+ def CSG : CmpSwapRSY<"csg", 0xEB30, atomic_cmp_swap_64, GR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions.
+//===----------------------------------------------------------------------===//
+
+// Read a 32-bit access register into a GR32. As with all GR32 operations,
+// the upper 32 bits of the enclosing GR64 remain unchanged, which is useful
+// when a 64-bit address is stored in a pair of access registers.
+def EAR : InstRRE<0xB24F, (outs GR32:$R1), (ins access_reg:$R2),
+ "ear\t$R1, $R2",
+ [(set GR32:$R1, (z_extract_access access_reg:$R2))]>;
+
+// Find leftmost one, AKA count leading zeros. The instruction actually
+// returns a pair of GR64s, the first giving the number of leading zeros
+// and the second giving a copy of the source with the leftmost one bit
+// cleared. We only use the first result here.
+let Defs = [CC] in {
+ def FLOGR : UnaryRRE<"flogr", 0xB983, null_frag, GR128, GR64>;
+}
+def : Pat<(ctlz GR64:$src),
+ (EXTRACT_SUBREG (FLOGR GR64:$src), subreg_high)>;
+
+// Use subregs to populate the "don't care" bits in a 32-bit to 64-bit anyext.
+def : Pat<(i64 (anyext GR32:$src)),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_32bit)>;
+
+// There are no 32-bit equivalents of LLILL and LLILH, so use a full
+// 64-bit move followed by a subreg. This preserves the invariant that
+// all GR32 operations only modify the low 32 bits.
+def : Pat<(i32 imm32ll16:$src),
+ (EXTRACT_SUBREG (LLILL (LL16 imm:$src)), subreg_32bit)>;
+def : Pat<(i32 imm32lh16:$src),
+ (EXTRACT_SUBREG (LLILH (LH16 imm:$src)), subreg_32bit)>;
+
+// Extend GR32s and GR64s to GR128s.
+let usesCustomInserter = 1 in {
+ def AEXT128_64 : Pseudo<(outs GR128:$dst), (ins GR64:$src), []>;
+ def ZEXT128_32 : Pseudo<(outs GR128:$dst), (ins GR32:$src), []>;
+ def ZEXT128_64 : Pseudo<(outs GR128:$dst), (ins GR64:$src), []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Peepholes.
+//===----------------------------------------------------------------------===//
+
+// Use AL* for GR64 additions of unsigned 32-bit values.
+defm : ZXB<add, GR64, ALGFR>;
+def : Pat<(add GR64:$src1, imm64zx32:$src2),
+ (ALGFI GR64:$src1, imm64zx32:$src2)>;
+def : Pat<(add GR64:$src1, (zextloadi32 bdxaddr20only:$addr)),
+ (ALGF GR64:$src1, bdxaddr20only:$addr)>;
+
+// Use SL* for GR64 subtractions of unsigned 32-bit values.
+defm : ZXB<sub, GR64, SLGFR>;
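+// Subtraction of a constant is canonicalized to an addition of the negated
+// constant, so the SLGFI pattern below matches an add of imm64zx32n.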
+def : Pat<(add GR64:$src1, imm64zx32n:$src2),
+ (SLGFI GR64:$src1, imm64zx32n:$src2)>;
+def : Pat<(sub GR64:$src1, (zextloadi32 bdxaddr20only:$addr)),
+ (SLGF GR64:$src1, bdxaddr20only:$addr)>;
diff --git a/lib/Target/SystemZ/SystemZLongBranch.cpp b/lib/Target/SystemZ/SystemZLongBranch.cpp
new file mode 100644
index 0000000..2cb5823
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZLongBranch.cpp
@@ -0,0 +1,413 @@
+//===-- SystemZLongBranch.cpp - Branch lengthening for SystemZ ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass makes sure that all branches are in range. There are several ways
+// in which this could be done. One aggressive approach is to assume that all
+// branches are in range and successively replace those that turn out not
+// to be in range with a longer form (branch relaxation). A simple
+// implementation is to continually walk through the function relaxing
+// branches until no more changes are needed and a fixed point is reached.
+// However, in the pathological worst case, this implementation is
+// quadratic in the number of blocks; relaxing branch N can make branch N-1
+// go out of range, which in turn can make branch N-2 go out of range,
+// and so on.
+//
+// An alternative approach is to assume that all branches must be
+// converted to their long forms, then reinstate the short forms of
+// branches that, even under this pessimistic assumption, turn out to be
+// in range (branch shortening). This too can be implemented as a function
+// walk that is repeated until a fixed point is reached. In general,
+// the result of shortening is not as good as that of relaxation, and
+// shortening is also quadratic in the worst case; shortening branch N
+// can bring branch N-1 in range of the short form, which in turn can do
+// the same for branch N-2, and so on. The main advantage of shortening
+// is that each walk through the function produces valid code, so it is
+// possible to stop at any point after the first walk. The quadratic behavior
+// could therefore be handled with a maximum pass count, although the
+// question then becomes: what maximum count should be used?
+//
+// On SystemZ, long branches are only needed for functions bigger than 64k,
+// which are relatively rare to begin with, and the long branch sequences
+// are actually relatively cheap. It therefore doesn't seem worth spending
+// much compilation time on the problem. Instead, the approach we take is:
+//
+// (1) Work out the address that each block would have if no branches
+// need relaxing. Exit the pass early if all branches are in range
+// according to this assumption.
+//
+// (2) Work out the address that each block would have if all branches
+// need relaxing.
+//
+// (3) Walk through the block calculating the final address of each instruction
+// and relaxing those that need to be relaxed. For backward branches,
+// this check uses the final address of the target block, as calculated
+// earlier in the walk. For forward branches, this check uses the
+//     address of the target block that was calculated in (2). Both checks
+//     give a conservatively-correct range: addresses only grow as branches
+//     are relaxed, so the backward check uses exact final addresses and the
+//     forward check can only overestimate the distance to the target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "systemz-long-branch"
+
+#include "SystemZTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+STATISTIC(LongBranches, "Number of long branches.");
+
+namespace {
+ typedef MachineBasicBlock::iterator Iter;
+
+ // Represents positional information about a basic block.
+ struct MBBInfo {
+ // The address that we currently assume the block has.
+ uint64_t Address;
+
+ // The size of the block in bytes, excluding terminators.
+ // This value never changes.
+ uint64_t Size;
+
+ // The minimum alignment of the block, as a log2 value.
+ // This value never changes.
+ unsigned Alignment;
+
+ // The number of terminators in this block. This value never changes.
+ unsigned NumTerminators;
+
+ MBBInfo()
+ : Address(0), Size(0), Alignment(0), NumTerminators(0) {}
+ };
+
+ // Represents the state of a block terminator.
+ struct TerminatorInfo {
+ // If this terminator is a relaxable branch, this points to the branch
+ // instruction, otherwise it is null.
+ MachineInstr *Branch;
+
+ // The address that we currently assume the terminator has.
+ uint64_t Address;
+
+ // The current size of the terminator in bytes.
+ uint64_t Size;
+
+ // If Branch is nonnull, this is the number of the target block,
+ // otherwise it is unused.
+ unsigned TargetBlock;
+
+ // If Branch is nonnull, this is the length of the longest relaxed form,
+ // otherwise it is zero.
+ unsigned ExtraRelaxSize;
+
+    TerminatorInfo()
+      : Branch(0), Address(0), Size(0), TargetBlock(0), ExtraRelaxSize(0) {}
+ };
+
+ // Used to keep track of the current position while iterating over the blocks.
+ struct BlockPosition {
+ // The address that we assume this position has.
+ uint64_t Address;
+
+ // The number of low bits in Address that are known to be the same
+ // as the runtime address.
+ unsigned KnownBits;
+
+ BlockPosition(unsigned InitialAlignment)
+ : Address(0), KnownBits(InitialAlignment) {}
+ };
+
+ class SystemZLongBranch : public MachineFunctionPass {
+ public:
+ static char ID;
+ SystemZLongBranch(const SystemZTargetMachine &tm)
+ : MachineFunctionPass(ID), TII(0) {}
+
+ virtual const char *getPassName() const {
+ return "SystemZ Long Branch";
+ }
+
+ bool runOnMachineFunction(MachineFunction &F);
+
+ private:
+ void skipNonTerminators(BlockPosition &Position, MBBInfo &Block);
+ void skipTerminator(BlockPosition &Position, TerminatorInfo &Terminator,
+ bool AssumeRelaxed);
+ TerminatorInfo describeTerminator(MachineInstr *MI);
+ uint64_t initMBBInfo();
+ bool mustRelaxBranch(const TerminatorInfo &Terminator, uint64_t Address);
+ bool mustRelaxABranch();
+ void setWorstCaseAddresses();
+ void splitCompareBranch(MachineInstr *MI, unsigned CompareOpcode);
+ void relaxBranch(TerminatorInfo &Terminator);
+ void relaxBranches();
+
+ const SystemZInstrInfo *TII;
+ MachineFunction *MF;
+ SmallVector<MBBInfo, 16> MBBs;
+ SmallVector<TerminatorInfo, 16> Terminators;
+ };
+
+ char SystemZLongBranch::ID = 0;
+
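+  // The short forms of the branches handled here take a signed 16-bit
+  // halfword displacement, giving a byte range of -0x10000 .. 0xfffe
+  // relative to the branch address.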
+ const uint64_t MaxBackwardRange = 0x10000;
+ const uint64_t MaxForwardRange = 0xfffe;
+} // end of anonymous namespace
+
+FunctionPass *llvm::createSystemZLongBranchPass(SystemZTargetMachine &TM) {
+ return new SystemZLongBranch(TM);
+}
+
+// Position describes the state immediately before Block. Update Block
+// accordingly and move Position to the end of the block's non-terminator
+// instructions.
+void SystemZLongBranch::skipNonTerminators(BlockPosition &Position,
+ MBBInfo &Block) {
+ if (Block.Alignment > Position.KnownBits) {
+ // When calculating the address of Block, we need to conservatively
+ // assume that Block had the worst possible misalignment.
+ Position.Address += ((uint64_t(1) << Block.Alignment) -
+ (uint64_t(1) << Position.KnownBits));
+ Position.KnownBits = Block.Alignment;
+ }
+
+ // Align the addresses.
+ uint64_t AlignMask = (uint64_t(1) << Block.Alignment) - 1;
+ Position.Address = (Position.Address + AlignMask) & ~AlignMask;
+
+ // Record the block's position.
+ Block.Address = Position.Address;
+
+ // Move past the non-terminators in the block.
+ Position.Address += Block.Size;
+}
+
+// Position describes the state immediately before Terminator.
+// Update Terminator accordingly and move Position past it.
+// Assume that Terminator will be relaxed if AssumeRelaxed.
+void SystemZLongBranch::skipTerminator(BlockPosition &Position,
+ TerminatorInfo &Terminator,
+ bool AssumeRelaxed) {
+ Terminator.Address = Position.Address;
+ Position.Address += Terminator.Size;
+ if (AssumeRelaxed)
+ Position.Address += Terminator.ExtraRelaxSize;
+}
+
+// Return a description of terminator instruction MI.
+TerminatorInfo SystemZLongBranch::describeTerminator(MachineInstr *MI) {
+ TerminatorInfo Terminator;
+ Terminator.Size = TII->getInstSizeInBytes(MI);
+ if (MI->isConditionalBranch() || MI->isUnconditionalBranch()) {
+ switch (MI->getOpcode()) {
+ case SystemZ::J:
+ // Relaxes to JG, which is 2 bytes longer.
+ Terminator.ExtraRelaxSize = 2;
+ break;
+ case SystemZ::BRC:
+ // Relaxes to BRCL, which is 2 bytes longer.
+ Terminator.ExtraRelaxSize = 2;
+ break;
+ case SystemZ::CRJ:
+ // Relaxes to a CR/BRCL sequence, which is 2 bytes longer.
+ Terminator.ExtraRelaxSize = 2;
+ break;
+ case SystemZ::CGRJ:
+ // Relaxes to a CGR/BRCL sequence, which is 4 bytes longer.
+ Terminator.ExtraRelaxSize = 4;
+ break;
+ case SystemZ::CIJ:
+ case SystemZ::CGIJ:
+ // Relaxes to a C(G)HI/BRCL sequence, which is 4 bytes longer.
+ Terminator.ExtraRelaxSize = 4;
+ break;
+ default:
+ llvm_unreachable("Unrecognized branch instruction");
+ }
+ Terminator.Branch = MI;
+ Terminator.TargetBlock =
+ TII->getBranchInfo(MI).Target->getMBB()->getNumber();
+ }
+ return Terminator;
+}
+
+// Fill MBBs and Terminators, setting the addresses on the assumption
+// that no branches need relaxation. Return the size of the function under
+// this assumption.
+uint64_t SystemZLongBranch::initMBBInfo() {
+ MF->RenumberBlocks();
+ unsigned NumBlocks = MF->size();
+
+ MBBs.clear();
+ MBBs.resize(NumBlocks);
+
+ Terminators.clear();
+ Terminators.reserve(NumBlocks);
+
+ BlockPosition Position(MF->getAlignment());
+ for (unsigned I = 0; I < NumBlocks; ++I) {
+ MachineBasicBlock *MBB = MF->getBlockNumbered(I);
+ MBBInfo &Block = MBBs[I];
+
+ // Record the alignment, for quick access.
+ Block.Alignment = MBB->getAlignment();
+
+ // Calculate the size of the fixed part of the block.
+ MachineBasicBlock::iterator MI = MBB->begin();
+ MachineBasicBlock::iterator End = MBB->end();
+ while (MI != End && !MI->isTerminator()) {
+ Block.Size += TII->getInstSizeInBytes(MI);
+ ++MI;
+ }
+ skipNonTerminators(Position, Block);
+
+ // Add the terminators.
+ while (MI != End) {
+ if (!MI->isDebugValue()) {
+ assert(MI->isTerminator() && "Terminator followed by non-terminator");
+ Terminators.push_back(describeTerminator(MI));
+ skipTerminator(Position, Terminators.back(), false);
+ ++Block.NumTerminators;
+ }
+ ++MI;
+ }
+ }
+
+ return Position.Address;
+}
+
+// Return true if, under current assumptions, Terminator would need to be
+// relaxed if it were placed at address Address.
+bool SystemZLongBranch::mustRelaxBranch(const TerminatorInfo &Terminator,
+ uint64_t Address) {
+ if (!Terminator.Branch)
+ return false;
+
+ const MBBInfo &Target = MBBs[Terminator.TargetBlock];
+ if (Address >= Target.Address) {
+ if (Address - Target.Address <= MaxBackwardRange)
+ return false;
+ } else {
+ if (Target.Address - Address <= MaxForwardRange)
+ return false;
+ }
+
+ return true;
+}
+
+// Return true if, under current assumptions, any terminator needs
+// to be relaxed.
+bool SystemZLongBranch::mustRelaxABranch() {
+ for (SmallVector<TerminatorInfo, 16>::iterator TI = Terminators.begin(),
+ TE = Terminators.end(); TI != TE; ++TI)
+ if (mustRelaxBranch(*TI, TI->Address))
+ return true;
+ return false;
+}
+
+// Set the address of each block on the assumption that all branches
+// must be long.
+void SystemZLongBranch::setWorstCaseAddresses() {
+ SmallVector<TerminatorInfo, 16>::iterator TI = Terminators.begin();
+ BlockPosition Position(MF->getAlignment());
+ for (SmallVector<MBBInfo, 16>::iterator BI = MBBs.begin(), BE = MBBs.end();
+ BI != BE; ++BI) {
+ skipNonTerminators(Position, *BI);
+ for (unsigned BTI = 0, BTE = BI->NumTerminators; BTI != BTE; ++BTI) {
+ skipTerminator(Position, *TI, true);
+ ++TI;
+ }
+ }
+}
+
+// Split MI into the comparison given by CompareOpcode followed
+// by a BRCL on the result.
+void SystemZLongBranch::splitCompareBranch(MachineInstr *MI,
+ unsigned CompareOpcode) {
+ MachineBasicBlock *MBB = MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+ BuildMI(*MBB, MI, DL, TII->get(CompareOpcode))
+ .addOperand(MI->getOperand(0))
+ .addOperand(MI->getOperand(1));
+ MachineInstr *BRCL = BuildMI(*MBB, MI, DL, TII->get(SystemZ::BRCL))
+ .addOperand(MI->getOperand(2))
+ .addOperand(MI->getOperand(3));
+ // The implicit use of CC is a killing use.
+ BRCL->getOperand(2).setIsKill();
+ MI->eraseFromParent();
+}
+
+// Relax the branch described by Terminator.
+void SystemZLongBranch::relaxBranch(TerminatorInfo &Terminator) {
+ MachineInstr *Branch = Terminator.Branch;
+ switch (Branch->getOpcode()) {
+ case SystemZ::J:
+ Branch->setDesc(TII->get(SystemZ::JG));
+ break;
+ case SystemZ::BRC:
+ Branch->setDesc(TII->get(SystemZ::BRCL));
+ break;
+ case SystemZ::CRJ:
+ splitCompareBranch(Branch, SystemZ::CR);
+ break;
+ case SystemZ::CGRJ:
+ splitCompareBranch(Branch, SystemZ::CGR);
+ break;
+ case SystemZ::CIJ:
+ splitCompareBranch(Branch, SystemZ::CHI);
+ break;
+ case SystemZ::CGIJ:
+ splitCompareBranch(Branch, SystemZ::CGHI);
+ break;
+ default:
+ llvm_unreachable("Unrecognized branch");
+ }
+
+ Terminator.Size += Terminator.ExtraRelaxSize;
+ Terminator.ExtraRelaxSize = 0;
+ Terminator.Branch = 0;
+
+ ++LongBranches;
+}
+
+// Run a shortening pass and relax any branches that need to be relaxed.
+void SystemZLongBranch::relaxBranches() {
+ SmallVector<TerminatorInfo, 16>::iterator TI = Terminators.begin();
+ BlockPosition Position(MF->getAlignment());
+ for (SmallVector<MBBInfo, 16>::iterator BI = MBBs.begin(), BE = MBBs.end();
+ BI != BE; ++BI) {
+ skipNonTerminators(Position, *BI);
+ for (unsigned BTI = 0, BTE = BI->NumTerminators; BTI != BTE; ++BTI) {
+ assert(Position.Address <= TI->Address &&
+ "Addresses shouldn't go forwards");
+ if (mustRelaxBranch(*TI, Position.Address))
+ relaxBranch(*TI);
+ skipTerminator(Position, *TI, false);
+ ++TI;
+ }
+ }
+}
+
+bool SystemZLongBranch::runOnMachineFunction(MachineFunction &F) {
+ TII = static_cast<const SystemZInstrInfo *>(F.getTarget().getInstrInfo());
+ MF = &F;
+ uint64_t Size = initMBBInfo();
+ if (Size <= MaxForwardRange || !mustRelaxABranch())
+ return false;
+
+ setWorstCaseAddresses();
+ relaxBranches();
+ return true;
+}
diff --git a/lib/Target/SystemZ/SystemZMCInstLower.cpp b/lib/Target/SystemZ/SystemZMCInstLower.cpp
new file mode 100644
index 0000000..fd3f867
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZMCInstLower.cpp
@@ -0,0 +1,111 @@
+//===-- SystemZMCInstLower.cpp - Lower MachineInstr to MCInst -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZMCInstLower.h"
+#include "SystemZAsmPrinter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Target/Mangler.h"
+
+using namespace llvm;
+
+// If Opcode is an interprocedural reference that can be shortened,
+// return the short form, otherwise return Opcode itself.
+static unsigned getShortenedInstr(unsigned Opcode) {
+ switch (Opcode) {
+ case SystemZ::BRASL: return SystemZ::BRAS;
+ }
+ return Opcode;
+}
+
+// Return the VK_* enumeration for MachineOperand target flags Flags.
+static MCSymbolRefExpr::VariantKind getVariantKind(unsigned Flags) {
+ switch (Flags & SystemZII::MO_SYMBOL_MODIFIER) {
+ case 0:
+ return MCSymbolRefExpr::VK_None;
+ case SystemZII::MO_GOT:
+ return MCSymbolRefExpr::VK_GOT;
+ }
+  llvm_unreachable("Unrecognized MO_SYMBOL_MODIFIER");
+}
+
+SystemZMCInstLower::SystemZMCInstLower(Mangler *mang, MCContext &ctx,
+ SystemZAsmPrinter &asmprinter)
+ : Mang(mang), Ctx(ctx), AsmPrinter(asmprinter) {}
+
+MCOperand SystemZMCInstLower::lowerSymbolOperand(const MachineOperand &MO,
+ const MCSymbol *Symbol,
+ int64_t Offset) const {
+ MCSymbolRefExpr::VariantKind Kind = getVariantKind(MO.getTargetFlags());
+ const MCExpr *Expr = MCSymbolRefExpr::Create(Symbol, Kind, Ctx);
+ if (Offset) {
+ const MCExpr *OffsetExpr = MCConstantExpr::Create(Offset, Ctx);
+ Expr = MCBinaryExpr::CreateAdd(Expr, OffsetExpr, Ctx);
+ }
+ return MCOperand::CreateExpr(Expr);
+}
+
+MCOperand SystemZMCInstLower::lowerOperand(const MachineOperand &MO) const {
+ switch (MO.getType()) {
+ default:
+ llvm_unreachable("unknown operand type");
+
+ case MachineOperand::MO_Register:
+ // Ignore all implicit register operands.
+ if (MO.isImplicit())
+ return MCOperand();
+ return MCOperand::CreateReg(MO.getReg());
+
+ case MachineOperand::MO_Immediate:
+ return MCOperand::CreateImm(MO.getImm());
+
+ case MachineOperand::MO_MachineBasicBlock:
+ return lowerSymbolOperand(MO, MO.getMBB()->getSymbol(),
+ /* MO has no offset field */0);
+
+ case MachineOperand::MO_GlobalAddress:
+ return lowerSymbolOperand(MO, Mang->getSymbol(MO.getGlobal()),
+ MO.getOffset());
+
+ case MachineOperand::MO_ExternalSymbol: {
+ StringRef Name = MO.getSymbolName();
+ return lowerSymbolOperand(MO, AsmPrinter.GetExternalSymbolSymbol(Name),
+ MO.getOffset());
+ }
+
+ case MachineOperand::MO_JumpTableIndex:
+ return lowerSymbolOperand(MO, AsmPrinter.GetJTISymbol(MO.getIndex()),
+ /* MO has no offset field */0);
+
+ case MachineOperand::MO_ConstantPoolIndex:
+ return lowerSymbolOperand(MO, AsmPrinter.GetCPISymbol(MO.getIndex()),
+ MO.getOffset());
+
+ case MachineOperand::MO_BlockAddress: {
+ const BlockAddress *BA = MO.getBlockAddress();
+ return lowerSymbolOperand(MO, AsmPrinter.GetBlockAddressSymbol(BA),
+ MO.getOffset());
+ }
+ }
+}
+
+void SystemZMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
+ unsigned Opcode = MI->getOpcode();
+ // When emitting binary code, start with the shortest form of an instruction
+ // and then relax it where necessary.
+ if (!AsmPrinter.OutStreamer.hasRawTextSupport())
+ Opcode = getShortenedInstr(Opcode);
+ OutMI.setOpcode(Opcode);
+ for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
+ const MachineOperand &MO = MI->getOperand(I);
+ MCOperand MCOp = lowerOperand(MO);
+ if (MCOp.isValid())
+ OutMI.addOperand(MCOp);
+ }
+}
diff --git a/lib/Target/SystemZ/SystemZMCInstLower.h b/lib/Target/SystemZ/SystemZMCInstLower.h
new file mode 100644
index 0000000..afa72f3
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZMCInstLower.h
@@ -0,0 +1,47 @@
+//===-- SystemZMCInstLower.h - Lower MachineInstr to MCInst ----*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SYSTEMZMCINSTLOWER_H
+#define LLVM_SYSTEMZMCINSTLOWER_H
+
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+class MCContext;
+class MCInst;
+class MCOperand;
+class MCSymbol;
+class MachineInstr;
+class MachineOperand;
+class Mangler;
+class SystemZAsmPrinter;
+
+class LLVM_LIBRARY_VISIBILITY SystemZMCInstLower {
+ Mangler *Mang;
+ MCContext &Ctx;
+ SystemZAsmPrinter &AsmPrinter;
+
+public:
+ SystemZMCInstLower(Mangler *mang, MCContext &ctx,
+ SystemZAsmPrinter &asmPrinter);
+
+ // Lower MachineInstr MI to MCInst OutMI.
+ void lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+ // Return an MCOperand for MO. Return an empty operand if MO is implicit.
+ MCOperand lowerOperand(const MachineOperand& MO) const;
+
+ // Return an MCOperand for MO, given that it equals Symbol + Offset.
+ MCOperand lowerSymbolOperand(const MachineOperand &MO,
+ const MCSymbol *Symbol, int64_t Offset) const;
+};
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/SystemZ/SystemZMachineFunctionInfo.h b/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
new file mode 100644
index 0000000..1dc05a7
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
@@ -0,0 +1,74 @@
+//==- SystemZMachineFunctionInfo.h - SystemZ machine function info -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SYSTEMZMACHINEFUNCTIONINFO_H
+#define SYSTEMZMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+class SystemZMachineFunctionInfo : public MachineFunctionInfo {
+ unsigned SavedGPRFrameSize;
+ unsigned LowSavedGPR;
+ unsigned HighSavedGPR;
+ unsigned VarArgsFirstGPR;
+ unsigned VarArgsFirstFPR;
+ unsigned VarArgsFrameIndex;
+ unsigned RegSaveFrameIndex;
+ bool ManipulatesSP;
+
+public:
+ explicit SystemZMachineFunctionInfo(MachineFunction &MF)
+ : SavedGPRFrameSize(0), LowSavedGPR(0), HighSavedGPR(0), VarArgsFirstGPR(0),
+ VarArgsFirstFPR(0), VarArgsFrameIndex(0), RegSaveFrameIndex(0),
+ ManipulatesSP(false) {}
+
+ // Get and set the number of bytes allocated by generic code to store
+ // call-saved GPRs.
+ unsigned getSavedGPRFrameSize() const { return SavedGPRFrameSize; }
+ void setSavedGPRFrameSize(unsigned bytes) { SavedGPRFrameSize = bytes; }
+
+ // Get and set the first call-saved GPR that should be saved and restored
+ // by this function. This is 0 if no GPRs need to be saved or restored.
+ unsigned getLowSavedGPR() const { return LowSavedGPR; }
+ void setLowSavedGPR(unsigned Reg) { LowSavedGPR = Reg; }
+
+ // Get and set the last call-saved GPR that should be saved and restored
+ // by this function.
+ unsigned getHighSavedGPR() const { return HighSavedGPR; }
+ void setHighSavedGPR(unsigned Reg) { HighSavedGPR = Reg; }
+
+ // Get and set the number of fixed (as opposed to variable) arguments
+ // that are passed in GPRs to this function.
+ unsigned getVarArgsFirstGPR() const { return VarArgsFirstGPR; }
+ void setVarArgsFirstGPR(unsigned GPR) { VarArgsFirstGPR = GPR; }
+
+ // Likewise FPRs.
+ unsigned getVarArgsFirstFPR() const { return VarArgsFirstFPR; }
+ void setVarArgsFirstFPR(unsigned FPR) { VarArgsFirstFPR = FPR; }
+
+ // Get and set the frame index of the first stack vararg.
+ unsigned getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+ void setVarArgsFrameIndex(unsigned FI) { VarArgsFrameIndex = FI; }
+
+ // Get and set the frame index of the register save area
+ // (i.e. the incoming stack pointer).
+ unsigned getRegSaveFrameIndex() const { return RegSaveFrameIndex; }
+ void setRegSaveFrameIndex(unsigned FI) { RegSaveFrameIndex = FI; }
+
+ // Get and set whether the function directly manipulates the stack pointer,
+ // e.g. through STACKSAVE or STACKRESTORE.
+ bool getManipulatesSP() const { return ManipulatesSP; }
+ void setManipulatesSP(bool MSP) { ManipulatesSP = MSP; }
+};
+
+} // end llvm namespace
+
+#endif
diff --git a/lib/Target/SystemZ/SystemZOperands.td b/lib/Target/SystemZ/SystemZOperands.td
new file mode 100644
index 0000000..66d9c5f
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZOperands.td
@@ -0,0 +1,465 @@
+//===-- SystemZOperands.td - SystemZ instruction operands ----*- tblgen-*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Class definitions
+//===----------------------------------------------------------------------===//
+
+class ImmediateAsmOperand<string name>
+ : AsmOperandClass {
+ let Name = name;
+ let RenderMethod = "addImmOperands";
+}
+
+// Constructs both a DAG pattern and instruction operand for an immediate
+// of type VT. PRED returns true if a node is acceptable and XFORM returns
+// the operand value associated with the node. ASMOP is the name of the
+// associated asm operand, and also forms the basis of the asm print method.
+class Immediate<ValueType vt, code pred, SDNodeXForm xform, string asmop>
+ : PatLeaf<(vt imm), pred, xform>, Operand<vt> {
+ let PrintMethod = "print"##asmop##"Operand";
+ let DecoderMethod = "decode"##asmop##"Operand";
+ let ParserMatchClass = !cast<AsmOperandClass>(asmop);
+}
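+// For example, an Immediate whose asmop is "U16Imm" prints with
+// printU16ImmOperand(), decodes with decodeU16ImmOperand() and matches the
+// U16Imm asm operand class defined below.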
+
+// Constructs an asm operand for a PC-relative address. SIZE says how
+// many bits there are.
+class PCRelAsmOperand<string size> : ImmediateAsmOperand<"PCRel"##size> {
+ let PredicateMethod = "isImm";
+ let ParserMethod = "parsePCRel"##size;
+}
+
+// Constructs an operand for a PC-relative address with address type VT.
+// ASMOP is the associated asm operand.
+class PCRelOperand<ValueType vt, AsmOperandClass asmop> : Operand<vt> {
+ let PrintMethod = "printPCRelOperand";
+ let ParserMatchClass = asmop;
+}
+
+// Constructs both a DAG pattern and instruction operand for a PC-relative
+// address with address size VT. SELF is the name of the operand and
+// ASMOP is the associated asm operand.
+class PCRelAddress<ValueType vt, string self, AsmOperandClass asmop>
+ : ComplexPattern<vt, 1, "selectPCRelAddress", [z_pcrel_wrapper]>,
+ PCRelOperand<vt, asmop> {
+ let MIOperandInfo = (ops !cast<Operand>(self));
+}
+
+// Constructs an AsmOperandClass for addressing mode FORMAT, treating the
+// registers as having BITSIZE bits and displacements as having DISPSIZE bits.
+class AddressAsmOperand<string format, string bitsize, string dispsize>
+ : AsmOperandClass {
+ let Name = format##bitsize##"Disp"##dispsize;
+ let ParserMethod = "parse"##format##bitsize;
+ let RenderMethod = "add"##format##"Operands";
+}
+
+// Constructs both a DAG pattern and instruction operand for an addressing mode.
+// The mode is selected by custom code in select<TYPE><DISPSIZE><SUFFIX>(),
+// encoded by custom code in get<FORMAT><DISPSIZE>Encoding() and decoded
+// by custom code in decode<FORMAT><BITSIZE>Disp<DISPSIZE>Operand().
+// The address registers have BITSIZE bits and displacements have
+// DISPSIZE bits. NUMOPS is the number of operands that make up an
+// address and OPERANDS lists the types of those operands using (ops ...).
+// FORMAT is the type of addressing mode, which needs to match the names
+// used in AddressAsmOperand.
+class AddressingMode<string type, string bitsize, string dispsize,
+ string suffix, int numops, string format, dag operands>
+ : ComplexPattern<!cast<ValueType>("i"##bitsize), numops,
+ "select"##type##dispsize##suffix,
+ [add, sub, or, frameindex, z_adjdynalloc]>,
+ Operand<!cast<ValueType>("i"##bitsize)> {
+ let PrintMethod = "print"##format##"Operand";
+ let EncoderMethod = "get"##format##dispsize##"Encoding";
+ let DecoderMethod = "decode"##format##bitsize##"Disp"##dispsize##"Operand";
+ let MIOperandInfo = operands;
+ let ParserMatchClass =
+ !cast<AddressAsmOperand>(format##bitsize##"Disp"##dispsize);
+}
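+// For example, bdxaddr20only below is defined as
+// BDXMode<"BDXAddr", "64", "20", "Only"> and therefore uses
+// selectBDXAddr20Only(), getBDXAddr20Encoding() and
+// decodeBDXAddr64Disp20Operand(), with BDXAddr64Disp20 as its asm operand
+// class.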
+
+// An addressing mode with a base and displacement but no index.
+class BDMode<string type, string bitsize, string dispsize, string suffix>
+ : AddressingMode<type, bitsize, dispsize, suffix, 2, "BDAddr",
+ (ops !cast<RegisterOperand>("ADDR"##bitsize),
+ !cast<Immediate>("disp"##dispsize##"imm"##bitsize))>;
+
+// An addressing mode with a base, displacement and index.
+class BDXMode<string type, string bitsize, string dispsize, string suffix>
+ : AddressingMode<type, bitsize, dispsize, suffix, 3, "BDXAddr",
+ (ops !cast<RegisterOperand>("ADDR"##bitsize),
+ !cast<Immediate>("disp"##dispsize##"imm"##bitsize),
+ !cast<RegisterOperand>("ADDR"##bitsize))>;
+
+//===----------------------------------------------------------------------===//
+// Extracting immediate operands from nodes
+// These all create MVT::i64 nodes to ensure the value is not sign-extended
+// when converted from an SDNode to a MachineOperand later on.
+//===----------------------------------------------------------------------===//
+
+// Bits 0-15 (counting from the lsb).
+def LL16 : SDNodeXForm<imm, [{
+ uint64_t Value = N->getZExtValue() & 0x000000000000FFFFULL;
+ return CurDAG->getTargetConstant(Value, MVT::i64);
+}]>;
+
+// Bits 16-31 (counting from the lsb).
+def LH16 : SDNodeXForm<imm, [{
+ uint64_t Value = (N->getZExtValue() & 0x00000000FFFF0000ULL) >> 16;
+ return CurDAG->getTargetConstant(Value, MVT::i64);
+}]>;
+
+// Bits 32-47 (counting from the lsb).
+def HL16 : SDNodeXForm<imm, [{
+ uint64_t Value = (N->getZExtValue() & 0x0000FFFF00000000ULL) >> 32;
+ return CurDAG->getTargetConstant(Value, MVT::i64);
+}]>;
+
+// Bits 48-63 (counting from the lsb).
+def HH16 : SDNodeXForm<imm, [{
+ uint64_t Value = (N->getZExtValue() & 0xFFFF000000000000ULL) >> 48;
+ return CurDAG->getTargetConstant(Value, MVT::i64);
+}]>;
+
+// Low 32 bits.
+def LF32 : SDNodeXForm<imm, [{
+ uint64_t Value = N->getZExtValue() & 0x00000000FFFFFFFFULL;
+ return CurDAG->getTargetConstant(Value, MVT::i64);
+}]>;
+
+// High 32 bits.
+def HF32 : SDNodeXForm<imm, [{
+ uint64_t Value = N->getZExtValue() >> 32;
+ return CurDAG->getTargetConstant(Value, MVT::i64);
+}]>;
+
+// Truncate an immediate to an 8-bit signed quantity.
+def SIMM8 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(int8_t(N->getZExtValue()), MVT::i64);
+}]>;
+
+// Truncate an immediate to an 8-bit unsigned quantity.
+def UIMM8 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(uint8_t(N->getZExtValue()), MVT::i64);
+}]>;
+
+// Truncate an immediate to a 16-bit signed quantity.
+def SIMM16 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(int16_t(N->getZExtValue()), MVT::i64);
+}]>;
+
+// Truncate an immediate to a 16-bit unsigned quantity.
+def UIMM16 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(uint16_t(N->getZExtValue()), MVT::i64);
+}]>;
+
+// Truncate an immediate to a 32-bit signed quantity.
+def SIMM32 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(int32_t(N->getZExtValue()), MVT::i64);
+}]>;
+
+// Truncate an immediate to a 32-bit unsigned quantity.
+def UIMM32 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(uint32_t(N->getZExtValue()), MVT::i64);
+}]>;
+
+// Negate and then truncate an immediate to a 32-bit unsigned quantity.
+def NEGIMM32 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(uint32_t(-N->getZExtValue()), MVT::i64);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Immediate asm operands.
+//===----------------------------------------------------------------------===//
+
+def U4Imm : ImmediateAsmOperand<"U4Imm">;
+def U6Imm : ImmediateAsmOperand<"U6Imm">;
+def S8Imm : ImmediateAsmOperand<"S8Imm">;
+def U8Imm : ImmediateAsmOperand<"U8Imm">;
+def S16Imm : ImmediateAsmOperand<"S16Imm">;
+def U16Imm : ImmediateAsmOperand<"U16Imm">;
+def S32Imm : ImmediateAsmOperand<"S32Imm">;
+def U32Imm : ImmediateAsmOperand<"U32Imm">;
+
+//===----------------------------------------------------------------------===//
+// 8-bit immediates
+//===----------------------------------------------------------------------===//
+
+def uimm8zx4 : Immediate<i8, [{
+ return isUInt<4>(N->getZExtValue());
+}], NOOP_SDNodeXForm, "U4Imm">;
+
+def uimm8zx6 : Immediate<i8, [{
+ return isUInt<6>(N->getZExtValue());
+}], NOOP_SDNodeXForm, "U6Imm">;
+
+def simm8 : Immediate<i8, [{}], SIMM8, "S8Imm">;
+def uimm8 : Immediate<i8, [{}], UIMM8, "U8Imm">;
+
+//===----------------------------------------------------------------------===//
+// i32 immediates
+//===----------------------------------------------------------------------===//
+
+// Immediates for the lower and upper 16 bits of an i32, with the other
+// bits of the i32 being zero.
+def imm32ll16 : Immediate<i32, [{
+ return SystemZ::isImmLL(N->getZExtValue());
+}], LL16, "U16Imm">;
+
+def imm32lh16 : Immediate<i32, [{
+ return SystemZ::isImmLH(N->getZExtValue());
+}], LH16, "U16Imm">;
+
+// Immediates for the lower and upper 16 bits of an i32, with the other
+// bits of the i32 being one.
+def imm32ll16c : Immediate<i32, [{
+ return SystemZ::isImmLL(uint32_t(~N->getZExtValue()));
+}], LL16, "U16Imm">;
+
+def imm32lh16c : Immediate<i32, [{
+ return SystemZ::isImmLH(uint32_t(~N->getZExtValue()));
+}], LH16, "U16Imm">;
+
+// Short immediates.
+def imm32sx8 : Immediate<i32, [{
+ return isInt<8>(N->getSExtValue());
+}], SIMM8, "S8Imm">;
+
+def imm32zx8 : Immediate<i32, [{
+ return isUInt<8>(N->getZExtValue());
+}], UIMM8, "U8Imm">;
+
+def imm32zx8trunc : Immediate<i32, [{}], UIMM8, "U8Imm">;
+
+def imm32sx16 : Immediate<i32, [{
+ return isInt<16>(N->getSExtValue());
+}], SIMM16, "S16Imm">;
+
+def imm32zx16 : Immediate<i32, [{
+ return isUInt<16>(N->getZExtValue());
+}], UIMM16, "U16Imm">;
+
+def imm32sx16trunc : Immediate<i32, [{}], SIMM16, "S16Imm">;
+
+// Full 32-bit immediates. We need both signed and unsigned versions
+// because the assembler is picky. E.g. AFI requires signed operands
+// while NILF requires unsigned ones.
+def simm32 : Immediate<i32, [{}], SIMM32, "S32Imm">;
+def uimm32 : Immediate<i32, [{}], UIMM32, "U32Imm">;
+
+def imm32 : ImmLeaf<i32, [{}]>;
+
+//===----------------------------------------------------------------------===//
+// 64-bit immediates
+//===----------------------------------------------------------------------===//
+
+// Immediates for 16-bit chunks of an i64, with the other bits of the
+// i64 being zero.
+def imm64ll16 : Immediate<i64, [{
+ return SystemZ::isImmLL(N->getZExtValue());
+}], LL16, "U16Imm">;
+
+def imm64lh16 : Immediate<i64, [{
+ return SystemZ::isImmLH(N->getZExtValue());
+}], LH16, "U16Imm">;
+
+def imm64hl16 : Immediate<i64, [{
+ return SystemZ::isImmHL(N->getZExtValue());
+}], HL16, "U16Imm">;
+
+def imm64hh16 : Immediate<i64, [{
+ return SystemZ::isImmHH(N->getZExtValue());
+}], HH16, "U16Imm">;
+
+// Immediates for 16-bit chunks of an i64, with the other bits of the
+// i64 being one.
+def imm64ll16c : Immediate<i64, [{
+ return SystemZ::isImmLL(uint64_t(~N->getZExtValue()));
+}], LL16, "U16Imm">;
+
+def imm64lh16c : Immediate<i64, [{
+ return SystemZ::isImmLH(uint64_t(~N->getZExtValue()));
+}], LH16, "U16Imm">;
+
+def imm64hl16c : Immediate<i64, [{
+ return SystemZ::isImmHL(uint64_t(~N->getZExtValue()));
+}], HL16, "U16Imm">;
+
+def imm64hh16c : Immediate<i64, [{
+ return SystemZ::isImmHH(uint64_t(~N->getZExtValue()));
+}], HH16, "U16Imm">;
+
+// Immediates for the lower and upper 32 bits of an i64, with the other
+// bits of the i64 being zero.
+def imm64lf32 : Immediate<i64, [{
+ return SystemZ::isImmLF(N->getZExtValue());
+}], LF32, "U32Imm">;
+
+def imm64hf32 : Immediate<i64, [{
+ return SystemZ::isImmHF(N->getZExtValue());
+}], HF32, "U32Imm">;
+
+// Immediates for the lower and upper 32 bits of an i64, with the other
+// bits of the i64 being one.
+def imm64lf32c : Immediate<i64, [{
+ return SystemZ::isImmLF(uint64_t(~N->getZExtValue()));
+}], LF32, "U32Imm">;
+
+def imm64hf32c : Immediate<i64, [{
+ return SystemZ::isImmHF(uint64_t(~N->getZExtValue()));
+}], HF32, "U32Imm">;
+
+// Short immediates.
+def imm64sx8 : Immediate<i64, [{
+ return isInt<8>(N->getSExtValue());
+}], SIMM8, "S8Imm">;
+
+def imm64sx16 : Immediate<i64, [{
+ return isInt<16>(N->getSExtValue());
+}], SIMM16, "S16Imm">;
+
+def imm64zx16 : Immediate<i64, [{
+ return isUInt<16>(N->getZExtValue());
+}], UIMM16, "U16Imm">;
+
+def imm64sx32 : Immediate<i64, [{
+ return isInt<32>(N->getSExtValue());
+}], SIMM32, "S32Imm">;
+
+def imm64zx32 : Immediate<i64, [{
+ return isUInt<32>(N->getZExtValue());
+}], UIMM32, "U32Imm">;
+
+def imm64zx32n : Immediate<i64, [{
+ return isUInt<32>(-N->getSExtValue());
+}], NEGIMM32, "U32Imm">;
+
+def imm64 : ImmLeaf<i64, [{}]>;
+
+//===----------------------------------------------------------------------===//
+// Floating-point immediates
+//===----------------------------------------------------------------------===//
+
+// Floating-point zero.
+def fpimm0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(+0.0); }]>;
+
+// Floating-point negative zero.
+def fpimmneg0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(-0.0); }]>;
+
+//===----------------------------------------------------------------------===//
+// Symbolic address operands
+//===----------------------------------------------------------------------===//
+
+// PC-relative asm operands.
+def PCRel16 : PCRelAsmOperand<"16">;
+def PCRel32 : PCRelAsmOperand<"32">;
+
+// PC-relative offsets of a basic block. The offset is sign-extended
+// and multiplied by 2.
+def brtarget16 : PCRelOperand<OtherVT, PCRel16> {
+ let EncoderMethod = "getPC16DBLEncoding";
+ let DecoderMethod = "decodePC16DBLOperand";
+}
+def brtarget32 : PCRelOperand<OtherVT, PCRel32> {
+ let EncoderMethod = "getPC32DBLEncoding";
+ let DecoderMethod = "decodePC32DBLOperand";
+}
+
+// A PC-relative offset of a global value. The offset is sign-extended
+// and multiplied by 2.
+def pcrel32 : PCRelAddress<i64, "pcrel32", PCRel32> {
+ let EncoderMethod = "getPC32DBLEncoding";
+ let DecoderMethod = "decodePC32DBLOperand";
+}
+
+// A PC-relative offset of a global value when the value is used as a
+// call target. The offset is sign-extended and multiplied by 2.
+def pcrel16call : PCRelAddress<i64, "pcrel16call", PCRel16> {
+ let PrintMethod = "printCallOperand";
+ let EncoderMethod = "getPLT16DBLEncoding";
+ let DecoderMethod = "decodePC16DBLOperand";
+}
+def pcrel32call : PCRelAddress<i64, "pcrel32call", PCRel32> {
+ let PrintMethod = "printCallOperand";
+ let EncoderMethod = "getPLT32DBLEncoding";
+ let DecoderMethod = "decodePC32DBLOperand";
+}
+
+//===----------------------------------------------------------------------===//
+// Addressing modes
+//===----------------------------------------------------------------------===//
+
+// 12-bit displacement operands.
+def disp12imm32 : Operand<i32>;
+def disp12imm64 : Operand<i64>;
+
+// 20-bit displacement operands.
+def disp20imm32 : Operand<i32>;
+def disp20imm64 : Operand<i64>;
+
+def BDAddr32Disp12 : AddressAsmOperand<"BDAddr", "32", "12">;
+def BDAddr32Disp20 : AddressAsmOperand<"BDAddr", "32", "20">;
+def BDAddr64Disp12 : AddressAsmOperand<"BDAddr", "64", "12">;
+def BDAddr64Disp20 : AddressAsmOperand<"BDAddr", "64", "20">;
+def BDXAddr64Disp12 : AddressAsmOperand<"BDXAddr", "64", "12">;
+def BDXAddr64Disp20 : AddressAsmOperand<"BDXAddr", "64", "20">;
+
+// DAG patterns and operands for addressing modes. Each mode has
+// the form <type><range><group> where:
+//
+// <type> is one of:
+// shift : base + displacement (32-bit)
+// bdaddr : base + displacement
+// bdxaddr : base + displacement + index
+// laaddr : like bdxaddr, but used for Load Address operations
+// dynalloc : base + displacement + index + ADJDYNALLOC
+//
+// <range> is one of:
+// 12 : the displacement is an unsigned 12-bit value
+// 20 : the displacement is a signed 20-bit value
+//
+// <group> is one of:
+// pair : used when there is an equivalent instruction with the opposite
+// range value (12 or 20)
+// only : used when there is no equivalent instruction with the opposite
+// range value
+def shift12only : BDMode <"BDAddr", "32", "12", "Only">;
+def shift20only : BDMode <"BDAddr", "32", "20", "Only">;
+def bdaddr12only : BDMode <"BDAddr", "64", "12", "Only">;
+def bdaddr12pair : BDMode <"BDAddr", "64", "12", "Pair">;
+def bdaddr20only : BDMode <"BDAddr", "64", "20", "Only">;
+def bdaddr20pair : BDMode <"BDAddr", "64", "20", "Pair">;
+def bdxaddr12only : BDXMode<"BDXAddr", "64", "12", "Only">;
+def bdxaddr12pair : BDXMode<"BDXAddr", "64", "12", "Pair">;
+def bdxaddr20only : BDXMode<"BDXAddr", "64", "20", "Only">;
+def bdxaddr20only128 : BDXMode<"BDXAddr", "64", "20", "Only128">;
+def bdxaddr20pair : BDXMode<"BDXAddr", "64", "20", "Pair">;
+def dynalloc12only : BDXMode<"DynAlloc", "64", "12", "Only">;
+def laaddr12pair : BDXMode<"LAAddr", "64", "12", "Pair">;
+def laaddr20pair : BDXMode<"LAAddr", "64", "20", "Pair">;
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous
+//===----------------------------------------------------------------------===//
+
+// Access registers. At present we just use them for accessing the thread
+// pointer, so we don't expose them as registers to LLVM.
+def AccessReg : AsmOperandClass {
+ let Name = "AccessReg";
+ let ParserMethod = "parseAccessReg";
+}
+def access_reg : Immediate<i8, [{ return N->getZExtValue() < 16; }],
+ NOOP_SDNodeXForm, "AccessReg"> {
+ let ParserMatchClass = AccessReg;
+}
+
+// A 4-bit condition-code mask.
+def cond4 : PatLeaf<(i8 imm), [{ return (N->getZExtValue() < 16); }]>,
+ Operand<i8> {
+ let PrintMethod = "printCond4Operand";
+}
diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td
new file mode 100644
index 0000000..ab01b25
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZOperators.td
@@ -0,0 +1,213 @@
+//===-- SystemZOperators.td - SystemZ-specific operators ------*- tblgen-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Type profiles
+//===----------------------------------------------------------------------===//
+def SDT_CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i64>]>;
+def SDT_CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i64>,
+ SDTCisVT<1, i64>]>;
+def SDT_ZCall : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
+def SDT_ZCmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+def SDT_ZBRCCMask : SDTypeProfile<0, 2,
+ [SDTCisVT<0, i8>,
+ SDTCisVT<1, OtherVT>]>;
+def SDT_ZSelectCCMask : SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<1, 2>,
+ SDTCisVT<3, i8>]>;
+def SDT_ZWrapPtr : SDTypeProfile<1, 1,
+ [SDTCisSameAs<0, 1>,
+ SDTCisPtrTy<0>]>;
+def SDT_ZAdjDynAlloc : SDTypeProfile<1, 0, [SDTCisVT<0, i64>]>;
+def SDT_ZExtractAccess : SDTypeProfile<1, 1,
+ [SDTCisVT<0, i32>,
+ SDTCisVT<1, i8>]>;
+def SDT_ZGR128Binary32 : SDTypeProfile<1, 2,
+ [SDTCisVT<0, untyped>,
+ SDTCisVT<1, untyped>,
+ SDTCisVT<2, i32>]>;
+def SDT_ZGR128Binary64 : SDTypeProfile<1, 2,
+ [SDTCisVT<0, untyped>,
+ SDTCisVT<1, untyped>,
+ SDTCisVT<2, i64>]>;
+def SDT_ZAtomicLoadBinaryW : SDTypeProfile<1, 5,
+ [SDTCisVT<0, i32>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, i32>,
+ SDTCisVT<3, i32>,
+ SDTCisVT<4, i32>,
+ SDTCisVT<5, i32>]>;
+def SDT_ZAtomicCmpSwapW : SDTypeProfile<1, 6,
+ [SDTCisVT<0, i32>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, i32>,
+ SDTCisVT<3, i32>,
+ SDTCisVT<4, i32>,
+ SDTCisVT<5, i32>,
+ SDTCisVT<6, i32>]>;
+
+//===----------------------------------------------------------------------===//
+// Node definitions
+//===----------------------------------------------------------------------===//
+
+// These are target-independent nodes, but have target-specific formats.
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_CallSeqStart,
+ [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
+def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_CallSeqEnd,
+ [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue,
+ SDNPOutGlue]>;
+
+// Nodes for SystemZISD::*. See SystemZISelLowering.h for more details.
+def z_retflag : SDNode<"SystemZISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def z_call : SDNode<"SystemZISD::CALL", SDT_ZCall,
+ [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
+ SDNPVariadic]>;
+def z_pcrel_wrapper : SDNode<"SystemZISD::PCREL_WRAPPER", SDT_ZWrapPtr, []>;
+def z_cmp : SDNode<"SystemZISD::CMP", SDT_ZCmp, [SDNPOutGlue]>;
+def z_ucmp : SDNode<"SystemZISD::UCMP", SDT_ZCmp, [SDNPOutGlue]>;
+def z_br_ccmask : SDNode<"SystemZISD::BR_CCMASK", SDT_ZBRCCMask,
+ [SDNPHasChain, SDNPInGlue]>;
+def z_select_ccmask : SDNode<"SystemZISD::SELECT_CCMASK", SDT_ZSelectCCMask,
+ [SDNPInGlue]>;
+def z_adjdynalloc : SDNode<"SystemZISD::ADJDYNALLOC", SDT_ZAdjDynAlloc>;
+def z_extract_access : SDNode<"SystemZISD::EXTRACT_ACCESS",
+ SDT_ZExtractAccess>;
+def z_umul_lohi64 : SDNode<"SystemZISD::UMUL_LOHI64", SDT_ZGR128Binary64>;
+def z_sdivrem64 : SDNode<"SystemZISD::SDIVREM64", SDT_ZGR128Binary64>;
+def z_udivrem32 : SDNode<"SystemZISD::UDIVREM32", SDT_ZGR128Binary32>;
+def z_udivrem64 : SDNode<"SystemZISD::UDIVREM64", SDT_ZGR128Binary64>;
+
+class AtomicWOp<string name, SDTypeProfile profile = SDT_ZAtomicLoadBinaryW>
+ : SDNode<"SystemZISD::"##name, profile,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
+
+def z_atomic_swapw : AtomicWOp<"ATOMIC_SWAPW">;
+def z_atomic_loadw_add : AtomicWOp<"ATOMIC_LOADW_ADD">;
+def z_atomic_loadw_sub : AtomicWOp<"ATOMIC_LOADW_SUB">;
+def z_atomic_loadw_and : AtomicWOp<"ATOMIC_LOADW_AND">;
+def z_atomic_loadw_or : AtomicWOp<"ATOMIC_LOADW_OR">;
+def z_atomic_loadw_xor : AtomicWOp<"ATOMIC_LOADW_XOR">;
+def z_atomic_loadw_nand : AtomicWOp<"ATOMIC_LOADW_NAND">;
+def z_atomic_loadw_min : AtomicWOp<"ATOMIC_LOADW_MIN">;
+def z_atomic_loadw_max : AtomicWOp<"ATOMIC_LOADW_MAX">;
+def z_atomic_loadw_umin : AtomicWOp<"ATOMIC_LOADW_UMIN">;
+def z_atomic_loadw_umax : AtomicWOp<"ATOMIC_LOADW_UMAX">;
+def z_atomic_cmp_swapw : AtomicWOp<"ATOMIC_CMP_SWAPW", SDT_ZAtomicCmpSwapW>;
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments
+//===----------------------------------------------------------------------===//
+
+// Register sign-extend operations. Sub-32-bit values are represented as i32s.
+def sext8 : PatFrag<(ops node:$src), (sext_inreg node:$src, i8)>;
+def sext16 : PatFrag<(ops node:$src), (sext_inreg node:$src, i16)>;
+def sext32 : PatFrag<(ops node:$src), (sext (i32 node:$src))>;
+
+// Register zero-extend operations. Sub-32-bit values are represented as i32s.
+def zext8 : PatFrag<(ops node:$src), (and node:$src, 0xff)>;
+def zext16 : PatFrag<(ops node:$src), (and node:$src, 0xffff)>;
+def zext32 : PatFrag<(ops node:$src), (zext (i32 node:$src))>;
+
+// Typed floating-point loads.
+def loadf32 : PatFrag<(ops node:$src), (f32 (load node:$src))>;
+def loadf64 : PatFrag<(ops node:$src), (f64 (load node:$src))>;
+
+// Aligned loads.
+class AlignedLoad<SDPatternOperator load>
+ : PatFrag<(ops node:$addr), (load node:$addr), [{
+ LoadSDNode *Load = cast<LoadSDNode>(N);
+ return Load->getAlignment() >= Load->getMemoryVT().getStoreSize();
+}]>;
+def aligned_load : AlignedLoad<load>;
+def aligned_sextloadi16 : AlignedLoad<sextloadi16>;
+def aligned_sextloadi32 : AlignedLoad<sextloadi32>;
+def aligned_zextloadi16 : AlignedLoad<zextloadi16>;
+def aligned_zextloadi32 : AlignedLoad<zextloadi32>;
+
+// Aligned stores.
+class AlignedStore<SDPatternOperator store>
+ : PatFrag<(ops node:$src, node:$addr), (store node:$src, node:$addr), [{
+ StoreSDNode *Store = cast<StoreSDNode>(N);
+ return Store->getAlignment() >= Store->getMemoryVT().getStoreSize();
+}]>;
+def aligned_store : AlignedStore<store>;
+def aligned_truncstorei16 : AlignedStore<truncstorei16>;
+def aligned_truncstorei32 : AlignedStore<truncstorei32>;
+
+// Non-volatile loads. Used for instructions that might access the storage
+// location multiple times.
+class NonvolatileLoad<SDPatternOperator load>
+ : PatFrag<(ops node:$addr), (load node:$addr), [{
+ LoadSDNode *Load = cast<LoadSDNode>(N);
+ return !Load->isVolatile();
+}]>;
+def nonvolatile_load : NonvolatileLoad<load>;
+
+// Non-volatile stores.
+class NonvolatileStore<SDPatternOperator store>
+ : PatFrag<(ops node:$src, node:$addr), (store node:$src, node:$addr), [{
+ StoreSDNode *Store = cast<StoreSDNode>(N);
+ return !Store->isVolatile();
+}]>;
+def nonvolatile_store : NonvolatileStore<store>;
+
+// Insertions.
+def inserti8 : PatFrag<(ops node:$src1, node:$src2),
+ (or (and node:$src1, -256), node:$src2)>;
+def insertll : PatFrag<(ops node:$src1, node:$src2),
+ (or (and node:$src1, 0xffffffffffff0000), node:$src2)>;
+def insertlh : PatFrag<(ops node:$src1, node:$src2),
+ (or (and node:$src1, 0xffffffff0000ffff), node:$src2)>;
+def inserthl : PatFrag<(ops node:$src1, node:$src2),
+ (or (and node:$src1, 0xffff0000ffffffff), node:$src2)>;
+def inserthh : PatFrag<(ops node:$src1, node:$src2),
+ (or (and node:$src1, 0x0000ffffffffffff), node:$src2)>;
+def insertlf : PatFrag<(ops node:$src1, node:$src2),
+ (or (and node:$src1, 0xffffffff00000000), node:$src2)>;
+def inserthf : PatFrag<(ops node:$src1, node:$src2),
+ (or (and node:$src1, 0x00000000ffffffff), node:$src2)>;
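+// As a reading of the masks above: insertll clears the low 16 bits of
+// $src1 before ORing in $src2, i.e. it inserts into bits 0-15, while
+// inserthf keeps the low 32 bits and replaces the high 32.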
+
+// ORs that can be treated as insertions.
+def or_as_inserti8 : PatFrag<(ops node:$src1, node:$src2),
+ (or node:$src1, node:$src2), [{
+ unsigned BitWidth = N->getValueType(0).getScalarType().getSizeInBits();
+ return CurDAG->MaskedValueIsZero(N->getOperand(0),
+ APInt::getLowBitsSet(BitWidth, 8));
+}]>;
+
+// ORs that can be treated as reversed insertions.
+def or_as_revinserti8 : PatFrag<(ops node:$src1, node:$src2),
+ (or node:$src1, node:$src2), [{
+ unsigned BitWidth = N->getValueType(0).getScalarType().getSizeInBits();
+ return CurDAG->MaskedValueIsZero(N->getOperand(1),
+ APInt::getLowBitsSet(BitWidth, 8));
+}]>;
+
+// Fused multiply-add and multiply-subtract, but with the order of the
+// operands matching SystemZ's MA and MS instructions.
+def z_fma : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (fma node:$src2, node:$src3, node:$src1)>;
+def z_fms : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (fma node:$src2, node:$src3, (fneg node:$src1))>;
+
+// Floating-point negated absolute value (-|x|).
+def fnabs : PatFrag<(ops node:$ptr), (fneg (fabs node:$ptr))>;
+
+// Create a unary operator that loads from memory and then performs
+// the given operation on it.
+class loadu<SDPatternOperator operator, SDPatternOperator load = load>
+ : PatFrag<(ops node:$addr), (operator (load node:$addr))>;
+
+// Create a store operator that performs the given unary operation
+// on the value before storing it.
+class storeu<SDPatternOperator operator, SDPatternOperator store = store>
+ : PatFrag<(ops node:$value, node:$addr),
+ (store (operator node:$value), node:$addr)>;
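+// As an illustrative (hypothetical) use of these classes, loadu<fextend>
+// would match (fextend (load node:$addr)) and storeu<fround> would match
+// (store (fround node:$value), node:$addr), letting a single memory-form
+// instruction pattern cover the extend-from-memory and round-to-memory
+// cases.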
diff --git a/lib/Target/SystemZ/SystemZPatterns.td b/lib/Target/SystemZ/SystemZPatterns.td
new file mode 100644
index 0000000..3689f74
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZPatterns.td
@@ -0,0 +1,71 @@
+//===-- SystemZPatterns.td - SystemZ-specific pattern rules ---*- tblgen-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Record that INSN performs a 64-bit version of unary operator OPERATOR
+// in which the operand is sign-extended from 32 to 64 bits.
+multiclass SXU<SDPatternOperator operator, Instruction insn> {
+ def : Pat<(operator (sext (i32 GR32:$src))),
+ (insn GR32:$src)>;
+ def : Pat<(operator (sext_inreg GR64:$src, i32)),
+ (insn (EXTRACT_SUBREG GR64:$src, subreg_32bit))>;
+}
+
+// Record that INSN performs a 64-bit version of binary operator OPERATOR
+// in which the first operand has class CLS and in which the second operand
+// is sign-extended from a 32-bit register.
+multiclass SXB<SDPatternOperator operator, RegisterOperand cls,
+ Instruction insn> {
+ def : Pat<(operator cls:$src1, (sext GR32:$src2)),
+ (insn cls:$src1, GR32:$src2)>;
+ def : Pat<(operator cls:$src1, (sext_inreg GR64:$src2, i32)),
+ (insn cls:$src1, (EXTRACT_SUBREG GR64:$src2, subreg_32bit))>;
+}
+
+// Like SXB, but for zero extension.
+multiclass ZXB<SDPatternOperator operator, RegisterOperand cls,
+ Instruction insn> {
+ def : Pat<(operator cls:$src1, (zext GR32:$src2)),
+ (insn cls:$src1, GR32:$src2)>;
+ def : Pat<(operator cls:$src1, (and GR64:$src2, 0xffffffff)),
+ (insn cls:$src1, (EXTRACT_SUBREG GR64:$src2, subreg_32bit))>;
+}
+
+// Record that INSN performs a binary read-modify-write operation,
+// with LOAD, OPERATOR and STORE being the read, modify and write
+// respectively. MODE is the addressing mode and IMM is the type
+// of the second operand.
+class RMWI<SDPatternOperator load, SDPatternOperator operator,
+ SDPatternOperator store, AddressingMode mode,
+ PatFrag imm, Instruction insn>
+ : Pat<(store (operator (load mode:$addr), imm:$src), mode:$addr),
+ (insn mode:$addr, (UIMM8 imm:$src))>;
+
+// Record that INSN performs binary operation OPERATOR on a byte
+// memory location. IMM is the type of the second operand.
+multiclass RMWIByte<SDPatternOperator operator, AddressingMode mode,
+ Instruction insn> {
+ def : RMWI<zextloadi8, operator, truncstorei8, mode, imm32, insn>;
+ def : RMWI<zextloadi8, operator, truncstorei8, mode, imm64, insn>;
+ def : RMWI<sextloadi8, operator, truncstorei8, mode, imm32, insn>;
+ def : RMWI<sextloadi8, operator, truncstorei8, mode, imm64, insn>;
+ def : RMWI<extloadi8, operator, truncstorei8, mode, imm32, insn>;
+ def : RMWI<extloadi8, operator, truncstorei8, mode, imm64, insn>;
+}
+
+// Record that INSN performs insertion TYPE into a register of class CLS.
+// The inserted operand is loaded using LOAD from an address of mode MODE.
+multiclass InsertMem<string type, Instruction insn, RegisterOperand cls,
+ SDPatternOperator load, AddressingMode mode> {
+ def : Pat<(!cast<SDPatternOperator>("or_as_"##type)
+ cls:$src1, (load mode:$src2)),
+ (insn cls:$src1, mode:$src2)>;
+ def : Pat<(!cast<SDPatternOperator>("or_as_rev"##type)
+ (load mode:$src2), cls:$src1),
+ (insn cls:$src1, mode:$src2)>;
+}
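+
+// As an illustrative sketch (not part of this file), a 64-bit add of a
+// zero-extended 32-bit register could be hooked up with something like
+//   defm : ZXB<add, GR64, ALGFR>;
+// so that both (add GR64:$a, (zext GR32:$b)) and the equivalent
+// and-with-0xffffffff form select the same instruction.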
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
new file mode 100644
index 0000000..c695bb3
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -0,0 +1,165 @@
+//===-- SystemZRegisterInfo.cpp - SystemZ register information ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZRegisterInfo.h"
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+#define GET_REGINFO_TARGET_DESC
+#include "SystemZGenRegisterInfo.inc"
+
+using namespace llvm;
+
+SystemZRegisterInfo::SystemZRegisterInfo(SystemZTargetMachine &tm)
+ : SystemZGenRegisterInfo(SystemZ::R14D), TM(tm) {}
+
+const uint16_t*
+SystemZRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ static const uint16_t CalleeSavedRegs[] = {
+ SystemZ::R6D, SystemZ::R7D, SystemZ::R8D, SystemZ::R9D,
+ SystemZ::R10D, SystemZ::R11D, SystemZ::R12D, SystemZ::R13D,
+ SystemZ::R14D, SystemZ::R15D,
+ SystemZ::F8D, SystemZ::F9D, SystemZ::F10D, SystemZ::F11D,
+ SystemZ::F12D, SystemZ::F13D, SystemZ::F14D, SystemZ::F15D,
+ 0
+ };
+
+ return CalleeSavedRegs;
+}
+
+BitVector
+SystemZRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ BitVector Reserved(getNumRegs());
+ const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+ if (TFI->hasFP(MF)) {
+ // R11D is the frame pointer. Reserve all aliases.
+ Reserved.set(SystemZ::R11D);
+ Reserved.set(SystemZ::R11W);
+ Reserved.set(SystemZ::R10Q);
+ }
+
+ // R15D is the stack pointer. Reserve all aliases.
+ Reserved.set(SystemZ::R15D);
+ Reserved.set(SystemZ::R15W);
+ Reserved.set(SystemZ::R14Q);
+ return Reserved;
+}
+
+bool
+SystemZRegisterInfo::saveScavengerRegister(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator SaveMBBI,
+ MachineBasicBlock::iterator &UseMBBI,
+ const TargetRegisterClass *RC,
+ unsigned Reg) const {
+ MachineFunction &MF = *MBB.getParent();
+ const SystemZInstrInfo &TII =
+ *static_cast<const SystemZInstrInfo*>(TM.getInstrInfo());
+ const SystemZFrameLowering *TFI =
+ static_cast<const SystemZFrameLowering *>(TM.getFrameLowering());
+ unsigned Base = getFrameRegister(MF);
+ uint64_t Offset = TFI->getEmergencySpillSlotOffset(MF);
+ DebugLoc DL;
+
+ unsigned LoadOpcode, StoreOpcode;
+ TII.getLoadStoreOpcodes(RC, LoadOpcode, StoreOpcode);
+
+ // The offset must always be in range of a 12-bit unsigned displacement.
+ BuildMI(MBB, SaveMBBI, DL, TII.get(StoreOpcode))
+ .addReg(Reg, RegState::Kill).addReg(Base).addImm(Offset).addReg(0);
+ BuildMI(MBB, UseMBBI, DL, TII.get(LoadOpcode), Reg)
+ .addReg(Base).addImm(Offset).addReg(0);
+ return true;
+}
+
+void
+SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS) const {
+ assert(SPAdj == 0 && "Outgoing arguments should be part of the frame");
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ MachineFunction &MF = *MBB.getParent();
+ const SystemZInstrInfo &TII =
+ *static_cast<const SystemZInstrInfo*>(TM.getInstrInfo());
+ const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+ DebugLoc DL = MI->getDebugLoc();
+
+ // Decompose the frame index into a base and offset.
+ int FrameIndex = MI->getOperand(FIOperandNum).getIndex();
+ unsigned BasePtr = getFrameRegister(MF);
+ int64_t Offset = (TFI->getFrameIndexOffset(MF, FrameIndex) +
+ MI->getOperand(FIOperandNum + 1).getImm());
+
+ // Special handling of dbg_value instructions.
+ if (MI->isDebugValue()) {
+ MI->getOperand(FIOperandNum).ChangeToRegister(BasePtr, /*isDef*/ false);
+ MI->getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+ return;
+ }
+
+ // See if the offset is in range, or if an equivalent instruction that
+ // accepts the offset exists.
+ unsigned Opcode = MI->getOpcode();
+ unsigned OpcodeForOffset = TII.getOpcodeForOffset(Opcode, Offset);
+ if (OpcodeForOffset)
+ MI->getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
+ else {
+ // Create an anchor point that is in range. Start at 0xffff so that
+ // we can use LLILH to load the immediate.
+ int64_t OldOffset = Offset;
+ int64_t Mask = 0xffff;
+ do {
+ Offset = OldOffset & Mask;
+ OpcodeForOffset = TII.getOpcodeForOffset(Opcode, Offset);
+ Mask >>= 1;
+ assert(Mask && "One offset must be OK");
+ } while (!OpcodeForOffset);
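+ // Illustrative example (hypothetical values): for an out-of-range
+ // offset such as 0x12345, the loop above might settle on
+ // Offset = 0x2345 and HighOffset = 0x10000, assuming the instruction
+ // has a long-displacement form; 0x10000 can then be loaded cheaply
+ // (for example with LLILH).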
+
+ unsigned ScratchReg =
+ MF.getRegInfo().createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+ int64_t HighOffset = OldOffset - Offset;
+
+ if (MI->getDesc().TSFlags & SystemZII::HasIndex
+ && MI->getOperand(FIOperandNum + 2).getReg() == 0) {
+ // Load the offset into the scratch register and use it as an index.
+ // The scratch register then dies here.
+ TII.loadImmediate(MBB, MI, ScratchReg, HighOffset);
+ MI->getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
+ MI->getOperand(FIOperandNum + 2).ChangeToRegister(ScratchReg,
+ false, false, true);
+ } else {
+ // Load the anchor address into a scratch register.
+ unsigned LAOpcode = TII.getOpcodeForOffset(SystemZ::LA, HighOffset);
+ if (LAOpcode)
+ BuildMI(MBB, MI, DL, TII.get(LAOpcode),ScratchReg)
+ .addReg(BasePtr).addImm(HighOffset).addReg(0);
+ else {
+ // Load the high offset into the scratch register and use it as
+ // an index.
+ TII.loadImmediate(MBB, MI, ScratchReg, HighOffset);
+ BuildMI(MBB, MI, DL, TII.get(SystemZ::AGR),ScratchReg)
+ .addReg(ScratchReg, RegState::Kill).addReg(BasePtr);
+ }
+
+ // Use the scratch register as the base. It then dies here.
+ MI->getOperand(FIOperandNum).ChangeToRegister(ScratchReg,
+ false, false, true);
+ }
+ }
+ MI->setDesc(TII.get(OpcodeForOffset));
+ MI->getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+}
+
+unsigned
+SystemZRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+ const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+ return TFI->hasFP(MF) ? SystemZ::R11D : SystemZ::R15D;
+}
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h
new file mode 100644
index 0000000..047cb4a
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -0,0 +1,69 @@
+//===-- SystemZRegisterInfo.h - SystemZ register information ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SystemZREGISTERINFO_H
+#define SystemZREGISTERINFO_H
+
+#include "SystemZ.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "SystemZGenRegisterInfo.inc"
+
+namespace llvm {
+
+namespace SystemZ {
+ // Return the subreg to use for referring to the even and odd registers
+ // in a GR128 pair. Is32Bit says whether we want a GR32 or GR64.
+ inline unsigned even128(bool Is32bit) {
+ return Is32bit ? subreg_32bit : subreg_high;
+ }
+ inline unsigned odd128(bool Is32bit) {
+ return Is32bit ? subreg_low32 : subreg_low;
+ }
+}
+
+class SystemZSubtarget;
+class SystemZInstrInfo;
+
+struct SystemZRegisterInfo : public SystemZGenRegisterInfo {
+private:
+ SystemZTargetMachine &TM;
+
+public:
+ SystemZRegisterInfo(SystemZTargetMachine &tm);
+
+ // Override TargetRegisterInfo.h.
+ virtual bool requiresRegisterScavenging(const MachineFunction &MF) const
+ LLVM_OVERRIDE {
+ return true;
+ }
+ virtual bool requiresFrameIndexScavenging(const MachineFunction &MF) const
+ LLVM_OVERRIDE {
+ return true;
+ }
+ virtual const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0)
+ const LLVM_OVERRIDE;
+ virtual BitVector getReservedRegs(const MachineFunction &MF)
+ const LLVM_OVERRIDE;
+ virtual bool saveScavengerRegister(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator SaveMBBI,
+ MachineBasicBlock::iterator &UseMBBI,
+ const TargetRegisterClass *RC,
+ unsigned Reg) const LLVM_OVERRIDE;
+ virtual void eliminateFrameIndex(MachineBasicBlock::iterator MI,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS) const LLVM_OVERRIDE;
+ virtual unsigned getFrameRegister(const MachineFunction &MF) const
+ LLVM_OVERRIDE;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.td b/lib/Target/SystemZ/SystemZRegisterInfo.td
new file mode 100644
index 0000000..d65553e
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -0,0 +1,151 @@
+//==- SystemZRegisterInfo.td - SystemZ register definitions -*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Class definitions.
+//===----------------------------------------------------------------------===//
+
+class SystemZReg<string n> : Register<n> {
+ let Namespace = "SystemZ";
+}
+
+class SystemZRegWithSubregs<string n, list<Register> subregs>
+ : RegisterWithSubRegs<n, subregs> {
+ let Namespace = "SystemZ";
+}
+
+let Namespace = "SystemZ" in {
+def subreg_32bit : SubRegIndex<32>; // could also be named "subreg_high32"
+// Indices are used in a variety of ways, so don't set an Offset.
+def subreg_high : SubRegIndex<64, -1>;
+def subreg_low : SubRegIndex<64, -1>;
+def subreg_low32 : ComposedSubRegIndex<subreg_low, subreg_32bit>;
+}
+
+// Define a register class that contains values of type TYPE and an
+// associated operand called NAME. SIZE is the size and alignment
+// of the registers and REGLIST is the list of individual registers.
+multiclass SystemZRegClass<string name, ValueType type, int size, dag regList> {
+ def AsmOperand : AsmOperandClass {
+ let Name = name;
+ let ParserMethod = "parse"##name;
+ let RenderMethod = "addRegOperands";
+ }
+ def Bit : RegisterClass<"SystemZ", [type], size, regList> {
+ let Size = size;
+ }
+ def "" : RegisterOperand<!cast<RegisterClass>(name##"Bit")> {
+ let ParserMatchClass = !cast<AsmOperandClass>(name##"AsmOperand");
+ }
+}
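+// For instance, "defm GR32 : SystemZRegClass<...>" below expands to a
+// GR32AsmOperand operand class, a GR32Bit register class and a GR32
+// register operand; the other defms in this file follow the same naming
+// scheme.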
+
+//===----------------------------------------------------------------------===//
+// General-purpose registers
+//===----------------------------------------------------------------------===//
+
+// Lower 32 bits of one of the 16 64-bit general-purpose registers
+class GPR32<bits<16> num, string n> : SystemZReg<n> {
+ let HWEncoding = num;
+}
+
+// One of the 16 64-bit general-purpose registers.
+class GPR64<bits<16> num, string n, GPR32 low>
+ : SystemZRegWithSubregs<n, [low]> {
+ let HWEncoding = num;
+ let SubRegIndices = [subreg_32bit];
+}
+
+// 8 even-odd pairs of GPR64s.
+class GPR128<bits<16> num, string n, GPR64 high, GPR64 low>
+ : SystemZRegWithSubregs<n, [high, low]> {
+ let HWEncoding = num;
+ let SubRegIndices = [subreg_high, subreg_low];
+}
+
+// General-purpose registers
+foreach I = 0-15 in {
+ def R#I#W : GPR32<I, "r"#I>;
+ def R#I#D : GPR64<I, "r"#I, !cast<GPR32>("R"#I#"W")>, DwarfRegNum<[I]>;
+}
+
+foreach I = [0, 2, 4, 6, 8, 10, 12, 14] in {
+ def R#I#Q : GPR128<I, "r"#I, !cast<GPR64>("R"#I#"D"),
+ !cast<GPR64>("R"#!add(I, 1)#"D")>;
+}
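+// The loop above defines R0Q as the pair (R0D, R1D), R2Q as (R2D, R3D),
+// and so on for the remaining even/odd pairs.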
+
+/// Allocate the callee-saved R6-R13 backwards. That way they can be saved
+/// together with R14 and R15 in one prolog instruction.
+defm GR32 : SystemZRegClass<"GR32", i32, 32, (add (sequence "R%uW", 0, 5),
+ (sequence "R%uW", 15, 6))>;
+defm GR64 : SystemZRegClass<"GR64", i64, 64, (add (sequence "R%uD", 0, 5),
+ (sequence "R%uD", 15, 6))>;
+
+// The architecture doesn't really have any i128 support, so model the
+// register pairs as untyped instead.
+defm GR128 : SystemZRegClass<"GR128", untyped, 128, (add R0Q, R2Q, R4Q,
+ R12Q, R10Q, R8Q, R6Q,
+ R14Q)>;
+
+// Base and index registers. Everything except R0, which in an address
+// context evaluates as 0.
+defm ADDR32 : SystemZRegClass<"ADDR32", i32, 32, (sub GR32Bit, R0W)>;
+defm ADDR64 : SystemZRegClass<"ADDR64", i64, 64, (sub GR64Bit, R0D)>;
+
+// Not used directly, but needs to exist for ADDR32 and ADDR64 subregs
+// of a GR128.
+defm ADDR128 : SystemZRegClass<"ADDR128", untyped, 128, (sub GR128Bit, R0Q)>;
+
+//===----------------------------------------------------------------------===//
+// Floating-point registers
+//===----------------------------------------------------------------------===//
+
+// Lower 32 bits of one of the 16 64-bit floating-point registers
+class FPR32<bits<16> num, string n> : SystemZReg<n> {
+ let HWEncoding = num;
+}
+
+// One of the 16 64-bit floating-point registers
+class FPR64<bits<16> num, string n, FPR32 low>
+ : SystemZRegWithSubregs<n, [low]> {
+ let HWEncoding = num;
+ let SubRegIndices = [subreg_32bit];
+}
+
+// 8 pairs of FPR64s, with a one-register gap in between.
+class FPR128<bits<16> num, string n, FPR64 high, FPR64 low>
+ : SystemZRegWithSubregs<n, [high, low]> {
+ let HWEncoding = num;
+ let SubRegIndices = [subreg_high, subreg_low];
+}
+
+// Floating-point registers
+foreach I = 0-15 in {
+ def F#I#S : FPR32<I, "f"#I>;
+ def F#I#D : FPR64<I, "f"#I, !cast<FPR32>("F"#I#"S")>,
+ DwarfRegNum<[!add(I, 16)]>;
+}
+
+foreach I = [0, 1, 4, 5, 8, 9, 12, 13] in {
+ def F#I#Q : FPR128<I, "f"#I, !cast<FPR64>("F"#I#"D"),
+ !cast<FPR64>("F"#!add(I, 2)#"D")>;
+}
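+// The loop above pairs F0D with F2D to form F0Q, F1D with F3D to form F1Q,
+// and so on, reflecting the one-register gap mentioned above.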
+
+// There's no store-multiple instruction for FPRs, so we're not fussy
+// about the order in which call-saved registers are allocated.
+defm FP32 : SystemZRegClass<"FP32", f32, 32, (sequence "F%uS", 0, 15)>;
+defm FP64 : SystemZRegClass<"FP64", f64, 64, (sequence "F%uD", 0, 15)>;
+defm FP128 : SystemZRegClass<"FP128", f128, 128, (add F0Q, F1Q, F4Q, F5Q,
+ F8Q, F9Q, F12Q, F13Q)>;
+
+//===----------------------------------------------------------------------===//
+// Other registers
+//===----------------------------------------------------------------------===//
+
+// The 2-bit condition code field of the PSW.
+def CC : SystemZReg<"cc">;
diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp
new file mode 100644
index 0000000..cfd3324
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -0,0 +1,56 @@
+//===-- SystemZSubtarget.cpp - SystemZ subtarget information --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZSubtarget.h"
+#include "llvm/IR/GlobalValue.h"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "SystemZGenSubtargetInfo.inc"
+
+using namespace llvm;
+
+SystemZSubtarget::SystemZSubtarget(const std::string &TT,
+ const std::string &CPU,
+ const std::string &FS)
+ : SystemZGenSubtargetInfo(TT, CPU, FS), TargetTriple(TT) {
+ std::string CPUName = CPU;
+ if (CPUName.empty())
+ CPUName = "z10";
+
+ // Parse features string.
+ ParseSubtargetFeatures(CPUName, FS);
+}
+
+// Return true if GV binds locally under reloc model RM.
+static bool bindsLocally(const GlobalValue *GV, Reloc::Model RM) {
+ // For non-PIC, all symbols bind locally.
+ if (RM == Reloc::Static)
+ return true;
+
+ return GV->hasLocalLinkage() || !GV->hasDefaultVisibility();
+}
+
+bool SystemZSubtarget::isPC32DBLSymbol(const GlobalValue *GV,
+ Reloc::Model RM,
+ CodeModel::Model CM) const {
+ // PC32DBL accesses require the low bit to be clear. Note that a zero
+ // value selects the default alignment and is therefore OK.
+ if (GV->getAlignment() == 1)
+ return false;
+
+ // For the small model, all locally-binding symbols are in range.
+ if (CM == CodeModel::Small)
+ return bindsLocally(GV, RM);
+
+ // For Medium and above, assume that the symbol is not within the 4GB range.
+ // Taking the address of locally-defined text would be OK, but that
+ // case isn't easy to detect.
+ return false;
+}
diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h
new file mode 100644
index 0000000..8d4d450
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZSubtarget.h
@@ -0,0 +1,48 @@
+//===-- SystemZSubtarget.h - SystemZ subtarget information -----*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SystemZ specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SYSTEMZSUBTARGET_H
+#define SYSTEMZSUBTARGET_H
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "SystemZGenSubtargetInfo.inc"
+
+namespace llvm {
+class GlobalValue;
+class StringRef;
+
+class SystemZSubtarget : public SystemZGenSubtargetInfo {
+private:
+ Triple TargetTriple;
+
+public:
+ SystemZSubtarget(const std::string &TT, const std::string &CPU,
+ const std::string &FS);
+
+ // Automatically generated by tblgen.
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+ // Return true if GV can be accessed using LARL for reloc model RM
+ // and code model CM.
+ bool isPC32DBLSymbol(const GlobalValue *GV, Reloc::Model RM,
+ CodeModel::Model CM) const;
+
+ bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
+};
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp
new file mode 100644
index 0000000..6e7540c
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -0,0 +1,67 @@
+//===-- SystemZTargetMachine.cpp - Define TargetMachine for SystemZ -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+extern "C" void LLVMInitializeSystemZTarget() {
+ // Register the target.
+ RegisterTargetMachine<SystemZTargetMachine> X(TheSystemZTarget);
+}
+
+SystemZTargetMachine::SystemZTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Reloc::Model RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+ Subtarget(TT, CPU, FS),
+ // Make sure that global data has at least 16 bits of alignment by default,
+ // so that we can refer to it using LARL. We don't have any special
+ // requirements for stack variables though.
+ DL("E-p:64:64:64-i1:8:16-i8:8:16-i16:16-i32:32-i64:64"
+ "-f32:32-f64:64-f128:64-a0:8:16-n32:64"),
+ InstrInfo(*this), TLInfo(*this), TSInfo(*this),
+ FrameLowering(*this, Subtarget) {
+ initAsmInfo();
+}
+
+namespace {
+/// SystemZ Code Generator Pass Configuration Options.
+class SystemZPassConfig : public TargetPassConfig {
+public:
+ SystemZPassConfig(SystemZTargetMachine *TM, PassManagerBase &PM)
+ : TargetPassConfig(TM, PM) {}
+
+ SystemZTargetMachine &getSystemZTargetMachine() const {
+ return getTM<SystemZTargetMachine>();
+ }
+
+ virtual bool addInstSelector() LLVM_OVERRIDE;
+ virtual bool addPreEmitPass() LLVM_OVERRIDE;
+};
+} // end anonymous namespace
+
+bool SystemZPassConfig::addInstSelector() {
+ addPass(createSystemZISelDag(getSystemZTargetMachine(), getOptLevel()));
+ return false;
+}
+
+bool SystemZPassConfig::addPreEmitPass() {
+ addPass(createSystemZLongBranchPass(getSystemZTargetMachine()));
+ return true;
+}
+
+TargetPassConfig *SystemZTargetMachine::createPassConfig(PassManagerBase &PM) {
+ return new SystemZPassConfig(this, PM);
+}
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h
new file mode 100644
index 0000000..98614e7
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZTargetMachine.h
@@ -0,0 +1,74 @@
+//==- SystemZTargetMachine.h - Define TargetMachine for SystemZ ---*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SystemZ specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef SYSTEMZTARGETMACHINE_H
+#define SYSTEMZTARGETMACHINE_H
+
+#include "SystemZFrameLowering.h"
+#include "SystemZISelLowering.h"
+#include "SystemZInstrInfo.h"
+#include "SystemZRegisterInfo.h"
+#include "SystemZSubtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+class SystemZTargetMachine : public LLVMTargetMachine {
+ SystemZSubtarget Subtarget;
+ const DataLayout DL;
+ SystemZInstrInfo InstrInfo;
+ SystemZTargetLowering TLInfo;
+ TargetSelectionDAGInfo TSInfo;
+ SystemZFrameLowering FrameLowering;
+
+public:
+ SystemZTargetMachine(const Target &T, StringRef TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+
+ // Override TargetMachine.
+ virtual const TargetFrameLowering *getFrameLowering() const LLVM_OVERRIDE {
+ return &FrameLowering;
+ }
+ virtual const SystemZInstrInfo *getInstrInfo() const LLVM_OVERRIDE {
+ return &InstrInfo;
+ }
+ virtual const SystemZSubtarget *getSubtargetImpl() const LLVM_OVERRIDE {
+ return &Subtarget;
+ }
+ virtual const DataLayout *getDataLayout() const LLVM_OVERRIDE {
+ return &DL;
+ }
+ virtual const SystemZRegisterInfo *getRegisterInfo() const LLVM_OVERRIDE {
+ return &InstrInfo.getRegisterInfo();
+ }
+ virtual const SystemZTargetLowering *getTargetLowering() const LLVM_OVERRIDE {
+ return &TLInfo;
+ }
+ virtual const TargetSelectionDAGInfo *getSelectionDAGInfo() const
+ LLVM_OVERRIDE {
+ return &TSInfo;
+ }
+
+ // Override LLVMTargetMachine
+ virtual TargetPassConfig *createPassConfig(PassManagerBase &PM) LLVM_OVERRIDE;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/SystemZ/TargetInfo/CMakeLists.txt b/lib/Target/SystemZ/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..b6051d3
--- /dev/null
+++ b/lib/Target/SystemZ/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMSystemZInfo
+ SystemZTargetInfo.cpp
+ )
+
+add_dependencies(LLVMSystemZInfo SystemZCommonTableGen)
diff --git a/lib/Target/SystemZ/TargetInfo/LLVMBuild.txt b/lib/Target/SystemZ/TargetInfo/LLVMBuild.txt
new file mode 100644
index 0000000..ea43736
--- /dev/null
+++ b/lib/Target/SystemZ/TargetInfo/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===- ./lib/Target/SystemZ/TargetInfo/LLVMBuild.txt ------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = SystemZInfo
+parent = SystemZ
+required_libraries = MC Support Target
+add_to_library_groups = SystemZ
diff --git a/lib/Target/SystemZ/TargetInfo/Makefile b/lib/Target/SystemZ/TargetInfo/Makefile
new file mode 100644
index 0000000..0be80eb
--- /dev/null
+++ b/lib/Target/SystemZ/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/SystemZ/TargetInfo/Makefile --------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMSystemZInfo
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp b/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
new file mode 100644
index 0000000..8f9aa28
--- /dev/null
+++ b/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
@@ -0,0 +1,20 @@
+//===-- SystemZTargetInfo.cpp - SystemZ target implementation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZ.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+Target llvm::TheSystemZTarget;
+
+extern "C" void LLVMInitializeSystemZTargetInfo() {
+ RegisterTarget<Triple::systemz, /*HasJIT=*/true>
+ X(TheSystemZTarget, "systemz", "SystemZ");
+}
diff --git a/lib/Target/TargetLibraryInfo.cpp b/lib/Target/TargetLibraryInfo.cpp
index ee88ce7..d2967d9 100644
--- a/lib/Target/TargetLibraryInfo.cpp
+++ b/lib/Target/TargetLibraryInfo.cpp
@@ -43,6 +43,9 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
"__isoc99_scanf",
"__isoc99_sscanf",
"__memcpy_chk",
+ "__sqrt_finite",
+ "__sqrtf_finite",
+ "__sqrtl_finite",
"__strdup",
"__strndup",
"__strtok_r",
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index fef5cfe..263eb5e 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -219,7 +219,9 @@ private:
const MCExpr *getSym() { return Sym; }
StringRef getSymName() { return SymName; }
int64_t getImm() { return Imm + IC.execute(); }
- bool isValidEndState() { return State == IES_RBRAC; }
+ bool isValidEndState() {
+ return State == IES_RBRAC || State == IES_INTEGER;
+ }
bool getStopOnLBrac() { return StopOnLBrac; }
bool getAddImmPrefix() { return AddImmPrefix; }
bool hadError() { return State == IES_ERROR; }
@@ -477,7 +479,7 @@ private:
MCAsmLexer &getLexer() const { return Parser.getLexer(); }
bool Error(SMLoc L, const Twine &Msg,
- ArrayRef<SMRange> Ranges = ArrayRef<SMRange>(),
+ ArrayRef<SMRange> Ranges = None,
bool MatchingInlineAsm = false) {
if (MatchingInlineAsm) return true;
return Parser.Error(L, Msg, Ranges);
@@ -500,7 +502,8 @@ private:
X86Operand *ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
int64_t ImmDisp, unsigned Size);
X86Operand *ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier,
- InlineAsmIdentifierInfo &Info, SMLoc &End);
+ InlineAsmIdentifierInfo &Info,
+ bool IsUnevaluatedOperand, SMLoc &End);
X86Operand *ParseMemOperand(unsigned SegReg, SMLoc StartLoc);
@@ -1193,6 +1196,7 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites,
}
}
assert (Found && "Unable to rewrite ImmDisp.");
+ (void)Found;
} else {
// We have a symbolic and an immediate displacement, but no displacement
// before the bracketed expression. Put the immediate displacement
@@ -1267,7 +1271,8 @@ X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
return ErrorOperand(Tok.getLoc(), "Unexpected identifier!");
} else {
InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
- if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info, End))
+ if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info,
+ /*Unevaluated*/ false, End))
return Err;
}
SM.onIdentifierExpr(Val, Identifier);
@@ -1367,27 +1372,26 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
X86Operand *X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val,
StringRef &Identifier,
InlineAsmIdentifierInfo &Info,
+ bool IsUnevaluatedOperand,
SMLoc &End) {
assert (isParsingInlineAsm() && "Expected to be parsing inline assembly.");
Val = 0;
StringRef LineBuf(Identifier.data());
- SemaCallback->LookupInlineAsmIdentifier(LineBuf, Info);
- unsigned BufLen = LineBuf.size();
- assert (BufLen && "Expected a non-zero length identifier.");
+ SemaCallback->LookupInlineAsmIdentifier(LineBuf, Info, IsUnevaluatedOperand);
- // Advance the token stream based on what the frontend parsed.
const AsmToken &Tok = Parser.getTok();
- AsmToken IdentEnd = Tok;
- while (BufLen > 0) {
- IdentEnd = Tok;
- BufLen -= Tok.getString().size();
- getLexer().Lex(); // Consume the token.
+
+ // Advance the token stream until the end of the current token is
+ // after the end of what the frontend claimed.
+ const char *EndPtr = Tok.getLoc().getPointer() + LineBuf.size();
+ while (true) {
+ End = Tok.getEndLoc();
+ getLexer().Lex();
+
+ assert(End.getPointer() <= EndPtr && "frontend claimed part of a token?");
+ if (End.getPointer() == EndPtr) break;
}
- if (BufLen != 0)
- return ErrorOperand(IdentEnd.getLoc(),
- "Frontend parser mismatch with asm lexer!");
- End = IdentEnd.getEndLoc();
// Create the symbol reference.
Identifier = LineBuf;
@@ -1447,7 +1451,8 @@ X86Operand *X86AsmParser::ParseIntelMemOperand(unsigned SegReg,
InlineAsmIdentifierInfo Info;
StringRef Identifier = Tok.getString();
- if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info, End))
+ if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info,
+ /*Unevaluated*/ false, End))
return Err;
return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0,/*IndexReg=*/0,
/*Scale=*/1, Start, End, Size, Identifier, Info);
@@ -1506,7 +1511,8 @@ X86Operand *X86AsmParser::ParseIntelOffsetOfOperator() {
InlineAsmIdentifierInfo Info;
SMLoc Start = Tok.getLoc(), End;
StringRef Identifier = Tok.getString();
- if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info, End))
+ if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info,
+ /*Unevaluated*/ false, End))
return Err;
// Don't emit the offset operator.
@@ -1541,7 +1547,8 @@ X86Operand *X86AsmParser::ParseIntelOperator(unsigned OpKind) {
InlineAsmIdentifierInfo Info;
SMLoc Start = Tok.getLoc(), End;
StringRef Identifier = Tok.getString();
- if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info, End))
+ if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info,
+ /*Unevaluated*/ true, End))
return Err;
unsigned CVal = 0;
@@ -2200,7 +2207,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
assert(!Operands.empty() && "Unexpected empty operand list!");
X86Operand *Op = static_cast<X86Operand*>(Operands[0]);
assert(Op->isToken() && "Leading operand should always be a mnemonic!");
- ArrayRef<SMRange> EmptyRanges = ArrayRef<SMRange>();
+ ArrayRef<SMRange> EmptyRanges = None;
// First, handle aliases that expand to multiple instructions.
// FIXME: This should be replaced with a real .td file alias mechanism.
@@ -2302,25 +2309,25 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
unsigned Match1, Match2, Match3, Match4;
Match1 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
- isParsingIntelSyntax());
+ MatchingInlineAsm, isParsingIntelSyntax());
// If this returned as a missing feature failure, remember that.
if (Match1 == Match_MissingFeature)
ErrorInfoMissingFeature = ErrorInfoIgnore;
Tmp[Base.size()] = Suffixes[1];
Match2 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
- isParsingIntelSyntax());
+ MatchingInlineAsm, isParsingIntelSyntax());
// If this returned as a missing feature failure, remember that.
if (Match2 == Match_MissingFeature)
ErrorInfoMissingFeature = ErrorInfoIgnore;
Tmp[Base.size()] = Suffixes[2];
Match3 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
- isParsingIntelSyntax());
+ MatchingInlineAsm, isParsingIntelSyntax());
// If this returned as a missing feature failure, remember that.
if (Match3 == Match_MissingFeature)
ErrorInfoMissingFeature = ErrorInfoIgnore;
Tmp[Base.size()] = Suffixes[3];
Match4 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
- isParsingIntelSyntax());
+ MatchingInlineAsm, isParsingIntelSyntax());
// If this returned as a missing feature failure, remember that.
if (Match4 == Match_MissingFeature)
ErrorInfoMissingFeature = ErrorInfoIgnore;
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
index ca6f80c..ca71c4f 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -190,94 +190,8 @@ static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch,
uint64_t Address, uint64_t Offset,
uint64_t Width, MCInst &MI,
const MCDisassembler *Dis) {
- LLVMOpInfoCallback getOpInfo = Dis->getLLVMOpInfoCallback();
- struct LLVMOpInfo1 SymbolicOp;
- memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
- SymbolicOp.Value = Value;
- void *DisInfo = Dis->getDisInfoBlock();
-
- if (!getOpInfo ||
- !getOpInfo(DisInfo, Address, Offset, Width, 1, &SymbolicOp)) {
- // Clear SymbolicOp.Value from above and also all other fields.
- memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
- LLVMSymbolLookupCallback SymbolLookUp = Dis->getLLVMSymbolLookupCallback();
- if (!SymbolLookUp)
- return false;
- uint64_t ReferenceType;
- if (isBranch)
- ReferenceType = LLVMDisassembler_ReferenceType_In_Branch;
- else
- ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
- const char *ReferenceName;
- const char *Name = SymbolLookUp(DisInfo, Value, &ReferenceType, Address,
- &ReferenceName);
- if (Name) {
- SymbolicOp.AddSymbol.Name = Name;
- SymbolicOp.AddSymbol.Present = true;
- }
- // For branches always create an MCExpr so it gets printed as hex address.
- else if (isBranch) {
- SymbolicOp.Value = Value;
- }
- if(ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub)
- (*Dis->CommentStream) << "symbol stub for: " << ReferenceName;
- if (!Name && !isBranch)
- return false;
- }
-
- MCContext *Ctx = Dis->getMCContext();
- const MCExpr *Add = NULL;
- if (SymbolicOp.AddSymbol.Present) {
- if (SymbolicOp.AddSymbol.Name) {
- StringRef Name(SymbolicOp.AddSymbol.Name);
- MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
- Add = MCSymbolRefExpr::Create(Sym, *Ctx);
- } else {
- Add = MCConstantExpr::Create((int)SymbolicOp.AddSymbol.Value, *Ctx);
- }
- }
-
- const MCExpr *Sub = NULL;
- if (SymbolicOp.SubtractSymbol.Present) {
- if (SymbolicOp.SubtractSymbol.Name) {
- StringRef Name(SymbolicOp.SubtractSymbol.Name);
- MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
- Sub = MCSymbolRefExpr::Create(Sym, *Ctx);
- } else {
- Sub = MCConstantExpr::Create((int)SymbolicOp.SubtractSymbol.Value, *Ctx);
- }
- }
-
- const MCExpr *Off = NULL;
- if (SymbolicOp.Value != 0)
- Off = MCConstantExpr::Create(SymbolicOp.Value, *Ctx);
-
- const MCExpr *Expr;
- if (Sub) {
- const MCExpr *LHS;
- if (Add)
- LHS = MCBinaryExpr::CreateSub(Add, Sub, *Ctx);
- else
- LHS = MCUnaryExpr::CreateMinus(Sub, *Ctx);
- if (Off != 0)
- Expr = MCBinaryExpr::CreateAdd(LHS, Off, *Ctx);
- else
- Expr = LHS;
- } else if (Add) {
- if (Off != 0)
- Expr = MCBinaryExpr::CreateAdd(Add, Off, *Ctx);
- else
- Expr = Add;
- } else {
- if (Off != 0)
- Expr = Off;
- else
- Expr = MCConstantExpr::Create(0, *Ctx);
- }
-
- MI.addOperand(MCOperand::CreateExpr(Expr));
-
- return true;
+ return Dis->tryAddingSymbolicOperand(MI, Value, Address, isBranch,
+ Offset, Width);
}
/// tryAddingPcLoadReferenceComment - tries to add a comment as to what is being
@@ -290,15 +204,7 @@ static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch,
static void tryAddingPcLoadReferenceComment(uint64_t Address, uint64_t Value,
const void *Decoder) {
const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
- LLVMSymbolLookupCallback SymbolLookUp = Dis->getLLVMSymbolLookupCallback();
- if (SymbolLookUp) {
- void *DisInfo = Dis->getDisInfoBlock();
- uint64_t ReferenceType = LLVMDisassembler_ReferenceType_In_PCrel_Load;
- const char *ReferenceName;
- (void)SymbolLookUp(DisInfo, Value, &ReferenceType, Address, &ReferenceName);
- if(ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr)
- (*Dis->CommentStream) << "literal pool for: " << ReferenceName;
- }
+ Dis->tryAddingPcLoadReferenceComment(Value, Address);
}
/// translateImmediate - Appends an immediate operand to an MCInst.
diff --git a/lib/Target/X86/MCTargetDesc/CMakeLists.txt b/lib/Target/X86/MCTargetDesc/CMakeLists.txt
index 1c240e5..2eb5f25 100644
--- a/lib/Target/X86/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/X86/MCTargetDesc/CMakeLists.txt
@@ -6,6 +6,8 @@ add_llvm_library(LLVMX86Desc
X86MachObjectWriter.cpp
X86ELFObjectWriter.cpp
X86WinCOFFObjectWriter.cpp
+ X86MachORelocationInfo.cpp
+ X86ELFRelocationInfo.cpp
)
add_dependencies(LLVMX86Desc X86CommonTableGen)
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
new file mode 100644
index 0000000..8f4ab46
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
@@ -0,0 +1,135 @@
+//===-- X86ELFRelocationInfo.cpp ----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCRelocationInfo.h"
+#include "llvm/Object/ELF.h"
+#include "llvm/Support/ELF.h"
+
+using namespace llvm;
+using namespace object;
+using namespace ELF;
+
+namespace {
+class X86_64ELFRelocationInfo : public MCRelocationInfo {
+public:
+ X86_64ELFRelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {}
+
+ const MCExpr *createExprForRelocation(RelocationRef Rel) {
+ uint64_t RelType; Rel.getType(RelType);
+ symbol_iterator SymI = Rel.getSymbol();
+
+ StringRef SymName; SymI->getName(SymName);
+ uint64_t SymAddr; SymI->getAddress(SymAddr);
+ uint64_t SymSize; SymI->getSize(SymSize);
+ int64_t Addend; getELFRelocationAddend(Rel, Addend);
+
+ MCSymbol *Sym = Ctx.GetOrCreateSymbol(SymName);
+ // FIXME: check that the value is actually the same.
+ if (Sym->isVariable() == false)
+ Sym->setVariableValue(MCConstantExpr::Create(SymAddr, Ctx));
+
+ const MCExpr *Expr = 0;
+ // If hasAddend is true, then we need to add Addend (r_addend) to Expr.
+ bool hasAddend = false;
+
+ // The AMD64 SysV ABI says:
+ // A: the addend used to compute the value of the relocatable field.
+ // B: the base address at which a shared object has been loaded into memory
+ // during execution. Generally, a shared object is built with a 0 base
+ // virtual address, but the execution address will be different.
+ // G: the offset into the global offset table at which the relocation
+ // entry's symbol will reside during execution.
+ // GOT: the address of the global offset table.
+ // L: the place (section offset or address) of the Procedure Linkage Table
+ // entry for a symbol.
+ // P: the place (section offset or address) of the storage unit being
+ // relocated (computed using r_offset).
+ // S: the value of the symbol whose index resides in the relocation entry.
+ // Z: the size of the symbol whose index resides in the relocation entry.
+
+ switch(RelType) {
+ case R_X86_64_NONE:
+ case R_X86_64_COPY:
+ // none
+ break;
+ case R_X86_64_64:
+ case R_X86_64_16:
+ case R_X86_64_8:
+ // S + A
+ case R_X86_64_32:
+ case R_X86_64_32S:
+ // S + A (We don't care about the result not fitting in 32 bits.)
+ case R_X86_64_PC32:
+ case R_X86_64_PC16:
+ case R_X86_64_PC8:
+ case R_X86_64_PC64:
+ // S + A - P (P/pcrel is implicit)
+ hasAddend = true;
+ Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+ break;
+ case R_X86_64_GOT32:
+ case R_X86_64_GOT64:
+ case R_X86_64_GOTPC32:
+ case R_X86_64_GOTPC64:
+ case R_X86_64_GOTPLT64:
+ // G + A
+ hasAddend = true;
+ Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, Ctx);
+ break;
+ case R_X86_64_PLT32:
+ // L + A - P -> S@PLT + A
+ hasAddend = true;
+ Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_PLT, Ctx);
+ break;
+ case R_X86_64_GLOB_DAT:
+ case R_X86_64_JUMP_SLOT:
+ // S
+ Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+ break;
+ case R_X86_64_GOTPCREL:
+ case R_X86_64_GOTPCREL64:
+ // G + GOT + A - P -> S@GOTPCREL + A
+ hasAddend = true;
+ Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Ctx);
+ break;
+ case R_X86_64_GOTOFF64:
+ // S + A - GOT
+ Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTOFF, Ctx);
+ break;
+ case R_X86_64_PLTOFF64:
+ // L + A - GOT
+ break;
+ case R_X86_64_SIZE32:
+ case R_X86_64_SIZE64:
+ // Z + A
+ Expr = MCConstantExpr::Create(SymSize, Ctx);
+ break;
+ default:
+ Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+ break;
+ }
+ if (Expr && hasAddend && Addend != 0)
+ Expr = MCBinaryExpr::CreateAdd(Expr,
+ MCConstantExpr::Create(Addend, Ctx),
+ Ctx);
+ return Expr;
+ }
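+ // Illustrative example (hypothetical input): an R_X86_64_PLT32
+ // relocation against "foo" with an addend of -4 would come out of
+ // createExprForRelocation above as the sum of a foo@PLT symbol
+ // reference and the constant -4.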
+};
+} // End unnamed namespace
+
+/// createX86_64ELFRelocationInfo - Construct an X86-64 ELF RelocationInfo.
+MCRelocationInfo *llvm::createX86_64ELFRelocationInfo(MCContext &Ctx) {
+ // We only handle x86-64 for now.
+ return new X86_64ELFRelocationInfo(Ctx);
+}
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index 5e84530..bd23ce4 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -263,7 +263,7 @@ static MCRegisterInfo *createX86MCRegisterInfo(StringRef TT) {
return X;
}
-static MCAsmInfo *createX86MCAsmInfo(const Target &T, StringRef TT) {
+static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) {
Triple TheTriple(TT);
bool is64Bit = TheTriple.getArch() == Triple::x86_64;
@@ -290,14 +290,16 @@ static MCAsmInfo *createX86MCAsmInfo(const Target &T, StringRef TT) {
int stackGrowth = is64Bit ? -8 : -4;
// Initial state of the frame pointer is esp+stackGrowth.
- MachineLocation Dst(MachineLocation::VirtualFP);
- MachineLocation Src(is64Bit ? X86::RSP : X86::ESP, stackGrowth);
- MAI->addInitialFrameState(0, Dst, Src);
+ unsigned StackPtr = is64Bit ? X86::RSP : X86::ESP;
+ MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(
+ 0, MRI.getDwarfRegNum(StackPtr, true), -stackGrowth);
+ MAI->addInitialFrameState(Inst);
// Add return address to move list
- MachineLocation CSDst(is64Bit ? X86::RSP : X86::ESP, stackGrowth);
- MachineLocation CSSrc(is64Bit ? X86::RIP : X86::EIP);
- MAI->addInitialFrameState(0, CSDst, CSSrc);
+ unsigned InstPtr = is64Bit ? X86::RIP : X86::EIP;
+ MCCFIInstruction Inst2 = MCCFIInstruction::createOffset(
+ 0, MRI.getDwarfRegNum(InstPtr, true), stackGrowth);
+ MAI->addInitialFrameState(Inst2);
return MAI;
}
@@ -382,6 +384,17 @@ static MCInstPrinter *createX86MCInstPrinter(const Target &T,
return 0;
}
+static MCRelocationInfo *createX86MCRelocationInfo(StringRef TT,
+ MCContext &Ctx) {
+ Triple TheTriple(TT);
+ if (TheTriple.isEnvironmentMachO() && TheTriple.getArch() == Triple::x86_64)
+ return createX86_64MachORelocationInfo(Ctx);
+ else if (TheTriple.isOSBinFormatELF())
+ return createX86_64ELFRelocationInfo(Ctx);
+ // Default to the stock relocation info.
+ return llvm::createMCRelocationInfo(TT, Ctx);
+}
+
static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) {
return new MCInstrAnalysis(Info);
}
@@ -439,4 +452,10 @@ extern "C" void LLVMInitializeX86TargetMC() {
createX86MCInstPrinter);
TargetRegistry::RegisterMCInstPrinter(TheX86_64Target,
createX86MCInstPrinter);
+
+ // Register the MC relocation info.
+ TargetRegistry::RegisterMCRelocationInfo(TheX86_32Target,
+ createX86MCRelocationInfo);
+ TargetRegistry::RegisterMCRelocationInfo(TheX86_64Target,
+ createX86MCRelocationInfo);
}
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index 981aa1a..2f459b4 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -25,6 +25,7 @@ class MCInstrInfo;
class MCObjectWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
+class MCRelocationInfo;
class Target;
class StringRef;
class raw_ostream;
@@ -94,6 +95,12 @@ MCObjectWriter *createX86ELFObjectWriter(raw_ostream &OS,
uint16_t EMachine);
/// createX86WinCOFFObjectWriter - Construct an X86 Win COFF object writer.
MCObjectWriter *createX86WinCOFFObjectWriter(raw_ostream &OS, bool Is64Bit);
+
+/// createX86_64MachORelocationInfo - Construct X86-64 Mach-O relocation info.
+MCRelocationInfo *createX86_64MachORelocationInfo(MCContext &Ctx);
+
+/// createX86_64ELFRelocationInfo - Construct X86-64 ELF relocation info.
+MCRelocationInfo *createX86_64ELFRelocationInfo(MCContext &Ctx);
} // End llvm namespace
diff --git a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
new file mode 100644
index 0000000..75b5acf
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
@@ -0,0 +1,116 @@
+//===-- X86MachORelocationInfo.cpp ----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCRelocationInfo.h"
+#include "llvm/Object/MachO.h"
+
+using namespace llvm;
+using namespace object;
+using namespace macho;
+
+namespace {
+class X86_64MachORelocationInfo : public MCRelocationInfo {
+public:
+ X86_64MachORelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {}
+
+ const MCExpr *createExprForRelocation(RelocationRef Rel) {
+ const MachOObjectFile *Obj = cast<MachOObjectFile>(Rel.getObjectFile());
+
+ uint64_t RelType; Rel.getType(RelType);
+ symbol_iterator SymI = Rel.getSymbol();
+
+ StringRef SymName; SymI->getName(SymName);
+ uint64_t SymAddr; SymI->getAddress(SymAddr);
+
+ RelocationEntry RE = Obj->getRelocation(Rel.getRawDataRefImpl());
+ bool isPCRel = Obj->getAnyRelocationPCRel(RE);
+
+ MCSymbol *Sym = Ctx.GetOrCreateSymbol(SymName);
+ // FIXME: check that the value is actually the same.
+ if (Sym->isVariable() == false)
+ Sym->setVariableValue(MCConstantExpr::Create(SymAddr, Ctx));
+ const MCExpr *Expr = 0;
+
+ switch(RelType) {
+ case RIT_X86_64_TLV:
+ Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx);
+ break;
+ case RIT_X86_64_Signed4:
+ Expr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Sym, Ctx),
+ MCConstantExpr::Create(4, Ctx),
+ Ctx);
+ break;
+ case RIT_X86_64_Signed2:
+ Expr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Sym, Ctx),
+ MCConstantExpr::Create(2, Ctx),
+ Ctx);
+ break;
+ case RIT_X86_64_Signed1:
+ Expr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Sym, Ctx),
+ MCConstantExpr::Create(1, Ctx),
+ Ctx);
+ break;
+ case RIT_X86_64_GOTLoad:
+ Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Ctx);
+ break;
+ case RIT_X86_64_GOT:
+ Expr = MCSymbolRefExpr::Create(Sym, isPCRel ?
+ MCSymbolRefExpr::VK_GOTPCREL :
+ MCSymbolRefExpr::VK_GOT,
+ Ctx);
+ break;
+ case RIT_X86_64_Subtractor:
+ {
+ RelocationRef RelNext;
+ Obj->getRelocationNext(Rel.getRawDataRefImpl(), RelNext);
+ RelocationEntry RENext = Obj->getRelocation(RelNext.getRawDataRefImpl());
+
+ // X86_64_RELOC_SUBTRACTOR must be followed by a relocation of type
+ // X86_64_RELOC_UNSIGNED.
+ // NOTE: Scattered relocations don't exist on x86_64.
+ unsigned RType = Obj->getAnyRelocationType(RENext);
+ if (RType != RIT_X86_64_Unsigned)
+ report_fatal_error("Expected X86_64_RELOC_UNSIGNED after "
+ "X86_64_RELOC_SUBTRACTOR.");
+
+ const MCExpr *LHS = MCSymbolRefExpr::Create(Sym, Ctx);
+
+ symbol_iterator RSymI = RelNext.getSymbol();
+ uint64_t RSymAddr;
+ RSymI->getAddress(RSymAddr);
+ StringRef RSymName;
+ RSymI->getName(RSymName);
+
+ MCSymbol *RSym = Ctx.GetOrCreateSymbol(RSymName);
+      if (!RSym->isVariable())
+ RSym->setVariableValue(MCConstantExpr::Create(RSymAddr, Ctx));
+
+ const MCExpr *RHS = MCSymbolRefExpr::Create(RSym, Ctx);
+
+ Expr = MCBinaryExpr::CreateSub(LHS, RHS, Ctx);
+ break;
+ }
+ default:
+ Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+ break;
+ }
+ return Expr;
+ }
+};
+} // End unnamed namespace
+
+/// createX86_64MachORelocationInfo - Construct an X86-64 Mach-O RelocationInfo.
+MCRelocationInfo *llvm::createX86_64MachORelocationInfo(MCContext &Ctx) {
+ return new X86_64MachORelocationInfo(Ctx);
+}
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 87bb68d..c865500 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -255,11 +255,16 @@ def : Proc<"amdfam10", [FeatureSSE4A,
// Bobcat
def : Proc<"btver1", [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B,
FeatureLZCNT, FeaturePOPCNT]>;
+// Jaguar
+def : Proc<"btver2", [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B,
+ FeatureAES, FeaturePCLMUL, FeatureBMI,
+ FeatureF16C, FeatureMOVBE, FeatureLZCNT,
+ FeaturePOPCNT]>;
// Bulldozer
def : Proc<"bdver1", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
FeatureAES, FeaturePCLMUL,
FeatureLZCNT, FeaturePOPCNT]>;
-// Enhanced Bulldozer
+// Piledriver
def : Proc<"bdver2", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
FeatureAES, FeaturePCLMUL,
FeatureF16C, FeatureLZCNT,
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
index 8fea6ed..5d72b44 100644
--- a/lib/Target/X86/X86CodeEmitter.cpp
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -53,13 +53,8 @@ namespace {
static char ID;
explicit Emitter(X86TargetMachine &tm, CodeEmitter &mce)
: MachineFunctionPass(ID), II(0), TD(0), TM(tm),
- MCE(mce), PICBaseOffset(0), Is64BitMode(false),
- IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
- Emitter(X86TargetMachine &tm, CodeEmitter &mce,
- const X86InstrInfo &ii, const DataLayout &td, bool is64)
- : MachineFunctionPass(ID), II(&ii), TD(&td), TM(tm),
- MCE(mce), PICBaseOffset(0), Is64BitMode(is64),
- IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
+ MCE(mce), PICBaseOffset(0), Is64BitMode(false),
+ IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
bool runOnMachineFunction(MachineFunction &MF);
@@ -1270,7 +1265,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
: (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
- if (Opcode == X86::MOV64ri64i32)
+ if (Opcode == X86::MOV32ri64)
rt = X86::reloc_absolute_word; // FIXME: add X86II flag?
// This should not occur on Darwin for relocatable objects.
if (Opcode == X86::MOV64ri)
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index cf44bd0..295a577 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -45,10 +45,6 @@ class X86FastISel : public FastISel {
/// make the right decision when generating code for different targets.
const X86Subtarget *Subtarget;
- /// RegInfo - X86 register info.
- ///
- const X86RegisterInfo *RegInfo;
-
/// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
/// floating point ops.
/// When SSE is available, use it for f32 operations.
@@ -63,7 +59,6 @@ public:
Subtarget = &TM.getSubtarget<X86Subtarget>();
X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();
- RegInfo = static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
}
virtual bool TargetSelectInstruction(const Instruction *I);
@@ -1005,10 +1000,6 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
}
bool X86FastISel::X86SelectZExt(const Instruction *I) {
- // Handle zero-extension from i1 to i8, which is common.
- if (!I->getOperand(0)->getType()->isIntegerTy(1))
- return false;
-
EVT DstVT = TLI.getValueType(I->getType());
if (!TLI.isTypeLegal(DstVT))
return false;
@@ -1017,12 +1008,37 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) {
if (ResultReg == 0)
return false;
- // Set the high bits to zero.
- ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
- if (ResultReg == 0)
- return false;
+ // Handle zero-extension from i1 to i8, which is common.
+ MVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()).getSimpleVT();
+ if (SrcVT.SimpleTy == MVT::i1) {
+ // Set the high bits to zero.
+ ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
+ SrcVT = MVT::i8;
+
+ if (ResultReg == 0)
+ return false;
+ }
+
+ if (DstVT == MVT::i64) {
+ // Handle extension to 64-bits via sub-register shenanigans.
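+    // There is no direct zero-extend to i64: writing a 32-bit register already
+    // clears bits 63:32, so extend to 32 bits and then assert the upper half
+    // is zero with SUBREG_TO_REG.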
+ unsigned MovInst;
- if (DstVT != MVT::i8) {
+ switch (SrcVT.SimpleTy) {
+ case MVT::i8: MovInst = X86::MOVZX32rr8; break;
+ case MVT::i16: MovInst = X86::MOVZX32rr16; break;
+ case MVT::i32: MovInst = X86::MOV32rr; break;
+ default: llvm_unreachable("Unexpected zext to i64 source type");
+ }
+
+ unsigned Result32 = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(MovInst), Result32)
+ .addReg(ResultReg);
+
+ ResultReg = createResultReg(&X86::GR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::SUBREG_TO_REG),
+ ResultReg)
+ .addImm(0).addReg(Result32).addImm(X86::sub_32bit);
+ } else if (DstVT != MVT::i8) {
ResultReg = FastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND,
ResultReg, /*Kill=*/true);
if (ResultReg == 0)
@@ -1273,8 +1289,8 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
{ &X86::GR16RegClass, X86::AX, X86::DX, {
{ X86::IDIV16r, X86::CWD, Copy, X86::AX, S }, // SDiv
{ X86::IDIV16r, X86::CWD, Copy, X86::DX, S }, // SRem
- { X86::DIV16r, X86::MOV16r0, Copy, X86::AX, U }, // UDiv
- { X86::DIV16r, X86::MOV16r0, Copy, X86::DX, U }, // URem
+ { X86::DIV16r, X86::MOV32r0, Copy, X86::AX, U }, // UDiv
+ { X86::DIV16r, X86::MOV32r0, Copy, X86::DX, U }, // URem
}
}, // i16
{ &X86::GR32RegClass, X86::EAX, X86::EDX, {
@@ -1287,8 +1303,8 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
{ &X86::GR64RegClass, X86::RAX, X86::RDX, {
{ X86::IDIV64r, X86::CQO, Copy, X86::RAX, S }, // SDiv
{ X86::IDIV64r, X86::CQO, Copy, X86::RDX, S }, // SRem
- { X86::DIV64r, X86::MOV64r0, Copy, X86::RAX, U }, // UDiv
- { X86::DIV64r, X86::MOV64r0, Copy, X86::RDX, U }, // URem
+ { X86::DIV64r, X86::MOV32r0, Copy, X86::RAX, U }, // UDiv
+ { X86::DIV64r, X86::MOV32r0, Copy, X86::RDX, U }, // URem
}
}, // i64
};
@@ -1334,9 +1350,28 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
if (OpEntry.IsOpSigned)
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
TII.get(OpEntry.OpSignExtend));
- else
+ else {
+ unsigned Zero32 = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
- TII.get(OpEntry.OpSignExtend), TypeEntry.HighInReg);
+ TII.get(X86::MOV32r0), Zero32);
+
+ // Copy the zero into the appropriate sub/super/identical physical
+ // register. Unfortunately the operations needed are not uniform enough to
+ // fit neatly into the table above.
+ if (VT.SimpleTy == MVT::i16) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(Copy), TypeEntry.HighInReg)
+ .addReg(Zero32, 0, X86::sub_16bit);
+ } else if (VT.SimpleTy == MVT::i32) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(Copy), TypeEntry.HighInReg)
+ .addReg(Zero32);
+ } else if (VT.SimpleTy == MVT::i64) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg)
+ .addImm(0).addReg(Zero32).addImm(X86::sub_32bit);
+ }
+ }
}
// Generate the DIV/IDIV instruction.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
@@ -1985,6 +2020,8 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
} else {
unsigned LocMemOffset = VA.getLocMemOffset();
X86AddressMode AM;
+ const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo*>(
+ getTargetMachine()->getRegisterInfo());
AM.Base.Reg = RegInfo->getStackRegister();
AM.Disp = LocMemOffset;
const Value *ArgVal = ArgVals[VA.getValNo()];
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index 0dd034c..d21cb8a 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -135,8 +135,8 @@ FunctionPass *llvm::createX86FixupLEAs() {
bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
MF = &Func;
- TII = Func.getTarget().getInstrInfo();
TM = &MF->getTarget();
+ TII = TM->getInstrInfo();
DEBUG(dbgs() << "Start X86FixupLEAs\n";);
// Process all basic blocks.
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 0585b43..8522c8c 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -893,8 +893,8 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
// Produce implicit-defs for free by using killed registers.
while (Kills && Defs) {
- unsigned KReg = CountTrailingZeros_32(Kills);
- unsigned DReg = CountTrailingZeros_32(Defs);
+ unsigned KReg = countTrailingZeros(Kills);
+ unsigned DReg = countTrailingZeros(Defs);
DEBUG(dbgs() << "Renaming %FP" << KReg << " as imp %FP" << DReg << "\n");
std::swap(Stack[getSlot(KReg)], Stack[getSlot(DReg)]);
std::swap(RegMap[KReg], RegMap[DReg]);
@@ -917,7 +917,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
// Manually kill the rest.
while (Kills) {
- unsigned KReg = CountTrailingZeros_32(Kills);
+ unsigned KReg = countTrailingZeros(Kills);
DEBUG(dbgs() << "Killing %FP" << KReg << "\n");
freeStackSlotBefore(I, KReg);
Kills &= ~(1 << KReg);
@@ -925,7 +925,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
// Load zeros for all the imp-defs.
while(Defs) {
- unsigned DReg = CountTrailingZeros_32(Defs);
+ unsigned DReg = countTrailingZeros(Defs);
DEBUG(dbgs() << "Defining %FP" << DReg << " as 0\n");
BuildMI(*MBB, I, DebugLoc(), TII->get(X86::LD_F0));
pushReg(DReg);
@@ -1636,7 +1636,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
// Note: this might be a non-optimal pop sequence. We might be able to do
// better by trying to pop in stack order or something.
while (FPKills) {
- unsigned FPReg = CountTrailingZeros_32(FPKills);
+ unsigned FPReg = countTrailingZeros(FPKills);
if (isLive(FPReg))
freeStackSlotAfter(InsertPt, FPReg);
FPKills &= ~(1U << FPReg);
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 16e1e42..3061117 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -307,12 +307,12 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF,
unsigned FramePtr) const {
MachineFrameInfo *MFI = MF.getFrameInfo();
MachineModuleInfo &MMI = MF.getMMI();
+ const MCRegisterInfo &MRI = MMI.getContext().getRegisterInfo();
// Add callee saved registers to move list.
const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
if (CSI.empty()) return;
- std::vector<MachineMove> &Moves = MMI.getFrameMoves();
const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
bool HasFP = hasFP(MF);
@@ -360,16 +360,22 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF,
if (HasFP && FramePtr == Reg)
continue;
- MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
- MachineLocation CSSrc(Reg);
- Moves.push_back(MachineMove(Label, CSDst, CSSrc));
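+    // CFI instructions are expressed in DWARF register numbers, so translate
+    // the LLVM register before recording the offset rule.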
+ unsigned DwarfReg = MRI.getDwarfRegNum(Reg, true);
+ MMI.addFrameInst(MCCFIInstruction::createOffset(Label, DwarfReg, Offset));
}
}
/// getCompactUnwindRegNum - Get the compact unwind number for a given
/// register. The number corresponds to the enum lists in
/// compact_unwind_encoding.h.
-static int getCompactUnwindRegNum(const uint16_t *CURegs, unsigned Reg) {
+static int getCompactUnwindRegNum(unsigned Reg, bool is64Bit) {
+ static const uint16_t CU32BitRegs[] = {
+ X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
+ };
+ static const uint16_t CU64BitRegs[] = {
+ X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
+ };
+ const uint16_t *CURegs = is64Bit ? CU64BitRegs : CU32BitRegs;
for (int Idx = 1; *CURegs; ++CURegs, ++Idx)
if (*CURegs == Reg)
return Idx;
@@ -398,16 +404,8 @@ encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
// 4 3
// 5 3
//
- static const uint16_t CU32BitRegs[] = {
- X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
- };
- static const uint16_t CU64BitRegs[] = {
- X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
- };
- const uint16_t *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs);
-
for (unsigned i = 0; i != CU_NUM_SAVED_REGS; ++i) {
- int CUReg = getCompactUnwindRegNum(CURegs, SavedRegs[i]);
+ int CUReg = getCompactUnwindRegNum(SavedRegs[i], Is64Bit);
if (CUReg == -1) return ~0U;
SavedRegs[i] = CUReg;
}
@@ -466,14 +464,6 @@ encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
static uint32_t
encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
bool Is64Bit) {
- static const uint16_t CU32BitRegs[] = {
- X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
- };
- static const uint16_t CU64BitRegs[] = {
- X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
- };
- const uint16_t *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs);
-
// Encode the registers in the order they were saved, 3-bits per register. The
// registers are numbered from 1 to CU_NUM_SAVED_REGS.
uint32_t RegEnc = 0;
@@ -481,7 +471,7 @@ encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
unsigned Reg = SavedRegs[I];
if (Reg == 0) continue;
- int CURegNum = getCompactUnwindRegNum(CURegs, Reg);
+ int CURegNum = getCompactUnwindRegNum(Reg, Is64Bit);
if (CURegNum == -1) return ~0U;
// Encode the 3-bit register number in order, skipping over 3-bits for each
@@ -534,6 +524,12 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const {
// If there are too many saved registers, we cannot use compact encoding.
if (SavedRegIdx >= CU_NUM_SAVED_REGS) return CU::UNWIND_MODE_DWARF;
+ unsigned Reg = MI.getOperand(0).getReg();
+ if (Reg == (Is64Bit ? X86::RAX : X86::EAX)) {
+ ExpectEnd = true;
+ continue;
+ }
+
SavedRegs[SavedRegIdx++] = MI.getOperand(0).getReg();
StackAdjust += OffsetSize;
InstrOffset += PushInstrSize;
@@ -735,7 +731,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
// REG < 64 => DW_CFA_offset + Reg
// ELSE => DW_CFA_offset_extended
- std::vector<MachineMove> &Moves = MMI.getFrameMoves();
uint64_t NumBytes = 0;
int stackGrowth = -SlotSize;
@@ -768,20 +763,14 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
.addSym(FrameLabel);
// Define the current CFA rule to use the provided offset.
- if (StackSize) {
- MachineLocation SPDst(MachineLocation::VirtualFP);
- MachineLocation SPSrc(MachineLocation::VirtualFP, 2 * stackGrowth);
- Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc));
- } else {
- MachineLocation SPDst(StackPtr);
- MachineLocation SPSrc(StackPtr, stackGrowth);
- Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc));
- }
+ assert(StackSize);
+ MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(FrameLabel, 2 * stackGrowth));
// Change the rule for the FramePtr to be an "offset" rule.
- MachineLocation FPDst(MachineLocation::VirtualFP, 2 * stackGrowth);
- MachineLocation FPSrc(FramePtr);
- Moves.push_back(MachineMove(FrameLabel, FPDst, FPSrc));
+ unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true);
+ MMI.addFrameInst(MCCFIInstruction::createOffset(FrameLabel, DwarfFramePtr,
+ 2 * stackGrowth));
}
// Update EBP with the new base value.
@@ -797,9 +786,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
.addSym(FrameLabel);
// Define the current CFA to use the EBP/RBP register.
- MachineLocation FPDst(FramePtr);
- MachineLocation FPSrc(MachineLocation::VirtualFP);
- Moves.push_back(MachineMove(FrameLabel, FPDst, FPSrc));
+ unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true);
+ MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaRegister(FrameLabel, DwarfFramePtr));
}
// Mark the FramePtr as live-in in every block except the entry.
@@ -827,10 +816,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)).addSym(Label);
// Define the current CFA rule to use the provided offset.
- unsigned Ptr = StackSize ? MachineLocation::VirtualFP : StackPtr;
- MachineLocation SPDst(Ptr);
- MachineLocation SPSrc(Ptr, StackOffset);
- Moves.push_back(MachineMove(Label, SPDst, SPSrc));
+ assert(StackSize);
+ MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(Label, StackOffset));
StackOffset += stackGrowth;
}
}
@@ -964,16 +952,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
if (!HasFP && NumBytes) {
// Define the current CFA rule to use the provided offset.
- if (StackSize) {
- MachineLocation SPDst(MachineLocation::VirtualFP);
- MachineLocation SPSrc(MachineLocation::VirtualFP,
- -StackSize + stackGrowth);
- Moves.push_back(MachineMove(Label, SPDst, SPSrc));
- } else {
- MachineLocation SPDst(StackPtr);
- MachineLocation SPSrc(StackPtr, stackGrowth);
- Moves.push_back(MachineMove(Label, SPDst, SPSrc));
- }
+ assert(StackSize);
+ MMI.addFrameInst(MCCFIInstruction::createDefCfaOffset(
+ Label, -StackSize + stackGrowth));
}
// Emit DWARF info specifying the offsets of the callee-saved registers.
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 968b358..4ffffa1 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -200,9 +200,13 @@ namespace {
bool SelectAddr(SDNode *Parent, SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
+ bool SelectMOV64Imm32(SDValue N, SDValue &Imm);
bool SelectLEAAddr(SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
+ bool SelectLEA64_32Addr(SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
bool SelectTLSADDRAddr(SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
@@ -229,14 +233,14 @@ namespace {
SDValue &Scale, SDValue &Index,
SDValue &Disp, SDValue &Segment) {
Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ?
- CurDAG->getTargetFrameIndex(AM.Base_FrameIndex, TLI.getPointerTy()) :
+ CurDAG->getTargetFrameIndex(AM.Base_FrameIndex, TLI->getPointerTy()) :
AM.Base_Reg;
Scale = getI8Imm(AM.Scale);
Index = AM.IndexReg;
// These are 32-bit even in 64-bit mode since RIP relative offset
// is 32-bit.
if (AM.GV)
- Disp = CurDAG->getTargetGlobalAddress(AM.GV, DebugLoc(),
+ Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
MVT::i32, AM.Disp,
AM.SymbolFlags);
else if (AM.CP)
@@ -373,7 +377,7 @@ static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
else
Ops.push_back(Chain.getOperand(i));
SDValue NewChain =
- CurDAG->getNode(ISD::TokenFactor, Load.getDebugLoc(),
+ CurDAG->getNode(ISD::TokenFactor, SDLoc(Load),
MVT::Other, &Ops[0], Ops.size());
Ops.clear();
Ops.push_back(NewChain);
@@ -524,7 +528,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
MemVT = SrcIsSSE ? SrcVT : DstVT;
SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
// FIXME: optimize the case where the src/dest is a load or store?
SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl,
@@ -782,7 +786,7 @@ static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
return true;
EVT VT = N.getValueType();
- DebugLoc DL = N.getDebugLoc();
+ SDLoc DL(N);
SDValue Eight = DAG.getConstant(8, MVT::i8);
SDValue NewMask = DAG.getConstant(0xff, VT);
SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
@@ -830,7 +834,7 @@ static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
return true;
EVT VT = N.getValueType();
- DebugLoc DL = N.getDebugLoc();
+ SDLoc DL(N);
SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, VT);
SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
@@ -886,8 +890,8 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
return true;
unsigned ShiftAmt = Shift.getConstantOperandVal(1);
- unsigned MaskLZ = CountLeadingZeros_64(Mask);
- unsigned MaskTZ = CountTrailingZeros_64(Mask);
+ unsigned MaskLZ = countLeadingZeros(Mask);
+ unsigned MaskTZ = countTrailingZeros(Mask);
// The amount of shift we're trying to fit into the addressing mode is taken
// from the trailing zeros of the mask.
@@ -932,11 +936,11 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
if (ReplacingAnyExtend) {
assert(X.getValueType() != VT);
// We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
- SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, X.getDebugLoc(), VT, X);
+ SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
InsertDAGNode(DAG, N, NewX);
X = NewX;
}
- DebugLoc DL = N.getDebugLoc();
+ SDLoc DL(N);
SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, MVT::i8);
SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, MVT::i8);
@@ -960,7 +964,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
unsigned Depth) {
- DebugLoc dl = N.getDebugLoc();
+ SDLoc dl(N);
DEBUG({
dbgs() << "MatchAddress: ";
AM.dump();
@@ -1380,6 +1384,71 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root,
}
+bool X86DAGToDAGISel::SelectMOV64Imm32(SDValue N, SDValue &Imm) {
+ if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
+ uint64_t ImmVal = CN->getZExtValue();
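+    // MOV32ri64 zero-extends a 32-bit immediate, so the constant is only
+    // usable if it survives truncation to 32 bits.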
+ if ((uint32_t)ImmVal != (uint64_t)ImmVal)
+ return false;
+
+ Imm = CurDAG->getTargetConstant(ImmVal, MVT::i64);
+ return true;
+ }
+
+ // In static codegen with small code model, we can get the address of a label
+ // into a register with 'movl'. TableGen has already made sure we're looking
+ // at a label of some kind.
+ assert(N->getOpcode() == X86ISD::Wrapper &&
+ "Unexpected node type for MOV32ri64");
+ N = N.getOperand(0);
+
+ if (N->getOpcode() != ISD::TargetConstantPool &&
+ N->getOpcode() != ISD::TargetJumpTable &&
+ N->getOpcode() != ISD::TargetGlobalAddress &&
+ N->getOpcode() != ISD::TargetExternalSymbol &&
+ N->getOpcode() != ISD::TargetBlockAddress)
+ return false;
+
+ Imm = N;
+ return TM.getCodeModel() == CodeModel::Small;
+}
+
+bool X86DAGToDAGISel::SelectLEA64_32Addr(SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment) {
+ if (!SelectLEAAddr(N, Base, Scale, Index, Disp, Segment))
+ return false;
+
+ SDLoc DL(N);
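+  // The 64_32 form of LEA produces a 32-bit result from 64-bit operands, so
+  // any 32-bit base or index matched above must be widened to the containing
+  // 64-bit register.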
+ RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base);
+ if (RN && RN->getReg() == 0)
+ Base = CurDAG->getRegister(0, MVT::i64);
+ else if (Base.getValueType() == MVT::i32 && !dyn_cast<FrameIndexSDNode>(N)) {
+ // Base could already be %rip, particularly in the x32 ABI.
+ Base = SDValue(CurDAG->getMachineNode(
+ TargetOpcode::SUBREG_TO_REG, DL, MVT::i64,
+ CurDAG->getTargetConstant(0, MVT::i64),
+ Base,
+ CurDAG->getTargetConstant(X86::sub_32bit, MVT::i32)),
+ 0);
+ }
+
+ RN = dyn_cast<RegisterSDNode>(Index);
+ if (RN && RN->getReg() == 0)
+ Index = CurDAG->getRegister(0, MVT::i64);
+ else {
+ assert(Index.getValueType() == MVT::i32 &&
+ "Expect to be extending 32-bit registers for use in LEA");
+ Index = SDValue(CurDAG->getMachineNode(
+ TargetOpcode::SUBREG_TO_REG, DL, MVT::i64,
+ CurDAG->getTargetConstant(0, MVT::i64),
+ Index,
+ CurDAG->getTargetConstant(X86::sub_32bit, MVT::i32)),
+ 0);
+ }
+
+ return true;
+}
+
/// SelectLEAAddr - it calls SelectAddr and determines if the maximal addressing
/// mode it matches can be cost effectively emitted as an LEA instruction.
bool X86DAGToDAGISel::SelectLEAAddr(SDValue N,
@@ -1487,7 +1556,7 @@ bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N,
///
SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
- return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode();
+ return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy()).getNode();
}
SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
@@ -1502,7 +1571,7 @@ SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
const SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, In2L, In2H, Chain};
- SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(),
+ SDNode *ResNode = CurDAG->getMachineNode(Opc, SDLoc(Node),
MVT::i32, MVT::i32, MVT::Other, Ops);
cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1);
return ResNode;
@@ -1637,7 +1706,7 @@ static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = {
// + empty, the operand is not needed any more with the new op selected.
// + non-empty, otherwise.
static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG,
- DebugLoc dl,
+ SDLoc dl,
enum AtomicOpc &Op, EVT NVT,
SDValue Val) {
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val)) {
@@ -1689,7 +1758,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) {
if (Node->hasAnyUseOfValue(0))
return 0;
- DebugLoc dl = Node->getDebugLoc();
+ SDLoc dl(Node);
// Optimize common patterns for __sync_or_and_fetch and similar arith
// operations where the result is not used. This allows us to use the "lock"
@@ -1920,7 +1989,7 @@ static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc,
if (ChainCheck)
// Make a new TokenFactor with all the other input chains except
// for the load.
- InputChain = CurDAG->getNode(ISD::TokenFactor, Chain.getDebugLoc(),
+ InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain),
MVT::Other, &ChainOps[0], ChainOps.size());
}
if (!ChainCheck)
@@ -1968,7 +2037,7 @@ SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) {
SDValue Segment = CurDAG->getRegister(0, MVT::i32);
const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue()), VIdx,
Disp, Segment, VMask, Chain};
- SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(), VTs, Ops);
+ SDNode *ResNode = CurDAG->getMachineNode(Opc, SDLoc(Node), VTs, Ops);
// Node has 2 outputs: VDst and MVT::Other.
// ResNode has 3 outputs: VDst, VMask_wb, and MVT::Other.
// We replace VDst of Node with VDst of ResNode, and Other of Node with Other
@@ -1982,7 +2051,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
EVT NVT = Node->getValueType(0);
unsigned Opc, MOpc;
unsigned Opcode = Node->getOpcode();
- DebugLoc dl = Node->getDebugLoc();
+ SDLoc dl(Node);
DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n');
@@ -2013,6 +2082,8 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::x86_avx2_gather_d_d_256:
case Intrinsic::x86_avx2_gather_q_d:
case Intrinsic::x86_avx2_gather_q_d_256: {
+ if (!Subtarget->hasAVX2())
+ break;
unsigned Opc;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic");
@@ -2335,9 +2406,6 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n');
}
- // Propagate ordering to the last node, for now.
- CurDAG->AssignOrdering(InFlag.getNode(), CurDAG->GetOrdering(Node));
-
return NULL;
}
@@ -2366,27 +2434,24 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
}
unsigned LoReg, HiReg, ClrReg;
- unsigned ClrOpcode, SExtOpcode;
+ unsigned SExtOpcode;
switch (NVT.getSimpleVT().SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
case MVT::i8:
LoReg = X86::AL; ClrReg = HiReg = X86::AH;
- ClrOpcode = 0;
SExtOpcode = X86::CBW;
break;
case MVT::i16:
LoReg = X86::AX; HiReg = X86::DX;
- ClrOpcode = X86::MOV16r0; ClrReg = X86::DX;
+ ClrReg = X86::DX;
SExtOpcode = X86::CWD;
break;
case MVT::i32:
LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
- ClrOpcode = X86::MOV32r0;
SExtOpcode = X86::CDQ;
break;
case MVT::i64:
LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
- ClrOpcode = X86::MOV64r0;
SExtOpcode = X86::CQO;
break;
}
@@ -2424,8 +2489,29 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
} else {
// Zero out the high part, effectively zero extending the input.
- SDValue ClrNode =
- SDValue(CurDAG->getMachineNode(ClrOpcode, dl, NVT), 0);
+ SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
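+      // MOV32r0 is the only zero idiom; narrow it with EXTRACT_SUBREG or widen
+      // it with SUBREG_TO_REG (a 32-bit write already clears the upper bits).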
+ switch (NVT.getSimpleVT().SimpleTy) {
+ case MVT::i16:
+ ClrNode =
+ SDValue(CurDAG->getMachineNode(
+ TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
+ CurDAG->getTargetConstant(X86::sub_16bit, MVT::i32)),
+ 0);
+ break;
+ case MVT::i32:
+ break;
+ case MVT::i64:
+ ClrNode =
+ SDValue(CurDAG->getMachineNode(
+ TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
+ CurDAG->getTargetConstant(0, MVT::i64), ClrNode,
+ CurDAG->getTargetConstant(X86::sub_32bit, MVT::i32)),
+ 0);
+ break;
+ default:
+ llvm_unreachable("Unexpected division source");
+ }
+
InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
ClrNode, InFlag).getValue(1);
}
@@ -2666,7 +2752,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
EVT LdVT = LoadNode->getMemoryVT();
unsigned newOpc = getFusedLdStOpcode(LdVT, Opc);
MachineSDNode *Result = CurDAG->getMachineNode(newOpc,
- Node->getDebugLoc(),
+ SDLoc(Node),
MVT::i32, MVT::Other, Ops);
Result->setMemRefs(MemOp, MemOp + 2);
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index b587336..346dfbb 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -55,7 +55,7 @@ using namespace llvm;
STATISTIC(NumTailCalls, "Number of tail calls");
// Forward declarations.
-static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
SDValue V2);
/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
@@ -64,7 +64,7 @@ static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
/// want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG, DebugLoc dl) {
+ SelectionDAG &DAG, SDLoc dl) {
EVT VT = Vec.getValueType();
assert(VT.is256BitVector() && "Unexpected vector size!");
EVT ElVT = VT.getVectorElementType();
@@ -104,7 +104,7 @@ static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
unsigned IdxVal, SelectionDAG &DAG,
- DebugLoc dl) {
+ SDLoc dl) {
  // Inserting UNDEF leaves Result unchanged.
if (Vec.getOpcode() == ISD::UNDEF)
return Result;
@@ -134,7 +134,7 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
/// large BUILD_VECTORS.
static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
unsigned NumElems, SelectionDAG &DAG,
- DebugLoc dl) {
+ SDLoc dl) {
SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
}
@@ -163,7 +163,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
Subtarget = &TM.getSubtarget<X86Subtarget>();
X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();
- RegInfo = TM.getRegisterInfo();
TD = getDataLayout();
resetOperationActions();
@@ -202,6 +201,8 @@ void X86TargetLowering::resetOperationActions() {
setSchedulingPreference(Sched::ILP);
else
setSchedulingPreference(Sched::RegPressure);
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
// Bypass expensive divides on Atom when compiling with O2
@@ -1361,7 +1362,7 @@ void X86TargetLowering::resetOperationActions() {
setPrefFunctionAlignment(4); // 2^4 bytes.
}
-EVT X86TargetLowering::getSetCCResultType(EVT VT) const {
+EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
if (!VT.isVector()) return MVT::i8;
return VT.changeVectorElementTypeToInteger();
}
@@ -1504,9 +1505,9 @@ X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
SelectionDAG &DAG) const {
if (!Subtarget->is64Bit())
- // This doesn't have DebugLoc associated with it, but is not really the
+ // This doesn't have SDLoc associated with it, but is not really the
// same as a Register.
- return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
+ return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
return Table;
}
@@ -1593,7 +1594,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc dl, SelectionDAG &DAG) const {
+ SDLoc dl, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
@@ -1761,7 +1762,7 @@ SDValue
X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
// Assign locations to each value returned by this call.
@@ -1868,7 +1869,7 @@ argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
static SDValue
CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
- DebugLoc dl) {
+ SDLoc dl) {
SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
@@ -1906,7 +1907,7 @@ SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain,
CallingConv::ID CallConv,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
const CCValAssign &VA,
MachineFrameInfo *MFI,
unsigned i) const {
@@ -1948,7 +1949,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl,
+ SDLoc dl,
SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals)
const {
@@ -2225,7 +2226,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
SDValue
X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
SDValue StackPtr, SDValue Arg,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
const CCValAssign &VA,
ISD::ArgFlagsTy Flags) const {
unsigned LocMemOffset = VA.getLocMemOffset();
@@ -2245,7 +2246,7 @@ SDValue
X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
SDValue &OutRetAddr, SDValue Chain,
bool IsTailCall, bool Is64Bit,
- int FPDiff, DebugLoc dl) const {
+ int FPDiff, SDLoc dl) const {
// Adjust the Return address stack slot.
EVT VT = getPointerTy();
OutRetAddr = getReturnAddressFrameIndex(DAG);
@@ -2261,7 +2262,7 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
static SDValue
EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT,
- unsigned SlotSize, int FPDiff, DebugLoc dl) {
+ unsigned SlotSize, int FPDiff, SDLoc dl) {
// Store the return address to the appropriate stack slot.
if (!FPDiff) return Chain;
// Calculate the new stack slot for the return address.
@@ -2278,7 +2279,7 @@ SDValue
X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
- DebugLoc &dl = CLI.DL;
+ SDLoc &dl = CLI.DL;
SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
@@ -2354,7 +2355,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
if (!IsSibcall)
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
+ dl);
SDValue RetAddrFrIdx;
// Load return address for tail calls.
@@ -2368,6 +2370,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Walk the register/memloc assignments, inserting copies/loads. In the case
  // of tail call optimization arguments are handled later.
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
EVT RegVT = VA.getLocVT();
@@ -2443,7 +2447,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// GOT pointer.
if (!isTailCall) {
RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
- DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy())));
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
} else {
// If we are tail calling and generating PIC/GOT style code load the
// address of the callee into ECX. The value in ecx is used as target of
@@ -2640,7 +2644,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (!IsSibcall && isTailCall) {
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
- DAG.getIntPtrConstant(0, true), InFlag);
+ DAG.getIntPtrConstant(0, true), InFlag, dl);
InFlag = Chain.getValue(1);
}
@@ -2699,7 +2703,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
DAG.getIntPtrConstant(NumBytes, true),
DAG.getIntPtrConstant(NumBytesForCalleeToPush,
true),
- InFlag);
+ InFlag, dl);
InFlag = Chain.getValue(1);
}
@@ -2747,6 +2751,8 @@ X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
SelectionDAG& DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
const TargetMachine &TM = MF.getTarget();
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
const TargetFrameLowering &TFI = *TM.getFrameLowering();
unsigned StackAlignment = TFI.getStackAlignment();
uint64_t AlignMask = StackAlignment - 1;
@@ -2859,6 +2865,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
// Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
// emit a special epilogue.
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
if (RegInfo->needsStackRealignment(MF))
return false;
@@ -3062,7 +3070,7 @@ static bool isTargetShuffle(unsigned Opcode) {
}
}
-static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
+static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
SDValue V1, SelectionDAG &DAG) {
switch(Opc) {
default: llvm_unreachable("Unknown x86 shuffle node");
@@ -3073,7 +3081,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
}
}
-static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
+static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
SDValue V1, unsigned TargetMask,
SelectionDAG &DAG) {
switch(Opc) {
@@ -3087,7 +3095,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
}
}
-static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
+static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
SDValue V1, SDValue V2, unsigned TargetMask,
SelectionDAG &DAG) {
switch(Opc) {
@@ -3100,7 +3108,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
}
}
-static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
+static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
SDValue V1, SDValue V2, SelectionDAG &DAG) {
switch(Opc) {
default: llvm_unreachable("Unknown x86 shuffle node");
@@ -3119,6 +3127,8 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
int ReturnAddrIndex = FuncInfo->getRAIndex();
@@ -3628,7 +3638,7 @@ static
SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
SelectionDAG &DAG) {
MVT VT = SVOp->getValueType(0).getSimpleVT();
- DebugLoc dl = SVOp->getDebugLoc();
+ SDLoc dl(SVOp);
if (VT != MVT::v8i32 && VT != MVT::v8f32)
return SDValue();
@@ -4263,7 +4273,7 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
}
MaskVec.push_back(Idx);
}
- return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
+ return DAG.getVectorShuffle(VT, SDLoc(SVOp), SVOp->getOperand(1),
SVOp->getOperand(0), &MaskVec[0]);
}
@@ -4396,7 +4406,7 @@ static bool isZeroShuffle(ShuffleVectorSDNode *N) {
/// getZeroVector - Returns a vector of specified type with all zero elements.
///
static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
- SelectionDAG &DAG, DebugLoc dl) {
+ SelectionDAG &DAG, SDLoc dl) {
assert(VT.isVector() && "Expected a vector type");
// Always build SSE zero vectors as <4 x i32> bitcasted
@@ -4435,7 +4445,7 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
/// no AVX2 support, use two <4 x i32> inserted in an <8 x i32> appropriately.
/// Then bitcast to their original type, ensuring they get CSE'd.
static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
- DebugLoc dl) {
+ SDLoc dl) {
assert(VT.isVector() && "Expected a vector type");
SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
@@ -4469,7 +4479,7 @@ static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
/// operation of specified width.
-static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
SDValue V2) {
unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 8> Mask;
@@ -4480,7 +4490,7 @@ static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
}
/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
-static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
SDValue V2) {
unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 8> Mask;
@@ -4492,7 +4502,7 @@ static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
}
/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
-static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
SDValue V2) {
unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 8> Mask;
@@ -4510,7 +4520,7 @@ static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
EVT VT = V.getValueType();
int NumElems = VT.getVectorNumElements();
- DebugLoc dl = V.getDebugLoc();
+ SDLoc dl(V);
while (NumElems > 4) {
if (EltNo < NumElems/2) {
@@ -4527,7 +4537,7 @@ static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
/// getLegalSplat - Generate a legal splat with supported x86 shuffles
static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
EVT VT = V.getValueType();
- DebugLoc dl = V.getDebugLoc();
+ SDLoc dl(V);
if (VT.is128BitVector()) {
V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
@@ -4554,7 +4564,7 @@ static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
EVT SrcVT = SV->getValueType(0);
SDValue V1 = SV->getOperand(0);
- DebugLoc dl = SV->getDebugLoc();
+ SDLoc dl(SV);
int EltNo = SV->getSplatIndex();
int NumElems = SrcVT.getVectorNumElements();
@@ -4599,13 +4609,13 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
SelectionDAG &DAG) {
EVT VT = V2.getValueType();
SDValue V1 = IsZero
- ? getZeroVector(VT, Subtarget, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
+ ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 16> MaskVec;
for (unsigned i = 0; i != NumElems; ++i)
// If this is the insertion idx, put the low elt of V2 here.
MaskVec.push_back(i == Idx ? NumElems : i);
- return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
+ return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
}
/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
@@ -4756,19 +4766,27 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
/// getNumOfConsecutiveZeros - Return the number of elements of a vector
/// shuffle operation which come consecutively from a zero. The
/// search can start in two different directions, from left or right.
-static
-unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, unsigned NumElems,
- bool ZerosFromLeft, SelectionDAG &DAG) {
- unsigned i;
- for (i = 0; i != NumElems; ++i) {
- unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
+/// We count undefs as zeros until PreferredNum is reached.
+static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
+ unsigned NumElems, bool ZerosFromLeft,
+ SelectionDAG &DAG,
+ unsigned PreferredNum = -1U) {
+ unsigned NumZeros = 0;
+ for (unsigned i = 0; i != NumElems; ++i) {
+ unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
- if (!(Elt.getNode() &&
- (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
+ if (!Elt.getNode())
+ break;
+
+ if (X86::isZeroNode(Elt))
+ ++NumZeros;
+ else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
+ NumZeros = std::min(NumZeros + 1, PreferredNum);
+ else
break;
}
- return i;
+ return NumZeros;
}
/// isShuffleMaskConsecutive - Check if the shuffle mask indices [MaskI, MaskE)
@@ -4806,8 +4824,9 @@ bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
- unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
- false /* check zeros from right */, DAG);
+ unsigned NumZeros = getNumOfConsecutiveZeros(
+ SVOp, NumElems, false /* check zeros from right */, DAG,
+ SVOp->getMaskElt(0));
unsigned OpSrc;
if (!NumZeros)
@@ -4839,8 +4858,9 @@ static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
- unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
- true /* check zeros from left */, DAG);
+ unsigned NumZeros = getNumOfConsecutiveZeros(
+ SVOp, NumElems, true /* check zeros from left */, DAG,
+ NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
unsigned OpSrc;
if (!NumZeros)
@@ -4893,7 +4913,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
if (NumNonZero > 8)
return SDValue();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue V(0, 0);
bool First = true;
for (unsigned i = 0; i < 16; ++i) {
@@ -4941,7 +4961,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
if (NumNonZero > 4)
return SDValue();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue V(0, 0);
bool First = true;
for (unsigned i = 0; i < 8; ++i) {
@@ -4967,7 +4987,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
///
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
unsigned NumBits, SelectionDAG &DAG,
- const TargetLowering &TLI, DebugLoc dl) {
+ const TargetLowering &TLI, SDLoc dl) {
assert(VT.is128BitVector() && "Unknown type for VShift");
EVT ShVT = MVT::v2i64;
unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
@@ -4979,7 +4999,7 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
}
SDValue
-X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
+X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, SDLoc dl,
SelectionDAG &DAG) const {
// Check if the scalar load can be widened into a vector load. And if
@@ -5032,7 +5052,7 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
return SDValue();
int64_t StartOffset = Offset & ~(RequiredAlign-1);
if (StartOffset)
- Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
+ Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
int EltNo = (Offset - StartOffset) >> 2;
@@ -5063,7 +5083,7 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
/// There's even a handy isZeroNode for that purpose.
static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
- DebugLoc &DL, SelectionDAG &DAG) {
+ SDLoc &DL, SelectionDAG &DAG) {
EVT EltVT = VT.getVectorElementType();
unsigned NumElems = Elts.size();
@@ -5099,15 +5119,27 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
// load of the entire vector width starting at the base pointer. If we found
// consecutive loads for the low half, generate a vzext_load node.
if (LastLoadedElt == NumElems - 1) {
+ SDValue NewLd = SDValue();
if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
- return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
- LDBase->getPointerInfo(),
- LDBase->isVolatile(), LDBase->isNonTemporal(),
- LDBase->isInvariant(), 0);
- return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
- LDBase->getPointerInfo(),
- LDBase->isVolatile(), LDBase->isNonTemporal(),
- LDBase->isInvariant(), LDBase->getAlignment());
+ NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
+ LDBase->getPointerInfo(),
+ LDBase->isVolatile(), LDBase->isNonTemporal(),
+ LDBase->isInvariant(), 0);
+ NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
+ LDBase->getPointerInfo(),
+ LDBase->isVolatile(), LDBase->isNonTemporal(),
+ LDBase->isInvariant(), LDBase->getAlignment());
+
+ if (LDBase->hasAnyUseOfValue(1)) {
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+ SDValue(LDBase, 1),
+ SDValue(NewLd.getNode(), 1));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
+ DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
+ SDValue(NewLd.getNode(), 1));
+ }
+
+ return NewLd;
}
if (NumElems == 4 && LastLoadedElt == 1 &&
DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
@@ -5150,7 +5182,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
MVT VT = Op.getValueType().getSimpleVT();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for broadcast.");
@@ -5277,7 +5309,7 @@ X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const {
if (!isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
return SDValue();
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
unsigned NumElems = Op.getNumOperands();
SDValue VecIn1;
@@ -5345,7 +5377,7 @@ X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const {
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
MVT VT = Op.getValueType().getSimpleVT();
MVT ExtVT = VT.getVectorElementType();
@@ -5404,7 +5436,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// Special case for single non-zero, non-undef, element.
if (NumNonZero == 1) {
- unsigned Idx = CountTrailingZeros_32(NonZeros);
+ unsigned Idx = countTrailingZeros(NonZeros);
SDValue Item = Op.getOperand(Idx);
// If this is an insertion of an i64 value on x86-32, and if the top bits of
@@ -5513,7 +5545,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
// Check if it's possible to issue this instead.
// shuffle (vload ptr)), undef, <1, 1, 1, 1>
- unsigned Idx = CountTrailingZeros_32(NonZeros);
+ unsigned Idx = countTrailingZeros(NonZeros);
SDValue Item = Op.getOperand(Idx);
if (Op.getNode()->isOnlyUserOf(Item.getNode()))
return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
@@ -5548,7 +5580,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (EVTBits == 64) {
if (NumNonZero == 1) {
// One half is zero or undef.
- unsigned Idx = CountTrailingZeros_32(NonZeros);
+ unsigned Idx = countTrailingZeros(NonZeros);
SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
Op.getOperand(Idx));
return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
@@ -5678,7 +5710,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
// to create 256-bit vectors from two other 128-bit ones.
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
MVT ResVT = Op.getValueType().getSimpleVT();
assert(ResVT.is256BitVector() && "Value type must be 256-bit wide");
@@ -5704,7 +5736,7 @@ LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
const X86Subtarget *Subtarget, SelectionDAG &DAG) {
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
- DebugLoc dl = SVOp->getDebugLoc();
+ SDLoc dl(SVOp);
MVT VT = SVOp->getValueType(0).getSimpleVT();
MVT EltVT = VT.getVectorElementType();
unsigned NumElems = VT.getVectorNumElements();
@@ -5765,7 +5797,7 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
- DebugLoc dl = SVOp->getDebugLoc();
+ SDLoc dl(SVOp);
SmallVector<int, 8> MaskVals;
// Determine if more than 1 of the words in each of the low and high quadwords
@@ -6020,7 +6052,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
const X86TargetLowering &TLI) {
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
- DebugLoc dl = SVOp->getDebugLoc();
+ SDLoc dl(SVOp);
ArrayRef<int> MaskVals = SVOp->getMask();
// Promote splats to a larger type which usually leads to more efficient code.
@@ -6149,7 +6181,7 @@ SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
MVT VT = SVOp->getValueType(0).getSimpleVT();
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
- DebugLoc dl = SVOp->getDebugLoc();
+ SDLoc dl(SVOp);
SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
@@ -6195,7 +6227,7 @@ static
SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
SelectionDAG &DAG) {
MVT VT = SVOp->getValueType(0).getSimpleVT();
- DebugLoc dl = SVOp->getDebugLoc();
+ SDLoc dl(SVOp);
unsigned NumElems = VT.getVectorNumElements();
MVT NewVT;
unsigned Scale;
@@ -6233,7 +6265,7 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
///
static SDValue getVZextMovL(MVT VT, EVT OpVT,
SDValue SrcOp, SelectionDAG &DAG,
- const X86Subtarget *Subtarget, DebugLoc dl) {
+ const X86Subtarget *Subtarget, SDLoc dl) {
if (VT == MVT::v2f64 || VT == MVT::v4f32) {
LoadSDNode *LD = NULL;
if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
@@ -6278,7 +6310,7 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
unsigned NumElems = VT.getVectorNumElements();
unsigned NumLaneElems = NumElems / 2;
- DebugLoc dl = SVOp->getDebugLoc();
+ SDLoc dl(SVOp);
MVT EltVT = VT.getVectorElementType();
MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
SDValue Output[2];
@@ -6384,7 +6416,7 @@ static SDValue
LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
- DebugLoc dl = SVOp->getDebugLoc();
+ SDLoc dl(SVOp);
MVT VT = SVOp->getValueType(0).getSimpleVT();
assert(VT.is128BitVector() && "Unsupported vector size");
@@ -6535,7 +6567,7 @@ static bool MayFoldVectorLoad(SDValue V) {
}
static
-SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
+SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
// Canonicalize to v2f64.
@@ -6546,7 +6578,7 @@ SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
}
static
-SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
+SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
bool HasSSE2) {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
@@ -6565,7 +6597,7 @@ SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
}
static
-SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
+SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
EVT VT = Op.getValueType();
@@ -6581,7 +6613,7 @@ SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
}
static
-SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
+SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
EVT VT = Op.getValueType();
@@ -6651,7 +6683,7 @@ X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
unsigned NumElems = VT.getVectorNumElements();
@@ -6712,10 +6744,10 @@ X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
// (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)).
unsigned Ratio = V.getValueSizeInBits() / V1.getValueSizeInBits();
EVT FullVT = V.getValueType();
- EVT SubVecVT = EVT::getVectorVT(*Context,
+ EVT SubVecVT = EVT::getVectorVT(*Context,
FullVT.getVectorElementType(),
FullVT.getVectorNumElements()/Ratio);
- V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V,
+ V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V,
DAG.getIntPtrConstant(0));
}
V1 = DAG.getNode(ISD::BITCAST, DL, V1.getValueType(), V);
@@ -6730,7 +6762,7 @@ SDValue
X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
MVT VT = Op.getValueType().getSimpleVT();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
@@ -6789,7 +6821,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
MVT VT = Op.getValueType().getSimpleVT();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
unsigned NumElems = VT.getVectorNumElements();
bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
@@ -6871,6 +6903,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
TargetMask, DAG);
}
+ if (isPALIGNRMask(M, VT, Subtarget))
+ return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
+ getShufflePALIGNRImmediate(SVOp),
+ DAG);
+
// Check if this can be converted into a logical shift.
bool isLeft = false;
unsigned ShAmt = 0;
@@ -6988,11 +7025,6 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
// inlined here right now to enable us to directly emit target specific
// nodes, and remove one by one until they don't return Op anymore.
- if (isPALIGNRMask(M, VT, Subtarget))
- return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
- getShufflePALIGNRImmediate(SVOp),
- DAG);
-
if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
SVOp->getSplatIndex() == 0 && V2IsUndef) {
if (VT == MVT::v2f64 || VT == MVT::v2i64)
@@ -7100,7 +7132,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getValueType().getSimpleVT();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
if (!Op.getOperand(0).getValueType().getSimpleVT().is128BitVector())
return SDValue();
@@ -7172,7 +7204,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
// If this is a 256-bit vector result, first extract the 128-bit vector and
// then extract the element from the 128-bit vector.
if (VecVT.is256BitVector()) {
- DebugLoc dl = Op.getNode()->getDebugLoc();
+ SDLoc dl(Op.getNode());
unsigned NumElems = VecVT.getVectorNumElements();
SDValue Idx = Op.getOperand(1);
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
@@ -7195,7 +7227,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
}
MVT VT = Op.getValueType().getSimpleVT();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
// TODO: handle v16i8.
if (VT.getSizeInBits() == 16) {
SDValue Vec = Op.getOperand(0);
@@ -7254,7 +7286,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getValueType().getSimpleVT();
MVT EltVT = VT.getVectorElementType();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
@@ -7309,7 +7341,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
MVT VT = Op.getValueType().getSimpleVT();
MVT EltVT = VT.getVectorElementType();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
SDValue N2 = Op.getOperand(2);
@@ -7354,7 +7386,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
LLVMContext *Context = DAG.getContext();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
MVT OpVT = Op.getValueType().getSimpleVT();
// If this is a 256-bit vector result, first insert into a 128-bit
@@ -7387,7 +7419,7 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
if (Subtarget->hasFp256()) {
- DebugLoc dl = Op.getNode()->getDebugLoc();
+ SDLoc dl(Op.getNode());
SDValue Vec = Op.getNode()->getOperand(0);
SDValue Idx = Op.getNode()->getOperand(1);
@@ -7407,7 +7439,7 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
if (Subtarget->hasFp256()) {
- DebugLoc dl = Op.getNode()->getDebugLoc();
+ SDLoc dl(Op.getNode());
SDValue Vec = Op.getNode()->getOperand(0);
SDValue SubVec = Op.getNode()->getOperand(1);
SDValue Idx = Op.getNode()->getOperand(2);
@@ -7449,13 +7481,13 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
CP->getAlignment(),
CP->getOffset(), OpFlag);
- DebugLoc DL = CP->getDebugLoc();
+ SDLoc DL(CP);
Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
// With PIC, the address is actually $g + Offset.
if (OpFlag) {
Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
DAG.getNode(X86ISD::GlobalBaseReg,
- DebugLoc(), getPointerTy()),
+ SDLoc(), getPointerTy()),
Result);
}
@@ -7481,14 +7513,14 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
OpFlag);
- DebugLoc DL = JT->getDebugLoc();
+ SDLoc DL(JT);
Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
// With PIC, the address is actually $g + Offset.
if (OpFlag)
Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
DAG.getNode(X86ISD::GlobalBaseReg,
- DebugLoc(), getPointerTy()),
+ SDLoc(), getPointerTy()),
Result);
return Result;
@@ -7519,7 +7551,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
// With PIC, the address is actually $g + Offset.
@@ -7527,7 +7559,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
!Subtarget->is64Bit()) {
Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
DAG.getNode(X86ISD::GlobalBaseReg,
- DebugLoc(), getPointerTy()),
+ SDLoc(), getPointerTy()),
Result);
}
@@ -7548,7 +7580,7 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
CodeModel::Model M = getTargetMachine().getCodeModel();
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
OpFlags);
@@ -7569,7 +7601,7 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
}
SDValue
-X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
+X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
int64_t Offset, SelectionDAG &DAG) const {
// Create the TargetGlobalAddress node, folding in the constant
// offset if it is legal.
@@ -7618,7 +7650,7 @@ SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
- return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
+ return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
}
static SDValue
@@ -7627,7 +7659,7 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
unsigned char OperandFlags, bool LocalDynamic = false) {
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
- DebugLoc dl = GA->getDebugLoc();
+ SDLoc dl(GA);
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
GA->getValueType(0),
GA->getOffset(),
@@ -7656,10 +7688,10 @@ static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT) {
SDValue InFlag;
- DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better
+ SDLoc dl(GA); // ? function entry point might be better
SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
DAG.getNode(X86ISD::GlobalBaseReg,
- DebugLoc(), PtrVT), InFlag);
+ SDLoc(), PtrVT), InFlag);
InFlag = Chain.getValue(1);
return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
@@ -7677,7 +7709,7 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
SelectionDAG &DAG,
const EVT PtrVT,
bool is64Bit) {
- DebugLoc dl = GA->getDebugLoc();
+ SDLoc dl(GA);
// Get the start address of the TLS block for this module.
X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
@@ -7691,7 +7723,7 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
} else {
SDValue InFlag;
SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
- DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), InFlag);
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
InFlag = Chain.getValue(1);
Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
X86II::MO_TLSLDM, /*LocalDynamic=*/true);
@@ -7716,7 +7748,7 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT, TLSModel::Model model,
bool is64Bit, bool isPIC) {
- DebugLoc dl = GA->getDebugLoc();
+ SDLoc dl(GA);
// Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
@@ -7755,7 +7787,7 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
if (model == TLSModel::InitialExec) {
if (isPIC && !is64Bit) {
Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
- DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT),
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
Offset);
}
@@ -7809,7 +7841,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
OpFlag = X86II::MO_TLVP_PIC_BASE;
else
OpFlag = X86II::MO_TLVP;
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
GA->getValueType(0),
GA->getOffset(), OpFlag);
@@ -7819,7 +7851,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
if (PIC32)
Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
DAG.getNode(X86ISD::GlobalBaseReg,
- DebugLoc(), getPointerTy()),
+ SDLoc(), getPointerTy()),
Offset);
// Lowering the machine isd will make sure everything is in the right
@@ -7856,7 +7888,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
// thread-localness.
if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
GV = GA->resolveAliasedGlobal(false);
- DebugLoc dl = GA->getDebugLoc();
+ SDLoc dl(GA);
SDValue Chain = DAG.getEntryNode();
// Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
@@ -7914,7 +7946,7 @@ SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
EVT VT = Op.getValueType();
unsigned VTBits = VT.getSizeInBits();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
@@ -7973,7 +8005,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
return Op;
}
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
unsigned Size = SrcVT.getSizeInBits()/8;
MachineFunction &MF = DAG.getMachineFunction();
int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
@@ -7989,7 +8021,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
SDValue StackSlot,
SelectionDAG &DAG) const {
// Build the FILD
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
SDVTList Tys;
bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
if (useSSE)
@@ -8064,7 +8096,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
#endif
*/
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
LLVMContext *Context = DAG.getContext();
// Build some magic constants.
@@ -8118,7 +8150,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
SelectionDAG &DAG) const {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
// FP constant to bias correct the final result.
SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
MVT::f64);
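The BitsToDouble(0x4330000000000000ULL) bias is 2^52: the lowering builds a double whose low mantissa bits hold the 32-bit value and then subtracts the bias out again. A minimal standalone illustration of that identity in plain C++; the function name and the memcpy-based bit cast are only for the example, not part of the patch.

    // Illustrative only: the 2^52 bias trick behind this expansion. OR the 32-bit
    // value into the mantissa of 2^52, then subtract 2^52 to leave exactly x.
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    double u32_to_double(uint32_t x) {
      uint64_t bits = 0x4330000000000000ULL | x;   // double with value 2^52 + x
      double d;
      std::memcpy(&d, &bits, sizeof d);            // reinterpret the bit pattern
      return d - 4503599627370496.0;               // subtract 2^52
    }

    int main() {
      std::printf("%.1f\n", u32_to_double(0xFFFFFFFFu));   // 4294967295.0
      return 0;
    }
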
@@ -8166,7 +8198,7 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
SelectionDAG &DAG) const {
SDValue N0 = Op.getOperand(0);
EVT SVT = N0.getValueType();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 ||
SVT == MVT::v8i8 || SVT == MVT::v8i16) &&
@@ -8181,7 +8213,7 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
SDValue N0 = Op.getOperand(0);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
if (Op.getValueType().isVector())
return lowerUINT_TO_FP_vec(Op, DAG);
@@ -8240,7 +8272,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
APInt FF(32, 0x5F800000ULL);
// Check whether the sign bit is set.
- SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
+ SDValue SignSet = DAG.getSetCC(dl,
+ getSetCCResultType(*DAG.getContext(), MVT::i64),
Op.getOperand(0), DAG.getConstant(0, MVT::i64),
ISD::SETLT);
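The 0x5F800000 constant is 2^64 as a float, and the SETLT test checks the sign bit: the value is converted as a signed integer and, when the sign bit was set, 2^64 is added back. A rough standalone sketch of the same idea in plain C++; it glosses over the double-rounding corner cases the real lowering also has to live with, and the function name is illustrative.

    // Illustrative only: convert as signed, then add 2^64 back when the sign bit
    // of the unsigned input was set (0x5F800000 is 2^64 as a float).
    #include <cstdint>
    #include <cstdio>

    double u64_to_double(uint64_t x) {
      double d = static_cast<double>(static_cast<int64_t>(x));  // signed convert
      if (x >> 63)                    // sign bit set: the signed value was x - 2^64
        d += 18446744073709551616.0;  // add 2^64 back
      return d;
    }

    int main() {
      std::printf("%.1f\n", u64_to_double(0x8000000000000000ULL));  // 9223372036854775808.0
      return 0;
    }
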
@@ -8269,7 +8302,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
std::pair<SDValue,SDValue>
X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
bool IsSigned, bool IsReplace) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
EVT DstTy = Op.getValueType();
@@ -8366,7 +8399,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
MVT VT = Op->getValueType(0).getSimpleVT();
SDValue In = Op->getOperand(0);
MVT InVT = In.getValueType().getSimpleVT();
- DebugLoc dl = Op->getDebugLoc();
+ SDLoc dl(Op);
// Optimize vectors in AVX mode:
//
@@ -8415,7 +8448,7 @@ SDValue X86TargetLowering::LowerANY_EXTEND(SDValue Op,
}
SDValue X86TargetLowering::LowerZERO_EXTEND(SDValue Op,
SelectionDAG &DAG) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
MVT VT = Op.getValueType().getSimpleVT();
SDValue In = Op.getOperand(0);
MVT SVT = In.getValueType().getSimpleVT();
@@ -8447,7 +8480,7 @@ SDValue X86TargetLowering::LowerZERO_EXTEND(SDValue Op,
}
SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
MVT VT = Op.getValueType().getSimpleVT();
SDValue In = Op.getOperand(0);
MVT SVT = In.getValueType().getSimpleVT();
@@ -8568,8 +8601,8 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
MVT VT = Op.getValueType().getSimpleVT();
if (VT.isVector()) {
if (VT == MVT::v8i16)
- return DAG.getNode(ISD::TRUNCATE, Op.getDebugLoc(), VT,
- DAG.getNode(ISD::FP_TO_SINT, Op.getDebugLoc(),
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(Op), VT,
+ DAG.getNode(ISD::FP_TO_SINT, SDLoc(Op),
MVT::v8i32, Op.getOperand(0)));
return SDValue();
}
@@ -8582,7 +8615,7 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
if (StackSlot.getNode())
// Load the result.
- return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
+ return DAG.getLoad(Op.getValueType(), SDLoc(Op),
FIST, StackSlot, MachinePointerInfo(),
false, false, false, 0);
@@ -8599,7 +8632,7 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
if (StackSlot.getNode())
// Load the result.
- return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
+ return DAG.getLoad(Op.getValueType(), SDLoc(Op),
FIST, StackSlot, MachinePointerInfo(),
false, false, false, 0);
@@ -8608,7 +8641,7 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
}
static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
MVT VT = Op.getValueType().getSimpleVT();
SDValue In = Op.getOperand(0);
MVT SVT = In.getValueType().getSimpleVT();
@@ -8622,7 +8655,7 @@ static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
LLVMContext *Context = DAG.getContext();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
MVT VT = Op.getValueType().getSimpleVT();
MVT EltVT = VT;
unsigned NumElts = VT == MVT::f64 ? 2 : 4;
@@ -8656,7 +8689,7 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
LLVMContext *Context = DAG.getContext();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
MVT VT = Op.getValueType().getSimpleVT();
MVT EltVT = VT;
unsigned NumElts = VT == MVT::f64 ? 2 : 4;
@@ -8693,7 +8726,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
LLVMContext *Context = DAG.getContext();
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
MVT VT = Op.getValueType().getSimpleVT();
MVT SrcVT = Op1.getValueType().getSimpleVT();
@@ -8770,7 +8803,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
SDValue N0 = Op.getOperand(0);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
MVT VT = Op.getValueType().getSimpleVT();
// Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
@@ -8792,7 +8825,7 @@ SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op,
return SDValue();
SDNode *N = Op.getNode();
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
SmallVector<SDValue, 8> Opnds;
DenseMap<SDValue, unsigned> VecInMap;
@@ -8876,7 +8909,7 @@ SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op,
/// equivalent.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
SelectionDAG &DAG) const {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
// CF and OF aren't always set the way we want. Determine which
// of these we need.
@@ -9091,7 +9124,7 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
if (C->getAPIntValue() == 0)
return EmitTest(Op0, X86CC, DAG);
- DebugLoc dl = Op0.getDebugLoc();
+ SDLoc dl(Op0);
if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
// Use SUB instead of CMP to enable CSE between SUB and CMP.
@@ -9118,7 +9151,7 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
// FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
// build an SDNode sequence that transfers the result from FPSW into EFLAGS:
// (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
- DebugLoc dl = Cmp.getDebugLoc();
+ SDLoc dl(Cmp);
SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
@@ -9135,7 +9168,7 @@ static bool isAllOnes(SDValue V) {
/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
/// if it's possible.
SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
- DebugLoc dl, SelectionDAG &DAG) const {
+ SDLoc dl, SelectionDAG &DAG) const {
SDValue Op0 = And.getOperand(0);
SDValue Op1 = And.getOperand(1);
if (Op0.getOpcode() == ISD::TRUNCATE)
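LowerToBT rewrites mask-and-compare patterns such as `(X & (1 << n)) == 0` into a single BT, relying on the fact that testing the masked bit and testing the shifted-down bit agree. A tiny standalone check of that equivalence; the values and loop are only illustrative.

    // Illustrative only: the equivalence LowerToBT relies on, checked for a few values.
    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t x : {0u, 1u, 0x80000000u, 0xDEADBEEFu})
        for (unsigned n = 0; n < 32; ++n)
          assert(((x & (1u << n)) != 0) == (((x >> n) & 1u) != 0));  // BT tests bit n
      return 0;
    }
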
@@ -9180,14 +9213,6 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
}
if (LHS.getNode()) {
- // If the LHS is of the form (x ^ -1) then replace the LHS with x and flip
- // the condition code later.
- bool Invert = false;
- if (LHS.getOpcode() == ISD::XOR && isAllOnes(LHS.getOperand(1))) {
- Invert = true;
- LHS = LHS.getOperand(0);
- }
-
// If LHS is i8, promote it to i32 with any_extend. There is no i8 BT
// instruction. Since the shift amount is in-range-or-undefined, we know
// that doing a bittest on the i32 value is ok. We extend to i32 because
@@ -9204,9 +9229,6 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
- // Flip the condition if the LHS was a not instruction
- if (Invert)
- Cond = X86::GetOppositeBranchCondition(Cond);
return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
DAG.getConstant(Cond, MVT::i8), BT);
}
@@ -9223,7 +9245,7 @@ static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
"Unsupported value type for operation");
unsigned NumElems = VT.getVectorNumElements();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue CC = Op.getOperand(2);
// Extract the LHS vectors
@@ -9253,7 +9275,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
MVT VT = Op.getValueType().getSimpleVT();
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
bool isFP = Op.getOperand(1).getValueType().getSimpleVT().isFloatingPoint();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
if (isFP) {
#ifndef NDEBUG
@@ -9347,29 +9369,31 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
if (Swap)
std::swap(Op0, Op1);
- // Since SSE has no unsigned integer comparisons, we need to flip the sign
- // bits of the inputs before performing those operations.
- if (FlipSigns) {
- EVT EltVT = VT.getVectorElementType();
- SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
- EltVT);
- std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
- SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
- SignBits.size());
- Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
- Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
- }
-
// Check that the operation in question is available (most are plain SSE2,
// but PCMPGTQ and PCMPEQQ have different requirements).
if (VT == MVT::v2i64) {
if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
assert(Subtarget->hasSSE2() && "Don't know how to lower!");
- // First cast everything to the right type,
+ // First cast everything to the right type.
Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
+ // Since SSE has no unsigned integer comparisons, we need to flip the sign
+ // bits of the inputs before performing those operations. The lower
+ // compare is always unsigned.
+ SDValue SB;
+ if (FlipSigns) {
+ SB = DAG.getConstant(0x80000000U, MVT::v4i32);
+ } else {
+ SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32);
+ SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32);
+ SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+ Sign, Zero, Sign, Zero);
+ }
+ Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
+ Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
+
// Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
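The emulation named in the comment splits each 64-bit element into 32-bit halves: the high halves decide the result unless they are equal, in which case the low halves decide, and the low compare must be unsigned. A standalone scalar version of the same decomposition, in plain C++ rather than DAG nodes, with illustrative names.

    // Illustrative only: 64-bit signed greater-than rebuilt from 32-bit compares,
    // matching the "(hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))" comment above.
    // High halves compare signed, low halves compare unsigned. Assumes the usual
    // arithmetic right shift on signed values.
    #include <cassert>
    #include <cstdint>

    bool sgt64_from_32(int64_t a, int64_t b) {
      int32_t  ahi = int32_t(a >> 32), bhi = int32_t(b >> 32);
      uint32_t alo = uint32_t(a),      blo = uint32_t(b);
      return (ahi > bhi) || (ahi == bhi && alo > blo);
    }

    int main() {
      const int64_t vals[] = {0, 1, -1, 0x100000000LL, -0x100000000LL,
                              0x7FFFFFFFLL, -0x80000000LL, INT64_MAX, INT64_MIN};
      for (int64_t a : vals)
        for (int64_t b : vals)
          assert(sgt64_from_32(a, b) == (a > b));
      return 0;
    }
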
@@ -9395,7 +9419,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
// pcmpeqd + pshufd + pand.
assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
- // First cast everything to the right type,
+ // First cast everything to the right type.
Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
@@ -9414,6 +9438,15 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
}
}
+ // Since SSE has no unsigned integer comparisons, we need to flip the sign
+ // bits of the inputs before performing those operations.
+ if (FlipSigns) {
+ EVT EltVT = VT.getVectorElementType();
+ SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT);
+ Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
+ Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
+ }
+
SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
// If the logical-not of the result is required, perform that now.
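FlipSigns compensates for SSE having only signed integer compares: XOR-ing both operands with the sign bit turns a signed compare into an unsigned one. A small standalone check of that identity; it assumes the usual two's-complement narrowing, and the names are illustrative.

    // Illustrative only: flipping the sign bit of both operands turns a signed
    // compare into an unsigned one.
    #include <cassert>
    #include <cstdint>

    bool ugt_via_signed(uint32_t a, uint32_t b) {
      int32_t sa = int32_t(a ^ 0x80000000u);   // XOR with the sign bit, as FlipSigns does
      int32_t sb = int32_t(b ^ 0x80000000u);
      return sa > sb;                          // signed compare now orders the unsigned values
    }

    int main() {
      const uint32_t vals[] = {0u, 1u, 0x7FFFFFFFu, 0x80000000u, 0xFFFFFFFFu};
      for (uint32_t a : vals)
        for (uint32_t b : vals)
          assert(ugt_via_signed(a, b) == (a > b));
      return 0;
    }
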
@@ -9432,7 +9465,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
// Optimize to BT if possible.
@@ -9526,7 +9559,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Cond = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue Op2 = Op.getOperand(2);
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
SDValue CC;
if (Cond.getOpcode() == ISD::SETCC) {
@@ -9716,7 +9749,7 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND(SDValue Op,
MVT VT = Op->getValueType(0).getSimpleVT();
SDValue In = Op->getOperand(0);
MVT InVT = In.getValueType().getSimpleVT();
- DebugLoc dl = Op->getDebugLoc();
+ SDLoc dl(Op);
if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
(VT != MVT::v8i32 || InVT != MVT::v8i16))
@@ -9789,7 +9822,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
SDValue Cond = Op.getOperand(1);
SDValue Dest = Op.getOperand(2);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue CC;
bool Inverted = false;
@@ -10059,7 +10092,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
"This should be used only on Windows targets or when segmented stacks "
"are being used");
assert(!Subtarget->isTargetEnvMacho() && "Not implemented");
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
// Get the inputs.
SDValue Chain = Op.getOperand(0);
@@ -10104,6 +10137,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
Flag = Chain.getValue(1);
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
Chain = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
SPTy).getValue(1);
@@ -10117,7 +10152,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
// vastart just stores the address of the VarArgsFrameIndex slot into the
@@ -10184,7 +10219,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
SDValue SrcPtr = Op.getOperand(1);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
unsigned Align = Op.getConstantOperandVal(3);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
EVT ArgVT = Op.getNode()->getValueType(0);
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
@@ -10250,7 +10285,7 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
SDValue SrcPtr = Op.getOperand(2);
const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
@@ -10260,7 +10295,7 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
// getTargetVShiftNode - Handle vector element shifts where the shift amount
// may or may not be a constant. Takes immediate version of shift as input.
-static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT,
+static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, EVT VT,
SDValue SrcOp, SDValue ShAmt,
SelectionDAG &DAG) {
assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
@@ -10304,7 +10339,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT,
}
static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
@@ -10949,7 +10984,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
}
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
@@ -11004,13 +11039,14 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
MFI->setReturnAddressIsTaken(true);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
EVT PtrVT = getPointerTy();
if (Depth > 0) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
- SDValue Offset =
- DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
+ SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, dl, PtrVT,
FrameAddr, Offset),
@@ -11028,8 +11064,10 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
MFI->setFrameAddressIsTaken(true);
EVT VT = Op.getValueType();
- DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful
+ SDLoc dl(Op); // FIXME probably not meaningful
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
(FrameReg == X86::EBP && VT == MVT::i32)) &&
@@ -11044,6 +11082,8 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
SelectionDAG &DAG) const {
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
}
@@ -11051,9 +11091,11 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
SDValue Offset = Op.getOperand(1);
SDValue Handler = Op.getOperand(2);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
EVT PtrVT = getPointerTy();
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
(FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
@@ -11074,7 +11116,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
SelectionDAG &DAG) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
DAG.getVTList(MVT::i32, MVT::Other),
Op.getOperand(0), Op.getOperand(1));
@@ -11082,7 +11124,7 @@ SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
SelectionDAG &DAG) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
Op.getOperand(0), Op.getOperand(1));
}
@@ -11097,7 +11139,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
SDValue Trmp = Op.getOperand(1); // trampoline
SDValue FPtr = Op.getOperand(2); // nested function
SDValue Nest = Op.getOperand(3); // 'nest' parameter value
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
@@ -11267,7 +11309,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
const TargetFrameLowering &TFI = *TM.getFrameLowering();
unsigned StackAlignment = TFI.getStackAlignment();
EVT VT = Op.getValueType();
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
// Save FP Control Word to stack slot
int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
@@ -11314,7 +11356,7 @@ static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
EVT OpVT = VT;
unsigned NumBits = VT.getSizeInBits();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
Op = Op.getOperand(0);
if (VT == MVT::i8) {
@@ -11348,7 +11390,7 @@ static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
EVT OpVT = VT;
unsigned NumBits = VT.getSizeInBits();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
Op = Op.getOperand(0);
if (VT == MVT::i8) {
@@ -11372,7 +11414,7 @@ static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
unsigned NumBits = VT.getSizeInBits();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
Op = Op.getOperand(0);
// Issue a bsf (scan bits forward) which also sets EFLAGS.
@@ -11398,7 +11440,7 @@ static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
"Unsupported value type for operation");
unsigned NumElems = VT.getVectorNumElements();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
// Extract the LHS vectors
SDValue LHS = Op.getOperand(0);
@@ -11434,7 +11476,7 @@ static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
EVT VT = Op.getValueType();
// Decompose 256-bit ops into smaller 128-bit ops.
@@ -11510,7 +11552,7 @@ SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
EVT EltTy = VT.getVectorElementType();
unsigned NumElts = VT.getVectorNumElements();
SDValue N0 = Op.getOperand(0);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
// Lower sdiv X, pow2-const.
BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(Op.getOperand(1));
@@ -11551,7 +11593,7 @@ SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
EVT VT = Op.getValueType();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
@@ -11717,7 +11759,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
const X86Subtarget* Subtarget) {
EVT VT = Op.getValueType();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
@@ -11853,7 +11895,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
SDValue V;
@@ -11989,7 +12031,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
SDValue RHS = N->getOperand(1);
unsigned BaseOp = 0;
unsigned Cond = 0;
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
switch (Op.getOpcode()) {
default: llvm_unreachable("Unknown ovf instruction!");
case ISD::SADDO:
@@ -12056,7 +12098,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
SelectionDAG &DAG) const {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
EVT VT = Op.getValueType();
@@ -12123,7 +12165,7 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
@@ -12160,7 +12202,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
EVT T = Op.getValueType();
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
unsigned Reg = 0;
unsigned size = 0;
switch(T.getSimpleVT().SimpleTy) {
@@ -12194,7 +12236,7 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
assert(Subtarget->is64Bit() && "Result not type legalized?");
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue TheChain = Op.getOperand(0);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
@@ -12230,7 +12272,7 @@ SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
SDNode *Node = Op.getNode();
- DebugLoc dl = Node->getDebugLoc();
+ SDLoc dl(Node);
EVT T = Node->getValueType(0);
SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
DAG.getConstant(0, T), Node->getOperand(2));
@@ -12246,7 +12288,7 @@ static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
SDNode *Node = Op.getNode();
- DebugLoc dl = Node->getDebugLoc();
+ SDLoc dl(Node);
EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
// Convert seq_cst store -> xchg
@@ -12289,9 +12331,9 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
}
if (!ExtraOp)
- return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
+ return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
Op.getOperand(1));
- return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
+ return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
Op.getOperand(1), Op.getOperand(2));
}
@@ -12301,7 +12343,7 @@ SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
// For MacOSX, we want to call an alternative entry point: __sincos_stret,
// which returns the values as { float, float } (in XMM0) or
// { double, double } (which is returned in XMM0, XMM1).
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue Arg = Op.getOperand(0);
EVT ArgVT = Arg.getValueType();
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
@@ -12434,7 +12476,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
static void ReplaceATOMIC_LOAD(SDNode *Node,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) {
- DebugLoc dl = Node->getDebugLoc();
+ SDLoc dl(Node);
EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
// Convert wide load -> cmpxchg8b/cmpxchg16b
@@ -12455,7 +12497,7 @@ static void ReplaceATOMIC_LOAD(SDNode *Node,
static void
ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG, unsigned NewOp) {
- DebugLoc dl = Node->getDebugLoc();
+ SDLoc dl(Node);
assert (Node->getValueType(0) == MVT::i64 &&
"Only know how to expand i64 atomics");
@@ -12480,7 +12522,7 @@ ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG) const {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
switch (N->getOpcode()) {
default:
@@ -14589,6 +14631,9 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
// Setup
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
.addMBB(restoreMBB);
+
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
MIB.addRegMask(RegInfo->getNoPreservedMask());
thisMBB->addSuccessor(mainMBB);
thisMBB->addSuccessor(restoreMBB);
@@ -14634,6 +14679,8 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
(PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
unsigned Tmp = MRI.createVirtualRegister(RC);
// Since FP is only updated here but NOT referenced, it's treated as GPR.
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
unsigned SP = RegInfo->getStackRegister();
@@ -15034,7 +15081,7 @@ static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget* Subtarget) {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
@@ -15130,7 +15177,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
EVT VT = N->getValueType(0);
// Don't create instructions with illegal types after legalize types has run.
@@ -15249,7 +15296,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
// All checks match so transform back to vector_shuffle so that DAG combiner
// can finish the job
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
// Create shuffle node taking into account the case that it's a unary shuffle
SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1);
@@ -15276,7 +15323,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
- return DAG.getNode(X86ISD::MMX_MOVD2W, InputVector.getDebugLoc(),
+ return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
N->getValueType(0),
InputVector.getNode()->getOperand(0));
@@ -15321,7 +15368,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
// Ok, we've now decided to do the transformation.
- DebugLoc dl = InputVector.getDebugLoc();
+ SDLoc dl(InputVector);
// Store the value to a temporary stack slot.
SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
@@ -15432,7 +15479,7 @@ static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS,
static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
SDValue Cond = N->getOperand(0);
// Get the LHS/RHS of the select.
SDValue LHS = N->getOperand(1);
@@ -15700,7 +15747,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
case ISD::SETLT:
case ISD::SETGT: {
ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
- Cond = DAG.getSetCC(Cond.getDebugLoc(), Cond.getValueType(),
+ Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
Cond.getOperand(0), Cond.getOperand(1), NewCC);
return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
}
@@ -15976,7 +16023,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
// If the flag operand isn't dead, don't touch this CMOV.
if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
@@ -16179,7 +16226,7 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
}
if (MulAmt2 &&
(isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
if (isPowerOf2_64(MulAmt2) &&
!(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
@@ -16229,7 +16276,7 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
APInt ShAmt = N1C->getAPIntValue();
Mask = Mask.shl(ShAmt);
if (Mask != 0)
- return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+ return DAG.getNode(ISD::AND, SDLoc(N), VT,
N00, DAG.getConstant(Mask, VT));
}
}
@@ -16245,15 +16292,14 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
// hardware support for this operation. This is better expressed as an ADD
// of two values.
if (N1C && (1 == N1C->getZExtValue())) {
- return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, N0);
+ return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
}
}
return SDValue();
}
-/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
-/// when possible.
+/// PerformShiftCombine - Combine shifts.
static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
@@ -16280,7 +16326,7 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
SDValue N1 = N->getOperand(1);
SDValue CMP0 = N0->getOperand(1);
SDValue CMP1 = N1->getOperand(1);
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
// The SETCCs should both refer to the same CMP.
if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
@@ -16399,7 +16445,7 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
SDValue N0 = Narrow->getOperand(0);
SDValue N1 = Narrow->getOperand(1);
- DebugLoc DL = Narrow->getDebugLoc();
+ SDLoc DL(Narrow);
// The Left side has to be a trunc.
if (N0.getOpcode() != ISD::TRUNCATE)
@@ -16471,7 +16517,7 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
// Check LHS for neg
if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
@@ -16505,7 +16551,7 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
// Check LHS for vnot
if (N0.getOpcode() == ISD::XOR &&
@@ -16589,7 +16635,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
if ((SraAmt + 1) != EltBits)
return SDValue();
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
// Now we know we at least have a pblendvb with the mask val. See if
// we can form a psignb/w/d.
@@ -16638,7 +16684,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
if (ShAmt1.getOpcode() == ISD::TRUNCATE)
ShAmt1 = ShAmt1.getOperand(0);
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
unsigned Opc = X86ISD::SHLD;
SDValue Op0 = N0.getOperand(0);
SDValue Op1 = N1.getOperand(0);
@@ -16685,7 +16731,7 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
// Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
// and change it to SUB and CMOV.
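The XOR(ADD(X,Y), Y) pattern with Y = SRA(X, size-1) is the classic branch-free absolute value, which is why it can be rewritten as SUB plus CMOV. A standalone scalar version, illustrative only; like the combine it assumes an arithmetic right shift and leaves INT_MIN aside.

    // Illustrative only: the branch-free absolute value behind the combine.
    #include <cassert>
    #include <cstdint>
    #include <cstdlib>

    int32_t abs_branchless(int32_t x) {
      int32_t y = x >> 31;        // SRA(X, 31): 0 for x >= 0, -1 for x < 0
      return (x + y) ^ y;         // x >= 0: x;  x < 0: (x - 1) ^ -1 == -x
    }

    int main() {
      for (int32_t x : {0, 1, -1, 123, -123, INT32_MAX})
        assert(abs_branchless(x) == std::abs(x));
      return 0;
    }
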
@@ -16735,7 +16781,7 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
// Create BLSMSK instructions by finding X ^ (X-1)
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
isAllOnes(N0.getOperand(1)))
@@ -16755,7 +16801,7 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
LoadSDNode *Ld = cast<LoadSDNode>(N);
EVT RegVT = Ld->getValueType(0);
EVT MemVT = Ld->getMemoryVT();
- DebugLoc dl = Ld->getDebugLoc();
+ SDLoc dl(Ld);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned RegSz = RegVT.getSizeInBits();
@@ -16950,7 +16996,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
StoreSDNode *St = cast<StoreSDNode>(N);
EVT VT = St->getValue().getValueType();
EVT StVT = St->getMemoryVT();
- DebugLoc dl = St->getDebugLoc();
+ SDLoc dl(St);
SDValue StoredVal = St->getOperand(1);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -17113,8 +17159,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
return SDValue();
- DebugLoc LdDL = Ld->getDebugLoc();
- DebugLoc StDL = N->getDebugLoc();
+ SDLoc LdDL(Ld);
+ SDLoc StDL(N);
// If we are a 64-bit capable x86, lower to a single movq load/store pair.
// Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
// pair instead.
@@ -17313,7 +17359,7 @@ static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
(Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
isHorizontalBinOp(LHS, RHS, true))
- return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS);
+ return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);
return SDValue();
}
@@ -17328,7 +17374,7 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
(Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
isHorizontalBinOp(LHS, RHS, false))
- return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS);
+ return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);
return SDValue();
}
@@ -17365,7 +17411,7 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
}
- return DAG.getNode(NewOp, N->getDebugLoc(), N->getValueType(0),
+ return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
N->getOperand(0), N->getOperand(1));
}
@@ -17409,12 +17455,12 @@ static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
VT.getVectorElementType().getSizeInBits() ==
OpVT.getVectorElementType().getSizeInBits()) {
- return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op);
+ return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
}
return SDValue();
}
-static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
+static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
EVT VT = N->getValueType(0);
if (!VT.isVector())
@@ -17423,7 +17469,7 @@ static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
// The SIGN_EXTEND_INREG to v4i64 is an expensive operation on the
// both SSE and AVX2 since there is no sign-extended shift right
@@ -17434,14 +17480,14 @@ static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
N0.getOpcode() == ISD::SIGN_EXTEND)) {
SDValue N00 = N0.getOperand(0);
- // EXTLOAD has a better solution on AVX2,
+ // EXTLOAD has a better solution on AVX2,
// it may be replaced with X86ISD::VSEXT node.
if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
if (!ISD::isNormalLoad(N00.getNode()))
return SDValue();
if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
- SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
+ SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
N00, N1);
return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
}
@@ -17470,7 +17516,7 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget* Subtarget) {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
EVT VT = N->getValueType(0);
// Let legalize expand this if it isn't a legal type yet.
@@ -17515,7 +17561,7 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
// (and (i32 x86isd::setcc_carry), 1)
// This eliminates the zext. This transformation is necessary because
// ISD::SETCC is always legalized to i8.
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
@@ -17553,17 +17599,17 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
- SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(),
+ SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
LHS.getValueType(), RHS, LHS.getOperand(1));
- return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0),
+ return DAG.getSetCC(SDLoc(N), N->getValueType(0),
addV, DAG.getConstant(0, addV.getValueType()), CC);
}
if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
- SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(),
+ SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
RHS.getValueType(), LHS, RHS.getOperand(1));
- return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0),
+ return DAG.getSetCC(SDLoc(N), N->getValueType(0),
addV, DAG.getConstant(0, addV.getValueType()), CC);
}
return SDValue();
@@ -17572,7 +17618,7 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
// Helper function of PerformSETCCCombine. It is to materialize "setb reg"
// as "sbb reg,reg", since it can be extended without zext and produces
// an all-ones bit which is more useful than 0/1 in some cases.
-static SDValue MaterializeSETB(DebugLoc DL, SDValue EFLAGS, SelectionDAG &DAG) {
+static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG) {
return DAG.getNode(ISD::AND, DL, MVT::i8,
DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
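The point of materializing SETB as `sbb reg,reg` is that subtract-with-borrow of a register from itself yields 0 or all-ones depending on the carry flag, and masking with 1 recovers the plain 0/1 result. The same arithmetic written out as standalone C++, for illustration only.

    // Illustrative only: sbb of a register from itself yields 0 or all-ones from
    // the carry flag; AND-ing with 1 recovers the 0/1 value SETB would produce.
    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned carry : {0u, 1u}) {
        uint32_t wide = 0u - carry;               // what sbb %r, %r leaves behind
        assert(wide == (carry ? 0xFFFFFFFFu : 0u));
        assert((wide & 1u) == carry);             // the final AND ..., 1 above
      }
      return 0;
    }
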
@@ -17583,7 +17629,7 @@ static SDValue MaterializeSETB(DebugLoc DL, SDValue EFLAGS, SelectionDAG &DAG) {
static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
SDValue EFLAGS = N->getOperand(1);
@@ -17597,7 +17643,7 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
EFLAGS.getValueType().isInteger() &&
!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
- SDValue NewSub = DAG.getNode(X86ISD::SUB, EFLAGS.getDebugLoc(),
+ SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
EFLAGS.getNode()->getVTList(),
EFLAGS.getOperand(1), EFLAGS.getOperand(0));
SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
@@ -17627,7 +17673,7 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
SDValue Chain = N->getOperand(0);
SDValue Dest = N->getOperand(1);
SDValue EFLAGS = N->getOperand(3);
@@ -17652,7 +17698,7 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
// SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
@@ -17687,7 +17733,7 @@ static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
// We don't have a good way to replace an EFLAGS use, so only do this when
// dead right now.
SDValue(N, 1).use_empty()) {
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
@@ -17706,7 +17752,7 @@ static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
// (sub (sete X, 0), Y) -> sbb 0, Y
// (sub (setne X, 0), Y) -> adc -1, Y
static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
- DebugLoc DL = N->getDebugLoc();
+ SDLoc DL(N);
// Look through ZExts.
SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
@@ -17752,7 +17798,7 @@ static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
(Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
isHorizontalBinOp(Op0, Op1, true))
- return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1);
+ return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
return OptimizeConditionalInDecrement(N, DAG);
}
@@ -17772,10 +17818,10 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
isa<ConstantSDNode>(Op1.getOperand(1))) {
APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
EVT VT = Op0.getValueType();
- SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT,
+ SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
Op1.getOperand(0),
DAG.getConstant(~XorC, VT));
- return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor,
+ return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
DAG.getConstant(C->getAPIntValue()+1, VT));
}
}
@@ -17785,7 +17831,7 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
(Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
isHorizontalBinOp(Op0, Op1, true))
- return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1);
+ return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
return OptimizeConditionalInDecrement(N, DAG);
}
@@ -17802,7 +17848,7 @@ static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
if (In.getOpcode() != X86ISD::VZEXT)
return SDValue();
- return DAG.getNode(X86ISD::VZEXT, N->getDebugLoc(), N->getValueType(0),
+ return DAG.getNode(X86ISD::VZEXT, SDLoc(N), N->getValueType(0),
In.getOperand(0));
}
@@ -18362,7 +18408,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
getTargetMachine())))
return;
- Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
+ Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
GA->getValueType(0), Offset);
break;
}
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 2727e22..c0e1015 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -511,7 +511,7 @@ namespace llvm {
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
virtual EVT
- getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+ getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
MachineFunction &MF) const;
@@ -563,7 +563,7 @@ namespace llvm {
virtual const char *getTargetNodeName(unsigned Opcode) const;
/// getSetCCResultType - Return the value type to use for ISD::SETCC.
- virtual EVT getSetCCResultType(EVT VT) const;
+ virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
/// computeMaskedBitsForTargetNode - Determine which of the bits specified
/// in Mask are known to be either zero or one and return them in the
@@ -734,7 +734,6 @@ namespace llvm {
/// Subtarget - Keep a pointer to the X86Subtarget around so that we can
/// make the right decision when generating code for different targets.
const X86Subtarget *Subtarget;
- const X86RegisterInfo *RegInfo;
const DataLayout *TD;
/// Used to store the TargetOptions so that we don't waste time resetting
@@ -760,16 +759,16 @@ namespace llvm {
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
SDValue LowerMemArgument(SDValue Chain,
CallingConv::ID CallConv,
const SmallVectorImpl<ISD::InputArg> &ArgInfo,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
const CCValAssign &VA, MachineFrameInfo *MFI,
unsigned i) const;
SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
const CCValAssign &VA,
ISD::ArgFlagsTy Flags) const;
@@ -791,7 +790,7 @@ namespace llvm {
bool IsCalleePop(bool isVarArg, CallingConv::ID CallConv) const;
SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
SDValue Chain, bool IsTailCall, bool Is64Bit,
- int FPDiff, DebugLoc dl) const;
+ int FPDiff, SDLoc dl) const;
unsigned GetAlignedArgumentStackSize(unsigned StackSize,
SelectionDAG &DAG) const;
@@ -800,7 +799,7 @@ namespace llvm {
bool isSigned,
bool isReplace) const;
- SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
+ SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, SDLoc dl,
SelectionDAG &DAG) const;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
@@ -808,7 +807,7 @@ namespace llvm {
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
+ SDValue LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
int64_t Offset, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
@@ -830,7 +829,7 @@ namespace llvm {
SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerToBT(SDValue And, ISD::CondCode CC,
- DebugLoc dl, SelectionDAG &DAG) const;
+ SDLoc dl, SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
@@ -865,7 +864,7 @@ namespace llvm {
LowerFormalArguments(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
virtual SDValue
LowerCall(CallLoweringInfo &CLI,
@@ -876,7 +875,7 @@ namespace llvm {
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc dl, SelectionDAG &DAG) const;
+ SDLoc dl, SelectionDAG &DAG) const;
virtual bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const;
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
index 225e972..fa2b2d8 100644
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@@ -294,7 +294,7 @@ def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8
// unsigned division/remainder
let hasSideEffects = 1 in { // so that we don't speculatively execute
let SchedRW = [WriteIDiv] in {
-let Defs = [AL,EFLAGS,AX], Uses = [AX] in
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
def DIV8r : I<0xF6, MRM6r, (outs), (ins GR8:$src), // AX/r8 = AL,AH
"div{b}\t$src", [], IIC_DIV8_REG>;
let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
@@ -310,7 +310,7 @@ def DIV64r : RI<0xF7, MRM6r, (outs), (ins GR64:$src),
} // SchedRW
let mayLoad = 1 in {
-let Defs = [AL,EFLAGS,AX], Uses = [AX] in
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
def DIV8m : I<0xF6, MRM6m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH
"div{b}\t$src", [], IIC_DIV8_MEM>,
SchedLoadReg<WriteIDivLd>;
@@ -331,7 +331,7 @@ def DIV64m : RI<0xF7, MRM6m, (outs), (ins i64mem:$src),
// Signed division/remainder.
let SchedRW = [WriteIDiv] in {
-let Defs = [AL,EFLAGS,AX], Uses = [AX] in
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
def IDIV8r : I<0xF6, MRM7r, (outs), (ins GR8:$src), // AX/r8 = AL,AH
"idiv{b}\t$src", [], IIC_IDIV8>;
let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
@@ -347,7 +347,7 @@ def IDIV64r: RI<0xF7, MRM7r, (outs), (ins GR64:$src),
} // SchedRW
let mayLoad = 1 in {
-let Defs = [AL,EFLAGS,AX], Uses = [AX] in
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
def IDIV8m : I<0xF6, MRM7m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH
"idiv{b}\t$src", [], IIC_IDIV8>,
SchedLoadReg<WriteIDivLd>;
@@ -960,7 +960,7 @@ class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo,
[(set EFLAGS, (opnode (load addr:$dst),
typeinfo.Imm8Operator:$src))]>;
-// BinOpAI - Instructions like "add %eax, %eax, imm".
+// BinOpAI - Instructions like "add %eax, %eax, imm" that implicitly define
+// EFLAGS.
class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
Register areg, string operands>
: ITy<opcode, RawFrm, typeinfo,
@@ -968,10 +968,18 @@ class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
mnemonic, operands, []>, Sched<[WriteALU]> {
let ImmT = typeinfo.ImmEncoding;
let Uses = [areg];
- let Defs = [areg];
+ let Defs = [areg, EFLAGS];
let hasSideEffects = 0;
}
+// BinOpAI_FF - Instructions like "adc %eax, %eax, imm" that implicitly define
+// and use EFLAGS.
+class BinOpAI_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ Register areg, string operands>
+ : BinOpAI<opcode, mnemonic, typeinfo, areg, operands> {
+ let Uses = [areg, EFLAGS];
+}
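+// For instance, "adc $1, %al" both reads EFLAGS (the incoming carry) and
+// writes it, which is why BinOpAI_FF lists EFLAGS in Uses while inheriting the
+// EFLAGS def from BinOpAI.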
+
/// ArithBinOp_RF - This is an arithmetic binary operator where the pattern is
/// defined with "(set GPR:$dst, EFLAGS, (...".
///
@@ -1030,16 +1038,16 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
def NAME#16mi : BinOpMI_RMW<mnemonic, Xi16, opnode, MemMRM>;
def NAME#32mi : BinOpMI_RMW<mnemonic, Xi32, opnode, MemMRM>;
def NAME#64mi32 : BinOpMI_RMW<mnemonic, Xi64, opnode, MemMRM>;
+ } // Defs = [EFLAGS]
- def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
- "{$src, %al|AL, $src}">;
- def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
- "{$src, %ax|AX, $src}">;
- def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
- "{$src, %eax|EAX, $src}">;
- def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
- "{$src, %rax|RAX, $src}">;
- }
+ def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
+ "{$src, %al|AL, $src}">;
+ def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
+ "{$src, %ax|AX, $src}">;
+ def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
+ "{$src, %eax|EAX, $src}">;
+ def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
+ "{$src, %rax|RAX, $src}">;
}
/// ArithBinOp_RFF - This is an arithmetic binary operator where the pattern is
@@ -1052,7 +1060,7 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
string mnemonic, Format RegMRM, Format MemMRM,
SDNode opnode, bit CommutableRR,
bit ConvertibleToThreeAddress> {
- let Defs = [EFLAGS] in {
+ let Uses = [EFLAGS], Defs = [EFLAGS] in {
let Constraints = "$src1 = $dst" in {
let isCommutable = CommutableRR,
isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
@@ -1101,16 +1109,16 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
def NAME#16mi : BinOpMI_RMW_FF<mnemonic, Xi16, opnode, MemMRM>;
def NAME#32mi : BinOpMI_RMW_FF<mnemonic, Xi32, opnode, MemMRM>;
def NAME#64mi32 : BinOpMI_RMW_FF<mnemonic, Xi64, opnode, MemMRM>;
-
- def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
- "{$src, %al|AL, $src}">;
- def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
- "{$src, %ax|AX, $src}">;
- def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
- "{$src, %eax|EAX, $src}">;
- def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
- "{$src, %rax|RAX, $src}">;
- }
+ } // Uses = [EFLAGS], Defs = [EFLAGS]
+
+ def NAME#8i8 : BinOpAI_FF<BaseOpc4, mnemonic, Xi8 , AL,
+ "{$src, %al|AL, $src}">;
+ def NAME#16i16 : BinOpAI_FF<BaseOpc4, mnemonic, Xi16, AX,
+ "{$src, %ax|AX, $src}">;
+ def NAME#32i32 : BinOpAI_FF<BaseOpc4, mnemonic, Xi32, EAX,
+ "{$src, %eax|EAX, $src}">;
+ def NAME#64i32 : BinOpAI_FF<BaseOpc4, mnemonic, Xi64, RAX,
+ "{$src, %rax|RAX, $src}">;
}
/// ArithBinOp_F - This is an arithmetic binary operator where the pattern is
@@ -1168,16 +1176,16 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
def NAME#16mi : BinOpMI_F<mnemonic, Xi16, opnode, MemMRM>;
def NAME#32mi : BinOpMI_F<mnemonic, Xi32, opnode, MemMRM>;
def NAME#64mi32 : BinOpMI_F<mnemonic, Xi64, opnode, MemMRM>;
+ } // Defs = [EFLAGS]
- def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
- "{$src, %al|AL, $src}">;
- def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
- "{$src, %ax|AX, $src}">;
- def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
- "{$src, %eax|EAX, $src}">;
- def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
- "{$src, %rax|RAX, $src}">;
- }
+ def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
+ "{$src, %al|AL, $src}">;
+ def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
+ "{$src, %ax|AX, $src}">;
+ def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
+ "{$src, %eax|EAX, $src}">;
+ def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
+ "{$src, %rax|RAX, $src}">;
}
@@ -1195,12 +1203,10 @@ defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m,
}
// Arithmetic.
-let Uses = [EFLAGS] in {
- defm ADC : ArithBinOp_RFF<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, X86adc_flag,
- 1, 0>;
- defm SBB : ArithBinOp_RFF<0x18, 0x1A, 0x1C, "sbb", MRM3r, MRM3m, X86sbb_flag,
- 0, 0>;
-}
+defm ADC : ArithBinOp_RFF<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, X86adc_flag,
+ 1, 0>;
+defm SBB : ArithBinOp_RFF<0x18, 0x1A, 0x1C, "sbb", MRM3r, MRM3m, X86sbb_flag,
+ 0, 0>;
let isCompare = 1 in {
defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>;
@@ -1215,28 +1221,36 @@ defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>;
def X86testpat : PatFrag<(ops node:$lhs, node:$rhs),
(X86cmp (and_su node:$lhs, node:$rhs), 0)>;
-let isCompare = 1, Defs = [EFLAGS] in {
- let isCommutable = 1 in {
- def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , X86testpat, MRMSrcReg>;
- def TEST16rr : BinOpRR_F<0x84, "test", Xi16, X86testpat, MRMSrcReg>;
- def TEST32rr : BinOpRR_F<0x84, "test", Xi32, X86testpat, MRMSrcReg>;
- def TEST64rr : BinOpRR_F<0x84, "test", Xi64, X86testpat, MRMSrcReg>;
- } // isCommutable
-
- def TEST8rm : BinOpRM_F<0x84, "test", Xi8 , X86testpat>;
- def TEST16rm : BinOpRM_F<0x84, "test", Xi16, X86testpat>;
- def TEST32rm : BinOpRM_F<0x84, "test", Xi32, X86testpat>;
- def TEST64rm : BinOpRM_F<0x84, "test", Xi64, X86testpat>;
-
- def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>;
- def TEST16ri : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>;
- def TEST32ri : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>;
- def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>;
-
- def TEST8mi : BinOpMI_F<"test", Xi8 , X86testpat, MRM0m, 0xF6>;
- def TEST16mi : BinOpMI_F<"test", Xi16, X86testpat, MRM0m, 0xF6>;
- def TEST32mi : BinOpMI_F<"test", Xi32, X86testpat, MRM0m, 0xF6>;
- def TEST64mi32 : BinOpMI_F<"test", Xi64, X86testpat, MRM0m, 0xF6>;
+let isCompare = 1 in {
+ let Defs = [EFLAGS] in {
+ let isCommutable = 1 in {
+ def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , X86testpat, MRMSrcReg>;
+ def TEST16rr : BinOpRR_F<0x84, "test", Xi16, X86testpat, MRMSrcReg>;
+ def TEST32rr : BinOpRR_F<0x84, "test", Xi32, X86testpat, MRMSrcReg>;
+ def TEST64rr : BinOpRR_F<0x84, "test", Xi64, X86testpat, MRMSrcReg>;
+ } // isCommutable
+
+ def TEST8rm : BinOpRM_F<0x84, "test", Xi8 , X86testpat>;
+ def TEST16rm : BinOpRM_F<0x84, "test", Xi16, X86testpat>;
+ def TEST32rm : BinOpRM_F<0x84, "test", Xi32, X86testpat>;
+ def TEST64rm : BinOpRM_F<0x84, "test", Xi64, X86testpat>;
+
+ def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>;
+ def TEST16ri : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>;
+ def TEST32ri : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>;
+ def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>;
+
+ def TEST8mi : BinOpMI_F<"test", Xi8 , X86testpat, MRM0m, 0xF6>;
+ def TEST16mi : BinOpMI_F<"test", Xi16, X86testpat, MRM0m, 0xF6>;
+ def TEST32mi : BinOpMI_F<"test", Xi32, X86testpat, MRM0m, 0xF6>;
+ def TEST64mi32 : BinOpMI_F<"test", Xi64, X86testpat, MRM0m, 0xF6>;
+
+ // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the
+ // register class is constrained to GR8_NOREX.
+ let isPseudo = 1 in
+ def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask),
+ "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>;
+ } // Defs = [EFLAGS]
def TEST8i8 : BinOpAI<0xA8, "test", Xi8 , AL,
"{$src, %al|AL, $src}">;
@@ -1246,13 +1260,7 @@ let isCompare = 1, Defs = [EFLAGS] in {
"{$src, %eax|EAX, $src}">;
def TEST64i32 : BinOpAI<0xA8, "test", Xi64, RAX,
"{$src, %rax|RAX, $src}">;
-
- // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the
- // register class is constrained to GR8_NOREX.
- let isPseudo = 1 in
- def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask),
- "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>;
-}
+} // isCompare
//===----------------------------------------------------------------------===//
// ANDN Instruction
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index d9ff0c6..8a7ee7d 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -216,48 +216,38 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins),
// Alias Instructions
//===----------------------------------------------------------------------===//
-// Alias instructions that map movr0 to xor.
+// Alias instruction mapping movr0 to xor.
// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
// FIXME: Set encoding to pseudo.
let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
- isCodeGenOnly = 1 in {
-def MOV8r0 : I<0x30, MRMInitReg, (outs GR8 :$dst), (ins), "",
- [(set GR8:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
-
-// We want to rewrite MOV16r0 in terms of MOV32r0, because it's a smaller
-// encoding and avoids a partial-register update sometimes, but doing so
-// at isel time interferes with rematerialization in the current register
-// allocator. For now, this is rewritten when the instruction is lowered
-// to an MCInst.
-def MOV16r0 : I<0x31, MRMInitReg, (outs GR16:$dst), (ins),
- "",
- [(set GR16:$dst, 0)], IIC_ALU_NONMEM>, OpSize,
- Sched<[WriteZero]>;
-
-// FIXME: Set encoding to pseudo.
+ isCodeGenOnly = 1 in
def MOV32r0 : I<0x31, MRMInitReg, (outs GR32:$dst), (ins), "",
[(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
-}
-// We want to rewrite MOV64r0 in terms of MOV32r0, because it's sometimes a
-// smaller encoding, but doing so at isel time interferes with rematerialization
-// in the current register allocator. For now, this is rewritten when the
-// instruction is lowered to an MCInst.
-// FIXME: AddedComplexity gives this a higher priority than MOV64ri32. Remove
-// when we have a better way to specify isel priority.
-let Defs = [EFLAGS], isCodeGenOnly=1,
- AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in
-def MOV64r0 : I<0x31, MRMInitReg, (outs GR64:$dst), (ins), "",
- [(set GR64:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
+// Other widths can also make use of the 32-bit xor, which may have a smaller
+// encoding and avoid partial register updates.
+def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>;
+def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>;
+def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> {
+ let AddedComplexity = 20;
+}
// Materialize i64 constant where top 32-bits are zero. This could theoretically
// use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however
// that would make it more difficult to rematerialize.
let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1,
- isCodeGenOnly = 1 in
-def MOV64ri64i32 : Ii32<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64i32imm:$src),
- "", [(set GR64:$dst, i64immZExt32:$src)],
- IIC_ALU_NONMEM>, Sched<[WriteALU]>;
+ isCodeGenOnly = 1, neverHasSideEffects = 1 in
+def MOV32ri64 : Ii32<0xb8, AddRegFrm, (outs GR32:$dst), (ins i64i32imm:$src),
+ "", [], IIC_ALU_NONMEM>, Sched<[WriteALU]>;
+
+// This 64-bit pseudo-move can be used both for a 64-bit constant that is
+// actually the zero-extension of a 32-bit constant, and for labels in the
+// x86-64 small code model.
+def mov64imm32 : ComplexPattern<i64, 1, "SelectMOV64Imm32", [imm, X86Wrapper]>;
+
+let AddedComplexity = 1 in
+def : Pat<(i64 mov64imm32:$src),
+ (SUBREG_TO_REG (i64 0), (MOV32ri64 mov64imm32:$src), sub_32bit)>;
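+// For example, the 64-bit constant 0x12345678 fits in 32 bits, so it can be
+// materialized with the 5-byte "movl $0x12345678, %eax" rather than the
+// 10-byte movabsq; writing the 32-bit register implicitly zeroes bits 63:32.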
// Use sbb to materialize carry bit.
let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU] in {
@@ -942,20 +932,6 @@ def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
(MOV64ri tblockaddress:$dst)>, Requires<[FarData]>;
-// In static codegen with small code model, we can get the address of a label
-// into a register with 'movl'. FIXME: This is a hack, the 'imm' predicate of
-// the MOV64ri64i32 should accept these.
-def : Pat<(i64 (X86Wrapper tconstpool :$dst)),
- (MOV64ri64i32 tconstpool :$dst)>, Requires<[SmallCode]>;
-def : Pat<(i64 (X86Wrapper tjumptable :$dst)),
- (MOV64ri64i32 tjumptable :$dst)>, Requires<[SmallCode]>;
-def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
- (MOV64ri64i32 tglobaladdr :$dst)>, Requires<[SmallCode]>;
-def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
- (MOV64ri64i32 texternalsym:$dst)>, Requires<[SmallCode]>;
-def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
- (MOV64ri64i32 tblockaddress:$dst)>, Requires<[SmallCode]>;
-
// In kernel code model, we can get the address of a label
// into a register with 'movq'. FIXME: This is a hack, the 'imm' predicate of
// the MOV64ri32 should accept these.
@@ -1119,7 +1095,8 @@ defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>;
def : Pat<(zextloadi8i1 addr:$src), (MOV8rm addr:$src)>;
def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
-def : Pat<(zextloadi64i1 addr:$src), (MOVZX64rm8 addr:$src)>;
+def : Pat<(zextloadi64i1 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
// extload bool -> extload byte
// When extloading from 16-bit and smaller memory locations into 64-bit
@@ -1133,14 +1110,16 @@ def : Pat<(extloadi16i8 addr:$src), (MOVZX16rm8 addr:$src)>;
def : Pat<(extloadi32i8 addr:$src), (MOVZX32rm8 addr:$src)>;
def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>;
-def : Pat<(extloadi64i1 addr:$src), (MOVZX64rm8 addr:$src)>;
-def : Pat<(extloadi64i8 addr:$src), (MOVZX64rm8 addr:$src)>;
-def : Pat<(extloadi64i16 addr:$src), (MOVZX64rm16 addr:$src)>;
// For other extloads, use subregs, since the high contents of the register are
// defined after an extload.
+def : Pat<(extloadi64i1 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
+def : Pat<(extloadi64i8 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
+def : Pat<(extloadi64i16 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>;
def : Pat<(extloadi64i32 addr:$src),
- (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src),
- sub_32bit)>;
+ (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>;
// anyext. Define these to do an explicit zero-extend to
// avoid partial-register updates.
@@ -1152,8 +1131,10 @@ def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>;
def : Pat<(i32 (anyext GR16:$src)),
(INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit)>;
-def : Pat<(i64 (anyext GR8 :$src)), (MOVZX64rr8 GR8 :$src)>;
-def : Pat<(i64 (anyext GR16:$src)), (MOVZX64rr16 GR16 :$src)>;
+def : Pat<(i64 (anyext GR8 :$src)),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8 :$src), sub_32bit)>;
+def : Pat<(i64 (anyext GR16:$src)),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16 :$src), sub_32bit)>;
def : Pat<(i64 (anyext GR32:$src)),
(SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
@@ -1318,13 +1299,19 @@ def : Pat<(and GR16:$src1, 0xff),
// r & (2^32-1) ==> movz
def : Pat<(and GR64:$src, 0x00000000FFFFFFFF),
- (MOVZX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>;
+ (SUBREG_TO_REG (i64 0),
+ (MOV32rr (EXTRACT_SUBREG GR64:$src, sub_32bit)),
+ sub_32bit)>;
// r & (2^16-1) ==> movz
def : Pat<(and GR64:$src, 0xffff),
- (MOVZX64rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit)))>;
+ (SUBREG_TO_REG (i64 0),
+ (MOVZX32rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit))),
+ sub_32bit)>;
// r & (2^8-1) ==> movz
def : Pat<(and GR64:$src, 0xff),
- (MOVZX64rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit)))>;
+ (SUBREG_TO_REG (i64 0),
+ (MOVZX32rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit))),
+ sub_32bit)>;
// r & (2^8-1) ==> movz
def : Pat<(and GR32:$src1, 0xff),
(MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>,
diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td
index 6dc7175..28954c6 100644
--- a/lib/Target/X86/X86InstrExtension.td
+++ b/lib/Target/X86/X86InstrExtension.td
@@ -149,38 +149,24 @@ def MOVZX64rm16_Q : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
"movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
TB, Sched<[WriteALULd]>;
-// FIXME: These should be Pat patterns.
-let isCodeGenOnly = 1 in {
-
-// Use movzbl instead of movzbq when the destination is a register; it's
-// equivalent due to implicit zero-extending, and it has a smaller encoding.
-def MOVZX64rr8 : I<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src),
- "", [(set GR64:$dst, (zext GR8:$src))], IIC_MOVZX>, TB,
- Sched<[WriteALU]>;
-def MOVZX64rm8 : I<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src),
- "", [(set GR64:$dst, (zextloadi64i8 addr:$src))], IIC_MOVZX>,
- TB, Sched<[WriteALULd]>;
-// Use movzwl instead of movzwq when the destination is a register; it's
-// equivalent due to implicit zero-extending, and it has a smaller encoding.
-def MOVZX64rr16: I<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
- "", [(set GR64:$dst, (zext GR16:$src))], IIC_MOVZX>, TB,
- Sched<[WriteALU]>;
-def MOVZX64rm16: I<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
- "", [(set GR64:$dst, (zextloadi64i16 addr:$src))],
- IIC_MOVZX>, TB, Sched<[WriteALULd]>;
-
-// There's no movzlq instruction, but movl can be used for this purpose, using
-// implicit zero-extension. The preferred way to do 32-bit-to-64-bit zero
-// extension on x86-64 is to use a SUBREG_TO_REG to utilize implicit
-// zero-extension, however this isn't possible when the 32-bit value is
-// defined by a truncate or is copied from something where the high bits aren't
-// necessarily all zero. In such cases, we fall back to these explicit zext
-// instructions.
-def MOVZX64rr32 : I<0x89, MRMDestReg, (outs GR64:$dst), (ins GR32:$src),
- "", [(set GR64:$dst, (zext GR32:$src))], IIC_MOVZX>,
- Sched<[WriteALU]>;
-def MOVZX64rm32 : I<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
- "", [(set GR64:$dst, (zextloadi64i32 addr:$src))],
- IIC_MOVZX>, Sched<[WriteALULd]>;
-}
-
+// 64-bit zero-extension patterns use SUBREG_TO_REG and an operation writing a
+// 32-bit register.
+def : Pat<(i64 (zext GR8:$src)),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8:$src), sub_32bit)>;
+def : Pat<(zextloadi64i8 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
+
+def : Pat<(i64 (zext GR16:$src)),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16:$src), sub_32bit)>;
+def : Pat<(zextloadi64i16 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>;
+
+// The preferred way to do 32-bit-to-64-bit zero extension on x86-64 is to use a
+// SUBREG_TO_REG to utilize implicit zero-extension; however, this isn't possible
+// when the 32-bit value is defined by a truncate or is copied from something
+// where the high bits aren't necessarily all zero. In such cases, we fall back
+// to these explicit zext instructions.
+def : Pat<(i64 (zext GR32:$src)),
+ (SUBREG_TO_REG (i64 0), (MOV32rr GR32:$src), sub_32bit)>;
+def : Pat<(i64 (zextloadi64i32 addr:$src)),
+ (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>;
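+// For example, a 32-bit value produced by a truncate may sit in a register
+// whose bits 63:32 still hold stale data, so the (zext GR32) pattern above
+// emits a real "movl" (MOV32rr); the 32-bit write clears the upper half before
+// SUBREG_TO_REG reinterprets the result as i64.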
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index a71e024..1432414 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -292,13 +292,16 @@ class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm,
}
def __xs : XS;
+def __xd : XD;
// SI - SSE 1 & 2 scalar instructions
class SI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin> {
let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
- !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2]));
+ !if(!eq(Prefix, __xs.Prefix), [UseSSE1],
+ !if(!eq(Prefix, __xd.Prefix), [UseSSE2],
+ !if(hasOpSizePrefix, [UseSSE2], [UseSSE1]))));
// AVX instructions have a 'v' prefix in the mnemonic
let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
@@ -350,7 +353,7 @@ class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
// PSI - SSE1 instructions with TB prefix.
// PSIi8 - SSE1 instructions with ImmT == Imm8 and TB prefix.
// VSSI - SSE1 instructions with XS prefix in AVX form.
-// VPSI - SSE1 instructions with TB prefix in AVX form.
+// VPSI - SSE1 instructions with TB prefix in AVX form, packed single.
class SSI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
@@ -381,10 +384,13 @@ class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm,
// SDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix.
// S2SI - SSE2 instructions with XS prefix.
// SSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix.
-// PDI - SSE2 instructions with TB and OpSize prefixes.
+// PDI - SSE2 instructions with TB and OpSize prefixes, packed double domain.
// PDIi8 - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes.
-// VSDI - SSE2 instructions with XD prefix in AVX form.
-// VPDI - SSE2 instructions with TB and OpSize prefixes in AVX form.
+// VSDI - SSE2 scalar instructions with XD prefix in AVX form.
+// VPDI - SSE2 vector instructions with TB and OpSize prefixes in AVX form,
+// packed double domain.
+// VS2I - SSE2 scalar instructions with TB and OpSize prefixes in AVX form.
+// S2I - SSE2 scalar instructions with TB and OpSize prefixes.
// MMXSDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix as well as
// MMX operands.
// MMXSSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix as well as
@@ -422,6 +428,14 @@ class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedDouble>, TB,
OpSize, Requires<[HasAVX]>;
+class VS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, TB,
+ OpSize, Requires<[HasAVX]>;
+class S2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, TB,
+ OpSize, Requires<[UseSSE2]>;
class MMXSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>;
@@ -626,6 +640,12 @@ class RPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
class VRPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: VPDI<o, F, outs, ins, asm, pattern, itin>, VEX_W;
+class RS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : S2I<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class VRS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : VS2I<o, F, outs, ins, asm, pattern, itin>, VEX_W;
// MMX Instruction templates
//
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 7c0423f..df7b721 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -97,7 +97,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
(tm.getSubtarget<X86Subtarget>().is64Bit()
? X86::ADJCALLSTACKUP64
: X86::ADJCALLSTACKUP32)),
- TM(tm), RI(tm, *this) {
+ TM(tm), RI(tm) {
static const X86OpTblEntry OpTbl2Addr[] = {
{ X86::ADC32ri, X86::ADC32mi, 0 },
@@ -451,9 +451,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::MOVZX32rr16, X86::MOVZX32rm16, 0 },
{ X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 },
{ X86::MOVZX32rr8, X86::MOVZX32rm8, 0 },
- { X86::MOVZX64rr16, X86::MOVZX64rm16, 0 },
- { X86::MOVZX64rr32, X86::MOVZX64rm32, 0 },
- { X86::MOVZX64rr8, X86::MOVZX64rm8, 0 },
{ X86::PABSBrr128, X86::PABSBrm128, TB_ALIGN_16 },
{ X86::PABSDrr128, X86::PABSDrm128, TB_ALIGN_16 },
{ X86::PABSWrr128, X86::PABSWrm128, TB_ALIGN_16 },
@@ -1381,7 +1378,6 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
case X86::MOVSX32rr8:
case X86::MOVZX32rr8:
case X86::MOVSX64rr8:
- case X86::MOVZX64rr8:
if (!TM.getSubtarget<X86Subtarget>().is64Bit())
// It's not always legal to reference the low 8-bit of the larger
// register in 32-bit mode.
@@ -1389,9 +1385,7 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
case X86::MOVSX32rr16:
case X86::MOVZX32rr16:
case X86::MOVSX64rr16:
- case X86::MOVZX64rr16:
- case X86::MOVSX64rr32:
- case X86::MOVZX64rr32: {
+ case X86::MOVSX64rr32: {
if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
// Be conservative.
return false;
@@ -1404,17 +1398,14 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
case X86::MOVSX32rr8:
case X86::MOVZX32rr8:
case X86::MOVSX64rr8:
- case X86::MOVZX64rr8:
SubIdx = X86::sub_8bit;
break;
case X86::MOVSX32rr16:
case X86::MOVZX32rr16:
case X86::MOVSX64rr16:
- case X86::MOVZX64rr16:
SubIdx = X86::sub_16bit;
break;
case X86::MOVSX64rr32:
- case X86::MOVZX64rr32:
SubIdx = X86::sub_32bit;
break;
}
@@ -1722,37 +1713,16 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
unsigned DestReg, unsigned SubIdx,
const MachineInstr *Orig,
const TargetRegisterInfo &TRI) const {
- DebugLoc DL = Orig->getDebugLoc();
-
- // MOV32r0 etc. are implemented with xor which clobbers condition code.
- // Re-materialize them as movri instructions to avoid side effects.
- bool Clone = true;
+ // MOV32r0 is implemented with an xor which clobbers the condition codes.
+ // Re-materialize it as a movri instruction to avoid side effects.
unsigned Opc = Orig->getOpcode();
- switch (Opc) {
- default: break;
- case X86::MOV8r0:
- case X86::MOV16r0:
- case X86::MOV32r0:
- case X86::MOV64r0: {
- if (!isSafeToClobberEFLAGS(MBB, I)) {
- switch (Opc) {
- default: llvm_unreachable("Unreachable!");
- case X86::MOV8r0: Opc = X86::MOV8ri; break;
- case X86::MOV16r0: Opc = X86::MOV16ri; break;
- case X86::MOV32r0: Opc = X86::MOV32ri; break;
- case X86::MOV64r0: Opc = X86::MOV64ri64i32; break;
- }
- Clone = false;
- }
- break;
- }
- }
-
- if (Clone) {
+ if (Opc == X86::MOV32r0 && !isSafeToClobberEFLAGS(MBB, I)) {
+ DebugLoc DL = Orig->getDebugLoc();
+ BuildMI(MBB, I, DL, get(X86::MOV32ri)).addOperand(Orig->getOperand(0))
+ .addImm(0);
+ } else {
MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
MBB.insert(I, MI);
- } else {
- BuildMI(MBB, I, DL, get(Opc)).addOperand(Orig->getOperand(0)).addImm(0);
}
MachineInstr *NewMI = prior(I);
@@ -1772,6 +1742,98 @@ static bool hasLiveCondCodeDef(MachineInstr *MI) {
return false;
}
+/// getTruncatedShiftCount - return the shift count for a machine instruction,
+/// truncated the same way the hardware truncates it.
+inline static unsigned getTruncatedShiftCount(MachineInstr *MI,
+ unsigned ShiftAmtOperandIdx) {
+ // The shift count is six bits with the REX.W prefix and five bits without.
+ unsigned ShiftCountMask = (MI->getDesc().TSFlags & X86II::REX_W) ? 63 : 31;
+ unsigned Imm = MI->getOperand(ShiftAmtOperandIdx).getImm();
+ return Imm & ShiftCountMask;
+}
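+// For example, the hardware masks a 64-bit shift count to its low six bits, so
+// a count of 65 behaves like a count of 1; 8/16/32-bit shifts mask the count
+// to five bits.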
+
+/// isTruncatedShiftCountForLEA - check whether the given truncated shift count
+/// can be represented by a LEA instruction.
+inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
+ // Left shift instructions can be transformed into load-effective-address
+ // instructions if we can encode them appropriately.
+ // A LEA instruction utilizes a SIB byte to encode its scale factor.
+ // The SIB.scale field is two bits wide, which means that we can encode any
+ // shift amount less than 4.
+ return ShAmt < 4 && ShAmt > 0;
+}
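+// Concretely, the two-bit SIB.scale encodes the factors 1, 2, 4 and 8, so a
+// shift such as "shlq $3, %rax" can become "leaq (,%rax,8), %rcx", while a
+// shift by four or more has no LEA equivalent.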
+
+bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src,
+ unsigned Opc, bool AllowSP,
+ unsigned &NewSrc, bool &isKill, bool &isUndef,
+ MachineOperand &ImplicitOp) const {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ const TargetRegisterClass *RC;
+ if (AllowSP) {
+ RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
+ } else {
+ RC = Opc != X86::LEA32r ?
+ &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
+ }
+ unsigned SrcReg = Src.getReg();
+
+ // For both LEA64 and LEA32 the register already has essentially the right
+ // type (32-bit or 64-bit); we may just need to forbid SP.
+ if (Opc != X86::LEA64_32r) {
+ NewSrc = SrcReg;
+ isKill = Src.isKill();
+ isUndef = Src.isUndef();
+
+ if (TargetRegisterInfo::isVirtualRegister(NewSrc) &&
+ !MF.getRegInfo().constrainRegClass(NewSrc, RC))
+ return false;
+
+ return true;
+ }
+
+ // This is for an LEA64_32r and incoming registers are 32-bit. One way or
+ // another we need to add 64-bit registers to the final MI.
+ if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
+ ImplicitOp = Src;
+ ImplicitOp.setImplicit();
+
+ NewSrc = getX86SubSuperRegister(Src.getReg(), MVT::i64);
+ MachineBasicBlock::LivenessQueryResult LQR =
+ MI->getParent()->computeRegisterLiveness(&getRegisterInfo(), NewSrc, MI);
+
+ switch (LQR) {
+ case MachineBasicBlock::LQR_Unknown:
+ // We can't give sane liveness flags to the instruction, abandon LEA
+ // formation.
+ return false;
+ case MachineBasicBlock::LQR_Live:
+ isKill = MI->killsRegister(SrcReg);
+ isUndef = false;
+ break;
+ default:
+ // The physreg itself is dead, so we have to use it as an <undef>.
+ isKill = false;
+ isUndef = true;
+ break;
+ }
+ } else {
+ // The source is a virtual register of the wrong class; we have to create a
+ // temporary 64-bit vreg to feed into the LEA.
+ NewSrc = MF.getRegInfo().createVirtualRegister(RC);
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ get(TargetOpcode::COPY))
+ .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
+ .addOperand(Src);
+
+ // The new vreg is obviously going to be dead after we're done with it.
+ isKill = true;
+ isUndef = false;
+ }
+
+ // We've set all the parameters without issue.
+ return true;
+}
+
/// convertToThreeAddressWithLEA - Helper for convertToThreeAddress when
/// 16-bit LEA is disabled, use 32-bit LEA to form 3-address code by promoting
/// to a 32-bit superregister and then truncating back down to a 16-bit
@@ -1787,11 +1849,16 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
bool isDead = MI->getOperand(0).isDead();
bool isKill = MI->getOperand(1).isKill();
- unsigned Opc = TM.getSubtarget<X86Subtarget>().is64Bit()
- ? X86::LEA64_32r : X86::LEA32r;
MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
- unsigned leaInReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+ unsigned Opc, leaInReg;
+ if (TM.getSubtarget<X86Subtarget>().is64Bit()) {
+ Opc = X86::LEA64_32r;
+ leaInReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
+ } else {
+ Opc = X86::LEA32r;
+ leaInReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+ }
// Build and insert into an implicit UNDEF value. This is OK because
// we'll be shifting and then extracting the lower 16-bits.
@@ -1841,7 +1908,10 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
// just a single insert_subreg.
addRegReg(MIB, leaInReg, true, leaInReg, false);
} else {
- leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+ if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ leaInReg2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
+ else
+ leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
// Build and insert into an implicit UNDEF value. This is OK because
// we'll be shifting and then extracting the lower 16-bits.
BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF),leaInReg2);
@@ -1891,6 +1961,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
MachineBasicBlock::iterator &MBBI,
LiveVariables *LV) const {
MachineInstr *MI = MBBI;
+
+ // The following opcodes also set the condition code register(s). Only
+ // convert them to an equivalent LEA if the condition code register defs
+ // are dead!
+ if (hasLiveCondCodeDef(MI))
+ return 0;
+
MachineFunction &MF = *MI->getParent()->getParent();
// All instructions input are two-addr instructions. Get the known operands.
const MachineOperand &Dest = MI->getOperand(0);
@@ -1935,10 +2012,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
}
case X86::SHL64ri: {
assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
- // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
- // the flags produced by a shift yet, so this is safe.
- unsigned ShAmt = MI->getOperand(2).getImm();
- if (ShAmt == 0 || ShAmt >= 4) return 0;
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (!isTruncatedShiftCountForLEA(ShAmt)) return 0;
// LEA can't handle RSP.
if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
@@ -1953,29 +2028,34 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
}
case X86::SHL32ri: {
assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
- // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
- // the flags produced by a shift yet, so this is safe.
- unsigned ShAmt = MI->getOperand(2).getImm();
- if (ShAmt == 0 || ShAmt >= 4) return 0;
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (!isTruncatedShiftCountForLEA(ShAmt)) return 0;
+
+ unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
// LEA can't handle ESP.
- if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
- !MF.getRegInfo().constrainRegClass(Src.getReg(),
- &X86::GR32_NOSPRegClass))
+ bool isKill, isUndef;
+ unsigned SrcReg;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
+ SrcReg, isKill, isUndef, ImplicitOp))
return 0;
- unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
- NewMI = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
.addOperand(Dest)
- .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
+ .addReg(0).addImm(1 << ShAmt)
+ .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
+ .addImm(0).addReg(0);
+ if (ImplicitOp.getReg() != 0)
+ MIB.addOperand(ImplicitOp);
+ NewMI = MIB;
+
break;
}
case X86::SHL16ri: {
assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
- // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
- // the flags produced by a shift yet, so this is safe.
- unsigned ShAmt = MI->getOperand(2).getImm();
- if (ShAmt == 0 || ShAmt >= 4) return 0;
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (!isTruncatedShiftCountForLEA(ShAmt)) return 0;
if (DisableLEA16)
return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
@@ -1985,11 +2065,6 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
break;
}
default: {
- // The following opcodes also sets the condition code register(s). Only
- // convert them to equivalent lea if the condition code register def's
- // are dead!
- if (hasLiveCondCodeDef(MI))
- return 0;
switch (MIOpc) {
default: return 0;
@@ -1999,17 +2074,20 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r
: (is64Bit ? X86::LEA64_32r : X86::LEA32r);
- const TargetRegisterClass *RC = MIOpc == X86::INC64r ?
- (const TargetRegisterClass*)&X86::GR64_NOSPRegClass :
- (const TargetRegisterClass*)&X86::GR32_NOSPRegClass;
-
- // LEA can't handle RSP.
- if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
- !MF.getRegInfo().constrainRegClass(Src.getReg(), RC))
+ bool isKill, isUndef;
+ unsigned SrcReg;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
+ SrcReg, isKill, isUndef, ImplicitOp))
return 0;
- NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
- .addOperand(Dest).addOperand(Src), 1);
+ MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addOperand(Dest)
+ .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef));
+ if (ImplicitOp.getReg() != 0)
+ MIB.addOperand(ImplicitOp);
+
+ NewMI = addOffset(MIB, 1);
break;
}
case X86::INC16r:
@@ -2026,16 +2104,22 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
: (is64Bit ? X86::LEA64_32r : X86::LEA32r);
- const TargetRegisterClass *RC = MIOpc == X86::DEC64r ?
- (const TargetRegisterClass*)&X86::GR64_NOSPRegClass :
- (const TargetRegisterClass*)&X86::GR32_NOSPRegClass;
- // LEA can't handle RSP.
- if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
- !MF.getRegInfo().constrainRegClass(Src.getReg(), RC))
+
+ bool isKill, isUndef;
+ unsigned SrcReg;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
+ SrcReg, isKill, isUndef, ImplicitOp))
return 0;
- NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
- .addOperand(Dest).addOperand(Src), -1);
+ MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addOperand(Dest)
+ .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill));
+ if (ImplicitOp.getReg() != 0)
+ MIB.addOperand(ImplicitOp);
+
+ NewMI = addOffset(MIB, -1);
+
break;
}
case X86::DEC16r:
@@ -2052,36 +2136,41 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
case X86::ADD32rr_DB: {
assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
unsigned Opc;
- const TargetRegisterClass *RC;
- if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB) {
+ if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB)
Opc = X86::LEA64r;
- RC = &X86::GR64_NOSPRegClass;
- } else {
+ else
Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
- RC = &X86::GR32_NOSPRegClass;
- }
+ bool isKill, isUndef;
+ unsigned SrcReg;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
+ SrcReg, isKill, isUndef, ImplicitOp))
+ return 0;
- unsigned Src2 = MI->getOperand(2).getReg();
- bool isKill2 = MI->getOperand(2).isKill();
-
- // LEA can't handle RSP.
- if (TargetRegisterInfo::isVirtualRegister(Src2) &&
- !MF.getRegInfo().constrainRegClass(Src2, RC))
+ const MachineOperand &Src2 = MI->getOperand(2);
+ bool isKill2, isUndef2;
+ unsigned SrcReg2;
+ MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false,
+ SrcReg2, isKill2, isUndef2, ImplicitOp2))
return 0;
- NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(Opc))
- .addOperand(Dest),
- Src.getReg(), Src.isKill(), Src2, isKill2);
+ MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addOperand(Dest);
+ if (ImplicitOp.getReg() != 0)
+ MIB.addOperand(ImplicitOp);
+ if (ImplicitOp2.getReg() != 0)
+ MIB.addOperand(ImplicitOp2);
+
+ NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2);
// Preserve undefness of the operands.
- bool isUndef = MI->getOperand(1).isUndef();
- bool isUndef2 = MI->getOperand(2).isUndef();
NewMI->getOperand(1).setIsUndef(isUndef);
NewMI->getOperand(3).setIsUndef(isUndef2);
- if (LV && isKill2)
- LV->replaceKillInstruction(Src2, MI, NewMI);
+ if (LV && Src2.isKill())
+ LV->replaceKillInstruction(SrcReg2, MI, NewMI);
break;
}
case X86::ADD16rr:
@@ -2120,9 +2209,21 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
case X86::ADD32ri8_DB: {
assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
- NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
- .addOperand(Dest).addOperand(Src),
- MI->getOperand(2).getImm());
+
+ bool isKill, isUndef;
+ unsigned SrcReg;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
+ SrcReg, isKill, isUndef, ImplicitOp))
+ return 0;
+
+ MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addOperand(Dest)
+ .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill));
+ if (ImplicitOp.getReg() != 0)
+ MIB.addOperand(ImplicitOp);
+
+ NewMI = addOffset(MIB, MI->getOperand(2).getImm());
break;
}
case X86::ADD16ri:
@@ -3171,6 +3272,25 @@ inline static bool isRedundantFlagInstr(MachineInstr *FlagI, unsigned SrcReg,
inline static bool isDefConvertible(MachineInstr *MI) {
switch (MI->getOpcode()) {
default: return false;
+
+ // The shift instructions only modify ZF if their shift count is non-zero.
+ // N.B.: The processor truncates the shift count depending on the encoding.
+ case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri:case X86::SAR64ri:
+ case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri:case X86::SHR64ri:
+ return getTruncatedShiftCount(MI, 2) != 0;
+
+ // Some left shift instructions can be turned into LEA instructions but only
+ // if their flags aren't used. Avoid transforming such instructions.
+ case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri:case X86::SHL64ri:{
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (isTruncatedShiftCountForLEA(ShAmt)) return false;
+ return ShAmt != 0;
+ }
+
+ case X86::SHRD16rri8:case X86::SHRD32rri8:case X86::SHRD64rri8:
+ case X86::SHLD16rri8:case X86::SHLD32rri8:case X86::SHLD64rri8:
+ return getTruncatedShiftCount(MI, 3) != 0;
+
case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri:
case X86::SUB32ri8: case X86::SUB16ri: case X86::SUB16ri8:
case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr:
@@ -3200,8 +3320,37 @@ inline static bool isDefConvertible(MachineInstr *MI) {
case X86::OR8ri: case X86::OR64rr: case X86::OR32rr:
case X86::OR16rr: case X86::OR8rr: case X86::OR64rm:
case X86::OR32rm: case X86::OR16rm: case X86::OR8rm:
+ case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r:
+ case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1:case X86::SAR64r1:
+ case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1:case X86::SHR64r1:
+ case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1:case X86::SHL64r1:
+ case X86::ADC32ri: case X86::ADC32ri8:
+ case X86::ADC32rr: case X86::ADC64ri32:
+ case X86::ADC64ri8: case X86::ADC64rr:
+ case X86::SBB32ri: case X86::SBB32ri8:
+ case X86::SBB32rr: case X86::SBB64ri32:
+ case X86::SBB64ri8: case X86::SBB64rr:
case X86::ANDN32rr: case X86::ANDN32rm:
case X86::ANDN64rr: case X86::ANDN64rm:
+ case X86::BEXTR32rr: case X86::BEXTR64rr:
+ case X86::BEXTR32rm: case X86::BEXTR64rm:
+ case X86::BLSI32rr: case X86::BLSI32rm:
+ case X86::BLSI64rr: case X86::BLSI64rm:
+ case X86::BLSMSK32rr:case X86::BLSMSK32rm:
+ case X86::BLSMSK64rr:case X86::BLSMSK64rm:
+ case X86::BLSR32rr: case X86::BLSR32rm:
+ case X86::BLSR64rr: case X86::BLSR64rm:
+ case X86::BZHI32rr: case X86::BZHI32rm:
+ case X86::BZHI64rr: case X86::BZHI64rm:
+ case X86::LZCNT16rr: case X86::LZCNT16rm:
+ case X86::LZCNT32rr: case X86::LZCNT32rm:
+ case X86::LZCNT64rr: case X86::LZCNT64rm:
+ case X86::POPCNT16rr:case X86::POPCNT16rm:
+ case X86::POPCNT32rr:case X86::POPCNT32rm:
+ case X86::POPCNT64rr:case X86::POPCNT64rm:
+ case X86::TZCNT16rr: case X86::TZCNT16rm:
+ case X86::TZCNT32rr: case X86::TZCNT32rm:
+ case X86::TZCNT64rr: case X86::TZCNT64rm:
return true;
}
}
@@ -3308,10 +3457,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
// MOV32r0 etc. are implemented with xor which clobbers condition code.
// They are safe to move up, if the definition to EFLAGS is dead and
// earlier instructions do not read or write EFLAGS.
- if (!Movr0Inst && (Instr->getOpcode() == X86::MOV8r0 ||
- Instr->getOpcode() == X86::MOV16r0 ||
- Instr->getOpcode() == X86::MOV32r0 ||
- Instr->getOpcode() == X86::MOV64r0) &&
+ if (!Movr0Inst && Instr->getOpcode() == X86::MOV32r0 &&
Instr->registerDefIsDead(X86::EFLAGS, TRI)) {
Movr0Inst = Instr;
continue;
@@ -3420,20 +3566,38 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
// The instruction to be updated is either Sub or MI.
Sub = IsCmpZero ? MI : Sub;
- // Move Movr0Inst to the place right before Sub.
+ // Move Movr0Inst to the appropriate place before Sub.
if (Movr0Inst) {
- Sub->getParent()->remove(Movr0Inst);
- Sub->getParent()->insert(MachineBasicBlock::iterator(Sub), Movr0Inst);
+ // Look backwards until we find a def that doesn't use the current EFLAGS.
+ Def = Sub;
+ MachineBasicBlock::reverse_iterator
+ InsertI = MachineBasicBlock::reverse_iterator(++Def),
+ InsertE = Sub->getParent()->rend();
+ for (; InsertI != InsertE; ++InsertI) {
+ MachineInstr *Instr = &*InsertI;
+ if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
+ Instr->modifiesRegister(X86::EFLAGS, TRI)) {
+ Sub->getParent()->remove(Movr0Inst);
+ Instr->getParent()->insert(MachineBasicBlock::iterator(Instr),
+ Movr0Inst);
+ break;
+ }
+ }
+ if (InsertI == InsertE)
+ return false;
}
// Make sure Sub instruction defines EFLAGS and mark the def live.
- unsigned LastOperand = Sub->getNumOperands() - 1;
- assert(Sub->getNumOperands() >= 2 &&
- Sub->getOperand(LastOperand).isReg() &&
- Sub->getOperand(LastOperand).getReg() == X86::EFLAGS &&
- "EFLAGS should be the last operand of SUB, ADD, OR, XOR, AND");
- Sub->getOperand(LastOperand).setIsDef(true);
- Sub->getOperand(LastOperand).setIsDead(false);
+ unsigned i = 0, e = Sub->getNumOperands();
+ for (; i != e; ++i) {
+ MachineOperand &MO = Sub->getOperand(i);
+ if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) {
+ MO.setIsDead(false);
+ break;
+ }
+ }
+ assert(i != e && "Unable to locate a def EFLAGS operand");
+
CmpInstr->eraseFromParent();
// Modify the condition code of instructions in OpsToUpdate.
@@ -3686,18 +3850,11 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
OpcodeTablePtr = &RegOp2MemOpTable2Addr;
isTwoAddrFold = true;
} else if (i == 0) { // If operand 0
- unsigned Opc = 0;
- switch (MI->getOpcode()) {
- default: break;
- case X86::MOV64r0: Opc = X86::MOV64mi32; break;
- case X86::MOV32r0: Opc = X86::MOV32mi; break;
- case X86::MOV16r0: Opc = X86::MOV16mi; break;
- case X86::MOV8r0: Opc = X86::MOV8mi; break;
+ if (MI->getOpcode() == X86::MOV32r0) {
+ NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, MI);
+ if (NewMI)
+ return NewMI;
}
- if (Opc)
- NewMI = MakeM0Inst(*this, Opc, MOs, MI);
- if (NewMI)
- return NewMI;
OpcodeTablePtr = &RegOp2MemOpTable0;
} else if (i == 1) {
@@ -4083,13 +4240,9 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
if (isTwoAddr && NumOps >= 2 && OpNum < 2) {
OpcodeTablePtr = &RegOp2MemOpTable2Addr;
} else if (OpNum == 0) { // If operand 0
- switch (Opc) {
- case X86::MOV8r0:
- case X86::MOV16r0:
- case X86::MOV32r0:
- case X86::MOV64r0: return true;
- default: break;
- }
+ if (Opc == X86::MOV32r0)
+ return true;
+
OpcodeTablePtr = &RegOp2MemOpTable0;
} else if (OpNum == 1) {
OpcodeTablePtr = &RegOp2MemOpTable1;
@@ -4250,7 +4403,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
std::vector<SDValue> AddrOps;
std::vector<SDValue> BeforeOps;
std::vector<SDValue> AfterOps;
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
unsigned NumOps = N->getNumOperands();
for (unsigned i = 0; i != NumOps-1; ++i) {
SDValue Op = N->getOperand(i);
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 260f054..332874f 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -192,6 +192,19 @@ public:
const MachineInstr *Orig,
const TargetRegisterInfo &TRI) const;
+ /// Given an operand within a MachineInstr, insert preceding code to put it
+ /// into the right format for a particular kind of LEA instruction. This may
+ /// involve using an appropriate super-register instead (with an implicit use
+ /// of the original) or creating a new virtual register and inserting COPY
+ /// instructions to get the data into the right class.
+ ///
+ /// Reference parameters are set to indicate how the caller should add this
+ /// operand to the LEA instruction.
+ bool classifyLEAReg(MachineInstr *MI, const MachineOperand &Src,
+ unsigned LEAOpcode, bool AllowSP,
+ unsigned &NewSrc, bool &isKill,
+ bool &isUndef, MachineOperand &ImplicitOp) const;
+
/// convertToThreeAddress - This method must be implemented by targets that
/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
/// may be able to convert a two-address instruction into a true
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 3380d8c..817bd6c 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -523,8 +523,7 @@ def i64i8imm : Operand<i64> {
def lea64_32mem : Operand<i32> {
let PrintMethod = "printi32mem";
- let AsmOperandLowerMethod = "lower_lea64_32mem";
- let MIOperandInfo = (ops GR32, i8imm, GR32_NOSP, i32imm, i8imm);
+ let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, i8imm);
let ParserMatchClass = X86MemAsmOperand;
}
@@ -546,7 +545,7 @@ def lea32addr : ComplexPattern<i32, 5, "SelectLEAAddr",
[add, sub, mul, X86mul_imm, shl, or, frameindex],
[]>;
// In 64-bit mode 32-bit LEAs can use RIP-relative addressing.
-def lea64_32addr : ComplexPattern<i32, 5, "SelectLEAAddr",
+def lea64_32addr : ComplexPattern<i32, 5, "SelectLEA64_32Addr",
[add, sub, mul, X86mul_imm, shl, or,
frameindex, X86WrapperRIP],
[]>;
@@ -884,12 +883,12 @@ def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", [], IIC_PUSH_F>,
let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP],
mayLoad = 1, neverHasSideEffects = 1, SchedRW = [WriteLoad] in {
-def POPA32 : I<0x61, RawFrm, (outs), (ins), "popa{l|d}", [], IIC_POP_A>,
+def POPA32 : I<0x61, RawFrm, (outs), (ins), "popa{l}", [], IIC_POP_A>,
Requires<[In32BitMode]>;
}
let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP],
mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in {
-def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pusha{l|d}", [], IIC_PUSH_A>,
+def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pusha{l}", [], IIC_PUSH_A>,
Requires<[In32BitMode]>;
}
@@ -1867,6 +1866,9 @@ def : MnemonicAlias<"pushf", "pushfl", "att">, Requires<[In32BitMode]>;
def : MnemonicAlias<"pushf", "pushfq", "att">, Requires<[In64BitMode]>;
def : MnemonicAlias<"pushfd", "pushfl", "att">;
+def : MnemonicAlias<"popad", "popa", "intel">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pushad", "pusha", "intel">, Requires<[In32BitMode]>;
+
def : MnemonicAlias<"repe", "rep", "att">;
def : MnemonicAlias<"repz", "rep", "att">;
def : MnemonicAlias<"repnz", "repne", "att">;
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index 49721df..07314a0 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -357,21 +357,21 @@ defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw,
defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b,
MMX_INTALU_ITINS>;
defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w,
- MMX_INTALU_ITINS, 1>;
+ MMX_INTALU_ITINS>;
defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d,
- MMX_INTALU_ITINS, 1>;
+ MMX_INTALU_ITINS>;
defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q,
- MMX_INTALUQ_ITINS, 1>;
+ MMX_INTALUQ_ITINS>;
defm MMX_PSUBSB : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b,
- MMX_INTALU_ITINS, 1>;
+ MMX_INTALU_ITINS>;
defm MMX_PSUBSW : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w,
- MMX_INTALU_ITINS, 1>;
+ MMX_INTALU_ITINS>;
defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b,
- MMX_INTALU_ITINS, 1>;
+ MMX_INTALU_ITINS>;
defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w,
- MMX_INTALU_ITINS, 1>;
+ MMX_INTALU_ITINS>;
defm MMX_PHSUBW : SS3I_binop_rm_int_mm<0x05, "phsubw", int_x86_ssse3_phsub_w,
MMX_PHADDSUBW>;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index cce938b..79b1ca3 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -2342,65 +2342,62 @@ let Constraints = "$src1 = $dst" in {
// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
ValueType vt, X86MemOperand x86memop,
- PatFrag ld_frag, string OpcodeStr, Domain d> {
- def rr: PI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
+ PatFrag ld_frag, string OpcodeStr> {
+ def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
- IIC_SSE_COMIS_RR, d>,
+ IIC_SSE_COMIS_RR>,
Sched<[WriteFAdd]>;
- def rm: PI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
+ def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1),
(ld_frag addr:$src2)))],
- IIC_SSE_COMIS_RM, d>,
+ IIC_SSE_COMIS_RM>,
Sched<[WriteFAddLd, ReadAfterLd]>;
}
let Defs = [EFLAGS] in {
defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
- "ucomiss", SSEPackedSingle>, TB, VEX, VEX_LIG;
+ "ucomiss">, TB, VEX, VEX_LIG;
defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
- "ucomisd", SSEPackedDouble>, TB, OpSize, VEX,
- VEX_LIG;
+ "ucomisd">, TB, OpSize, VEX, VEX_LIG;
let Pattern = []<dag> in {
defm VCOMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
- "comiss", SSEPackedSingle>, TB, VEX,
- VEX_LIG;
+ "comiss">, TB, VEX, VEX_LIG;
defm VCOMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
- "comisd", SSEPackedDouble>, TB, OpSize, VEX,
- VEX_LIG;
+ "comisd">, TB, OpSize, VEX, VEX_LIG;
}
defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
- load, "ucomiss", SSEPackedSingle>, TB, VEX;
+ load, "ucomiss">, TB, VEX;
defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
- load, "ucomisd", SSEPackedDouble>, TB, OpSize, VEX;
+ load, "ucomisd">, TB, OpSize, VEX;
defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem,
- load, "comiss", SSEPackedSingle>, TB, VEX;
+ load, "comiss">, TB, VEX;
defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem,
- load, "comisd", SSEPackedDouble>, TB, OpSize, VEX;
+ load, "comisd">, TB, OpSize, VEX;
defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
- "ucomiss", SSEPackedSingle>, TB;
+ "ucomiss">, TB;
defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
- "ucomisd", SSEPackedDouble>, TB, OpSize;
+ "ucomisd">, TB, OpSize;
let Pattern = []<dag> in {
defm COMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
- "comiss", SSEPackedSingle>, TB;
+ "comiss">, TB;
defm COMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
- "comisd", SSEPackedDouble>, TB, OpSize;
+ "comisd">, TB, OpSize;
}
defm Int_UCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
- load, "ucomiss", SSEPackedSingle>, TB;
+ load, "ucomiss">, TB;
defm Int_UCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
- load, "ucomisd", SSEPackedDouble>, TB, OpSize;
+ load, "ucomisd">, TB, OpSize;
defm Int_COMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load,
- "comiss", SSEPackedSingle>, TB;
+ "comiss">, TB;
defm Int_COMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load,
- "comisd", SSEPackedDouble>, TB, OpSize;
+ "comisd">, TB, OpSize;
} // Defs = [EFLAGS]
// sse12_cmp_packed - sse 1 & 2 compare packed instructions
@@ -3049,12 +3046,20 @@ let isCodeGenOnly = 1 in {
/// And, we have a special variant form for a full-vector intrinsic form.
let Sched = WriteFSqrt in {
-def SSE_SQRTP : OpndItins<
- IIC_SSE_SQRTP_RR, IIC_SSE_SQRTP_RM
+def SSE_SQRTPS : OpndItins<
+ IIC_SSE_SQRTPS_RR, IIC_SSE_SQRTPS_RM
>;
-def SSE_SQRTS : OpndItins<
- IIC_SSE_SQRTS_RR, IIC_SSE_SQRTS_RM
+def SSE_SQRTSS : OpndItins<
+ IIC_SSE_SQRTSS_RR, IIC_SSE_SQRTSS_RM
+>;
+
+def SSE_SQRTPD : OpndItins<
+ IIC_SSE_SQRTPD_RR, IIC_SSE_SQRTPD_RM
+>;
+
+def SSE_SQRTSD : OpndItins<
+ IIC_SSE_SQRTSD_RR, IIC_SSE_SQRTSD_RM
>;
}
@@ -3319,18 +3324,18 @@ let Predicates = [HasAVX] in {
// Square root.
defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss,
- SSE_SQRTS>,
- sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTP>,
+ SSE_SQRTSS>,
+ sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS>,
sse2_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd,
- SSE_SQRTS>,
- sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTP>;
+ SSE_SQRTSD>,
+ sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>;
// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
-defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTS>,
- sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTP>,
+defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTSS>,
+ sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTPS>,
sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps,
- int_x86_avx_rsqrt_ps_256, SSE_SQRTP>;
+ int_x86_avx_rsqrt_ps_256, SSE_SQRTPS>;
defm RCP : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>,
sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>,
sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps,
@@ -4369,43 +4374,43 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
//===---------------------------------------------------------------------===//
// Move Int Doubleword to Packed Double Int
//
-def VMOVDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
VEX, Sched<[WriteMove]>;
-def VMOVDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (scalar_to_vector (loadi32 addr:$src))))],
IIC_SSE_MOVDQ>,
VEX, Sched<[WriteLoad]>;
-def VMOV64toPQIrr : VRPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v2i64 (scalar_to_vector GR64:$src)))],
IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
-def VMOV64toSDrr : VRPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (bitconvert GR64:$src))],
IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
-def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
Sched<[WriteMove]>;
-def MOVDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (scalar_to_vector (loadi32 addr:$src))))],
IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
-def MOV64toPQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v2i64 (scalar_to_vector GR64:$src)))],
IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
-def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (bitconvert GR64:$src))],
IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
@@ -4413,22 +4418,22 @@ def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
//===---------------------------------------------------------------------===//
// Move Int Doubleword to Single Scalar
//
-def VMOVDI2SSrr : VPDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (bitconvert GR32:$src))],
IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
-def VMOVDI2SSrm : VPDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+def VMOVDI2SSrm : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
IIC_SSE_MOVDQ>,
VEX, Sched<[WriteLoad]>;
-def MOVDI2SSrr : PDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (bitconvert GR32:$src))],
IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
-def MOVDI2SSrm : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+def MOVDI2SSrm : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
@@ -4436,23 +4441,23 @@ def MOVDI2SSrm : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int to Packed Double Int
//
-def VMOVPDI2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
+def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
(iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX,
Sched<[WriteMove]>;
-def VMOVPDI2DImr : VPDI<0x7E, MRMDestMem, (outs),
+def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs),
(ins i32mem:$dst, VR128:$src),
"movd\t{$src, $dst|$dst, $src}",
[(store (i32 (vector_extract (v4i32 VR128:$src),
(iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
VEX, Sched<[WriteLoad]>;
-def MOVPDI2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
+def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
(iPTR 0)))], IIC_SSE_MOVD_ToGP>,
Sched<[WriteMove]>;
-def MOVPDI2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
+def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
"movd\t{$src, $dst|$dst, $src}",
[(store (i32 (vector_extract (v4i32 VR128:$src),
(iPTR 0))), addr:$dst)],
@@ -4462,14 +4467,14 @@ def MOVPDI2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
// Move Packed Doubleword Int first element to Doubleword Int
//
let SchedRW = [WriteMove] in {
-def VMOVPQIto64rr : VRPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
(iPTR 0)))],
IIC_SSE_MOVD_ToGP>,
VEX;
-def MOVPQIto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
(iPTR 0)))],
@@ -4484,11 +4489,11 @@ def VMOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
VEX, Sched<[WriteLoad]>;
-def VMOVSDto64rr : VRPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (bitconvert FR64:$src))],
IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
-def VMOVSDto64mr : VRPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
"movq\t{$src, $dst|$dst, $src}",
[(store (i64 (bitconvert FR64:$src)), addr:$dst)],
IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
@@ -4497,11 +4502,11 @@ def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
-def MOVSDto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (bitconvert FR64:$src))],
IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
-def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
"movq\t{$src, $dst|$dst, $src}",
[(store (i64 (bitconvert FR64:$src)), addr:$dst)],
IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
@@ -4509,19 +4514,19 @@ def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
//===---------------------------------------------------------------------===//
// Move Scalar Single to Double Int
//
-def VMOVSS2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
+def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (bitconvert FR32:$src))],
IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>;
-def VMOVSS2DImr : VPDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
+def VMOVSS2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(store (i32 (bitconvert FR32:$src)), addr:$dst)],
IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
-def MOVSS2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
+def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (bitconvert FR32:$src))],
IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
-def MOVSS2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
+def MOVSS2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(store (i32 (bitconvert FR32:$src)), addr:$dst)],
IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
@@ -4531,12 +4536,12 @@ def MOVSS2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
//
let SchedRW = [WriteMove] in {
let AddedComplexity = 15 in {
-def VMOVZDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+def VMOVZDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v4i32 (X86vzmovl
(v4i32 (scalar_to_vector GR32:$src)))))],
IIC_SSE_MOVDQ>, VEX;
-def VMOVZQI2PQIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+def VMOVZQI2PQIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only
[(set VR128:$dst, (v2i64 (X86vzmovl
(v2i64 (scalar_to_vector GR64:$src)))))],
@@ -4544,12 +4549,12 @@ def VMOVZQI2PQIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
VEX, VEX_W;
}
let AddedComplexity = 15 in {
-def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+def MOVZDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v4i32 (X86vzmovl
(v4i32 (scalar_to_vector GR32:$src)))))],
IIC_SSE_MOVDQ>;
-def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+def MOVZQI2PQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only
[(set VR128:$dst, (v2i64 (X86vzmovl
(v2i64 (scalar_to_vector GR64:$src)))))],
@@ -4558,13 +4563,13 @@ def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
} // SchedRW
let AddedComplexity = 20, SchedRW = [WriteLoad] in {
-def VMOVZDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+def VMOVZDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86vzmovl (v4i32 (scalar_to_vector
(loadi32 addr:$src))))))],
IIC_SSE_MOVDQ>, VEX;
-def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+def MOVZDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86vzmovl (v4i32 (scalar_to_vector
@@ -4638,12 +4643,12 @@ def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
// Move Packed Quadword Int to Quadword Int
//
let SchedRW = [WriteStore] in {
-def VMOVPQI2QImr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
[(store (i64 (vector_extract (v2i64 VR128:$src),
(iPTR 0))), addr:$dst)],
IIC_SSE_MOVDQ>, VEX;
-def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
[(store (i64 (vector_extract (v2i64 VR128:$src),
(iPTR 0))), addr:$dst)],
@@ -4653,11 +4658,11 @@ def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
//===---------------------------------------------------------------------===//
// Store / copy lower 64-bits of a XMM register.
//
-def VMOVLQ128mr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+def VMOVLQ128mr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
[(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>, VEX,
Sched<[WriteStore]>;
-def MOVLQ128mr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+def MOVLQ128mr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
[(int_x86_sse2_storel_dq addr:$dst, VR128:$src)],
IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
@@ -4758,14 +4763,14 @@ let AddedComplexity = 20 in {
// Instructions to match in the assembler
let SchedRW = [WriteMove] in {
-def VMOVQs64rr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+def VMOVQs64rr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
"movq\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVDQ>, VEX, VEX_W;
-def VMOVQd64rr : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+def VMOVQd64rr : VS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
"movq\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVDQ>, VEX, VEX_W;
// Recognize "movd" with GR64 destination, but encode as a "movq"
-def VMOVQd64rr_alt : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+def VMOVQd64rr_alt : VS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
"movd\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVDQ>, VEX, VEX_W;
} // SchedRW
@@ -8367,7 +8372,9 @@ multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
[]>, VEX_4VOp3, VEX_L;
}
-let mayLoad = 1, Constraints = "$src1 = $dst, $mask = $mask_wb" in {
+let mayLoad = 1, Constraints
+ = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
+ in {
defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W;
defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W;
defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>;
diff --git a/lib/Target/X86/X86InstrShiftRotate.td b/lib/Target/X86/X86InstrShiftRotate.td
index 5b6298b..89c1a689 100644
--- a/lib/Target/X86/X86InstrShiftRotate.td
+++ b/lib/Target/X86/X86InstrShiftRotate.td
@@ -34,7 +34,7 @@ def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
"shl{b}\t{$src2, $dst|$dst, $src2}",
[(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
-
+
let isConvertibleToThreeAddress = 1 in { // Can transform into LEA.
def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
"shl{w}\t{$src2, $dst|$dst, $src2}",
@@ -43,7 +43,7 @@ def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
"shl{l}\t{$src2, $dst|$dst, $src2}",
[(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))], IIC_SR>;
-def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst),
+def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst),
(ins GR64:$src1, i8imm:$src2),
"shl{q}\t{$src2, $dst|$dst, $src2}",
[(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))],
diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp
index 44d8cce..fc86e1e 100644
--- a/lib/Target/X86/X86JITInfo.cpp
+++ b/lib/Target/X86/X86JITInfo.cpp
@@ -339,6 +339,7 @@ extern "C" {
/// must locate the start of the stub or call site and pass it into the JIT
/// compiler function.
extern "C" {
+LLVM_ATTRIBUTE_USED // Referenced from inline asm.
LLVM_LIBRARY_VISIBILITY void LLVMX86CompilationCallback2(intptr_t *StackPtr,
intptr_t RetAddr) {
intptr_t *RetAddrLoc = &StackPtr[1];
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index a8a9fd8..a453245 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -225,32 +225,6 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
}
-
-static void lower_subreg32(MCInst *MI, unsigned OpNo) {
- // Convert registers in the addr mode according to subreg32.
- unsigned Reg = MI->getOperand(OpNo).getReg();
- if (Reg != 0)
- MI->getOperand(OpNo).setReg(getX86SubSuperRegister(Reg, MVT::i32));
-}
-
-static void lower_lea64_32mem(MCInst *MI, unsigned OpNo) {
- // Convert registers in the addr mode according to subreg64.
- for (unsigned i = 0; i != 4; ++i) {
- if (!MI->getOperand(OpNo+i).isReg()) continue;
-
- unsigned Reg = MI->getOperand(OpNo+i).getReg();
- // LEAs can use RIP-relative addressing, and RIP has no sub/super register.
- if (Reg == 0 || Reg == X86::RIP) continue;
-
- MI->getOperand(OpNo+i).setReg(getX86SubSuperRegister(Reg, MVT::i64));
- }
-}
-
-/// LowerSubReg32_Op0 - Things like MOVZX16rr8 -> MOVZX32rr8.
-static void LowerSubReg32_Op0(MCInst &OutMI, unsigned NewOpc) {
- OutMI.setOpcode(NewOpc);
- lower_subreg32(&OutMI, 0);
-}
/// LowerUnaryToTwoAddr - R = setb -> R = sbb R, R
static void LowerUnaryToTwoAddr(MCInst &OutMI, unsigned NewOpc) {
OutMI.setOpcode(NewOpc);
@@ -376,9 +350,7 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
// Handle a few special cases to eliminate operand modifiers.
ReSimplify:
switch (OutMI.getOpcode()) {
- case X86::LEA64_32r: // Handle 'subreg rewriting' for the lea64_32mem operand.
- lower_lea64_32mem(&OutMI, 1);
- // FALL THROUGH.
+ case X86::LEA64_32r:
case X86::LEA64r:
case X86::LEA16r:
case X86::LEA32r:
@@ -388,23 +360,10 @@ ReSimplify:
assert(OutMI.getOperand(1+X86::AddrSegmentReg).getReg() == 0 &&
"LEA has segment specified!");
break;
- case X86::MOVZX64rr32: LowerSubReg32_Op0(OutMI, X86::MOV32rr); break;
- case X86::MOVZX64rm32: LowerSubReg32_Op0(OutMI, X86::MOV32rm); break;
- case X86::MOV64ri64i32: LowerSubReg32_Op0(OutMI, X86::MOV32ri); break;
- case X86::MOVZX64rr8: LowerSubReg32_Op0(OutMI, X86::MOVZX32rr8); break;
- case X86::MOVZX64rm8: LowerSubReg32_Op0(OutMI, X86::MOVZX32rm8); break;
- case X86::MOVZX64rr16: LowerSubReg32_Op0(OutMI, X86::MOVZX32rr16); break;
- case X86::MOVZX64rm16: LowerSubReg32_Op0(OutMI, X86::MOVZX32rm16); break;
- case X86::MOV8r0: LowerUnaryToTwoAddr(OutMI, X86::XOR8rr); break;
case X86::MOV32r0: LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); break;
- case X86::MOV16r0:
- LowerSubReg32_Op0(OutMI, X86::MOV32r0); // MOV16r0 -> MOV32r0
- LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); // MOV32r0 -> XOR32rr
- break;
- case X86::MOV64r0:
- LowerSubReg32_Op0(OutMI, X86::MOV32r0); // MOV64r0 -> MOV32r0
- LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); // MOV32r0 -> XOR32rr
+ case X86::MOV32ri64:
+ OutMI.setOpcode(X86::MOV32ri);
break;
// Commute operands to get a smaller encoding by using VEX.R instead of VEX.B
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 16886e4..eacae2c 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -54,15 +54,14 @@ static cl::opt<bool>
EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true),
cl::desc("Enable use of a base pointer for complex stack frames"));
-X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
- const TargetInstrInfo &tii)
+X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm)
: X86GenRegisterInfo((tm.getSubtarget<X86Subtarget>().is64Bit()
? X86::RIP : X86::EIP),
X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), false),
X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), true),
(tm.getSubtarget<X86Subtarget>().is64Bit()
? X86::RIP : X86::EIP)),
- TM(tm), TII(tii) {
+ TM(tm) {
X86_MC::InitLLVM2SEHRegisterMapping(this);
// Cache some information.
@@ -306,19 +305,19 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
// Set the stack-pointer register and its aliases as reserved.
- Reserved.set(X86::RSP);
- for (MCSubRegIterator I(X86::RSP, this); I.isValid(); ++I)
+ for (MCSubRegIterator I(X86::RSP, this, /*IncludeSelf=*/true); I.isValid();
+ ++I)
Reserved.set(*I);
// Set the instruction pointer register and its aliases as reserved.
- Reserved.set(X86::RIP);
- for (MCSubRegIterator I(X86::RIP, this); I.isValid(); ++I)
+ for (MCSubRegIterator I(X86::RIP, this, /*IncludeSelf=*/true); I.isValid();
+ ++I)
Reserved.set(*I);
// Set the frame-pointer register and its aliases as reserved if needed.
if (TFI->hasFP(MF)) {
- Reserved.set(X86::RBP);
- for (MCSubRegIterator I(X86::RBP, this); I.isValid(); ++I)
+ for (MCSubRegIterator I(X86::RBP, this, /*IncludeSelf=*/true); I.isValid();
+ ++I)
Reserved.set(*I);
}
@@ -331,8 +330,8 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
"Stack realignment in presence of dynamic allocas is not supported with"
"this calling convention.");
- Reserved.set(getBaseRegister());
- for (MCSubRegIterator I(getBaseRegister(), this); I.isValid(); ++I)
+ for (MCSubRegIterator I(getBaseRegister(), this, /*IncludeSelf=*/true);
+ I.isValid(); ++I)
Reserved.set(*I);
}
@@ -373,8 +372,11 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(*AI);
// XMM8, XMM9, ...
- assert(X86::XMM15 == X86::XMM8+7);
- for (MCRegAliasIterator AI(X86::XMM8 + n, this, true); AI.isValid(); ++AI)
+ static const uint16_t XMMReg[] = {
+ X86::XMM8, X86::XMM9, X86::XMM10, X86::XMM11,
+ X86::XMM12, X86::XMM13, X86::XMM14, X86::XMM15
+ };
+ for (MCRegAliasIterator AI(XMMReg[n], this, true); AI.isValid(); ++AI)
Reserved.set(*AI);
}
}
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index b9d7b8c..6a1b328 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -27,7 +27,6 @@ namespace llvm {
class X86RegisterInfo : public X86GenRegisterInfo {
public:
X86TargetMachine &TM;
- const TargetInstrInfo &TII;
private:
/// Is64Bit - Is the target 64-bits.
@@ -56,7 +55,7 @@ private:
unsigned BasePtr;
public:
- X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii);
+ X86RegisterInfo(X86TargetMachine &tm);
// FIXME: This should be tablegen'd like getDwarfRegNum is
int getSEHRegNum(unsigned i) const;
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index be6282a..fbbb257 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -21,11 +21,11 @@ class X86Reg<string n, bits<16> Enc, list<Register> subregs = []> : Register<n>
// Subregister indices.
let Namespace = "X86" in {
- def sub_8bit : SubRegIndex;
- def sub_8bit_hi : SubRegIndex;
- def sub_16bit : SubRegIndex;
- def sub_32bit : SubRegIndex;
- def sub_xmm : SubRegIndex;
+ def sub_8bit : SubRegIndex<8>;
+ def sub_8bit_hi : SubRegIndex<8, 8>;
+ def sub_16bit : SubRegIndex<16>;
+ def sub_32bit : SubRegIndex<32>;
+ def sub_xmm : SubRegIndex<128>;
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td
index 9fbde88..9f2c781 100644
--- a/lib/Target/X86/X86Schedule.td
+++ b/lib/Target/X86/X86Schedule.td
@@ -266,10 +266,14 @@ def IIC_SSE_PINSRW : InstrItinClass;
def IIC_SSE_PABS_RR : InstrItinClass;
def IIC_SSE_PABS_RM : InstrItinClass;
-def IIC_SSE_SQRTP_RR : InstrItinClass;
-def IIC_SSE_SQRTP_RM : InstrItinClass;
-def IIC_SSE_SQRTS_RR : InstrItinClass;
-def IIC_SSE_SQRTS_RM : InstrItinClass;
+def IIC_SSE_SQRTPS_RR : InstrItinClass;
+def IIC_SSE_SQRTPS_RM : InstrItinClass;
+def IIC_SSE_SQRTSS_RR : InstrItinClass;
+def IIC_SSE_SQRTSS_RM : InstrItinClass;
+def IIC_SSE_SQRTPD_RR : InstrItinClass;
+def IIC_SSE_SQRTPD_RM : InstrItinClass;
+def IIC_SSE_SQRTSD_RR : InstrItinClass;
+def IIC_SSE_SQRTSD_RM : InstrItinClass;
def IIC_SSE_RCPP_RR : InstrItinClass;
def IIC_SSE_RCPP_RM : InstrItinClass;
diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td
index cce8f1b..cb0960a 100644
--- a/lib/Target/X86/X86ScheduleAtom.td
+++ b/lib/Target/X86/X86ScheduleAtom.td
@@ -211,10 +211,15 @@ def AtomItineraries : ProcessorItineraries<
InstrItinData<IIC_SSE_UNPCK, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_SSE_SQRTP_RR, [InstrStage<13, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_SQRTP_RM, [InstrStage<14, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_SQRTS_RR, [InstrStage<11, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_SQRTS_RM, [InstrStage<12, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTPS_RR, [InstrStage<70, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTPS_RM, [InstrStage<70, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTSS_RR, [InstrStage<34, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTSS_RM, [InstrStage<34, [Port0, Port1]>] >,
+
+ InstrItinData<IIC_SSE_SQRTPD_RR, [InstrStage<125, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTPD_RM, [InstrStage<125, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<62, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<62, [Port0, Port1]>] >,
InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >,
InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >,
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index f934fdd..d1db79f 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -27,7 +27,7 @@ X86SelectionDAGInfo::~X86SelectionDAGInfo() {
}
SDValue
-X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
SDValue Chain,
SDValue Dst, SDValue Src,
SDValue Size, unsigned Align,
@@ -175,7 +175,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
}
SDValue
-X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
SDValue Chain, SDValue Dst, SDValue Src,
SDValue Size, unsigned Align,
bool isVolatile, bool AlwaysInline,
diff --git a/lib/Target/X86/X86SelectionDAGInfo.h b/lib/Target/X86/X86SelectionDAGInfo.h
index d1d66fe..d728af5 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.h
+++ b/lib/Target/X86/X86SelectionDAGInfo.h
@@ -34,7 +34,7 @@ public:
~X86SelectionDAGInfo();
virtual
- SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+ SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
SDValue Chain,
SDValue Dst, SDValue Src,
SDValue Size, unsigned Align,
@@ -42,7 +42,7 @@ public:
MachinePointerInfo DstPtrInfo) const;
virtual
- SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
SDValue Chain,
SDValue Dst, SDValue Src,
SDValue Size, unsigned Align,
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 448d2e6..74da2a9 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -170,6 +170,26 @@ bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const {
return isTargetELF() || TM.getRelocationModel() == Reloc::Static;
}
+static bool OSHasAVXSupport() {
+#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\
+ || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
+#if defined(__GNUC__)
+ // Check xgetbv; this uses a .byte sequence instead of the instruction
+ // directly because older assemblers do not include support for xgetbv and
+ // there is no easy way to conditionally compile based on the assembler used.
+ int rEAX, rEDX;
+ __asm__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0));
+#elif defined(_MSC_FULL_VER) && defined(_XCR_XFEATURE_ENABLED_MASK)
+ unsigned long long rEAX = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
+#else
+ int rEAX = 0; // Ensures we return false
+#endif
+ return (rEAX & 6) == 6;
+#else
+ return false;
+#endif
+}
+
void X86Subtarget::AutoDetectSubtargetFeatures() {
unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
unsigned MaxLevel;
@@ -192,7 +212,9 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
if ((ECX >> 9) & 1) { X86SSELevel = SSSE3; ToggleFeature(X86::FeatureSSSE3);}
if ((ECX >> 19) & 1) { X86SSELevel = SSE41; ToggleFeature(X86::FeatureSSE41);}
if ((ECX >> 20) & 1) { X86SSELevel = SSE42; ToggleFeature(X86::FeatureSSE42);}
- if ((ECX >> 28) & 1) { X86SSELevel = AVX; ToggleFeature(X86::FeatureAVX); }
+ if (((ECX >> 27) & 1) && ((ECX >> 28) & 1) && OSHasAVXSupport()) {
+ X86SSELevel = AVX; ToggleFeature(X86::FeatureAVX);
+ }
bool IsIntel = memcmp(text.c, "GenuineIntel", 12) == 0;
bool IsAMD = !IsIntel && memcmp(text.c, "AuthenticAMD", 12) == 0;
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 00fa47f..0422a61 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -49,6 +49,7 @@ X86_32TargetMachine::X86_32TargetMachine(const Target &T, StringRef TT,
TLInfo(*this),
TSInfo(*this),
JITInfo(*this) {
+ initAsmInfo();
}
void X86_64TargetMachine::anchor() { }
@@ -69,6 +70,7 @@ X86_64TargetMachine::X86_64TargetMachine(const Target &T, StringRef TT,
TLInfo(*this),
TSInfo(*this),
JITInfo(*this) {
+ initAsmInfo();
}
/// X86TargetMachine ctor - Create an X86 target.
diff --git a/lib/Target/XCore/CMakeLists.txt b/lib/Target/XCore/CMakeLists.txt
index 099ad39..d5bfddc 100644
--- a/lib/Target/XCore/CMakeLists.txt
+++ b/lib/Target/XCore/CMakeLists.txt
@@ -15,6 +15,7 @@ add_llvm_target(XCoreCodeGen
XCoreInstrInfo.cpp
XCoreISelDAGToDAG.cpp
XCoreISelLowering.cpp
+ XCoreLowerThreadLocal.cpp
XCoreMachineFunctionInfo.cpp
XCoreMCInstLower.cpp
XCoreRegisterInfo.cpp
diff --git a/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
index 7b99967..dcc0955 100644
--- a/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
+++ b/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
@@ -53,7 +53,7 @@ static bool readInstruction16(const MemoryObject &region,
uint8_t Bytes[4];
// We want to read exactly 2 Bytes of data.
- if (region.readBytes(address, 2, Bytes, NULL) == -1) {
+ if (region.readBytes(address, 2, Bytes) == -1) {
size = 0;
return false;
}
@@ -69,7 +69,7 @@ static bool readInstruction32(const MemoryObject &region,
uint8_t Bytes[4];
// We want to read exactly 4 Bytes of data.
- if (region.readBytes(address, 4, Bytes, NULL) == -1) {
+ if (region.readBytes(address, 4, Bytes) == -1) {
size = 0;
return false;
}
@@ -97,8 +97,8 @@ static DecodeStatus DecodeRRegsRegisterClass(MCInst &Inst,
static DecodeStatus DecodeBitpOperand(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeMEMiiOperand(MCInst &Inst, unsigned Val,
- uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeNegImmOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
static DecodeStatus Decode2RInstruction(MCInst &Inst,
unsigned Insn,
@@ -242,10 +242,9 @@ static DecodeStatus DecodeBitpOperand(MCInst &Inst, unsigned Val,
return MCDisassembler::Success;
}
-static DecodeStatus DecodeMEMiiOperand(MCInst &Inst, unsigned Val,
- uint64_t Address, const void *Decoder) {
- Inst.addOperand(MCOperand::CreateImm(Val));
- Inst.addOperand(MCOperand::CreateImm(0));
+static DecodeStatus DecodeNegImmOperand(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ Inst.addOperand(MCOperand::CreateImm(-(int64_t)Val));
return MCDisassembler::Success;
}
diff --git a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp
index 1592351..9ae8c0d 100644
--- a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp
+++ b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp
@@ -84,14 +84,3 @@ printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
assert(Op.isExpr() && "unknown operand kind in printOperand");
printExpr(Op.getExpr(), O);
}
-
-void XCoreInstPrinter::
-printMemOperand(const MCInst *MI, int opNum, raw_ostream &O) {
- printOperand(MI, opNum, O);
-
- if (MI->getOperand(opNum+1).isImm() && MI->getOperand(opNum+1).getImm() == 0)
- return;
-
- O << "+";
- printOperand(MI, opNum+1, O);
-}
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
index 1cfdbda..6f44551 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
@@ -13,7 +13,7 @@ using namespace llvm;
void XCoreMCAsmInfo::anchor() { }
-XCoreMCAsmInfo::XCoreMCAsmInfo(const Target &T, StringRef TT) {
+XCoreMCAsmInfo::XCoreMCAsmInfo(StringRef TT) {
SupportsDebugInformation = true;
Data16bitsDirective = "\t.short\t";
Data32bitsDirective = "\t.long\t";
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
index 0767775..b5a9660 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
@@ -23,7 +23,7 @@ namespace llvm {
class XCoreMCAsmInfo : public MCAsmInfo {
virtual void anchor();
public:
- explicit XCoreMCAsmInfo(const Target &T, StringRef TT);
+ explicit XCoreMCAsmInfo(StringRef TT);
};
} // namespace llvm
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
index b5b072d..10bb6df 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
@@ -51,13 +51,13 @@ static MCSubtargetInfo *createXCoreMCSubtargetInfo(StringRef TT, StringRef CPU,
return X;
}
-static MCAsmInfo *createXCoreMCAsmInfo(const Target &T, StringRef TT) {
- MCAsmInfo *MAI = new XCoreMCAsmInfo(T, TT);
+static MCAsmInfo *createXCoreMCAsmInfo(const MCRegisterInfo &MRI,
+ StringRef TT) {
+ MCAsmInfo *MAI = new XCoreMCAsmInfo(TT);
// Initial state of the frame pointer is SP.
- MachineLocation Dst(MachineLocation::VirtualFP);
- MachineLocation Src(XCore::SP, 0);
- MAI->addInitialFrameState(0, Dst, Src);
+ MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(0, XCore::SP, 0);
+ MAI->addInitialFrameState(Inst);
return MAI;
}
@@ -66,6 +66,9 @@ static MCCodeGenInfo *createXCoreMCCodeGenInfo(StringRef TT, Reloc::Model RM,
CodeModel::Model CM,
CodeGenOpt::Level OL) {
MCCodeGenInfo *X = new MCCodeGenInfo();
+ if (RM == Reloc::Default) {
+ RM = Reloc::Static;
+ }
X->InitMCCodeGenInfo(RM, CM, OL);
return X;
}
diff --git a/lib/Target/XCore/XCore.h b/lib/Target/XCore/XCore.h
index 08f091e..2f375fc 100644
--- a/lib/Target/XCore/XCore.h
+++ b/lib/Target/XCore/XCore.h
@@ -20,12 +20,16 @@
namespace llvm {
class FunctionPass;
+ class ModulePass;
class TargetMachine;
class XCoreTargetMachine;
class formatted_raw_ostream;
+ void initializeXCoreLowerThreadLocalPass(PassRegistry &p);
+
FunctionPass *createXCoreISelDag(XCoreTargetMachine &TM,
CodeGenOpt::Level OptLevel);
+ ModulePass *createXCoreLowerThreadLocalPass();
} // end namespace llvm;
diff --git a/lib/Target/XCore/XCoreAsmPrinter.cpp b/lib/Target/XCore/XCoreAsmPrinter.cpp
index 0d146ba..e177ad3 100644
--- a/lib/Target/XCore/XCoreAsmPrinter.cpp
+++ b/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -36,7 +36,6 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
@@ -46,12 +45,6 @@
#include <cctype>
using namespace llvm;
-static cl::opt<unsigned> MaxThreads("xcore-max-threads", cl::Optional,
- cl::desc("Maximum number of threads (for emulation thread-local storage)"),
- cl::Hidden,
- cl::value_desc("number"),
- cl::init(8));
-
namespace {
class XCoreAsmPrinter : public AsmPrinter {
const XCoreSubtarget &Subtarget;
@@ -152,10 +145,10 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
EmitAlignment(Align > 2 ? Align : 2, GV);
- unsigned Size = TD->getTypeAllocSize(C->getType());
if (GV->isThreadLocal()) {
- Size *= MaxThreads;
+ report_fatal_error("TLS is not supported by this target!");
}
+ unsigned Size = TD->getTypeAllocSize(C->getType());
if (MAI->hasDotTypeDotSizeDirective()) {
OutStreamer.EmitSymbolAttribute(GVSym, MCSA_ELF_TypeObject);
OutStreamer.EmitRawText("\t.size " + Twine(GVSym->getName()) + "," +
@@ -164,10 +157,6 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
OutStreamer.EmitLabel(GVSym);
EmitGlobalConstant(C);
- if (GV->isThreadLocal()) {
- for (unsigned i = 1; i < MaxThreads; ++i)
- EmitGlobalConstant(C);
- }
// The ABI requires that unsigned scalar types smaller than 32 bits
// are padded to 32 bits.
if (Size < 4)
diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp
index beeb07f..736a4ef 100644
--- a/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -116,50 +116,34 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF) const {
}
bool emitFrameMoves = XCoreRegisterInfo::needsFrameMoves(MF);
+ bool saveLR = XFI->getUsesLR();
// Do we need to allocate space on the stack?
if (FrameSize) {
- bool saveLR = XFI->getUsesLR();
- bool LRSavedOnEntry = false;
int Opcode;
if (saveLR && (MFI->getObjectOffset(XFI->getLRSpillSlot()) == 0)) {
Opcode = (isU6) ? XCore::ENTSP_u6 : XCore::ENTSP_lu6;
MBB.addLiveIn(XCore::LR);
saveLR = false;
- LRSavedOnEntry = true;
} else {
Opcode = (isU6) ? XCore::EXTSP_u6 : XCore::EXTSP_lu6;
}
BuildMI(MBB, MBBI, dl, TII.get(Opcode)).addImm(FrameSize);
if (emitFrameMoves) {
- std::vector<MachineMove> &Moves = MMI->getFrameMoves();
// Show update of SP.
MCSymbol *FrameLabel = MMI->getContext().CreateTempSymbol();
BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(FrameLabel);
-
- MachineLocation SPDst(MachineLocation::VirtualFP);
- MachineLocation SPSrc(MachineLocation::VirtualFP, -FrameSize * 4);
- Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc));
-
- if (LRSavedOnEntry) {
- MachineLocation CSDst(MachineLocation::VirtualFP, 0);
- MachineLocation CSSrc(XCore::LR);
- Moves.push_back(MachineMove(FrameLabel, CSDst, CSSrc));
- }
}
- if (saveLR) {
- int LRSpillOffset = MFI->getObjectOffset(XFI->getLRSpillSlot());
- storeToStack(MBB, MBBI, XCore::LR, LRSpillOffset + FrameSize*4, dl, TII);
- MBB.addLiveIn(XCore::LR);
+ }
+ if (saveLR) {
+ int LRSpillOffset = MFI->getObjectOffset(XFI->getLRSpillSlot());
+ storeToStack(MBB, MBBI, XCore::LR, LRSpillOffset + FrameSize*4, dl, TII);
+ MBB.addLiveIn(XCore::LR);
- if (emitFrameMoves) {
- MCSymbol *SaveLRLabel = MMI->getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(SaveLRLabel);
- MachineLocation CSDst(MachineLocation::VirtualFP, LRSpillOffset);
- MachineLocation CSSrc(XCore::LR);
- MMI->getFrameMoves().push_back(MachineMove(SaveLRLabel, CSDst, CSSrc));
- }
+ if (emitFrameMoves) {
+ MCSymbol *SaveLRLabel = MMI->getContext().CreateTempSymbol();
+ BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(SaveLRLabel);
}
}
@@ -172,9 +156,6 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF) const {
if (emitFrameMoves) {
MCSymbol *SaveR10Label = MMI->getContext().CreateTempSymbol();
BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(SaveR10Label);
- MachineLocation CSDst(MachineLocation::VirtualFP, FPSpillOffset);
- MachineLocation CSSrc(XCore::R10);
- MMI->getFrameMoves().push_back(MachineMove(SaveR10Label, CSDst, CSSrc));
}
// Set the FP from the SP.
unsigned FramePtr = XCore::R10;
@@ -184,25 +165,6 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF) const {
// Show FP is now valid.
MCSymbol *FrameLabel = MMI->getContext().CreateTempSymbol();
BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(FrameLabel);
- MachineLocation SPDst(FramePtr);
- MachineLocation SPSrc(MachineLocation::VirtualFP);
- MMI->getFrameMoves().push_back(MachineMove(FrameLabel, SPDst, SPSrc));
- }
- }
-
- if (emitFrameMoves) {
- // Frame moves for callee saved.
- std::vector<MachineMove> &Moves = MMI->getFrameMoves();
- std::vector<std::pair<MCSymbol*, CalleeSavedInfo> >&SpillLabels =
- XFI->getSpillLabels();
- for (unsigned I = 0, E = SpillLabels.size(); I != E; ++I) {
- MCSymbol *SpillLabel = SpillLabels[I].first;
- CalleeSavedInfo &CSI = SpillLabels[I].second;
- int Offset = MFI->getObjectOffset(CSI.getFrameIdx());
- unsigned Reg = CSI.getReg();
- MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
- MachineLocation CSSrc(Reg);
- Moves.push_back(MachineMove(SpillLabel, CSDst, CSSrc));
}
}
}
@@ -213,6 +175,7 @@ void XCoreFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
const XCoreInstrInfo &TII =
*static_cast<const XCoreInstrInfo*>(MF.getTarget().getInstrInfo());
+ XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
DebugLoc dl = MBBI->getDebugLoc();
bool FP = hasFP(MF);
@@ -237,24 +200,26 @@ void XCoreFrameLowering::emitEpilogue(MachineFunction &MF,
report_fatal_error("emitEpilogue Frame size too big: " + Twine(FrameSize));
}
- if (FrameSize) {
- XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+ if (FP) {
+ // Restore R10
+ int FPSpillOffset = MFI->getObjectOffset(XFI->getFPSpillSlot());
+ FPSpillOffset += FrameSize*4;
+ loadFromStack(MBB, MBBI, XCore::R10, FPSpillOffset, dl, TII);
+ }
- if (FP) {
- // Restore R10
- int FPSpillOffset = MFI->getObjectOffset(XFI->getFPSpillSlot());
- FPSpillOffset += FrameSize*4;
- loadFromStack(MBB, MBBI, XCore::R10, FPSpillOffset, dl, TII);
- }
- bool restoreLR = XFI->getUsesLR();
- if (restoreLR && MFI->getObjectOffset(XFI->getLRSpillSlot()) != 0) {
- int LRSpillOffset = MFI->getObjectOffset(XFI->getLRSpillSlot());
- LRSpillOffset += FrameSize*4;
- loadFromStack(MBB, MBBI, XCore::LR, LRSpillOffset, dl, TII);
- restoreLR = false;
- }
+ bool restoreLR = XFI->getUsesLR();
+ if (restoreLR &&
+ (FrameSize == 0 || MFI->getObjectOffset(XFI->getLRSpillSlot()) != 0)) {
+ int LRSpillOffset = MFI->getObjectOffset(XFI->getLRSpillSlot());
+ LRSpillOffset += FrameSize*4;
+ loadFromStack(MBB, MBBI, XCore::LR, LRSpillOffset, dl, TII);
+ restoreLR = false;
+ }
+
+ if (FrameSize) {
if (restoreLR) {
// Fold prologue into return instruction
+ assert(MFI->getObjectOffset(XFI->getLRSpillSlot()) == 0);
assert(MBBI->getOpcode() == XCore::RETSP_u6
|| MBBI->getOpcode() == XCore::RETSP_lu6);
int Opcode = (isU6) ? XCore::RETSP_u6 : XCore::RETSP_lu6;
diff --git a/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
index d16811c..ee183aa 100644
--- a/lib/Target/XCore/XCoreISelDAGToDAG.cpp
+++ b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
@@ -61,15 +61,13 @@ namespace {
if (!isMask_32(value)) {
return false;
}
- int msksize = 32 - CountLeadingZeros_32(value);
+ int msksize = 32 - countLeadingZeros(value);
return (msksize >= 1 && msksize <= 8) ||
msksize == 16 || msksize == 24 || msksize == 32;
}
// Complex Pattern Selectors.
bool SelectADDRspii(SDValue Addr, SDValue &Base, SDValue &Offset);
- bool SelectADDRdpii(SDValue Addr, SDValue &Base, SDValue &Offset);
- bool SelectADDRcpii(SDValue Addr, SDValue &Base, SDValue &Offset);
virtual const char *getPassName() const {
return "XCore DAG->DAG Pattern Instruction Selection";
@@ -110,50 +108,8 @@ bool XCoreDAGToDAGISel::SelectADDRspii(SDValue Addr, SDValue &Base,
return false;
}
-bool XCoreDAGToDAGISel::SelectADDRdpii(SDValue Addr, SDValue &Base,
- SDValue &Offset) {
- if (Addr.getOpcode() == XCoreISD::DPRelativeWrapper) {
- Base = Addr.getOperand(0);
- Offset = CurDAG->getTargetConstant(0, MVT::i32);
- return true;
- }
- if (Addr.getOpcode() == ISD::ADD) {
- ConstantSDNode *CN = 0;
- if ((Addr.getOperand(0).getOpcode() == XCoreISD::DPRelativeWrapper)
- && (CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
- && (CN->getSExtValue() % 4 == 0 && CN->getSExtValue() >= 0)) {
- // Constant word offset from a object in the data region
- Base = Addr.getOperand(0).getOperand(0);
- Offset = CurDAG->getTargetConstant(CN->getSExtValue(), MVT::i32);
- return true;
- }
- }
- return false;
-}
-
-bool XCoreDAGToDAGISel::SelectADDRcpii(SDValue Addr, SDValue &Base,
- SDValue &Offset) {
- if (Addr.getOpcode() == XCoreISD::CPRelativeWrapper) {
- Base = Addr.getOperand(0);
- Offset = CurDAG->getTargetConstant(0, MVT::i32);
- return true;
- }
- if (Addr.getOpcode() == ISD::ADD) {
- ConstantSDNode *CN = 0;
- if ((Addr.getOperand(0).getOpcode() == XCoreISD::CPRelativeWrapper)
- && (CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
- && (CN->getSExtValue() % 4 == 0 && CN->getSExtValue() >= 0)) {
- // Constant word offset from a object in the data region
- Base = Addr.getOperand(0).getOperand(0);
- Offset = CurDAG->getTargetConstant(CN->getSExtValue(), MVT::i32);
- return true;
- }
- }
- return false;
-}
-
SDNode *XCoreDAGToDAGISel::Select(SDNode *N) {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
switch (N->getOpcode()) {
default: break;
case ISD::Constant: {
@@ -161,7 +117,7 @@ SDNode *XCoreDAGToDAGISel::Select(SDNode *N) {
if (immMskBitp(N)) {
// Transformation function: get the size of a mask
// Look for the first non-zero bit
- SDValue MskSize = getI32Imm(32 - CountLeadingZeros_32(Val));
+ SDValue MskSize = getI32Imm(32 - countLeadingZeros(Val));
return CurDAG->getMachineNode(XCore::MKMSK_rus, dl,
MVT::i32, MskSize);
}
@@ -169,7 +125,7 @@ SDNode *XCoreDAGToDAGISel::Select(SDNode *N) {
SDValue CPIdx =
CurDAG->getTargetConstantPool(ConstantInt::get(
Type::getInt32Ty(*CurDAG->getContext()), Val),
- TLI.getPointerTy());
+ TLI->getPointerTy());
SDNode *node = CurDAG->getMachineNode(XCore::LDWCP_lru6, dl, MVT::i32,
MVT::Other, CPIdx,
CurDAG->getEntryNode());
@@ -248,12 +204,12 @@ replaceInChain(SelectionDAG *CurDAG, SDValue Chain, SDValue Old, SDValue New)
}
if (!found)
return SDValue();
- return CurDAG->getNode(ISD::TokenFactor, Chain->getDebugLoc(), MVT::Other,
+ return CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other,
&Ops[0], Ops.size());
}
SDNode *XCoreDAGToDAGISel::SelectBRIND(SDNode *N) {
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
// (brind (int_xcore_checkevent (addr)))
SDValue Chain = N->getOperand(0);
SDValue Addr = N->getOperand(1);
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index a5d2be8..7b89b1a 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -36,6 +36,8 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+
using namespace llvm;
const char *XCoreTargetLowering::
@@ -120,9 +122,6 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::BlockAddress, MVT::i32 , Custom);
- // Thread Local Storage
- setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
-
// Conversion of i64 -> double produces constantpool nodes
setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
@@ -172,7 +171,6 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode())
{
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
- case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::BR_JT: return LowerBR_JT(Op, DAG);
@@ -217,7 +215,7 @@ void XCoreTargetLowering::ReplaceNodeResults(SDNode *N,
SDValue XCoreTargetLowering::
LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
{
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue Cond = DAG.getNode(ISD::SETCC, dl, MVT::i32, Op.getOperand(2),
Op.getOperand(3), Op.getOperand(4));
return DAG.getNode(ISD::SELECT, dl, MVT::i32, Cond, Op.getOperand(0),
@@ -229,7 +227,7 @@ getGlobalAddressWrapper(SDValue GA, const GlobalValue *GV,
SelectionDAG &DAG) const
{
// FIXME there is no actual debug info here
- DebugLoc dl = GA.getDebugLoc();
+ SDLoc dl(GA);
const GlobalValue *UnderlyingGV = GV;
// If GV is an alias then use the aliasee to determine the wrapper type
if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
@@ -245,58 +243,31 @@ getGlobalAddressWrapper(SDValue GA, const GlobalValue *GV,
SDValue XCoreTargetLowering::
LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const
{
- const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
- SDValue GA = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), MVT::i32);
- return getGlobalAddressWrapper(GA, GV, DAG);
+ SDLoc DL(Op);
+ const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
+ const GlobalValue *GV = GN->getGlobal();
+ int64_t Offset = GN->getOffset();
+ // We can only fold positive offsets that are a multiple of the word size.
+ int64_t FoldedOffset = std::max(Offset & ~3, (int64_t)0);
+ SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, FoldedOffset);
+ GA = getGlobalAddressWrapper(GA, GV, DAG);
+ // Handle the rest of the offset.
+ if (Offset != FoldedOffset) {
+ SDValue Remaining = DAG.getConstant(Offset - FoldedOffset, MVT::i32);
+ GA = DAG.getNode(ISD::ADD, DL, MVT::i32, GA, Remaining);
+ }
+ return GA;
}
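To see what the new offset handling in LowerGlobalAddress does with concrete numbers, here is a small arithmetic sketch (illustrative only; the lambda simply mirrors the std::max(Offset & ~3, 0) folding rule above): only the non-negative, word-aligned part of the offset is folded into the target global address, and whatever remains is materialized with an explicit ADD.

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  // Mirrors FoldedOffset = std::max(Offset & ~3, (int64_t)0) from the lowering above.
  auto folded = [](int64_t Offset) {
    return std::max(Offset & ~int64_t(3), int64_t(0));
  };
  assert(folded(8)  == 8);   // word aligned: folded entirely, no residual ADD
  assert(folded(7)  == 4);   // residual ADD of 3 gets emitted
  assert(folded(-5) == 0);   // negative offsets are never folded; ADD of -5 remains
  return 0;
}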
-static inline SDValue BuildGetId(SelectionDAG &DAG, DebugLoc dl) {
+static inline SDValue BuildGetId(SelectionDAG &DAG, SDLoc dl) {
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
DAG.getConstant(Intrinsic::xcore_getid, MVT::i32));
}
-static inline bool isZeroLengthArray(Type *Ty) {
- ArrayType *AT = dyn_cast_or_null<ArrayType>(Ty);
- return AT && (AT->getNumElements() == 0);
-}
-
-SDValue XCoreTargetLowering::
-LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
-{
- // FIXME there isn't really debug info here
- DebugLoc dl = Op.getDebugLoc();
- // transform to label + getid() * size
- const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
- SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32);
- const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
- if (!GVar) {
- // If GV is an alias then use the aliasee to determine size
- if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
- GVar = dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal());
- }
- if (!GVar) {
- llvm_unreachable("Thread local object not a GlobalVariable?");
- }
- Type *Ty = cast<PointerType>(GV->getType())->getElementType();
- if (!Ty->isSized() || isZeroLengthArray(Ty)) {
-#ifndef NDEBUG
- errs() << "Size of thread local object " << GVar->getName()
- << " is unknown\n";
-#endif
- llvm_unreachable(0);
- }
- SDValue base = getGlobalAddressWrapper(GA, GV, DAG);
- const DataLayout *TD = TM.getDataLayout();
- unsigned Size = TD->getTypeAllocSize(Ty);
- SDValue offset = DAG.getNode(ISD::MUL, dl, MVT::i32, BuildGetId(DAG, dl),
- DAG.getConstant(Size, MVT::i32));
- return DAG.getNode(ISD::ADD, dl, MVT::i32, base, offset);
-}
-
SDValue XCoreTargetLowering::
LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const
{
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy());
@@ -309,7 +280,7 @@ LowerConstantPool(SDValue Op, SelectionDAG &DAG) const
{
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
// FIXME there isn't really debug info here
- DebugLoc dl = CP->getDebugLoc();
+ SDLoc dl(CP);
EVT PtrVT = Op.getValueType();
SDValue Res;
if (CP->isMachineConstantPoolEntry()) {
@@ -332,7 +303,7 @@ LowerBR_JT(SDValue Op, SelectionDAG &DAG) const
SDValue Chain = Op.getOperand(0);
SDValue Table = Op.getOperand(1);
SDValue Index = Op.getOperand(2);
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
unsigned JTI = JT->getIndex();
MachineFunction &MF = DAG.getMachineFunction();
@@ -350,55 +321,58 @@ LowerBR_JT(SDValue Op, SelectionDAG &DAG) const
ScaledIndex);
}
-static bool
-IsWordAlignedBasePlusConstantOffset(SDValue Addr, SDValue &AlignedBase,
- int64_t &Offset)
+SDValue XCoreTargetLowering::
+lowerLoadWordFromAlignedBasePlusOffset(SDLoc DL, SDValue Chain, SDValue Base,
+ int64_t Offset, SelectionDAG &DAG) const
{
- if (Addr.getOpcode() != ISD::ADD) {
- return false;
+ if ((Offset & 0x3) == 0) {
+ return DAG.getLoad(getPointerTy(), DL, Chain, Base, MachinePointerInfo(),
+ false, false, false, 0);
}
- ConstantSDNode *CN = 0;
- if (!(CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
- return false;
- }
- int64_t off = CN->getSExtValue();
- const SDValue &Base = Addr.getOperand(0);
- const SDValue *Root = &Base;
- if (Base.getOpcode() == ISD::ADD &&
- Base.getOperand(1).getOpcode() == ISD::SHL) {
- ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Base.getOperand(1)
- .getOperand(1));
- if (CN && (CN->getSExtValue() >= 2)) {
- Root = &Base.getOperand(0);
- }
- }
- if (isa<FrameIndexSDNode>(*Root)) {
- // All frame indicies are word aligned
- AlignedBase = Base;
- Offset = off;
- return true;
- }
- if (Root->getOpcode() == XCoreISD::DPRelativeWrapper ||
- Root->getOpcode() == XCoreISD::CPRelativeWrapper) {
- // All dp / cp relative addresses are word aligned
- AlignedBase = Base;
- Offset = off;
- return true;
- }
- // Check for an aligned global variable.
- if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(*Root)) {
- const GlobalValue *GV = GA->getGlobal();
- if (GA->getOffset() == 0 && GV->getAlignment() >= 4) {
- AlignedBase = Base;
- Offset = off;
- return true;
- }
+  // Lower to a pair of consecutive word-aligned loads plus some bit shifting.
+ int32_t HighOffset = RoundUpToAlignment(Offset, 4);
+ int32_t LowOffset = HighOffset - 4;
+ SDValue LowAddr, HighAddr;
+ if (GlobalAddressSDNode *GASD =
+ dyn_cast<GlobalAddressSDNode>(Base.getNode())) {
+ LowAddr = DAG.getGlobalAddress(GASD->getGlobal(), DL, Base.getValueType(),
+ LowOffset);
+ HighAddr = DAG.getGlobalAddress(GASD->getGlobal(), DL, Base.getValueType(),
+ HighOffset);
+ } else {
+ LowAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, Base,
+ DAG.getConstant(LowOffset, MVT::i32));
+ HighAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, Base,
+ DAG.getConstant(HighOffset, MVT::i32));
}
- return false;
+ SDValue LowShift = DAG.getConstant((Offset - LowOffset) * 8, MVT::i32);
+ SDValue HighShift = DAG.getConstant((HighOffset - Offset) * 8, MVT::i32);
+
+ SDValue Low = DAG.getLoad(getPointerTy(), DL, Chain,
+ LowAddr, MachinePointerInfo(),
+ false, false, false, 0);
+ SDValue High = DAG.getLoad(getPointerTy(), DL, Chain,
+ HighAddr, MachinePointerInfo(),
+ false, false, false, 0);
+ SDValue LowShifted = DAG.getNode(ISD::SRL, DL, MVT::i32, Low, LowShift);
+ SDValue HighShifted = DAG.getNode(ISD::SHL, DL, MVT::i32, High, HighShift);
+ SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i32, LowShifted, HighShifted);
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Low.getValue(1),
+ High.getValue(1));
+ SDValue Ops[] = { Result, Chain };
+ return DAG.getMergeValues(Ops, 2, DL);
+}
+
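For readers tracing the shift arithmetic in lowerLoadWordFromAlignedBasePlusOffset, the following host-side sketch reproduces it on plain integers (illustrative only; loadWordAt and the memcpy-based word loads are assumptions, and the recombination assumes a little-endian layout as on XCore):

#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t loadWordAt(const uint8_t *Base, int64_t Offset) {
  if ((Offset & 0x3) == 0) {                        // already word aligned
    uint32_t Word;
    std::memcpy(&Word, Base + Offset, 4);
    return Word;
  }
  int64_t HighOffset = (Offset + 3) & ~int64_t(3);  // RoundUpToAlignment(Offset, 4)
  int64_t LowOffset = HighOffset - 4;
  uint32_t Low, High;
  std::memcpy(&Low, Base + LowOffset, 4);           // two aligned word loads
  std::memcpy(&High, Base + HighOffset, 4);
  unsigned LowShift = unsigned(Offset - LowOffset) * 8;
  unsigned HighShift = unsigned(HighOffset - Offset) * 8;
  return (Low >> LowShift) | (High << HighShift);   // OR the shifted halves together
}

int main() {
  alignas(4) uint8_t Buf[16];
  for (int i = 0; i != 16; ++i) Buf[i] = uint8_t(i);
  uint32_t Expected;
  std::memcpy(&Expected, Buf + 6, 4);               // the value a byte-wise load yields
  assert(loadWordAt(Buf, 6) == Expected);           // words at offsets 4 and 8, 16-bit shifts
  return 0;
}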
+static bool isWordAligned(SDValue Value, SelectionDAG &DAG)
+{
+ APInt KnownZero, KnownOne;
+ DAG.ComputeMaskedBits(Value, KnownZero, KnownOne);
+ return KnownZero.countTrailingOnes() >= 2;
}
SDValue XCoreTargetLowering::
LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
LoadSDNode *LD = cast<LoadSDNode>(Op);
assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
"Unexpected extension type");
@@ -414,47 +388,25 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = LD->getChain();
SDValue BasePtr = LD->getBasePtr();
- DebugLoc DL = Op.getDebugLoc();
-
- SDValue Base;
- int64_t Offset;
- if (!LD->isVolatile() &&
- IsWordAlignedBasePlusConstantOffset(BasePtr, Base, Offset)) {
- if (Offset % 4 == 0) {
- // We've managed to infer better alignment information than the load
- // already has. Use an aligned load.
- //
- return DAG.getLoad(getPointerTy(), DL, Chain, BasePtr,
- MachinePointerInfo(),
- false, false, false, 0);
+ SDLoc DL(Op);
+
+ if (!LD->isVolatile()) {
+ const GlobalValue *GV;
+ int64_t Offset = 0;
+ if (DAG.isBaseWithConstantOffset(BasePtr) &&
+ isWordAligned(BasePtr->getOperand(0), DAG)) {
+ SDValue NewBasePtr = BasePtr->getOperand(0);
+ Offset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
+ return lowerLoadWordFromAlignedBasePlusOffset(DL, Chain, NewBasePtr,
+ Offset, DAG);
+ }
+ if (TLI.isGAPlusOffset(BasePtr.getNode(), GV, Offset) &&
+ MinAlign(GV->getAlignment(), 4) == 4) {
+ SDValue NewBasePtr = DAG.getGlobalAddress(GV, DL,
+ BasePtr->getValueType(0));
+ return lowerLoadWordFromAlignedBasePlusOffset(DL, Chain, NewBasePtr,
+ Offset, DAG);
}
- // Lower to
- // ldw low, base[offset >> 2]
- // ldw high, base[(offset >> 2) + 1]
- // shr low_shifted, low, (offset & 0x3) * 8
- // shl high_shifted, high, 32 - (offset & 0x3) * 8
- // or result, low_shifted, high_shifted
- SDValue LowOffset = DAG.getConstant(Offset & ~0x3, MVT::i32);
- SDValue HighOffset = DAG.getConstant((Offset & ~0x3) + 4, MVT::i32);
- SDValue LowShift = DAG.getConstant((Offset & 0x3) * 8, MVT::i32);
- SDValue HighShift = DAG.getConstant(32 - (Offset & 0x3) * 8, MVT::i32);
-
- SDValue LowAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, Base, LowOffset);
- SDValue HighAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, Base, HighOffset);
-
- SDValue Low = DAG.getLoad(getPointerTy(), DL, Chain,
- LowAddr, MachinePointerInfo(),
- false, false, false, 0);
- SDValue High = DAG.getLoad(getPointerTy(), DL, Chain,
- HighAddr, MachinePointerInfo(),
- false, false, false, 0);
- SDValue LowShifted = DAG.getNode(ISD::SRL, DL, MVT::i32, Low, LowShift);
- SDValue HighShifted = DAG.getNode(ISD::SHL, DL, MVT::i32, High, HighShift);
- SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i32, LowShifted, HighShifted);
- Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Low.getValue(1),
- High.getValue(1));
- SDValue Ops[] = { Result, Chain };
- return DAG.getMergeValues(Ops, 2, DL);
}
if (LD->getAlignment() == 2) {
@@ -517,7 +469,7 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const
SDValue Chain = ST->getChain();
SDValue BasePtr = ST->getBasePtr();
SDValue Value = ST->getValue();
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
if (ST->getAlignment() == 2) {
SDValue Low = Value;
@@ -564,7 +516,7 @@ LowerSMUL_LOHI(SDValue Op, SelectionDAG &DAG) const
{
assert(Op.getValueType() == MVT::i32 && Op.getOpcode() == ISD::SMUL_LOHI &&
"Unexpected operand to lower!");
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
SDValue Zero = DAG.getConstant(0, MVT::i32);
@@ -581,7 +533,7 @@ LowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const
{
assert(Op.getValueType() == MVT::i32 && Op.getOpcode() == ISD::UMUL_LOHI &&
"Unexpected operand to lower!");
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
SDValue Zero = DAG.getConstant(0, MVT::i32);
@@ -666,7 +618,7 @@ TryExpandADDWithMul(SDNode *N, SelectionDAG &DAG) const
} else {
return SDValue();
}
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
SDValue LL, RL, AddendL, AddendH;
LL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
Mul.getOperand(0), DAG.getConstant(0, MVT::i32));
@@ -725,7 +677,7 @@ ExpandADDSUB(SDNode *N, SelectionDAG &DAG) const
return Result;
}
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
// Extract components
SDValue LHSL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
@@ -758,7 +710,7 @@ LowerVAARG(SDValue Op, SelectionDAG &DAG) const
llvm_unreachable("unimplemented");
// FIXME Arguments passed by reference need a extra dereference.
SDNode *Node = Op.getNode();
- DebugLoc dl = Node->getDebugLoc();
+ SDLoc dl(Node);
const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
EVT VT = Node->getValueType(0);
SDValue VAList = DAG.getLoad(getPointerTy(), dl, Node->getOperand(0),
@@ -779,7 +731,7 @@ LowerVAARG(SDValue Op, SelectionDAG &DAG) const
SDValue XCoreTargetLowering::
LowerVASTART(SDValue Op, SelectionDAG &DAG) const
{
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
// vastart stores the address of the VarArgsFrameIndex slot into the
// memory location argument
MachineFunction &MF = DAG.getMachineFunction();
@@ -791,7 +743,7 @@ LowerVASTART(SDValue Op, SelectionDAG &DAG) const
SDValue XCoreTargetLowering::LowerFRAMEADDR(SDValue Op,
SelectionDAG &DAG) const {
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
// Depths > 0 not supported yet!
if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() > 0)
return SDValue();
@@ -831,7 +783,7 @@ LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const {
SDValue Addr = Trmp;
- DebugLoc dl = Op.getDebugLoc();
+ SDLoc dl(Op);
OutChains[0] = DAG.getStore(Chain, dl, DAG.getConstant(0x0a3cd805, MVT::i32),
Addr, MachinePointerInfo(TrmpAddr), false, false,
0);
@@ -865,7 +817,7 @@ LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const {
SDValue XCoreTargetLowering::
LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const {
- DebugLoc DL = Op.getDebugLoc();
+ SDLoc DL(Op);
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
switch (IntNo) {
case Intrinsic::xcore_crc8:
@@ -895,7 +847,7 @@ SDValue
XCoreTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
- DebugLoc &dl = CLI.DL;
+ SDLoc &dl = CLI.DL;
SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
@@ -931,7 +883,7 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
// Analyze operands of the call, assigning locations to each operand.
@@ -949,7 +901,7 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
unsigned NumBytes = CCInfo.getNextStackOffset();
Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes,
- getPointerTy(), true));
+ getPointerTy(), true), dl);
SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
SmallVector<SDValue, 12> MemOpChains;
@@ -1039,7 +991,7 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
Chain = DAG.getCALLSEQ_END(Chain,
DAG.getConstant(NumBytes, getPointerTy(), true),
DAG.getConstant(0, getPointerTy(), true),
- InFlag);
+ InFlag, dl);
InFlag = Chain.getValue(1);
// Handle result values, copying them out of physregs into vregs that we
@@ -1054,7 +1006,7 @@ SDValue
XCoreTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
// Assign locations to each value returned by this call.
@@ -1085,7 +1037,7 @@ XCoreTargetLowering::LowerFormalArguments(SDValue Chain,
CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl,
+ SDLoc dl,
SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals)
const {
@@ -1110,7 +1062,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg>
&Ins,
- DebugLoc dl,
+ SDLoc dl,
SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
@@ -1236,7 +1188,7 @@ XCoreTargetLowering::LowerReturn(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc dl, SelectionDAG &DAG) const {
+ SDLoc dl, SelectionDAG &DAG) const {
// CCValAssign - represent the assignment of
// the return value to a location
@@ -1353,7 +1305,7 @@ XCoreTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
- DebugLoc dl = N->getDebugLoc();
+ SDLoc dl(N);
switch (N->getOpcode()) {
default: break;
case XCoreISD::LADD: {
diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h
index 8d258f5..f765f02 100644
--- a/lib/Target/XCore/XCoreISelLowering.h
+++ b/lib/Target/XCore/XCoreISelLowering.h
@@ -115,7 +115,7 @@ namespace llvm {
CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
SDValue LowerCCCCallTo(SDValue Chain, SDValue Callee,
CallingConv::ID CallConv, bool isVarArg,
@@ -123,16 +123,19 @@ namespace llvm {
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
SDValue getGlobalAddressWrapper(SDValue GA, const GlobalValue *GV,
SelectionDAG &DAG) const;
+ SDValue lowerLoadWordFromAlignedBasePlusOffset(SDLoc DL, SDValue Chain,
+ SDValue Base, int64_t Offset,
+ SelectionDAG &DAG) const;
// Lower Operand specifics
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
@@ -174,7 +177,7 @@ namespace llvm {
CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+ SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
virtual SDValue
@@ -186,7 +189,7 @@ namespace llvm {
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
- DebugLoc dl, SelectionDAG &DAG) const;
+ SDLoc dl, SelectionDAG &DAG) const;
virtual bool
CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
diff --git a/lib/Target/XCore/XCoreInstrInfo.cpp b/lib/Target/XCore/XCoreInstrInfo.cpp
index e457e0d..eb7a936 100644
--- a/lib/Target/XCore/XCoreInstrInfo.cpp
+++ b/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -41,7 +41,7 @@ using namespace llvm;
XCoreInstrInfo::XCoreInstrInfo()
: XCoreGenInstrInfo(XCore::ADJCALLSTACKDOWN, XCore::ADJCALLSTACKUP),
- RI(*this) {
+ RI() {
}
static bool isZeroImm(const MachineOperand &op) {
diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td
index 03653cb..e06419a 100644
--- a/lib/Target/XCore/XCoreInstrInfo.td
+++ b/lib/Target/XCore/XCoreInstrInfo.td
@@ -84,7 +84,7 @@ def msksize_xform : SDNodeXForm<imm, [{
// Transformation function: get the size of a mask
assert(isMask_32(N->getZExtValue()));
// look for the first non-zero bit
- return getI32Imm(32 - CountLeadingZeros_32(N->getZExtValue()));
+ return getI32Imm(32 - countLeadingZeros(N->getZExtValue()));
}]>;
def neg_xform : SDNodeXForm<imm, [{
@@ -168,21 +168,20 @@ def ldawb : PatFrag<(ops node:$addr, node:$offset),
(sub node:$addr, (shl node:$offset, 2))>;
// Instruction operand types
-def calltarget : Operand<i32>;
+def pcrel_imm : Operand<i32>;
+def pcrel_imm_neg : Operand<i32> {
+ let DecoderMethod = "DecodeNegImmOperand";
+}
def brtarget : Operand<OtherVT>;
-def pclabel : Operand<i32>;
+def brtarget_neg : Operand<OtherVT> {
+ let DecoderMethod = "DecodeNegImmOperand";
+}
// Addressing modes
def ADDRspii : ComplexPattern<i32, 2, "SelectADDRspii", [add, frameindex], []>;
-def ADDRdpii : ComplexPattern<i32, 2, "SelectADDRdpii", [add, dprelwrapper],
- []>;
-def ADDRcpii : ComplexPattern<i32, 2, "SelectADDRcpii", [add, cprelwrapper],
- []>;
// Address operands
def MEMii : Operand<i32> {
- let PrintMethod = "printMemOperand";
- let DecoderMethod = "DecodeMEMiiOperand";
let MIOperandInfo = (ops i32imm, i32imm);
}
@@ -274,10 +273,10 @@ multiclass FRU6_LRU6_branch<bits<6> opc, string OpcStr> {
}
multiclass FRU6_LRU6_backwards_branch<bits<6> opc, string OpcStr> {
- def _ru6: _FRU6<opc, (outs), (ins GRRegs:$a, brtarget:$b),
- !strconcat(OpcStr, " $a, -$b"), []>;
- def _lru6: _FLRU6<opc, (outs), (ins GRRegs:$a, brtarget:$b),
- !strconcat(OpcStr, " $a, -$b"), []>;
+ def _ru6: _FRU6<opc, (outs), (ins GRRegs:$a, brtarget_neg:$b),
+ !strconcat(OpcStr, " $a, $b"), []>;
+ def _lru6: _FLRU6<opc, (outs), (ins GRRegs:$a, brtarget_neg:$b),
+ !strconcat(OpcStr, " $a, $b"), []>;
}
multiclass FRU6_LRU6_cp<bits<6> opc, string OpcStr> {
@@ -515,29 +514,29 @@ def LMUL_l6r : _FL6R<
//let Uses = [DP] in ...
let neverHasSideEffects = 1, isReMaterializable = 1 in
-def LDAWDP_ru6: _FRU6<0b011000, (outs RRegs:$a), (ins MEMii:$b),
+def LDAWDP_ru6: _FRU6<0b011000, (outs RRegs:$a), (ins i32imm:$b),
"ldaw $a, dp[$b]", []>;
let isReMaterializable = 1 in
-def LDAWDP_lru6: _FLRU6<0b011000, (outs RRegs:$a), (ins MEMii:$b),
+def LDAWDP_lru6: _FLRU6<0b011000, (outs RRegs:$a), (ins i32imm:$b),
"ldaw $a, dp[$b]",
- [(set RRegs:$a, ADDRdpii:$b)]>;
+ [(set RRegs:$a, (dprelwrapper tglobaladdr:$b))]>;
let mayLoad=1 in
-def LDWDP_ru6: _FRU6<0b010110, (outs RRegs:$a), (ins MEMii:$b),
+def LDWDP_ru6: _FRU6<0b010110, (outs RRegs:$a), (ins i32imm:$b),
"ldw $a, dp[$b]", []>;
-def LDWDP_lru6: _FLRU6<0b010110, (outs RRegs:$a), (ins MEMii:$b),
+def LDWDP_lru6: _FLRU6<0b010110, (outs RRegs:$a), (ins i32imm:$b),
"ldw $a, dp[$b]",
- [(set RRegs:$a, (load ADDRdpii:$b))]>;
+ [(set RRegs:$a, (load (dprelwrapper tglobaladdr:$b)))]>;
let mayStore=1 in
-def STWDP_ru6 : _FRU6<0b010100, (outs), (ins RRegs:$a, MEMii:$b),
+def STWDP_ru6 : _FRU6<0b010100, (outs), (ins RRegs:$a, i32imm:$b),
"stw $a, dp[$b]", []>;
-def STWDP_lru6 : _FLRU6<0b010100, (outs), (ins RRegs:$a, MEMii:$b),
+def STWDP_lru6 : _FLRU6<0b010100, (outs), (ins RRegs:$a, i32imm:$b),
"stw $a, dp[$b]",
- [(store RRegs:$a, ADDRdpii:$b)]>;
+ [(store RRegs:$a, (dprelwrapper tglobaladdr:$b))]>;
//let Uses = [CP] in ..
let mayLoad = 1, isReMaterializable = 1, neverHasSideEffects = 1 in
@@ -615,9 +614,9 @@ let Uses = [R11], isCall=1 in
defm BLAT : FU6_LU6_np<0b0111001101, "blat">;
let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
-def BRBU_u6 : _FU6<0b0111011100, (outs), (ins brtarget:$a), "bu -$a", []>;
+def BRBU_u6 : _FU6<0b0111011100, (outs), (ins brtarget_neg:$a), "bu $a", []>;
-def BRBU_lu6 : _FLU6<0b0111011100, (outs), (ins brtarget:$a), "bu -$a", []>;
+def BRBU_lu6 : _FLU6<0b0111011100, (outs), (ins brtarget_neg:$a), "bu $a", []>;
def BRFU_u6 : _FU6<0b0111001100, (outs), (ins brtarget:$a), "bu $a", []>;
@@ -626,12 +625,12 @@ def BRFU_lu6 : _FLU6<0b0111001100, (outs), (ins brtarget:$a), "bu $a", []>;
//let Uses = [CP] in ...
let Defs = [R11], neverHasSideEffects = 1, isReMaterializable = 1 in
-def LDAWCP_u6: _FU6<0b0111111101, (outs), (ins MEMii:$a), "ldaw r11, cp[$a]",
+def LDAWCP_u6: _FU6<0b0111111101, (outs), (ins i32imm:$a), "ldaw r11, cp[$a]",
[]>;
let Defs = [R11], isReMaterializable = 1 in
-def LDAWCP_lu6: _FLU6<0b0111111101, (outs), (ins MEMii:$a), "ldaw r11, cp[$a]",
- [(set R11, ADDRcpii:$a)]>;
+def LDAWCP_lu6: _FLU6<0b0111111101, (outs), (ins i32imm:$a), "ldaw r11, cp[$a]",
+ [(set R11, (cprelwrapper tglobaladdr:$a))]>;
let Defs = [R11] in
defm GETSR : FU6_LU6_np<0b0111111100, "getsr r11,">;
@@ -658,16 +657,26 @@ defm KRESTSP : FU6_LU6_np<0b0111101111, "krestsp">;
// U10
-let Defs = [R11], isReMaterializable = 1, neverHasSideEffects = 1 in
-def LDAPF_u10 : _FU10<0b110110, (outs), (ins i32imm:$a), "ldap r11, $a", []>;
+let Defs = [R11], isReMaterializable = 1 in {
+let neverHasSideEffects = 1 in
+def LDAPF_u10 : _FU10<0b110110, (outs), (ins pcrel_imm:$a), "ldap r11, $a", []>;
+
+def LDAPF_lu10 : _FLU10<0b110110, (outs), (ins pcrel_imm:$a), "ldap r11, $a",
+ [(set R11, (pcrelwrapper tglobaladdr:$a))]>;
-let Defs = [R11], isReMaterializable = 1 in
-def LDAPF_lu10 : _FLU10<0b110110, (outs), (ins i32imm:$a), "ldap r11, $a",
+let neverHasSideEffects = 1 in
+def LDAPB_u10 : _FU10<0b110111, (outs), (ins pcrel_imm_neg:$a), "ldap r11, $a",
+ []>;
+
+let neverHasSideEffects = 1 in
+def LDAPB_lu10 : _FLU10<0b110111, (outs), (ins pcrel_imm_neg:$a),
+ "ldap r11, $a",
[(set R11, (pcrelwrapper tglobaladdr:$a))]>;
-let Defs = [R11], isReMaterializable = 1, isCodeGenOnly = 1 in
-def LDAPF_lu10_ba : _FLU10<0b110110, (outs), (ins i32imm:$a), "ldap r11, $a",
+let isCodeGenOnly = 1 in
+def LDAPF_lu10_ba : _FLU10<0b110110, (outs), (ins pcrel_imm:$a), "ldap r11, $a",
[(set R11, (pcrelwrapper tblockaddress:$a))]>;
+}
let isCall=1,
// All calls clobber the link register and the non-callee-saved registers:
@@ -676,11 +685,15 @@ def BLACP_u10 : _FU10<0b111000, (outs), (ins i32imm:$a), "bla cp[$a]", []>;
def BLACP_lu10 : _FLU10<0b111000, (outs), (ins i32imm:$a), "bla cp[$a]", []>;
-def BLRF_u10 : _FU10<0b110100, (outs), (ins calltarget:$a), "bl $a",
+def BLRF_u10 : _FU10<0b110100, (outs), (ins pcrel_imm:$a), "bl $a",
[(XCoreBranchLink immU10:$a)]>;
-def BLRF_lu10 : _FLU10<0b110100, (outs), (ins calltarget:$a), "bl $a",
+def BLRF_lu10 : _FLU10<0b110100, (outs), (ins pcrel_imm:$a), "bl $a",
[(XCoreBranchLink immU20:$a)]>;
+
+def BLRB_u10 : _FU10<0b110101, (outs), (ins pcrel_imm_neg:$a), "bl $a", []>;
+
+def BLRB_lu10 : _FLU10<0b110101, (outs), (ins pcrel_imm_neg:$a), "bl $a", []>;
}
let Defs = [R11], mayLoad = 1, isReMaterializable = 1,
diff --git a/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/lib/Target/XCore/XCoreLowerThreadLocal.cpp
new file mode 100644
index 0000000..2e328b4
--- /dev/null
+++ b/lib/Target/XCore/XCoreLowerThreadLocal.cpp
@@ -0,0 +1,145 @@
+//===-- XCoreLowerThreadLocal - Lower thread local variables --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains a pass that lowers thread local variables on the
+/// XCore.
+///
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+
+#define DEBUG_TYPE "xcore-lower-thread-local"
+
+using namespace llvm;
+
+static cl::opt<unsigned> MaxThreads(
+ "xcore-max-threads", cl::Optional,
+    cl::desc("Maximum number of threads (for emulated thread-local storage)"),
+ cl::Hidden, cl::value_desc("number"), cl::init(8));
+
+namespace {
+ /// Lowers thread local variables on the XCore. Each thread local variable is
+  /// expanded to an array of n elements indexed by the thread ID, where n is the
+  /// fixed number of hardware threads supported by the device.
+ struct XCoreLowerThreadLocal : public ModulePass {
+ static char ID;
+
+ XCoreLowerThreadLocal() : ModulePass(ID) {
+ initializeXCoreLowerThreadLocalPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool lowerGlobal(GlobalVariable *GV);
+
+ bool runOnModule(Module &M);
+ };
+}
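As a rough source-level picture of what the pass described above does (a sketch with an invented getThreadId() helper standing in for the llvm.xcore.getid intrinsic; the real transformation operates on LLVM IR, not C++): a thread-local variable becomes an array with one slot per hardware thread, and each use is rewritten to index that array by the current thread ID. The array length comes from the xcore-max-threads option declared above, which defaults to 8.

// Conceptual before/after, not the pass itself.
//
// Before lowering:
//   __thread int Counter;
//   void bump() { ++Counter; }
//
// After lowering, with xcore-max-threads = 8:
static unsigned getThreadId() { return 0; }   // stub for the llvm.xcore.getid intrinsic

int Counter[8];                               // one element per hardware thread
void bump() { ++Counter[getThreadId()]; }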
+
+char XCoreLowerThreadLocal::ID = 0;
+
+INITIALIZE_PASS(XCoreLowerThreadLocal, "xcore-lower-thread-local",
+ "Lower thread local variables", false, false)
+
+ModulePass *llvm::createXCoreLowerThreadLocalPass() {
+ return new XCoreLowerThreadLocal();
+}
+
+static ArrayType *createLoweredType(Type *OriginalType) {
+ return ArrayType::get(OriginalType, MaxThreads);
+}
+
+static Constant *
+createLoweredInitializer(ArrayType *NewType, Constant *OriginalInitializer) {
+ SmallVector<Constant *, 8> Elements(MaxThreads);
+ for (unsigned i = 0; i != MaxThreads; ++i) {
+ Elements[i] = OriginalInitializer;
+ }
+ return ConstantArray::get(NewType, Elements);
+}
+
+static bool hasNonInstructionUse(GlobalVariable *GV) {
+ for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end(); UI != E;
+ ++UI)
+ if (!isa<Instruction>(*UI))
+ return true;
+
+ return false;
+}
+
+static bool isZeroLengthArray(Type *Ty) {
+ ArrayType *AT = dyn_cast<ArrayType>(Ty);
+ return AT && (AT->getNumElements() == 0);
+}
+
+bool XCoreLowerThreadLocal::lowerGlobal(GlobalVariable *GV) {
+ Module *M = GV->getParent();
+ LLVMContext &Ctx = M->getContext();
+ if (!GV->isThreadLocal())
+ return false;
+
+  // Skip globals we can't lower; leave them for the backend to diagnose.
+ if (hasNonInstructionUse(GV) ||
+ !GV->getType()->isSized() || isZeroLengthArray(GV->getType()))
+ return false;
+
+ // Create replacement global.
+ ArrayType *NewType = createLoweredType(GV->getType()->getElementType());
+ Constant *NewInitializer = createLoweredInitializer(NewType,
+ GV->getInitializer());
+ GlobalVariable *NewGV =
+ new GlobalVariable(*M, NewType, GV->isConstant(), GV->getLinkage(),
+ NewInitializer, "", 0, GlobalVariable::NotThreadLocal,
+ GV->getType()->getAddressSpace(),
+ GV->isExternallyInitialized());
+
+ // Update uses.
+ SmallVector<User *, 16> Users(GV->use_begin(), GV->use_end());
+ for (unsigned I = 0, E = Users.size(); I != E; ++I) {
+ User *U = Users[I];
+ Instruction *Inst = cast<Instruction>(U);
+ IRBuilder<> Builder(Inst);
+ Function *GetID = Intrinsic::getDeclaration(GV->getParent(),
+ Intrinsic::xcore_getid);
+ Value *ThreadID = Builder.CreateCall(GetID);
+ SmallVector<Value *, 2> Indices;
+ Indices.push_back(Constant::getNullValue(Type::getInt64Ty(Ctx)));
+ Indices.push_back(ThreadID);
+ Value *Addr = Builder.CreateInBoundsGEP(NewGV, Indices);
+ U->replaceUsesOfWith(GV, Addr);
+ }
+
+ // Remove old global.
+ NewGV->takeName(GV);
+ GV->eraseFromParent();
+ return true;
+}
+
+bool XCoreLowerThreadLocal::runOnModule(Module &M) {
+ // Find thread local globals.
+ bool MadeChange = false;
+ SmallVector<GlobalVariable *, 16> ThreadLocalGlobals;
+ for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
+ GVI != E; ++GVI) {
+ GlobalVariable *GV = GVI;
+ if (GV->isThreadLocal())
+ ThreadLocalGlobals.push_back(GV);
+ }
+ for (unsigned I = 0, E = ThreadLocalGlobals.size(); I != E; ++I) {
+ MadeChange |= lowerGlobal(ThreadLocalGlobals[I]);
+ }
+ return MadeChange;
+}
diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp
index 49b5634..dbd2f52 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -37,8 +37,8 @@
using namespace llvm;
-XCoreRegisterInfo::XCoreRegisterInfo(const TargetInstrInfo &tii)
- : XCoreGenRegisterInfo(XCore::LR), TII(tii) {
+XCoreRegisterInfo::XCoreRegisterInfo()
+ : XCoreGenRegisterInfo(XCore::LR) {
}
// helper functions
@@ -112,6 +112,7 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int FrameIndex = FrameOp.getIndex();
MachineFunction &MF = *MI.getParent()->getParent();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
int StackSize = MF.getFrameInfo()->getStackSize();
@@ -249,6 +250,7 @@ loadConstant(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
report_fatal_error("loadConstant value too big " + Twine(Value));
}
int Opcode = isImmU6(Value) ? XCore::LDC_ru6 : XCore::LDC_lru6;
+ const TargetInstrInfo &TII = *MBB.getParent()->getTarget().getInstrInfo();
BuildMI(MBB, I, dl, TII.get(Opcode), DstReg).addImm(Value);
}
diff --git a/lib/Target/XCore/XCoreRegisterInfo.h b/lib/Target/XCore/XCoreRegisterInfo.h
index 1db3248..2370c62 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.h
+++ b/lib/Target/XCore/XCoreRegisterInfo.h
@@ -25,8 +25,6 @@ class TargetInstrInfo;
struct XCoreRegisterInfo : public XCoreGenRegisterInfo {
private:
- const TargetInstrInfo &TII;
-
void loadConstant(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
unsigned DstReg, int64_t Value, DebugLoc dl) const;
@@ -40,7 +38,7 @@ private:
unsigned DstReg, int Offset, DebugLoc dl) const;
public:
- XCoreRegisterInfo(const TargetInstrInfo &tii);
+ XCoreRegisterInfo();
/// Code Generation virtual methods...
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index 28c3d12..3ef1520 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -33,6 +33,7 @@ XCoreTargetMachine::XCoreTargetMachine(const Target &T, StringRef TT,
FrameLowering(Subtarget),
TLInfo(*this),
TSInfo(*this) {
+ initAsmInfo();
}
namespace {
@@ -46,6 +47,7 @@ public:
return getTM<XCoreTargetMachine>();
}
+ virtual bool addPreISel();
virtual bool addInstSelector();
};
} // namespace
@@ -54,6 +56,11 @@ TargetPassConfig *XCoreTargetMachine::createPassConfig(PassManagerBase &PM) {
return new XCorePassConfig(this, PM);
}
+bool XCorePassConfig::addPreISel() {
+ addPass(createXCoreLowerThreadLocalPass());
+ return false;
+}
+
bool XCorePassConfig::addInstSelector() {
addPass(createXCoreISelDag(getXCoreTargetMachine(), getOptLevel()));
return false;
diff --git a/lib/Target/XCore/XCoreTargetObjectFile.cpp b/lib/Target/XCore/XCoreTargetObjectFile.cpp
index 8203899..88e3bfd 100644
--- a/lib/Target/XCore/XCoreTargetObjectFile.cpp
+++ b/lib/Target/XCore/XCoreTargetObjectFile.cpp
@@ -57,9 +57,4 @@ void XCoreTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){
ELF::SHF_ALLOC |
ELF::XCORE_SHF_CP_SECTION,
SectionKind::getReadOnlyWithRel());
-
- // Dynamic linking is not supported. Data with relocations is placed in the
- // same section as data without relocations.
- DataRelSection = DataRelLocalSection = DataSection;
- DataRelROSection = DataRelROLocalSection = ReadOnlySection;
}